diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..440c6f9
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,321 @@
+####################################################################################
+# START 1. Basic setup for cmake
+####################################################################################
+# basic setup for cmake
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+
+if(POLICY CMP0074)
+  cmake_policy(SET CMP0074 NEW)
+endif()
+
+set(CMAKE_INCLUDE_CURRENT_DIR ON)
+set(CMAKE_INCLUDE_DIRECTORIES_PROJECT_BEFORE ON)
+set(CMAKE_COLOR_MAKEFILE ON)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+# Disable gnu extensions
+set(CMAKE_CXX_EXTENSIONS ON)
+
+# Define the project
+project("DSA_XENGINE" VERSION 1.0.0 LANGUAGES C CXX)
+
+# For GCC 8 and lower, set -pthread flag manually
+set(CMAKE_C_FLAGS "-pthread")
+set(CMAKE_CXX_FLAGS "-pthread")
+
+# add a directory for cmake modules
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
+
+# DSA_XENGINE may be built to run using CUDA or CPU. Future versions may be
+# written for HIP or SYCL. We call this choice the
+# target type. By default, the target is CUDA.
+#---------------------------------------------
+
+# Set by environment variable if visible
+if(DEFINED ENV{DSA_XENGINE_TARGET})
+  set(DEFTARGET $ENV{DSA_XENGINE_TARGET})
+else()
+  set(DEFTARGET "CUDA")
+endif()
+
+set(VALID_TARGET_TYPES CUDA CPU) #HIP SYCL
+set(DSA_XENGINE_TARGET_TYPE "${DEFTARGET}" CACHE STRING "Choose the type of target, options are: ${VALID_TARGET_TYPES}")
+set_property(CACHE DSA_XENGINE_TARGET_TYPE PROPERTY STRINGS "CUDA" "CPU") # HIP SYCL
+
+string(TOUPPER ${DSA_XENGINE_TARGET_TYPE} CHECK_TARGET_TYPE)
+list(FIND VALID_TARGET_TYPES ${CHECK_TARGET_TYPE} TARGET_TYPE_VALID)
+
+if(TARGET_TYPE_VALID LESS 0)
+  message(SEND_ERROR "Please specify a valid DSA_XENGINE_TARGET_TYPE! Valid target types are:" "${VALID_TARGET_TYPES}")
+endif()
+
+# Git helpers
+#------------
+find_package(Git)
+if(GIT_FOUND)
+  execute_process(
+    COMMAND ${GIT_EXECUTABLE} show
+    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+    RESULT_VARIABLE IS_GIT_REPOSITORY
+    OUTPUT_QUIET ERROR_QUIET)
+  if(${IS_GIT_REPOSITORY} EQUAL 0)
+    execute_process(
+      COMMAND ${GIT_EXECUTABLE} describe --abbrev=0
+      WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+      OUTPUT_VARIABLE GITTAG
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+    # we use git rev-list and pipe that through wc here.
+    # Newer git versions support --count as an option to rev-list, but
+    # that might not always be available
+    execute_process(
+      COMMAND ${GIT_EXECUTABLE} rev-list ${GITTAG}..HEAD
+      WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+      COMMAND wc -l
+      OUTPUT_VARIABLE GITCOUNT
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+    execute_process(
+      COMMAND ${GIT_EXECUTABLE} describe --match 1 --always --long --dirty
+      WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+      OUTPUT_VARIABLE GITVERSION
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+  endif()
+endif(GIT_FOUND)
+
+
+option(DSA_XENGINE_BUILD_ALL_TESTS "build tests by default" ON)
+option(DSA_XENGINE_INSTALL_ALL_TESTS "install tests by default" ON)
+option(DSA_XENGINE_BUILD_SHAREDLIB "build dsaXengine as a shared lib" ON)
+
+
+# Use ExternalProject_Add for libtcc (borks with FetchContent)
+# Use ExternalProject_Add for CUTLASS (long build time, version 2.11.0 for sm_8x arch)
+include(ExternalProject)
+
+# Use FetchContent for lightweight dependencies
+include(FetchContent)
+
+# CUDA based dependencies and options
+#------------------------------------
+if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA)
+
+  # CUDA specific part of CMakeLists
+  enable_language(CUDA)
+  find_package(CUDAToolkit REQUIRED)
+
+  # Get GPU architecture from environment, or set default (sm_80)
+  if(DEFINED ENV{DSA_XENGINE_GPU_ARCH})
+    set(DSA_XENGINE_DEFAULT_GPU_ARCH $ENV{DSA_XENGINE_GPU_ARCH})
+  else()
+    set(DSA_XENGINE_DEFAULT_GPU_ARCH sm_80)
+  endif()
+
+  if(NOT DSA_XENGINE_GPU_ARCH)
+    message(STATUS "Building DSA_XENGINE for GPU ARCH " "${DSA_XENGINE_DEFAULT_GPU_ARCH}")
+  endif()
+
+  set(DSA_XENGINE_GPU_ARCH
+      ${DSA_XENGINE_DEFAULT_GPU_ARCH}
+      CACHE STRING "set the GPU architecture (sm_60, sm_70, sm_80, sm_90)")
+  set_property(CACHE DSA_XENGINE_GPU_ARCH PROPERTY STRINGS sm_60 sm_70 sm_80 sm_90)
+  set(DSA_XENGINE_GPU_ARCH_SUFFIX
+      ""
+      CACHE STRING "set the GPU architecture suffix (virtual, real). Leave empty for no suffix.")
+  set_property(CACHE DSA_XENGINE_GPU_ARCH_SUFFIX PROPERTY STRINGS "real" "virtual" " ")
+  #set(CMAKE_CUDA_ARCHITECTURES ${DSA_XENGINE_GPU_ARCH})
+  #mark_as_advanced(DSA_XENGINE_GPU_ARCH_SUFFIX)
+  #mark_as_advanced(CMAKE_CUDA_ARCHITECTURES)
+
+  # Set CUDA based methods and dependencies
+  #----------------------------------------
+
+  # This is the default GPU method
+  option(DSA_XENGINE_ENABLE_CUBLAS "Use cuBLAS for correlators" ON)
+
+  # All other GPU methods can be enabled at compile time and
+  # toggled for use at run time, if enabled.
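+  #
+  # Illustrative configure example (not from the repo docs; the option names are
+  # the ones defined in this file, while the arch value and install path are
+  # placeholders chosen for the example):
+  #
+  #   cmake -S . -B build \
+  #     -DDSA_XENGINE_TARGET_TYPE=CUDA \
+  #     -DDSA_XENGINE_GPU_ARCH=sm_80 \
+  #     -DDSA_XENGINE_ENABLE_TCC=ON -DDSA_XENGINE_DOWNLOAD_TCC=ON \
+  #     -DCMAKE_INSTALL_PREFIX=/path/to/install
+  #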
+
+  # Get TCC dependency
+  option(DSA_XENGINE_ENABLE_TCC "Use TensorCoreCorrelators for correlators" OFF)
+  if(DSA_XENGINE_ENABLE_TCC)
+    add_compile_definitions(DSA_XENGINE_ENABLE_TCC)
+    option(DSA_XENGINE_DOWNLOAD_TCC "Download, build, link (and install) TCC" OFF)
+    if(DSA_XENGINE_DOWNLOAD_TCC)
+      ExternalProject_Add(TCC
+        GIT_REPOSITORY https://git.astron.nl/RD/tensor-core-correlator
+        #GIT_TAG 11d8a4a504d7073a2a33b81e1e387b12e58a420c
+        CMAKE_ARGS "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
+      )
+    else()
+      find_package(libtcc REQUIRED)
+    endif()
+  endif()
+
+  # Get CUTLASS dependency
+  option(DSA_XENGINE_ENABLE_CUTLASS "Use CUTLASS for GEMMs" OFF)
+  if(DSA_XENGINE_ENABLE_CUTLASS)
+    add_compile_definitions(DSA_XENGINE_ENABLE_CUTLASS)
+    option(DSA_XENGINE_DOWNLOAD_CUTLASS "Download, build (only the required kernels), link (and install) CUTLASS" OFF)
+    if(DSA_XENGINE_DOWNLOAD_CUTLASS)
+      # Custom CUTLASS build
+      ExternalProject_Add(NvidiaCutlass
+        GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
+        GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
+        CMAKE_ARGS
+        "-DCUTLASS_NVCC_ARCHS_ENABLED=89"
+        "-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex"
+        "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
+      )
+    else()
+      find_package(NvidiaCutlass REQUIRED)
+    endif()
+  endif()
+
+  # Get MAGMA dependency
+  option(DSA_XENGINE_ENABLE_MAGMA "Use MAGMA for GEMMs" OFF)
+  if(DSA_XENGINE_ENABLE_MAGMA)
+    add_compile_definitions(DSA_XENGINE_ENABLE_MAGMA)
+    option(DSA_XENGINE_DOWNLOAD_MAGMA "Download, build (only the required kernels), link (and install) MAGMA" OFF)
+    if(DSA_XENGINE_DOWNLOAD_MAGMA)
+      # Custom MAGMA build
+      ExternalProject_Add(Magma
+        URL https://icl.utk.edu/projectsfiles/magma/downloads/magma-2.8.0.tar.gz
+        CMAKE_ARGS
+        "-DMAGMA_ENABLE_CUDA=ON"
+        "-DGPU_TARGET=sm_80"
+        "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
+      )
+    else()
+      find_package(Magma REQUIRED)
+    endif()
+  endif()
+
+  # Get XGPU dependency (fix install)
+  option(DSA_XENGINE_ENABLE_XGPU "Use xGPU for correlators" OFF)
+  if(DSA_XENGINE_ENABLE_XGPU)
+    add_compile_definitions(DSA_XENGINE_ENABLE_XGPU)
+    option(DSA_XENGINE_DOWNLOAD_XGPU "Download and build xGPU" OFF)
+    if(DSA_XENGINE_DOWNLOAD_XGPU)
+      # Download, build and install
+      FetchContent_Declare(
+        xGPU
+        GIT_REPOSITORY https://github.com/cpviolator/xGPU.git
+        #GIT_TAG 13b7fff1eac497236eb9c38e179aed3b532a88f2
+      )
+      FetchContent_MakeAvailable(XGPU)
+    else()
+      # Find and link to local install
+      find_package(xGPU REQUIRED)
+    endif()
+  endif()
+
+endif() # CUDA functionality
+
+# Get CPU based dependencies
+# Get OPENBLAS dependency
+option(DSA_XENGINE_ENABLE_OPENBLAS "Use OPENBLAS for GEMMs" OFF)
+if(DSA_XENGINE_ENABLE_OPENBLAS)
+  add_compile_definitions(DSA_XENGINE_ENABLE_OPENBLAS)
+  option(DSA_XENGINE_DOWNLOAD_OPENBLAS "Download, build, link, and install OPENBLAS" OFF)
+  if(DSA_XENGINE_DOWNLOAD_OPENBLAS)
+    # Custom OPENBLAS build
+    ExternalProject_Add(Openblas
+      GIT_REPOSITORY https://github.com/OpenMathLib/OpenBLAS.git
+      GIT_TAG ce3f668
+      CMAKE_ARGS
+      #"-DOPENBLAS_ENABLE_CUDA=ON"
+      #"-DGPU_TARGET=sm_80"
+      "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
+    )
+  else()
+    find_package(Openblas REQUIRED)
+  endif()
+endif()
+
+# Get psrdada dependency
+option(DSA_XENGINE_ENABLE_PSRDADA "Use PSRDada for IO" ON)
+option(DSA_XENGINE_DOWNLOAD_PSRDADA "Download and build PSRDada" ON)
+if(DSA_XENGINE_DOWNLOAD_PSRDADA)
+  # Download, build and install
+  FetchContent_Declare(
+    PSRDada
+    GIT_REPOSITORY git://git.code.sf.net/p/psrdada/code
+  )
+
FetchContent_MakeAvailable(PSRDada) +else() + # Find and link to local install + find_package(PSRDada REQUIRED) +endif() + +# Get HDF5 dependency +option(DSA_XENGINE_ENABLE_HDF5 "Use HDF5 for data IO" OFF) +if(DSA_XENGINE_ENABLE_HDF5) + option(DSA_XENGINE_DOWNLOAD_HDF5 "Download and build HDf5" OFF) + if(DSA_XENGINE_DOWNLOAD_HDF5) + # Download, build and install + FetchContent_Declare( + HDF5 + GIT_REPOSITORY https://github.com/HDFGroup/hdf5.git + GIT_TAG 5794814 + ) + FetchContent_MakeAvailable(HDF5) + else() + # Find and link to local install + find_package(HDF5 REQUIRED) + endif() +endif() + +# Get CLI11 dependency +# FIX ME: get static .hpp version and ship with package +option(DSA_XENGINE_ENABLE_CLI11 "Enable CLI11 (required)" ON) +if(DSA_XENGINE_ENABLE_CLI11) + option(DSA_XENGINE_DOWNLOAD_CLI11 "Download and build CLI11" ON) + if(DSA_XENGINE_DOWNLOAD_CLI11) + # Download, build and install + FetchContent_Declare( + CLI11 + GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git + GIT_TAG main + ) + FetchContent_MakeAvailable(CLI11) + else() + # Find and link to local install + find_package(CLI11 REQUIRED) + endif() +endif() + + +# Get ZFP dependency +option(DSA_XENGINE_ENABLE_ZFP "Enable ZFP" OFF) +if(DSA_XENGINE_ENABLE_ZFP) + option(DSA_XENGINE_DOWNLOAD_ZFP "Download and build ZFP" OFF) + if(DSA_XENGINE_DOWNLOAD_ZFP) + # Download, build and install + FetchContent_Declare( + ZFP + GIT_REPOSITORY https://github.com/LLNL/zfp.git + GIT_TAG f40868a + ) + FetchContent_MakeAvailable(ZFP) + else() + # Find and link to local install + find_package(ZFP REQUIRED) + endif() +endif() + +# Add src, include, tests, and legacy +add_subdirectory(src) +add_subdirectory(include) +add_subdirectory(tests) +option(DSA_XENGINE_BUILD_LEGACY "Build legacy code (will not install if built)" OFF) +if(DSA_XENGINE_BUILD_LEGACY) + add_subdirectory(legacy) +endif() + +# Install project cmake targets +include(CMakePackageConfigHelpers) +write_basic_package_version_file( + ${PROJECT_NAME}-config-version.cmake + VERSION ${DSA_XENGINE_VERSION} + COMPATIBILITY AnyNewerVersion + ) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME} + ) diff --git a/README.md b/README.md index 03fe5e3..f771017 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@ # dsa110-xengine +

+ GitHub last commit + GitHub commit activity the past week +

This repo contains code used for the DSA X-engine. The requirements are to: - capture SNAP F-engine packets on an ethernet interface, and place them in a psrdada buffer @@ -67,11 +71,4 @@ Finally, `dsaX_dbnic` and `dsaX_nicdb` implement the corner turn to feed `mbheim ### scripts and utils -The "scripts" dir contains some useful scripts to test various aspects of the system (corr, bf, cornerturn). The "utils" dir includes functionality to generate fake data and beamforming weights. - - - - - - - +The "scripts" dir contains some useful scripts to test various aspects of the system (corr, bf, cornerturn). The "utils" dir includes functionality to generate fake data and beamforming weights. diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt new file mode 100644 index 0000000..9a7cbbd --- /dev/null +++ b/include/CMakeLists.txt @@ -0,0 +1,21 @@ +enable_language(CUDA) + +# install step for header files +#------------------------------ +set(DSA_XENGINE_HEADERS + # cmake-format: sortable + dsaX.h + dsaX_def.h + dsaX_malloc.h + dsaX_ptr.h + fast_time_domain.h + cuda_interface.h + cuda_handles.h + cuda_headers.h + dsaX_capture.h + dsaX_capture_manythread.h + dsaX_capture_pcap.h + cutlass_interface.h + ) +install(FILES ${DSA_XENGINE_HEADERS} DESTINATION include) +#------------------------------ diff --git a/include/blas_interface.h b/include/blas_interface.h new file mode 100644 index 0000000..d643e08 --- /dev/null +++ b/include/blas_interface.h @@ -0,0 +1,5 @@ +#pragma once + +#include "interface.h" + +void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param, int stream = 0); diff --git a/include/cublas_interface.h b/include/cublas_interface.h new file mode 100644 index 0000000..f68eea3 --- /dev/null +++ b/include/cublas_interface.h @@ -0,0 +1,4 @@ +#pragma once +#include "dsaX.h" + +void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param, int stream); diff --git a/include/cuda_handles.h b/include/cuda_handles.h new file mode 100644 index 0000000..eeaf706 --- /dev/null +++ b/include/cuda_handles.h @@ -0,0 +1,20 @@ +#pragma once + +#include + +#include "utils.h" + +#ifdef DSA_XENGINE_TARGET_CUDA +#include "cuda_headers.h" + +static std::vector streams; +static cublasHandle_t cublasH = NULL; + +static bool cublas_init = false; +static bool stream_init = false; + +cudaStream_t get_stream(unsigned int i); +#endif + +void init_streams(unsigned int n_streams); +void destroy_streams(); diff --git a/include/cuda_headers.h b/include/cuda_headers.h new file mode 100644 index 0000000..333a5bc --- /dev/null +++ b/include/cuda_headers.h @@ -0,0 +1,8 @@ +#pragma once + +#if defined (DSA_XENGINE_TARGET_CUDA) +#include +#include "cuda_fp16.h" +#include +#include +#endif diff --git a/include/cuda_interface.h b/include/cuda_interface.h new file mode 100644 index 0000000..42043e2 --- /dev/null +++ b/include/cuda_interface.h @@ -0,0 +1,52 @@ +#pragma once + +#include + +#include "dsaX_def.h" +#include "enums.h" +#include "dsaX.h" + +void dsaXInitCuda(int dev); +void dsaXDestroyCuda(); + +void initBLASCuda(); +void destroyBLASCuda(); + +void initStreamsCuda(unsigned int n); +void destroyStreamsCuda(); + +void promoteComplexCharToPlanarHalfCuda(corr_handle *d, unsigned int stream); + +void initializeCorrCudaMemory(corr_handle *d, unsigned int n_streams); + +void initializeBFCudaMemory(bf_handle *d); + +void deallocateCorrCudaMemory(corr_handle *d); + +void 
deallocateBFCudaMemory(bf_handle *d); + +void dsaXmemsetCuda(void *array, int ch, size_t n); + +void dsaXmemcpyCuda(void *array_device, void *array_host, size_t n, dsaXMemcpyKind kind, int stream); + +void *dsaXHostRegisterCuda(size_t size); + +void dsaXDeviceSynchronizeCuda(); + +void reorderCorrOutputCuda(corr_handle *d, int stream); + +void computeIndicesCuda(corr_handle *d); + +void reorderCorrInputCuda(corr_handle *d, int stream); + +void calcWeightsCuda(bf_handle *d); + +template void transposeMatrixCuda(in_prec *idata, out_prec *odata); + +void transposeInputBeamformerCuda(double *idata, double *odata, std::vector &dim_block_in, std::vector &dim_grid_in); + +void transposeScaleBeamformerCuda(void *real, void *imag, unsigned char *output, std::vector &dim_block_in, std::vector &dim_grid_in); + +void fluffInputBeamformerCuda(char *input, void *b_real, void *b_imag, int blocks, int tpb); + +void sumBeamCuda(unsigned char *input, float *output, int blocks, int tpb); diff --git a/include/cuda_kernels.h b/include/cuda_kernels.h new file mode 100644 index 0000000..d57a11b --- /dev/null +++ b/include/cuda_kernels.h @@ -0,0 +1,340 @@ +#pragma once + +#include "cuda_headers.h" + +__global__ void inspectPackedDataInKernel(char input, int i) { + float re = (float)((char)(( (unsigned char)(input) & (unsigned char)(15) ) << 4) >> 4); + float im = (float)((char)(( (unsigned char)(input) & (unsigned char)(240))) >> 4); + + if(re != 0 || im != 0) printf("K val[%d] = (%f,%f)\n", i, re, im); +} + +// KERNELS +// DMH: Abstract hardcoded launch parameters +__global__ void transpose_input_beamformer(double *idata, double *odata) { + + __shared__ double tile[16][17][4]; + + int x = blockIdx.x * 16 + threadIdx.x; + int y = blockIdx.y * 16 + threadIdx.y; + int width = gridDim.x * 16; + + for (int j = 0; j < 16; j += 8) { + tile[threadIdx.y+j][threadIdx.x][0] = idata[4*((y+j)*width + x)]; + tile[threadIdx.y+j][threadIdx.x][1] = idata[4*((y+j)*width + x)+1]; + tile[threadIdx.y+j][threadIdx.x][2] = idata[4*((y+j)*width + x)+2]; + tile[threadIdx.y+j][threadIdx.x][3] = idata[4*((y+j)*width + x)+3]; + } + + __syncthreads(); + + x = blockIdx.y * 16 + threadIdx.x; // transpose block offset + y = blockIdx.x * 16 + threadIdx.y; + width = gridDim.y * 16; + + for (int j = 0; j < 16; j += 8) { + odata[4*((y+j)*width + x)] = tile[threadIdx.x][threadIdx.y + j][0]; + odata[4*((y+j)*width + x)+1] = tile[threadIdx.x][threadIdx.y + j][1]; + odata[4*((y+j)*width + x)+2] = tile[threadIdx.x][threadIdx.y + j][2]; + odata[4*((y+j)*width + x)+3] = tile[threadIdx.x][threadIdx.y + j][3]; + } +} + +// kernel to help with reordering output +// outr and outi are [NANTS, NANTS, NCHAN_PER_PACKET, 2time, 2pol, halfFac] +// run with NCHAN_PER_PACKET*2*NBASE/128 blocks of 128 threads +__global__ void corr_output_copy(half *outr, half *outi, float *output, int *indices_lookup) { + + int bidx = blockIdx.x; // assume NCHAN_PER_PACKET*2*NBASE/128 + int tidx = threadIdx.x; // assume 128 + int idx = blockDim.x * bidx + tidx; + + int baseline = (int)(idx / (NCHAN_PER_PACKET * 2)); + int chpol = (int)(idx % (NCHAN_PER_PACKET * 2)); + int ch = (int)(chpol / 2); + int base_idx = indices_lookup[baseline]; + int iidx = base_idx * NCHAN_PER_PACKET + ch; + int pol = (int)(chpol % 2); + + float v1=0., v2=0.; + + //if(idx<1) printf("output pre (%f, %f)\n", output[2*idx], output[2*idx+1]); + + // Use CUDA casting intrinsic __half2float + for (int i=0;i __global__ void transpose_matrix(in_prec * idata, out_prec * odata) { + + __shared__ in_prec tile[32][33]; 
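+  // The tile is padded to 33 columns so that the transposed (column-wise) reads
+  // below land in different shared-memory banks -- the usual padding trick to
+  // avoid shared-memory bank conflicts.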
+ + int x = blockIdx.x * 32 + threadIdx.x; + int y = blockIdx.y * 32 + threadIdx.y; + int width = gridDim.x * 32; + + for (int j = 0; j < 32; j += 8) { + tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; + //inspectPackedDataInKernel(idata[(y+j)*width + x], (y+j)*width + x); + } + + __syncthreads(); + + x = blockIdx.y * 32 + threadIdx.x; // transpose block offset + y = blockIdx.x * 32 + threadIdx.y; + width = gridDim.y * 32; + + for (int j = 0; j < 32; j += 8) { + odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; + //inspectPackedDataInKernel(odata[(y+j)*width + x], (y+j)*width + x); + } +} + +// transpose kernel +// assume breakdown into tiles of 32x32, and run with 32x8 threads per block +// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32) +// here, width is the dimension of the fastest index +__global__ void transpose_matrix_float(half * idata, half * odata) { + + __shared__ float tile[32][33]; + + int x = blockIdx.x * 32 + threadIdx.x; + int y = blockIdx.y * 32 + threadIdx.y; + int width = gridDim.x * 32; + + for (int j = 0; j < 32; j += 8) { + tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; + //printf("K transpose_matrix_float_in[%d] = %f\n", (y+j)*width + x, __half2float(idata[(y+j)*width + x])); + } + + __syncthreads(); + + x = blockIdx.y * 32 + threadIdx.x; // transpose block offset + y = blockIdx.x * 32 + threadIdx.y; + width = gridDim.y * 32; + + for (int j = 0; j < 32; j += 8) { + odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; + //printf("K transpose_matrix_float_out[%d] = %f\n", (y+j)*width + x, __half2float(odata[(y+j)*width + x])); + } +} + + +// DMH: TUNABLE +// transpose kernel +// assume breakdown into tiles of 32x32, and run with 32x8 threads per block +// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32) +// here, width is the dimension of the fastest index +__global__ void transpose_matrix_char(char * idata, char * odata) { + + __shared__ char tile[32][33]; + //extern __shared__ char tile[]; + + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.x + threadIdx.y; + int width = gridDim.x * blockDim.x; + + for (int j = 0; j < blockDim.x; j += blockDim.y) { + tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; + //tile[(threadIdx.y+j)*blockDim.x + threadIdx.x] = idata[(y+j)*width + x]; + //inspectPackedDataInKernel(idata[(y+j)*width + x], (y+j)*width + x); + } + + __syncthreads(); + + x = blockIdx.y * blockDim.x + threadIdx.x; // transpose block offset + y = blockIdx.x * blockDim.x + threadIdx.y; + width = gridDim.y * blockDim.x; + + for (int j = 0; j < blockDim.x; j += blockDim.y) { + odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; + //odata[(y+j)*width + x] = tile[threadIdx.x + blockDim.x*(threadIdx.y + j)]; + //inspectPackedDataInKernel(idata[(y+j)*width + x], (y+j)*width + x); + } +} + + +/** + * Promote complex char riri... data to planar half rr.. ii.. + * + * @param[out] inr Half precision real array + * @param[out] ini Half precision imag array + * @param[in] input Char precision complex array + */ +__global__ void promoteComplexCharToPlanarHalf(char *input, half *inr, half *ini) { + + int bidx = blockIdx.x; + int tidx = threadIdx.x; + int iidx = blockDim.x * bidx + tidx; + + // 15 in unsigned char binary is 00001111. Perform bitwise & on 15 and input char data iiiirrrr + // to get real part 4 bit data. + // 0000rrrr + // Bit shift this result by 4 to the left. + // rrrr0000 + // Cast to signed char. 
+ // +-rrr0000 + // Bitshift mantisa only to the right by 4 bits + // +-0000rrr + // Cast to float and use CUDA intrinsic to cast to signed half + inr[iidx] = __float2half((float)((char)(( (unsigned char)(input[iidx]) & (unsigned char)(15) ) << 4) >> 4)); + + // 240 in unsigned char binary is 11110000. Perform bitwise & on 240 and input char data iiiirrrr + // to get imag part 4 bit data + // iiii0000. + // Cast to signed char + // +-iii0000 + // Bitshift mantisa only to the right by 4 bits + // +-0000iii + // Cast to float and use CUDA intrinsic to cast to signed half + ini[iidx] = __float2half((float)((char)(( (unsigned char)(input[iidx]) & (unsigned char)(240) )) >> 4)); + + //good + //if(__half2float(inr[iidx]) != 0 || __half2float(ini[iidx]) != 0) printf("corr_input_copy %i = (%f,%f)\n", iidx, __half2float(inr[iidx]), __half2float(ini[iidx])); +} + +// kernel to populate an instance of weights matrix +// [2, (NCHAN_PER_PACKET/8), NBEAMS/2, 4times*(NANTS/2)*8chan*2tim*2pol] +// run with 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128 blocks of 128 threads +// TUNABLE +__global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs) { + + int bidx = blockIdx.x; + int tidx = threadIdx.x; + int inidx = 128 * bidx + tidx; + + // 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2) + + // get indices + int iArm = (int)(inidx / ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2))); + int iidx = (int)(inidx % ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2))); + int fq = (int)(iidx / (128*(NANTS/2)*(NBEAMS/2))); + int idx = (int)(iidx % (128*(NANTS/2)*(NBEAMS/2))); + int bm = (int)(idx / (128*(NANTS/2))); + int tactp = (int)(idx % (128*(NANTS/2))); + //int t = (int)(tactp / (32*(NANTS/2))); + int actp = (int)(tactp % (32*(NANTS/2))); + int a = (int)(actp / 32); + int ctp = (int)(actp % 32); + //int c = (int)(ctp / 4); + int tp = (int)(ctp % 4); + //int t2 = (int)(tp / 2); + int pol = (int)(tp % 2); + int widx = (a+48*iArm)*(NCHAN_PER_PACKET/8)*2*2 + fq*2*2 + pol*2; + + // calculate weights + float theta, afac, twr, twi; + if (iArm==0) { + theta = sep*(127.-bm*1.)*PI/10800.; // radians + afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate + twr = cos(afac*antpos_e[a+48*iArm]); + twi = sin(afac*antpos_e[a+48*iArm]); + wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1])); + wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1])); + //wr[inidx] = __float2half(calibs[widx]); + //wi[inidx] = __float2half(calibs[widx+1]); + } + if (iArm==1) { + theta = sep*(127.-bm*1.)*PI/10800.; // radians + afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate + twr = cos(afac*antpos_n[a+48*iArm]); + twi = sin(afac*antpos_n[a+48*iArm]); + wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1])); + wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1])); + //wr[inidx] = __float2half(calibs[widx]); + //wi[inidx] = __float2half(calibs[widx+1]); + } +} + +// kernel to fluff input bf data +// run with NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 blocks of 128 threads +__global__ void fluff_input_beamformer(char * input, half * dr, half * di) { + + int bidx = blockIdx.x; + int tidx = threadIdx.x; + int idx = blockDim.x * bidx + tidx; + + dr[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(15)) << 4) >> 4))); + di[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(240))) >> 4))); + + // Both results should be half (FP16) integers 
between -8 and 7. + //half re = dr[idx]; + //half im = di[idx]; + //half lim = 0; + //if( (re > lim || re < -lim) || (im > lim || im < -lim)) { + //printf("re = %f, im = %f\n", __half2float(re), __half2float(im)); + //} +} + +// transpose, add and scale kernel for bf +// assume breakdown into tiles of 16x16, and run with 16x8 threads per block +// launch with dim3 dimBlock(16, 8) and dim3 dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16) +// scf is a per-beam scale factor to enable recasting as unsigned char +__global__ void transpose_scale_beamformer(half * ir, half * ii, unsigned char * odata) { + + __shared__ float tile[16][17]; + + int x = blockIdx.x * 16 + threadIdx.x; + int y = blockIdx.y * 16 + threadIdx.y; + int width = gridDim.x * 16; + float dr, di; + + for (int j = 0; j < 16; j += 8) { + dr = (float)(ir[(y+j)*width + x]); + di = (float)(ii[(y+j)*width + x]); + tile[threadIdx.y+j][threadIdx.x] = (dr*dr+di*di); + } + + __syncthreads(); + + x = blockIdx.y * 16 + threadIdx.x; // transpose block offset + y = blockIdx.x * 16 + threadIdx.y; + width = gridDim.y * 16; + + for (int j = 0; j < 16; j += 8) + odata[(y+j)*width + x] = (unsigned char)(tile[threadIdx.x][threadIdx.y + j]/128.); + +} + +// sum over all times in output beam array +// run with (NCHAN_PER_PACKET/8)*(NBEAMS/2) blocks of (NPACKETS_PER_BLOCK/4) threads +__global__ void sum_beam(unsigned char *input, float *output) { + + __shared__ float summ[512]; + int bidx = blockIdx.x; + int tidx = threadIdx.x; + //int idx = bidx*256+tidx; + int bm = (int)(bidx/48); + int ch = (int)(bidx % 48); + + summ[tidx] = (float)(input[bm*256*48 + tidx*48 + ch]); + + __syncthreads(); + + if (tidx<256) { + summ[tidx] += summ[tidx+256]; + summ[tidx] += summ[tidx+128]; + summ[tidx] += summ[tidx+64]; + summ[tidx] += summ[tidx+32]; + summ[tidx] += summ[tidx+16]; + summ[tidx] += summ[tidx+8]; + summ[tidx] += summ[tidx+4]; + summ[tidx] += summ[tidx+2]; + summ[tidx] += summ[tidx+1]; + } + + if (tidx==0) output[bidx] = summ[tidx]; +} diff --git a/include/cutlass_interface.h b/include/cutlass_interface.h new file mode 100644 index 0000000..f95eeaa --- /dev/null +++ b/include/cutlass_interface.h @@ -0,0 +1,172 @@ +#pragma once + +#include +#include +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor_planar_complex.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/reference/device/gemm_planar_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/library/handle.h" + +using namespace cutlass; +using namespace gemm; +using namespace library; +using namespace layout; +using namespace reference; +using namespace device; + +// Result structure +struct Result { + + double runtime_ms; + double gflops; + Status status; + cudaError_t error; + bool passed; + + Result(double runtime_ms = 0, double gflops = 0, Status status = Status::kSuccess, cudaError_t error = cudaSuccess): + runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { } +}; + +// Command line options parsing (testing) +struct Options { + + bool help; + GemmCoord problem_size; + int batch_count; + complex alpha; + complex beta; + bool reference_check; + int iterations; + + Options(): + help(false), + problem_size({1024, 1024, 1024}), + batch_count(256), + reference_check(false), + 
iterations(2), + alpha(1), + beta(0) { } + + // Parses the command line + void parse(int argc, char const **args) { + + CommandLine cmd(argc, args); + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + cmd.get_cmd_line_argument("m", problem_size.m()); + cmd.get_cmd_line_argument("n", problem_size.n()); + cmd.get_cmd_line_argument("k", problem_size.k()); + cmd.get_cmd_line_argument("batch", batch_count); + + cmd.get_cmd_line_argument("alpha", alpha.real()); + cmd.get_cmd_line_argument("alpha_i", alpha.imag()); + cmd.get_cmd_line_argument("beta", beta.real()); + cmd.get_cmd_line_argument("beta_i", beta.imag()); + + cmd.get_cmd_line_argument("iterations", iterations); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "dsaX_cutlass_interface\n\n" + << " This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --m= GEMM M dimension\n" + << " --n= GEMM N dimension\n" + << " --k= GEMM K dimension\n" + << " --batch= Number of GEMM operations executed in one batch\n" + << " --alpha= Epilogue scalar alpha (real part)\n" + << " --alpha_i= Epilogue scalar alpha (imaginary part)\n" + << " --beta= Epilogue scalar beta (real part)\n\n" + << " --beta_i= Epilogue scalar beta (imaginary part)\n\n" + << " --iterations= Number of profiling iterations to perform.\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of real-valued multiply-adds + int64_t fmas = problem_size.product() * batch_count * 4; + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +/// Performance test environment for planar complex +class DSA_FTD_ComplexGEMM_CUTLASS { + + // Half-precision input and output + using Element = half_t; + + // Configurations for layouts and internal computation + using LayoutA = ColumnMajor; + using LayoutB = ColumnMajor; + using LayoutC = ColumnMajor; + using ElementCompute = float; + using ElementAccumulator = float; + + Handle handle; + + GemmCoord problem_size; + int batch_count; + DeviceAllocation tensor_A; + DeviceAllocation tensor_B; + DeviceAllocation tensor_C; + DeviceAllocation tensor_D; + DeviceAllocation tensor_D_ref; + + DeviceAllocation ptr_A_real; + DeviceAllocation ptr_A_imag; + DeviceAllocation ptr_B_real; + DeviceAllocation ptr_B_imag; + DeviceAllocation ptr_C_real; + DeviceAllocation ptr_C_imag; + DeviceAllocation ptr_D_real; + DeviceAllocation ptr_D_imag; + + Element *ptr_A; + Element *ptr_B; + Element *ptr_C; + Element *ptr_D; + + int64_t batch_stride_A; + int64_t batch_stride_B; + int64_t batch_stride_C; + int64_t batch_stride_D; + + typename LayoutA::Stride::Index lda; + typename LayoutB::Stride::Index ldb; + typename LayoutC::Stride::Index ldc; + typename LayoutC::Stride::Index ldd; + + int64_t imag_stride_A; + int64_t imag_stride_B; + int64_t imag_stride_C; + int64_t imag_stride_D; + +public: + // Constructors + DSA_FTD_ComplexGEMM_CUTLASS(Options const &options); + DSA_FTD_ComplexGEMM_CUTLASS(); + + // Methods + void initialize(); + Result run(Options const &options); + + bool testing; +}; + diff --git a/include/dsaX.h b/include/dsaX.h new file mode 100644 index 0000000..f370bc0 --- /dev/null +++ b/include/dsaX.h @@ -0,0 +1,50 @@ +#pragma once + +// Expose the use to compile time definitions, +// enums, parameters, and classes +#include "dsaX_def.h" +#include "enums.h" +#include 
"params.h" +#include "fast_time_domain.h" + +// Use manual transpose route +// Uncomment to try new pure cuBLAS +//#define OLD_BLAS + +/** + * Initialize the library. This function will initialise + * a device if using CUDA and any BLAS libraries that are + * enabled, such as cublas. + * @param[in] device_ordinal The GPU device to init + */ +void dsaXInit(int device_ordinal = -1); + +/** + * Finalize the library. This function will finalize + * a device if using CUDA and any BLAS libraries that are + * enabled, such as cublas. It will also dump any statistics + * collected, such as performance metrics. + */ +void dsaXEnd(); + +/** + * This function will allocate pinned device memory of the + * given size in bytes, and return a void pointer to that + * memory. The user may delete the memory safely in their + * application code. + * @param[in] size The byte size of pinned memory to be allocated + * by dsaX. + */ +void *dsaXHostRegister(size_t size); + +/** + * This function allows the user to inspect the (4b,4b) char sized + * complex data at byte address i on the host. If 'non-zero' is true + * then the complex element will print only if either the real + * or imaginary element is non-zero. Useful for checking if + * an array is populated. + * @param[in] input The (4b,4b) char input array + * @param[in] i The ith element of the array + * @param[in] non-zero If true, print only elements with non-zero values + */ +void inspectPackedData(char input, int i, bool non_zero = false); diff --git a/include/dsaX_api.h b/include/dsaX_api.h new file mode 100644 index 0000000..3767600 --- /dev/null +++ b/include/dsaX_api.h @@ -0,0 +1,36 @@ +#pragma once + +#include + +#include "enums.h" + +#define STRINGIFY__(x) #x +#define __STRINGIFY__(x) STRINGIFY__(x) + +/** + @brief Wrapper around cudaMemcpy or driver API equivalent + @param[out] dst Destination pointer + @param[in] src Source pointer + @param[in] count Size of transfer + @param[in] kind Type of memory copy +*/ +void dsaXMemcpy_(void *dst, const void *src, size_t count, dsaXMemcpyKind kind, const char *func, const char *file, + const char *line); + +/** + @brief Wrapper around cudaMemcpyAsync or driver API equivalent + @param[out] dst Destination pointer + @param[in] src Source pointer + @param[in] count Size of transfer + @param[in] kind Type of memory copy + @param[in] stream Stream to issue copy +*/ +void dsaXMemcpyAsync_(void *dst, const void *src, size_t count, dsaXMemcpyKind kind, const cudaStream_t &stream, + const char *func, const char *file, const char *line); + + +#define dsaXMemcpy(dst, src, count, kind) \ + ::dsaXMemcpy_(dst, src, count, kind, __func__, file_name(__FILE__), __STRINGIFY__(__LINE__)) + +#define dsaXMemcpyAsync(dst, src, count, kind, stream) \ + ::dsaXMemcpyAsync_(dst, src, count, kind, stream, __func__, file_name(__FILE__), __STRINGIFY__(__LINE__)) diff --git a/src/dsaX_capture.h b/include/dsaX_capture.h similarity index 100% rename from src/dsaX_capture.h rename to include/dsaX_capture.h diff --git a/src/dsaX_capture_manythread.h b/include/dsaX_capture_manythread.h similarity index 100% rename from src/dsaX_capture_manythread.h rename to include/dsaX_capture_manythread.h diff --git a/src/dsaX_capture_pcap.h b/include/dsaX_capture_pcap.h similarity index 100% rename from src/dsaX_capture_pcap.h rename to include/dsaX_capture_pcap.h diff --git a/include/dsaX_def.h b/include/dsaX_def.h new file mode 100644 index 0000000..5b3af78 --- /dev/null +++ b/include/dsaX_def.h @@ -0,0 +1,100 @@ +#pragma once + +// default dada block 
keys +#define TEST_BLOCK_KEY 0x0000aada // for capture program. +// 128*3*384*32*2=9437184 for 1 CHANG 1 SNAP 1 REORDER +// 128*3*384*32*2*4=37748736 for 4 CHANG 1 SNAP 1 REORDER +// 128*3*384*32*2*8=75497472 for 1 CHANG 1 SNAP 8 REORDER +#define CAPTURE_BLOCK_KEY 0x0000dada // for capture program. +// 128*3*384*32*2=9437184 for 1 CHANG 1 SNAP 1 REORDER +// 150994944 for doSnap +#define REORDER_BLOCK_KEY 0x0000eada // for reorder program. +// 589824 for doSnap +#define REORDER_BLOCK_KEY2 0x0000bada // for reorder program 2. +// 128*32*1536*16*2*2=402653184 1 REORDER +// 3221225472 for 8 REORDERS +#define XGPU_BLOCK_KEY 0x0000fada // for xgpu program. +// 136*1536*2*8=3342336 +#define COPY_BLOCK_KEY 0x0000dbda // for split off data +#define BF_BLOCK_KEY 0x0000dcda // for beamformed data +#define BF_BLOCK_KEY2 0x0000bcda // for beamformed data testing +#define CAPTURED_BLOCK_KEY 0x0000abda // for capture program. +#define BEAMCAPTURE_BLOCK_KEY 0x0000bbda // for capture bf program. + +// constants +#define PI 3.14159265359 +#define CVAC 299792458.0 + +// default number of XGPU ints +#define NCORRINTS 128 +#define NNATINTS 32 // native number of integrations +#define NREORDERS 1 // number of ints per reorder + +// size of xgpu output +// TODO +#define XGPU_SIZE 835584 // size of single output vector (post-GPU) +#define XGPU_IN_INC 1 // size of input increment +#define NBASE 4656 // nant*(nant+1)/2 +#define NPOL 2 +#define NCOMPLEX 2 // two reals per complex +#define NCHAN 1536 // regardless of NCHANG + +// default port for packet capture +#define CAPTURE_PORT 4011 + +// default UDP packet dims +#define UDP_HEADER 8 // size of header/sequence number +#define UDP_DATA 4608 // obs bytes per packet +#define UDP_PAYLOAD 4616 // header + datasize + +// number of channel groups to expect +#define NCHANG 1 + +// number of SNAPs to expect +#define NSNAPS 32 + +/* expect consecutive channel groups */ +#define CHOFF 1024 // offset in channels of first group + +// default control ports +#define CAPTURE_CONTROL_PORT 11223 +#define REORDER_CONTROL_PORT 11224 +#define XGPU_CONTROL_PORT 11225 +#define WRITEVIS_CONTROL_PORT 11226 +#define TRIGGER_CONTROL_PORT 11227 + +#define NPACKETS_PER_CALL 2048 +#define NPACKETS_PER_BLOCK 2048 +#define NPACKETS_INTS 2048 // number of packets per xgpu int +#define NPACKETS_PER_FIL 2 +#define NPACKETS 2048 +#define NOUTBLOCKS 15 // number of input blocks stored by trigger +#define NANTS 96 +#define NCHAN_PER_PACKET 384 +#define NBEAMS 512 + +// for beamformer +//#define sep 1.0 // arcmin +#define NW 48 // number of weights per 384 chans. 
Also the number of channels formed +#define NANT 63 +#define BEAM_OUT 23 +#define NSTREAMS 4 +#define NBP 8 // number of previous BPs to average + +// for second corner turn +#define FIL_PORT0 6625 // port for first chan group +#define NCLIENTS 16 // number of client dbnic processes to expect +#define NSAMPS_PER_BLOCK 16384 // number of samples per block +#define NCHAN_FIL 1024 // final number of filterband chans +#define NBEAMS_PER_BLOCK 64 // number of beams to expect +#define NSAMPS_PER_TRANSMIT 512 // number of samples transmitted at one time +#define NBMS 256 +#define P_SIZE 4108 +#define NWAIT 100000 + +// required to prevent overflow in corr matrix multiply +#define halfFac 4 + +// beam sep +#define sep 1.0 // arcmin + diff --git a/include/dsaX_malloc.h b/include/dsaX_malloc.h new file mode 100644 index 0000000..04d24b0 --- /dev/null +++ b/include/dsaX_malloc.h @@ -0,0 +1,113 @@ +#pragma once + +#include +#include +#include // for getpagesize() +#include // for backtrace +#include // for std::map + +#include +#include + + +using namespace std; + +// strip path from __FILE__ +// DMH: Place somewhere more sensible when working +constexpr const char *str_end(const char *str) { return *str ? str_end(str + 1) : str; } +constexpr bool str_slant(const char *str) { return *str == '/' ? true : (*str ? str_slant(str + 1) : false); } +constexpr const char *r_slant(const char *str) { return *str == '/' ? (str + 1) : r_slant(str - 1); } +constexpr const char *file_name(const char *str) { return str_slant(str) ? r_slant(str_end(str)) : str; } + +// Define wrappers around function. May wish to place _ +// methods in a dsaX namespace later +void *pinned_malloc_(const char *func, const char *file, int line, size_t size); +#define pinned_malloc(size) pinned_malloc_(__func__, file_name(__FILE__), __LINE__, size) + +void *device_malloc_(const char *func, const char *file, int line, size_t size); +#define device_malloc(size) device_malloc_(__func__, file_name(__FILE__), __LINE__, size) + +void *device_pinned_malloc_(const char *func, const char *file, int line, size_t size); +#define device_pinned_malloc(size) device_pinned_malloc_(__func__, file_name(__FILE__), __LINE__, size) + +void *safe_malloc_(const char *func, const char *file, int line, size_t size); +#define safe_malloc(size) safe_malloc_(__func__, file_name(__FILE__), __LINE__, size) + +void *mapped_malloc_(const char *func, const char *file, int line, size_t size); +#define mapped_malloc(size) mapped_malloc_(__func__, file_name(__FILE__), __LINE__, size) + +void *managed_malloc_(const char *func, const char *file, int line, size_t size); +#define managed_malloc(size) managed_malloc_(__func__, file_name(__FILE__), __LINE__, size) + +void managed_free_(const char *func, const char *file, int line, void *ptr); +#define managed_free(ptr) managed_free_(__func__, file_name(__FILE__), __LINE__, ptr) + +void device_free_(const char *func, const char *file, int line, void *ptr); +#define device_free(ptr) device_free_(__func__, file_name(__FILE__), __LINE__, ptr) + +void device_pinned_free_(const char *func, const char *file, int line, void *ptr); +#define device_pinned_free(ptr) device_pinned_free_(__func__, file_name(__FILE__), __LINE__, ptr) + +void host_free_(const char *func, const char *file, int line, void *ptr); +#define host_free(ptr) host_free_(__func__, file_name(__FILE__), __LINE__, ptr) + +/* + @brief Get device view of a host-mapped pointer +*/ +void *get_mapped_device_pointer_(const char *func, const char *file, int line, const void 
*ptr); +#define get_mapped_device_pointer(ptr) get_mapped_device_pointer_(__func__, file_name(__FILE__), __LINE__, ptr) + +// Create a mem_pool namespace to differentiate +// bewtween regular memory management methods +// and those utilising memory pooling +namespace mem_pool { + + /** + @brief Initialize the memory pool allocator + */ + void init(); + + /** + @brief Allocate device-memory. If free pre-existing allocation exists + reuse this. + @param size Size of allocation + @return Pointer to allocated memory + */ + void *device_malloc_(const char *func, const char *file, int line, size_t size); + + /** + @brief Virtual free of pinned-memory allocation. + @param ptr Pointer to be (virtually) freed + */ + void device_free_(const char *func, const char *file, int line, void *ptr); + + /** + @brief Allocate pinned-memory. + If a free pre-existing allocation exists, reuse this. + @param size Size of allocation + @return Pointer to allocated memory + */ + void *pinned_malloc_(const char *func, const char *file, int line, size_t size); + + /** + @brief Virtual free of pinned-memory allocation. + @param ptr Pointer to be (virtually) freed + */ + void pinned_free_(const char *func, const char *file, int line, void *ptr); + + /** + @brief Free all outstanding device-memory allocations. + */ + void flush_device(); + + /** + @brief Free all outstanding pinned-memory allocations. + */ + void flush_pinned(); +} + +#define pool_device_malloc(size) mem_pool::device_malloc_(__func__, __FILE__, __LINE__, size) +#define pool_device_free(ptr) mem_pool::device_free_(__func__, __FILE__, __LINE__, ptr) +#define pool_pinned_malloc(size) mem_pool::pinned_malloc_(__func__, __FILE__, __LINE__, size) +#define pool_pinned_free(ptr) mem_pool::pinned_free_(__func__, __FILE__, __LINE__, ptr) + diff --git a/include/dsaX_ptr.h b/include/dsaX_ptr.h new file mode 100644 index 0000000..de452f0 --- /dev/null +++ b/include/dsaX_ptr.h @@ -0,0 +1,102 @@ +#pragma once + +#include +#include "dsaX_malloc.h" + +/** + Object that stores a memory allocation with different views for + host or device. 
Depending on the nature of the underlying memory + type, both views may not be defined + + type defined views + DSAX_MEMORY_DEVICE device only + DSAX_MEMORY_DEVICE_PINNED device only + DSAX_MEMORY_HOST host only + DSAX_MEMORY_HOST_PINNED both + DSAX_MEMORY_MAPPED both (pinned to host) + DSAX_MEMORY_MANAGED both +*/ +class dsaX_ptr +{ + friend std::ostream &operator<<(std::ostream &output, const dsaX_ptr &ptr); + dsaXMemoryType type = DSA_MEMORY_INVALID; /** Memory type of the allocation */ + size_t size = 0; /** Size of the allocation */ + bool pool = false; /** Is the allocation is pooled */ + void *device = nullptr; /** Device-view of the allocation */ + void *host = nullptr; /** Host-view of the allocation */ + bool reference = false; /** Is this a reference to another allocation */ + + /** + @brief Internal deallocation routine + */ + void destroy(); + +public: + dsaX_ptr() = default; + dsaX_ptr(dsaX_ptr &&) = default; + dsaX_ptr &operator=(dsaX_ptr &&); + dsaX_ptr(const dsaX_ptr &) = delete; + dsaX_ptr &operator=(const dsaX_ptr &) = delete; + + /** + @brief Constructor for dsaX_ptr + @param[in] type The memory type of the allocation + @param[in] size The size of the allocation + @param[in] pool Whether the allocation should be in the memory pool (default is true) + */ + dsaX_ptr(dsaXMemoryType type, size_t size, bool pool = true); + + /** + @brief Constructor for dsaX_ptr where we are wrapping a non-owned pointer + @param[in] ptr Raw base pointer + @param[in] type The memory type of the allocation + */ + dsaX_ptr(void *ptr, dsaXMemoryType type); + + /** + @brief Destructor for the dsaX_ptr + */ + virtual ~dsaX_ptr(); + + /** + @brief Specialized exchange function to use in place of + std::exchange when exchanging dsaX_ptr objects: moves obj to + *this, and moves new_value to obj + @param[in,out] obj + @param[in] new_value New value for obj to take + */ + void exchange(dsaX_ptr &obj, dsaX_ptr &&new_value); + + /** + @return Returns true if allocation is visible to the device + */ + bool is_device() const; + + /** + @return Returns true if allocation is visible to the host + */ + bool is_host() const; + + /** + Return view of the pointer. For mapped memory we return the device view. 
+ */ + void *data() const; + + /** + Return the device view of the pointer + */ + void *data_device() const; + + /** + Return the host view of the pointer + */ + void *data_host() const; + + /** + Return if the instance is a reference rather than an allocation + */ + bool is_reference() const; +}; + +std::ostream &operator<<(std::ostream &output, const dsaX_ptr &ptr); + diff --git a/include/enums.h b/include/enums.h new file mode 100644 index 0000000..aa86573 --- /dev/null +++ b/include/enums.h @@ -0,0 +1,76 @@ +#pragma once + +#define DSA_INVALID_ENUM (-0x7fffffff - 1) + +typedef enum dsaXMemoryType_s { + DSA_MEMORY_DEVICE, + DSA_MEMORY_DEVICE_PINNED, + DSA_MEMORY_HOST, + DSA_MEMORY_HOST_PINNED, + DSA_MEMORY_MAPPED, + DSA_MEMORY_MANAGED, + DSA_MEMORY_INVALID = DSA_INVALID_ENUM +} dsaXMemoryType; + +typedef enum dsaXError_t { + DSA_SUCCESS = 0, + DSA_ERROR = 1, + DSA_ERROR_UNINITIALIZED = 2, + DSA_ERROR_INVALID = DSA_INVALID_ENUM +} dsaXError; + +typedef enum dsaXBLASOperation_s { + DSA_BLAS_OP_N = 0, // No transpose + DSA_BLAS_OP_T = 1, // Transpose only + DSA_BLAS_OP_A = 2, // Adjoint imaginary, no transpose + DSA_BLAS_OP_C = 3, // Conjugate transpose + DSA_BLAS_OP_INVALID = DSA_INVALID_ENUM +} dsaXBLASOperation; + +typedef enum dsaXBLASType_s { + DSA_BLAS_GEMM = 0, + DSA_BLAS_INVALID = DSA_INVALID_ENUM +} dsaXBLASType; + +typedef enum dsaXBLASLib_s { + DSA_BLAS_LIB_CUBLAS = 0, + DSA_BLAS_LIB_MAGMA = 1, + DSA_BLAS_LIB_CUTLASS = 2, + DSA_BLAS_LIB_TCC = 3, + DSA_BLAS_LIB_OPENBLAS = 4, + DSA_BLAS_LIB_NATIVE = 5, + DSA_BLAS_LIB_INVALID = DSA_INVALID_ENUM +} dsaXBLASLib; + +typedef enum dsaXBLASDataType_s { + DSA_BLAS_DATATYPE_H = 0, // Half + DSA_BLAS_DATATYPE_S = 1, // Single + DSA_BLAS_DATATYPE_D = 2, // Double + DSA_BLAS_DATATYPE_HC = 3, // Complex(half) + DSA_BLAS_DATATYPE_C = 4, // Complex(single) + DSA_BLAS_DATATYPE_Z = 5, // Complex(double) + DSA_BLAS_DATATYPE_4b_REAL = 6, // 4b sized real + DSA_BLAS_DATATYPE_2b_REAL = 7, // 2b sized real + DSA_BLAS_DATATYPE_4b_COMPLEX = 8, // Char sized complex (4b,4b) + DSA_BLAS_DATATYPE_2b_COMPLEX = 9, // 4b sized (2b,2b) + DSA_BLAS_DATATYPE_INVALID = DSA_INVALID_ENUM +} dsaXBLASDataType; + +typedef enum dsaXBLASDataOrder_s { + DSA_BLAS_DATAORDER_ROW = 0, + DSA_BLAS_DATAORDER_COL = 1, + DSA_BLAS_DATAORDER_INVALID = DSA_INVALID_ENUM +} dsaXBLASDataOrder; + +typedef enum dsaXMemcpyKind_s { + dsaXMemcpyHostToHost = 0, + dsaXMemcpyHostToDevice = 1, + dsaXMemcpyDeviceToHost = 2, + dsaXMemcpyDeviceToDevice = 3, + dsaXMemcpyHostToHostAsync = 4, + dsaXMemcpyHostToDeviceAsync = 5, + dsaXMemcpyDeviceToHostAsync = 6, + dsaXMemcpyDeviceToDeviceAsync = 7, + dsaXMemcpyInvalid = DSA_INVALID_ENUM +} dsaXMemcpyKind; + diff --git a/include/fast_time_domain.h b/include/fast_time_domain.h new file mode 100644 index 0000000..98ce8ff --- /dev/null +++ b/include/fast_time_domain.h @@ -0,0 +1,145 @@ +#pragma once + +#include "enums.h" +#include "params.h" +#include "timer.h" + +using ms = std::chrono::microseconds; +using hrc = std::chrono::high_resolution_clock; + +// define structures that carry around memory pointers +// and metric. 
+// DMH: make a base and inherit into corr and bf +typedef struct corr_handle_s { + + // initial data and streams + char *h_input; // host input pointer + char *d_input, *d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] + + // DMH: fix me + void *d_idxs; + + // correlator pointers + // giant array for r and i: [NCHAN_PER_PACKET, 2 pol, NANTS_PROCESS, NPACKETS_PER_BLOCK *2 times] + void *d_r, *d_i; //half + // arrays for matrix multiply output: input [NANTS_PROCESS, NANTS_PROCESS] + void *d_outr, *d_outi, *d_tx_outr, *d_tx_outi; //half + // giant output array: [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex] + float *d_output; + + dsaXCorrParam corr_param; + + double device_compute_flops; + double host_compute_flops; + + double H2D_bytes; + double D2H_bytes; + double D2D_bytes; + double H2H_bytes; + + // See 'using' at top of file for ms, hrc + timer::Timer dev_compute_timer; + timer::Timer dev_malloc_timer; + timer::Timer dev_memset_timer; + + timer::Timer H2D_timer; + timer::Timer D2H_timer; + timer::Timer D2D_timer; + timer::Timer H2H_timer; + + timer::Timer host_compute_timer; + timer::Timer host_malloc_timer; + timer::Timer host_memset_timer; + timer::Timer host_copy_timer; + +} corr_handle; + +typedef struct bf_handle_s { + + // beamformer pointers + char *h_input; // host input pointer + char *d_input, *d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] + char *d_big_input; + void *d_br, *d_bi; //half + void *weights_r, *weights_i; //weights: [arm, tactp, b] //half + void *d_bigbeam_r, *d_bigbeam_i; //output: [tc, b] //half + unsigned char *d_bigpower; //output: [b, tc] + float *d_scf; // scale factor per beam + float *d_chscf; + float *h_winp; + int *flagants, nflags; + float *h_freqs, *d_freqs; + + // timing (old) + float cp, prep, cubl, outp; + + // See 'using' at top of file ms, hrc + timer::Timer dev_compute_timer; + timer::Timer dev_malloc_timer; + timer::Timer dev_memset_timer; + + timer::Timer H2D_timer; + timer::Timer D2H_timer; + + timer::Timer host_compute_timer; + timer::Timer host_malloc_timer; + timer::Timer host_memset_timer; + timer::Timer host_copy_timer; + +} bf_handle; + +// Deprecated function, remove after development +void dcorrelator(corr_handle *d); + +// Base class +class dsaXBase { + + private: + protected: + + public: + dsaXBase(); + ~dsaXBase(); + +}; + +class Correlator : public dsaXBase { + +private: +protected: + + corr_handle d; + dsaXCorrParam corr_param; + dsaXBLASParam blas_param; + + uint64_t flops; + +public: + + // Constructor + // Initialise device memory if CUDA enabled + // make host memory if CPU + Correlator(const dsaXCorrParam *corr_param); + + // Compute the FX correlator on input, + // place result in output. 
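+  //
+  // Illustrative usage sketch (not from the repo docs; assumes dsaXInit() has
+  // been called, that 'input'/'output' are user buffers with the layouts noted
+  // in corr_handle, and that the remaining dsaXCorrParam fields are set
+  // appropriately):
+  //
+  //   dsaXCorrParam param = newDsaXCorrParam();
+  //   param.n_streams = NSTREAMS;
+  //   Correlator corr(&param);
+  //   corr.compute(output, input);
+  //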
+ void compute(void *output, void *input); + + ~Correlator(); +}; + + +void initDsaXCorrDeviceMemory(corr_handle *d, unsigned int n_streams); +void destroyDsaXCorrDeviceMemory(corr_handle *d); +void promoteComplexCharToPlanarHalf(corr_handle *d, unsigned int n_streams); + +void initBLAS(); +void destroyBLAS(); + +void initStreams(unsigned int n); +void destroyStreams(); + +void computeIndices(corr_handle *d); +void reorderCorrelatorOutput(corr_handle *d, int stream); +void reorderCorrelatorInput(corr_handle *d, int stream); + diff --git a/include/interface.h b/include/interface.h new file mode 100644 index 0000000..96442d1 --- /dev/null +++ b/include/interface.h @@ -0,0 +1,23 @@ +#pragma once + +#include +#include "dsaX.h" + +// DMH: decorate these with Doxygen +void dsaXCorrelator(void *input_data, void *output_data); + +void reorderCorrInput(corr_handle *d, int stream = 0); + +void reorderCorrOutput(corr_handle *d, int stream = 0); + +void transposeInputBeamformer(double *input, double *output, std::vector &dimBlock, std::vector &dimGrid); + +void transposeScaleBeamformer(void *array_real, void *array_imag, unsigned char *output, std::vector &dimBlock, std::vector &dimGrid); + +void fluffInputBeamformer(char *input, void *array_real, void *array_imag, int blocks, int tpb); + +void sumBeam(unsigned char *input, float *output, int blocks, int tpb); + +void dsaXInitStream(unsigned int n_streams); + +//void *dsaXHostRegister(size_t size); diff --git a/include/magma_headers.h b/include/magma_headers.h new file mode 100644 index 0000000..e9750c8 --- /dev/null +++ b/include/magma_headers.h @@ -0,0 +1,5 @@ +#pragma once + +#if defined (DSA_XENGINE_ENABLE_MAGMA) +#include "magma_v2.h" +#endif diff --git a/include/magma_interface.h b/include/magma_interface.h new file mode 100644 index 0000000..12f0cc7 --- /dev/null +++ b/include/magma_interface.h @@ -0,0 +1,4 @@ +#pragma once +#include "dsaX.h" + +void dsaXHgemmStridedBatchedMagma(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param); diff --git a/include/params.h b/include/params.h new file mode 100644 index 0000000..08ff440 --- /dev/null +++ b/include/params.h @@ -0,0 +1,62 @@ +#pragma once + +#include + +#include "enums.h" + +// Structure that carries BLAS parameters +// This should be able to communicate to all +// backend choices of BLAS library +typedef struct dsaXBLASParam_s { + size_t struct_size; /**< Size of this struct in bytes. Used to ensure that the host application and DSA see the same struct*/ + + dsaXBLASType blas_type; /**< Type of BLAS computation to perform */ + + dsaXBLASLib blas_lib; /**< Which BLAS library to use for BLAS ops */ + + // GEMM params + dsaXBLASOperation trans_a; /**< operation op(A) that is non- or (conj.) transpose. */ + dsaXBLASOperation trans_b; /**< operation op(B) that is non- or (conj.) transpose. */ + int m; /**< number of rows of matrix op(A) and C. */ + int n; /**< number of columns of matrix op(B) and C. */ + int k; /**< number of columns of op(A) and rows of op(B). */ + int lda; /**< leading dimension of two-dimensional array used to store the matrix A. */ + int ldb; /**< leading dimension of two-dimensional array used to store matrix B. */ + int ldc; /**< leading dimension of two-dimensional array used to store matrix C. */ + long long int a_offset; /**< position of the A array from which begin read/write. */ + long long int b_offset; /**< position of the B array from which begin read/write. 
*/ + long long int c_offset; /**< position of the C array from which begin read/write. */ + long long int a_stride; /**< stride of the A array in strided(batched) mode */ + long long int b_stride; /**< stride of the B array in strided(batched) mode */ + long long int c_stride; /**< stride of the C array in strided(batched) mode */ + std::complex alpha; /**< scalar used for multiplication. */ + std::complex beta; /**< scalar used for multiplication. If beta==0, C does not have to be a valid input. */ + + // Common params + int batch_count; /**< number of pointers contained in arrayA, arrayB and arrayC. */ + dsaXBLASDataType data_type; /**< Specifies if using S(C) or D(Z) BLAS type */ + dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */ + +} dsaXBLASParam; + +// Structure that carries Correlator class parameters +typedef struct dsaXCorrParam_s { + size_t struct_size; /**< Size of this struct in bytes. Used to ensure that the host application and DSA see the same struct*/ + + dsaXBLASLib blas_lib; /**< Which BLAS library to use for BLAS ops */ + dsaXBLASDataType data_type; /**< Specifies if using S(C) or D(Z) BLAS type */ + dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */ + + int n_streams; /**< The number streams over which to compute input data */ + +} dsaXCorrParam; + +// Parameter struct helper functions for user +const char *getBLASLibString(dsaXBLASLib lib); +const char *getBLASDataTypeString(dsaXBLASDataType type); +const char *getBLASDataOrderString(dsaXBLASDataOrder order); +void printDsaXBLASParam(const dsaXBLASParam param); +void printDsaXCorrParam(const dsaXCorrParam param); + +// Create params +dsaXCorrParam newDsaXCorrParam(void); diff --git a/include/psrdada_utils.h b/include/psrdada_utils.h new file mode 100644 index 0000000..2b60bf3 --- /dev/null +++ b/include/psrdada_utils.h @@ -0,0 +1,16 @@ +#pragma once + +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "multilog.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_def.h" +#include "enums.h" + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); + +int dada_bind_thread_to_core(int core); diff --git a/include/timer.h b/include/timer.h new file mode 100644 index 0000000..6607d5d --- /dev/null +++ b/include/timer.h @@ -0,0 +1,85 @@ +// Copyright (C) 2023 by Mark Melton +// + +#pragma once +#include +#include + +namespace timer { + + template + inline void doNotOptimizeAway(const T& val) { + asm volatile("" : : "r,m"(val) : "memory"); + } + +#ifdef __clang__ + template + inline void doNotOptimizeAway(T& value) { + asm volatile("" : "+r,m"(value) : : "memory"); + } +#else + template + inline void doNotOptimizeAway(T& value) { + asm volatile("" : "+m,r"(value) : : "memory"); + } +#endif + + inline void doNotReorderBarrier() { + std::atomic_signal_fence(std::memory_order_acq_rel); + } + + /// The Timer class template implements a timer designed for minimal + /// overhead, ad-hoc timing of code regions including micro-timing + /// down to single machine instructions. + template + class Timer { + public: + using TimePoint = typename Clock::time_point; + + /// Run the supplied `code` in a loop `n` times. + template + Timer& run(size_t n, Code&& code) { + start(); + for (auto i = 0ul; i < n; ++i) { + code(); + } + stop(n); + return *this; + } + + /// Start the timer. + void start() { + start_ = Clock::now(); + } + + /// Stop the timer indicating `n` operations. 
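+    ///
+    /// Illustrative usage (a sketch; assumes the template parameters are
+    /// <Clock, Duration>, e.g. high_resolution_clock and microseconds as
+    /// aliased elsewhere in this repo, and do_work() is a hypothetical
+    /// workload):
+    ///
+    ///   timer::Timer<std::chrono::high_resolution_clock, std::chrono::microseconds> t;
+    ///   t.start();
+    ///   do_work();
+    ///   t.stop();   // or t.stop(n) after n operations
+    ///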
+ auto stop(size_t n = 1) { + auto end = Clock::now(); + iterations_ += n; + elapsed_ += std::chrono::duration_cast(end - start_); + return elapsed_; + } + + /// Return the average number of nanoseconds per operation. + auto elapsed_per_iteration() const { + return iterations_ > 0 ? (double)elapsed_.count() / iterations_ : 0.0; + } + + /// Return the elapsed duration. + auto elapsed() const { + return elapsed_; + } + + /// Return the iterations. + auto iterations() const { + return iterations_; + } + + private: + TimePoint start_{}; + Duration elapsed_{}; + size_t iterations_{}; + }; + +}; // timer diff --git a/include/utils.h b/include/utils.h new file mode 100644 index 0000000..96a7004 --- /dev/null +++ b/include/utils.h @@ -0,0 +1,10 @@ +#pragma once + +#include "params.h" +#include "timer.h" + +void dsaXmemset(void *array, int ch, size_t n); + +void dsaXmemcpy(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind, int stream = 0); + +void dsaXDeviceSynchronize(); diff --git a/legacy/10_planar_complex.cu b/legacy/10_planar_complex.cu new file mode 100644 index 0000000..9e0915d --- /dev/null +++ b/legacy/10_planar_complex.cu @@ -0,0 +1,567 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Planar Complex GEMM + + This example demonstrates the CUTLASS Library's exposure of planar complex GEMM kernels supporting + the batched strided mode. + + These kernels represent complex matrices by storing the real and imaginary parts of the matrix in + disjoint regions in memory. These real-valued matrices are stored using existing cuBLAS layouts + as either column-major or row-major layouts with a single leading dimension indicating the stride + between columns or rows. 
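+
+    For example, for each batched M-by-K A operand used below, the real plane
+    begins at ptr_A and the imaginary plane at ptr_A + imag_stride_A, where
+    imag_stride_A = M * K elements (see the profile() routine for how these
+    strides and leading dimensions are computed).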
+ + The CUTLASS Library collects multiple template instantiations in a data structure and offers + a BLAS-like dispatch API to invoke the appropriate kernel on the Volta or Turing architectures. + + CUTLASS decouples matrix layout from complex transformation, so four possible transformations + are possible on the A and B operands: + + n: column-major + c: column-major complex conjugate + t: row-major + h: row-major complex conjugate + + The CUTLASS Library contains many kernel instances specialized for architecture, data type, tile + size, and alignment. This can result in long compile times. + + To build strictly the planar complex kernels needed for general application, execute the following + CMake command in an empty build directory. + + $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \ + -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex + + This builds all planar complex GEMM variants for Volta and Turing architectures. + + To build strictly the kernels needed for this example, an even narrower filter string may be + specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for + the 'CN' layout configuration (conjugate A operand with both A and B as column-major). + + $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \ + -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_f16*cn + + $ make 10_planar_complex + + $ ./examples/10_planar_complex/10_planar_complex --m=2048 --n=1024 --k=512 --batch=10 +*/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor_planar_complex.h" + +#include "cutlass/util/reference/device/tensor_fill.h" + +#include "cutlass/util/reference/device/gemm_planar_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" + +#include "cutlass/library/handle.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Result structure +struct Result { + + double runtime_ms; + double gflops; + cutlass::Status status; + cudaError_t error; + bool passed; + + // + // Methods + // + + Result( + double runtime_ms = 0, + double gflops = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = cudaSuccess + ): + runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + + cutlass::gemm::GemmCoord problem_size; + int batch_count; + cutlass::complex alpha; + cutlass::complex beta; + + bool reference_check; + int iterations; + + Options(): + help(false), + problem_size({1024, 1024, 1024}), + batch_count(1), + reference_check(true), + iterations(20), + alpha(1), + beta() { } + + bool valid() { + return true; + } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + cmd.get_cmd_line_argument("m", problem_size.m()); + cmd.get_cmd_line_argument("n", problem_size.n()); + cmd.get_cmd_line_argument("k", problem_size.k()); + cmd.get_cmd_line_argument("batch", batch_count); + + cmd.get_cmd_line_argument("alpha", alpha.real()); + cmd.get_cmd_line_argument("alpha_i", alpha.imag()); + 
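+    // beta defaults to 0 (see the Options constructor), in which case the C
+    // operand does not need to hold valid data.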
cmd.get_cmd_line_argument("beta", beta.real()); + cmd.get_cmd_line_argument("beta_i", beta.imag()); + + cmd.get_cmd_line_argument("iterations", iterations); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "10_planar_complex example\n\n" + << " This example uses the CUTLASS Library to execute Planar Complex GEMM computations.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --m= GEMM M dimension\n" + << " --n= GEMM N dimension\n" + << " --k= GEMM K dimension\n" + << " --batch= Number of GEMM operations executed in one batch\n" + << " --alpha= Epilogue scalar alpha (real part)\n" + << " --alpha_i= Epilogue scalar alpha (imaginary part)\n" + << " --beta= Epilogue scalar beta (real part)\n\n" + << " --beta_i= Epilogue scalar beta (imaginary part)\n\n" + << " --iterations= Number of profiling iterations to perform.\n\n"; + + out << "\n\nExamples:\n\n" + << "$ ./examples/10_planar_complex/10_planar_complex --batch=7 --m=1024 --n=512 --k=1024 \\\n" + << " --alpha=2 --alpha_i=-2 --beta=0.707 --beta_i=-.707\n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of real-valued multiply-adds + int64_t fmas = problem_size.product() * batch_count * 4; + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Performance test environment for planar complex +class TestbedPlanarComplex { +public: + + using ElementA = cutlass::half_t; + using LayoutA = cutlass::layout::ColumnMajor; + using ElementB = cutlass::half_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = cutlass::half_t; + using LayoutC = cutlass::layout::ColumnMajor; + using ElementCompute = float; + using ElementAccumulator = float; + + // + // Data members + // + + cutlass::library::Handle handle; + + cutlass::gemm::GemmCoord problem_size; + int batch_count; + cutlass::DeviceAllocation tensor_A; + cutlass::DeviceAllocation tensor_B; + cutlass::DeviceAllocation tensor_C; + cutlass::DeviceAllocation tensor_D; + cutlass::DeviceAllocation tensor_D_ref; + + // + // Methods + // + + TestbedPlanarComplex( + Options const &options + ): + problem_size(options.problem_size), batch_count(options.batch_count) { + + // Allocate device memory for batched strided GEMM + tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2); + tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2); + tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + } + + void initialize() { + + uint64_t seed = 1073; + + // Use small integers to simplify correctness checking + int scope_max = 6; + int scope_min = -6; + + cutlass::reference::device::BlockFillRandomUniform( + tensor_A.get(), tensor_A.size(), seed, ElementA(scope_max), ElementA(scope_min), 0); + + cutlass::reference::device::BlockFillRandomUniform( + tensor_B.get(), tensor_B.size(), seed * 2019, ElementB(scope_max), ElementB(scope_min), 0); + + cutlass::reference::device::BlockFillRandomUniform( + tensor_C.get(), tensor_C.size(), seed * 2020, ElementC(scope_max), ElementC(scope_min), 0); + } + + Result profile(Options const &options) { + + Result 
result; + + initialize(); + + ElementA *ptr_A = tensor_A.get(); + ElementB *ptr_B = tensor_B.get(); + ElementC *ptr_C = tensor_C.get(); + ElementC *ptr_D = tensor_D.get(); + + int64_t batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2; + int64_t batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2; + int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2; + int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2; + + typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0); + typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0); + typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + + int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k(); + int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n(); + int64_t imag_stride_C = int64_t(problem_size.m()) * problem_size.n(); + int64_t imag_stride_D = int64_t(problem_size.m()) * problem_size.n(); + + // + // Construct events + // + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // Record an event at the start of a series of GEMMs + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // + // Run profiling loop + // + + for (int iter = 0; iter < options.iterations; ++iter) { + + // + // Execute the planar complex GEMM kernel via the CUTLASS Library's + // dispatch routines. + // + // Note, for planar complex GEMM kernels, all numeric type arguments + // specify the data type of the base real types. These are understood to + // apply to planar complex representations of matrices in memory and to complex + // structures for scalars. + // + // See tools/library/include/cutlass/library/handle.h for more details. 
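+      //
+      // With the A operand conjugated (ComplexTransform::kConjugate) and B left
+      // unchanged, each batched product evaluated here is
+      //   D = alpha * conj(A) * B + beta * C,
+      // which decomposes into real-valued GEMMs on the separate planes:
+      //   real(D) = real(A) * real(B) + imag(A) * imag(B)
+      //   imag(D) = real(A) * imag(B) - imag(A) * real(B)
+      // (alpha/beta scaling omitted in this sketch).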
+ // + + result.status = handle.gemm_planar_complex( + problem_size.m(), // GEMM M dimension + problem_size.n(), // GEMM N dimension + problem_size.k(), // GEMM K dimension + + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued accumulation + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued alpha/beta scalars + + &options.alpha, // Pointer to alpha scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued A matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of A matrix + cutlass::library::ComplexTransform::kConjugate, // Complex transformation on A matrix operand + ptr_A, // Pointer to real part of A matrix + ptr_A + imag_stride_A, // Pointer to imaginary part of A matrix + lda, // Leading dimension of real part of A matrix + lda, // Leading dimension of imaginary part of A matrix + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued B matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of B matrix + cutlass::library::ComplexTransform::kNone, // Complex transformation on B matrix operand + ptr_B, // Pointer to real part of B matrix + ptr_B + imag_stride_B, // Pointer to imaginary part of B matrix + ldb, // Leading dimension of real part of B matrix + ldb, // Leading dimension of imaginary part of B matrix + + &options.beta, // Pointer to beta scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex valued C and D matrices + + ptr_C, // Pointer to real part of C matrix + ptr_C + imag_stride_C, // Pointer to imaginary part of C matrix + ldc, // Leading dimension of real part of C matrix + ldc, // Leading dimension of imaginary part of C matrix + + ptr_D, // Pointer to real part of D matrix + ptr_D + imag_stride_D, // Pointer to imaginary part of D matrix + ldd, // Leading dimension of real part of D matrix + ldd, // Leading dimension of imaginary part of D matrix + + batch_count, // Number of batched elements + + batch_stride_A, // Stride between batches of real parts of A matrix + batch_stride_A, // Stride between batches of imaginary parts of A matrix + + batch_stride_B, // Stride between batches of real parts of B matrix + batch_stride_B, // Stride between batches of imaginary parts of B matrix + + batch_stride_C, // Stride between batches of real parts of C matrix + batch_stride_C, // Stride between batches of imaginary parts of C matrix + + batch_stride_D, // Stride between batches of real parts of D matrix + batch_stride_D // Stride between batches of imaginary parts of D matrix + ); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "CUTLASS internal error - configuration not supported" << std::endl; + return result; + } + } + + // + // Stop profiling loop + // + + // Record an event when the GEMMs are complete + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. 
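+    // cudaEventSynchronize blocks the host until the stop event recorded above
+    // has completed, so the elapsed time measured below spans all
+    // options.iterations kernel launches.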
+ result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. + result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + if (handle.get_last_operation()) { + std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl; + } + + // + // Compute reference in device code + // + + if (options.reference_check) { + + result.passed = true; + + for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) { + cutlass::reference::device::GemmPlanarComplex< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator + >( + problem_size, + options.alpha, + {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A}, + cutlass::ComplexTransform::kConjugate, + {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B}, + cutlass::ComplexTransform::kNone, + options.beta, + {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C}, + {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D} + ); + + ElementC epsilon = 0.1_hf; + ElementC nonzero_floor = 0.1_hf; + + result.passed = cutlass::reference::device::BlockCompareRelativelyEqual( + tensor_D.get() + idx * batch_stride_D, + tensor_D_ref.get() + idx * batch_stride_D, + batch_stride_D, + epsilon, + nonzero_floor + ); + } + + if (result.passed) { + std::cout << "Reference check passed." << std::endl; + } + else { + std::cerr << "Error - reference check failed." << std::endl; + } + } + + std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " GFLOPs: " << result.gflops << std::endl; + + return result; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // + // This example uses mma.sync to directly access Tensor Cores to achieve peak performance. + // + // Volta Tensor Core operations are first available in CUDA 10.1 Toolkit. + // + // Turing Tensor Core operations are first available in CUDA 10.2 Toolkit. + // + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (props.major < 7) { + std::cerr << "Volta Tensor Core operations must be run on a machine with compute capability at least 70." + << std::endl; + + // Returning zero so this test passes on older architectures even though its actions are no-op. + return 0; + } + else if (props.major == 7 && props.minor <= 2) { + // + // If running on the Volta architecture, at least CUDA 10.1 Toolkit is required to run this example. + // + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { + std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." 
<< std::endl; + + // Returning zero so this test passes on older Toolkits even though its actions are no-op. + return 0; + } + } + else if (props.major == 7 && props.minor >= 5) { + // + // If running on the Turing architecture, at least CUDA 10.2 Toolkit is required to run this example. + // + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { + std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + + // Returning zero so this test passes on older Toolkits even though its actions are no-op. + return 0; + } + } + else { + // NVIDIA Ampere Architecture GPUs (SM80 and later) are fully supported on CUDA 11 Toolkit and beyond. + // + // fall through + } + + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // Execute one problem size + if (!options.valid()) { + std::cerr << "Invalid problem." << std::endl; + return -1; + } + + TestbedPlanarComplex testbed(options); + + Result result = testbed.profile(options); + + return result.passed ? 0 : -1; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/legacy/11_planar_complex_array.cu b/legacy/11_planar_complex_array.cu new file mode 100644 index 0000000..ba94b60 --- /dev/null +++ b/legacy/11_planar_complex_array.cu @@ -0,0 +1,628 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Planar Complex Array Example + + This example demonstrates the CUTLASS Library's exposure of planar complex GEMM kernels which + execute a batch of matrix products, loading problem sizes and matrix base pointers from arrays + in global memory. + + These kernels represent complex matrices by storing the real and imaginary parts of the matrix in + disjoint regions in memory. These real-valued matrices are stored using existing cuBLAS layouts + as either column-major or row-major layouts with a single leading dimension indicating the stride + between columns or rows. + + The CUTLASS Library collects multiple template instantiations in a data structure and offers + a BLAS-like dispatch API to invoke the appropriate kernel on the Volta or Turing architectures. + + CUTLASS decouples matrix layout from complex transformation, so four possible transformations + are possible on the A and B operands: + + n: column-major + c: column-major complex conjugate + t: row-major + h: row-major complex conjugate + + To build strictly the planar complex kernels needed for general application, execute the following + CMake command in an empty build directory. + + $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \ + -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex + + This builds all planar complex GEMM variants for Volta and Turing architectures. + + To build strictly the kernels needed for this example, an even narrower filter string may be + specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for + the 'CN' layout configuration (conjugate A operand with both A and B as column-major). + + $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \ + -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_array_f16*cn + + $ make 11_planar_complex_array + + $ ./examples/11_planar_complex_array/11_planar_complex_array --m=2048 --n=1024 --k=512 --batch=10 +*/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor_planar_complex.h" + +#include "cutlass/util/reference/device/tensor_fill.h" + +#include "cutlass/util/reference/device/gemm_planar_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" + +#include "cutlass/library/handle.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Result structure +struct Result { + + double runtime_ms; + double gflops; + cutlass::Status status; + cudaError_t error; + bool passed; + + // + // Methods + // + + Result( + double runtime_ms = 0, + double gflops = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = cudaSuccess + ): + runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + + cutlass::gemm::GemmCoord problem_size; + int batch_count; + cutlass::complex alpha; + cutlass::complex beta; + + bool reference_check; + int iterations; + + Options(): + help(false), + problem_size({1024, 1024, 1024}), + batch_count(1), + reference_check(true), + iterations(20), + alpha(1), + beta() { } + + bool valid() { + return true; + } + + // Parses the command line + void parse(int argc, char const **args) { + 
cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + cmd.get_cmd_line_argument("m", problem_size.m()); + cmd.get_cmd_line_argument("n", problem_size.n()); + cmd.get_cmd_line_argument("k", problem_size.k()); + cmd.get_cmd_line_argument("batch", batch_count); + + cmd.get_cmd_line_argument("alpha", alpha.real()); + cmd.get_cmd_line_argument("alpha_i", alpha.imag()); + cmd.get_cmd_line_argument("beta", beta.real()); + cmd.get_cmd_line_argument("beta_i", beta.imag()); + + cmd.get_cmd_line_argument("iterations", iterations); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "11_planar_complex_array example\n\n" + << " This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --m= GEMM M dimension\n" + << " --n= GEMM N dimension\n" + << " --k= GEMM K dimension\n" + << " --batch= Number of GEMM operations executed in one batch\n" + << " --alpha= Epilogue scalar alpha (real part)\n" + << " --alpha_i= Epilogue scalar alpha (imaginary part)\n" + << " --beta= Epilogue scalar beta (real part)\n\n" + << " --beta_i= Epilogue scalar beta (imaginary part)\n\n" + << " --iterations= Number of profiling iterations to perform.\n"; + + out << "\n\nExamples:\n\n" + << "$ ./examples/11_planar_complex_array/11_planar_complex_array\n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of real-valued multiply-adds + int64_t fmas = problem_size.product() * batch_count * 4; + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Performance test environment for planar complex +class TestbedPlanarComplex { +public: + + // Half-precision input and output + using Element = cutlass::half_t; + + // Configurations for layouts and internal computation + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using ElementCompute = float; + using ElementAccumulator = float; + + // + // Data members + // + + cutlass::library::Handle handle; + + cutlass::gemm::GemmCoord problem_size; + int batch_count; + cutlass::DeviceAllocation tensor_A; + cutlass::DeviceAllocation tensor_B; + cutlass::DeviceAllocation tensor_C; + cutlass::DeviceAllocation tensor_D; + cutlass::DeviceAllocation tensor_D_ref; + + cutlass::DeviceAllocation ptr_A_real; + cutlass::DeviceAllocation ptr_A_imag; + cutlass::DeviceAllocation ptr_B_real; + cutlass::DeviceAllocation ptr_B_imag; + cutlass::DeviceAllocation ptr_C_real; + cutlass::DeviceAllocation ptr_C_imag; + cutlass::DeviceAllocation ptr_D_real; + cutlass::DeviceAllocation ptr_D_imag; + + // + // Methods + // + + TestbedPlanarComplex( + Options const &options + ): + problem_size(options.problem_size), batch_count(options.batch_count) { + + // Allocate device memory for batched planar complex GEMM + tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2); + tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2); + tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D_ref.reset(int64_t(problem_size.m()) * 
problem_size.n() * batch_count * 2); + + ptr_A_real.reset(batch_count); + ptr_A_imag.reset(batch_count); + ptr_B_real.reset(batch_count); + ptr_B_imag.reset(batch_count); + ptr_C_real.reset(batch_count); + ptr_C_imag.reset(batch_count); + ptr_D_real.reset(batch_count); + ptr_D_imag.reset(batch_count); + + } + + void initialize() { + + uint64_t seed = 1073; + + // Use small integers to simplify correctness checking + int scope_max = 6; + int scope_min = -6; + + cutlass::reference::device::BlockFillRandomUniform( + tensor_A.get(), tensor_A.size(), seed, Element(scope_max), Element(scope_min), 0); + + cutlass::reference::device::BlockFillRandomUniform( + tensor_B.get(), tensor_B.size(), seed * 2019, Element(scope_max), Element(scope_min), 0); + + cutlass::reference::device::BlockFillRandomUniform( + tensor_C.get(), tensor_C.size(), seed * 2020, Element(scope_max), Element(scope_min), 0); + } + + Result profile(Options const &options) { + + Result result; + + initialize(); + + Element *ptr_A = tensor_A.get(); + Element *ptr_B = tensor_B.get(); + Element *ptr_C = tensor_C.get(); + Element *ptr_D = tensor_D.get(); + + int64_t batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2; + int64_t batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2; + int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2; + int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2; + + typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0); + typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0); + typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + + + int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k(); + int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n(); + int64_t imag_stride_C = int64_t(problem_size.m()) * problem_size.n(); + int64_t imag_stride_D = int64_t(problem_size.m()) * problem_size.n(); + + // + // Configure pointers in global memory + // + + struct { + Element *base; + void **ptr_real; + void **ptr_imag; + int64_t batch_stride; + int64_t imag_stride; + } tensors[] = { + { tensor_A.get(), ptr_A_real.get(), ptr_A_imag.get(), batch_stride_A, imag_stride_A}, + { tensor_B.get(), ptr_B_real.get(), ptr_B_imag.get(), batch_stride_B, imag_stride_B}, + { tensor_C.get(), ptr_C_real.get(), ptr_C_imag.get(), batch_stride_C, imag_stride_C}, + { tensor_D.get(), ptr_D_real.get(), ptr_D_imag.get(), batch_stride_D, imag_stride_D} + }; + + for (auto const &tensor : tensors) { + for (int idx = 0; idx < batch_count; ++idx) { + + void *ptr_real = tensor.base + idx * tensor.batch_stride; + void *ptr_imag = tensor.base + idx * tensor.batch_stride + tensor.imag_stride; + + cudaError_t error = cudaMemcpy( + tensor.ptr_real + idx, + &ptr_real, + sizeof(void *), + cudaMemcpyHostToDevice); + + if (error != cudaSuccess) { + throw std::runtime_error("Failed to copy pointer to device memory"); + } + + error = cudaMemcpy( + tensor.ptr_imag + idx, + &ptr_imag, + sizeof(void *), + cudaMemcpyHostToDevice); + + if (error != cudaSuccess) { + throw std::runtime_error("Failed to copy pointer to device memory"); + } + } + } + + // + // Construct events + // + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + 
std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // Record an event at the start of a series of GEMM operations + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // + // Run profiling loop + // + + for (int iter = 0; iter < options.iterations; ++iter) { + + // + // Execute the planar complex array GEMM kernel via the CUTLASS Library's + // dispatch routines. + // + // Note, for planar complex array GEMM kernels, all numeric type arguments + // specify the data type of the base real types. These are understood to + // apply to planar complex representations of matrices in memory and to complex + // structures for scalars. + // + // See tools/library/include/cutlass/library/handle.h for more details. + // + + result.status = handle.gemm_planar_complex_array( + + problem_size.m(), // expected GEMM M dimension + problem_size.n(), // expected GEMM N dimension + problem_size.k(), // expected GEMM K dimension + batch_count, // Number of batched elements + + nullptr, + nullptr, + nullptr, + + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued accumulation + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued alpha/beta scalars + + &options.alpha, // Pointer to alpha scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued A matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of A matrix + cutlass::library::ComplexTransform::kConjugate, // Complex transformation on A matrix operand + + ptr_A_real.get(), // Pointer to array of pointers to real part of A matrix + ptr_A_imag.get(), // Pointer to array of pointers to imaginary part of A matrix + + lda, // Leading dimension of real part of A matrix + lda, // Leading dimension of imaginary part of A matrix + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued B matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of B matrix + cutlass::library::ComplexTransform::kNone, // Complex transformation on B matrix operand + + ptr_B_real.get(), // Pointer to array of pointers to real part of B matrix + ptr_B_imag.get(), // Pointer to array of pointers to imaginary part of B matrix + + ldb, // Leading dimension of real part of B matrix + ldb, // Leading dimension of imaginary part of B matrix + + &options.beta, // Pointer to beta scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex valued C and D matrices + + ptr_C_real.get(), // Pointer to array of pointers to real part of C matrix + ptr_C_imag.get(), // Pointer to array of pointers to imaginary part of C matrix + + ldc, // Leading dimension of real part of C matrix + ldc, // Leading dimension of imaginary part of C matrix + + ptr_D_real.get(), // Pointer to array of pointers to real part of D matrix + ptr_D_imag.get(), // Pointer to array of pointers to imaginary part of D matrix + + ldd, // Leading dimension of real part of D matrix + ldd // Leading dimension of imaginary part of D matrix + ); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "CUTLASS internal error - configuration not supported" << std::endl; + return result; + } + } + + // + // Stop profiling loop + // + + // Record an event when the GEMM operations have been launched. 
+ result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. + result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + if (handle.get_last_operation()) { + std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl; + } + + // + // Compute reference in device code + // + + if (options.reference_check) { + + result.passed = true; + + for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) { + cutlass::reference::device::GemmPlanarComplex< + Element, LayoutA, + Element, LayoutB, + Element, LayoutC, + ElementAccumulator + >( + problem_size, + options.alpha, + {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A}, + cutlass::ComplexTransform::kConjugate, + {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B}, + cutlass::ComplexTransform::kNone, + options.beta, + {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C}, + {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D} + ); + + Element epsilon = 0.1_hf; + Element nonzero_floor = 0.1_hf; + + result.passed = cutlass::reference::device::BlockCompareRelativelyEqual( + tensor_D.get() + idx * batch_stride_D, + tensor_D_ref.get() + idx * batch_stride_D, + batch_stride_D, + epsilon, + nonzero_floor + ); + } + + if (result.passed) { + std::cout << "Reference check passed." << std::endl; + } + else { + std::cerr << "Error - reference check failed." << std::endl; + } + } + + std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " GFLOPs: " << result.gflops << std::endl; + + return result; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // + // This example uses mma.sync to directly access Tensor Cores to achieve peak performance. + // + // Volta Tensor Core operations are first available in CUDA 10.1 Toolkit. + // + // Turing Tensor Core operations are first available in CUDA 10.2 Toolkit. + // + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (props.major < 7) { + std::cerr << "Tensor Core operations must be run on a machine with compute capability at least 70." + << std::endl; + + // Returning zero so this passes on older architectures. Its actions are no-op. + return 0; + } + else if (props.major == 7 && props.minor <= 2) { + // + // If running on the Volta architecture, at least CUDA 10.1 Toolkit is required to run this example. 
+ // + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { + std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl; + + // Returning zero so this passes on older Toolkits. Its actions are no-op. + return 0; + } + } + else if (props.major == 7 && props.minor >= 5) { + // + // If running on the Turing architecture, at least CUDA 10.2 Toolkit is required to run this example. + // + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { + std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + + // Returning zero so this passes on older Toolkits. Its actions are no-op. + return 0; + } + } + else { + // NVIDIA Ampere Architecture GPUs (SM80 and later) are fully supported on CUDA 11 Toolkit and beyond. + // + // fall through + } + + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // Execute one problem size + if (!options.valid()) { + std::cerr << "Invalid problem." << std::endl; + return -1; + } + + TestbedPlanarComplex testbed(options); + + Result result = testbed.profile(options); + + return result.passed ? 0 : -1; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/legacy/CMakeLists.txt b/legacy/CMakeLists.txt new file mode 100644 index 0000000..b456550 --- /dev/null +++ b/legacy/CMakeLists.txt @@ -0,0 +1,121 @@ +enable_language(CUDA) + +include_directories(../include) +include_directories(${PSRDada_SOURCE_DIR}/src) +include_directories(${xGPU_SOURCE_DIR}/src) + +set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so) +set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a) + +# DSA Fast Time Domain functions +#------------------------------- +add_executable(test_write test_write.c) +target_link_libraries(test_write ${PSRDada_LIB}) + +add_executable(test_read test_read.c) +target_link_libraries(test_read ${PSRDada_LIB}) + +add_executable(dsaX_trigger dsaX_trigger.c) +target_link_libraries(dsaX_trigger ${PSRDada_LIB}) + +add_executable(dsaX_filTrigger dsaX_filTrigger.c) +target_link_libraries(dsaX_filTrigger ${PSRDada_LIB}) + +# DMH: Has a 'sigproc' dependency, low priority +if(0) + add_executable(splice_offline_beams splice_offline_beams.c) + target_link_libraries(splice_offline_beams ${PSRDada_LIB}) + + add_executable(dsaX_writeFil dsaX_writeFil.c) + target_link_libraries(dsaX_writeFil ${PSRDada_LIB}) + + add_executable(dsaX_splice dsaX_splice.c) + target_link_libraries(dsaX_splice ${PSRDada_LIB}) + + add_executable(gpu_flagger gpu_flagger.cu) + target_link_libraries(gpu_flagger ${PSRDada_LIB}) +endif() + +add_executable(dsaX_store dsaX_store.c) +target_link_libraries(dsaX_store ${PSRDada_LIB}) + +add_executable(dsaX_fluff dsaX_fluff.c) +target_link_libraries(dsaX_fluff ${PSRDada_LIB}) + +# DMH: intrinsics compilation error +#add_executable(dsaX_reorder dsaX_reorder.c) +#target_link_libraries(dsaX_reorder ${PSRDada_LIB}) + +# DMH: /scratch/CPviolator/work/DSA110/dsa110-xengine/src/dsaX_nicdb.c: In function ‘process’: +#/scratch/CPviolator/work/DSA110/dsa110-xengine/src/dsaX_nicdb.c:145:65: warning: integer overflow in expression of type ‘int’ results in ‘-1073741824’ [-Woverflow] +# 145 | uint64_t shifty = (bdepth-1)*NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NCHAN_FIL; +add_executable(dsaX_nicdb dsaX_nicdb.c) 
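+# Note: one possible fix for the -Woverflow warning quoted above (not applied
+# here) is to force 64-bit arithmetic by promoting the first factor, e.g.
+#   uint64_t shifty = (uint64_t)(bdepth-1)*NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NCHAN_FIL;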
+target_link_libraries(dsaX_nicdb ${PSRDada_LIB}) + +add_executable(dsaX_dbnic dsaX_dbnic.c) +target_link_libraries(dsaX_dbnic ${PSRDada_LIB}) + +add_executable(dsaX_capture dsaX_capture.c) +target_link_libraries(dsaX_capture ${PSRDada_LIB}) + +add_executable(dsaX_capture_thread dsaX_capture_thread.c) +target_link_libraries(dsaX_capture_thread ${PSRDada_LIB}) + +add_executable(dsaX_capture_manythread dsaX_capture_manythread.c) +target_link_libraries(dsaX_capture_manythread ${PSRDada_LIB}) + +add_executable(dsaX_split dsaX_split.c) +target_link_libraries(dsaX_split ${PSRDada_LIB} -lm) + +add_executable(dsaX_merge dsaX_merge.c) +target_link_libraries(dsaX_merge ${PSRDada_LIB}) + +add_executable(dsaX_simplesplit dsaX_simplesplit.c) +target_link_libraries(dsaX_simplesplit ${PSRDada_LIB}) + +add_executable(dsaX_fake dsaX_fake.c) +target_link_libraries(dsaX_fake ${PSRDada_LIB}) + +add_executable(dsaX_splitup dsaX_splitup.c) +target_link_libraries(dsaX_splitup ${PSRDada_LIB}) + +add_executable(dsaX_copydb dsaX_copydb.c) +target_link_libraries(dsaX_copydb ${PSRDada_LIB}) + +# DMH: fitsio dependency +if(0) + add_executable(dsaX_writevis dsaX_writevis.c) + target_link_libraries(dsaX_writevis ${PSRDada_LIB}) +endif() + +# DMH: XGPU dependencies +add_executable(dsaX_wrangle dsaX_wrangle.c) +target_link_libraries(dsaX_wrangle ${PSRDada_LIB} ${CUDA_nvml_LIBRARY} ${XGPU_LIB}) + +add_executable(dsaX_testdada dsaX_testdada.c) +target_link_libraries(dsaX_testdada ${PSRDada_LIB}) + +add_executable(dsaX_xgpu dsaX_xgpu.cu) +target_link_libraries(dsaX_xgpu ${PSRDada_LIB} ${XGPU_LIB} ${CUDA_nvml_LIBRARY}) + +add_executable(dsaX_cuda_correlator dsaX_cuda_correlator.cu) +target_link_libraries(dsaX_cuda_correlator ${XGPU_LIB} ${CUDA_nvml_LIBRARY} ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) + +add_executable(dsaX_reorder_raw dsaX_reorder_raw.c) +target_link_libraries(dsaX_reorder_raw ${PSRDada_LIB}) + +add_executable(fil2dada fil2dada.c) +target_link_libraries(fil2dada ${PSRDada_LIB}) + +add_executable(dumpfil dumpfil.c) +target_link_libraries(dumpfil ${PSRDada_LIB}) + +add_executable(dsaX_beamformer dsaX_beamformer.cu) +target_link_libraries(dsaX_beamformer ${PSRDada_LIB}) + +add_executable(dsaX_beamformer_passon dsaX_beamformer_passon.cu) +target_link_libraries(dsaX_beamformer_passon ${PSRDada_LIB}) + +add_executable(dsaX_beamformer_offline dsaX_beamformer_offline.cu) +target_link_libraries(dsaX_beamformer_offline ${PSRDada_LIB}) +#------------------------------------------------------ diff --git a/src/Makefile b/legacy/Makefile similarity index 93% rename from src/Makefile rename to legacy/Makefile index bbca4e0..4cc2fee 100644 --- a/src/Makefile +++ b/legacy/Makefile @@ -4,13 +4,13 @@ CC=gcc CFLAGS1 = -g -O3 -Wall -pthread -march=native -I/usr/local/include -I/usr/local/include/src -I/usr/local/cfitsio-3.47/include/ -I/home/ubuntu/proj/dsa110-shell/dsa110-sigproc CDEPS1=dsaX_def.h dsaX_capture_manythread.h CDEPS2=dsaX_def.h dsaX_capture.h -LIBS = -L/usr/local/lib -lpsrdada -L/usr/lib/gcc/x86_64-linux-gnu/5 -lgfortran -L/usr/local/cuda/lib64 -lcudart -lcublas -lm -L/usr/local/cfitsio-3.47/lib -lcfitsio -lsigproc -lxgpu +LIBS = -L/usr/local/lib -lpsrdada -L/usr/lib/gcc/x86_64-linux-gnu/5 -lgfortran -L/usr/local/cuda/lib64 -lcudart -lcublas -lm -L/usr/local/cfitsio-3.47/lib #-lcfitsio -lsigproc -lxgpu #LIBS2 = -L/home/ubuntu/PF_RING/userland/libpcap-1.9.1 -lpcap #CDEPS3=dsaX_def.h dsaX_capture_pcap.h CCU=/usr/local/cuda/bin/nvcc -D CUDA -ccbin=g++ -CFLAGS2 = -I/home/ubuntu/proj/dsa110-shell/dsa110-xengine/src 
-I/home/ubuntu/proj/dsa110-shell/dsa110-xGPU/src -I/usr/local/include -I/usr/local/include/src -I/usr/local/cfitsio-3.47/include -I/home/ubuntu/proj/dsa110-shell/dsa110-sigproc -arch=sm_75 -O3 -Xcompiler="-pthread" -DMATRIX_ORDER_TRIANGULAR -std=c++14 +CFLAGS2 = -I/home/ubuntu/proj/dsa110-shell/dsa110-xengine/src -I/home/dmhowart/install/include/ -I/home/ubuntu/proj/dsa110-shell/dsa110-xGPU/src -I/usr/local/include -I/usr/local/include/src -I/usr/local/cfitsio-3.47/include -I/home/ubuntu/proj/dsa110-shell/dsa110-sigproc -arch=sm_75 -O3 -Xcompiler="-pthread" -DMATRIX_ORDER_TRIANGULAR -std=c++14 -L/home/dmhowart/install/lib .DEFAULT_GOAL := all @@ -63,7 +63,6 @@ dsaX_reorder.o: dsaX_reorder.c $(CDEPS1) dsaX_reorder: dsaX_reorder.o $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - dsaX_dbnic.o: dsaX_dbnic.c $(CDEPS1) $(CC) -c -o $@ $< $(CFLAGS1) diff --git a/legacy/correlator_header_dsaX.txt b/legacy/correlator_header_dsaX.txt new file mode 100644 index 0000000..c8b86e9 --- /dev/null +++ b/legacy/correlator_header_dsaX.txt @@ -0,0 +1,38 @@ +ACC_LEN 1 +BANDWIDTH -250 +BW -250 +CFREQ 1405 +CHAN_AV 0 +DEC 00:00:00.000 +DSB 0 +FILE_SIZE 2415919104 +FREQ 1405.000000 +FSCRUNCH 1 +HDR_SIZE 4096 +HDR_VERSION 1.0 +INSTRUMENT DSAX +MODE RAW +NBEAM 1 +NBIT 4 +NCHAN 2048 +NDIM 1 +NPOL 2 +N_PROD 1 +OBSERVER DSA +OBS_OFFSET 0 +OBS_UNIT SECONDS +OBS_VAL 0000.0000 +PID P000 +RA 00:00:00.000 +RECEIVER SANDY +RESOLUTION 4096 +SOURCE TEST +TRANSFER_SIZE 126562550000000 +TELESCOPE DSA-10 +TSAMP 64 +TSCRUNCH 1 +ANTENNAS 1-2-5-3 +NANT 2 +UTC_START 2015-08-07-17:07:28 +FILE_NUMBER 0 + diff --git a/src/dsaX_beamformer.cu b/legacy/dsaX_beamformer.cu similarity index 99% rename from src/dsaX_beamformer.cu rename to legacy/dsaX_beamformer.cu index 5efcfca..afdda70 100644 --- a/src/dsaX_beamformer.cu +++ b/legacy/dsaX_beamformer.cu @@ -30,6 +30,9 @@ Second kernel will simply add times and adjacent channels and pick leading 8 bit Then copy back to specific locations in host to form final [beam, time, frequency] array, to be sent to corner turn. 
*/ + +#define THRUST_IGNORE_CUB_VERSION_CHECK + #include #include using std::cout; @@ -811,7 +814,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -822,7 +825,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -871,7 +874,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; int nints = NPACKETS / 16; uint64_t nbytes_per_int = block_size / nints; diff --git a/src/dsaX_beamformer.cu.wrk1 b/legacy/dsaX_beamformer.cu.wrk1 similarity index 100% rename from src/dsaX_beamformer.cu.wrk1 rename to legacy/dsaX_beamformer.cu.wrk1 diff --git a/src/dsaX_beamformer_offline.cu b/legacy/dsaX_beamformer_offline.cu similarity index 99% rename from src/dsaX_beamformer_offline.cu rename to legacy/dsaX_beamformer_offline.cu index 13eab5e..c122d46 100644 --- a/src/dsaX_beamformer_offline.cu +++ b/legacy/dsaX_beamformer_offline.cu @@ -30,6 +30,8 @@ Second kernel will simply add times and adjacent channels and pick leading 8 bit Then copy back to specific locations in host to form final [beam, time, frequency] array, to be sent to corner turn. */ +#define THRUST_IGNORE_CUB_VERSION_CHECK + #include #include using std::cout; @@ -723,7 +725,7 @@ int main (int argc, char *argv[]) { uint64_t block_out = 15*48*512*256; char * block; block = (char *)malloc(sizeof(char)*block_size); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); int nints = NPACKETS / 16; uint64_t nbytes_per_int = block_size / nints; uint64_t nbytes_per_out = block_out / nints; diff --git a/src/dsaX_beamformer_passon.cu b/legacy/dsaX_beamformer_passon.cu similarity index 99% rename from src/dsaX_beamformer_passon.cu rename to legacy/dsaX_beamformer_passon.cu index 7c8c254..818c28a 100644 --- a/src/dsaX_beamformer_passon.cu +++ b/legacy/dsaX_beamformer_passon.cu @@ -30,6 +30,8 @@ Second kernel will simply add times and adjacent channels and pick leading 8 bit Then copy back to specific locations in host to form final [beam, time, frequency] array, to be sent to corner turn. 
*/ +#define THRUST_IGNORE_CUB_VERSION_CHECK + #include #include using std::cout; @@ -721,7 +723,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -732,7 +734,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -743,7 +745,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out2 = dada_hdu_create (); + hdu_out2 = dada_hdu_create (0); dada_hdu_set_key (hdu_out2, out_key2); if (dada_hdu_connect (hdu_out2) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -809,7 +811,7 @@ int main (int argc, char *argv[]) { uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); uint64_t block_out2 = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out2->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; int nints = NPACKETS / 16; uint64_t nbytes_per_int = block_size / nints; diff --git a/src/dsaX_bfCorr.cu b/legacy/dsaX_bfCorr.cu similarity index 95% rename from src/dsaX_bfCorr.cu rename to legacy/dsaX_bfCorr.cu index 94bee5e..0fabe1e 100644 --- a/src/dsaX_bfCorr.cu +++ b/legacy/dsaX_bfCorr.cu @@ -47,6 +47,26 @@ using std::endl; /* global variables */ int DEBUG = 0; +__device__ void inspectPackedDataInKernel(char input, int i) { + float re = (float)((char)(( (unsigned char)(input) & (unsigned char)(15) ) << 4) >> 4); + float im = (float)((char)(( (unsigned char)(input) & (unsigned char)(240))) >> 4); + + if(re != 0 || im != 0) printf("val[%d] = (%f,%f)\n", i, re, im); +} + +void inspectPackedData(char input, int i, bool non_zeros) { + float re = (float)((char)(( (unsigned char)(input) & (unsigned char)(15) ) << 4) >> 4); + float im = (float)((char)(( (unsigned char)(input) & (unsigned char)(240))) >> 4); + + if(non_zeros) { + if(re != 0 || im != 0) + std::cout << "val["<> 4)); ini[iidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4)); + //if(__half2float(inr[iidx]) != 0 || __half2float(ini[iidx]) != 0) printf("corr_input_copy %i = (%f,%f)\n", iidx, __half2float(inr[iidx]), __half2float(ini[iidx])); } @@ -269,18 +290,21 @@ __global__ void transpose_matrix_char(char * idata, char * odata) { int y = blockIdx.y * 32 + threadIdx.y; int width = gridDim.x * 32; - for (int j = 0; j < 32; j += 8) + for (int j = 0; j < 32; j += 8) { tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; - + //inspectPackedDataInKernel(idata[(y+j)*width + x], (y+j)*width + x); + } + __syncthreads(); x = blockIdx.y * 32 + threadIdx.x; // transpose block offset y = blockIdx.x * 32 + threadIdx.y; width = gridDim.y * 32; - for (int j = 0; j < 32; j += 8) + for (int j = 0; j < 32; j += 8) { odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; - + //inspectPackedDataInKernel(odata[(y+j)*width + x], (y+j)*width + x); + } } // arbitrary transpose kernel @@ -319,7 +343,8 @@ void reorder_input(char *input, char * tx, half *inr, half *ini) { // transpose input data dim3 
dimBlock(32, 8), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32); - transpose_matrix_char<<>>(input,tx); + transpose_matrix_char<<>>(input, tx); + // DMH good /* // set up for geam cublasHandle_t cublasH = NULL; @@ -497,7 +522,6 @@ void dcorrelator(dmem * d) { d->prep += (float)(end - begin) / CLOCKS_PER_SEC; // set up for gemm - begin = clock(); cublasHandle_t cublasH = NULL; cudaStream_t stream = NULL; @@ -526,6 +550,10 @@ void dcorrelator(dmem * d) { const int batchCount = NCHAN_PER_PACKET*2*2*halfFac; // run strided batched gemm + // M^* M^T + // (a - ib)(a + ib)^T + // (aaT + bbT) + i(abT - bTa) + // ac cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, &alpha,d->d_r,lda,strideA, @@ -1166,6 +1194,7 @@ int main (int argc, char *argv[]) { // allocate input d.h_input = (char *)malloc(sizeof(char)*in_block_size); + std::cout << "Size of input = " << in_block_size << std::endl; // loop over reps and chunks for (int reps=0; reps0) rewind(fin); fread(d.h_input+chunks*rd_size,rd_size,1,fin); + std::cout << "Input peek " << std::endl; + //for (int i=0; i<8; i++) inspectPackedData(d.h_input[i], i); + // run correlator or beamformer, and output data if (bf==0) { if (DEBUG) syslog(LOG_INFO,"run correlator"); @@ -1182,10 +1214,13 @@ int main (int argc, char *argv[]) { if (DEBUG) syslog(LOG_INFO,"copy to host"); output_size = NBASE*NCHAN_PER_PACKET*2*2*4; output_data = (char *)malloc(output_size); - cudaMemcpy(output_data,d.d_output,output_size,cudaMemcpyDeviceToHost); + cudaMemcpy(output_data, d.d_output, output_size, cudaMemcpyDeviceToHost); + + std::cout << "Output peek " << std::endl; + for(int i=0; i +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "futils.h" +#include "dada_hdu.h" +#include "dada_pwc_main.h" +#include "multilog.h" +#include "ipcio.h" +#include "ascii_header.h" +#include "dada_udp.h" + +#include "dsaX_def.h" + +#define DSAX_UDPDB_BUF_CLEAR = 0 +#define DSAX_UDPDB_BUF_FULL = 1 + +/* socket buffer for receiving udp data */ +typedef struct { + + int fd; // FD of the socket + size_t bufsz; // size of socket buffer + char * buf; // the socket buffer + int have_packet; // + size_t got; // amount of data received + +} dsaX_sock_t; + +dsaX_sock_t * dsax_Xnit_sock (); + +void dsaX_free_sock(dsaX_sock_t* b); + +/* Number of UDP packets to be recived for a called to buffer_function */ +#define NOTRECORDING 0 +#define RECORDING 1 + +typedef struct { + + dada_hdu_t * hdu; // DADA Header + Data Unit + multilog_t * log; // DADA logging interface + int verbose; // verbosity flag + + dsaX_sock_t * sock; // UDP socket for data capture + int port; // port to receive UDP data + int control_port; // port to receive control commands + char * interface; // IP Address to accept packets on + + // configuration for number of inputs + unsigned int num_inputs; // number of antennas / inputs + + // datablock management + uint64_t hdu_bufsz; + unsigned block_open; // if the current data block element is open + char * block; // pointer to current datablock buffer + uint64_t block_start_byte; // seq_byte of first byte for the block + uint64_t block_end_byte; // seq_byte of first byte of final packet of the block + uint64_t block_count; // number of packets in this block + char * tblock; // area of memory to write to + + // packets + unsigned capture_started; // flag for start of UDP data + uint64_t packets_per_buffer; // number of UDP packets per datablock buffer + + /* Packet and byte statistics */ + stats_t * 
packets; + stats_t * bytes; + + uint64_t bytes_to_acquire; + double mb_rcv_ps; + double mb_drp_ps; + double mb_free; + double mb_total; + uint64_t rcv_sleeps; + + uint64_t last_seq; // most recently received seq number + uint64_t last_byte; // most recently received byte + struct timeval timeout; + + uint64_t n_sleeps; + uint64_t ooo_packets; + + int recv_core; + +} udpdb_t; + + +int dsaX_udpdb_init_receiver (udpdb_t * ctx); +void dsaX_udpdb_reset_receiver (udpdb_t * ctx); +int dsaX_udpdb_destroy_receiver (udpdb_t * ctx); +int dsaX_udpdb_open_buffer (udpdb_t * ctx); +int dsaX_udpdb_close_buffer (udpdb_t * ctx, uint64_t bytes_written, unsigned eod); +int dsaX_udpdb_new_buffer (udpdb_t * ctx); +int dsaX_udpdb_increment (udpdb_t * ctx); + +// allocate required resources for data capture +int dsaX_udpdb_prepare (udpdb_t * ctx); + +// move to a state where data acquisition can begin +time_t dsaX_dpdb_start (udpdb_t * ctx, char * header); + +// main workhorse function to receive data for a single observation +void * dsaX_udpdb_receive_obs (void * ctx); + +// close the datablock signifying end of data +int udpdb_stop_function (udpdb_t* ctx); + +void usage(); +void signal_handler (int signalValue); +void stats_thread(void * arg); +void control_thread(void * arg); + +#endif diff --git a/src/dsaX_capture_manythread.c b/legacy/dsaX_capture_manythread.c similarity index 99% rename from src/dsaX_capture_manythread.c rename to legacy/dsaX_capture_manythread.c index 06f508a..b9f14bd 100644 --- a/src/dsaX_capture_manythread.c +++ b/legacy/dsaX_capture_manythread.c @@ -427,7 +427,7 @@ void control_thread (void * arg) { /* * Thread to capture data */ -void recv_thread(void * arg) { +int recv_thread(void * arg) { udpdb_t * udpdb = (udpdb_t *) arg; int thread_id = udpdb->thread_id; @@ -528,7 +528,7 @@ void recv_thread(void * arg) { } else // we received a packet of the WRONG size, ignore it { - syslog (LOG_NOTICE, "receive_obs: received %d bytes, expected %d", got, UDP_PAYLOAD); + syslog (LOG_NOTICE, "receive_obs: received %lu bytes, expected %d", got, UDP_PAYLOAD); } } timeouts = 0; @@ -953,7 +953,7 @@ int main (int argc, char *argv[]) { if (DEBUG) syslog(LOG_DEBUG,"Creating HDU"); - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); if (DEBUG) syslog(DEBUG,"Created hdu"); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { diff --git a/src/dsaX_capture_manythread.c.bak b/legacy/dsaX_capture_manythread.c.bak similarity index 100% rename from src/dsaX_capture_manythread.c.bak rename to legacy/dsaX_capture_manythread.c.bak diff --git a/legacy/dsaX_capture_manythread.h b/legacy/dsaX_capture_manythread.h new file mode 100644 index 0000000..3c96648 --- /dev/null +++ b/legacy/dsaX_capture_manythread.h @@ -0,0 +1,119 @@ +/*************************************************************************** + * + * Copyright (C) 2009 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ****************************************************************************/ + +#ifndef __DSAX_UDPDB_THREAD_H +#define __DSAX_UDPDB_THREAD_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "futils.h" +#include "dada_hdu.h" +#include "dada_pwc_main.h" +#include "multilog.h" +#include "ipcio.h" +#include "ascii_header.h" +#include "dada_udp.h" + +#include "dsaX_def.h" + +/* socket buffer for receiving udp data */ +// this is initialised in each recv thread +typedef struct { + + int fd; // FD of the 
socket + size_t bufsz; // size of socket buffer + char * buf; // the socket buffer + int have_packet; // + size_t got; // amount of data received + +} dsaX_sock_t; + +dsaX_sock_t * dsaX_init_sock (); +void dsaX_free_sock(dsaX_sock_t* b); + +/* Number of UDP packets to be recived for a called to buffer_function */ +#define NOTRECORDING 0 +#define RECORDING 1 + +// structure for write thread +// tblock must be shared +typedef struct { + + dada_hdu_t * hdu; // DADA Header + Data Unit + uint64_t hdu_bufsz; + unsigned block_open; // if the current data block element is open + char * block; // pointer to current datablock buffer + char * tblock; // area of memory to write to + int thread_id; + +} dsaX_write_t; + +// structure for stats thread +// both are shared between all recv structures and this one +// last_seq is also shared +typedef struct { + + /* Packet and byte statistics */ + stats_t * packets; + stats_t * bytes; + uint64_t * last_seq; // most recently received seq number + +} dsaX_stats_t; + + +// structure for receive thread +// tblock, packets, bytes, last_seq, block_start_byte, block_end_byte, block_count, capture_started +typedef struct { + + multilog_t * log; // DADA logging interface + int verbose; // verbosity flag + + int port; // port to receive UDP data + int control_port; // port to receive control commands + char * interface; // IP Address to accept packets on + + // configuration for number of inputs + unsigned int num_inputs; // number of antennas / inputs + + // datablock management + uint64_t * block_start_byte; // seq_byte of first byte for the block + uint64_t * block_end_byte; // seq_byte of first byte of final packet of the block + uint64_t * block_count; // number of packets in this block + uint64_t hdu_bufsz; + char * tblock; // area of memory to write to + + // packets + unsigned * capture_started; // flag for start of UDP data + uint64_t packets_per_buffer; // number of UDP packets per datablock buffer + + /* Packet and byte statistics */ + stats_t * packets; + stats_t * bytes; + uint64_t rcv_sleeps; + + uint64_t * last_seq; // most recently received seq number + struct timeval timeout; + int thread_id; + +} udpdb_t; + +void signal_handler (int signalValue); +void stats_thread(void * arg); +void control_thread(void * arg); + +#endif diff --git a/src/dsaX_capture_pcap.c b/legacy/dsaX_capture_pcap.c similarity index 100% rename from src/dsaX_capture_pcap.c rename to legacy/dsaX_capture_pcap.c diff --git a/legacy/dsaX_capture_pcap.h b/legacy/dsaX_capture_pcap.h new file mode 100644 index 0000000..f037f75 --- /dev/null +++ b/legacy/dsaX_capture_pcap.h @@ -0,0 +1,83 @@ +/*************************************************************************** + * + * Copyright (C) 2009 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ****************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "futils.h" +#include "dada_hdu.h" +#include "dada_pwc_main.h" +#include "multilog.h" +#include "ipcio.h" +#include "ascii_header.h" +#include "dada_udp.h" + +#include "dsaX_def.h" + +/* Number of UDP packets to be recived for a called to buffer_function */ +#define NOTRECORDING 0 +#define RECORDING 1 + +// structure for all threads +typedef struct { + + dada_hdu_t * hdu; // DADA Header + Data Unit + uint64_t hdu_bufsz; + unsigned block_open; // if the current data block element is open + char * tblock; + 
uint64_t tblock_idx; + char * temp_buffers; + uint64_t * temp_seq_byte; + int temp_idx; + int thread_id; + uint64_t block_start_byte; + uint64_t block_end_byte; + uint64_t block_count; + int nblocks_written; + + int verbose; // verbosity flag + + // configuration for number of inputs + unsigned int num_inputs; // number of antennas / inputs + + // packets + uint64_t packets_per_buffer; // number of UDP packets per datablock buffer + + /* Packet and byte statistics */ + stats_t * packets; + stats_t * bytes; + + uint64_t last_seq; // most recently received seq number + +} dsaX_t; + +// structure for stats thread +// both are shared between all recv structures and this one +// last_seq is also shared +typedef struct { + + /* Packet and byte statistics */ + stats_t * packets; + stats_t * bytes; + uint64_t * last_seq; // most recently received seq number + +} dsaX_stats_t; + + +void signal_handler (int signalValue); +void stats_thread(void * arg); +void control_thread(void * arg); diff --git a/src/dsaX_capture_thread.c b/legacy/dsaX_capture_thread.c similarity index 99% rename from src/dsaX_capture_thread.c rename to legacy/dsaX_capture_thread.c index 3cc0c96..49019be 100644 --- a/src/dsaX_capture_thread.c +++ b/legacy/dsaX_capture_thread.c @@ -518,7 +518,7 @@ void control_thread (void * arg) { /* * Thread to capture data */ -void recv_thread(void * arg) { +int recv_thread(void * arg) { // set affinity const pthread_t pid = pthread_self(); @@ -604,7 +604,7 @@ void recv_thread(void * arg) { } else // we received a packet of the WRONG size, ignore it { - syslog (LOG_NOTICE, "receive_obs: received %d bytes, expected %d", got, UDP_PAYLOAD); + syslog (LOG_NOTICE, "receive_obs: received %lu bytes, expected %d", got, UDP_PAYLOAD); } } timeouts = 0; @@ -753,7 +753,7 @@ void recv_thread(void * arg) { /* * Thread to write data */ -void write_thread(void * arg) { +int write_thread(void * arg) { // set affinity const pthread_t pid = pthread_self(); @@ -964,7 +964,7 @@ int main (int argc, char *argv[]) { if (DEBUG) syslog(LOG_DEBUG,"Creating HDU"); - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); if (DEBUG) syslog(DEBUG,"Created hdu"); dada_hdu_set_key (hdu_out, CAPTURE_BLOCK_KEY); if (dada_hdu_connect (hdu_out) < 0) { diff --git a/src/dsaX_copydb.c b/legacy/dsaX_copydb.c similarity index 97% rename from src/dsaX_copydb.c rename to legacy/dsaX_copydb.c index 054ee94..7714038 100644 --- a/src/dsaX_copydb.c +++ b/legacy/dsaX_copydb.c @@ -160,7 +160,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -171,7 +171,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -220,7 +220,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; char * block; uint64_t written, block_id; diff --git 
a/src/cuda_correlator.cu b/legacy/dsaX_cuda_correlator.cu similarity index 97% rename from src/cuda_correlator.cu rename to legacy/dsaX_cuda_correlator.cu index eb0882c..3bebd09 100644 --- a/src/cuda_correlator.cu +++ b/legacy/dsaX_cuda_correlator.cu @@ -1,6 +1,8 @@ // -*- c++ -*- /* will run xgpu */ /* assumes input block size is appropriate */ +#define THRUST_IGNORE_CUB_VERSION_CHECK + #include #include using std::cout; @@ -36,7 +38,7 @@ using std::endl; #include "dada_affinity.h" #include "ascii_header.h" #include "dsaX_def.h" -#include "cube/cube.h" +//#include "cube/cube.h" #include "xgpu.h" @@ -222,7 +224,8 @@ int main(int argc, char** argv) { #ifdef RUNTIME_STATS clock_gettime(CLOCK_MONOTONIC, &tic); #endif - xgpu_error = xgpuCudaXengine(&context, array_hd, i==count-1 ? finalSyncOp : syncOp); + //xgpu_error = xgpuCudaXengine(&context, array_hd, i==count-1 ? finalSyncOp : syncOp); + xgpu_error = xgpuCudaXengine(&context, i==count-1 ? finalSyncOp : syncOp); #ifdef RUNTIME_STATS clock_gettime(CLOCK_MONOTONIC, &toc); #endif diff --git a/legacy/dsaX_cutlass_interface.cu b/legacy/dsaX_cutlass_interface.cu new file mode 100644 index 0000000..fc68d55 --- /dev/null +++ b/legacy/dsaX_cutlass_interface.cu @@ -0,0 +1,315 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
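The DSA_FTD_ComplexGEMM_CUTLASS code that follows stores each batched operand in planar-complex form: per batch entry a real block followed by its imaginary block, addressed through batch_stride and imag_stride. A minimal host-only sketch of that addressing (the sizes m, k and batch_count here are illustrative stand-ins, not values taken from the patch):

#include <cstdint>
#include <cstdio>

int main() {
  // Planar-complex layout: real block first, imaginary block imag_stride elements later.
  const int64_t m = 4, k = 3, batch_count = 2;   // illustrative sizes only
  const int64_t imag_stride  = m * k;            // real -> imaginary offset within one entry
  const int64_t batch_stride = m * k * 2;        // elements per batch entry (real + imag)
  float base[2 * 4 * 3 * 2] = {};                // stands in for a device allocation such as tensor_A.get()

  for (int64_t idx = 0; idx < batch_count; ++idx) {
    const float *real_part = base + idx * batch_stride;
    const float *imag_part = base + idx * batch_stride + imag_stride;
    std::printf("batch %lld: real offset %lld, imag offset %lld\n",
                (long long)idx, (long long)(real_part - base), (long long)(imag_part - base));
  }
  return 0;
}

This mirrors the pointer tables built in DSA_FTD_ComplexGEMM_CUTLASS::run() below, where the real and imaginary device pointers for each batch index are copied into the ptr_*_real / ptr_*_imag arrays.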
+ * + **************************************************************************************************/ + +#include "dsaX_cutlass_interface.h" + +DSA_FTD_ComplexGEMM_CUTLASS::DSA_FTD_ComplexGEMM_CUTLASS(Options const &options): + problem_size(options.problem_size), batch_count(options.batch_count) { + + // Allocate device memory for batched planar complex GEMM + tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2); + tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2); + tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + + ptr_A_real.reset(batch_count); + ptr_A_imag.reset(batch_count); + ptr_B_real.reset(batch_count); + ptr_B_imag.reset(batch_count); + ptr_C_real.reset(batch_count); + ptr_C_imag.reset(batch_count); + ptr_D_real.reset(batch_count); + ptr_D_imag.reset(batch_count); +} + +// DMH: Replace this with data from DSA-FTD +void DSA_FTD_ComplexGEMM_CUTLASS::initialize() { + + if(testing) { + uint64_t seed = 1234; + + // Use small integers to simplify correctness checking + int scope_max = 6; + int scope_min = -6; + + BlockFillRandomUniform(tensor_A.get(), tensor_A.size(), seed, Element(scope_max), Element(scope_min), 0); + BlockFillRandomUniform(tensor_B.get(), tensor_B.size(), seed * 2019, Element(scope_max), Element(scope_min), 0); + BlockFillRandomUniform(tensor_C.get(), tensor_C.size(), seed * 2020, Element(scope_max), Element(scope_min), 0); + } else { + // DMH: construct DSA-FTD interface data transfer interface + } + + ptr_A = tensor_A.get(); + ptr_B = tensor_B.get(); + ptr_C = tensor_C.get(); + ptr_D = tensor_D.get(); + + batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2; + batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2; + batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2; + batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2; + + lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0); + ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0); + ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + + imag_stride_A = int64_t(problem_size.m()) * problem_size.k(); + imag_stride_B = int64_t(problem_size.k()) * problem_size.n(); + imag_stride_C = int64_t(problem_size.m()) * problem_size.n(); + imag_stride_D = int64_t(problem_size.m()) * problem_size.n(); + +} + +Result DSA_FTD_ComplexGEMM_CUTLASS::run(Options const &options) { + + Result result; + + initialize(); + + // Configure pointers in global memory + struct { + Element *base; + void **ptr_real; + void **ptr_imag; + int64_t batch_stride; + int64_t imag_stride; + } tensors[] = {{ tensor_A.get(), ptr_A_real.get(), ptr_A_imag.get(), batch_stride_A, imag_stride_A}, + { tensor_B.get(), ptr_B_real.get(), ptr_B_imag.get(), batch_stride_B, imag_stride_B}, + { tensor_C.get(), ptr_C_real.get(), ptr_C_imag.get(), batch_stride_C, imag_stride_C}, + { tensor_D.get(), ptr_D_real.get(), ptr_D_imag.get(), batch_stride_D, imag_stride_D}}; + + for (auto const &tensor : tensors) { + for (int idx = 0; idx < batch_count; ++idx) { + + cudaError_t error; + void *ptr_real = tensor.base + idx * tensor.batch_stride; + void *ptr_imag = tensor.base + idx * tensor.batch_stride + tensor.imag_stride; + + error = 
cudaMemcpy(tensor.ptr_real + idx, &ptr_real, sizeof(void *), cudaMemcpyHostToDevice); + if (error != cudaSuccess) throw std::runtime_error("Failed to copy pointer to device memory"); + + error = cudaMemcpy(tensor.ptr_imag + idx, &ptr_imag, sizeof(void *), cudaMemcpyHostToDevice); + if (error != cudaSuccess) throw std::runtime_error("Failed to copy pointer to device memory"); + + } + } + + + cudaEvent_t events[2]; + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // Record an event at the start of a series of GEMM operations + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Run profiling loop + //------------------- + // Execute the planar complex array GEMM kernel via the CUTLASS Library's + // dispatch routines. + // + // Note, for planar complex array GEMM kernels, all numeric type arguments + // specify the data type of the base real types. These are understood to + // apply to planar complex representations of matrices in memory and to complex + // structures for scalars. + // + // See tools/library/include/cutlass/library/handle.h for more details. + // + for (int iter = 0; iter < options.iterations; ++iter) { + + result.status = handle.gemm_planar_complex_array( + problem_size.m(), // expected GEMM M dimension + problem_size.n(), // expected GEMM N dimension + problem_size.k(), // expected GEMM K dimension + batch_count, // Number of batched elements + + nullptr, + nullptr, + nullptr, + + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued accumulation + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued alpha/beta scalars + + &options.alpha, // Pointer to alpha scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued A matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of A matrix + cutlass::library::ComplexTransform::kConjugate, // Complex transformation on A matrix operand + + ptr_A_real.get(), // Pointer to array of pointers to real part of A matrix + ptr_A_imag.get(), // Pointer to array of pointers to imaginary part of A matrix + + lda, // Leading dimension of real part of A matrix + lda, // Leading dimension of imaginary part of A matrix + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued B matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of B matrix + cutlass::library::ComplexTransform::kNone, // Complex transformation on B matrix operand + + ptr_B_real.get(), // Pointer to array of pointers to real part of B matrix + ptr_B_imag.get(), // Pointer to array of pointers to imaginary part of B matrix + + ldb, // Leading dimension of real part of B matrix + ldb, // Leading dimension of imaginary part of B matrix + + &options.beta, // Pointer to beta scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex valued C and D matrices + + ptr_C_real.get(), // Pointer to array of pointers to real part of C matrix + ptr_C_imag.get(), // Pointer to array of pointers to imaginary part of C matrix + + ldc, // Leading dimension of real part of C matrix + ldc, // Leading dimension of imaginary part of C matrix + + ptr_D_real.get(), // Pointer to array of pointers to real part of D 
matrix + ptr_D_imag.get(), // Pointer to array of pointers to imaginary part of D matrix + + ldd, // Leading dimension of real part of D matrix + ldd // Leading dimension of imaginary part of D matrix + ); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "CUTLASS internal error - configuration not supported" << std::endl; + return result; + } + } + + // Record an event when the GEMM operations have been launched. + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. + result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + if (handle.get_last_operation()) { + std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl; + } + + // Compute reference in device code + if (options.reference_check) { + + result.passed = true; + + for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) { + // Define the GEMM through templates + GemmPlanarComplex + (problem_size, options.alpha, + {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A}, + cutlass::ComplexTransform::kConjugate, + {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B}, + cutlass::ComplexTransform::kNone, + options.beta, + {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C}, + {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D} + ); + + Element epsilon = 0.1_hf; + Element nonzero_floor = 0.1_hf; + + result.passed = BlockCompareRelativelyEqual + ( + tensor_D.get() + idx * batch_stride_D, + tensor_D_ref.get() + idx * batch_stride_D, + batch_stride_D, + epsilon, + nonzero_floor + ); + } + + if (result.passed) std::cout << "Reference check passed." << std::endl; + else std::cerr << "Error - reference check failed." << std::endl; + } + + std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " GFLOPs: " << result.gflops << std::endl; + + return result; +} + + int main(int argc, char const **args) { + cudaDeviceProp props; + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + Options options; + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // Compute GEMM + DSA_FTD_ComplexGEMM_CUTLASS gemm(options); + gemm.testing = true; + Result result = gemm.run(options); + + return result.passed ? 
0 : -1; +} + diff --git a/legacy/dsaX_cutlass_interface.h b/legacy/dsaX_cutlass_interface.h new file mode 100644 index 0000000..5aa753e --- /dev/null +++ b/legacy/dsaX_cutlass_interface.h @@ -0,0 +1,172 @@ +#pragma once + +#include +#include +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor_planar_complex.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/reference/device/gemm_planar_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/library/handle.h" + +using namespace cutlass; +using namespace gemm; +using namespace library; +using namespace layout; +using namespace reference; +using namespace device; + +// Result structure +struct Result { + + double runtime_ms; + double gflops; + Status status; + cudaError_t error; + bool passed; + + Result(double runtime_ms = 0, double gflops = 0, Status status = Status::kSuccess, cudaError_t error = cudaSuccess): + runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { } +}; + +// Command line options parsing (testing) +struct Options { + + bool help; + GemmCoord problem_size; + int batch_count; + complex alpha; + complex beta; + bool reference_check; + int iterations; + + Options(): + help(false), + problem_size({1024, 1024, 1024}), + batch_count(1), + reference_check(false), + iterations(20), + alpha(1), + beta() { } + + // Parses the command line + void parse(int argc, char const **args) { + + CommandLine cmd(argc, args); + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + cmd.get_cmd_line_argument("m", problem_size.m()); + cmd.get_cmd_line_argument("n", problem_size.n()); + cmd.get_cmd_line_argument("k", problem_size.k()); + cmd.get_cmd_line_argument("batch", batch_count); + + cmd.get_cmd_line_argument("alpha", alpha.real()); + cmd.get_cmd_line_argument("alpha_i", alpha.imag()); + cmd.get_cmd_line_argument("beta", beta.real()); + cmd.get_cmd_line_argument("beta_i", beta.imag()); + + cmd.get_cmd_line_argument("iterations", iterations); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "dsaX_cutlass_interface\n\n" + << " This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --m= GEMM M dimension\n" + << " --n= GEMM N dimension\n" + << " --k= GEMM K dimension\n" + << " --batch= Number of GEMM operations executed in one batch\n" + << " --alpha= Epilogue scalar alpha (real part)\n" + << " --alpha_i= Epilogue scalar alpha (imaginary part)\n" + << " --beta= Epilogue scalar beta (real part)\n\n" + << " --beta_i= Epilogue scalar beta (imaginary part)\n\n" + << " --iterations= Number of profiling iterations to perform.\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of real-valued multiply-adds + int64_t fmas = problem_size.product() * batch_count * 4; + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +/// Performance test environment for planar complex +class DSA_FTD_ComplexGEMM_CUTLASS { + + // Half-precision input and output + using Element = half_t; + + // Configurations for layouts and internal computation + using LayoutA = ColumnMajor; + using LayoutB = ColumnMajor; + using LayoutC = ColumnMajor; + using ElementCompute = float; + using ElementAccumulator = float; + + Handle handle; + + GemmCoord problem_size; + int batch_count; + DeviceAllocation tensor_A; + DeviceAllocation tensor_B; + DeviceAllocation tensor_C; + DeviceAllocation tensor_D; + DeviceAllocation tensor_D_ref; + + DeviceAllocation ptr_A_real; + DeviceAllocation ptr_A_imag; + DeviceAllocation ptr_B_real; + DeviceAllocation ptr_B_imag; + DeviceAllocation ptr_C_real; + DeviceAllocation ptr_C_imag; + DeviceAllocation ptr_D_real; + DeviceAllocation ptr_D_imag; + + Element *ptr_A; + Element *ptr_B; + Element *ptr_C; + Element *ptr_D; + + int64_t batch_stride_A; + int64_t batch_stride_B; + int64_t batch_stride_C; + int64_t batch_stride_D; + + typename LayoutA::Stride::Index lda; + typename LayoutB::Stride::Index ldb; + typename LayoutC::Stride::Index ldc; + typename LayoutC::Stride::Index ldd; + + int64_t imag_stride_A; + int64_t imag_stride_B; + int64_t imag_stride_C; + int64_t imag_stride_D; + +public: + // Constructors + DSA_FTD_ComplexGEMM_CUTLASS(Options const &options); + DSA_FTD_ComplexGEMM_CUTLASS(); + + // Methods + void initialize(); + Result run(Options const &options); + + bool testing; +}; + diff --git a/src/dsaX_dbnic.c b/legacy/dsaX_dbnic.c similarity index 98% rename from src/dsaX_dbnic.c rename to legacy/dsaX_dbnic.c index 40407ee..83e3e4a 100644 --- a/src/dsaX_dbnic.c +++ b/legacy/dsaX_dbnic.c @@ -261,7 +261,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -294,7 +294,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu",block_size); + syslog(LOG_INFO, "main: have input and output block sizes %lu",block_size); uint64_t bytes_read = 0; char *block; uint64_t written, block_id; diff --git a/src/dsaX_dbnic.c.bak b/legacy/dsaX_dbnic.c.bak similarity index 100% rename from src/dsaX_dbnic.c.bak 
rename to legacy/dsaX_dbnic.c.bak diff --git a/src/dsaX_def.h b/legacy/dsaX_def.h similarity index 100% rename from src/dsaX_def.h rename to legacy/dsaX_def.h diff --git a/src/dsaX_fake.c b/legacy/dsaX_fake.c similarity index 96% rename from src/dsaX_fake.c rename to legacy/dsaX_fake.c index e68f19a..662ea37 100644 --- a/src/dsaX_fake.c +++ b/legacy/dsaX_fake.c @@ -175,7 +175,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -186,7 +186,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -235,7 +235,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; uint64_t npackets = block_out / 4608; char * block, * output_buffer; @@ -257,7 +257,7 @@ int main (int argc, char *argv[]) { fread(packet,4608,1,fin); fclose(fin); - syslog(LOG_INFO,"Read packet, npackets %llu",npackets); + syslog(LOG_INFO,"Read packet, npackets %lu",npackets); for (int i=0;idata_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; char * block, * output_buffer, * blockie; output_buffer = (char *)malloc(sizeof(char)*block_out); diff --git a/src/dsaX_makeFil.c b/legacy/dsaX_makeFil.c similarity index 100% rename from src/dsaX_makeFil.c rename to legacy/dsaX_makeFil.c diff --git a/src/dsaX_merge.c b/legacy/dsaX_merge.c similarity index 98% rename from src/dsaX_merge.c rename to legacy/dsaX_merge.c index 0154b80..7866d5f 100644 --- a/src/dsaX_merge.c +++ b/legacy/dsaX_merge.c @@ -255,7 +255,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -266,7 +266,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -277,7 +277,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_in2 = dada_hdu_create (); + hdu_in2 = dada_hdu_create (0); dada_hdu_set_key (hdu_in2, in_key2); if (dada_hdu_connect (hdu_in2) < 0) { syslog (LOG_ERR,"could not connect to input buffer2"); @@ -455,7 +455,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have 
input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; char * block1, * block2, * o1, * o2; char * output = (char *)malloc(sizeof(char)*block_out); diff --git a/src/dsaX_nicdb.c b/legacy/dsaX_nicdb.c similarity index 95% rename from src/dsaX_nicdb.c rename to legacy/dsaX_nicdb.c index 65cfdcc..df47ebe 100644 --- a/src/dsaX_nicdb.c +++ b/legacy/dsaX_nicdb.c @@ -369,7 +369,7 @@ int main(int argc, char ** argv) // DADA stuff - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -408,7 +408,7 @@ int main(int argc, char ** argv) // get block sizes and allocate memory uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have output block sizes %llu\n",block_out); + syslog(LOG_INFO, "main: have output block sizes %lu\n",block_out); uint64_t bytes_read = 0; char *output1, *output2; output1 = (char *)malloc(sizeof(char)*block_out*bdepth); diff --git a/src/dsaX_nicdb.c.bak b/legacy/dsaX_nicdb.c.bak similarity index 100% rename from src/dsaX_nicdb.c.bak rename to legacy/dsaX_nicdb.c.bak diff --git a/src/dsaX_reorder.c b/legacy/dsaX_reorder.c similarity index 98% rename from src/dsaX_reorder.c rename to legacy/dsaX_reorder.c index ed0b440..04955da 100644 --- a/src/dsaX_reorder.c +++ b/legacy/dsaX_reorder.c @@ -369,7 +369,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -380,7 +380,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -435,7 +435,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; char * block, * output_buffer; output_buffer = (char *)malloc(sizeof(char)*block_out); diff --git a/src/dsaX_reorder_raw.c b/legacy/dsaX_reorder_raw.c similarity index 98% rename from src/dsaX_reorder_raw.c rename to legacy/dsaX_reorder_raw.c index d1a7ca3..c0f6b0c 100644 --- a/src/dsaX_reorder_raw.c +++ b/legacy/dsaX_reorder_raw.c @@ -28,6 +28,9 @@ #include "dada_def.h" #include "dada_hdu.h" #include "ipcio.h" +// Forward declaration to keep compiler happy +// Possible minor bug in PSRDada +int ipcio_check_pending_sod (ipcio_t* ); #include "ipcbuf.h" #include "dada_affinity.h" #include "ascii_header.h" @@ -391,7 +394,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -402,7 +405,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out 
= dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -414,7 +417,7 @@ int main (int argc, char *argv[]) { } if (bf) { - hdu_out2 = dada_hdu_create (); + hdu_out2 = dada_hdu_create (0); dada_hdu_set_key (hdu_out2, out_key2); if (dada_hdu_connect (hdu_out2) < 0) { syslog (LOG_ERR,"could not connect to output buffer2"); @@ -501,7 +504,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; char * block, * output_buffer, * blockie; output_buffer = (char *)malloc(sizeof(char)*block_out); diff --git a/src/dsaX_reorder_raw.c.bak b/legacy/dsaX_reorder_raw.c.bak similarity index 100% rename from src/dsaX_reorder_raw.c.bak rename to legacy/dsaX_reorder_raw.c.bak diff --git a/src/dsaX_reorder_raw.c.bak2 b/legacy/dsaX_reorder_raw.c.bak2 similarity index 100% rename from src/dsaX_reorder_raw.c.bak2 rename to legacy/dsaX_reorder_raw.c.bak2 diff --git a/src/dsaX_simplesplit.c b/legacy/dsaX_simplesplit.c similarity index 97% rename from src/dsaX_simplesplit.c rename to legacy/dsaX_simplesplit.c index fb41432..7a80c7e 100644 --- a/src/dsaX_simplesplit.c +++ b/legacy/dsaX_simplesplit.c @@ -193,7 +193,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -204,7 +204,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -216,7 +216,7 @@ int main (int argc, char *argv[]) { } if (bf) { - hdu_out2 = dada_hdu_create (); + hdu_out2 = dada_hdu_create (0); dada_hdu_set_key (hdu_out2, out_key2); if (dada_hdu_connect (hdu_out2) < 0) { syslog (LOG_ERR,"could not connect to output buffer2"); @@ -298,7 +298,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; char * block, * output_buffer, * o1, * o2; output_buffer = (char *)malloc(sizeof(char)*block_out); diff --git a/src/dsaX_splice.c b/legacy/dsaX_splice.c similarity index 100% rename from src/dsaX_splice.c rename to legacy/dsaX_splice.c diff --git a/src/dsaX_split.c b/legacy/dsaX_split.c similarity index 98% rename from src/dsaX_split.c rename to legacy/dsaX_split.c index d5724cd..1361e86 100644 --- a/src/dsaX_split.c +++ b/legacy/dsaX_split.c @@ -135,7 +135,7 @@ void calc_stats(char *input) { } for (int i=0;idata_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); uint64_t nints = block_size / block_out; - 
syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; char * block, * output_buffer, * o1, * o2; output_buffer = (char *)malloc(sizeof(char)*block_out); diff --git a/src/dsaX_splitup.c b/legacy/dsaX_splitup.c similarity index 97% rename from src/dsaX_splitup.c rename to legacy/dsaX_splitup.c index 3a9ab10..32f055d 100644 --- a/src/dsaX_splitup.c +++ b/legacy/dsaX_splitup.c @@ -160,7 +160,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -171,7 +171,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -220,7 +220,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; uint64_t nsplits = block_size/block_out; char * block, * output_buffer; diff --git a/src/dsaX_store.c b/legacy/dsaX_store.c similarity index 95% rename from src/dsaX_store.c rename to legacy/dsaX_store.c index de53134..849c27c 100644 --- a/src/dsaX_store.c +++ b/legacy/dsaX_store.c @@ -112,7 +112,7 @@ int main (int argc, char *argv[]) { // open connection to the in/read DB - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to input buffer"); @@ -167,7 +167,7 @@ int main (int argc, char *argv[]) { char fnam[100]; - syslog(LOG_INFO, "have ngulps %d, blocksize %llu, bout %llu",ngulps,blocksize,bout); + syslog(LOG_INFO, "have ngulps %d, blocksize %lu, bout %lu",ngulps,blocksize,bout); // main reading loop @@ -202,7 +202,7 @@ int main (int argc, char *argv[]) { // for exiting if (bytes_read < blocksize) { observation_complete = 1; - syslog(LOG_INFO, "main: finished, with bytes_read %llu < expected %llu", bytes_read, blocksize); + syslog(LOG_INFO, "main: finished, with bytes_read %lu < expected %lu", bytes_read, blocksize); } // close block for reading diff --git a/src/dsaX_testdada.c b/legacy/dsaX_testdada.c similarity index 99% rename from src/dsaX_testdada.c rename to legacy/dsaX_testdada.c index c12d704..bbe7640 100644 --- a/src/dsaX_testdada.c +++ b/legacy/dsaX_testdada.c @@ -114,7 +114,7 @@ int main (int argc, char *argv[]) { } // DADA stuff - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); dada_hdu_connect (hdu_in); diff --git a/src/dsaX_trigger.c b/legacy/dsaX_trigger.c similarity index 95% rename from src/dsaX_trigger.c rename to legacy/dsaX_trigger.c index 26342a4..9592389 100644 --- a/src/dsaX_trigger.c +++ b/legacy/dsaX_trigger.c @@ -186,11 +186,11 @@ void control_thread (void * arg) { //specnum = (uint64_t)(strtoull(buffer,&endptr,0)*16); specnum = tmps; strcpy(footer_buf,tbuf); - 
syslog(LOG_INFO, "control_thread: received command to dump at %llu",specnum); + syslog(LOG_INFO, "control_thread: received command to dump at %lu",specnum); } if (dump_pending) - syslog(LOG_ERR, "control_thread: BACKED UP - CANNOT dump at %llu",tmps); + syslog(LOG_ERR, "control_thread: BACKED UP - CANNOT dump at %lu",tmps); if (!dump_pending) dump_pending = 1; @@ -341,7 +341,7 @@ int main (int argc, char *argv[]) { // open connection to the in/read DBs - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer"); @@ -352,7 +352,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output dada buffer"); @@ -525,9 +525,9 @@ int main (int argc, char *argv[]) { // DO writing using thread docopy = 1; - syslog(LOG_INFO, "written trigger from specnum %llu TRIGNUM%d DUMPNUM%d %s", specnum, trignum-1, dumpnum, footer_buf); + syslog(LOG_INFO, "written trigger from specnum %lu TRIGNUM%d DUMPNUM%d %s", specnum, trignum-1, dumpnum, footer_buf); ofile = fopen("/home/ubuntu/data/dumps.dat","a"); - fprintf(ofile,"written trigger from specnum %llu TRIGNUM%d DUMPNUM%d %s\n", specnum, trignum-1, dumpnum, footer_buf); + fprintf(ofile,"written trigger from specnum %lu TRIGNUM%d DUMPNUM%d %s\n", specnum, trignum-1, dumpnum, footer_buf); fclose(ofile); dumpnum++; @@ -539,7 +539,7 @@ int main (int argc, char *argv[]) { // if trigger arrived too late if (specnum < current_specnum-specs_per_block && dumping==0 && dump_pending==1) { - syslog(LOG_INFO, "trigger arrived too late: specnum %llu, current_specnum %llu",specnum,current_specnum); + syslog(LOG_INFO, "trigger arrived too late: specnum %lu, current_specnum %lu",specnum,current_specnum); bytes_copied=0; dump_pending=0; @@ -550,7 +550,7 @@ int main (int argc, char *argv[]) { } // update current spec - syslog(LOG_INFO,"current_specnum %llu",current_specnum); + syslog(LOG_INFO,"current_specnum %lu",current_specnum); if (block_count < skips) { block_count++; } @@ -561,7 +561,7 @@ int main (int argc, char *argv[]) { // for exiting if (bytes_read < block_size) { observation_complete = 1; - syslog(LOG_INFO, "main: finished, with bytes_read %llu < expected %llu\n", bytes_read, block_size); + syslog(LOG_INFO, "main: finished, with bytes_read %lu < expected %lu\n", bytes_read, block_size); } // close block for reading diff --git a/src/dsaX_wrangle.c b/legacy/dsaX_wrangle.c similarity index 98% rename from src/dsaX_wrangle.c rename to legacy/dsaX_wrangle.c index 5825ec6..19507d4 100644 --- a/src/dsaX_wrangle.c +++ b/legacy/dsaX_wrangle.c @@ -217,7 +217,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -228,7 +228,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -277,7 +277,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); 
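The dada_hdu_create () -> dada_hdu_create (0) change repeated throughout these hunks just passes an explicit null logger. A minimal sketch of the two call forms, assuming the PSRDada prototype dada_hdu_t * dada_hdu_create (multilog_t *) and the usual multilog_open () helper (the program name below is illustrative):

#include <stdlib.h>
#include "dada_hdu.h"
#include "multilog.h"

int main (void) {
  /* form used by the modified tools above: no multilog attached */
  dada_hdu_t * hdu_plain = dada_hdu_create (0);

  /* equivalent call with an explicit logger attached */
  multilog_t * log = multilog_open ("dsaX_example", 0);
  dada_hdu_t * hdu_logged = dada_hdu_create (log);

  dada_hdu_destroy (hdu_logged);
  dada_hdu_destroy (hdu_plain);
  return EXIT_SUCCESS;
}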
uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; char * block; uint64_t written, block_id; diff --git a/src/dsaX_wrangleAndWrite.c b/legacy/dsaX_wrangleAndWrite.c similarity index 100% rename from src/dsaX_wrangleAndWrite.c rename to legacy/dsaX_wrangleAndWrite.c diff --git a/src/dsaX_writeFil.c b/legacy/dsaX_writeFil.c similarity index 100% rename from src/dsaX_writeFil.c rename to legacy/dsaX_writeFil.c diff --git a/src/dsaX_writevis.c b/legacy/dsaX_writevis.c similarity index 100% rename from src/dsaX_writevis.c rename to legacy/dsaX_writevis.c diff --git a/src/dsaX_xgpu.cu b/legacy/dsaX_xgpu.cu similarity index 96% rename from src/dsaX_xgpu.cu rename to legacy/dsaX_xgpu.cu index a64217b..d065848 100644 --- a/src/dsaX_xgpu.cu +++ b/legacy/dsaX_xgpu.cu @@ -1,6 +1,8 @@ // -*- c++ -*- /* will run xgpu */ /* assumes input block size is appropriate */ +#define THRUST_IGNORE_CUB_VERSION_CHECK + #include #include using std::cout; @@ -177,7 +179,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -188,7 +190,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -283,7 +285,8 @@ int main (int argc, char *argv[]) { cudaMemcpy(d_din, tmp_data, context.array_len*sizeof(char),cudaMemcpyHostToDevice); promoter<<<6291456,32>>>(d_din,d_dout); - xgpu_error = xgpuCudaXengine(&context, (ComplexInput *)d_dout, syncOp); + //xgpu_error = xgpuCudaXengine(&context, (ComplexInput *)d_dout, syncOp); + xgpu_error = xgpuCudaXengine(&context, syncOp); xgpuClearDeviceIntegrationBuffer(&context); } @@ -315,7 +318,8 @@ int main (int argc, char *argv[]) { cudaDeviceSynchronize(); // run xgpu - xgpu_error = xgpuCudaXengine(&context, (ComplexInput *)d_dout, syncOp); + //xgpu_error = xgpuCudaXengine(&context, (ComplexInput *)d_dout, syncOp); + xgpu_error = xgpuCudaXengine(&context, syncOp); if(xgpu_error) { syslog(LOG_ERR, "xGPU error %d\n", xgpu_error); return EXIT_FAILURE; diff --git a/src/dumpfil.c b/legacy/dumpfil.c similarity index 98% rename from src/dumpfil.c rename to legacy/dumpfil.c index 0e658a5..0be913c 100644 --- a/src/dumpfil.c +++ b/legacy/dumpfil.c @@ -202,7 +202,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -236,7 +236,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - syslog(LOG_INFO, "main: have input block size %llu\n",block_size); + syslog(LOG_INFO, "main: have input block size %lu\n",block_size); uint64_t bytes_read = 0; uint64_t npackets = 1; char * block, * output_buffer; diff --git a/src/fil2dada.c b/legacy/fil2dada.c similarity index 95% rename from src/fil2dada.c rename to legacy/fil2dada.c index 
c2235ec..c49f2b5 100644 --- a/src/fil2dada.c +++ b/legacy/fil2dada.c @@ -94,7 +94,9 @@ void get_string(FILE *inputfile, int *nbytes, char string[]) } */ -/*int read_header(FILE *inputfile) +int read_header(FILE *inputfile); +/* +int read_header(FILE *inputfile) { size_t nRead; char string[80], message[80]; @@ -353,7 +355,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -364,7 +366,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -413,7 +415,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; uint64_t npackets = 1; char * block, * output_buffer; @@ -431,17 +433,19 @@ int main (int argc, char *argv[]) { syslog(LOG_ERR, "cannot open file - will write zeros"); } else { - - if (rhead) read_header(fin); -// fread(packet,block_out,1,fin); -// fclose(fin); -// syslog(LOG_INFO,"Read packet, npackets %llu",npackets); + // DMH: FIXME + //if (rhead) read_header(fin); -// for (int i=0;i +#include +#include +#include + +int main() { + + cutlass::half_t x = 2.25_hf; + + std::cout << x << std::endl; + + return 0; +} +*/ + +#include +#include + +#include + +int main() { + + // Define the GEMM operation + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, // ElementA + cutlass::layout::ColumnMajor, // LayoutA + cutlass::half_t, // ElementB + cutlass::layout::ColumnMajor, // LayoutB + cutlass::half_t, // ElementOutput + cutlass::layout::ColumnMajor, // LayoutOutput + float, // ElementAccumulator + cutlass::arch::OpClassTensorOp, // tag indicating Tensor Cores + cutlass::arch::Sm75 // tag indicating target GPU compute architecture + >; + + Gemm gemm_op; + cutlass::Status status; + + // + // Define the problem size + // + int M = 512; + int N = 256; + int K = 128; + + float alpha = 1.25f; + float beta = -1.25f; + + // + // Allocate device memory + // + + cutlass::HostTensor A({M, K}); + cutlass::HostTensor B({K, N}); + cutlass::HostTensor C({M, N}); + + cutlass::half_t const *ptrA = A.device_data(); + cutlass::half_t const *ptrB = B.device_data(); + cutlass::half_t const *ptrC = C.device_data(); + cutlass::half_t *ptrD = C.device_data(); + + int lda = A.device_ref().stride(0); + int ldb = B.device_ref().stride(0); + int ldc = C.device_ref().stride(0); + int ldd = C.device_ref().stride(0); + // + // Launch GEMM on the device + // + + status = gemm_op({ + {M, N, K}, + {ptrA, lda}, // TensorRef to A device tensor + {ptrB, ldb}, // TensorRef to B device tensor + {ptrC, ldc}, // TensorRef to C device tensor + {ptrD, ldd}, // TensorRef to D device tensor - may be the same as C + {alpha, beta} // epilogue operation arguments + }); + + if (status != cutlass::Status::kSuccess) { + return -1; + } else { + std::cout << "CUTLASS Success! 
" << std::endl; + } + + return 0; +} diff --git a/src/spectrometer_header.txt b/legacy/spectrometer_header.txt similarity index 100% rename from src/spectrometer_header.txt rename to legacy/spectrometer_header.txt diff --git a/src/splice_offline_beams.c b/legacy/splice_offline_beams.c similarity index 100% rename from src/splice_offline_beams.c rename to legacy/splice_offline_beams.c diff --git a/src/test_read.c b/legacy/test_read.c similarity index 99% rename from src/test_read.c rename to legacy/test_read.c index 0eefdc2..2b5730a 100644 --- a/src/test_read.c +++ b/legacy/test_read.c @@ -204,7 +204,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); diff --git a/src/test_write.c b/legacy/test_write.c similarity index 97% rename from src/test_write.c rename to legacy/test_write.c index b74e66b..32dd25d 100644 --- a/src/test_write.c +++ b/legacy/test_write.c @@ -28,6 +28,9 @@ #include "dada_def.h" #include "dada_hdu.h" #include "ipcio.h" +// Forward declaration to keep compiler happy +// Possible minor bug in PSRDada +int ipcio_check_pending_sod (ipcio_t* ); #include "ipcbuf.h" #include "dada_affinity.h" #include "ascii_header.h" @@ -261,7 +264,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -272,7 +275,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -358,7 +361,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; char * block, * output_buffer, * blockie; output_buffer = (char *)malloc(sizeof(char)*block_out); diff --git a/src/10_planar_complex.cu b/src/10_planar_complex.cu new file mode 100644 index 0000000..9e0915d --- /dev/null +++ b/src/10_planar_complex.cu @@ -0,0 +1,567 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Planar Complex GEMM + + This example demonstrates the CUTLASS Library's exposure of planar complex GEMM kernels supporting + the batched strided mode. + + These kernels represent complex matrices by storing the real and imaginary parts of the matrix in + disjoint regions in memory. These real-valued matrices are stored using existing cuBLAS layouts + as either column-major or row-major layouts with a single leading dimension indicating the stride + between columns or rows. + + The CUTLASS Library collects multiple template instantiations in a data structure and offers + a BLAS-like dispatch API to invoke the appropriate kernel on the Volta or Turing architectures. + + CUTLASS decouples matrix layout from complex transformation, so four possible transformations + are possible on the A and B operands: + + n: column-major + c: column-major complex conjugate + t: row-major + h: row-major complex conjugate + + The CUTLASS Library contains many kernel instances specialized for architecture, data type, tile + size, and alignment. This can result in long compile times. + + To build strictly the planar complex kernels needed for general application, execute the following + CMake command in an empty build directory. + + $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \ + -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex + + This builds all planar complex GEMM variants for Volta and Turing architectures. + + To build strictly the kernels needed for this example, an even narrower filter string may be + specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for + the 'CN' layout configuration (conjugate A operand with both A and B as column-major). + + $ cmake .. 
-DCUTLASS_NVCC_ARCHS="70;75;80" \ + -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_f16*cn + + $ make 10_planar_complex + + $ ./examples/10_planar_complex/10_planar_complex --m=2048 --n=1024 --k=512 --batch=10 +*/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor_planar_complex.h" + +#include "cutlass/util/reference/device/tensor_fill.h" + +#include "cutlass/util/reference/device/gemm_planar_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" + +#include "cutlass/library/handle.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Result structure +struct Result { + + double runtime_ms; + double gflops; + cutlass::Status status; + cudaError_t error; + bool passed; + + // + // Methods + // + + Result( + double runtime_ms = 0, + double gflops = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = cudaSuccess + ): + runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + + cutlass::gemm::GemmCoord problem_size; + int batch_count; + cutlass::complex alpha; + cutlass::complex beta; + + bool reference_check; + int iterations; + + Options(): + help(false), + problem_size({1024, 1024, 1024}), + batch_count(1), + reference_check(true), + iterations(20), + alpha(1), + beta() { } + + bool valid() { + return true; + } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + cmd.get_cmd_line_argument("m", problem_size.m()); + cmd.get_cmd_line_argument("n", problem_size.n()); + cmd.get_cmd_line_argument("k", problem_size.k()); + cmd.get_cmd_line_argument("batch", batch_count); + + cmd.get_cmd_line_argument("alpha", alpha.real()); + cmd.get_cmd_line_argument("alpha_i", alpha.imag()); + cmd.get_cmd_line_argument("beta", beta.real()); + cmd.get_cmd_line_argument("beta_i", beta.imag()); + + cmd.get_cmd_line_argument("iterations", iterations); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "10_planar_complex example\n\n" + << " This example uses the CUTLASS Library to execute Planar Complex GEMM computations.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --m= GEMM M dimension\n" + << " --n= GEMM N dimension\n" + << " --k= GEMM K dimension\n" + << " --batch= Number of GEMM operations executed in one batch\n" + << " --alpha= Epilogue scalar alpha (real part)\n" + << " --alpha_i= Epilogue scalar alpha (imaginary part)\n" + << " --beta= Epilogue scalar beta (real part)\n\n" + << " --beta_i= Epilogue scalar beta (imaginary part)\n\n" + << " --iterations= Number of profiling iterations to perform.\n\n"; + + out << "\n\nExamples:\n\n" + << "$ ./examples/10_planar_complex/10_planar_complex --batch=7 --m=1024 --n=512 --k=1024 \\\n" + << " --alpha=2 --alpha_i=-2 --beta=0.707 --beta_i=-.707\n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of real-valued multiply-adds + int64_t fmas = problem_size.product() * batch_count * 4; + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Performance test environment for planar complex +class TestbedPlanarComplex { +public: + + using ElementA = cutlass::half_t; + using LayoutA = cutlass::layout::ColumnMajor; + using ElementB = cutlass::half_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = cutlass::half_t; + using LayoutC = cutlass::layout::ColumnMajor; + using ElementCompute = float; + using ElementAccumulator = float; + + // + // Data members + // + + cutlass::library::Handle handle; + + cutlass::gemm::GemmCoord problem_size; + int batch_count; + cutlass::DeviceAllocation tensor_A; + cutlass::DeviceAllocation tensor_B; + cutlass::DeviceAllocation tensor_C; + cutlass::DeviceAllocation tensor_D; + cutlass::DeviceAllocation tensor_D_ref; + + // + // Methods + // + + TestbedPlanarComplex( + Options const &options + ): + problem_size(options.problem_size), batch_count(options.batch_count) { + + // Allocate device memory for batched strided GEMM + tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2); + tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2); + tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + } + + void initialize() { + + uint64_t seed = 1073; + + // Use small integers to simplify correctness checking + int scope_max = 6; + int scope_min = -6; + + cutlass::reference::device::BlockFillRandomUniform( + tensor_A.get(), tensor_A.size(), seed, ElementA(scope_max), ElementA(scope_min), 0); + + cutlass::reference::device::BlockFillRandomUniform( + tensor_B.get(), tensor_B.size(), seed * 2019, ElementB(scope_max), ElementB(scope_min), 0); + + cutlass::reference::device::BlockFillRandomUniform( + tensor_C.get(), tensor_C.size(), seed * 2020, ElementC(scope_max), ElementC(scope_min), 0); + } + + Result profile(Options const &options) { + + Result result; + + initialize(); + + ElementA *ptr_A = tensor_A.get(); + ElementB *ptr_B = tensor_B.get(); + ElementC *ptr_C = tensor_C.get(); + ElementC *ptr_D = tensor_D.get(); + + int64_t batch_stride_A 
= int64_t(problem_size.m()) * problem_size.k() * 2; + int64_t batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2; + int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2; + int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2; + + typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0); + typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0); + typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + + int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k(); + int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n(); + int64_t imag_stride_C = int64_t(problem_size.m()) * problem_size.n(); + int64_t imag_stride_D = int64_t(problem_size.m()) * problem_size.n(); + + // + // Construct events + // + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // Record an event at the start of a series of GEMMs + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // + // Run profiling loop + // + + for (int iter = 0; iter < options.iterations; ++iter) { + + // + // Execute the planar complex GEMM kernel via the CUTLASS Library's + // dispatch routines. + // + // Note, for planar complex GEMM kernels, all numeric type arguments + // specify the data type of the base real types. These are understood to + // apply to planar complex representations of matrices in memory and to complex + // structures for scalars. + // + // See tools/library/include/cutlass/library/handle.h for more details. 
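As a side note on the pointer arithmetic above: the planar-complex convention described in the file comment stores, for every batch element, a contiguous real plane followed immediately by its imaginary plane, which is why imag_stride_A is one plane (M*K elements) and batch_stride_A is two planes. A minimal host-side sketch of that offset arithmetic (sizes and variable names here are illustrative only, not part of the example):

#include <cstdint>
#include <cassert>

int main() {
  int64_t M = 1024, K = 1024;                      // example operand sizes
  int64_t imag_stride_A  = M * K;                  // elements in one plane
  int64_t batch_stride_A = 2 * M * K;              // real plane + imaginary plane

  int64_t idx = 3;                                 // some batch element
  int64_t real_offset = idx * batch_stride_A;                  // start of real plane
  int64_t imag_offset = idx * batch_stride_A + imag_stride_A;  // start of imaginary plane

  assert(imag_offset - real_offset == imag_stride_A);
  return 0;
}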
+ // + + result.status = handle.gemm_planar_complex( + problem_size.m(), // GEMM M dimension + problem_size.n(), // GEMM N dimension + problem_size.k(), // GEMM K dimension + + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued accumulation + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued alpha/beta scalars + + &options.alpha, // Pointer to alpha scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued A matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of A matrix + cutlass::library::ComplexTransform::kConjugate, // Complex transformation on A matrix operand + ptr_A, // Pointer to real part of A matrix + ptr_A + imag_stride_A, // Pointer to imaginary part of A matrix + lda, // Leading dimension of real part of A matrix + lda, // Leading dimension of imaginary part of A matrix + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued B matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of B matrix + cutlass::library::ComplexTransform::kNone, // Complex transformation on B matrix operand + ptr_B, // Pointer to real part of B matrix + ptr_B + imag_stride_B, // Pointer to imaginary part of B matrix + ldb, // Leading dimension of real part of B matrix + ldb, // Leading dimension of imaginary part of B matrix + + &options.beta, // Pointer to beta scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex valued C and D matrices + + ptr_C, // Pointer to real part of C matrix + ptr_C + imag_stride_C, // Pointer to imaginary part of C matrix + ldc, // Leading dimension of real part of C matrix + ldc, // Leading dimension of imaginary part of C matrix + + ptr_D, // Pointer to real part of D matrix + ptr_D + imag_stride_D, // Pointer to imaginary part of D matrix + ldd, // Leading dimension of real part of D matrix + ldd, // Leading dimension of imaginary part of D matrix + + batch_count, // Number of batched elements + + batch_stride_A, // Stride between batches of real parts of A matrix + batch_stride_A, // Stride between batches of imaginary parts of A matrix + + batch_stride_B, // Stride between batches of real parts of B matrix + batch_stride_B, // Stride between batches of imaginary parts of B matrix + + batch_stride_C, // Stride between batches of real parts of C matrix + batch_stride_C, // Stride between batches of imaginary parts of C matrix + + batch_stride_D, // Stride between batches of real parts of D matrix + batch_stride_D // Stride between batches of imaginary parts of D matrix + ); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "CUTLASS internal error - configuration not supported" << std::endl; + return result; + } + } + + // + // Stop profiling loop + // + + // Record an event when the GEMMs are complete + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. 
+ result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. + result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + if (handle.get_last_operation()) { + std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl; + } + + // + // Compute reference in device code + // + + if (options.reference_check) { + + result.passed = true; + + for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) { + cutlass::reference::device::GemmPlanarComplex< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator + >( + problem_size, + options.alpha, + {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A}, + cutlass::ComplexTransform::kConjugate, + {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B}, + cutlass::ComplexTransform::kNone, + options.beta, + {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C}, + {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D} + ); + + ElementC epsilon = 0.1_hf; + ElementC nonzero_floor = 0.1_hf; + + result.passed = cutlass::reference::device::BlockCompareRelativelyEqual( + tensor_D.get() + idx * batch_stride_D, + tensor_D_ref.get() + idx * batch_stride_D, + batch_stride_D, + epsilon, + nonzero_floor + ); + } + + if (result.passed) { + std::cout << "Reference check passed." << std::endl; + } + else { + std::cerr << "Error - reference check failed." << std::endl; + } + } + + std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " GFLOPs: " << result.gflops << std::endl; + + return result; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // + // This example uses mma.sync to directly access Tensor Cores to achieve peak performance. + // + // Volta Tensor Core operations are first available in CUDA 10.1 Toolkit. + // + // Turing Tensor Core operations are first available in CUDA 10.2 Toolkit. + // + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (props.major < 7) { + std::cerr << "Volta Tensor Core operations must be run on a machine with compute capability at least 70." + << std::endl; + + // Returning zero so this test passes on older architectures even though its actions are no-op. + return 0; + } + else if (props.major == 7 && props.minor <= 2) { + // + // If running on the Volta architecture, at least CUDA 10.1 Toolkit is required to run this example. + // + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { + std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." 
<< std::endl; + + // Returning zero so this test passes on older Toolkits even though its actions are no-op. + return 0; + } + } + else if (props.major == 7 && props.minor >= 5) { + // + // If running on the Turing architecture, at least CUDA 10.2 Toolkit is required to run this example. + // + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { + std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + + // Returning zero so this test passes on older Toolkits even though its actions are no-op. + return 0; + } + } + else { + // NVIDIA Ampere Architecture GPUs (SM80 and later) are fully supported on CUDA 11 Toolkit and beyond. + // + // fall through + } + + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // Execute one problem size + if (!options.valid()) { + std::cerr << "Invalid problem." << std::endl; + return -1; + } + + TestbedPlanarComplex testbed(options); + + Result result = testbed.profile(options); + + return result.passed ? 0 : -1; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/src/11_planar_complex_array.cu b/src/11_planar_complex_array.cu new file mode 100644 index 0000000..94dcc55 --- /dev/null +++ b/src/11_planar_complex_array.cu @@ -0,0 +1,627 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Planar Complex Array Example + + This example demonstrates the CUTLASS Library's exposure of planar complex GEMM kernels which + execute a batch of matrix products, loading problem sizes and matrix base pointers from arrays + in global memory. + + These kernels represent complex matrices by storing the real and imaginary parts of the matrix in + disjoint regions in memory. These real-valued matrices are stored using existing cuBLAS layouts + as either column-major or row-major layouts with a single leading dimension indicating the stride + between columns or rows. + + The CUTLASS Library collects multiple template instantiations in a data structure and offers + a BLAS-like dispatch API to invoke the appropriate kernel on the Volta or Turing architectures. + + CUTLASS decouples matrix layout from complex transformation, so four possible transformations + are possible on the A and B operands: + + n: column-major + c: column-major complex conjugate + t: row-major + h: row-major complex conjugate + + To build strictly the planar complex kernels needed for general application, execute the following + CMake command in an empty build directory. + + $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \ + -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex + + This builds all planar complex GEMM variants for Volta and Turing architectures. + + To build strictly the kernels needed for this example, an even narrower filter string may be + specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for + the 'CN' layout configuration (conjugate A operand with both A and B as column-major). + + $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \ + -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_array_f16*cn + + $ make 11_planar_complex_array + + $ ./examples/11_planar_complex_array/11_planar_complex_array --m=2048 --n=1024 --k=512 --batch=10 +*/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor_planar_complex.h" + +#include "cutlass/util/reference/device/tensor_fill.h" + +#include "cutlass/util/reference/device/gemm_planar_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" + +#include "cutlass/library/handle.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Result structure +struct Result { + + double runtime_ms; + double gflops; + cutlass::Status status; + cudaError_t error; + bool passed; + + // + // Methods + // + + Result( + double runtime_ms = 0, + double gflops = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = cudaSuccess + ): + runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + + cutlass::gemm::GemmCoord problem_size; + int batch_count; + cutlass::complex alpha; + cutlass::complex beta; + + bool reference_check; + int iterations; + + Options(): + help(false), + problem_size({1024, 1024, 1024}), + batch_count(1), + reference_check(true), + iterations(20), + alpha(1), + beta() { } + + bool valid() { + return true; + } + + // Parses the command line + void parse(int argc, char const **args) { + 
cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + cmd.get_cmd_line_argument("m", problem_size.m()); + cmd.get_cmd_line_argument("n", problem_size.n()); + cmd.get_cmd_line_argument("k", problem_size.k()); + cmd.get_cmd_line_argument("batch", batch_count); + + cmd.get_cmd_line_argument("alpha", alpha.real()); + cmd.get_cmd_line_argument("alpha_i", alpha.imag()); + cmd.get_cmd_line_argument("beta", beta.real()); + cmd.get_cmd_line_argument("beta_i", beta.imag()); + + cmd.get_cmd_line_argument("iterations", iterations); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "11_planar_complex_array example\n\n" + << " This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --m= GEMM M dimension\n" + << " --n= GEMM N dimension\n" + << " --k= GEMM K dimension\n" + << " --batch= Number of GEMM operations executed in one batch\n" + << " --alpha= Epilogue scalar alpha (real part)\n" + << " --alpha_i= Epilogue scalar alpha (imaginary part)\n" + << " --beta= Epilogue scalar beta (real part)\n\n" + << " --beta_i= Epilogue scalar beta (imaginary part)\n\n" + << " --iterations= Number of profiling iterations to perform.\n"; + + out << "\n\nExamples:\n\n" + << "$ ./examples/11_planar_complex_array/11_planar_complex_array\n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of real-valued multiply-adds + int64_t fmas = problem_size.product() * batch_count * 4; + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Performance test environment for planar complex +class TestbedPlanarComplex { +public: + + // Half-precision input and output + using Element = cutlass::half_t; + + // Configurations for layouts and internal computation + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using ElementCompute = float; + using ElementAccumulator = float; + + // + // Data members + // + + cutlass::library::Handle handle; + + cutlass::gemm::GemmCoord problem_size; + int batch_count; + cutlass::DeviceAllocation tensor_A; + cutlass::DeviceAllocation tensor_B; + cutlass::DeviceAllocation tensor_C; + cutlass::DeviceAllocation tensor_D; + cutlass::DeviceAllocation tensor_D_ref; + + cutlass::DeviceAllocation ptr_A_real; + cutlass::DeviceAllocation ptr_A_imag; + cutlass::DeviceAllocation ptr_B_real; + cutlass::DeviceAllocation ptr_B_imag; + cutlass::DeviceAllocation ptr_C_real; + cutlass::DeviceAllocation ptr_C_imag; + cutlass::DeviceAllocation ptr_D_real; + cutlass::DeviceAllocation ptr_D_imag; + + // + // Methods + // + + TestbedPlanarComplex( + Options const &options + ): + problem_size(options.problem_size), batch_count(options.batch_count) { + + // Allocate device memory for batched planar complex GEMM + tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2); + tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2); + tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D_ref.reset(int64_t(problem_size.m()) * 
problem_size.n() * batch_count * 2); + + ptr_A_real.reset(batch_count); + ptr_A_imag.reset(batch_count); + ptr_B_real.reset(batch_count); + ptr_B_imag.reset(batch_count); + ptr_C_real.reset(batch_count); + ptr_C_imag.reset(batch_count); + ptr_D_real.reset(batch_count); + ptr_D_imag.reset(batch_count); + + } + + void initialize() { + + uint64_t seed = 1073; + + // Use small integers to simplify correctness checking + int scope_max = 6; + int scope_min = -6; + + cutlass::reference::device::BlockFillRandomUniform( + tensor_A.get(), tensor_A.size(), seed, Element(scope_max), Element(scope_min), 0); + + cutlass::reference::device::BlockFillRandomUniform( + tensor_B.get(), tensor_B.size(), seed * 2019, Element(scope_max), Element(scope_min), 0); + + cutlass::reference::device::BlockFillRandomUniform( + tensor_C.get(), tensor_C.size(), seed * 2020, Element(scope_max), Element(scope_min), 0); + } + + Result profile(Options const &options) { + + Result result; + + initialize(); + + Element *ptr_A = tensor_A.get(); + Element *ptr_B = tensor_B.get(); + Element *ptr_C = tensor_C.get(); + Element *ptr_D = tensor_D.get(); + + int64_t batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2; + int64_t batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2; + int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2; + int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2; + + typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0); + typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0); + typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + + int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k(); + int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n(); + int64_t imag_stride_C = int64_t(problem_size.m()) * problem_size.n(); + int64_t imag_stride_D = int64_t(problem_size.m()) * problem_size.n(); + + // + // Configure pointers in global memory + // + + struct { + Element *base; + void **ptr_real; + void **ptr_imag; + int64_t batch_stride; + int64_t imag_stride; + } tensors[] = { + { tensor_A.get(), ptr_A_real.get(), ptr_A_imag.get(), batch_stride_A, imag_stride_A}, + { tensor_B.get(), ptr_B_real.get(), ptr_B_imag.get(), batch_stride_B, imag_stride_B}, + { tensor_C.get(), ptr_C_real.get(), ptr_C_imag.get(), batch_stride_C, imag_stride_C}, + { tensor_D.get(), ptr_D_real.get(), ptr_D_imag.get(), batch_stride_D, imag_stride_D} + }; + + for (auto const &tensor : tensors) { + for (int idx = 0; idx < batch_count; ++idx) { + + void *ptr_real = tensor.base + idx * tensor.batch_stride; + void *ptr_imag = tensor.base + idx * tensor.batch_stride + tensor.imag_stride; + + cudaError_t error = cudaMemcpy( + tensor.ptr_real + idx, + &ptr_real, + sizeof(void *), + cudaMemcpyHostToDevice); + + if (error != cudaSuccess) { + throw std::runtime_error("Failed to copy pointer to device memory"); + } + + error = cudaMemcpy( + tensor.ptr_imag + idx, + &ptr_imag, + sizeof(void *), + cudaMemcpyHostToDevice); + + if (error != cudaSuccess) { + throw std::runtime_error("Failed to copy pointer to device memory"); + } + } + } + + // + // Construct events + // + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + 
std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // Record an event at the start of a series of GEMM operations + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // + // Run profiling loop + // + + for (int iter = 0; iter < options.iterations; ++iter) { + + // + // Execute the planar complex array GEMM kernel via the CUTLASS Library's + // dispatch routines. + // + // Note, for planar complex array GEMM kernels, all numeric type arguments + // specify the data type of the base real types. These are understood to + // apply to planar complex representations of matrices in memory and to complex + // structures for scalars. + // + // See tools/library/include/cutlass/library/handle.h for more details. + // + + result.status = handle.gemm_planar_complex_array( + + problem_size.m(), // expected GEMM M dimension + problem_size.n(), // expected GEMM N dimension + problem_size.k(), // expected GEMM K dimension + batch_count, // Number of batched elements + + nullptr, + nullptr, + nullptr, + + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued accumulation + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued alpha/beta scalars + + &options.alpha, // Pointer to alpha scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued A matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of A matrix + cutlass::library::ComplexTransform::kConjugate, // Complex transformation on A matrix operand + + ptr_A_real.get(), // Pointer to array of pointers to real part of A matrix + ptr_A_imag.get(), // Pointer to array of pointers to imaginary part of A matrix + + lda, // Leading dimension of real part of A matrix + lda, // Leading dimension of imaginary part of A matrix + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued B matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of B matrix + cutlass::library::ComplexTransform::kNone, // Complex transformation on B matrix operand + + ptr_B_real.get(), // Pointer to array of pointers to real part of B matrix + ptr_B_imag.get(), // Pointer to array of pointers to imaginary part of B matrix + + ldb, // Leading dimension of real part of B matrix + ldb, // Leading dimension of imaginary part of B matrix + + &options.beta, // Pointer to beta scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex valued C and D matrices + + ptr_C_real.get(), // Pointer to array of pointers to real part of C matrix + ptr_C_imag.get(), // Pointer to array of pointers to imaginary part of C matrix + + ldc, // Leading dimension of real part of C matrix + ldc, // Leading dimension of imaginary part of C matrix + + ptr_D_real.get(), // Pointer to array of pointers to real part of D matrix + ptr_D_imag.get(), // Pointer to array of pointers to imaginary part of D matrix + + ldd, // Leading dimension of real part of D matrix + ldd // Leading dimension of imaginary part of D matrix + ); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "CUTLASS internal error - configuration not supported" << std::endl; + return result; + } + } + + // + // Stop profiling loop + // + + // Record an event when the GEMM operations have been launched. 
+ result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. + result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + if (handle.get_last_operation()) { + std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl; + } + + // + // Compute reference in device code + // + + if (options.reference_check) { + + result.passed = true; + + for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) { + cutlass::reference::device::GemmPlanarComplex< + Element, LayoutA, + Element, LayoutB, + Element, LayoutC, + ElementAccumulator + >( + problem_size, + options.alpha, + {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A}, + cutlass::ComplexTransform::kConjugate, + {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B}, + cutlass::ComplexTransform::kNone, + options.beta, + {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C}, + {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D} + ); + + Element epsilon = 0.1_hf; + Element nonzero_floor = 0.1_hf; + + result.passed = cutlass::reference::device::BlockCompareRelativelyEqual( + tensor_D.get() + idx * batch_stride_D, + tensor_D_ref.get() + idx * batch_stride_D, + batch_stride_D, + epsilon, + nonzero_floor + ); + } + + if (result.passed) { + std::cout << "Reference check passed." << std::endl; + } + else { + std::cerr << "Error - reference check failed." << std::endl; + } + } + + std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " GFLOPs: " << result.gflops << std::endl; + + return result; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // + // This example uses mma.sync to directly access Tensor Cores to achieve peak performance. + // + // Volta Tensor Core operations are first available in CUDA 10.1 Toolkit. + // + // Turing Tensor Core operations are first available in CUDA 10.2 Toolkit. + // + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (props.major < 7) { + std::cerr << "Tensor Core operations must be run on a machine with compute capability at least 70." + << std::endl; + + // Returning zero so this passes on older architectures. Its actions are no-op. + return 0; + } + else if (props.major == 7 && props.minor <= 2) { + // + // If running on the Volta architecture, at least CUDA 10.1 Toolkit is required to run this example. 
+ // + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { + std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl; + + // Returning zero so this passes on older Toolkits. Its actions are no-op. + return 0; + } + } + else if (props.major == 7 && props.minor >= 5) { + // + // If running on the Turing architecture, at least CUDA 10.2 Toolkit is required to run this example. + // + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { + std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + + // Returning zero so this passes on older Toolkits. Its actions are no-op. + return 0; + } + } + else { + // NVIDIA Ampere Architecture GPUs (SM80 and later) are fully supported on CUDA 11 Toolkit and beyond. + // + // fall through + } + + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // Execute one problem size + if (!options.valid()) { + std::cerr << "Invalid problem." << std::endl; + return -1; + } + + TestbedPlanarComplex testbed(options); + + Result result = testbed.profile(options); + + return result.passed ? 0 : -1; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..de05a16 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,120 @@ +include_directories(${CMAKE_SOURCE_DIR}/include) + +if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA) + add_compile_definitions(DSA_XENGINE_TARGET_CUDA) +endif() + +if(DSA_XENGINE_TARGET_TYPE STREQUAL CPU) + add_compile_definitions(DSA_XENGINE_TARGET_CPU) +endif() + +# DSA Fast Time Domain library +#----------------------------- +set(DSAX_OBJS + cuda_interface.cu + cublas_interface.cu + malloc.cu + dsaX_ptr.cpp + cuda_handles.cu + magma_interface.cu + blas_interface.cpp + beamformer.cpp + dsaX_base.cpp + + correlator.cpp + interface.cpp + utils.cpp + params.cpp + psrdada_utils.cpp + ) + +# split source into cu and cpp files +foreach(item ${DSAX_OBJS}) + string(REGEX MATCH ".+\\.cu$" item_match ${item}) + if(item_match) + list(APPEND DSAX_CU_OBJS ${item}) + endif(item_match) +endforeach(item ${DSAX_OBJS}) + +list(REMOVE_ITEM DSAX_OBJS ${DSAX_CU_OBJS}) + +# DSAX_CU_OBJS should contain all cuda files now and DSAX_OBJS all cpp. 
+# If we have a git version, make version.cpp depend on git head so that it is +# rebuilt if the git sha changed +if(GITVERSION) + find_path( + DSAX_GITDIR NAME HEAD + PATHS ${CMAKE_SOURCE_DIR}/.git/logs + NO_DEFAULT_PATH) + include(AddFileDependencies) + if(DSAX_GITDIR) + add_file_dependencies(version.cpp ${DSAX_GITDIR}/HEAD) + endif() +endif() +mark_as_advanced(DSAX_GITDIR) + +# generate a cmake object library for all cpp files first +add_library(dsax_cpp OBJECT ${DSAX_OBJS}) + +if(DSA_XENGINE_BUILD_SHAREDLIB) + set_target_properties(dsax_cpp PROPERTIES POSITION_INDEPENDENT_CODE TRUE) + add_library(dsaX SHARED) +else() + add_library(dsaX STATIC) +endif() +add_library(DSA_XENGINE::dsaX ALIAS dsaX) + +# make one library +target_sources(dsaX PRIVATE $ ${DSAX_CU_OBJS}) + +if(CUDAToolkit_FOUND) + target_link_libraries(dsaX INTERFACE CUDA::cuda_driver CUDA::cudart_static ${CUDA_cublas_LIBRARY}) +endif() + +if(DSA_XENGINE_ENABLE_PSRDADA) + include_directories(${PSRDada_SOURCE_DIR}/src) + set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so) + target_link_libraries(dsaX PUBLIC ${PSRDada_LIB}) +endif() + +if(DSA_XENGINE_ENABLE_XGPU) + include_directories(${xGPU_SOURCE_DIR}/src) + set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a) + target_link_libraries(dsaX PUBLIC ${XGPU_LIB}) +endif() + +if(DSA_XENGINE_ENABLE_CUTLASS) + include_directories(${NvidiaCutlass_DIR}/../../../include) + include_directories(${NvidiaCutlass_DIR}/../../../include/cutlass/util) + set(NvidiaCutlass_LIB ${NvidiaCutlass_DIR}/../../../lib64/libcutlass.so) + target_link_libraries(dsaX PUBLIC ${NvidiaCutlass_LIB}) + + # Some simple CUTLASS examples to test linking/benching + #------------------------------------------------------ + add_executable(planar_complex planar_complex.cu) + target_link_libraries(planar_complex ${NvidiaCutlass_LIB}) + + add_executable(10_planar_complex 10_planar_complex.cu) + target_link_libraries(10_planar_complex ${NvidiaCutlass_LIB}) + + add_executable(11_planar_complex_array 11_planar_complex_array.cu) + target_link_libraries(11_planar_complex_array ${NvidiaCutlass_LIB}) + #------------------------------------------------------ + + # DSA Fast Time Domain CUTLASS interface + #--------------------------------------- + add_executable(dsaX_cutlass_interface dsaX_cutlass_interface.cu) + target_link_libraries(dsaX_cutlass_interface ${NvidiaCutlass_LIB}) + #--------------------------------------- +endif() +#--------------------- + +# install step for libraray +#----------------------------- +install(TARGETS + # cmake-format: sortable + dsaX + LIBRARY DESTINATION + lib + ) +#----------------------------- diff --git a/src/beamformer.cpp b/src/beamformer.cpp new file mode 100644 index 0000000..e99a54c --- /dev/null +++ b/src/beamformer.cpp @@ -0,0 +1,120 @@ +// -*- c++ -*- +/* assumes input and output block size is appropriate - will seg fault otherwise*/ +/* +Workflow is similar for BF and corr applications + - copy data to GPU, convert to half-precision and calibrate while reordering + - do matrix operations to populate large output vector + */ + +#include +#include + +#include "dsaX_def.h" +#include "dsaX.h" +#include "blas_interface.h" +#include "utils.h" +#include "psrdada_utils.h" + +using namespace std; + +/* +Beamformer: + - initial data is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] + - split into EW and NS antennas via cudaMemcpy: [NPACKETS_PER_BLOCK, NANTS/2, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] + - want [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, 
NANTS/2, 8chan, 2 times, 2 pol, 4-bit complex] +(single transpose operation) + - weights are [NCHAN_PER_PACKET/8, NBEAMS, 4tim, NANTS/2, 8chan, 2 times, 2 pol] x 2 + - then fluff and run beamformer: output is [NCHAN_PER_PACKET/8, NBEAMS, NPACKETS_PER_BLOCK/4] (w column-major) + - transpose and done! + +*/ +// beamformer function +void dbeamformer(bf_handle *d) { + + dsaXBLASParam blas_param; + blas_param.trans_a = DSA_BLAS_OP_T; + blas_param.trans_b = DSA_BLAS_OP_N; + blas_param.m = NPACKETS_PER_BLOCK/4; + blas_param.n = NBEAMS/2; + blas_param.k = 4*(NANTS/2)*8*2*2; + blas_param.alpha = 1.0; + blas_param.lda = blas_param.k; + blas_param.ldb = blas_param.k; + blas_param.beta = 0.0; + blas_param.ldc = blas_param.m; + blas_param.a_stride = (NPACKETS_PER_BLOCK)*(NANTS/2)*8*2*2; + blas_param.b_stride = (NBEAMS/2)*4*(NANTS/2)*8*2*2; + blas_param.c_stride = (NPACKETS_PER_BLOCK/4)*NBEAMS/2; + blas_param.batch_count = NCHAN_PER_PACKET/8; + + long long int i1, i2; + + // timing + // copy, prepare, cublas, output + clock_t begin, end; + + // do big memcpy + begin = clock(); + dsaXmemcpy(d->d_big_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4, dsaXMemcpyHostToDevice); + end = clock(); + d->cp += (float)(end - begin) / CLOCKS_PER_SEC; + + // loop over halves of the array + for (int iArm=0;iArm<2;iArm++) { + + // zero out output arrays + dsaXmemset(d->d_bigbeam_r,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(short)); + dsaXmemset(d->d_bigbeam_i,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(short)); + dsaXDeviceSynchronize(); + + // copy data to device + // initial data: [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] + // final data: need to split by NANTS. + begin = clock(); + for (i1=0; i1d_input + i1*(NANTS/2)*NCHAN_PER_PACKET*4, + d->d_big_input + i1*(NANTS)*NCHAN_PER_PACKET*4+iArm*(NANTS/2)*NCHAN_PER_PACKET*4, + (NANTS/2)*NCHAN_PER_PACKET*4, dsaXMemcpyDeviceToDevice); + end = clock(); + d->cp += (float)(end - begin) / CLOCKS_PER_SEC; + + // do reorder and fluff of data to real and imag + begin = clock(); + + // DMH: Abstract the launch parameters + std::vector dimBlock = {16, 8}; + std::vector dimGrid = {NCHAN_PER_PACKET/8/16, (NPACKETS_PER_BLOCK)*(NANTS/2)/16}; + transposeInputBeamformer((double *)(d->d_input), (double *)(d->d_tx), dimBlock, dimGrid); + + int blocks = NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128; + int tpb = 128; + fluffInputBeamformer(d->d_tx, d->d_br, d->d_bi, blocks, tpb); + end = clock(); + d->prep += (float)(end - begin) / CLOCKS_PER_SEC; + + // set up for gemm + i2 = iArm*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8); // weights offset + blas_param.b_offset = i2; + // large matrix multiply to get real and imag outputs + begin = clock(); + dsaXHgemmStridedBatched(d->d_br, d->d_bi, d->weights_r, d->weights_i, d->d_bigbeam_r, d->d_bigbeam_i, blas_param); + end = clock(); + d->cubl += (float)(end - begin) / CLOCKS_PER_SEC; + + // simple formation of total power and scaling to 8-bit in transpose kernel + // Reuse dimBlock + //DMH: Abstract kernel launch parameters + dimGrid[0] = (NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16; + dimGrid[1] = (NCHAN_PER_PACKET/8)/16; + begin = clock(); + transposeScaleBeamformer(d->d_bigbeam_r, d->d_bigbeam_i, d->d_bigpower + iArm*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2), dimBlock, dimGrid); + end = clock(); + d->outp += (float)(end - begin) / CLOCKS_PER_SEC; + } + + // form sum over times + int blocks = 24576; + int tpb = 512; + // COMMENT OUT WHEN 
DONE!!! + //sumBeam(d->d_bigpower, d->d_chscf, blocks, tpb); +} diff --git a/src/blas_interface.cpp b/src/blas_interface.cpp new file mode 100644 index 0000000..ed76f05 --- /dev/null +++ b/src/blas_interface.cpp @@ -0,0 +1,28 @@ +#include + +#include "dsaX.h" +#include "cublas_interface.h" +#include "magma_interface.h" + +void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param, int stream) { + switch (param.blas_lib) { + case DSA_BLAS_LIB_CUBLAS: + dsaXHgemmStridedBatchedCuda(real_a, imag_a, real_b, imag_b, real_c, imag_c, param, stream); + break; + case DSA_BLAS_LIB_MAGMA: + //dsaXHgemmStridedBatchedMagma(real_a, imag_a, real_b, imag_b, real_c, imag_c, param, stream); + break; + case DSA_BLAS_LIB_CUTLASS: + //dsaXHgemmStridedBatchedCutlass(real_a, imag_a, real_b, imag_b, real_c, imag_c, param); + break; + case DSA_BLAS_LIB_OPENBLAS: + //dsaXHgemmStridedBatchedOpenblas(real_a, imag_a, real_b, imag_b, real_c, imag_c, param); + break; + case DSA_BLAS_LIB_TCC: + //dsaXHgemmStridedBatchedTcc(real_a, imag_a, real_b, imag_b, real_c, imag_c, param); + break; + default: + std::cout << "dsaX Error: Unknown blas_lib " << param.blas_lib << " given." << std::endl; + exit(0); + } +} diff --git a/src/correlator.cpp b/src/correlator.cpp new file mode 100644 index 0000000..2662e58 --- /dev/null +++ b/src/correlator.cpp @@ -0,0 +1,285 @@ +// -*- c++ -*- +/* assumes input and output block size is appropriate - will seg fault otherwise*/ +/* +Workflow is similar for BF and corr applications + - copy data to GPU, convert to half-precision and calibrate while reordering + - do matrix operations to populate large output vector + */ + +#include +#include + +#include "dsaX_def.h" +#include "dsaX.h" +#include "fast_time_domain.h" +#include "blas_interface.h" +#include "utils.h" +#include "psrdada_utils.h" + +using namespace std; + +Correlator::Correlator(const dsaXCorrParam *param) { + + // Transfer passed param to internal objects + corr_param = *param; + d.corr_param = *param; + + // Select back end BLAS engine + blas_param.struct_size = sizeof(blas_param); + blas_param.blas_type = DSA_BLAS_GEMM; + blas_param.blas_lib = corr_param.blas_lib; + + // Streams will be class specific + // so launch and destroy in the class + initStreams(corr_param.n_streams); + + // Initialise device memeory + d.dev_malloc_timer.start(); + initDsaXCorrDeviceMemory(&d, corr_param.n_streams); + d.dev_malloc_timer.stop(); + + // Compute indices + computeIndices(&d); + + // gemm settings + // input: [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS] + // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS] +#if defined OLD_BLAS + //cout << "Old params" << endl; + blas_param.data_order = DSA_BLAS_DATAORDER_COL; + blas_param.trans_a = DSA_BLAS_OP_A; + blas_param.trans_b = DSA_BLAS_OP_T; + blas_param.m = NANTS; + blas_param.n = NANTS; + blas_param.k = NPACKETS_PER_BLOCK/halfFac; + blas_param.alpha = 1.0; + blas_param.lda = blas_param.m; + blas_param.ldb = blas_param.n; + blas_param.beta = 0.; + blas_param.ldc = blas_param.m; + blas_param.a_stride = NPACKETS_PER_BLOCK*NANTS/halfFac; + blas_param.b_stride = NPACKETS_PER_BLOCK*NANTS/halfFac; + blas_param.c_stride = NANTS*NANTS; + blas_param.batch_count = NCHAN_PER_PACKET*2*2*halfFac; + blas_param.a_offset = 0; + blas_param.b_offset = 0; + blas_param.c_offset = 0; +#else + //cout << "My params" << endl; + blas_param.data_order = DSA_BLAS_DATAORDER_ROW; + blas_param.trans_a = DSA_BLAS_OP_C; + 
blas_param.trans_b = DSA_BLAS_OP_N; + blas_param.m = NANTS; + blas_param.n = NANTS; + blas_param.k = NPACKETS_PER_BLOCK/halfFac; + blas_param.alpha = 1.0; + blas_param.lda = blas_param.m; + blas_param.ldb = blas_param.n; + blas_param.beta = 0.; + blas_param.ldc = blas_param.m; + blas_param.a_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;; + blas_param.b_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;; + blas_param.c_stride = NANTS*NANTS; + blas_param.batch_count = NCHAN_PER_PACKET*2*2*halfFac; + blas_param.a_offset = 0; + blas_param.b_offset = 0; + blas_param.c_offset = 0; +#endif + + // Swap A and B if in row order + if (blas_param.data_order == DSA_BLAS_DATAORDER_ROW) { + swap(blas_param.m, blas_param.n); + swap(blas_param.lda, blas_param.ldb); + swap(blas_param.trans_a, blas_param.trans_b); + swap(blas_param.a_offset, blas_param.b_offset); + swap(blas_param.a_stride, blas_param.b_stride); + //swap(A_data, B_data); + //swap(A_data, B_data); + } + + printDsaXBLASParam(blas_param); + + flops = 8; // 8 complex flops per element + flops *= blas_param.m; + flops *= blas_param.n; + flops *= blas_param.k; + flops *= blas_param.batch_count; + + cout << "Correlator flops = 2*M*N*K * batch = (" << 2 << "*"<< blas_param.m << "*" << blas_param.n << "*" << blas_param.k << "*" << blas_param.batch_count << ") = " << flops << endl; + cout << "Correlator Gflop = " << (1e-9)*flops << endl; + + // DMH: reset counters method + +} + +Correlator::~Correlator() { + + // Clean up memory + destroyDsaXCorrDeviceMemory(&d); + destroyStreams(); + + // Transfer metrics to + double device_malloc_time = (1.0*d.dev_malloc_timer.elapsed().count())/(1e6); + double host_malloc_time = (1.0*d.host_malloc_timer.elapsed().count())/(1e6); + double device_compute_time = (1.0*d.dev_compute_timer.elapsed().count())/(1e6); + cout << "Correlator malloc time device = " << device_malloc_time << " seconds." << endl; + cout << "Correlator malloc time host = " << host_malloc_time << " seconds." << endl; + cout << "Correlator compute time device = " << device_compute_time << " seconds. " << endl; + + double h2d_time = (1.0*d.H2D_timer.elapsed().count())/(1e6); + cout << "Correlator H2D time = " << h2d_time << " seconds. "; + cout << "Bandwidth " << (1.0*d.H2D_bytes)/pow(1024,3) / h2d_time << " Gbytes/second." << endl; + + double d2h_time = (1.0*d.D2H_timer.elapsed().count())/(1e6); + cout << "Correlator D2H time = " << d2h_time << " seconds. "; + cout << "Bandwidth " << (1.0*d.D2H_bytes)/pow(1024,3) / d2h_time << " Gbytes/second." << endl; + + double h2h_time = (1.0*d.H2H_timer.elapsed().count())/(1e6); + cout << "Correlator H2H time = " << h2h_time << " seconds. "; + cout << "Bandwidth " << (1.0*d.H2H_bytes)/pow(1024,3) / h2h_time << " Gbytes/second." << endl; + + double total = device_malloc_time + host_malloc_time + device_compute_time + h2d_time + d2h_time; + cout << "Correlator TOTAL time = " << total << " seconds. " << endl; + + double Tflops = (1.0*d.dev_compute_timer.iterations()*(1e-12*flops)/device_compute_time); + cout << "Correlator Tflops = " << Tflops << endl; +} + +void Correlator::compute(void *output, void *input) { + + uint64_t in_stream_block = sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2; + uint64_t out_stream_block = sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2; + + unsigned int n_streams = corr_param.n_streams; + + // Ensure output array is zero + dsaXmemset(d.d_output, 0, n_streams * out_stream_block); + + // Loop over the array in streams for concurrency. 
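The operand swap in the constructor above (exchanging m/n, lda/ldb, the transpose flags, offsets and strides when the data order is row-major) rests on the identity (A*B)^T = B^T * A^T: a row-major C is exactly the column-major storage of C^T, so a column-major GEMM can produce it by multiplying the operands in reverse order. A minimal numeric check of that identity, independent of the dsaX types (all names below are local to the sketch):

#include <array>
#include <cassert>

// 2x2 column-major matrix: element (i,j) lives at data[i + 2*j].
using Mat = std::array<double, 4>;

Mat matmul_colmajor(const Mat &A, const Mat &B) {  // C = A * B
  Mat C{};
  for (int j = 0; j < 2; ++j)
    for (int i = 0; i < 2; ++i)
      for (int k = 0; k < 2; ++k)
        C[i + 2*j] += A[i + 2*k] * B[k + 2*j];
  return C;
}

int main() {
  // Row-major inputs: A = [[1,2],[3,4]], B = [[5,6],[7,8]].
  // A column-major routine reading these buffers sees A^T and B^T.
  Mat a = {1, 2, 3, 4};
  Mat b = {5, 6, 7, 8};

  // Column-major B^T * A^T = (A*B)^T, whose memory layout is row-major A*B.
  Mat c = matmul_colmajor(b, a);

  // Row-major A*B = [[19,22],[43,50]]
  assert(c[0] == 19 && c[1] == 22 && c[2] == 43 && c[3] == 50);
  return 0;
}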
+ for(int i=0; id_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short + dsaXmemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short + dsaXmemset(d->d_output, 0, NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float)); + + // copy to device + dsaXmemcpy(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2, dsaXMemcpyHostToDevice); + + // reorder input into real and imaginary arrays of 2 byte data + reorderCorrInput(d, 0); + + dsaXBLASParam blas_param; + blas_param.struct_size = sizeof(blas_param); + blas_param.blas_type = DSA_BLAS_GEMM; + + // gemm settings + // input: [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS] + // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS] + +#if defined OLD_BLAS + //cout << "Old params" << endl; + + blas_param.data_order = DSA_BLAS_DATAORDER_COL; + blas_param.trans_a = DSA_BLAS_OP_A; + blas_param.trans_b = DSA_BLAS_OP_T; + blas_param.m = NANTS; + blas_param.n = NANTS; + blas_param.k = NPACKETS_PER_BLOCK/halfFac; + blas_param.alpha = 1.0; + blas_param.lda = blas_param.m; + blas_param.ldb = blas_param.n; + blas_param.beta = 0.; + blas_param.ldc = blas_param.m; + blas_param.a_stride = NPACKETS_PER_BLOCK*NANTS/halfFac; + blas_param.b_stride = NPACKETS_PER_BLOCK*NANTS/halfFac; + blas_param.c_stride = NANTS*NANTS; + blas_param.batch_count = NCHAN_PER_PACKET*2*2*halfFac; + blas_param.a_offset = 0; + blas_param.b_offset = 0; + blas_param.c_offset = 0; +#else + //cout << "My params" << endl; + + blas_param.data_order = DSA_BLAS_DATAORDER_ROW; + blas_param.trans_a = DSA_BLAS_OP_C; + blas_param.trans_b = DSA_BLAS_OP_N; + blas_param.m = NANTS; + blas_param.n = NANTS; + blas_param.k = NPACKETS_PER_BLOCK/halfFac; + blas_param.alpha = 1.0; + blas_param.lda = blas_param.m; + blas_param.ldb = blas_param.n; + blas_param.beta = 0.; + blas_param.ldc = blas_param.m; + blas_param.a_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;; + blas_param.b_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;; + blas_param.c_stride = NANTS*NANTS; + blas_param.batch_count = NCHAN_PER_PACKET*2*2*halfFac; + blas_param.a_offset = 0; + blas_param.b_offset = 0; + blas_param.c_offset = 0; +#endif + + // Swap A and B if in row order + if (blas_param.data_order == DSA_BLAS_DATAORDER_ROW) { + swap(blas_param.m, blas_param.n); + swap(blas_param.lda, blas_param.ldb); + swap(blas_param.trans_a, blas_param.trans_b); + swap(blas_param.a_offset, blas_param.b_offset); + swap(blas_param.a_stride, blas_param.b_stride); + //swap(A_data, B_data); + //swap(A_data, B_data); + } + + + //printDsaXBLASParam(blas_param); + + // DMH: fix me + blas_param.blas_lib = DSA_BLAS_LIB_CUBLAS; + + // Perform GEMM accoring to back end configuration + dsaXHgemmStridedBatched(d->d_r, d->d_i, d->d_r, d->d_i, d->d_outr, d->d_outi, blas_param); + + //for(int i=0; i<8; i++) inspectPackedData(d.h_input[i], i); + + // reorder output data + reorderCorrOutput(d); +} +*/ diff --git a/src/cublas_interface.cu b/src/cublas_interface.cu new file mode 100644 index 0000000..234e18a --- /dev/null +++ b/src/cublas_interface.cu @@ -0,0 +1,190 @@ +#include + +#include "dsaX.h" +#include "params.h" +#include "cuda_headers.h" +#include "cuda_handles.h" +//#include "dsaX_cuda_kernels.h" // For debug + +using namespace std; + +__global__ void deviceInspectHalf(half *input, int stage) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + printf("CUBLAS[%d]: device inspect half [%d] = %f\n", stage, x, __half2float(input[x])); +} + +void init_cublas_local() { + if (!cublas_init) { + 
+    //cublasError_t error = cudaStreamCreate(streams);
+    cublasStatus_t error = cublasCreate(&cublasH);
+    //cublasSetStream(handle, stream);
+    //cublasStatus_t error = cublasCreate(&handle);
+    if (error != CUBLAS_STATUS_SUCCESS)
+      cout << "cublasCreate failed with error " << error << endl;
+    else
+      cout << "cuBLAS handle created successfully." << endl;
+    cublas_init = true;
+  }
+}
+
+void destroy_cublas_local() {
+  if (cublas_init)
+    cublasDestroy(cublasH);
+  cublas_init = false;
+}
+
+void initBLASCuda() {
+  init_cublas_local();
+}
+
+void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam blas_param, int stream) {
+#ifdef DSA_XENGINE_TARGET_CUDA
+
+  // not sure if essential
+  //cudaDeviceSynchronize();
+
+  cublasSetStream(cublasH, get_stream(stream));
+
+  bool verbose = false;
+
+  // Set up for gemm
+  //----------------
+  // Transfer params
+  const int m = blas_param.m;
+  const int n = blas_param.n;
+  const int k = blas_param.k;
+  const double alpha = blas_param.alpha.real();
+  const int lda = blas_param.lda;
+  const int ldb = blas_param.ldb;
+  const half beta0 = blas_param.beta.real();
+  const half beta1 = 1.0;
+  const int ldc = blas_param.ldc;
+  const long long int a_offset = blas_param.a_offset;
+  const long long int b_offset = blas_param.b_offset;
+  const long long int c_offset = blas_param.c_offset;
+  const long long int strideA = blas_param.a_stride;
+  const long long int strideB = blas_param.b_stride;
+  const long long int strideC = blas_param.c_stride;
+  const int batchCount = blas_param.batch_count;
+
+  // NOTE: cublasHgemm is a real-valued kernel. As a result,
+  // matrix conjugates must be handled by passing negative
+  // alpha values on the appropriate imaginary planar
+  // arrays. We discern these negative values while parsing
+  // the transpose, adjoint and conjugation values.
+  cublasOperation_t transa;
+  cublasOperation_t transb;
+  int A_imag_alpha_sign = 1;
+  switch (blas_param.trans_a) {
+  case DSA_BLAS_OP_N:
+    transa = CUBLAS_OP_N;
+    break;
+  case DSA_BLAS_OP_T:
+    transa = CUBLAS_OP_T;
+    break;
+  case DSA_BLAS_OP_A:
+    transa = CUBLAS_OP_N;
+    // The A array requests the adjoint, hence we
+    // must supply a factor of -1 to alpha
+    // when dealing with the imaginary component
+    // of A.
+    A_imag_alpha_sign *= -1;
+    break;
+  case DSA_BLAS_OP_C:
+    transa = CUBLAS_OP_T;
+    // The A array requests conjugation, hence we
+    // must supply a factor of -1 to alpha
+    // when dealing with the imaginary component
+    // of A.
+    A_imag_alpha_sign *= -1;
+    break;
+  default:
+    std::cout << "Unknown cublas transpose" << std::endl;
+  }
+
+  int B_imag_alpha_sign = 1;
+  switch (blas_param.trans_b) {
+  case DSA_BLAS_OP_N:
+    transb = CUBLAS_OP_N;
+    break;
+  case DSA_BLAS_OP_T:
+    transb = CUBLAS_OP_T;
+    break;
+  case DSA_BLAS_OP_A:
+    transb = CUBLAS_OP_N;
+    // The B array requests the adjoint, hence we
+    // must supply a factor of -1 to alpha
+    // when dealing with the imaginary component
+    // of B.
+    B_imag_alpha_sign *= -1;
+    break;
+  case DSA_BLAS_OP_C:
+    transb = CUBLAS_OP_T;
+    // The B array requests conjugation, hence we
+    // must supply a factor of -1 to alpha
+    // when dealing with the imaginary component
+    // of B.
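+    // (Illustration: conjugation negates the imaginary planar array,
+    //  conj(B) = Re(B) - i Im(B), so each Hgemm term below that reads
+    //  imag_b is scaled by this extra -1 via B_imag_alpha_sign.)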
+ B_imag_alpha_sign *= -1; + break; + default: + std::cout << "Unknown dsaBLAS transpose" << std::endl; + } + + // Run strided batched gemm for datatype + // (a + ib)(c + id) = (ac - bd) + i(bc + ad) + // on matrices C = alpha * op(A) * op(B) + beta * C + // where op(M) is defined by the transposition variable + // cublasOperation_t transM + + //deviceInspectHalf<<<1, 8>>>((half *)real_a); + + // Accumulate results into C matrix + // ac + half alpha_ac = alpha; + cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &(alpha_ac), + (half *)real_a + a_offset, lda, strideA, + (half *)real_b + b_offset, ldb, strideB, &beta0, + (half *)real_c + c_offset, ldc, strideC, + batchCount); + + if(verbose) deviceInspectHalf<<<1, 8>>>((half *)real_c, 0); + + // -bd (minus sign from i*i) + half alpha_bd = alpha * (-1.0 * A_imag_alpha_sign * B_imag_alpha_sign); + cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &(alpha_bd), + (half*)imag_a + a_offset, lda, strideA, + (half*)imag_b + b_offset, ldb, strideB, &beta1, + (half*)real_c + c_offset, ldc, strideC, + batchCount); + + if(verbose) deviceInspectHalf<<<1, 8>>>((half *)real_c, 1); + + // bc + half alpha_bc = alpha * A_imag_alpha_sign; + cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &(alpha_bc), + (half*)imag_a + a_offset, lda, strideA, + (half*)real_b + b_offset, ldb, strideB, &beta0, + (half*)imag_c + c_offset, ldc, strideC, + batchCount); + + if(verbose) deviceInspectHalf<<<1, 8>>>((half *)imag_c, 2); + + // ad + half alpha_ad = alpha * B_imag_alpha_sign; + cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &(alpha_ad), + (half*)real_a + a_offset, lda, strideA, + (half*)imag_b + b_offset, ldb, strideB, &beta1, + (half*)imag_c + c_offset, ldc, strideC, + batchCount); + + if(verbose) deviceInspectHalf<<<1, 8>>>((half *)imag_c, 3); + + // shown to be essential (only with streams, fix me) + //cudaDeviceSynchronize(); +#else + std::cout "dsaX not built with CUDA target." << std::endl; + exit(0); +#endif +} diff --git a/src/cuda_correlator b/src/cuda_correlator deleted file mode 100755 index a8b94c7..0000000 Binary files a/src/cuda_correlator and /dev/null differ diff --git a/src/cuda_handles.cu b/src/cuda_handles.cu new file mode 100644 index 0000000..1b756d0 --- /dev/null +++ b/src/cuda_handles.cu @@ -0,0 +1,64 @@ +#include +#include +#include + +using namespace std; + +#ifdef DSA_XENGINE_TARGET_CUDA + +// CUDA stream handler functions +//------------------------- +void init_streams(unsigned int n_streams) { + + //if(n_streams < 2 || n_streams > 9) { + //cout << "dsaX Error: Must have at least 2 and fewer than 9 streams, requested " << n_streams << endl; + //exit(0); + //} + + if(!stream_init) { + streams.reserve(n_streams); + for (auto &s : streams) cudaStreamCreate(&s); + /* + int greatestPriority; + int leastPriority; + + // Query the device to get its built in priority range + // For CUDA, lower numerical values indicate higher priority + cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority); + for (int i=0; i +#include + +#include "cuda_headers.h" +#include "cuda_interface.h" +#include "cuda_kernels.h" +#include "cuda_handles.h" +// DMH: Everything in this file is CUDA aware. 
+ +//#include "dsaX_malloc.h" +#include "dsaX_ptr.h" + +using namespace std; + +__global__ void deviceInspectHalfCI(half *input, int stage) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + printf("CUDA_INTERFACE[%d]: device inspect half [%d] = %f\n", stage, x, __half2float(input[x])); +} + +__global__ void deviceInspectFloatCI(float *input, int stage) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + printf("CUDA_INTERFACE[%d]: device inspect float [%d] = %f\n", stage, x, input[x]); +} + +void dsaXInitCuda(int dev){ + if(dev >= 0) cudaSetDevice(dev); + else { + cout << "dsaX Error: invalid device ordinal " << dev << " passed to dsaX." << endl; + exit(0); + } +} + +void initStreamsCuda(unsigned int n_streams){ + init_streams(n_streams); +} + +void destroyStreamsCuda(){ + destroy_streams(); +} + +void dsaXDestroyCuda(){ + cudaDeviceReset(); +} + +void *dsaXHostRegisterCuda(size_t size) { + + void *ptr = malloc(size); + cudaError_t err = cudaHostRegister(ptr, size, cudaHostRegisterDefault); + if (err != cudaSuccess) { + cout << "dsaX Error: Failed to register pinned memory of size " << size << endl; + exit(0); + } + return ptr; +} + +// allocate device memory +void initializeCorrCudaMemory(corr_handle *d, unsigned int n_streams) { + + // for correlator + + cudaMalloc((void **)(&d->d_input), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams); + //dsaX_ptr ptr = dsaX_ptr(DSA_MEMORY_DEVICE, sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams, true); + + //cout << &ptr << endl; + + //d->d_input = + + cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams); + cudaMalloc((void **)(&d->d_i), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams); + //cudaMalloc((void **)(&d->d_tx), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams); + cudaMalloc((void **)(&d->d_output), sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2*n_streams); + cudaMalloc((void **)(&d->d_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams); + cudaMalloc((void **)(&d->d_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams); + //cudaMalloc((void **)(&d->d_tx_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams); + //cudaMalloc((void **)(&d->d_tx_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams); + + // Total device memeory + uint64_t mem_size = sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams; + mem_size += sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams; + mem_size += sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams; + mem_size += sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2*n_streams; + mem_size += sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams; + mem_size += sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams; + mem_size += sizeof(int)*NBASE; + + cout << "mem_size = " << mem_size/pow(1024,3) << " GB" << endl; + //exit(0); + // DMH: fix me + cudaMalloc((void **)(&d->d_idxs), sizeof(int)*NBASE); +} + +void initializeBFCudaMemory(bf_handle *d, int n_streams) { + + // for beamformer + cudaMalloc((void **)(&d->d_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2*n_streams); + cudaMalloc((void **)(&d->d_big_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS)*NCHAN_PER_PACKET*2*2*n_streams); + cudaMalloc((void **)(&d->d_tx), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2*n_streams); + cudaMalloc((void **)(&d->d_br), 
sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2*n_streams); + cudaMalloc((void **)(&d->d_bi), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2*n_streams); + cudaMalloc((void **)(&d->weights_r), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8)*n_streams); + cudaMalloc((void **)(&d->weights_i), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8)*n_streams); + cudaMalloc((void **)(&d->d_bigbeam_r), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*n_streams); + cudaMalloc((void **)(&d->d_bigbeam_i), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*n_streams); + cudaMalloc((void **)(&d->d_bigpower), sizeof(unsigned char)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS)*n_streams); + cudaMalloc((void **)(&d->d_scf), sizeof(float)*(NBEAMS/2)*n_streams); // beam scale factor + cudaMalloc((void **)(&d->d_chscf), sizeof(float)*(NBEAMS/2)*(NCHAN_PER_PACKET/8)*n_streams); // beam scale factor + + // input weights: first is [NANTS, E/N], then [NANTS, 48, 2pol, R/I] + d->h_winp = (float *)malloc(sizeof(float)*(NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2)); + d->flagants = (int *)malloc(sizeof(int)*NANTS); + d->h_freqs = (float *)malloc(sizeof(float)*(NCHAN_PER_PACKET/8)); + cudaMalloc((void **)(&d->d_freqs), sizeof(float)*(NCHAN_PER_PACKET/8)); + + // timers + d->cp = 0.; + d->prep = 0.; + d->outp = 0.; + d->cubl = 0.; +} + +// deallocate device memory +void deallocateCorrCudaMemory(corr_handle *d) { + + cudaFree(d->d_input); + cudaFree(d->d_r); + cudaFree(d->d_i); + cudaFree(d->d_tx); + cudaFree(d->d_output); + cudaFree(d->d_outr); + cudaFree(d->d_outi); + cudaFree(d->d_tx_outr); + cudaFree(d->d_tx_outi); + cudaFree(d->d_idxs); +} + +// deallocate device memory +void deallocateBFCudaMemory(bf_handle *d) { + + cudaFree(d->d_input); + cudaFree(d->d_tx); + cudaFree(d->d_br); + cudaFree(d->d_bi); + cudaFree(d->weights_r); + cudaFree(d->weights_i); + cudaFree(d->d_bigbeam_r); + cudaFree(d->d_bigbeam_i); + cudaFree(d->d_bigpower); + cudaFree(d->d_scf); + cudaFree(d->d_chscf); + free(d->h_winp); + free(d->flagants); + cudaFree(d->d_freqs); + free(d->h_freqs); +} + +void computeIndicesCuda(corr_handle *d) { + + // now run kernel to sum into output + int *h_idxs = (int *)malloc(sizeof(int)*NBASE); + int ii = 0; + // upper triangular order (column major) to match xGPU (not the same as CASA!) 
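+  // Worked example of the ordering (assuming NANTS = 3, so NBASE = 6):
+  // the baselines are visited column by column through the upper triangle,
+  // i.e. antenna pairs (0,0), (0,1), (1,1), (0,2), (1,2), (2,2).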
+ for (int i=0; id_idxs, h_idxs, sizeof(int)*NBASE, cudaMemcpyHostToDevice); + free(h_idxs); +} + + +// function to copy d_outr and d_outi to d_output +// inputs are [NCHAN_PER_PACKET, 2 time, 2 pol, NANTS, NANTS] +// the corr matrices are column major order +// output needs to be [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex] +// start with transpose to get [NANTS*NANTS, NCHAN_PER_PACKET*2*2], then sum into output using kernel +void reorderCorrOutputCuda(corr_handle *d, int stream) { + + cudaStream_t str = get_stream(stream); + + uint64_t input_offset = sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2 * stream; + uint64_t output_offset = sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2 * stream; + + // transpose input data +#if defined (OLD_BLAS) + dim3 dimBlock(32, 8), dimGrid((NANTS*NANTS)/32, (NCHAN_PER_PACKET*2*2*halfFac)/32); + transpose_matrix_float<<>>((half*)d->d_outr, (half*)d->d_tx_outr); + transpose_matrix_float<<>>((half*)d->d_outi, (half*)d->d_tx_outi); +#endif + + // run kernel to finish things + // TUNABLE + int blockDim = 128; + int blocks = NCHAN_PER_PACKET*2*NBASE/blockDim; +#if defined (OLD_BLAS) + corr_output_copy<<>>((half*)d->d_tx_outr, (half*)d->d_tx_outi, d->d_output, (int*)d->d_idxs); +#else + corr_output_copy<<>>((half*)d->d_outr + input_offset, (half*)d->d_outi + input_offset, d->d_output + output_offset, (int*)d->d_idxs); +#endif + //deviceInspectHalfCI<<<1,8>>>((half*)d->d_outi, 0); +} + + + + +// function to copy and reorder d_input to d_r and d_i +// input is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] +// output is [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS] +// starts by running transpose on [NPACKETS_PER_BLOCK * NANTS, NCHAN_PER_PACKET * 2 * 2] matrix in doubleComplex form. 
+// then fluffs using simple kernel +void reorderCorrInputCuda(corr_handle *d, int stream) { + + // DMH: globalise me + int offset = sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2 * stream; + + cudaStream_t str = get_stream(stream); + + // TUNABLE + int blockDim = 128; + int blocks = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/blockDim; + + // transpose input data +#if defined (OLD_BLAS) + dim3 dimBlock(32, 32), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32); + + transpose_matrix_char<<>>((char*)d->d_input + offset, (char*)d->d_tx + offset); + + // DMH: These two can run concurrently + promoteComplexCharToPlanarHalf<<>>((char*)d->d_tx + offset, (half*)d->d_r + offset, (half*)d->d_i + offset); +#else + promoteComplexCharToPlanarHalf<<>>((char*)d->d_input + offset, (half*)d->d_r + offset, (half*)d->d_i + offset); +#endif +} + +void promoteComplexCharToPlanarHalfCuda(corr_handle *d, unsigned int stream) { + + // DMH: globalise me + int offset = sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2 * stream; + + cudaStream_t str = get_stream(stream); + + // TUNABLE + int blockDim = 128; + int blocks = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/blockDim; + + promoteComplexCharToPlanarHalf<<>>((char*)d->d_input + offset, (half*)d->d_r + offset, (half*)d->d_i + offset); +} + +// kernels to reorder and fluff input data for beamformer +// initial data is [NPACKETS_PER_BLOCK, (NANTS/2), NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] +// want [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, (NANTS/2), 8chan, 2 times, 2 pol, 4-bit complex] +// run as 16x16 tiled transpose with 32-byte words +// launch with dim3 dimBlock(16, 8) and dim3 dimGrid(Width/16, Height/16) +// here, width=NCHAN_PER_PACKET/8 is the dimension of the fastest input index +// dim3 dimBlock1(16, 8), dimGrid1(NCHAN_PER_PACKET/8/16, (NPACKETS_PER_BLOCK)*(NANTS/2)/16); +void transposeInputBeamformerCuda(double *idata, double *odata, std::vector &dim_block_in, + std::vector &dim_grid_in) { + + // Create CUDA objects for launch + dim3 dim_block(dim_block_in[0], dim_block_in[1]); + dim3 dim_grid(dim_grid_in[0], dim_grid_in[1]); + + // Launch kernel + transpose_input_beamformer<<>>(idata, odata); +} + + +// GPU-powered function to populate weights matrix for beamformer +// file format: +// sequential pairs of eastings and northings +// then [NANTS, 48, R/I] calibs + +void calcWeightsCuda(bf_handle *d) { + + // allocate + float *antpos_e = (float *)malloc(sizeof(float)*NANTS); + float *antpos_n = (float *)malloc(sizeof(float)*NANTS); + float *calibs = (float *)malloc(sizeof(float)*NANTS*(NCHAN_PER_PACKET/8)*2*2); + float *d_antpos_e, *d_antpos_n, *d_calibs; + float wnorm; + cudaMalloc((void **)(&d_antpos_e), sizeof(float)*NANTS); + cudaMalloc((void **)(&d_antpos_n), sizeof(float)*NANTS); + cudaMalloc((void **)(&d_calibs), sizeof(float)*NANTS*(NCHAN_PER_PACKET/8)*2*2); + + // deal with antpos and calibs + //int iant; + //int found; + for (int i=0;ih_winp[2*i]; + antpos_n[i] = d->h_winp[2*i+1]; + } + for (int i=0;inflags;j++) + //if (d->flagants[j]==iant) found = 1; + + calibs[2*i] = d->h_winp[2*NANTS+2*i]; + calibs[2*i+1] = d->h_winp[2*NANTS+2*i+1]; + + wnorm = sqrt(calibs[2*i]*calibs[2*i] + calibs[2*i+1]*calibs[2*i+1]); + if (wnorm!=0.0) { + calibs[2*i] /= wnorm; + calibs[2*i+1] /= wnorm; + } + + //if (found==1) { + //calibs[2*i] = 0.; + //calibs[2*i+1] = 0.; + //} + } + + //for (int i=0;i>>(d_antpos_e, d_antpos_n, d_calibs, (half*)d->weights_r, (half*)d->weights_i, d->d_freqs); + + // free stuff + 
+  cudaFree(d_antpos_e);
+  cudaFree(d_antpos_n);
+  cudaFree(d_calibs);
+  free(antpos_e);
+  free(antpos_n);
+  free(calibs);
+}
+
+// kernel to fluff input bf data
+// run with NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 blocks of 128 threads
+void fluffInputBeamformerCuda(char *input, void *b_real, void *b_imag, int blocks, int tpb) {
+
+  // Launch kernel
+  fluff_input_beamformer<<<blocks, tpb>>>(input, (half*)b_real, (half*)b_imag);
+}
+
+// transpose, add and scale kernel for bf
+// assume breakdown into tiles of 16x16, and run with 16x8 threads per block
+// launch with dim3 dimBlock(16, 8) and dim3 dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16)
+// scf is a per-beam scale factor to enable recasting as unsigned char
+void transposeScaleBeamformerCuda(void *ir, void *ii, unsigned char *odata, std::vector &dim_block_in,
+                                  std::vector &dim_grid_in) {
+
+  // Create CUDA objects for launch
+  dim3 dim_block(dim_block_in[0], dim_block_in[1]);
+  dim3 dim_grid(dim_grid_in[0], dim_grid_in[1]);
+
+  // Launch kernel
+  transpose_scale_beamformer<<<dim_grid, dim_block>>>((half*)ir, (half*)ii, odata);
+}
+
+// sum over all times in output beam array
+// run with (NCHAN_PER_PACKET/8)*(NBEAMS/2) blocks of (NPACKETS_PER_BLOCK/4) threads
+void sumBeamCuda(unsigned char *input, float *output, int blocks, int tpb) {
+
+  // Launch kernel
+  sum_beam<<<blocks, tpb>>>(input, output);
+}
+
+// CUDA API wrappers
+// DMH: Wrap all these calls around a CHECK_ERROR to save on
+// lines of code
+void dsaXDeviceSynchronizeCuda() {
+
+  cudaError error = cudaDeviceSynchronize();
+  if (error != cudaSuccess) {
+    cudaGetLastError();
+    exit(0);
+  }
+}
+
+void dsaXmemsetCuda(void *array, int ch, size_t n) {
+
+  cudaError error = cudaSuccess;
+  error = cudaMemset(array, ch, n);
+  if (error != cudaSuccess) {
+    cudaGetLastError();
+    exit(0);
+  }
+}
+
+void dsaXmallocCuda(void *array, size_t array_length) {
+
+  // DMH: not yet implemented; device buffers are currently allocated
+  // directly in initializeCorrCudaMemory()/initializeBFCudaMemory().
+  // for correlator
+  //cudaMalloc((void **)(&d->d_input), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams);
+  //cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams);
+  //cudaMalloc((void **)(&d->d_i), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams);
+}
+
+void dsaXmemcpyCuda(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind, int stream) {
+
+  cudaError error = cudaSuccess;
+  cudaStream_t str = get_stream(stream);
+
+  //cout << "kind = " << kind << endl;
+
+  switch (kind) {
+  case dsaXMemcpyHostToHost:
+    error = cudaMemcpy(array_out, array_in, n, cudaMemcpyHostToHost);
+    break;
+  case dsaXMemcpyHostToDevice:
+    error = cudaMemcpy(array_out, array_in, n, cudaMemcpyHostToDevice);
+    break;
+  case dsaXMemcpyDeviceToHost:
+    error = cudaMemcpy(array_out, array_in, n, cudaMemcpyDeviceToHost);
+    break;
+  case dsaXMemcpyDeviceToDevice:
+    error = cudaMemcpy(array_out, array_in, n, cudaMemcpyDeviceToDevice);
+    break;
+  case dsaXMemcpyHostToHostAsync:
+    error = cudaMemcpyAsync(array_out, array_in, n, cudaMemcpyHostToHost, str);
+    break;
+  case dsaXMemcpyHostToDeviceAsync:
+    error = cudaMemcpyAsync(array_out, array_in, n, cudaMemcpyHostToDevice, str);
+    break;
+  case dsaXMemcpyDeviceToHostAsync:
+    error = cudaMemcpyAsync(array_out, array_in, n, cudaMemcpyDeviceToHost, str);
+    break;
+  case dsaXMemcpyDeviceToDeviceAsync:
+    error = cudaMemcpyAsync(array_out, array_in, n, cudaMemcpyDeviceToDevice, str);
+    break;
+  default:
+    std::cout << "dsaX error: unknown dsaXMemcpyKind" << std::endl;
+  }
+
+  if (error != cudaSuccess) {
const char *string = cudaGetErrorString(error); + //cudaGetLastError(); + //cudaGetErrorString(&string); + printf("dsaXmemcpyCuda failed with error %s\n", string); + exit(0); + } +} diff --git a/src/cutlass_interface.cu b/src/cutlass_interface.cu new file mode 100644 index 0000000..fc68d55 --- /dev/null +++ b/src/cutlass_interface.cu @@ -0,0 +1,315 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#include "dsaX_cutlass_interface.h" + +DSA_FTD_ComplexGEMM_CUTLASS::DSA_FTD_ComplexGEMM_CUTLASS(Options const &options): + problem_size(options.problem_size), batch_count(options.batch_count) { + + // Allocate device memory for batched planar complex GEMM + tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2); + tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2); + tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + + ptr_A_real.reset(batch_count); + ptr_A_imag.reset(batch_count); + ptr_B_real.reset(batch_count); + ptr_B_imag.reset(batch_count); + ptr_C_real.reset(batch_count); + ptr_C_imag.reset(batch_count); + ptr_D_real.reset(batch_count); + ptr_D_imag.reset(batch_count); +} + +// DMH: Replace this with data from DSA-FTD +void DSA_FTD_ComplexGEMM_CUTLASS::initialize() { + + if(testing) { + uint64_t seed = 1234; + + // Use small integers to simplify correctness checking + int scope_max = 6; + int scope_min = -6; + + BlockFillRandomUniform(tensor_A.get(), tensor_A.size(), seed, Element(scope_max), Element(scope_min), 0); + BlockFillRandomUniform(tensor_B.get(), tensor_B.size(), seed * 2019, Element(scope_max), Element(scope_min), 0); + BlockFillRandomUniform(tensor_C.get(), tensor_C.size(), seed * 2020, Element(scope_max), Element(scope_min), 0); + } else { + // DMH: construct DSA-FTD interface data transfer interface + } + + ptr_A = tensor_A.get(); + ptr_B = tensor_B.get(); + ptr_C = tensor_C.get(); + ptr_D = tensor_D.get(); + + batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2; + batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2; + batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2; + batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2; + + lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0); + ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0); + ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + + imag_stride_A = int64_t(problem_size.m()) * problem_size.k(); + imag_stride_B = int64_t(problem_size.k()) * problem_size.n(); + imag_stride_C = int64_t(problem_size.m()) * problem_size.n(); + imag_stride_D = int64_t(problem_size.m()) * problem_size.n(); + +} + +Result DSA_FTD_ComplexGEMM_CUTLASS::run(Options const &options) { + + Result result; + + initialize(); + + // Configure pointers in global memory + struct { + Element *base; + void **ptr_real; + void **ptr_imag; + int64_t batch_stride; + int64_t imag_stride; + } tensors[] = {{ tensor_A.get(), ptr_A_real.get(), ptr_A_imag.get(), batch_stride_A, imag_stride_A}, + { tensor_B.get(), ptr_B_real.get(), ptr_B_imag.get(), batch_stride_B, imag_stride_B}, + { tensor_C.get(), ptr_C_real.get(), ptr_C_imag.get(), batch_stride_C, imag_stride_C}, + { tensor_D.get(), ptr_D_real.get(), ptr_D_imag.get(), batch_stride_D, imag_stride_D}}; + + for (auto const &tensor : tensors) { + for (int idx = 0; idx < batch_count; ++idx) { + + cudaError_t error; + void *ptr_real = tensor.base + idx * tensor.batch_stride; + void *ptr_imag = tensor.base + idx * tensor.batch_stride + tensor.imag_stride; + + error = 
cudaMemcpy(tensor.ptr_real + idx, &ptr_real, sizeof(void *), cudaMemcpyHostToDevice); + if (error != cudaSuccess) throw std::runtime_error("Failed to copy pointer to device memory"); + + error = cudaMemcpy(tensor.ptr_imag + idx, &ptr_imag, sizeof(void *), cudaMemcpyHostToDevice); + if (error != cudaSuccess) throw std::runtime_error("Failed to copy pointer to device memory"); + + } + } + + + cudaEvent_t events[2]; + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // Record an event at the start of a series of GEMM operations + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Run profiling loop + //------------------- + // Execute the planar complex array GEMM kernel via the CUTLASS Library's + // dispatch routines. + // + // Note, for planar complex array GEMM kernels, all numeric type arguments + // specify the data type of the base real types. These are understood to + // apply to planar complex representations of matrices in memory and to complex + // structures for scalars. + // + // See tools/library/include/cutlass/library/handle.h for more details. + // + for (int iter = 0; iter < options.iterations; ++iter) { + + result.status = handle.gemm_planar_complex_array( + problem_size.m(), // expected GEMM M dimension + problem_size.n(), // expected GEMM N dimension + problem_size.k(), // expected GEMM K dimension + batch_count, // Number of batched elements + + nullptr, + nullptr, + nullptr, + + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued accumulation + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued alpha/beta scalars + + &options.alpha, // Pointer to alpha scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued A matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of A matrix + cutlass::library::ComplexTransform::kConjugate, // Complex transformation on A matrix operand + + ptr_A_real.get(), // Pointer to array of pointers to real part of A matrix + ptr_A_imag.get(), // Pointer to array of pointers to imaginary part of A matrix + + lda, // Leading dimension of real part of A matrix + lda, // Leading dimension of imaginary part of A matrix + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued B matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of B matrix + cutlass::library::ComplexTransform::kNone, // Complex transformation on B matrix operand + + ptr_B_real.get(), // Pointer to array of pointers to real part of B matrix + ptr_B_imag.get(), // Pointer to array of pointers to imaginary part of B matrix + + ldb, // Leading dimension of real part of B matrix + ldb, // Leading dimension of imaginary part of B matrix + + &options.beta, // Pointer to beta scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex valued C and D matrices + + ptr_C_real.get(), // Pointer to array of pointers to real part of C matrix + ptr_C_imag.get(), // Pointer to array of pointers to imaginary part of C matrix + + ldc, // Leading dimension of real part of C matrix + ldc, // Leading dimension of imaginary part of C matrix + + ptr_D_real.get(), // Pointer to array of pointers to real part of D 
matrix + ptr_D_imag.get(), // Pointer to array of pointers to imaginary part of D matrix + + ldd, // Leading dimension of real part of D matrix + ldd // Leading dimension of imaginary part of D matrix + ); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "CUTLASS internal error - configuration not supported" << std::endl; + return result; + } + } + + // Record an event when the GEMM operations have been launched. + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. + result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + if (handle.get_last_operation()) { + std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl; + } + + // Compute reference in device code + if (options.reference_check) { + + result.passed = true; + + for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) { + // Define the GEMM through templates + GemmPlanarComplex + (problem_size, options.alpha, + {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A}, + cutlass::ComplexTransform::kConjugate, + {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B}, + cutlass::ComplexTransform::kNone, + options.beta, + {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C}, + {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D} + ); + + Element epsilon = 0.1_hf; + Element nonzero_floor = 0.1_hf; + + result.passed = BlockCompareRelativelyEqual + ( + tensor_D.get() + idx * batch_stride_D, + tensor_D_ref.get() + idx * batch_stride_D, + batch_stride_D, + epsilon, + nonzero_floor + ); + } + + if (result.passed) std::cout << "Reference check passed." << std::endl; + else std::cerr << "Error - reference check failed." << std::endl; + } + + std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " GFLOPs: " << result.gflops << std::endl; + + return result; +} + + int main(int argc, char const **args) { + cudaDeviceProp props; + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + Options options; + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // Compute GEMM + DSA_FTD_ComplexGEMM_CUTLASS gemm(options); + gemm.testing = true; + Result result = gemm.run(options); + + return result.passed ? 
0 : -1; +} + diff --git a/src/dsaX_api.cu b/src/dsaX_api.cu new file mode 100644 index 0000000..8f26a49 --- /dev/null +++ b/src/dsaX_api.cu @@ -0,0 +1,43 @@ + + + +void qudaMemcpy_(void *dst, const void *src, size_t count, qudaMemcpyKind kind, const char *func, const char *file, + const char *line) + { + if (count == 0) return; + QudaMem copy(dst, src, count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func, file, line); + } + + +void dsaMemcpyAsync_(void *dst, const void *src, size_t count, dsaMemcpyKind kind, const qudaStream_t &stream, + const char *func, const char *file, const char *line) + { + if (count == 0) return; + + if (kind == qudaMemcpyDeviceToDevice) { + QudaMem copy(dst, src, count, qudaMemcpyKindToAPI(kind), stream, true, func, file, line); + } else { +#ifdef USE_DRIVER_API + switch (kind) { + case qudaMemcpyDeviceToHost: + PROFILE(cuMemcpyDtoHAsync(dst, (CUdeviceptr)src, count, get_stream(stream)), QUDA_PROFILE_MEMCPY_D2H_ASYNC); + break; + case qudaMemcpyHostToDevice: + PROFILE(cuMemcpyHtoDAsync((CUdeviceptr)dst, src, count, get_stream(stream)), QUDA_PROFILE_MEMCPY_H2D_ASYNC); + break; + case qudaMemcpyDeviceToDevice: + PROFILE(cuMemcpyDtoDAsync((CUdeviceptr)dst, (CUdeviceptr)src, count, get_stream(stream)), + QUDA_PROFILE_MEMCPY_D2D_ASYNC); + break; + case qudaMemcpyDefault: + PROFILE(cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, count, get_stream(stream)), + QUDA_PROFILE_MEMCPY_DEFAULT_ASYNC); + break; + default: errorQuda("Unsupported cuMemcpyTypeAsync %d", kind); + } +#else + PROFILE(cudaMemcpyAsync(dst, src, count, qudaMemcpyKindToAPI(kind), get_stream(stream)), + kind == qudaMemcpyDeviceToHost ? QUDA_PROFILE_MEMCPY_D2H_ASYNC : QUDA_PROFILE_MEMCPY_H2D_ASYNC); +#endif + } + } diff --git a/src/dsaX_base.cpp b/src/dsaX_base.cpp new file mode 100644 index 0000000..80a947a --- /dev/null +++ b/src/dsaX_base.cpp @@ -0,0 +1,9 @@ +#include "fast_time_domain.h" + +dsaXBase::dsaXBase() { + +} + +dsaXBase::~dsaXBase() { + +} diff --git a/src/dsaX_beamformer_passon b/src/dsaX_beamformer_passon deleted file mode 100755 index b08ed99..0000000 Binary files a/src/dsaX_beamformer_passon and /dev/null differ diff --git a/src/dsaX_ptr.cpp b/src/dsaX_ptr.cpp new file mode 100644 index 0000000..702654d --- /dev/null +++ b/src/dsaX_ptr.cpp @@ -0,0 +1,155 @@ +#include +#include "dsaX_ptr.h" + +dsaX_ptr::dsaX_ptr(dsaXMemoryType type, size_t size, bool pool) : type(type), size(size), pool(pool) { + if (pool && (type != DSA_MEMORY_DEVICE && type != DSA_MEMORY_HOST_PINNED && type != DSA_MEMORY_HOST)) { + printf("dsaX ERROR: Memory pool not available for memory type %d", type); + exit(0); + } + + if (size > 0) { + switch (type) { + case DSA_MEMORY_DEVICE: device = pool ? pool_device_malloc(size) : device_malloc(size); break; + case DSA_MEMORY_DEVICE_PINNED: device = device_pinned_malloc(size); break; + case DSA_MEMORY_HOST: host = safe_malloc(size); break; + case DSA_MEMORY_HOST_PINNED: host = pool ? 
pool_pinned_malloc(size) : pinned_malloc(size); break; + case DSA_MEMORY_MAPPED: + host = mapped_malloc(size); + device = get_mapped_device_pointer(host); + break; + case DSA_MEMORY_MANAGED: + host = managed_malloc(size); + device = host; + break; + default: + printf("dsaX ERROR: Unknown memory type %d", type); + exit(0); + } + } +} + +dsaX_ptr::dsaX_ptr(void *ptr, dsaXMemoryType type) : type(type), reference(true) { + switch (type) { + case DSA_MEMORY_DEVICE: + case DSA_MEMORY_DEVICE_PINNED: + device = ptr; + host = nullptr; + break; + case DSA_MEMORY_HOST: + case DSA_MEMORY_HOST_PINNED: + device = nullptr; + host = ptr; + break; + case DSA_MEMORY_MANAGED: + device = ptr; + host = ptr; + break; + default: + printf("dsaX ERROR: Unsupported memory type %d", type); + exit(0); + } +} + +dsaX_ptr &dsaX_ptr::operator=(dsaX_ptr &&other) { + if (&other != this) { + if (size > 0) { + printf("dsaX ERROR: Cannot move to already initialized dsaX_ptr"); + } + type = std::exchange(other.type, DSA_MEMORY_INVALID); + size = std::exchange(other.size, 0); + pool = std::exchange(other.pool, false); + device = std::exchange(other.device, nullptr); + host = std::exchange(other.host, nullptr); + } + return *this; +} + +void dsaX_ptr::destroy() { + if (size > 0) { + switch (type) { + case DSA_MEMORY_DEVICE: pool ? pool_device_free(device) : device_free(device); break; + case DSA_MEMORY_DEVICE_PINNED: device_pinned_free(device); break; + case DSA_MEMORY_HOST: host_free(host); break; + case DSA_MEMORY_HOST_PINNED: pool ? pool_pinned_free(host) : host_free(host); break; + case DSA_MEMORY_MAPPED: host_free(host); break; + default: + printf("Unknown memory type %d", type); + exit(0); + } + } + + size = 0; + device = nullptr; + host = nullptr; +} + +dsaX_ptr::~dsaX_ptr() { + destroy(); +} + +void dsaX_ptr::exchange(dsaX_ptr &obj, dsaX_ptr &&new_value) { + destroy(); + *this = std::move(obj); + obj = std::move(new_value); +} + +bool dsaX_ptr::is_device() const { + switch (type) { + case DSA_MEMORY_DEVICE: + case DSA_MEMORY_DEVICE_PINNED: + case DSA_MEMORY_MAPPED: + case DSA_MEMORY_MANAGED: return true; + default: return false; + } +} + +bool dsaX_ptr::is_host() const { + switch (type) { + case DSA_MEMORY_HOST: + case DSA_MEMORY_HOST_PINNED: + case DSA_MEMORY_MANAGED: return true; + default: return false; + } +} + +void *dsaX_ptr::data() const { + void *ptr = nullptr; + + switch (type) { + case DSA_MEMORY_DEVICE: + case DSA_MEMORY_DEVICE_PINNED: + case DSA_MEMORY_MAPPED: + case DSA_MEMORY_MANAGED: ptr = device; break; + case DSA_MEMORY_HOST: + case DSA_MEMORY_HOST_PINNED: ptr = host; break; + default: + printf("Unknown memory type %d", type); + exit(0); + } + + return ptr; +} + +void *dsaX_ptr::data_device() const { + if (!device) { + printf("dsaX ERROR: Device view not defined"); + exit(0); + } + return device; +} + +void *dsaX_ptr::data_host() const { + if (!host) { + printf("dsaX ERROR: Host view not defined"); + exit(0); + } + return host; +} + +bool dsaX_ptr::is_reference() const { return reference; } + +std::ostream &operator<<(std::ostream &output, const dsaX_ptr &ptr) { + output << "{type = " << ptr.type << ", size = " << ptr.size << ", pool = " << ptr.pool + << ", device = " << ptr.device << ", host = " << ptr.host << ", reference = " << ptr.reference << "}"; + return output; +} diff --git a/src/dsaX_wrangle b/src/dsaX_wrangle deleted file mode 100755 index f839b14..0000000 Binary files a/src/dsaX_wrangle and /dev/null differ diff --git a/src/interface.cpp b/src/interface.cpp new file mode 100644 index 
0000000..41e7caf --- /dev/null +++ b/src/interface.cpp @@ -0,0 +1,158 @@ +#include +#include +#include +#include + +#include "params.h" +#include "cuda_interface.h" +#include "utils.h" +#include "fast_time_domain.h" + +using namespace std; + +using ms = std::chrono::microseconds; +using hrc = std::chrono::high_resolution_clock; + +timer::Timer app_timer; +timer::Timer init_timer; + +void dsaXInit(int dev){ + app_timer.start(); +#if DSA_XENGINE_TARGET_CUDA + init_timer.start(); + dsaXInitCuda(dev); + initBLAS(); + init_timer.stop(); +#endif + cout << " --- Starting dsaX with configuration (defined in dsaX_def.h) --- " << endl; + cout << "NPACKETS_PER_BLOCK = " << NPACKETS_PER_BLOCK << endl; + cout << "NCHAN = " << NCHAN << endl; + cout << "NCHAN_PER_PACKET = " << NCHAN_PER_PACKET << endl; + cout << "NPOL = " << NPOL << endl; + cout << "NARM = " << 2 << endl; +#if DSA_XENGINE_TARGET_CUDA + cout << "CUDA is ENABLED " << endl; +#else + cout << "CUDA is DISABLED " << endl; +#endif + cout << " --- End dsaX configuration --- " << endl; + //DMH: Add more (ask Vikram) +} + +void dsaXEnd() { + app_timer.stop(); + // output metrics + cout << "dsaX lifetime = " << (1.0*app_timer.elapsed().count())/(1e6) << endl; + cout << "dsaX init = " << (1.0*init_timer.elapsed().count())/(1e6) << endl; +#if DSA_XENGINE_TARGET_CUDA + dsaXDestroyCuda(); +#endif +} + +void *dsaXHostRegister(size_t size) { +#if DSA_XENGINE_TARGET_CUDA + return dsaXHostRegisterCuda(size); +#endif +} + +void inspectPackedData(char input, int i, bool non_zeros) { + float re = (float)((char)(( (unsigned char)(input) & (unsigned char)(15) ) << 4) >> 4); + float im = (float)((char)(( (unsigned char)(input) & (unsigned char)(240))) >> 4); + + if(non_zeros) { + if(re != 0 || im != 0) + cout << "val["< &dimBlock, vector &dimGrid) { +#if DSA_XENGINE_TARGET_CUDA + transposeInputBeamformerCuda(input, output, dimBlock, dimGrid); +#else + cout << "dsaX error: not implemented" << endl; +#endif +} + +void transposeScaleBeamformer(void *real, void *imag, unsigned char *output, vector &dimBlock, vector &dimGrid) { +#if DSA_XENGINE_TARGET_CUDA + transposeScaleBeamformerCuda(real, imag, output, dimBlock, dimGrid); +#else + cout << "dsaX error: not implemented" << endl; +#endif +} + +void fluffInputBeamformer(char *input, void *array_real, void *array_imag, int blocks, int tpb) { +#if DSA_XENGINE_TARGET_CUDA + fluffInputBeamformerCuda(input, array_real, array_imag, blocks, tpb); +#else + cout << "dsaX error: not implemented" << endl; +#endif +} + +void sumBeam(unsigned char *input, float *output, int blocks, int tpb) { +#if DSA_XENGINE_TARGET_CUDA + sumBeamCuda(input, output, blocks, tpb); +#else + cout << "dsaX error: not implemented" << endl; +#endif +} diff --git a/src/magma_interface.cu b/src/magma_interface.cu new file mode 100644 index 0000000..af91a52 --- /dev/null +++ b/src/magma_interface.cu @@ -0,0 +1,24 @@ +#include + +#include "dsaX.h" +#include "params.h" +#include "cuda_headers.h" +#include "magma_headers.h" + +using namespace std; + +void dsaXHgemmStridedBatchedMagma(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam blas_param) { +#if defined (DSA_XENGINE_TARGET_CUDA) +#if defined (DSA_XENGINE_ENABLE_MAGMA) + + // TO DO + +#else + std::cout << "dsaX not built with MAGMA. Rebuild with CMake param DSA_XENGINE_ENABLE_MAGMA=ON" << std::endl; + exit(0); +#endif +#else + std::cout << "dsaX not built with CUDA target. 
Rebuild with CMake param DSA_XENGINE_TARGET_TYPE=CUDA" << std::endl; + exit(0); +#endif +} diff --git a/src/malloc.cu b/src/malloc.cu new file mode 100644 index 0000000..55bad3f --- /dev/null +++ b/src/malloc.cu @@ -0,0 +1,631 @@ +#include "dsaX_malloc.h" + +#include "cuda_headers.h" +//#include "cuda_interface.h" +//#include "cuda_kernels.h" +//#include "cuda_handles.h" +// DMH: Everything in this file is CUDA aware. + +enum AllocType { DEVICE, DEVICE_PINNED, HOST, PINNED, MAPPED, MANAGED, SHMEM, N_ALLOC_TYPE }; + +class MemAlloc +{ + +public: + std::string func; + std::string file; + int line; + size_t size; + size_t base_size; + + MemAlloc() : line(-1), size(0), base_size(0) {} + + MemAlloc(std::string func, std::string file, int line) : func(func), file(file), line(line), size(0), base_size(0) + { + } + + MemAlloc(const MemAlloc &) = default; + MemAlloc(MemAlloc &&) = default; + virtual ~MemAlloc() = default; + MemAlloc &operator=(const MemAlloc &) = default; + MemAlloc &operator=(MemAlloc &&) = default; +}; + +static std::map alloc[N_ALLOC_TYPE]; +static size_t total_bytes[N_ALLOC_TYPE] = {0}; +static size_t max_total_bytes[N_ALLOC_TYPE] = {0}; +static size_t total_host_bytes, max_total_host_bytes; +static size_t total_pinned_bytes, max_total_pinned_bytes; + +size_t device_allocated() { return total_bytes[DEVICE]; } + +size_t pinned_allocated() { return total_bytes[PINNED]; } + +size_t mapped_allocated() { return total_bytes[MAPPED]; } + +size_t managed_allocated() { return total_bytes[MANAGED]; } + +size_t host_allocated() { return total_bytes[HOST]; } + +size_t device_allocated_peak() { return max_total_bytes[DEVICE]; } + +size_t pinned_allocated_peak() { return max_total_bytes[PINNED]; } + +size_t mapped_allocated_peak() { return max_total_bytes[MAPPED]; } + +size_t managed_allocated_peak() { return max_total_bytes[MANAGED]; } + +size_t host_allocated_peak() { return max_total_bytes[HOST]; } + +static void print_trace(void) +{ + void *array[10]; + size_t size; + char **strings; + size = backtrace(array, 10); + strings = backtrace_symbols(array, size); + printf("Obtained %zd stack frames.\n", size); + for (size_t i = 0; i < size; i++) printf("%s\n", strings[i]); + free(strings); +} + +static void print_alloc_header() +{ + printf("Type Pointer Size Location\n"); + printf("----------------------------------------------------------\n"); +} + +static void print_alloc(AllocType type) +{ + const char *type_str[] = {"Device", "Device Pinned", "Host ", "Pinned", "Mapped", "Managed", "Shmem "}; + + for (auto entry : alloc[type]) { + void *ptr = entry.first; + MemAlloc a = entry.second; + printf("%s %15p %15lu %s(), %s:%d\n", type_str[type], ptr, (unsigned long)a.base_size, a.func.c_str(), + a.file.c_str(), a.line); + } +} + +static void track_malloc(const AllocType &type, const MemAlloc &a, void *ptr) +{ + total_bytes[type] += a.base_size; + if (total_bytes[type] > max_total_bytes[type]) { max_total_bytes[type] = total_bytes[type]; } + if (type != DEVICE && type != DEVICE_PINNED && type != SHMEM) { + total_host_bytes += a.base_size; + if (total_host_bytes > max_total_host_bytes) { max_total_host_bytes = total_host_bytes; } + } + if (type == PINNED || type == MAPPED) { + total_pinned_bytes += a.base_size; + if (total_pinned_bytes > max_total_pinned_bytes) { max_total_pinned_bytes = total_pinned_bytes; } + } + alloc[type][ptr] = a; +} + +static void track_free(const AllocType &type, void *ptr) +{ + size_t size = alloc[type][ptr].base_size; + total_bytes[type] -= size; + if (type != DEVICE && 
type != DEVICE_PINNED && type != SHMEM) { total_host_bytes -= size; } + if (type == PINNED || type == MAPPED) { total_pinned_bytes -= size; } + alloc[type].erase(ptr); +} + +void *get_mapped_device_pointer_(const char *func, const char *file, int line, const void *host) +{ + void *device; + auto error = cudaHostGetDevicePointer(&device, const_cast(host), 0); + if (error != cudaSuccess) { + printf("dsaX ERROR: cudaHostGetDevicePointer failed with error %s (%s:%d in %s()", cudaGetErrorString(error), file, line, + func); + } + return device; +} + +bool use_managed_memory() { + static bool managed = false; + static bool init = false; + + if (!init) { + char *enable_managed_memory = getenv("QUDA_ENABLE_MANAGED_MEMORY"); + if (enable_managed_memory && strcmp(enable_managed_memory, "1") == 0) { + printf("dsaX ERROR: Using managed memory for CUDA allocations"); + managed = true; + + //if (!device::managed_memory_supported()) printf("dsaX WARNING: Target device does not report supporting managed memory"); + } + + init = true; + } + + return managed; +} + +/** + * Free device memory allocated with device_malloc(). This function + * should only be called via the device_free() macro, defined in + * malloc_quda.h + */ +void managed_free_(const char *func, const char *file, int line, void *ptr) { + if (!ptr) { + printf("dsaX ERROR: Attempt to free NULL managed pointer (%s:%d in %s())\n", file, line, func); + exit(0); + } + if (!alloc[MANAGED].count(ptr)) { + printf("dsaX ERROR: Attempt to free invalid managed pointer (%s:%d in %s())\n", file, line, func); + exit(0); + } + cudaError_t err = cudaFree(ptr); + if (err != cudaSuccess) { + printf("dsaX ERROR: Failed to free device memory (%s:%d in %s())\n", file, line, func); + exit(0); + } + track_free(MANAGED, ptr); +} + + +/** + * Free host memory allocated with safe_malloc(), pinned_malloc(), + * or mapped_malloc(). This function should only be called via the + * host_free() macro, defined in dsaX_malloc.h + */ +void host_free_(const char *func, const char *file, int line, void *ptr) { + if (!ptr) { + printf("dsaX ERROR: Attempt to free NULL host pointer (%s:%d in %s())\n", file, line, func); + exit(0); + } + if (alloc[HOST].count(ptr)) { + track_free(HOST, ptr); + free(ptr); + } else if (alloc[PINNED].count(ptr)) { + cudaError_t err = cudaHostUnregister(ptr); + if (err != cudaSuccess) { + printf("dsaX ERROR: Failed to unregister pinned memory (%s:%d in %s())\n", file, line, func); + exit(0); + } + track_free(PINNED, ptr); + free(ptr); + } else if (alloc[MAPPED].count(ptr)) { +#ifdef HOST_ALLOC + cudaError_t err = cudaFreeHost(ptr); + if (err != cudaSuccess) { + printf("dsaX ERROR: Failed to free host memory (%s:%d in %s())\n", file, line, func); + exit(0); + } + track_free(MAPPED, ptr); +#else + cudaError_t err = cudaHostUnregister(ptr); + if (err != cudaSuccess) { + printf("dsaX ERROR: Failed to unregister host-mapped memory (%s:%d in %s())\n", file, line, func); + exit(0); + } + track_free(MAPPED, ptr); + free(ptr); +#endif + } else { + printf("dsaX ERROR: Attempt to free invalid host pointer (%s:%d in %s())\n", file, line, func); + print_trace(); + printf("dsaX ERROR: Aborting"); + exit(0); + } +} + + +/** + * Perform a standard cudaMalloc() with error-checking. 
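+ * Example (illustrative sketch; the device_malloc()/device_free() macros in
+ * dsaX_malloc.h are assumed to forward __func__, __FILE__ and __LINE__ here):
+ *
+ *   void *d_buf = device_malloc(n_bytes);
+ *   // ... use d_buf in kernels ...
+ *   device_free(d_buf);
+ *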
This + * function should only be called via the device_malloc() macro, + * defined in dsaX_malloc.h + */ +void *device_malloc_(const char *func, const char *file, int line, size_t size) { + + if (use_managed_memory()) return managed_malloc_(func, file, line, size); + + MemAlloc a(func, file, line); + void *ptr; + + a.size = a.base_size = size; + + cudaError_t err = cudaMalloc(&ptr, size); + if (err != cudaSuccess) { + printf("dsaX ERROR: Failed to allocate device memory of size %zu (%s:%d in %s())\n", size, file, line, func); + exit(0); + } + + // DMH: GET ON THIS! + //if (is_prefetch_enabled()) dsaXMemPrefetchAsync(ptr, size, DSA_CUDA_FIELD_LOCATION, get_default_stream()); + track_malloc(DEVICE, a, ptr); +#ifdef HOST_DEBUG + cudaMemset(ptr, 0xff, size); +#endif + return ptr; +} + +/** + * Free device memory allocated with device_malloc(). This function + * should only be called via the device_free() macro, defined in + * dsaX_malloc.h + */ +void device_free_(const char *func, const char *file, int line, void *ptr) { + + if (use_managed_memory()) { + managed_free_(func, file, line, ptr); + return; + } + + if (!ptr) { + printf("dsaX ERROR: Attempt to free NULL device pointer (%s:%d in %s())\n", file, line, func); + exit(0); + } + if (!alloc[DEVICE].count(ptr)) { + printf("dsaX ERROR: Attempt to free invalid device pointer (%s:%d in %s())\n", file, line, func); + exit(0); + } + + cudaError_t err = cudaFree(ptr); + if (err != cudaSuccess) { + printf("dsaX ERROR: Failed to free device memory (%s:%d in %s())\n", file, line, func); + exit(0); + } + + track_free(DEVICE, ptr); +} + +/** + * Free device memory allocated with device_pinned malloc(). This + * function should only be called via the device_pinned_free() + * macro, defined in dsaX_malloc.h + */ +void device_pinned_free_(const char *func, const char *file, int line, void *ptr) { + + //DMH: I would think that we will always be using hardware with + // compute >= 2.0, but this can be implemeneted later if needed. + //if (!comm_peer2peer_present()) { + //device_free_(func, file, line, ptr); + //return; + //} + + if (!ptr) { + printf("dsaX ERROR: Attempt to free NULL device pointer (%s:%d in %s())\n", file, line, func); + exit(0); + } + if (!alloc[DEVICE_PINNED].count(ptr)) { + printf("dsaX ERROR: Attempt to free invalid device pointer (%s:%d in %s())\n", file, line, func); + exit(0); + } + CUresult err = cuMemFree((CUdeviceptr)ptr); + if (err != CUDA_SUCCESS) { + printf("dsaX ERROR: Failed to free device memory (%s:%d in %s())\n", file, line, func); + exit(0); + } + track_free(DEVICE_PINNED, ptr); +} + + +/** + * Under CUDA 4.0, cudaHostRegister seems to require that both the + * beginning and end of the buffer be aligned on page boundaries. + * This local function takes care of the alignment and gets called + * by pinned_malloc_() and mapped_malloc_() + */ +static void *aligned_malloc(MemAlloc &a, size_t size) { + void *ptr = nullptr; + + a.size = size; + + // we need to manually align to page boundaries to allow us to bind a texture to mapped memory + static int page_size = 2 * getpagesize(); + a.base_size = ((size + page_size - 1) / page_size) * page_size; // round up to the nearest multiple of page_size + int align = posix_memalign(&ptr, page_size, a.base_size); + if (!ptr || align != 0) { + printf("Failed to allocate aligned host memory of size %zu (%s:%d in %s())\n", size, a.file.c_str(), a.line, + a.func.c_str()); + exit(0); + } + return ptr; +} + +/** + * Perform a standard malloc() with error-checking. 
This function + * should only be called via the safe_malloc() macro, defined in + * malloc_quda.h + */ +void *safe_malloc_(const char *func, const char *file, int line, size_t size) { + + MemAlloc a(func, file, line); + a.size = a.base_size = size; + + void *ptr = malloc(size); + if (!ptr) { + printf("dsaX ERROR: Failed to allocate host memory of size %zu (%s:%d in %s())\n", size, file, line, func); + exit(0); + } + track_malloc(HOST, a, ptr); +#ifdef HOST_DEBUG + memset(ptr, 0xff, size); +#endif + return ptr; +} + +/** + * Allocate page-locked ("pinned") host memory, and map it into the + * GPU address space. This function should only be called via the + * mapped_malloc() macro, defined in malloc_quda.h + */ +void *mapped_malloc_(const char *func, const char *file, int line, size_t size) { + + MemAlloc a(func, file, line); + + void *ptr = aligned_malloc(a, size); + cudaError_t err = cudaHostRegister(ptr, a.base_size, cudaHostRegisterMapped | cudaHostRegisterPortable); + if (err != cudaSuccess) { + printf("dsaX ERROR: Failed to register host-mapped memory of size %zu (%s:%d in %s())\n", size, file, line, func); + exit(0); + } + track_malloc(MAPPED, a, ptr); +#ifdef HOST_DEBUG + memset(ptr, 0xff, a.base_size); +#endif + return ptr; +} + +/** + * Perform a standard cudaMallocManaged() with error-checking. This + * function should only be called via the managed_malloc() macro, + * defined in dsaX_malloc.h + */ +void *managed_malloc_(const char *func, const char *file, int line, size_t size) { + + MemAlloc a(func, file, line); + void *ptr; + + a.size = a.base_size = size; + + cudaError_t err = cudaMallocManaged(&ptr, size); + if (err != cudaSuccess) { + printf("dsaX ERROR: Failed to allocate managed memory of size %zu (%s:%d in %s())\n", size, file, line, func); + exit(0); + } + track_malloc(MANAGED, a, ptr); +#ifdef HOST_DEBUG + cudaMemset(ptr, 0xff, size); +#endif + return ptr; +} + + +/** + * Perform a cuMemAlloc with error-checking. This function is to + * guarantee a unique memory allocation on the device. This + * should only be called via the device_pinned_malloc() macro, + * defined in dsaX_malloc.h. + */ +void *device_pinned_malloc_(const char *func, const char *file, int line, size_t size) { + + //DMH: I would think that we will always be using hardware with + // compute >= 2.0, but this can be implemeneted later if needed. + //if (!comm_peer2peer_present()) return device_malloc_(func, file, line, size); + + MemAlloc a(func, file, line); + void *ptr; + + a.size = a.base_size = size; + + CUresult err = cuMemAlloc((CUdeviceptr *)&ptr, size); + if (err != CUDA_SUCCESS) { + printf("Failed to allocate device memory of size %zu (%s:%d in %s())\n", size, file, line, func); + exit(0); + } + track_malloc(DEVICE_PINNED, a, ptr); +#ifdef HOST_DEBUG + cudaMemset(ptr, 0xff, size); +#endif + return ptr; +} + + +/** + * Allocate page-locked ("pinned") host memory. This function + * should only be called via the pinned_malloc() macro, defined in + * dsaX_malloc.h + * + * Note that we do not rely on cudaHostAlloc(), since buffers + * allocated in this way have been observed to cause problems when + * shared with MPI via GPU Direct on some systems. 
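+ * The portable route taken here is therefore posix_memalign() onto page
+ * boundaries (see aligned_malloc() above) followed by cudaHostRegister()
+ * with cudaHostRegisterDefault, undone by cudaHostUnregister() + free()
+ * in host_free_().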
+ */ +void *pinned_malloc_(const char *func, const char *file, int line, size_t size) { + + MemAlloc a(func, file, line); + void *ptr = aligned_malloc(a, size); + + cudaError_t err = cudaHostRegister(ptr, a.base_size, cudaHostRegisterDefault); + if (err != cudaSuccess) { + printf("dsaX ERROR: Failed to register pinned memory of size %zu (%s:%d in %s())\n", size, file, line, func); + exit(0); + } + track_malloc(PINNED, a, ptr); +#ifdef HOST_DEBUG + memset(ptr, 0xff, a.base_size); +#endif + return ptr; +} + +namespace mem_pool { + + /** Cache of inactive pinned-memory allocations. We cache pinned + memory allocations so that fields can reuse these with minimal + overhead. + */ + static std::multimap<size_t, void *> pinnedCache; + + /** Sizes of active pinned-memory allocations. For convenience, + we keep track of the sizes of active allocations (i.e., those not + in the cache). + */ + static std::map<void *, size_t> pinnedSize; + + /** Cache of inactive device-memory allocations. We cache device + memory allocations so that fields can reuse these with minimal + overhead. + */ + static std::multimap<size_t, void *> deviceCache; + + /** Sizes of active device-memory allocations. For convenience, + we keep track of the sizes of active allocations (i.e., those not + in the cache). + */ + static std::map<void *, size_t> deviceSize; + + static bool pool_init = false; + + /** whether to use a memory pool allocator for device memory */ + static bool device_memory_pool = true; + + /** whether to use a memory pool allocator for pinned memory */ + static bool pinned_memory_pool = true; + + void init() { + if (!pool_init) { + // device memory pool + char *enable_device_pool = getenv("DSAX_ENABLE_DEVICE_MEMORY_POOL"); + if (!enable_device_pool || strcmp(enable_device_pool, "0") != 0) { + printf("dsaX Warning: Using device memory pool allocator\n"); + device_memory_pool = true; + } else { + printf("dsaX Warning: Not using device memory pool allocator\n"); + device_memory_pool = false; + } + + // pinned memory pool + char *enable_pinned_pool = getenv("DSAX_ENABLE_PINNED_MEMORY_POOL"); + if (!enable_pinned_pool || strcmp(enable_pinned_pool, "0") != 0) { + printf("dsaX Warning: Using pinned memory pool allocator\n"); + pinned_memory_pool = true; + } else { + printf("dsaX Warning: Not using pinned memory pool allocator\n"); + pinned_memory_pool = false; + } + pool_init = true; + } + } + void *pinned_malloc_(const char *func, const char *file, int line, size_t nbytes) { + void *ptr = nullptr; + if (pinned_memory_pool) { + if (pinnedCache.empty()) { + ptr = ::pinned_malloc_(func, file, line, nbytes); + } else { + auto it = pinnedCache.lower_bound(nbytes); + if (it != pinnedCache.end()) { // sufficiently large allocation found + nbytes = it->first; + ptr = it->second; + pinnedCache.erase(it); + } else { // sacrifice the smallest cached allocation + it = pinnedCache.begin(); + ptr = it->second; + pinnedCache.erase(it); + host_free(ptr); + ptr = ::pinned_malloc_(func, file, line, nbytes); + } + } + pinnedSize[ptr] = nbytes; + } else { + ptr = ::pinned_malloc_(func, file, line, nbytes); + } + return ptr; + } + + void pinned_free_(const char *func, const char *file, int line, void *ptr) { + if (pinned_memory_pool) { + if (!pinnedSize.count(ptr)) { + printf("dsaX Error: Attempt to free invalid pointer\n"); + exit(0); + } + pinnedCache.insert(std::make_pair(pinnedSize[ptr], ptr)); + pinnedSize.erase(ptr); + } else { + host_free_(func, file, line, ptr); + } + } + + void *device_malloc_(const char *func, const char *file, int line, size_t nbytes) { + void *ptr = nullptr; + if (device_memory_pool) { + if (deviceCache.empty()) { + ptr = ::device_malloc_(func, file, line, nbytes); + } else { + auto it = deviceCache.lower_bound(nbytes); + if (it != deviceCache.end()) { // sufficiently large allocation found + nbytes = it->first; + ptr = it->second; + deviceCache.erase(it); + } else { // sacrifice the smallest cached allocation + it = deviceCache.begin(); + ptr = it->second; + deviceCache.erase(it); + ::device_free_(func, file, line, ptr); + ptr = ::device_malloc_(func, file, line, nbytes); + } + } + deviceSize[ptr] = nbytes; + } else { + ptr = ::device_malloc_(func, file, line, nbytes); + } + return ptr; + } + + /** + * Free device memory allocated with device_pinned_malloc(). This + * function should only be called via the device_pinned_free() + * macro, defined in dsaX_malloc.h + */ + void device_pinned_free_(const char *func, const char *file, int line, void *ptr) { + //DMH: I would think that we will always be using hardware with + // compute >= 2.0, but this can be implemented later if needed + //if (!comm_peer2peer_present()) { + //device_free_(func, file, line, ptr); + //return; + //} + + if (!ptr) { + printf("dsaX ERROR: Attempt to free NULL device pointer (%s:%d in %s())\n", file, line, func); + exit(0); + } + if (!alloc[DEVICE_PINNED].count(ptr)) { + printf("dsaX ERROR: Attempt to free invalid device pointer (%s:%d in %s())\n", file, line, func); + exit(0); + } + CUresult err = cuMemFree((CUdeviceptr)ptr); + if (err != CUDA_SUCCESS) { + printf("dsaX ERROR: Failed to free device memory (%s:%d in %s())\n", file, line, func); + exit(0); + } + track_free(DEVICE_PINNED, ptr); + } + + + void device_free_(const char *func, const char *file, int line, void *ptr) { + if (device_memory_pool) { + if (!deviceSize.count(ptr)) { + printf("dsaX Error: Attempt to free invalid pointer\n"); + exit(0); + } + deviceCache.insert(std::make_pair(deviceSize[ptr], ptr)); + deviceSize.erase(ptr); + } else { + ::device_free_(func, file, line, ptr); + } + } + + void flush_pinned() { + if (pinned_memory_pool) { + for (auto it : pinnedCache) { host_free(it.second); } + pinnedCache.clear(); + } + } + + void flush_device() { + if (device_memory_pool) { + for (auto it : deviceCache) { device_free(it.second); } + deviceCache.clear(); + } + } +} // namespace mem_pool diff --git a/src/params.cpp b/src/params.cpp new file mode 100644 index 0000000..723264c --- /dev/null +++ b/src/params.cpp @@ -0,0 +1,104 @@ +#include <iostream> + +#include "params.h" + +using namespace std; + +const char *getBLASLibString(dsaXBLASLib lib) +{ + const char *ret; + + switch (lib) { + case DSA_BLAS_LIB_CUBLAS: ret = "CUBLAS"; break; + case DSA_BLAS_LIB_MAGMA: ret = "MAGMA"; break; + case DSA_BLAS_LIB_CUTLASS: ret = "CUTLASS"; break; + case DSA_BLAS_LIB_OPENBLAS: ret = "OPENBLAS"; break; + case DSA_BLAS_LIB_NATIVE: ret = "NATIVE"; break; + default: ret = "unknown"; break; + } + + return ret; +} + +const char *getBLASDataTypeString(dsaXBLASDataType type) +{ + const char *ret; + + switch (type) { + case DSA_BLAS_DATATYPE_H: ret = "Half"; break; + case DSA_BLAS_DATATYPE_S: ret = "Single"; break; + case DSA_BLAS_DATATYPE_D: ret = "Double"; break; + case DSA_BLAS_DATATYPE_HC: ret = "Complex(half)"; break; + case DSA_BLAS_DATATYPE_C: ret = "Complex(single)"; break; + case DSA_BLAS_DATATYPE_Z: ret = "Complex(double)"; break; + case DSA_BLAS_DATATYPE_4b_REAL: ret = "4b sized real"; break; + case DSA_BLAS_DATATYPE_2b_REAL: ret = "2b sized real"; break; + case DSA_BLAS_DATATYPE_4b_COMPLEX: ret = "Char sized complex (4b,4b)"; break; + case DSA_BLAS_DATATYPE_2b_COMPLEX: ret = 
"4b sized (2b,2b)"; break; + default: ret = "unknown"; break; + } + + return ret; +} + +const char *getBLASDataOrderString(dsaXBLASDataOrder order) +{ + const char *ret; + + switch (order) { + case DSA_BLAS_DATAORDER_ROW: ret = "Row order"; break; + case DSA_BLAS_DATAORDER_COL: ret = "Column order"; break; + default: ret = "unknown"; break; + } + + return ret; +} + +void printDsaXCorrParam(const dsaXCorrParam param) { + + cout << " --- dsaXCorrParam begin ---" << endl; + cout << "struct_size = " << param.struct_size << endl; + cout << "blas_lib = " << getBLASLibString(param.blas_lib) << endl; + cout << "data_type = " << getBLASDataTypeString(param.data_type) << endl; + cout << "data_order = " << getBLASDataOrderString(param.data_order) << endl; + cout << "n_streams = " << param.n_streams << endl; + + cout << " --- dsaXCorrParam end ---" << endl; +} + +void printDsaXBLASParam(const dsaXBLASParam param) { + + cout << " --- dsaXBLASParam begin ---" << endl; + cout << "struct_size = " << param.struct_size << endl; + cout << "blas_type = " << param.blas_type << endl; + cout << "blas_lib = " << getBLASLibString(param.blas_lib) << endl; + cout << "data_type = " << getBLASDataTypeString(param.data_type) << endl; + cout << "data_order = " << getBLASDataOrderString(param.data_order) << endl; + cout << "trans_a = " << param.trans_a << endl; + cout << "trans_b = " << param.trans_b << endl; + cout << "m = " << param.m << endl; + cout << "n = " << param.n << endl; + cout << "k = " << param.k << endl; + cout << "lda = " << param.lda << endl; + cout << "ldb = " << param.ldb << endl; + cout << "ldc = " << param.ldc << endl; + cout << "a_offset = " << param.a_offset << endl; + cout << "b_offset = " << param.b_offset << endl; + cout << "c_offset = " << param.c_offset << endl; + cout << "a_stride = " << param.a_stride << endl; + cout << "b_stride = " << param.b_stride << endl; + cout << "c_stride = " << param.c_stride << endl; + cout << "alpha = " << param.alpha << endl; + cout << "beta = " << param.beta << endl; + cout << "batch_count = " << param.batch_count << endl; + cout << " --- dsaXBLASParam end ---" << endl; +} + +dsaXCorrParam newDsaXCorrParam(void) { + dsaXCorrParam new_param; + new_param.struct_size = sizeof(new_param); + new_param.blas_lib = DSA_BLAS_LIB_INVALID; + new_param.data_type = DSA_BLAS_DATATYPE_INVALID; + new_param.data_order = DSA_BLAS_DATAORDER_INVALID; + return new_param; +} diff --git a/src/planar_complex.cu b/src/planar_complex.cu new file mode 100644 index 0000000..3fb8175 --- /dev/null +++ b/src/planar_complex.cu @@ -0,0 +1,87 @@ +/* +#include +#include +#include +#include + +int main() { + + cutlass::half_t x = 2.25_hf; + + std::cout << x << std::endl; + + return 0; +} +*/ + +#include +#include + +#include + +int main() { + + // Define the GEMM operation + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, // ElementA + cutlass::layout::ColumnMajor, // LayoutA + cutlass::half_t, // ElementB + cutlass::layout::ColumnMajor, // LayoutB + cutlass::half_t, // ElementOutput + cutlass::layout::ColumnMajor, // LayoutOutput + float, // ElementAccumulator + cutlass::arch::OpClassTensorOp, // tag indicating Tensor Cores + cutlass::arch::Sm75 // tag indicating target GPU compute architecture + >; + + Gemm gemm_op; + cutlass::Status status; + + // + // Define the problem size + // + int M = 512; + int N = 256; + int K = 128; + + float alpha = 1.25f; + float beta = -1.25f; + + // + // Allocate device memory + // + + cutlass::HostTensor A({M, K}); + cutlass::HostTensor B({K, N}); + 
cutlass::HostTensor C({M, N}); + + cutlass::half_t const *ptrA = A.device_data(); + cutlass::half_t const *ptrB = B.device_data(); + cutlass::half_t const *ptrC = C.device_data(); + cutlass::half_t *ptrD = C.device_data(); + + int lda = A.device_ref().stride(0); + int ldb = B.device_ref().stride(0); + int ldc = C.device_ref().stride(0); + int ldd = C.device_ref().stride(0); + // + // Launch GEMM on the device + // + + status = gemm_op({ + {M, N, K}, + {ptrA, lda}, // TensorRef to A device tensor + {ptrB, ldb}, // TensorRef to B device tensor + {ptrC, ldc}, // TensorRef to C device tensor + {ptrD, ldd}, // TensorRef to D device tensor - may be the same as C + {alpha, beta} // epilogue operation arguments + }); + + if (status != cutlass::Status::kSuccess) { + return -1; + } else { + std::cout << "CUTLASS Success! " << std::endl; + } + + return 0; +} diff --git a/src/psrdada_utils.cpp b/src/psrdada_utils.cpp new file mode 100644 index 0000000..3978ecd --- /dev/null +++ b/src/psrdada_utils.cpp @@ -0,0 +1,11 @@ +#include "psrdada_utils.h" + +void dsaX_dbgpu_cleanup(dada_hdu_t * in, dada_hdu_t * out) +{ + if (dada_hdu_unlock_read (in) < 0) syslog(LOG_ERR, "could not unlock read on hdu_in"); + dada_hdu_destroy (in); + + if (dada_hdu_unlock_write (out) < 0) syslog(LOG_ERR, "could not unlock write on hdu_out"); + dada_hdu_destroy (out); + +} diff --git a/src/splice_offline_beams b/src/splice_offline_beams deleted file mode 100755 index 728af8c..0000000 Binary files a/src/splice_offline_beams and /dev/null differ diff --git a/src/utils.cpp b/src/utils.cpp new file mode 100644 index 0000000..cc4194d --- /dev/null +++ b/src/utils.cpp @@ -0,0 +1,59 @@ +#include + +#include "utils.h" +#include "enums.h" +#include "params.h" +#include "cuda_interface.h" + +using namespace std; + +void dsaXmemset(void *array, int ch, size_t n){ +#ifdef DSA_XENGINE_TARGET_CUDA + dsaXmemsetCuda(array, ch, n); +#else + memset(array, ch, n); +#endif +} + +void dsaXmemcpy(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind, int stream){ + +#ifdef DSA_XENGINE_TARGET_CUDA + // Perform host to device memcopy on data + dsaXmemcpyCuda(array_out, array_in, n, kind, stream); +#else + memcpy(array_out, array_in, n); +#endif +} + +void dsaXDeviceSynchronize() { +#ifdef DSA_XENGINE_TARGET_CUDA + // Synchronise the device + dsaXDeviceSynchronizeCuda(); +#else + // NO OP +#endif +} + +void initDsaXCorrDeviceMemory(corr_handle *d, unsigned int n_streams) { + +#ifdef DSA_XENGINE_TARGET_CUDA + d->dev_malloc_timer.start(); + initializeCorrCudaMemory(d, n_streams); + d->dev_malloc_timer.stop(); +#else + cout << "dsaX Error: Not implemented." << endl; + exit(0); +#endif +} + +void destroyDsaXCorrDeviceMemory(corr_handle *d) { + +#ifdef DSA_XENGINE_TARGET_CUDA + d->dev_malloc_timer.start(); + deallocateCorrCudaMemory(d); + d->dev_malloc_timer.stop(); +#else + cout << "dsaX Error: Not implemented." 
<< endl; + exit(0); +#endif +} diff --git a/src/version.cpp b/src/version.cpp new file mode 100644 index 0000000..1c8114b --- /dev/null +++ b/src/version.cpp @@ -0,0 +1,5 @@ +#ifdef GITVERSION +const char* gitversion = GITVERSION; +#else +const char* gitversion; +#endif diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 0000000..64aa8db --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,8 @@ +#DMH: fix include path +include_directories(${CMAKE_SOURCE_DIR}/include) +include_directories(${CLI11_SOURCE_DIR}/include/CLI) + +add_library(dsaX_tests command_line_params.cpp) + +add_executable(correlator_test correlator_test.cpp) +target_link_libraries(correlator_test dsaX dsaX_tests) diff --git a/tests/command_line_params.cpp b/tests/command_line_params.cpp new file mode 100644 index 0000000..82c02e8 --- /dev/null +++ b/tests/command_line_params.cpp @@ -0,0 +1,56 @@ +#include "command_line_params.h" + +// General +int core = 0; +bool debug = false; + +// Data block HDU keys +key_t in_key = 0x0000eada; // REORDER_BLOCK_KEY in dsaX_def.h +key_t out_key = 0x0000fada; // XGPU_BLOCK_KEY in dsaX_def.h + +// Test params +bool run_beamformer = false; +bool run_correlator = false; +bool input_rands = true; +bool write_output = false; +int test_iter = 1; +int n_streams = 8; + +// Test files +std::string input_filename = "input.dat"; +std::string output_filename = "output.dat"; + +// DSA hardware configuration +int n_channels = 384; +int n_antennae = 63; +int n_pol = 2; +int n_times = 30720; +double start_frequency = 1498.75; + +std::shared_ptr<dsaXApp> make_app(std::string app_description, std::string app_name) { + + auto dsaX_app = std::make_shared<dsaXApp>(app_description, app_name); + dsaX_app->option_defaults()->always_capture_default(); + + dsaX_app->add_option("--core", core, "Bind process to this CPU core [default 0]"); + dsaX_app->add_option("--debug", debug, "Send debug messages to syslog"); + dsaX_app->add_option("--in-key", in_key, "[default REORDER_BLOCK_KEY]"); + dsaX_app->add_option("--out-key", out_key, "[default XGPU_BLOCK_KEY]"); + dsaX_app->add_option("--run-beamformer", run_beamformer, "Run the beamformer [default false]"); + dsaX_app->add_option("--run-correlator", run_correlator, "Run the correlator [default false]"); + dsaX_app->add_option("--test-iter", test_iter, "Run the test 'test_iter' times [default 1]"); + dsaX_app->add_option("--write-output", write_output, "Write output to disk [default false]"); + dsaX_app->add_option("--n-streams", n_streams, "The number of device streams [default 8]"); + + // Input file options + dsaX_app->add_option("--input-rands", input_rands, "Generate random input [default true]"); + dsaX_app->add_option("--input-filename", input_filename, "Name of file on which to run tests"); + + dsaX_app->add_option("--output-filename", output_filename, "Name of file on which to write results"); + dsaX_app->add_option("--n-channels", n_channels, "Number of frequency channels [default 384]"); + dsaX_app->add_option("--n-antennae", n_antennae, "Number of antennae [default 63]"); + dsaX_app->add_option("--n-pol", n_pol, "Number of polarizations [default 2]"); + dsaX_app->add_option("--n-times", n_times, "Number of times [default 30720]"); + dsaX_app->add_option("--start-frequency", start_frequency, "Start frequency [default 1498.75]"); + return dsaX_app; +} diff --git a/tests/command_line_params.h b/tests/command_line_params.h new file mode 100644 index 0000000..fb9bd1a --- /dev/null +++ b/tests/command_line_params.h @@ -0,0 +1,38 @@ +#pragma once + +#include + +class 
dsaXApp : public CLI::App { + +public: + dsaXApp(std::string app_description = "", std::string app_name = "") : CLI::App(app_description, app_name) {}; + + virtual ~dsaXApp() {}; +}; + +std::shared_ptr make_app(std::string app_description = "dsaX internal test", std::string app_name = ""); + +// General +extern int core; +extern bool debug; + +// Data block HDU keys +extern key_t in_key; +extern key_t out_key; + +// Test mode +extern bool run_beamformer; +extern bool run_correlator; +extern bool input_rands; +extern bool write_output; +extern int test_iter; +extern int n_streams; + +// DSA hardware configureation +extern std::string input_filename; +extern std::string output_filename; +extern int n_channels; +extern int n_antennae; +extern int n_pol; +extern int n_times; +extern double start_frequency; diff --git a/tests/correlator_test.cpp b/tests/correlator_test.cpp new file mode 100644 index 0000000..3cdc699 --- /dev/null +++ b/tests/correlator_test.cpp @@ -0,0 +1,367 @@ +#include //DMH: replace with CLI +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +// Include this file to access input parameters +#include "command_line_params.h" + +// Include the dsaX.h header in your application +#include + +// Include this file to access test utilities +/** + * Promote complex char riri... data to planar half rr.. ii.. + * + * @param[out] inr float precision real array + * @param[out] ini float precision imag array + * @param[in] input char precision complex array + * @param[in] rows number of rows + * @param[in] cols number of cols + */ +template void promoteComplexCharToFloat(prec *output, const char *input, const int rows, const int cols) { + +#pragma omp parallel for collapse(2) + int idx = 0; + for(int i=0; i> 4); + + // 240 in unsigned char binary is 11110000. Perform bitwise & on 240 and input char data iiiirrrr + // to get imag part 4 bit data + // iiii0000. 
+ // Cast to signed char + // +-iii0000 + // Bitshift mantisa only to the right by 4 bits + // +-0000iii + // Cast to float and use CUDA intrinsic to cast to signed half + output[2*idx+1] = (prec)((char)(( (unsigned char)(input[idx]) & (unsigned char)(240) )) >> 4); + } + } +} + +// Assume ROW ordered data in interleaved format +template void host_MdagM_gemm(const prec *A, const prec *B, prec *C, const int m, const int n, const int k) { + +#pragma omp parallel for collapse(2) + for(int i=0; i prec test_hermiticity(const prec *C, const int m, const int n) { + + prec frob_norm = 0.0; + +#pragma omp parallel for collapse(2) reduction (+:frob_norm) + for(int i=0; i +{ +public: + + using vector::vector; + + bool loadFromFile(const char *fileName) noexcept + { + // Try to open a file specified by its name + ifstream file(fileName, ios::in | ios::binary); + if (!file.is_open() || file.bad()) + return false; + + // Clear whitespace removal flag + file.unsetf(ios::skipws); + + // Determine size of the file + file.seekg(0, ios_base::end); + size_t fileSize = file.tellg(); + file.seekg(0, ios_base::beg); + + // Discard previous vector content + resize(0); + reserve(0); + shrink_to_fit(); + + // Order to prealocate memory to avoid unnecessary reallocations due to vector growth + reserve(fileSize); + + // Read entire file content into prealocated vector memory + insert(begin(), + istream_iterator(file), + istream_iterator()); + + // Make sure entire content is loaded + if(size() == fileSize) { + cout << "Successfully read file of size " << fileSize << endl; + return true; + } else { + cout << "Unexpected file size." << endl; + return false; + } + } + + bool saveToFile(const char *fileName) const noexcept + { + // Write entire vector content into a file specified by its name + ofstream file(fileName, ios::out | ios::binary); + try { + file.write((const char *) data(), size()); + } + catch (...) { + return false; + } + + // Determine number of bytes successfully stored in file + size_t fileSize = file.tellp(); + if(size() == fileSize) { + cout << "Successfully wrote file of size " << fileSize << endl; + return true; + } else { + cout << "Unexpected file size." << endl; + return false; + } + } +}; +*/ +int main(int argc, char **argv) { + + // Parse command line + auto app = make_app(); + try { + app->parse(argc, argv); + } catch (const CLI::ParseError &e) { + return app->exit(e); + } + + int device_ordinal = 0; + int packet_size = 4608; + + // Create a data array for a single call to the correlator class + FILE *fin, *fout; + uint64_t sz, in_block_size, rd_size; + in_block_size = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2; + + cout << "Creating char file_array of size " << (1.0*sizeof(char)*in_block_size)/pow(1024,2) << " MB." << endl; + char *file_data = (char *)malloc(in_block_size); + + // read one block of input data + // get size of file + if(!input_rands) { + cout << "attempting to read file " << input_filename.c_str() << endl; + fin = fopen(input_filename.c_str(), "rb"); + fseek(fin, 0L, SEEK_END); + sz = ftell(fin); + if(sz != packet_size) { + cout << "Error: packet size " << packet_size << " and file size " << sz << " are unequal." 
<< endl; + exit(0); + } + rewind(fin); + + // figure out how many reps and chunks to read with + int nreps, nchunks; + if (sz > in_block_size) { + nreps = (int)(sz/in_block_size); + rd_size = in_block_size; + } + else { + nchunks = (int)(in_block_size/sz); + rd_size = sz; + } + + cout << "Packet size = " << sz << endl; + cout << "rd size = " << rd_size << endl; + for (int reps = 0; reps dis; + for (int i = 0; i < n_rand; i++) input_rand[i] = dis(gen); + //for (int i = 0; i < n_rand; i++) input_rand[i] = (uint64_t)1234; + memcpy(file_data, (void*)input_rand, n_rand); + free(input_rand); + } + + // Start dsaX program + //--------------------------------------- + timer::Timer test_timer; + + dsaXInit(device_ordinal); + + // Create Correlator class instance. + dsaXCorrParam param = newDsaXCorrParam(); + param.blas_lib = DSA_BLAS_LIB_CUBLAS; + param.data_type = DSA_BLAS_DATATYPE_4b_COMPLEX; + param.data_order = DSA_BLAS_DATAORDER_ROW; + param.n_streams = n_streams; + printDsaXCorrParam(param); + + auto correlator = new Correlator(¶m); + + // Create GPU registered memory if using CUDA + uint64_t input_size = n_streams*sizeof(char)*in_block_size; + cout << "Creating char input array of size " << input_size << " bytes." << endl; + void *input_data = dsaXHostRegister(input_size); + // Populate with random data. Each stream has the same data + // To ensure the concurrency does not pollute accross streams. + for (int i = 0; icompute(output_data, input_data); + test_timer.stop(); + + float frob_norm = test_hermiticity((float*)output_data, 96, 96); + cout << "Frobenius norm = " << frob_norm << endl; + + //cout << "Output peek " << endl; + float *p = (float*)output_data; + for(int i=0; i<8; i++) cout << "output[" << i << "] = " << p[i] << endl; + + if(write_output) { + fout = fopen(output_filename.c_str(),"ab"); + fwrite((unsigned char *)output_data, sizeof(unsigned char *), sizeof(float)*output_size, fout); + fclose(fout); + } + + delete correlator; + dsaXEnd(); + + cout << "Test time = " << (1.0*test_timer.elapsed().count())/(1e6) << " seconds. " << endl; + + // End dsaX program + //--------------------------------------- + + // free local data + free(input_data); + free(output_data); + return 0; + + /* + // Read data + BinaryFileVector binaryFileVector; + + + if (!binaryFileVector.loadFromFile(test_filename.c_str())) { + cout << "Failed to read the file." << endl; + return 0; + } + + // read one block of input data + for (int i=0;i<512;i++) { + //fin = fopen(test_filename,"rb"); + //fread(input_data + i*4*NANTS*NCHAN_PER_PACKET*2*2, 4*NANTS*NCHAN_PER_PACKET*2*2, 1, fin); + //fclose(fin); + } + + for (int i=0;i<512;i++) { + memcpy(input_data + i*binaryFileVector.size(), binaryFileVector.data(), binaryFileVector.size()); + } + + // Peek at input data (delete after development is complete) + for (int i=0; i<8; i++) inspectPackedData(input_data[i], i); + + // Peek at output data (delete after development is complete) + for (int i=0; i void promoteComplexCharToFloat(prec *output, const char *input, const int rows, const int cols) { + +#pragma omp parallel for collapse(2) + int idx = 0; + for(int i=0; i> 4); + + // 240 in unsigned char binary is 11110000. Perform bitwise & on 240 and input char data iiiirrrr + // to get imag part 4 bit data + // iiii0000. 
+ // Cast to signed char + // +-iii0000 + // Bitshift mantisa only to the right by 4 bits + // +-0000iii + // Cast to float and use CUDA intrinsic to cast to signed half + output[2*idx+1] = (prec)((char)(( (unsigned char)(input[2*idx+1]) & (unsigned char)(240) )) >> 4); + } + } +} + +// Assume ROW ordered data in interleaved format +template void host_MdagM_gemm(const prec *A, const prec *B, prec *C, const int m, const int n, const int k) { + +#pragma omp parallel for collapse(2) + for(int i=0; i prec test_hermiticity(const prec *C, const int m, const int n) { + + prec frob_norm = 0.0; + +#pragma omp parallel for collapse(2) reduction (+:frob_norm) + for(int i=0; i void promoteComplexCharToFloat(prec *output, const char *input, const int rows, const int cols); +template void host_MdagM_gemm(const prec *A, const prec *B, prec *C, const int m, const int n, const int k); +template prec test_hermiticity(const prec *C, const int m, const int n); diff --git a/utils/.gitignore b/utils/.gitignore deleted file mode 100644 index dafcc02..0000000 --- a/utils/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -antennas.out -gen_antennas.py diff --git a/utils/gen_packet.py b/utils/gen_packet.py index 8803832..7ae8ab4 100644 --- a/utils/gen_packet.py +++ b/utils/gen_packet.py @@ -84,8 +84,8 @@ def histo_test(data): n_packet = 4608 # 4608 for single packet # decide which sort of packet to make -noise = True -tone = False +noise = False +tone = True x16 = False # if tone @@ -110,8 +110,8 @@ def histo_test(data): # make packet real_part = np.zeros(n_packet,dtype='int8') imag_part = np.zeros(n_packet,dtype='int8') - for ant in [0,1,2]: - for i in chans: + for ant in [0,1,2]: # 3 antennae + for i in chans: # 384 channels # time 1 pol A j = int(1536*ant + i*4) diff --git a/utils/gen_testblock.py b/utils/gen_testblock.py index ab607b2..2eb0f6e 100644 --- a/utils/gen_testblock.py +++ b/utils/gen_testblock.py @@ -1,6 +1,6 @@ import numpy as np, struct import matplotlib.pyplot as plt - +import os ''' The aim here is to make data blocks to test the bfCorr code. @@ -9,69 +9,42 @@ Structure of a block is [2048 packets, 32 channel groups, ...] + +We want the real and imagniary parts to be random integers over +the range of [-8, 7] +======= ''' # defaults outfile = 'block.out' -n_packet = 4608 # 4608 for single packet -npackets = 4 -nchangs = 32 +if os.path.exists(outfile): + os.remove(outfile) + -# make a block where every s -chans = np.arange(384)#np.asarray([10,100,190] -v1 = 1 -v2 = 2 -v3 = 3 -v4 = 4 -v5 = 5 -v6 = 6 +num_packets = 4 +n_antennae = 3 +n_chans = 384 +n_changs = 32 -vals = [-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7] +# make values in the range vals = [-8, 7] # [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, NANTS/2, 8chan, 2 times, 2 pol, 4-bit complex] -for ipacket in np.arange(npackets): - - print(ipacket) - ant_number = 0 - for ichang in np.arange(nchangs): - - real_part = np.zeros(n_packet,dtype='int8') - imag_part = np.zeros(n_packet,dtype='int8') - - for i in np.arange(3): - for j in np.arange(384): - for k in np.arange(4): - - #v1 = 32.*(j/384.+0.8)*np.random.normal() - #v2 = 32.*(j/384.+0.8)*np.random.normal() - v1 = 32.*(((j % 9)-5)+(i+ipacket-3)) -# if i==0: -# if k==0: -# if ipacket==0: -# print(j,v1/32.) - v2 = 0. 
- ii = i*1536+j*4+k - - real_part[ii] = v1 - imag_part[ii] = v2 - - # make 4-bit versions - real_part = np.cast['uint8'](real_part) - imag_part = np.cast['uint8'](imag_part) - for i in range(n_packet): - real_part[i] = real_part[i] >> 4 - imag_part[i] = (imag_part[i] >> 4) << 4 - - # finish packet - packet = np.zeros(n_packet,dtype='uint8') - for i in range(n_packet): - packet[i] = real_part[i] | imag_part[i] - - out_str = packet.tobytes() +for ipacket in np.arange(num_packets): + print(ipacket) + for ichang in np.arange(n_changs): + + packet = np.zeros(num_packets*n_changs, dtype='uint8') + for i in np.arange(n_antennae): + for j in np.arange(n_chans): + for k in np.arange(num_packets): + + # we now make a random integer in uint8 format + idx = ichang + n_changs*ipacket + packet[idx] = np.random.randint(0, 256) + + out_str = packet.tobytes() newFile = open(outfile, "ab") newFile.write(out_str) newFile.close() - - diff --git a/utils/packet.out b/utils/packet.out deleted file mode 100644 index 435ed74..0000000 Binary files a/utils/packet.out and /dev/null differ diff --git a/utils/test.out b/utils/test.out deleted file mode 100644 index d684e88..0000000 Binary files a/utils/test.out and /dev/null differ
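Appendix (not part of the patch): a minimal usage sketch of the mem_pool allocators introduced above. In the patch these calls are expected to be hidden behind the device_malloc()/device_free() style macros declared in dsaX_malloc.h, which forward __func__/__FILE__/__LINE__; the function name correlator_scratch_example below is illustrative only.

#include <cstddef>

// Hypothetical caller; dsaX code would normally go through the macros
// in dsaX_malloc.h rather than passing func/file/line by hand.
void correlator_scratch_example(size_t nbytes) {
  mem_pool::init();  // reads DSAX_ENABLE_DEVICE/PINNED_MEMORY_POOL once

  // First request allocates via cudaMalloc; later requests of similar size
  // are served from the cache populated by mem_pool::device_free_().
  void *d_buf = mem_pool::device_malloc_(__func__, __FILE__, __LINE__, nbytes);

  // ... launch kernels that use d_buf ...

  // Returns the buffer to the pool cache instead of calling cudaFree.
  mem_pool::device_free_(__func__, __FILE__, __LINE__, d_buf);

  // Actually releases whatever the device cache still holds.
  mem_pool::flush_device();
}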
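Appendix (not part of the patch): the comments in promoteComplexCharToFloat() describe each input byte as iiiirrrr, i.e. a signed 4-bit imaginary part in the high nibble and a signed 4-bit real part in the low nibble. A small self-contained sketch of that unpacking arithmetic; the helper name unpack4bit is illustrative, and like the original test code it assumes plain char is signed.

#include <cstdio>

// byte layout: high nibble = signed 4-bit imag, low nibble = signed 4-bit real
static void unpack4bit(unsigned char byte, float &re, float &im) {
  // real: keep the low nibble (mask 00001111), shift it into the high nibble,
  // then arithmetic-shift back down so the sign bit is propagated
  re = (float)((char)((byte & 0x0F) << 4) >> 4);
  // imag: keep the high nibble (mask 11110000), reinterpret as signed char,
  // then arithmetic-shift down by 4, again propagating the sign
  im = (float)((char)(byte & 0xF0) >> 4);
}

int main() {
  float re, im;
  unpack4bit(0xF7, re, im);  // high nibble 1111 -> -1, low nibble 0111 -> +7
  std::printf("re = %.0f, im = %.0f\n", re, im);  // expect re = 7, im = -1
  return 0;
}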
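Appendix (not part of the patch): correlator_test.cpp validates the correlator output with test_hermiticity(), whose body is garbled in this rendering. As a reading aid only, the sketch below shows one common way to write such a check for a row-ordered, interleaved-complex matrix: the output of X^dagger X should be Hermitian, so the Frobenius norm of (C - C^dagger) should vanish up to rounding. The function name and storage convention here are assumptions, not the patch's exact implementation.

#include <cmath>

template <typename prec>
prec hermiticity_residual(const prec *C, int n) {
  // C is n x n, row ordered, interleaved complex: element (i,j) stores its
  // real part at C[2*(i*n + j)] and its imaginary part at C[2*(i*n + j) + 1].
  prec frob = 0;
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < n; j++) {
      prec d_re = C[2*(i*n + j)]     - C[2*(j*n + i)];      // Re C_ij == Re C_ji
      prec d_im = C[2*(i*n + j) + 1] + C[2*(j*n + i) + 1];  // Im C_ij == -Im C_ji
      frob += d_re * d_re + d_im * d_im;
    }
  }
  return std::sqrt(frob);
}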