From 6fee8e2b2ff99fd8717932d7be3ef8d783feb78b Mon Sep 17 00:00:00 2001 From: cpviolator Date: Tue, 4 Jun 2024 18:04:07 -0700 Subject: [PATCH 01/30] Added some CMake functionality, missing XGPU, sigproc, and fitsio dependencies --- CMakeLists.txt | 125 +++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 119 +++++++++++++++++++++++++++++++ src/Makefile | 1 - src/cuda_correlator.cu | 2 +- src/dsaX_beamformer.cu | 9 ++- src/dsaX_beamformer_offline.cu | 4 +- src/dsaX_beamformer_passon.cu | 10 +-- src/dsaX_bfCorr.cu | 4 +- src/dsaX_capture.c | 4 +- src/dsaX_capture_manythread.c | 6 +- src/dsaX_capture_thread.c | 8 +-- src/dsaX_copydb.c | 6 +- src/dsaX_fake.c | 8 +-- src/dsaX_filTrigger.c | 16 ++--- src/dsaX_fluff.c | 6 +- src/dsaX_merge.c | 8 +-- src/dsaX_nicdb.c | 4 +- src/dsaX_reorder.c | 6 +- src/dsaX_reorder_raw.c | 11 +-- src/dsaX_simplesplit.c | 8 +-- src/dsaX_split.c | 10 +-- src/dsaX_splitup.c | 6 +- src/dsaX_store.c | 6 +- src/dsaX_trigger.c | 18 ++--- src/dumpfil.c | 4 +- src/fil2dada.c | 33 +++++---- src/test_read.c | 2 +- src/test_write.c | 9 ++- utils/packet.out | Bin 4608 -> 4608 bytes 29 files changed, 357 insertions(+), 96 deletions(-) create mode 100644 CMakeLists.txt create mode 100644 src/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..66682b6 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,125 @@ +#################################################################################### +# START 1. 
Basic setup for cmake +#################################################################################### +# basic setup for cmake +cmake_minimum_required(VERSION 3.18 FATAL_ERROR) + +if(POLICY CMP0074) + cmake_policy(SET CMP0074 NEW) +endif() + +set(CMAKE_INCLUDE_CURRENT_DIR ON) +set(CMAKE_INCLUDE_DIRECTORIES_PROJECT_BEFORE ON) +set(CMAKE_COLOR_MAKEFILE ON) +set(CMAKE_CXX_STANDARD_REQUIRED True) +# GNU extensions are enabled here (set to OFF to disable them) +set(CMAKE_CXX_EXTENSIONS ON) + +# Define the project +project("DSA_XENGINE" VERSION 1.0.0 LANGUAGES CXX CUDA C) + +# DSA_XENGINE may be built to run using CUDA. Future versions may be +# written for HIP or SYCL; that choice of backend is what we call the +# target type. By default, the target is CUDA. +if(DEFINED ENV{DSA_XENGINE_TARGET}) + set(DEFTARGET $ENV{DSA_XENGINE_TARGET}) +else() + set(DEFTARGET "CUDA") +endif() + +set(VALID_TARGET_TYPES CUDA) #HIP SYCL +set(DSA_XENGINE_TARGET_TYPE + "${DEFTARGET}" + CACHE STRING "Choose the type of target, options are: ${VALID_TARGET_TYPES}") +set_property(CACHE DSA_XENGINE_TARGET_TYPE PROPERTY STRINGS CUDA) + +# CUDA specific part of CMakeLists +#set(CMAKE_CUDA_EXTENSIONS OFF) +find_package(CUDAToolkit REQUIRED) + +if(DEFINED ENV{DSA_XENGINE_GPU_ARCH}) + set(DSA_XENGINE_DEFAULT_GPU_ARCH $ENV{DSA_XENGINE_GPU_ARCH}) +else() + set(DSA_XENGINE_DEFAULT_GPU_ARCH sm_70) +endif() +if(NOT DSA_XENGINE_GPU_ARCH) + message(STATUS "Building DSA_XENGINE for GPU ARCH " "${DSA_XENGINE_DEFAULT_GPU_ARCH}") +endif() + +set(DSA_XENGINE_GPU_ARCH + ${DSA_XENGINE_DEFAULT_GPU_ARCH} + CACHE STRING "set the GPU architecture (sm_60, sm_70, sm_80 sm_90)") +set_property(CACHE DSA_XENGINE_GPU_ARCH PROPERTY STRINGS sm_60 sm_70 sm_80 sm_90) +set(DSA_XENGINE_GPU_ARCH_SUFFIX + "" + CACHE STRING "set the GPU architecture suffix (virtual, real). 
Leave empty for no suffix.") +set_property(CACHE DSA_XENGINE_GPU_ARCH_SUFFIX PROPERTY STRINGS "real" "virtual" " ") +#set(CMAKE_CUDA_ARCHITECTURES ${DSA_XENGINE_GPU_ARCH}) +mark_as_advanced(DSA_XENGINE_GPU_ARCH_SUFFIX) +mark_as_advanced(CMAKE_CUDA_ARCHITECTURES) + +string(TOUPPER ${DSA_XENGINE_TARGET_TYPE} CHECK_TARGET_TYPE) +list(FIND VALID_TARGET_TYPES ${CHECK_TARGET_TYPE} TARGET_TYPE_VALID) + +if(TARGET_TYPE_VALID LESS 0) + message(SEND_ERROR "Please specify a valid DSA_XENGINE_TARGET_TYPE type! Valid target types are:" "${VALID_TARGET_TYPES}") +endif() + +# Git +find_package(Git) +if(GIT_FOUND) + execute_process( + COMMAND ${GIT_EXECUTABLE} show + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + RESULT_VARIABLE IS_GIT_REPOSIITORY + OUTPUT_QUIET ERROR_QUIET) + if(${IS_GIT_REPOSIITORY} EQUAL 0) + execute_process( + COMMAND ${GIT_EXECUTABLE} describe --abbrev=0 + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE GITTAG + OUTPUT_STRIP_TRAILING_WHITESPACE) + # we use git rev-list and pipe that through wc here. 
Newer git versions support --count as option to rev-list but + # that might not always be available + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-list ${GITTAG}..HEAD + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + COMMAND wc -l + OUTPUT_VARIABLE GITCOUNT + OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${GIT_EXECUTABLE} describe --match 1 --always --long --dirty + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE GITVERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + endif() +endif(GIT_FOUND) + +# EXTERNALS +include(FetchContent) +# Get psrdada dependency +option(DSA_XENGINE_DOWNLOAD_PSRDADA "Download and build PSRDada" ON) +if(DSA_XENGINE_DOWNLOAD_PSRDADA) + FetchContent_Declare( + PSRDada + GIT_REPOSITORY git://git.code.sf.net/p/psrdada/code + GIT_TAG 008afa70393ae2df11efba0cc8d0b95cda599c02 + ) + FetchContent_MakeAvailable(PSRDada) +endif() + +option(DSA_XENGINE_DOWNLOAD_XGPU "Download and build XGPU" ON) +if(DSA_XENGINE_DOWNLOAD_XGPU) + FetchContent_Declare( + XGPU + GIT_REPOSITORY https://github.com/GPU-correlators/xGPU.git + GIT_TAG 7e85bd5da619c026e1bfbb64325ed122323b8854 + ) + FetchContent_MakeAvailable(XGPU) +endif() + + +# Add src +add_subdirectory(src) + + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..1b0a548 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,119 @@ +#enable_language(CUDA) + +set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so) +include_directories(${PSRDada_SOURCE_DIR}/src) + +add_executable(test_write test_write.c) +target_link_libraries(test_write ${PSRDada_LIB}) + +add_executable(test_read test_read.c) +target_link_libraries(test_read ${PSRDada_LIB}) + +add_executable(dsaX_trigger dsaX_trigger.c) +target_link_libraries(dsaX_trigger ${PSRDada_LIB}) + +add_executable(dsaX_filTrigger dsaX_filTrigger.c) +target_link_libraries(dsaX_filTrigger ${PSRDada_LIB}) + +# DMH: Has a 'sigproc' dependency, low priority +if(0) + add_executable(splice_offline_beams splice_offline_beams.c) + 
target_link_libraries(splice_offline_beams ${PSRDada_LIB}) + + add_executable(dsaX_writeFil dsaX_writeFil.c) + target_link_libraries(dsaX_writeFil ${PSRDada_LIB}) + + add_executable(dsaX_splice dsaX_splice.c) + target_link_libraries(dsaX_splice ${PSRDada_LIB}) + + add_executable(gpu_flagger gpu_flagger.cu) + target_link_libraries(gpu_flagger ${PSRDada_LIB}) +endif() + +add_executable(dsaX_store dsaX_store.c) +target_link_libraries(dsaX_store ${PSRDada_LIB}) + +add_executable(dsaX_fluff dsaX_fluff.c) +target_link_libraries(dsaX_fluff ${PSRDada_LIB}) + +# DMH: intrinsics compilation error +#add_executable(dsaX_reorder dsaX_reorder.c) +#target_link_libraries(dsaX_reorder ${PSRDada_LIB}) + +# DMH: /scratch/CPviolator/work/DSA110/dsa110-xengine/src/dsaX_nicdb.c: In function ‘process’: +#/scratch/CPviolator/work/DSA110/dsa110-xengine/src/dsaX_nicdb.c:145:65: warning: integer overflow in expression of type ‘int’ results in ‘-1073741824’ [-Woverflow] +# 145 | uint64_t shifty = (bdepth-1)*NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NCHAN_FIL; +add_executable(dsaX_nicdb dsaX_nicdb.c) +target_link_libraries(dsaX_nicdb ${PSRDada_LIB}) + +add_executable(dsaX_capture dsaX_capture.c) +target_link_libraries(dsaX_capture ${PSRDada_LIB}) + +add_executable(dsaX_capture_thread dsaX_capture_thread.c) +target_link_libraries(dsaX_capture_thread ${PSRDada_LIB}) + +add_executable(dsaX_capture_manythread dsaX_capture_manythread.c) +target_link_libraries(dsaX_capture_manythread ${PSRDada_LIB}) + +add_executable(dsaX_split dsaX_split.c) +target_link_libraries(dsaX_split ${PSRDada_LIB} -lm) + +add_executable(dsaX_merge dsaX_merge.c) +target_link_libraries(dsaX_merge ${PSRDada_LIB}) + +add_executable(dsaX_simplesplit dsaX_simplesplit.c) +target_link_libraries(dsaX_simplesplit ${PSRDada_LIB}) + +add_executable(dsaX_fake dsaX_fake.c) +target_link_libraries(dsaX_fake ${PSRDada_LIB}) + +add_executable(dsaX_splitup dsaX_splitup.c) +target_link_libraries(dsaX_splitup ${PSRDada_LIB}) + 
+add_executable(dsaX_copydb dsaX_copydb.c) +target_link_libraries(dsaX_copydb ${PSRDada_LIB}) + +# DMH: fitsio dependency +if(0) + add_executable(dsaX_writevis dsaX_writevis.c) + target_link_libraries(dsaX_writevis ${PSRDada_LIB}) +endif() + +# DMH: XGPU dependencies +if(0) + add_executable(dsaX_wrangle dsaX_wrangle.c) + target_link_libraries(dsaX_wrangle ${PSRDada_LIB}) + + add_executable(dsaX_testdada dsaX_testdada.c) + target_link_libraries(dsaX_testdada ${PSRDada_LIB}) + + add_executable(dsaX_bfCorr dsaX_bfCorr.cu) + target_link_libraries(dsaX_bfCorr ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) + + # DMH: Fix CUBE error + add_executable(dsaX_xgpu dsaX_xgpu.cu) + target_link_libraries(dsaX_xgpu ${PSRDada_LIB}) + + add_executable(cuda_correlator cuda_correlator.cu) + target_link_libraries(cuda_correlator ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) + +endif() + +add_executable(dsaX_reorder_raw dsaX_reorder_raw.c) +target_link_libraries(dsaX_reorder_raw ${PSRDada_LIB}) + +add_executable(fil2dada fil2dada.c) +target_link_libraries(fil2dada ${PSRDada_LIB}) + +add_executable(dumpfil dumpfil.c) +target_link_libraries(dumpfil ${PSRDada_LIB}) + +add_executable(dsaX_beamformer dsaX_beamformer.cu) +target_link_libraries(dsaX_beamformer ${PSRDada_LIB}) + +add_executable(dsaX_beamformer_passon dsaX_beamformer_passon.cu) +target_link_libraries(dsaX_beamformer_passon ${PSRDada_LIB}) + +add_executable(dsaX_beamformer_offline dsaX_beamformer_offline.cu) +target_link_libraries(dsaX_beamformer_offline ${PSRDada_LIB}) + diff --git a/src/Makefile b/src/Makefile index bbca4e0..0de1991 100644 --- a/src/Makefile +++ b/src/Makefile @@ -63,7 +63,6 @@ dsaX_reorder.o: dsaX_reorder.c $(CDEPS1) dsaX_reorder: dsaX_reorder.o $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - dsaX_dbnic.o: dsaX_dbnic.c $(CDEPS1) $(CC) -c -o $@ $< $(CFLAGS1) diff --git a/src/cuda_correlator.cu b/src/cuda_correlator.cu index eb0882c..9d9e66d 100644 --- a/src/cuda_correlator.cu +++ b/src/cuda_correlator.cu @@ -36,7 +36,7 @@ using 
std::endl; #include "dada_affinity.h" #include "ascii_header.h" #include "dsaX_def.h" -#include "cube/cube.h" +//#include "cube/cube.h" #include "xgpu.h" diff --git a/src/dsaX_beamformer.cu b/src/dsaX_beamformer.cu index 5efcfca..afdda70 100644 --- a/src/dsaX_beamformer.cu +++ b/src/dsaX_beamformer.cu @@ -30,6 +30,9 @@ Second kernel will simply add times and adjacent channels and pick leading 8 bit Then copy back to specific locations in host to form final [beam, time, frequency] array, to be sent to corner turn. */ + +#define THRUST_IGNORE_CUB_VERSION_CHECK + #include #include using std::cout; @@ -811,7 +814,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -822,7 +825,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -871,7 +874,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; int nints = NPACKETS / 16; uint64_t nbytes_per_int = block_size / nints; diff --git a/src/dsaX_beamformer_offline.cu b/src/dsaX_beamformer_offline.cu index 13eab5e..c122d46 100644 --- a/src/dsaX_beamformer_offline.cu +++ b/src/dsaX_beamformer_offline.cu @@ -30,6 +30,8 @@ Second kernel will simply add times and adjacent channels and pick leading 8 bit Then copy back to 
specific locations in host to form final [beam, time, frequency] array, to be sent to corner turn. */ +#define THRUST_IGNORE_CUB_VERSION_CHECK + #include #include using std::cout; @@ -723,7 +725,7 @@ int main (int argc, char *argv[]) { uint64_t block_out = 15*48*512*256; char * block; block = (char *)malloc(sizeof(char)*block_size); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); int nints = NPACKETS / 16; uint64_t nbytes_per_int = block_size / nints; uint64_t nbytes_per_out = block_out / nints; diff --git a/src/dsaX_beamformer_passon.cu b/src/dsaX_beamformer_passon.cu index 7c8c254..818c28a 100644 --- a/src/dsaX_beamformer_passon.cu +++ b/src/dsaX_beamformer_passon.cu @@ -30,6 +30,8 @@ Second kernel will simply add times and adjacent channels and pick leading 8 bit Then copy back to specific locations in host to form final [beam, time, frequency] array, to be sent to corner turn. 
*/ +#define THRUST_IGNORE_CUB_VERSION_CHECK + #include #include using std::cout; @@ -721,7 +723,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -732,7 +734,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -743,7 +745,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out2 = dada_hdu_create (); + hdu_out2 = dada_hdu_create (0); dada_hdu_set_key (hdu_out2, out_key2); if (dada_hdu_connect (hdu_out2) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -809,7 +811,7 @@ int main (int argc, char *argv[]) { uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); uint64_t block_out2 = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out2->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; int nints = NPACKETS / 16; uint64_t nbytes_per_int = block_size / nints; diff --git a/src/dsaX_bfCorr.cu b/src/dsaX_bfCorr.cu index deca0f5..01c45e1 100644 --- a/src/dsaX_bfCorr.cu +++ b/src/dsaX_bfCorr.cu @@ -1122,7 +1122,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -1133,7 +1133,7 @@ int main (int argc, char *argv[]) { return 
EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); diff --git a/src/dsaX_capture.c b/src/dsaX_capture.c index d83d8a9..054e45d 100644 --- a/src/dsaX_capture.c +++ b/src/dsaX_capture.c @@ -685,7 +685,7 @@ int main (int argc, char *argv[]) { if (DEBUG) syslog(LOG_INFO,"Creating HDU"); - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); if (DEBUG) syslog(LOG_INFO,"Created hdu"); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { @@ -861,7 +861,7 @@ int main (int argc, char *argv[]) { } else // we received a packet of the WRONG size, ignore it { - syslog (LOG_NOTICE, "receive_obs: received %d bytes, expected %d", got, UDP_PAYLOAD); + syslog (LOG_NOTICE, "receive_obs: received %lu bytes, expected %d", got, UDP_PAYLOAD); } } timeouts = 0; diff --git a/src/dsaX_capture_manythread.c b/src/dsaX_capture_manythread.c index 06f508a..b9f14bd 100644 --- a/src/dsaX_capture_manythread.c +++ b/src/dsaX_capture_manythread.c @@ -427,7 +427,7 @@ void control_thread (void * arg) { /* * Thread to capture data */ -void recv_thread(void * arg) { +int recv_thread(void * arg) { udpdb_t * udpdb = (udpdb_t *) arg; int thread_id = udpdb->thread_id; @@ -528,7 +528,7 @@ void recv_thread(void * arg) { } else // we received a packet of the WRONG size, ignore it { - syslog (LOG_NOTICE, "receive_obs: received %d bytes, expected %d", got, UDP_PAYLOAD); + syslog (LOG_NOTICE, "receive_obs: received %lu bytes, expected %d", got, UDP_PAYLOAD); } } timeouts = 0; @@ -953,7 +953,7 @@ int main (int argc, char *argv[]) { if (DEBUG) syslog(LOG_DEBUG,"Creating HDU"); - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); if (DEBUG) syslog(DEBUG,"Created hdu"); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { diff --git a/src/dsaX_capture_thread.c b/src/dsaX_capture_thread.c index 
3cc0c96..49019be 100644 --- a/src/dsaX_capture_thread.c +++ b/src/dsaX_capture_thread.c @@ -518,7 +518,7 @@ void control_thread (void * arg) { /* * Thread to capture data */ -void recv_thread(void * arg) { +int recv_thread(void * arg) { // set affinity const pthread_t pid = pthread_self(); @@ -604,7 +604,7 @@ void recv_thread(void * arg) { } else // we received a packet of the WRONG size, ignore it { - syslog (LOG_NOTICE, "receive_obs: received %d bytes, expected %d", got, UDP_PAYLOAD); + syslog (LOG_NOTICE, "receive_obs: received %lu bytes, expected %d", got, UDP_PAYLOAD); } } timeouts = 0; @@ -753,7 +753,7 @@ void recv_thread(void * arg) { /* * Thread to write data */ -void write_thread(void * arg) { +int write_thread(void * arg) { // set affinity const pthread_t pid = pthread_self(); @@ -964,7 +964,7 @@ int main (int argc, char *argv[]) { if (DEBUG) syslog(LOG_DEBUG,"Creating HDU"); - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); if (DEBUG) syslog(DEBUG,"Created hdu"); dada_hdu_set_key (hdu_out, CAPTURE_BLOCK_KEY); if (dada_hdu_connect (hdu_out) < 0) { diff --git a/src/dsaX_copydb.c b/src/dsaX_copydb.c index 054ee94..7714038 100644 --- a/src/dsaX_copydb.c +++ b/src/dsaX_copydb.c @@ -160,7 +160,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -171,7 +171,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -220,7 +220,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz 
((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; char * block; uint64_t written, block_id; diff --git a/src/dsaX_fake.c b/src/dsaX_fake.c index e68f19a..662ea37 100644 --- a/src/dsaX_fake.c +++ b/src/dsaX_fake.c @@ -175,7 +175,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -186,7 +186,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -235,7 +235,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; uint64_t npackets = block_out / 4608; char * block, * output_buffer; @@ -257,7 +257,7 @@ int main (int argc, char *argv[]) { fread(packet,4608,1,fin); fclose(fin); - syslog(LOG_INFO,"Read packet, npackets %llu",npackets); + syslog(LOG_INFO,"Read packet, npackets %lu",npackets); for (int i=0;idata_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu 
%lu\n",block_size,block_out); uint64_t bytes_read = 0; char * block, * output_buffer, * blockie; output_buffer = (char *)malloc(sizeof(char)*block_out); diff --git a/src/dsaX_merge.c b/src/dsaX_merge.c index 0154b80..7866d5f 100644 --- a/src/dsaX_merge.c +++ b/src/dsaX_merge.c @@ -255,7 +255,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -266,7 +266,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -277,7 +277,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_in2 = dada_hdu_create (); + hdu_in2 = dada_hdu_create (0); dada_hdu_set_key (hdu_in2, in_key2); if (dada_hdu_connect (hdu_in2) < 0) { syslog (LOG_ERR,"could not connect to input buffer2"); @@ -455,7 +455,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; char * block1, * block2, * o1, * o2; char * output = (char *)malloc(sizeof(char)*block_out); diff --git a/src/dsaX_nicdb.c b/src/dsaX_nicdb.c index 65cfdcc..df47ebe 100644 --- a/src/dsaX_nicdb.c +++ b/src/dsaX_nicdb.c @@ -369,7 +369,7 @@ int main(int argc, char ** argv) // DADA stuff - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 
0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -408,7 +408,7 @@ int main(int argc, char ** argv) // get block sizes and allocate memory uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have output block sizes %llu\n",block_out); + syslog(LOG_INFO, "main: have output block sizes %lu\n",block_out); uint64_t bytes_read = 0; char *output1, *output2; output1 = (char *)malloc(sizeof(char)*block_out*bdepth); diff --git a/src/dsaX_reorder.c b/src/dsaX_reorder.c index ed0b440..04955da 100644 --- a/src/dsaX_reorder.c +++ b/src/dsaX_reorder.c @@ -369,7 +369,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -380,7 +380,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -435,7 +435,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; char * block, * output_buffer; output_buffer = (char *)malloc(sizeof(char)*block_out); diff --git a/src/dsaX_reorder_raw.c b/src/dsaX_reorder_raw.c index d1a7ca3..c0f6b0c 100644 --- a/src/dsaX_reorder_raw.c +++ b/src/dsaX_reorder_raw.c @@ -28,6 +28,9 @@ #include "dada_def.h" #include "dada_hdu.h" #include "ipcio.h" +// Forward declaration to keep compiler happy +// 
Possible minor bug in PSRDada +int ipcio_check_pending_sod (ipcio_t* ); #include "ipcbuf.h" #include "dada_affinity.h" #include "ascii_header.h" @@ -391,7 +394,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -402,7 +405,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -414,7 +417,7 @@ int main (int argc, char *argv[]) { } if (bf) { - hdu_out2 = dada_hdu_create (); + hdu_out2 = dada_hdu_create (0); dada_hdu_set_key (hdu_out2, out_key2); if (dada_hdu_connect (hdu_out2) < 0) { syslog (LOG_ERR,"could not connect to output buffer2"); @@ -501,7 +504,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; char * block, * output_buffer, * blockie; output_buffer = (char *)malloc(sizeof(char)*block_out); diff --git a/src/dsaX_simplesplit.c b/src/dsaX_simplesplit.c index fb41432..7a80c7e 100644 --- a/src/dsaX_simplesplit.c +++ b/src/dsaX_simplesplit.c @@ -193,7 +193,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -204,7 +204,7 
@@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -216,7 +216,7 @@ int main (int argc, char *argv[]) { } if (bf) { - hdu_out2 = dada_hdu_create (); + hdu_out2 = dada_hdu_create (0); dada_hdu_set_key (hdu_out2, out_key2); if (dada_hdu_connect (hdu_out2) < 0) { syslog (LOG_ERR,"could not connect to output buffer2"); @@ -298,7 +298,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; char * block, * output_buffer, * o1, * o2; output_buffer = (char *)malloc(sizeof(char)*block_out); diff --git a/src/dsaX_split.c b/src/dsaX_split.c index d5724cd..1361e86 100644 --- a/src/dsaX_split.c +++ b/src/dsaX_split.c @@ -135,7 +135,7 @@ void calc_stats(char *input) { } for (int i=0;idata_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); uint64_t nints = block_size / block_out; - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; char * block, * output_buffer, * o1, * o2; output_buffer = (char *)malloc(sizeof(char)*block_out); diff --git a/src/dsaX_splitup.c b/src/dsaX_splitup.c index 3a9ab10..32f055d 100644 --- a/src/dsaX_splitup.c +++ b/src/dsaX_splitup.c @@ -160,7 +160,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = 
dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -171,7 +171,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -220,7 +220,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; uint64_t nsplits = block_size/block_out; char * block, * output_buffer; diff --git a/src/dsaX_store.c b/src/dsaX_store.c index de53134..849c27c 100644 --- a/src/dsaX_store.c +++ b/src/dsaX_store.c @@ -112,7 +112,7 @@ int main (int argc, char *argv[]) { // open connection to the in/read DB - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to input buffer"); @@ -167,7 +167,7 @@ int main (int argc, char *argv[]) { char fnam[100]; - syslog(LOG_INFO, "have ngulps %d, blocksize %llu, bout %llu",ngulps,blocksize,bout); + syslog(LOG_INFO, "have ngulps %d, blocksize %lu, bout %lu",ngulps,blocksize,bout); // main reading loop @@ -202,7 +202,7 @@ int main (int argc, char *argv[]) { // for exiting if (bytes_read < blocksize) { observation_complete = 1; - syslog(LOG_INFO, "main: finished, with bytes_read %llu < expected %llu", bytes_read, blocksize); + syslog(LOG_INFO, "main: finished, with bytes_read %lu < expected %lu", bytes_read, blocksize); } // close block for reading diff --git 
a/src/dsaX_trigger.c b/src/dsaX_trigger.c index 26342a4..9592389 100644 --- a/src/dsaX_trigger.c +++ b/src/dsaX_trigger.c @@ -186,11 +186,11 @@ void control_thread (void * arg) { //specnum = (uint64_t)(strtoull(buffer,&endptr,0)*16); specnum = tmps; strcpy(footer_buf,tbuf); - syslog(LOG_INFO, "control_thread: received command to dump at %llu",specnum); + syslog(LOG_INFO, "control_thread: received command to dump at %lu",specnum); } if (dump_pending) - syslog(LOG_ERR, "control_thread: BACKED UP - CANNOT dump at %llu",tmps); + syslog(LOG_ERR, "control_thread: BACKED UP - CANNOT dump at %lu",tmps); if (!dump_pending) dump_pending = 1; @@ -341,7 +341,7 @@ int main (int argc, char *argv[]) { // open connection to the in/read DBs - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer"); @@ -352,7 +352,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output dada buffer"); @@ -525,9 +525,9 @@ int main (int argc, char *argv[]) { // DO writing using thread docopy = 1; - syslog(LOG_INFO, "written trigger from specnum %llu TRIGNUM%d DUMPNUM%d %s", specnum, trignum-1, dumpnum, footer_buf); + syslog(LOG_INFO, "written trigger from specnum %lu TRIGNUM%d DUMPNUM%d %s", specnum, trignum-1, dumpnum, footer_buf); ofile = fopen("/home/ubuntu/data/dumps.dat","a"); - fprintf(ofile,"written trigger from specnum %llu TRIGNUM%d DUMPNUM%d %s\n", specnum, trignum-1, dumpnum, footer_buf); + fprintf(ofile,"written trigger from specnum %lu TRIGNUM%d DUMPNUM%d %s\n", specnum, trignum-1, dumpnum, footer_buf); fclose(ofile); dumpnum++; @@ -539,7 +539,7 @@ int main (int argc, char *argv[]) { // if trigger arrived too late if (specnum < current_specnum-specs_per_block && dumping==0 
&& dump_pending==1) { - syslog(LOG_INFO, "trigger arrived too late: specnum %llu, current_specnum %llu",specnum,current_specnum); + syslog(LOG_INFO, "trigger arrived too late: specnum %lu, current_specnum %lu",specnum,current_specnum); bytes_copied=0; dump_pending=0; @@ -550,7 +550,7 @@ int main (int argc, char *argv[]) { } // update current spec - syslog(LOG_INFO,"current_specnum %llu",current_specnum); + syslog(LOG_INFO,"current_specnum %lu",current_specnum); if (block_count < skips) { block_count++; } @@ -561,7 +561,7 @@ int main (int argc, char *argv[]) { // for exiting if (bytes_read < block_size) { observation_complete = 1; - syslog(LOG_INFO, "main: finished, with bytes_read %llu < expected %llu\n", bytes_read, block_size); + syslog(LOG_INFO, "main: finished, with bytes_read %lu < expected %lu\n", bytes_read, block_size); } // close block for reading diff --git a/src/dumpfil.c b/src/dumpfil.c index 0e658a5..0be913c 100644 --- a/src/dumpfil.c +++ b/src/dumpfil.c @@ -202,7 +202,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -236,7 +236,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - syslog(LOG_INFO, "main: have input block size %llu\n",block_size); + syslog(LOG_INFO, "main: have input block size %lu\n",block_size); uint64_t bytes_read = 0; uint64_t npackets = 1; char * block, * output_buffer; diff --git a/src/fil2dada.c b/src/fil2dada.c index c2235ec..c49f2b5 100644 --- a/src/fil2dada.c +++ b/src/fil2dada.c @@ -94,7 +94,9 @@ void get_string(FILE *inputfile, int *nbytes, char string[]) } */ -/*int read_header(FILE *inputfile) +int read_header(FILE *inputfile); +/* +int read_header(FILE *inputfile) { size_t nRead; char string[80], 
message[80]; @@ -353,7 +355,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -364,7 +366,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -413,7 +415,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; uint64_t npackets = 1; char * block, * output_buffer; @@ -431,17 +433,19 @@ int main (int argc, char *argv[]) { syslog(LOG_ERR, "cannot open file - will write zeros"); } else { - - if (rhead) read_header(fin); -// fread(packet,block_out,1,fin); -// fclose(fin); -// syslog(LOG_INFO,"Read packet, npackets %llu",npackets); + // DMH: FIXME + //if (rhead) read_header(fin); -// for (int i=0;idata_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; char * block, * output_buffer, * blockie; output_buffer = (char *)malloc(sizeof(char)*block_out); diff --git a/utils/packet.out b/utils/packet.out index 435ed746680340c61a20bff1c7e2a38931ec8d31..de3b9a47bdebd485332ff433d1644d6d6ff77d33 100644 GIT binary patch literal 
4608 zcmWkyyPMoNaxbvu#5=^JXB~M6kgZfr&TOt3uP?Pv=l{^(=iJs^Zr@10a96|by-?Ex zT6@u|fdYFSZ4>319K@sQ_bLJm7ny4)ql5q}+Nd(3EcGN3C6vm^lnbJ9<0H6gDSZM< zsIhDmWRZd14u!vsy^l&SS;?eYojN1S+L)Fy@bys#@$q?w4&11cS4K!bJ+uu$jU|m(*0U&jG*xjYrGd(I2{ZAaEj~0 zfmFK5d@-`MUuICk%e5r6K8MvXPDp0TEWRpWI{Wv zv87<791j&AkhE@sR*OCmeO6;3)fHGC2WSycEj}NdBaM@z_Fnv!y{faXp@ptn!z4fq z)d|PH0SYGqh<0m@@V@>c{l})%W_cv=x{7TEC!T~Dr5Hw9rlip4Hl!lxo?6w<-_)rA+5;!vNJmmUyW%D9jMa+f^^oW)?r+mIsd3jp6#jh)M|VFd!cH^;O)aM z&Z<7B3UCb6p0Bb~)qxh=0fxL6igqy$V-Z&7W4k4B>`p0TUdV$canLln;IGKU$~g6( zc9Ge`snSXtC6t+u@xg}~*KO&t!xSG~RHtWYQs73{m<~||U+mKum7;it`?=P|!6}Eo z(texP`83mX)wnk4iTkF-X6?%bty}80x#T7tkLz?@qzN>y^Naz;R5!mT2e!72*?di6 zubidSsl}~(LQj5HsOQ4>e`t^?Rhu+VTpR|qc6}F`SGb#-M6*MF9y1C{`YyeCli}G9 z4UUL5hf9KnRL$Q{KThb@@;dONC(IIan{%wFm0uEO0*Y~`e&?5ORZuE%Q-l#%g^!ss zAPV{G66^H39u(5hVIIPTS_6j03SGVE5^nUX7@snWM`r3(B{ zb3na&J_ofeaUjj5bc;8s%BN~yPvn$rNob4SUz%N2AI9OHiJ7`Xc5c@Cgwm~BW}Yai z%O$NzxIF6g1Q8=_gx1%6sY6EFrk0<2qjV!YQ@{5^E~5Zo0Jnc*!sV{b!JWS#)YBu< zCUu0Q_2&Sr#|%<#@MSu2nW6uH%x8a6 zxI04B*M!@rw4^HJ+LoEPyNjaB)yHl?H?o3_Yu=W(a5uKQ1f%yXf=WES$sMj>6eQ?} z-I@TdD~%LDYuGFh;aD1F|9L%%_8KXTHdcD0+ZfJw+Dn;y>p&D>QqjiSS@M}nLrIS? 
z?8a|da_y~(_++q5a(+?>vZ`+_Gb(P)Y$%L5EyJv;dl6oX1^IvnUay6S-}NOAnF!^( z;Z(HD3}Intakjn{A@_P!xn~oG6PE#EF&$p-9jcB&LKtPaKlzBQ#?G&k=+3e#Iv)QujCw>T zLBGEivDb{%^u4I7;Sv9#;wK8ktQjo7be|IQ#$<6sd6T_ceMnxz{gbuqI0W63VQhGR zT<0WQ$k#8dsZYCv6xrR_eY2V~1$6+nXnQNoC&RX}(p{YWyk(Ni?_B555*+_P3>xL~ zWTVQrm$fL*eQGAuH2b4%;5v0pjZ!~CR!65N^3P2-Mau@N^E&|ws=Q7{Ib6LmxYS~@ zg#|w=S1aEvLM4?%E5h?#J-~pP37P~|q@KS`cxJ4}#=&y5JPc9nbG#>H+zAyLp^Y0!*k?F!=3^#6K3x9Y93j%jhe>39NomIjyGQPQS{5s z4)gTE(s&i(+~R0UCA{$4ujTe&blWmZ-!q(M;Idd9SdtnG_W(%~R$2h^AB!% zqrNLtM ze%wl3e`%BSHjXdp-5JNAc2$;s+eAN8@hTkpc$pJ_Sr+D>hq@Oz z#I=JP+0IV`>wQL6qc?FC>G)4+7`{5Wre9jfeSy3`Zj#*notFi|0=anEiO5>1lrHN$LSO z9;N4q33|i(s}`iyF`_r!zAWWRS+dK&DUv((>;bV!Z>C}&&p($>z8l7Nd`>4(nunv4 z$R?(p0q>LY++cmD)*0JbEf2Isua<>*%I$hwKQQELrSfusLDHNRFgVtndj`=)T!?ml zKJ}w6kRHH)j01#f(Mv>Dz$Y~h2li<7n_`YHz$r35eaP~^aPFC39LCZHkMfqG)ESXO z@zo1ldSK3FZW{T|_H_k{WcsX{gA zbKHVC(UM1R#N*WlR&qLr{!TE?#YUIQ9Kg*ska56v#1Vo45h^7`xJadCrL0z&O)ff=DgoAj^G1e5=%@?Ws-GQyVrsZK-ELLE! zQ#>?sg!k&WH;uy5=5_N)?ofOk&wrQT7YZVpHWfqLj$$?3opd=Z?Xgb6cdo zmG2*UPS|9b-eytmAWsbDX69F-@WU5(O$VBs}K7+srx>NzBx%GHqxKpbkWovXFCctch@@VjN>Sr$3Xrm4#GT>ga^ znr$v-(Cla(7IcroTXXow<>eQF{ge_}Vl-C2m1~1jeOv=&Hw1@A^!Gf3lGirpjOQNd zKTZqk4qMKd@2Qe3UfCCcqT+#@3Q|uGoBV-j4U?!d$Oahv^; zJIa65g=6cGNNSK5;+SL&P3*e0(rlhC@f?!15?g%-p@91WgDc3x=7NWhj&-4j@1>yma~b604X~wTV!kAG8xIvI>&w|`~0}~3*sc(oR=AA>ru1i z?T7?g$!dl`f%48GkeaR3-9#1Ye)kqX#TW*vicUaU5l(-F+FbiiA!RJ2MX+}%0G(V# zCn)dgl9er$k7>L@^KdwI@rhbYlZH|&;7HzJu2-!aQd@5!6Xh2>I zfU^9G(<^7!vaYW|6R zwTwrR0-}h@u1KXZdj0mA?+j?oUDUOQ)bDST%VzT+p0M|aOmF5`vePQ}B@IF9QE;=J z&8yJ?T!%nG@5~)BukbH?RjQ1q z^RV4kl|5`VPww^8@D$@|8ovguF|{^C@mnY0Hb$yemC}{cZ+WW1g%Yx6;j+A)xVtPFKZF@QC`;-Jm#c4)P3P&kEUMKgmbargZ?u^rh?2W7%knygWJ~@KOq-x2Dlv7rGD8hlm04XR z2`=UH(5*3{{XCFgltZ!aA3WsD$L8O`Y%y%|PwjbBqDfcqLs>3R$%-zq9+t6Ht?JCS zbYek8Zj2Jel@!17c-xAVvT&)ceKbKqELvWN5vVRSV3kAnXrB#Uez4O{?$!Nz1`~`@ z<4y^J{cLGV;4V+I|2BV1=MqUbyVi_H^ZS*kD?yjv4_qy!cY>;yV0g#}sjPVG-Z{Rc zZfuDwt{oIwe_W+6hEhvBiI_#Sw_4x&k5=%Z7dK1DMI;b5&-?vg?(?|s+!f86=GhWn 
z47bHq!YQj-e?w4a;%Uo{!uB?;B|lfLu|i-k9-4fRSL9u9C->)w@)J7cypsSgmP?SW zie=)dRWbfFWqUSl6!`ZJ9}3RD1E_gX{IT&&7oE?x8eRQz(!=fUmKqi*W!IMI^h71) zBF~HF5&A9`wU1)k^4&8P1~ld{s7ixPE{Wu`5s<5eRVuuM_5lyhwV2`{2_ z4(#3yZs2y$*2h6Sn4kAE z-2YcmW$(I5T9wr|lqs&+GYezHmmztfPA9wXtz>&FJ!b@a>m0t6)?vL;wMnwEp~NGz zlT3k7B0J_Ta9L##q$3-;cLgY_kz#QP*N+k|cCHU9+da+UWbry+l(iGIqemOH&DOt* z%%y`=6UcNqDpP|jMi!M;;#+`_B_zlorC0?t*{V{ecwc@-Hwtqp_0Of8$#vXoSfzZb zle54Q&gCz((91=8RyGTNpbfA*J9*DfW9eUo*vL}2K-cNg-L=sny+nO4B!`gH9toGye0V1eR=h4J!7RlnmSerks8#)CW+Wtp2-c;(gywnw$Ma)|1RjHGY|$IO{u9J4T3nd|w_+uT38Glqg=KBpzyE z#V%W!lVn_Z92oD%*tCbhl(=QTh-<`~XBHE>BxlY{Q*m2lE8wO;6P5DW7fIztp(o9E zWMxB0kGIb(Yha10lVy>b+I2Iba&XRvRDaV|L&kJ1gfH(39RZy)IVU)_!92^Kw5SQS zl5;X>xfdi40Cx@*RO*8X*p)vP#m5&#*A*%DfcYUY_qE*@L7j+e;t8*^0rw zZc}iA@vZHSAjNc-wt!=ZkGTauiv6m!m{n>G1@gW;{_udxrVcD;7tO7?PSy6k?BS%C zz2@kOlf^aOCXM-erU$XEkPfoa ztHskx0*GI6rx}S-Ybcz-0<##Wt6h7{6#k|C#62BV*njeRy%c^?lB!NBWnrT_WX0d> z6pZ633Z}rg=A|CYC)T-minL0Rx5psaTC?Kv`yrmB9Ic42c!2rJSILBQ)pT&p@Mb(e z_R_$RZag2sejuiZGM_loQ>FGBzH5<$RWhBabvj}TcHb~)x%lxs7iZ^xfD(} z_^!X*e$svgIGK`9p+fP}-GGZdsEnfU$DP_=Z+-rnID9A;AGvu4U|^|qVG^)295n|- z;W%awDdd_o?+RA3x-X^7rtUaOvfk%M7*)&pUlgsvzy^ICS2ueOQ9P6OEpT{DD&nA# zjMM4e;2nF;D#`;~I`d>ODt0J~GqSZY@{&-=ol6GIYsDmaNpKiAQQ6T!kP2zLdP4&vL8j=d(doBs!KKhh72&=gACa3HDey)xlBE-!- zb4MJ|;GBOY)kZyki=9xGuTva%3=FMISdK7rYaMTC%;oI$X)VG^ zKkTD!)X-}{F_o=igz$y1vQ?d+kva30M`mZOxo@}W{!F&zUAW~Y6?w5Ge-x>Q}K(JV&mjnaHxys+c~PbA1`aMVQyM&wO(Jg zhRJ^ru{Zy&$2mdeeH%8KHJ8;-p=JxWvMnw`^Ffj>A?FqgLFHNsnVc3TT|YvQTR1J#KE zyB|OQu@%P|pt50YKsTFh~?E@NKWfN_-dgKn+XD$|hBrkfpIXj+h zFCg+l#*0l6_POcqtc_H{@gtqq~qG zEajk-R`+ZM^y7+{xUG2KF&M=^aLDFY5_2lNqq3oj{lJg<{4|DcF)QGDa4D64tyIh1 zOvP|_Y$P{W!6<4I3XG*&zbp$0A4-ew#PfWFL6~I?5Odx$85fEKzj9J-`8~-Ao|)t7 zYTvpVs>1YPI`0!0YhQ9I;g6#i9?nXYRlVgG0pCR-w$zMOzX&R3zb-BrcM}`lJP!L* zKC#$$nxh2Tnqxelc;!{rRZ+Qbk4ksGj1=lYPNu~d)mSo8toBsz)I=hts From 693803ae25199b5a3edd87a06402e720765d314f Mon Sep 17 00:00:00 2001 From: cpviolator Date: Tue, 4 Jun 2024 19:36:00 -0700 Subject: [PATCH 02/30] Add 
XGPU dependency, used local fork for now --- CMakeLists.txt | 10 +++--- src/CMakeLists.txt | 33 ++++++++++--------- ..._correlator.cu => dsaX_cuda_correlator.cu} | 5 ++- src/dsaX_testdada.c | 2 +- src/dsaX_wrangle.c | 6 ++-- src/dsaX_xgpu.cu | 12 ++++--- 6 files changed, 37 insertions(+), 31 deletions(-) rename src/{cuda_correlator.cu => dsaX_cuda_correlator.cu} (97%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 66682b6..f3e491c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -108,18 +108,16 @@ if(DSA_XENGINE_DOWNLOAD_PSRDADA) FetchContent_MakeAvailable(PSRDada) endif() +# Get XGPU dependency option(DSA_XENGINE_DOWNLOAD_XGPU "Download and build XGPU" ON) if(DSA_XENGINE_DOWNLOAD_XGPU) FetchContent_Declare( - XGPU - GIT_REPOSITORY https://github.com/GPU-correlators/xGPU.git - GIT_TAG 7e85bd5da619c026e1bfbb64325ed122323b8854 + xGPU + GIT_REPOSITORY https://github.com/cpviolator/xGPU.git + GIT_TAG 13b7fff1eac497236eb9c38e179aed3b532a88f2 ) FetchContent_MakeAvailable(XGPU) endif() - # Add src add_subdirectory(src) - - diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1b0a548..de025f6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,7 +1,11 @@ #enable_language(CUDA) -set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so) include_directories(${PSRDada_SOURCE_DIR}/src) +include_directories(${xGPU_SOURCE_DIR}/src) + +set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so) +set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a) + add_executable(test_write test_write.c) target_link_libraries(test_write ${PSRDada_LIB}) @@ -80,24 +84,21 @@ if(0) endif() # DMH: XGPU dependencies -if(0) - add_executable(dsaX_wrangle dsaX_wrangle.c) - target_link_libraries(dsaX_wrangle ${PSRDada_LIB}) - - add_executable(dsaX_testdada dsaX_testdada.c) - target_link_libraries(dsaX_testdada ${PSRDada_LIB}) - - add_executable(dsaX_bfCorr dsaX_bfCorr.cu) - target_link_libraries(dsaX_bfCorr ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) +add_executable(dsaX_wrangle dsaX_wrangle.c) 
+target_link_libraries(dsaX_wrangle ${PSRDada_LIB} ${XGPU_LIB}) - # DMH: Fix CUBE error - add_executable(dsaX_xgpu dsaX_xgpu.cu) - target_link_libraries(dsaX_xgpu ${PSRDada_LIB}) +add_executable(dsaX_testdada dsaX_testdada.c) +target_link_libraries(dsaX_testdada ${PSRDada_LIB}) - add_executable(cuda_correlator cuda_correlator.cu) - target_link_libraries(cuda_correlator ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) +add_executable(dsaX_bfCorr dsaX_bfCorr.cu) +target_link_libraries(dsaX_bfCorr ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) -endif() +# DMH: Fix CUBE error +add_executable(dsaX_xgpu dsaX_xgpu.cu) +target_link_libraries(dsaX_xgpu ${PSRDada_LIB} ${XGPU_LIB} ${CUDA_nvml_LIBRARY}) + +add_executable(dsaX_cuda_correlator dsaX_cuda_correlator.cu) +target_link_libraries(dsaX_cuda_correlator ${XGPU_LIB} ${CUDA_nvml_LIBRARY} ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) add_executable(dsaX_reorder_raw dsaX_reorder_raw.c) target_link_libraries(dsaX_reorder_raw ${PSRDada_LIB}) diff --git a/src/cuda_correlator.cu b/src/dsaX_cuda_correlator.cu similarity index 97% rename from src/cuda_correlator.cu rename to src/dsaX_cuda_correlator.cu index 9d9e66d..3bebd09 100644 --- a/src/cuda_correlator.cu +++ b/src/dsaX_cuda_correlator.cu @@ -1,6 +1,8 @@ // -*- c++ -*- /* will run xgpu */ /* assumes input block size is appropriate */ +#define THRUST_IGNORE_CUB_VERSION_CHECK + #include #include using std::cout; @@ -222,7 +224,8 @@ int main(int argc, char** argv) { #ifdef RUNTIME_STATS clock_gettime(CLOCK_MONOTONIC, &tic); #endif - xgpu_error = xgpuCudaXengine(&context, array_hd, i==count-1 ? finalSyncOp : syncOp); + //xgpu_error = xgpuCudaXengine(&context, array_hd, i==count-1 ? finalSyncOp : syncOp); + xgpu_error = xgpuCudaXengine(&context, i==count-1 ? 
finalSyncOp : syncOp); #ifdef RUNTIME_STATS clock_gettime(CLOCK_MONOTONIC, &toc); #endif diff --git a/src/dsaX_testdada.c b/src/dsaX_testdada.c index c12d704..bbe7640 100644 --- a/src/dsaX_testdada.c +++ b/src/dsaX_testdada.c @@ -114,7 +114,7 @@ int main (int argc, char *argv[]) { } // DADA stuff - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); dada_hdu_connect (hdu_in); diff --git a/src/dsaX_wrangle.c b/src/dsaX_wrangle.c index 5825ec6..19507d4 100644 --- a/src/dsaX_wrangle.c +++ b/src/dsaX_wrangle.c @@ -217,7 +217,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -228,7 +228,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -277,7 +277,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); uint64_t bytes_read = 0; char * block; uint64_t written, block_id; diff --git a/src/dsaX_xgpu.cu b/src/dsaX_xgpu.cu index a64217b..d065848 100644 --- a/src/dsaX_xgpu.cu +++ b/src/dsaX_xgpu.cu @@ -1,6 +1,8 @@ // -*- c++ -*- /* will run xgpu */ /* assumes input block size is appropriate */ +#define THRUST_IGNORE_CUB_VERSION_CHECK + #include #include using std::cout; @@ -177,7 +179,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, 
"creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -188,7 +190,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } - hdu_out = dada_hdu_create (); + hdu_out = dada_hdu_create (0); dada_hdu_set_key (hdu_out, out_key); if (dada_hdu_connect (hdu_out) < 0) { syslog (LOG_ERR,"could not connect to output buffer"); @@ -283,7 +285,8 @@ int main (int argc, char *argv[]) { cudaMemcpy(d_din, tmp_data, context.array_len*sizeof(char),cudaMemcpyHostToDevice); promoter<<<6291456,32>>>(d_din,d_dout); - xgpu_error = xgpuCudaXengine(&context, (ComplexInput *)d_dout, syncOp); + //xgpu_error = xgpuCudaXengine(&context, (ComplexInput *)d_dout, syncOp); + xgpu_error = xgpuCudaXengine(&context, syncOp); xgpuClearDeviceIntegrationBuffer(&context); } @@ -315,7 +318,8 @@ int main (int argc, char *argv[]) { cudaDeviceSynchronize(); // run xgpu - xgpu_error = xgpuCudaXengine(&context, (ComplexInput *)d_dout, syncOp); + //xgpu_error = xgpuCudaXengine(&context, (ComplexInput *)d_dout, syncOp); + xgpu_error = xgpuCudaXengine(&context, syncOp); if(xgpu_error) { syslog(LOG_ERR, "xGPU error %d\n", xgpu_error); return EXIT_FAILURE; From bb142b1b6f0dc1196bca68d2d0815f4718e2f23c Mon Sep 17 00:00:00 2001 From: cpviolator Date: Wed, 5 Jun 2024 15:46:01 -0700 Subject: [PATCH 03/30] mid cmake upgrade --- CMakeLists.txt | 36 +++++++++++++++++++++++++---- src/CMakeLists.txt | 55 ++++++++++++++++++++++++++++++++++++++++++-- src/dsaX_bfCorr.cu | 40 +++++++++++++++++++++++++++----- src/dsaX_dbnic.c | 4 ++-- utils/gen_packet.py | 12 ---------- utils/packet.out | Bin 4608 -> 4608 bytes 6 files changed, 121 insertions(+), 26 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f3e491c..ae509fb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,8 +55,8 @@ set(DSA_XENGINE_GPU_ARCH_SUFFIX CACHE STRING "set the GPU 
architecture suffix (virtual, real). Leave empty for no suffix.") set_property(CACHE DSA_XENGINE_GPU_ARCH_SUFFIX PROPERTY STRINGS "real" "virtual" " ") #set(CMAKE_CUDA_ARCHITECTURES ${DSA_XENGINE_GPU_ARCH}) -mark_as_advanced(DSA_XENGINE_GPU_ARCH_SUFFIX) -mark_as_advanced(CMAKE_CUDA_ARCHITECTURES) +#mark_as_advanced(DSA_XENGINE_GPU_ARCH_SUFFIX) +#mark_as_advanced(CMAKE_CUDA_ARCHITECTURES) string(TOUPPER ${DSA_XENGINE_TARGET_TYPE} CHECK_TARGET_TYPE) list(FIND VALID_TARGET_TYPES ${CHECK_TARGET_TYPE} TARGET_TYPE_VALID) @@ -103,7 +103,7 @@ if(DSA_XENGINE_DOWNLOAD_PSRDADA) FetchContent_Declare( PSRDada GIT_REPOSITORY git://git.code.sf.net/p/psrdada/code - GIT_TAG 008afa70393ae2df11efba0cc8d0b95cda599c02 + #GIT_TAG 008afa70393ae2df11efba0cc8d0b95cda599c02 ) FetchContent_MakeAvailable(PSRDada) endif() @@ -114,10 +114,38 @@ if(DSA_XENGINE_DOWNLOAD_XGPU) FetchContent_Declare( xGPU GIT_REPOSITORY https://github.com/cpviolator/xGPU.git - GIT_TAG 13b7fff1eac497236eb9c38e179aed3b532a88f2 + #GIT_TAG 13b7fff1eac497236eb9c38e179aed3b532a88f2 ) FetchContent_MakeAvailable(XGPU) endif() +# Get TCC dependency +option(DSA_XENGINE_DOWNLOAD_TCC "Download and build TCC" ON) +if(DSA_XENGINE_DOWNLOAD_TCC) + FetchContent_Declare( + TCC + GIT_REPOSITORY https://git.astron.nl/RD/tensor-core-correlator + #GIT_TAG 11d8a4a504d7073a2a33b81e1e387b12e58a420c + ) + FetchContent_MakeAvailable(TCC) +endif() +add_custom_command( + OUTPUT "file.txt" + WORKING_DIRECTORY ${TCC_SOURCE_DIR} + COMMAND "sed -i 's/libtcc\///g' libtcc/*.h libtcc/*.cc" + ) + + # Add src add_subdirectory(src) + +# Install project cmake targets +include(CMakePackageConfigHelpers) +write_basic_package_version_file( + ${PROJECT_NAME}-config-version.cmake + VERSION ${DSA_XENGINE_VERSION} + COMPATIBILITY AnyNewerVersion + ) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME} + ) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 
de025f6..54467d7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -4,7 +4,7 @@ include_directories(${PSRDada_SOURCE_DIR}/src) include_directories(${xGPU_SOURCE_DIR}/src) set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so) -set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a) +set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.so) add_executable(test_write test_write.c) @@ -50,6 +50,9 @@ target_link_libraries(dsaX_fluff ${PSRDada_LIB}) add_executable(dsaX_nicdb dsaX_nicdb.c) target_link_libraries(dsaX_nicdb ${PSRDada_LIB}) +add_executable(dsaX_dbnic dsaX_dbnic.c) +target_link_libraries(dsaX_dbnic ${PSRDada_LIB}) + add_executable(dsaX_capture dsaX_capture.c) target_link_libraries(dsaX_capture ${PSRDada_LIB}) @@ -85,7 +88,7 @@ endif() # DMH: XGPU dependencies add_executable(dsaX_wrangle dsaX_wrangle.c) -target_link_libraries(dsaX_wrangle ${PSRDada_LIB} ${XGPU_LIB}) +target_link_libraries(dsaX_wrangle ${XGPU_LIB} ${PSRDada_LIB} ) add_executable(dsaX_testdada dsaX_testdada.c) target_link_libraries(dsaX_testdada ${PSRDada_LIB}) @@ -118,3 +121,51 @@ target_link_libraries(dsaX_beamformer_passon ${PSRDada_LIB}) add_executable(dsaX_beamformer_offline dsaX_beamformer_offline.cu) target_link_libraries(dsaX_beamformer_offline ${PSRDada_LIB}) +# install step for header files +set(DSA_XENGINE_HEADERS + # cmake-format: sortable + dsaX_capture.h + dsaX_capture_manythread.h + dsaX_capture_pcap.h + dsaX_def.h + ) +install(FILES ${DSA_XENGINE_HEADERS} DESTINATION include) + +# install step for executables +install(TARGETS + # cmake-format: sortable + dsaX_beamformer + dsaX_beamformer_passon + dsaX_xgpu + dsaX_reorder_raw + dsaX_fake + dsaX_capture + dsaX_capture_thread + dsaX_capture_manythread + dsaX_dbnic + dsaX_nicdb + dsaX_split + dsaX_wrangle + fil2dada + dumpfil + dsaX_simplesplit + dsaX_store + dsaX_trigger + dsaX_filTrigger + dsaX_beamformer_offline + dsaX_splitup + cuda_correlator + dsaX_copydb + dsaX_bfCorr + dsaX_merge + + #fitsio dep + # dsaX_writevis + + #sigproc dep + 
# dsaX_writeFil + # dsaX_splice + # gpu_flagger + RUNTIME DESTINATION + bin + ) diff --git a/src/dsaX_bfCorr.cu b/src/dsaX_bfCorr.cu index 01c45e1..25b9262 100644 --- a/src/dsaX_bfCorr.cu +++ b/src/dsaX_bfCorr.cu @@ -45,7 +45,7 @@ using std::endl; #define sep 1.0 // arcmin /* global variables */ -int DEBUG = 0; +int DEBUG = 1; // define structure that carries around device memory typedef struct dmem { @@ -264,8 +264,34 @@ __global__ void transpose_matrix_float(half * idata, half * odata) { } +// arbitrary transpose kernel +// assume breakdown into tiles of 32x32, and run with 32x8 threads per block +// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32) +// here, width is the dimension of the fastest index +template __global__ void transpose_matrix_template(in_prec * idata, out_prec * odata) { -// function to copy amd reorder d_input to d_r and d_i + __shared__ in_prec tile[32][33]; + + int x = blockIdx.x * 32 + threadIdx.x; + int y = blockIdx.y * 32 + threadIdx.y; + int width = gridDim.x * 32; + + for (int j = 0; j < 32; j += 8) + tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; + + __syncthreads(); + + x = blockIdx.y * 32 + threadIdx.x; // transpose block offset + y = blockIdx.x * 32 + threadIdx.y; + width = gridDim.y * 32; + + for (int j = 0; j < 32; j += 8) + odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; + +} + + +// function to copy and reorder d_input to d_r and d_i // input is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] // output is [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS] // starts by running transpose on [NPACKETS_PER_BLOCK * NANTS, NCHAN_PER_PACKET * 2 * 2] matrix in doubleComplex form. 
@@ -1181,7 +1207,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %d %d\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); if (bf==0) syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,NBASE*NCHAN_PER_PACKET*2*2*4); else @@ -1209,6 +1235,7 @@ int main (int argc, char *argv[]) { // do stuff //begin = clock(); + // loop if (bf==0) { if (DEBUG) syslog(LOG_INFO,"run correlator"); dcorrelator(&d); @@ -1226,7 +1253,8 @@ int main (int argc, char *argv[]) { cout << "spent time " << d.cp << " " << d.prep << " " << d.cubl << " " << d.outp << " s" << endl; // write to output - + + // write to host written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out); if (written < block_out) { @@ -1237,13 +1265,13 @@ int main (int argc, char *argv[]) { if (DEBUG) syslog(LOG_INFO, "written block %d",blocks); blocks++; - + // loop end // finish up if (bytes_read < block_size) observation_complete = 1; - + ipcio_close_block_read (hdu_in->data_block, bytes_read); } diff --git a/src/dsaX_dbnic.c b/src/dsaX_dbnic.c index 40407ee..83e3e4a 100644 --- a/src/dsaX_dbnic.c +++ b/src/dsaX_dbnic.c @@ -261,7 +261,7 @@ int main (int argc, char *argv[]) { syslog (LOG_INFO, "creating in and out hdus"); - hdu_in = dada_hdu_create (); + hdu_in = dada_hdu_create (0); dada_hdu_set_key (hdu_in, in_key); if (dada_hdu_connect (hdu_in) < 0) { syslog (LOG_ERR,"could not connect to dada buffer in"); @@ -294,7 +294,7 @@ int main (int argc, char *argv[]) { // get block sizes and allocate memory uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - syslog(LOG_INFO, "main: have input and output block sizes 
%llu",block_size); + syslog(LOG_INFO, "main: have input and output block sizes %lu",block_size); uint64_t bytes_read = 0; char *block; uint64_t written, block_id; diff --git a/utils/gen_packet.py b/utils/gen_packet.py index 8803832..2ae1bee 100644 --- a/utils/gen_packet.py +++ b/utils/gen_packet.py @@ -214,15 +214,3 @@ def histo_test(data): #plot_spectrum(out_str,pol=1,ant=1) - - - - - - - - - - - - diff --git a/utils/packet.out b/utils/packet.out index de3b9a47bdebd485332ff433d1644d6d6ff77d33..34e6909992a277b32cd475dc5c7f9f04da910749 100644 GIT binary patch literal 4608 zcmW+)&v)E5k}go*Q$+me%x&EO*`C{4r0iT1wY}%v&3pTodiT8ZA~|PEWarH(RuiCd z8#hp3=VlSLe4S(FWz+KX5=k0#xq zA)|sjI#5k2{iJ=yq++YCR0dKIVIz(?m~kZogI6z&!a&zbZ_VX!pkwSO$i?_`U7D?Q zO6%YMTb-6B32}ihoNJcxl&B%ce@x}Pn3oTga+5A|*b1zJnj55|on7qw&Rlh;Q+HJR zojoghQ;sU#$lqxTlvKGwe5ocJ$&$6|KKn7C2Ev|MHx<4V3Hm{h)y?Kp&FueZL=#d6AId+dFrQ8 zqqFDwx?}{*DktmHlwLSX2+;Uv#8DNQJSg+ppHWyp5~0FbpJ-B|PZ`6Sq6S4!G;e&; z%I6@`3L@Z9Ku)5xd#t9$8d7nDfc>{`Wn;lZpMP{kOx+Rg^2#-?B9ULP@JaQrUixR@ zm*7p~f58eQ!P$b$i0<3AVzae6Lwnb(@msxf8IRD2h1yf!M{7jvbKxdZc<)l`BuW*g zbI!YGo$jxRmdi|z4E@nK6&@2}1PD#HT>u$bM7p?pWeWdBk3s-hFizHtv#;2M29B3^ zdLumMp-qjmsfeB7H=EQ;{MxzpUq|rzWS#JQxu9!whL@BY89ELG(&uVN>)UU?_sXH~C6B5@Wmki{`lB=a#w}MMYrezVvywXohr0YSlqqEe^+u@ZYMwlr zrLa>0TXH_5Q-bmQR1Mw`(;pb+o@^BrxaAsMr+ej*0IT0I=E3kv5}`Q4RfCIm^rR=F zm9;x7Rgz!_9%$%%hpeF?dd6sF6@?*l4**itB0%sN3O<7lY&tzQ}WBRqmN{YVYqKsJysaXubTB_Q4v7m^U&+8pNV$ zP^6s6Nl|Mpx5tZG4hrPa9(W9~;VFhqes6H9;M!AK7NRNFgGEzbBN zJqa<(e#qauBg>pf*zqtKVRzIjn_N-21+KqiP7=5MO3x#{=A zBFmyHz$CPaN;S66>70Al$)AoU)R;#JmHsui($&nILH$?buqVhIe=FurX=I93-)gv zmZty*?!M^l5q8c2@uf_>mGQGa9<+GLcag^dPi~uf)xQtU5t#b-TsEj-5a;aw3F^9Y zN*V!;-b*N?RGD%_BjM;+*LfzAR1e`2Gl@ynu0h2p^5w;m7pXD44loY(_cxOz|RU^bv=!=1iR1G`V6vm z2~t@iZKr%yms;WZx^&)@yAa6sl40)fL({ri$KqaoQb%OTqPQ3qu;--h-!g-wwM5nK z2dr{Y`?|4^$eFEFzT55DbdsxlFb+Fx2RQ6;d6y0uTe;22_ZZBo>dLX?PYy?8wNK}u z@8z|Zte$RmzGIRn+A*W5)rywggL{pvumhVI5spX^x`C$@_>eRAB?clUc)bv}^Z0qd zXtZp*CKg?gU*~&@PNRXozbi2CFjT+dhn_hDhGF=d{7$pHZPXAb 
zZ0pNh>m{IaO~O1$o_a+Ue-w6eIWtL8UJF~S_AUV6IfhYi$M3bPWH7N8Exfr?pNM}~ z|HJFB%Ds3s>$tCZ4Ytx|krLBMQ2O=WxZ_kRK|A4u{*8QHa23ftgYFBv#r(8nIL++s zdo5gT$4_ryKhhjce)8-Zfsf5H-z^Nr9~rMM=TWnfq)R;E1e?RH=u})hm?5$KTaV^h z3+Ctz-!%bvHeB_Z|If*S#SRuc6k3@r5xC8peTmD2N_eYhrWVwRG{ z=06&j)ui)8%mjG>|4QP)QPs9(cvwQtpPbk~eLG5Fl=pJ+ABANjW?_DuWWa_+8(8+g z=re@+fXkS#@@r)$c8c(ybo)4q^X#1l`SkiP2H*HJa_w%rE4vrTPE zP4X()N|>5|#~QdcL$u`YuhUvxkvJx49WRmzQ`om9;e zq#w5%v9rJmUT1#*p6PR8z%kc# z!$CuJTr!>%n(KXWo;!hYD=!rsZzr;1eweo#D!wR4%p6?^u$mx48w=sACQQxXfW^cwa>~E#w7$H=ORS0+sIwL!cznp4U z;%2mRCNT-XU<3|f@bXAQZ#fveu?@D6O%& zyk4qS%+`mMg4J&I^(Zu3Lw2AKTu7%w%#Nv%DM$*1-n}3716if`Uj*G((9nFEf-ZEp zDPo|V#tT76e-x7S1NGWn8@}WFfn8$D^swxC+^VX_V(HzhlG>YJ7=y?%Ahh@KEN8aU zP2@D@FVrfzrmWa*<=ThDE&$SBsU6v!{nMDS*1;0x8<7&4M?kiW4(|1Q(WpL5D!?WC zsW7Xa^LP#I27a>2n21dIARbD1PRu3=qjYtL+_9zEuy$qaext-6DQS6JP Q(8{YQee{VQIaCb)58tM{bpQYW literal 4608 zcmWkyyPMoNaxbvu#5=^JXB~M6kgZfr&TOt3uP?Pv=l{^(=iJs^Zr@10a96|by-?Ex zT6@u|fdYFSZ4>319K@sQ_bLJm7ny4)ql5q}+Nd(3EcGN3C6vm^lnbJ9<0H6gDSZM< zsIhDmWRZd14u!vsy^l&SS;?eYojN1S+L)Fy@bys#@$q?w4&11cS4K!bJ+uu$jU|m(*0U&jG*xjYrGd(I2{ZAaEj~0 zfmFK5d@-`MUuICk%e5r6K8MvXPDp0TEWRpWI{Wv zv87<791j&AkhE@sR*OCmeO6;3)fHGC2WSycEj}NdBaM@z_Fnv!y{faXp@ptn!z4fq z)d|PH0SYGqh<0m@@V@>c{l})%W_cv=x{7TEC!T~Dr5Hw9rlip4Hl!lxo?6w<-_)rA+5;!vNJmmUyW%D9jMa+f^^oW)?r+mIsd3jp6#jh)M|VFd!cH^;O)aM z&Z<7B3UCb6p0Bb~)qxh=0fxL6igqy$V-Z&7W4k4B>`p0TUdV$canLln;IGKU$~g6( zc9Ge`snSXtC6t+u@xg}~*KO&t!xSG~RHtWYQs73{m<~||U+mKum7;it`?=P|!6}Eo z(texP`83mX)wnk4iTkF-X6?%bty}80x#T7tkLz?@qzN>y^Naz;R5!mT2e!72*?di6 zubidSsl}~(LQj5HsOQ4>e`t^?Rhu+VTpR|qc6}F`SGb#-M6*MF9y1C{`YyeCli}G9 z4UUL5hf9KnRL$Q{KThb@@;dONC(IIan{%wFm0uEO0*Y~`e&?5ORZuE%Q-l#%g^!ss zAPV{G66^H39u(5hVIIPTS_6j03SGVE5^nUX7@snWM`r3(B{ zb3na&J_ofeaUjj5bc;8s%BN~yPvn$rNob4SUz%N2AI9OHiJ7`Xc5c@Cgwm~BW}Yai z%O$NzxIF6g1Q8=_gx1%6sY6EFrk0<2qjV!YQ@{5^E~5Zo0Jnc*!sV{b!JWS#)YBu< zCUu0Q_2&Sr#|%<#@MSu2nW6uH%x8a6 zxI04B*M!@rw4^HJ+LoEPyNjaB)yHl?H?o3_Yu=W(a5uKQ1f%yXf=WES$sMj>6eQ?} 
z-I@TdD~%LDYuGFh;aD1F|9L%%_8KXTHdcD0+ZfJw+Dn;y>p&D>QqjiSS@M}nLrIS? z?8a|da_y~(_++q5a(+?>vZ`+_Gb(P)Y$%L5EyJv;dl6oX1^IvnUay6S-}NOAnF!^( z;Z(HD3}Intakjn{A@_P!xn~oG6PE#EF&$p-9jcB&LKtPaKlzBQ#?G&k=+3e#Iv)QujCw>T zLBGEivDb{%^u4I7;Sv9#;wK8ktQjo7be|IQ#$<6sd6T_ceMnxz{gbuqI0W63VQhGR zT<0WQ$k#8dsZYCv6xrR_eY2V~1$6+nXnQNoC&RX}(p{YWyk(Ni?_B555*+_P3>xL~ zWTVQrm$fL*eQGAuH2b4%;5v0pjZ!~CR!65N^3P2-Mau@N^E&|ws=Q7{Ib6LmxYS~@ zg#|w=S1aEvLM4?%E5h?#J-~pP37P~|q@KS`cxJ4}#=&y5JPc9nbG#>H+zAyLp^Y0!*k?F!=3^#6K3x9Y93j%jhe>39NomIjyGQPQS{5s z4)gTE(s&i(+~R0UCA{$4ujTe&blWmZ-!q(M;Idd9SdtnG_W(%~R$2h^AB!% zqrNLtM ze%wl3e`%BSHjXdp-5JNAc2$;s+eAN8@hTkpc$pJ_Sr+D>hq@Oz z#I=JP+0IV`>wQL6qc?FC>G)4+7`{5Wre9jfeSy3`Zj#*notFi|0=anEiO5>1lrHN$LSO z9;N4q33|i(s}`iyF`_r!zAWWRS+dK&DUv((>;bV!Z>C}&&p($>z8l7Nd`>4(nunv4 z$R?(p0q>LY++cmD)*0JbEf2Isua<>*%I$hwKQQELrSfusLDHNRFgVtndj`=)T!?ml zKJ}w6kRHH)j01#f(Mv>Dz$Y~h2li<7n_`YHz$r35eaP~^aPFC39LCZHkMfqG)ESXO z@zo1ldSK3FZW{T|_H_k{WcsX{gA zbKHVC(UM1R#N*WlR&qLr{!TE?#YUIQ9Kg*ska56v#1Vo45h^7`xJadCrL0z&O)ff=DgoAj^G1e5=%@?Ws-GQyVrsZK-ELLE! zQ#>?sg!k&WH;uy5=5_N)?ofOk&wrQT7YZVpHWfqLj$$?3opd=Z?Xgb6cdo zmG2*UPS|9b-eytmAWsbDX69F-@WU5(O$VBs}K7+srx>NzBx%GHqxKpbkWovXFCctch@@VjN>Sr$3Xrm4#GT>ga^ znr$v-(Cla(7IcroTXXow<>eQF{ge_}Vl-C2m1~1jeOv=&Hw1@A^!Gf3lGirpjOQNd zKTZqk4qMKd@2Qe3UfCCcqT+#@3Q|uGoBV-j4U?!d$Oahv^; zJIa65g=6cGNNSK5;+SL&P3*e0(rlhC@f?!15?g%-p@91WgDc3x=7NWhj&-4j Date: Fri, 14 Jun 2024 16:00:13 -0700 Subject: [PATCH 04/30] WAR for pthread detection --- CMakeLists.txt | 2 ++ src/CMakeLists.txt | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ae509fb..08a326c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,6 +17,8 @@ set(CMAKE_CXX_EXTENSIONS ON) # Define the project project("DSA_XENGINE" VERSION 1.0.0 LANGUAGES CXX CUDA C) +set(CMAKE_C_FLAGS "-pthread") +set(CMAKE_CXX_FLAGS "-pthread") # DSA_XENGINE may be built to run using CUDA. 
Future version may be # written for HIP or SYCL, which we call the diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 54467d7..fbd3f3a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -88,7 +88,7 @@ endif() # DMH: XGPU dependencies add_executable(dsaX_wrangle dsaX_wrangle.c) -target_link_libraries(dsaX_wrangle ${XGPU_LIB} ${PSRDada_LIB} ) +target_link_libraries(dsaX_wrangle ${XGPU_LIB} ${PSRDada_LIB} ${CUDA_nvml_LIBRARY}) add_executable(dsaX_testdada dsaX_testdada.c) target_link_libraries(dsaX_testdada ${PSRDada_LIB}) From 4082f9b0fe443698d4e8adcf613000e1f94acc17 Mon Sep 17 00:00:00 2001 From: cpviolator Date: Sat, 15 Jun 2024 22:35:39 -0700 Subject: [PATCH 05/30] Create CMakeLists for automatic linking and building of dependencies --- CMakeLists.txt | 73 +++++++++++++++----- src/CMakeLists.txt | 169 ++++++++------------------------------------- 2 files changed, 82 insertions(+), 160 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 08a326c..d4328d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,9 +17,15 @@ set(CMAKE_CXX_EXTENSIONS ON) # Define the project project("DSA_XENGINE" VERSION 1.0.0 LANGUAGES CXX CUDA C) + +# For GCC 8 and lower, set -pthread flag manually set(CMAKE_C_FLAGS "-pthread") set(CMAKE_CXX_FLAGS "-pthread") +# add a directory for cmake modules +list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") +include(cmake/CPM.cmake) + # DSA_XENGINE may be built to run using CUDA. Future version may be # written for HIP or SYCL, which we call the # Target type. By default, the target is CUDA. 
@@ -97,8 +103,46 @@ if(GIT_FOUND) endif() endif(GIT_FOUND) -# EXTERNALS +# Use ExternalProject_Add for libtcc (borks with FetchContent) +# Use ExternalProject_Add for CUTLASS (long build time, version 2.11.0 for sm_8x arch) +include(ExternalProject) + +# Get TCC dependency +option(DSA_XENGINE_USE_TCC "Use TensorCoreCorrelators for correlators" ON) +if(DSA_XENGINE_USE_TCC) + option(DSA_XENGINE_DOWNLOAD_TCC "Download, build, link (and install) TCC" OFF) + if(DSA_XENGINE_DOWNLOAD_TCC) + ExternalProject_Add(TCC + GIT_REPOSITORY https://git.astron.nl/RD/tensor-core-correlator + #GIT_TAG 11d8a4a504d7073a2a33b81e1e387b12e58a420c + ) + else() + find_package(libtcc REQUIRED) + endif() +endif() + +# Get CUTLASS dependency +option(DSA_XENGINE_USE_CUTLASS "Use CUTLASS for GEMMs" ON) +if(DSA_XENGINE_USE_CUTLASS) + option(DSA_XENGINE_DOWNLOAD_CUTLASS "Download, build (only the required kernels) link (and install) CUTLASS" OFF) + if(DSA_XENGINE_DOWNLOAD_CUTLASS) + # Custom CUTLASS build + ExternalProject_Add(NvidiaCutlass + GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git + GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc + CMAKE_ARGS + "-DCUTLASS_NVCC_ARCHS_ENABLED=89" + "-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex" + "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}" + ) + else() + find_package(NvidiaCutlass REQUIRED) + endif() +endif() + +# Use FetchContent for lightweight dependencies include(FetchContent) + # Get psrdada dependency option(DSA_XENGINE_DOWNLOAD_PSRDADA "Download and build PSRDada" ON) if(DSA_XENGINE_DOWNLOAD_PSRDADA) @@ -108,6 +152,8 @@ if(DSA_XENGINE_DOWNLOAD_PSRDADA) #GIT_TAG 008afa70393ae2df11efba0cc8d0b95cda599c02 ) FetchContent_MakeAvailable(PSRDada) +else() + find_package(psrdada REQUIRED) endif() # Get XGPU dependency @@ -119,27 +165,16 @@ if(DSA_XENGINE_DOWNLOAD_XGPU) #GIT_TAG 13b7fff1eac497236eb9c38e179aed3b532a88f2 ) FetchContent_MakeAvailable(XGPU) +else() + find_package(xGPU REQUIRED) endif() -# Get TCC dependency 
-option(DSA_XENGINE_DOWNLOAD_TCC "Download and build TCC" ON) -if(DSA_XENGINE_DOWNLOAD_TCC) - FetchContent_Declare( - TCC - GIT_REPOSITORY https://git.astron.nl/RD/tensor-core-correlator - #GIT_TAG 11d8a4a504d7073a2a33b81e1e387b12e58a420c - ) - FetchContent_MakeAvailable(TCC) -endif() -add_custom_command( - OUTPUT "file.txt" - WORKING_DIRECTORY ${TCC_SOURCE_DIR} - COMMAND "sed -i 's/libtcc\///g' libtcc/*.h libtcc/*.cc" - ) - - -# Add src +# Add src, legacy add_subdirectory(src) +option(DSA_XENGINE_BUILD_LEGACY "Build legacy code (will not install if built)" OFF) +if(DSA_XENGINE_BUILD_LEGACY) + add_subdirectory(legacy) +endif() # Install project cmake targets include(CMakePackageConfigHelpers) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fbd3f3a..748f00b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,171 +1,58 @@ -#enable_language(CUDA) +enable_language(CUDA) +include_directories(..//include) include_directories(${PSRDada_SOURCE_DIR}/src) include_directories(${xGPU_SOURCE_DIR}/src) +include_directories(${NvidiaCutlass_DIR}/../../../include) +include_directories(${NvidiaCutlass_DIR}/../../../include/cutlass/util) set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so) -set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.so) +set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a) +set(NvidiaCutlass_LIB ${NvidiaCutlass_DIR}/../../../lib64/libcutlass.so) +# Some simple CUTLASS examples to test linking/benching +#------------------------------------------------------ +add_executable(planar_complex planar_complex.cu) +target_link_libraries(planar_complex ${NvidiaCutlass_LIB}) -add_executable(test_write test_write.c) -target_link_libraries(test_write ${PSRDada_LIB}) +add_executable(10_planar_complex 10_planar_complex.cu) +target_link_libraries(10_planar_complex ${NvidiaCutlass_LIB}) -add_executable(test_read test_read.c) -target_link_libraries(test_read ${PSRDada_LIB}) +add_executable(11_planar_complex_array 11_planar_complex_array.cu) 
+target_link_libraries(11_planar_complex_array ${NvidiaCutlass_LIB}) +#------------------------------------------------------ -add_executable(dsaX_trigger dsaX_trigger.c) -target_link_libraries(dsaX_trigger ${PSRDada_LIB}) - -add_executable(dsaX_filTrigger dsaX_filTrigger.c) -target_link_libraries(dsaX_filTrigger ${PSRDada_LIB}) - -# DMH: Has a 'sigproc' dependency, low priority -if(0) - add_executable(splice_offline_beams splice_offline_beams.c) - target_link_libraries(splice_offline_beams ${PSRDada_LIB}) - - add_executable(dsaX_writeFil dsaX_writeFil.c) - target_link_libraries(dsaX_writeFil ${PSRDada_LIB}) - - add_executable(dsaX_splice dsaX_splice.c) - target_link_libraries(dsaX_splice ${PSRDada_LIB}) - - add_executable(gpu_flagger gpu_flagger.cu) - target_link_libraries(gpu_flagger ${PSRDada_LIB}) -endif() - -add_executable(dsaX_store dsaX_store.c) -target_link_libraries(dsaX_store ${PSRDada_LIB}) - -add_executable(dsaX_fluff dsaX_fluff.c) -target_link_libraries(dsaX_fluff ${PSRDada_LIB}) - -# DMH: intrinsics compilation error -#add_executable(dsaX_reorder dsaX_reorder.c) -#target_link_libraries(dsaX_reorder ${PSRDada_LIB}) - -# DMH: /scratch/CPviolator/work/DSA110/dsa110-xengine/src/dsaX_nicdb.c: In function ‘process’: -#/scratch/CPviolator/work/DSA110/dsa110-xengine/src/dsaX_nicdb.c:145:65: warning: integer overflow in expression of type ‘int’ results in ‘-1073741824’ [-Woverflow] -# 145 | uint64_t shifty = (bdepth-1)*NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NCHAN_FIL; -add_executable(dsaX_nicdb dsaX_nicdb.c) -target_link_libraries(dsaX_nicdb ${PSRDada_LIB}) - -add_executable(dsaX_dbnic dsaX_dbnic.c) -target_link_libraries(dsaX_dbnic ${PSRDada_LIB}) - -add_executable(dsaX_capture dsaX_capture.c) -target_link_libraries(dsaX_capture ${PSRDada_LIB}) - -add_executable(dsaX_capture_thread dsaX_capture_thread.c) -target_link_libraries(dsaX_capture_thread ${PSRDada_LIB}) - -add_executable(dsaX_capture_manythread dsaX_capture_manythread.c) 
-target_link_libraries(dsaX_capture_manythread ${PSRDada_LIB}) - -add_executable(dsaX_split dsaX_split.c) -target_link_libraries(dsaX_split ${PSRDada_LIB} -lm) - -add_executable(dsaX_merge dsaX_merge.c) -target_link_libraries(dsaX_merge ${PSRDada_LIB}) - -add_executable(dsaX_simplesplit dsaX_simplesplit.c) -target_link_libraries(dsaX_simplesplit ${PSRDada_LIB}) - -add_executable(dsaX_fake dsaX_fake.c) -target_link_libraries(dsaX_fake ${PSRDada_LIB}) - -add_executable(dsaX_splitup dsaX_splitup.c) -target_link_libraries(dsaX_splitup ${PSRDada_LIB}) - -add_executable(dsaX_copydb dsaX_copydb.c) -target_link_libraries(dsaX_copydb ${PSRDada_LIB}) - -# DMH: fitsio dependency -if(0) - add_executable(dsaX_writevis dsaX_writevis.c) - target_link_libraries(dsaX_writevis ${PSRDada_LIB}) -endif() - -# DMH: XGPU dependencies -add_executable(dsaX_wrangle dsaX_wrangle.c) -target_link_libraries(dsaX_wrangle ${XGPU_LIB} ${PSRDada_LIB} ${CUDA_nvml_LIBRARY}) - -add_executable(dsaX_testdada dsaX_testdada.c) -target_link_libraries(dsaX_testdada ${PSRDada_LIB}) +# DSA Fast Time Domain CUTLASS interface +#--------------------------------------- +add_executable(dsaX_cutlass_interface dsaX_cutlass_interface.cu) +target_link_libraries(dsaX_cutlass_interface ${NvidiaCutlass_LIB}) +#--------------------------------------- +# DSA Fast Time Domain +#--------------------- add_executable(dsaX_bfCorr dsaX_bfCorr.cu) target_link_libraries(dsaX_bfCorr ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) - -# DMH: Fix CUBE error -add_executable(dsaX_xgpu dsaX_xgpu.cu) -target_link_libraries(dsaX_xgpu ${PSRDada_LIB} ${XGPU_LIB} ${CUDA_nvml_LIBRARY}) - -add_executable(dsaX_cuda_correlator dsaX_cuda_correlator.cu) -target_link_libraries(dsaX_cuda_correlator ${XGPU_LIB} ${CUDA_nvml_LIBRARY} ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) - -add_executable(dsaX_reorder_raw dsaX_reorder_raw.c) -target_link_libraries(dsaX_reorder_raw ${PSRDada_LIB}) - -add_executable(fil2dada fil2dada.c) -target_link_libraries(fil2dada 
${PSRDada_LIB}) - -add_executable(dumpfil dumpfil.c) -target_link_libraries(dumpfil ${PSRDada_LIB}) - -add_executable(dsaX_beamformer dsaX_beamformer.cu) -target_link_libraries(dsaX_beamformer ${PSRDada_LIB}) - -add_executable(dsaX_beamformer_passon dsaX_beamformer_passon.cu) -target_link_libraries(dsaX_beamformer_passon ${PSRDada_LIB}) - -add_executable(dsaX_beamformer_offline dsaX_beamformer_offline.cu) -target_link_libraries(dsaX_beamformer_offline ${PSRDada_LIB}) +#--------------------- # install step for header files +#------------------------------ set(DSA_XENGINE_HEADERS # cmake-format: sortable dsaX_capture.h dsaX_capture_manythread.h dsaX_capture_pcap.h dsaX_def.h + dsaX_cutlass_interface.h ) install(FILES ${DSA_XENGINE_HEADERS} DESTINATION include) +#------------------------------ # install step for executables +#----------------------------- install(TARGETS # cmake-format: sortable - dsaX_beamformer - dsaX_beamformer_passon - dsaX_xgpu - dsaX_reorder_raw - dsaX_fake - dsaX_capture - dsaX_capture_thread - dsaX_capture_manythread - dsaX_dbnic - dsaX_nicdb - dsaX_split - dsaX_wrangle - fil2dada - dumpfil - dsaX_simplesplit - dsaX_store - dsaX_trigger - dsaX_filTrigger - dsaX_beamformer_offline - dsaX_splitup - cuda_correlator - dsaX_copydb dsaX_bfCorr - dsaX_merge - - #fitsio dep - # dsaX_writevis - - #sigproc dep - # dsaX_writeFil - # dsaX_splice - # gpu_flagger RUNTIME DESTINATION bin ) +#----------------------------- From 3ce38717c8f7c99db5c245d67f668c06e929c4cc Mon Sep 17 00:00:00 2001 From: cpviolator Date: Sat, 15 Jun 2024 22:36:57 -0700 Subject: [PATCH 06/30] Move headers to include directory --- include/dsaX_cutlass_interface.h | 172 +++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 include/dsaX_cutlass_interface.h diff --git a/include/dsaX_cutlass_interface.h b/include/dsaX_cutlass_interface.h new file mode 100644 index 0000000..5aa753e --- /dev/null +++ b/include/dsaX_cutlass_interface.h @@ -0,0 +1,172 @@ 
+#pragma once + +#include <iostream> +#include <sstream> +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor_planar_complex.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/reference/device/gemm_planar_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/library/handle.h" + +using namespace cutlass; +using namespace gemm; +using namespace library; +using namespace layout; +using namespace reference; +using namespace device; + +// Result structure +struct Result { + + double runtime_ms; + double gflops; + Status status; + cudaError_t error; + bool passed; + + Result(double runtime_ms = 0, double gflops = 0, Status status = Status::kSuccess, cudaError_t error = cudaSuccess): + runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { } +}; + +// Command line options parsing (testing) +struct Options { + + bool help; + GemmCoord problem_size; + int batch_count; + complex<float> alpha; + complex<float> beta; + bool reference_check; + int iterations; + + Options(): + help(false), + problem_size({1024, 1024, 1024}), + batch_count(1), + reference_check(false), + iterations(20), + alpha(1), + beta() { } + + // Parses the command line + void parse(int argc, char const **args) { + + CommandLine cmd(argc, args); + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + cmd.get_cmd_line_argument("m", problem_size.m()); + cmd.get_cmd_line_argument("n", problem_size.n()); + cmd.get_cmd_line_argument("k", problem_size.k()); + cmd.get_cmd_line_argument("batch", batch_count); + + cmd.get_cmd_line_argument("alpha", alpha.real()); + cmd.get_cmd_line_argument("alpha_i", alpha.imag()); + cmd.get_cmd_line_argument("beta", beta.real()); + cmd.get_cmd_line_argument("beta_i", beta.imag()); + + 
cmd.get_cmd_line_argument("iterations", iterations); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "dsaX_cutlass_interface\n\n" + << " This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --m= GEMM M dimension\n" + << " --n= GEMM N dimension\n" + << " --k= GEMM K dimension\n" + << " --batch= Number of GEMM operations executed in one batch\n" + << " --alpha= Epilogue scalar alpha (real part)\n" + << " --alpha_i= Epilogue scalar alpha (imaginary part)\n" + << " --beta= Epilogue scalar beta (real part)\n\n" + << " --beta_i= Epilogue scalar beta (imaginary part)\n\n" + << " --iterations= Number of profiling iterations to perform.\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of real-valued multiply-adds + int64_t fmas = problem_size.product() * batch_count * 4; + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +/// Performance test environment for planar complex +class DSA_FTD_ComplexGEMM_CUTLASS { + + // Half-precision input and output + using Element = half_t; + + // Configurations for layouts and internal computation + using LayoutA = ColumnMajor; + using LayoutB = ColumnMajor; + using LayoutC = ColumnMajor; + using ElementCompute = float; + using ElementAccumulator = float; + + Handle handle; + + GemmCoord problem_size; + int batch_count; + DeviceAllocation tensor_A; + DeviceAllocation tensor_B; + DeviceAllocation tensor_C; + DeviceAllocation tensor_D; + DeviceAllocation tensor_D_ref; + + DeviceAllocation ptr_A_real; + DeviceAllocation ptr_A_imag; + DeviceAllocation ptr_B_real; + DeviceAllocation ptr_B_imag; + DeviceAllocation ptr_C_real; + DeviceAllocation ptr_C_imag; + DeviceAllocation ptr_D_real; + DeviceAllocation ptr_D_imag; + + Element *ptr_A; + 
Element *ptr_B; + Element *ptr_C; + Element *ptr_D; + + int64_t batch_stride_A; + int64_t batch_stride_B; + int64_t batch_stride_C; + int64_t batch_stride_D; + + typename LayoutA::Stride::Index lda; + typename LayoutB::Stride::Index ldb; + typename LayoutC::Stride::Index ldc; + typename LayoutC::Stride::Index ldd; + + int64_t imag_stride_A; + int64_t imag_stride_B; + int64_t imag_stride_C; + int64_t imag_stride_D; + +public: + // Constructors + DSA_FTD_ComplexGEMM_CUTLASS(Options const &options); + DSA_FTD_ComplexGEMM_CUTLASS(); + + // Methods + void initialize(); + Result run(Options const &options); + + bool testing; +}; + From 8a50bd400aff1201fe8fdddcc765505b44dd8142 Mon Sep 17 00:00:00 2001 From: cpviolator Date: Sat, 15 Jun 2024 22:37:49 -0700 Subject: [PATCH 07/30] Move some code to a legacy folder, exempt from installation --- legacy/10_planar_complex.cu | 567 ++++++++++ legacy/11_planar_complex_array.cu | 628 +++++++++++ legacy/11_planar_complex_array.cu~ | 628 +++++++++++ legacy/CMakeLists.txt | 121 ++ legacy/CMakeLists.txt~ | 120 ++ legacy/Makefile | 208 ++++ legacy/correlator_header_dsaX.txt | 38 + legacy/cuda_correlator | Bin 0 -> 34272 bytes legacy/dsaX_beamformer.cu | 1128 +++++++++++++++++++ legacy/dsaX_beamformer.cu.wrk1 | 1003 +++++++++++++++++ legacy/dsaX_beamformer_offline.cu | 933 ++++++++++++++++ legacy/dsaX_beamformer_passon | Bin 0 -> 178600 bytes legacy/dsaX_beamformer_passon.cu | 1057 ++++++++++++++++++ legacy/dsaX_bfCorr.cu | 1286 +++++++++++++++++++++ legacy/dsaX_bigfake.c | 320 ++++++ legacy/dsaX_capture.c | 1080 ++++++++++++++++++ legacy/dsaX_capture.h | 131 +++ legacy/dsaX_capture_manythread.c | 1115 +++++++++++++++++++ legacy/dsaX_capture_manythread.c.bak | 1053 ++++++++++++++++++ legacy/dsaX_capture_manythread.h | 119 ++ legacy/dsaX_capture_pcap.c | 852 ++++++++++++++ legacy/dsaX_capture_pcap.h | 83 ++ legacy/dsaX_capture_thread.c | 1107 ++++++++++++++++++ legacy/dsaX_copydb.c | 273 +++++ legacy/dsaX_cuda_correlator.cu | 309 +++++ 
legacy/dsaX_cutlass_interface.cu | 315 ++++++ legacy/dsaX_cutlass_interface.cu~ | 315 ++++++ legacy/dsaX_cutlass_interface.h | 172 +++ legacy/dsaX_cutlass_interface.h~ | 174 +++ legacy/dsaX_dbnic.c | 435 ++++++++ legacy/dsaX_dbnic.c.bak | 381 +++++++ legacy/dsaX_def.h | 98 ++ legacy/dsaX_fake.c | 320 ++++++ legacy/dsaX_filTrigger.c | 559 ++++++++++ legacy/dsaX_fluff.c | 415 +++++++ legacy/dsaX_makeFil.c | 276 +++++ legacy/dsaX_merge.c | 580 ++++++++++ legacy/dsaX_nicdb.c | 483 ++++++++ legacy/dsaX_nicdb.c.bak | 434 ++++++++ legacy/dsaX_reorder.c | 515 +++++++++ legacy/dsaX_reorder_raw.c | 613 ++++++++++ legacy/dsaX_reorder_raw.c.bak | 672 +++++++++++ legacy/dsaX_reorder_raw.c.bak2 | 608 ++++++++++ legacy/dsaX_simplesplit.c | 362 ++++++ legacy/dsaX_splice.c | 201 ++++ legacy/dsaX_split.c | 601 ++++++++++ legacy/dsaX_splitup.c | 285 +++++ legacy/dsaX_store.c | 218 ++++ legacy/dsaX_testdada.c | 161 +++ legacy/dsaX_trigger.c | 585 ++++++++++ legacy/dsaX_wrangle | Bin 0 -> 99600 bytes legacy/dsaX_wrangle.c | 378 +++++++ legacy/dsaX_wrangleAndWrite.c | 365 ++++++ legacy/dsaX_writeFil.c | 486 ++++++++ legacy/dsaX_writevis.c | 428 +++++++ legacy/dsaX_xgpu.cu | 375 +++++++ legacy/dumpfil.c | 294 +++++ legacy/fil2dada.c | 521 +++++++++ legacy/flagger.c | 484 ++++++++ legacy/gpu_flagger.cu | 1547 ++++++++++++++++++++++++++ legacy/planar_complex.cu | 87 ++ legacy/planar_complex.cu~ | 85 ++ legacy/spectrometer_header.txt | 38 + legacy/splice_offline_beams | Bin 0 -> 32432 bytes legacy/splice_offline_beams.c | 132 +++ legacy/test_read.c | 279 +++++ legacy/test_write.c | 452 ++++++++ 67 files changed, 29888 insertions(+) create mode 100644 legacy/10_planar_complex.cu create mode 100644 legacy/11_planar_complex_array.cu create mode 100644 legacy/11_planar_complex_array.cu~ create mode 100644 legacy/CMakeLists.txt create mode 100644 legacy/CMakeLists.txt~ create mode 100644 legacy/Makefile create mode 100644 legacy/correlator_header_dsaX.txt create mode 100755 
legacy/cuda_correlator create mode 100644 legacy/dsaX_beamformer.cu create mode 100644 legacy/dsaX_beamformer.cu.wrk1 create mode 100644 legacy/dsaX_beamformer_offline.cu create mode 100755 legacy/dsaX_beamformer_passon create mode 100644 legacy/dsaX_beamformer_passon.cu create mode 100644 legacy/dsaX_bfCorr.cu create mode 100644 legacy/dsaX_bigfake.c create mode 100644 legacy/dsaX_capture.c create mode 100644 legacy/dsaX_capture.h create mode 100644 legacy/dsaX_capture_manythread.c create mode 100644 legacy/dsaX_capture_manythread.c.bak create mode 100644 legacy/dsaX_capture_manythread.h create mode 100644 legacy/dsaX_capture_pcap.c create mode 100644 legacy/dsaX_capture_pcap.h create mode 100644 legacy/dsaX_capture_thread.c create mode 100644 legacy/dsaX_copydb.c create mode 100644 legacy/dsaX_cuda_correlator.cu create mode 100644 legacy/dsaX_cutlass_interface.cu create mode 100644 legacy/dsaX_cutlass_interface.cu~ create mode 100644 legacy/dsaX_cutlass_interface.h create mode 100644 legacy/dsaX_cutlass_interface.h~ create mode 100644 legacy/dsaX_dbnic.c create mode 100644 legacy/dsaX_dbnic.c.bak create mode 100644 legacy/dsaX_def.h create mode 100644 legacy/dsaX_fake.c create mode 100644 legacy/dsaX_filTrigger.c create mode 100644 legacy/dsaX_fluff.c create mode 100644 legacy/dsaX_makeFil.c create mode 100644 legacy/dsaX_merge.c create mode 100644 legacy/dsaX_nicdb.c create mode 100644 legacy/dsaX_nicdb.c.bak create mode 100644 legacy/dsaX_reorder.c create mode 100644 legacy/dsaX_reorder_raw.c create mode 100644 legacy/dsaX_reorder_raw.c.bak create mode 100644 legacy/dsaX_reorder_raw.c.bak2 create mode 100644 legacy/dsaX_simplesplit.c create mode 100644 legacy/dsaX_splice.c create mode 100644 legacy/dsaX_split.c create mode 100644 legacy/dsaX_splitup.c create mode 100644 legacy/dsaX_store.c create mode 100644 legacy/dsaX_testdada.c create mode 100644 legacy/dsaX_trigger.c create mode 100755 legacy/dsaX_wrangle create mode 100644 legacy/dsaX_wrangle.c create mode 
100644 legacy/dsaX_wrangleAndWrite.c create mode 100644 legacy/dsaX_writeFil.c create mode 100644 legacy/dsaX_writevis.c create mode 100644 legacy/dsaX_xgpu.cu create mode 100644 legacy/dumpfil.c create mode 100644 legacy/fil2dada.c create mode 100644 legacy/flagger.c create mode 100644 legacy/gpu_flagger.cu create mode 100644 legacy/planar_complex.cu create mode 100644 legacy/planar_complex.cu~ create mode 100644 legacy/spectrometer_header.txt create mode 100755 legacy/splice_offline_beams create mode 100644 legacy/splice_offline_beams.c create mode 100644 legacy/test_read.c create mode 100644 legacy/test_write.c diff --git a/legacy/10_planar_complex.cu b/legacy/10_planar_complex.cu new file mode 100644 index 0000000..9e0915d --- /dev/null +++ b/legacy/10_planar_complex.cu @@ -0,0 +1,567 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Planar Complex GEMM + + This example demonstrates the CUTLASS Library's exposure of planar complex GEMM kernels supporting + the batched strided mode. + + These kernels represent complex matrices by storing the real and imaginary parts of the matrix in + disjoint regions in memory. These real-valued matrices are stored using existing cuBLAS layouts + as either column-major or row-major layouts with a single leading dimension indicating the stride + between columns or rows. + + The CUTLASS Library collects multiple template instantiations in a data structure and offers + a BLAS-like dispatch API to invoke the appropriate kernel on the Volta or Turing architectures. + + CUTLASS decouples matrix layout from complex transformation, so four possible transformations + are possible on the A and B operands: + + n: column-major + c: column-major complex conjugate + t: row-major + h: row-major complex conjugate + + The CUTLASS Library contains many kernel instances specialized for architecture, data type, tile + size, and alignment. 
This can result in long compile times. + + To build strictly the planar complex kernels needed for general application, execute the following + CMake command in an empty build directory. + + $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \ + -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex + + This builds all planar complex GEMM variants for Volta and Turing architectures. + + To build strictly the kernels needed for this example, an even narrower filter string may be + specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for + the 'CN' layout configuration (conjugate A operand with both A and B as column-major). + + $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \ + -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_f16*cn + + $ make 10_planar_complex + + $ ./examples/10_planar_complex/10_planar_complex --m=2048 --n=1024 --k=512 --batch=10 +*/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor_planar_complex.h" + +#include "cutlass/util/reference/device/tensor_fill.h" + +#include "cutlass/util/reference/device/gemm_planar_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" + +#include "cutlass/library/handle.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Result structure +struct Result { + + double runtime_ms; + double gflops; + cutlass::Status status; + cudaError_t error; + bool passed; + + // + // Methods + // + + Result( + double runtime_ms = 0, + double gflops = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = cudaSuccess + ): + runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { } +}; + 
+/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + + cutlass::gemm::GemmCoord problem_size; + int batch_count; + cutlass::complex alpha; + cutlass::complex beta; + + bool reference_check; + int iterations; + + Options(): + help(false), + problem_size({1024, 1024, 1024}), + batch_count(1), + reference_check(true), + iterations(20), + alpha(1), + beta() { } + + bool valid() { + return true; + } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + cmd.get_cmd_line_argument("m", problem_size.m()); + cmd.get_cmd_line_argument("n", problem_size.n()); + cmd.get_cmd_line_argument("k", problem_size.k()); + cmd.get_cmd_line_argument("batch", batch_count); + + cmd.get_cmd_line_argument("alpha", alpha.real()); + cmd.get_cmd_line_argument("alpha_i", alpha.imag()); + cmd.get_cmd_line_argument("beta", beta.real()); + cmd.get_cmd_line_argument("beta_i", beta.imag()); + + cmd.get_cmd_line_argument("iterations", iterations); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "10_planar_complex example\n\n" + << " This example uses the CUTLASS Library to execute Planar Complex GEMM computations.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --m= GEMM M dimension\n" + << " --n= GEMM N dimension\n" + << " --k= GEMM K dimension\n" + << " --batch= Number of GEMM operations executed in one batch\n" + << " --alpha= Epilogue scalar alpha (real part)\n" + << " --alpha_i= Epilogue scalar alpha (imaginary part)\n" + << " --beta= Epilogue scalar beta (real part)\n\n" + << " --beta_i= Epilogue scalar beta (imaginary part)\n\n" + << " --iterations= Number of profiling iterations to perform.\n\n"; + + out << "\n\nExamples:\n\n" + << "$ ./examples/10_planar_complex/10_planar_complex --batch=7 --m=1024 --n=512 --k=1024 \\\n" + << " --alpha=2 --alpha_i=-2 --beta=0.707 --beta_i=-.707\n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of real-valued multiply-adds + int64_t fmas = problem_size.product() * batch_count * 4; + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Performance test environment for planar complex +class TestbedPlanarComplex { +public: + + using ElementA = cutlass::half_t; + using LayoutA = cutlass::layout::ColumnMajor; + using ElementB = cutlass::half_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = cutlass::half_t; + using LayoutC = cutlass::layout::ColumnMajor; + using ElementCompute = float; + using ElementAccumulator = float; + + // + // Data members + // + + cutlass::library::Handle handle; + + cutlass::gemm::GemmCoord problem_size; + int batch_count; + cutlass::DeviceAllocation tensor_A; + cutlass::DeviceAllocation tensor_B; + cutlass::DeviceAllocation tensor_C; + 
cutlass::DeviceAllocation tensor_D; + cutlass::DeviceAllocation tensor_D_ref; + + // + // Methods + // + + TestbedPlanarComplex( + Options const &options + ): + problem_size(options.problem_size), batch_count(options.batch_count) { + + // Allocate device memory for batched strided GEMM + tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2); + tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2); + tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + } + + void initialize() { + + uint64_t seed = 1073; + + // Use small integers to simplify correctness checking + int scope_max = 6; + int scope_min = -6; + + cutlass::reference::device::BlockFillRandomUniform( + tensor_A.get(), tensor_A.size(), seed, ElementA(scope_max), ElementA(scope_min), 0); + + cutlass::reference::device::BlockFillRandomUniform( + tensor_B.get(), tensor_B.size(), seed * 2019, ElementB(scope_max), ElementB(scope_min), 0); + + cutlass::reference::device::BlockFillRandomUniform( + tensor_C.get(), tensor_C.size(), seed * 2020, ElementC(scope_max), ElementC(scope_min), 0); + } + + Result profile(Options const &options) { + + Result result; + + initialize(); + + ElementA *ptr_A = tensor_A.get(); + ElementB *ptr_B = tensor_B.get(); + ElementC *ptr_C = tensor_C.get(); + ElementC *ptr_D = tensor_D.get(); + + int64_t batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2; + int64_t batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2; + int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2; + int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2; + + typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0); + typename LayoutB::Stride::Index ldb = 
LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0); + typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + + int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k(); + int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n(); + int64_t imag_stride_C = int64_t(problem_size.m()) * problem_size.n(); + int64_t imag_stride_D = int64_t(problem_size.m()) * problem_size.n(); + + // + // Construct events + // + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // Record an event at the start of a series of GEMMs + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // + // Run profiling loop + // + + for (int iter = 0; iter < options.iterations; ++iter) { + + // + // Execute the planar complex GEMM kernel via the CUTLASS Library's + // dispatch routines. + // + // Note, for planar complex GEMM kernels, all numeric type arguments + // specify the data type of the base real types. These are understood to + // apply to planar complex representations of matrices in memory and to complex + // structures for scalars. + // + // See tools/library/include/cutlass/library/handle.h for more details. 
+ // + + result.status = handle.gemm_planar_complex( + problem_size.m(), // GEMM M dimension + problem_size.n(), // GEMM N dimension + problem_size.k(), // GEMM K dimension + + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued accumulation + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued alpha/beta scalars + + &options.alpha, // Pointer to alpha scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued A matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of A matrix + cutlass::library::ComplexTransform::kConjugate, // Complex transformation on A matrix operand + ptr_A, // Pointer to real part of A matrix + ptr_A + imag_stride_A, // Pointer to imaginary part of A matrix + lda, // Leading dimension of real part of A matrix + lda, // Leading dimension of imaginary part of A matrix + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued B matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of B matrix + cutlass::library::ComplexTransform::kNone, // Complex transformation on B matrix operand + ptr_B, // Pointer to real part of B matrix + ptr_B + imag_stride_B, // Pointer to imaginary part of B matrix + ldb, // Leading dimension of real part of B matrix + ldb, // Leading dimension of imaginary part of B matrix + + &options.beta, // Pointer to beta scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex valued C and D matrices + + ptr_C, // Pointer to real part of C matrix + ptr_C + imag_stride_C, // Pointer to imaginary part of C matrix + ldc, // Leading dimension of real part of C matrix + ldc, // Leading dimension of imaginary part of C matrix + + ptr_D, // Pointer to real part of D matrix + ptr_D + imag_stride_D, // Pointer to imaginary part of D matrix + ldd, // Leading dimension of real part of D matrix + ldd, // Leading dimension of imaginary part of D matrix + + batch_count, // 
Number of batched elements + + batch_stride_A, // Stride between batches of real parts of A matrix + batch_stride_A, // Stride between batches of imaginary parts of A matrix + + batch_stride_B, // Stride between batches of real parts of B matrix + batch_stride_B, // Stride between batches of imaginary parts of B matrix + + batch_stride_C, // Stride between batches of real parts of C matrix + batch_stride_C, // Stride between batches of imaginary parts of C matrix + + batch_stride_D, // Stride between batches of real parts of D matrix + batch_stride_D // Stride between batches of imaginary parts of D matrix + ); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "CUTLASS internal error - configuration not supported" << std::endl; + return result; + } + } + + // + // Stop profiling loop + // + + // Record an event when the GEMMs are complete + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. 
+ result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + if (handle.get_last_operation()) { + std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl; + } + + // + // Compute reference in device code + // + + if (options.reference_check) { + + result.passed = true; + + for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) { + cutlass::reference::device::GemmPlanarComplex< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator + >( + problem_size, + options.alpha, + {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A}, + cutlass::ComplexTransform::kConjugate, + {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B}, + cutlass::ComplexTransform::kNone, + options.beta, + {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C}, + {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D} + ); + + ElementC epsilon = 0.1_hf; + ElementC nonzero_floor = 0.1_hf; + + result.passed = cutlass::reference::device::BlockCompareRelativelyEqual( + tensor_D.get() + idx * batch_stride_D, + tensor_D_ref.get() + idx * batch_stride_D, + batch_stride_D, + epsilon, + nonzero_floor + ); + } + + if (result.passed) { + std::cout << "Reference check passed." << std::endl; + } + else { + std::cerr << "Error - reference check failed." << std::endl; + } + } + + std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " GFLOPs: " << result.gflops << std::endl; + + return result; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // + // This example uses mma.sync to directly access Tensor Cores to achieve peak performance. 
+ // + // Volta Tensor Core operations are first available in CUDA 10.1 Toolkit. + // + // Turing Tensor Core operations are first available in CUDA 10.2 Toolkit. + // + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (props.major < 7) { + std::cerr << "Volta Tensor Core operations must be run on a machine with compute capability at least 70." + << std::endl; + + // Returning zero so this test passes on older architectures even though its actions are no-op. + return 0; + } + else if (props.major == 7 && props.minor <= 2) { + // + // If running on the Volta architecture, at least CUDA 10.1 Toolkit is required to run this example. + // + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { + std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl; + + // Returning zero so this test passes on older Toolkits even though its actions are no-op. + return 0; + } + } + else if (props.major == 7 && props.minor >= 5) { + // + // If running on the Turing architecture, at least CUDA 10.2 Toolkit is required to run this example. + // + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { + std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + + // Returning zero so this test passes on older Toolkits even though its actions are no-op. + return 0; + } + } + else { + // NVIDIA Ampere Architecture GPUs (SM80 and later) are fully supported on CUDA 11 Toolkit and beyond. 
+ // + // fall through + } + + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // Execute one problem size + if (!options.valid()) { + std::cerr << "Invalid problem." << std::endl; + return -1; + } + + TestbedPlanarComplex testbed(options); + + Result result = testbed.profile(options); + + return result.passed ? 0 : -1; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/legacy/11_planar_complex_array.cu b/legacy/11_planar_complex_array.cu new file mode 100644 index 0000000..ba94b60 --- /dev/null +++ b/legacy/11_planar_complex_array.cu @@ -0,0 +1,628 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Planar Complex Array Example + + This example demonstrates the CUTLASS Library's exposure of planar complex GEMM kernels which + execute a batch of matrix products, loading problem sizes and matrix base pointers from arrays + in global memory. + + These kernels represent complex matrices by storing the real and imaginary parts of the matrix in + disjoint regions in memory. These real-valued matrices are stored using existing cuBLAS layouts + as either column-major or row-major layouts with a single leading dimension indicating the stride + between columns or rows. + + The CUTLASS Library collects multiple template instantiations in a data structure and offers + a BLAS-like dispatch API to invoke the appropriate kernel on the Volta or Turing architectures. + + CUTLASS decouples matrix layout from complex transformation, so four possible transformations + are possible on the A and B operands: + + n: column-major + c: column-major complex conjugate + t: row-major + h: row-major complex conjugate + + To build strictly the planar complex kernels needed for general application, execute the following + CMake command in an empty build directory. + + $ cmake .. 
-DCUTLASS_NVCC_ARCHS="70;75;80" \ + -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex + + This builds all planar complex GEMM variants for Volta and Turing architectures. + + To build strictly the kernels needed for this example, an even narrower filter string may be + specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for + the 'CN' layout configuration (conjugate A operand with both A and B as column-major). + + $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \ + -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_array_f16*cn + + $ make 11_planar_complex_array + + $ ./examples/11_planar_complex_array/11_planar_complex_array --m=2048 --n=1024 --k=512 --batch=10 +*/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor_planar_complex.h" + +#include "cutlass/util/reference/device/tensor_fill.h" + +#include "cutlass/util/reference/device/gemm_planar_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" + +#include "cutlass/library/handle.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Result structure +struct Result { + + double runtime_ms; + double gflops; + cutlass::Status status; + cudaError_t error; + bool passed; + + // + // Methods + // + + Result( + double runtime_ms = 0, + double gflops = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = cudaSuccess + ): + runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + + cutlass::gemm::GemmCoord problem_size; + int 
batch_count; + cutlass::complex alpha; + cutlass::complex beta; + + bool reference_check; + int iterations; + + Options(): + help(false), + problem_size({1024, 1024, 1024}), + batch_count(1), + reference_check(true), + iterations(20), + alpha(1), + beta() { } + + bool valid() { + return true; + } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + cmd.get_cmd_line_argument("m", problem_size.m()); + cmd.get_cmd_line_argument("n", problem_size.n()); + cmd.get_cmd_line_argument("k", problem_size.k()); + cmd.get_cmd_line_argument("batch", batch_count); + + cmd.get_cmd_line_argument("alpha", alpha.real()); + cmd.get_cmd_line_argument("alpha_i", alpha.imag()); + cmd.get_cmd_line_argument("beta", beta.real()); + cmd.get_cmd_line_argument("beta_i", beta.imag()); + + cmd.get_cmd_line_argument("iterations", iterations); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "11_planar_complex_array example\n\n" + << " This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --m= GEMM M dimension\n" + << " --n= GEMM N dimension\n" + << " --k= GEMM K dimension\n" + << " --batch= Number of GEMM operations executed in one batch\n" + << " --alpha= Epilogue scalar alpha (real part)\n" + << " --alpha_i= Epilogue scalar alpha (imaginary part)\n" + << " --beta= Epilogue scalar beta (real part)\n\n" + << " --beta_i= Epilogue scalar beta (imaginary part)\n\n" + << " --iterations= Number of profiling iterations to perform.\n"; + + out << "\n\nExamples:\n\n" + << "$ ./examples/11_planar_complex_array/11_planar_complex_array\n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of real-valued multiply-adds + int64_t 
fmas = problem_size.product() * batch_count * 4; + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Performance test environment for planar complex +class TestbedPlanarComplex { +public: + + // Half-precision input and output + using Element = cutlass::half_t; + + // Configurations for layouts and internal computation + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using ElementCompute = float; + using ElementAccumulator = float; + + // + // Data members + // + + cutlass::library::Handle handle; + + cutlass::gemm::GemmCoord problem_size; + int batch_count; + cutlass::DeviceAllocation tensor_A; + cutlass::DeviceAllocation tensor_B; + cutlass::DeviceAllocation tensor_C; + cutlass::DeviceAllocation tensor_D; + cutlass::DeviceAllocation tensor_D_ref; + + cutlass::DeviceAllocation ptr_A_real; + cutlass::DeviceAllocation ptr_A_imag; + cutlass::DeviceAllocation ptr_B_real; + cutlass::DeviceAllocation ptr_B_imag; + cutlass::DeviceAllocation ptr_C_real; + cutlass::DeviceAllocation ptr_C_imag; + cutlass::DeviceAllocation ptr_D_real; + cutlass::DeviceAllocation ptr_D_imag; + + // + // Methods + // + + TestbedPlanarComplex( + Options const &options + ): + problem_size(options.problem_size), batch_count(options.batch_count) { + + // Allocate device memory for batched planar complex GEMM + tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2); + tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2); + tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + + ptr_A_real.reset(batch_count); 
+ ptr_A_imag.reset(batch_count); + ptr_B_real.reset(batch_count); + ptr_B_imag.reset(batch_count); + ptr_C_real.reset(batch_count); + ptr_C_imag.reset(batch_count); + ptr_D_real.reset(batch_count); + ptr_D_imag.reset(batch_count); + + } + + void initialize() { + + uint64_t seed = 1073; + + // Use small integers to simplify correctness checking + int scope_max = 6; + int scope_min = -6; + + cutlass::reference::device::BlockFillRandomUniform( + tensor_A.get(), tensor_A.size(), seed, Element(scope_max), Element(scope_min), 0); + + cutlass::reference::device::BlockFillRandomUniform( + tensor_B.get(), tensor_B.size(), seed * 2019, Element(scope_max), Element(scope_min), 0); + + cutlass::reference::device::BlockFillRandomUniform( + tensor_C.get(), tensor_C.size(), seed * 2020, Element(scope_max), Element(scope_min), 0); + } + + Result profile(Options const &options) { + + Result result; + + initialize(); + + Element *ptr_A = tensor_A.get(); + Element *ptr_B = tensor_B.get(); + Element *ptr_C = tensor_C.get(); + Element *ptr_D = tensor_D.get(); + + int64_t batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2; + int64_t batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2; + int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2; + int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2; + + typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0); + typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0); + typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + + + int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k(); + int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n(); + int64_t imag_stride_C = int64_t(problem_size.m()) * 
problem_size.n(); + int64_t imag_stride_D = int64_t(problem_size.m()) * problem_size.n(); + + // + // Configure pointers in global memory + // + + struct { + Element *base; + void **ptr_real; + void **ptr_imag; + int64_t batch_stride; + int64_t imag_stride; + } tensors[] = { + { tensor_A.get(), ptr_A_real.get(), ptr_A_imag.get(), batch_stride_A, imag_stride_A}, + { tensor_B.get(), ptr_B_real.get(), ptr_B_imag.get(), batch_stride_B, imag_stride_B}, + { tensor_C.get(), ptr_C_real.get(), ptr_C_imag.get(), batch_stride_C, imag_stride_C}, + { tensor_D.get(), ptr_D_real.get(), ptr_D_imag.get(), batch_stride_D, imag_stride_D} + }; + + for (auto const &tensor : tensors) { + for (int idx = 0; idx < batch_count; ++idx) { + + void *ptr_real = tensor.base + idx * tensor.batch_stride; + void *ptr_imag = tensor.base + idx * tensor.batch_stride + tensor.imag_stride; + + cudaError_t error = cudaMemcpy( + tensor.ptr_real + idx, + &ptr_real, + sizeof(void *), + cudaMemcpyHostToDevice); + + if (error != cudaSuccess) { + throw std::runtime_error("Failed to copy pointer to device memory"); + } + + error = cudaMemcpy( + tensor.ptr_imag + idx, + &ptr_imag, + sizeof(void *), + cudaMemcpyHostToDevice); + + if (error != cudaSuccess) { + throw std::runtime_error("Failed to copy pointer to device memory"); + } + } + } + + // + // Construct events + // + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // Record an event at the start of a series of GEMM operations + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // + // Run profiling loop + // + + for (int iter = 0; iter < options.iterations; ++iter) { + + // + // Execute the planar complex 
array GEMM kernel via the CUTLASS Library's + // dispatch routines. + // + // Note, for planar complex array GEMM kernels, all numeric type arguments + // specify the data type of the base real types. These are understood to + // apply to planar complex representations of matrices in memory and to complex + // structures for scalars. + // + // See tools/library/include/cutlass/library/handle.h for more details. + // + + result.status = handle.gemm_planar_complex_array( + + problem_size.m(), // expected GEMM M dimension + problem_size.n(), // expected GEMM N dimension + problem_size.k(), // expected GEMM K dimension + batch_count, // Number of batched elements + + nullptr, + nullptr, + nullptr, + + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued accumulation + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued alpha/beta scalars + + &options.alpha, // Pointer to alpha scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued A matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of A matrix + cutlass::library::ComplexTransform::kConjugate, // Complex transformation on A matrix operand + + ptr_A_real.get(), // Pointer to array of pointers to real part of A matrix + ptr_A_imag.get(), // Pointer to array of pointers to imaginary part of A matrix + + lda, // Leading dimension of real part of A matrix + lda, // Leading dimension of imaginary part of A matrix + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued B matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of B matrix + cutlass::library::ComplexTransform::kNone, // Complex transformation on B matrix operand + + ptr_B_real.get(), // Pointer to array of pointers to real part of B matrix + ptr_B_imag.get(), // Pointer to array of pointers to imaginary part of B matrix + + ldb, // Leading dimension of real part of B matrix + ldb, // Leading dimension of imaginary part of B 
matrix + + &options.beta, // Pointer to beta scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex valued C and D matrices + + ptr_C_real.get(), // Pointer to array of pointers to real part of C matrix + ptr_C_imag.get(), // Pointer to array of pointers to imaginary part of C matrix + + ldc, // Leading dimension of real part of C matrix + ldc, // Leading dimension of imaginary part of C matrix + + ptr_D_real.get(), // Pointer to array of pointers to real part of D matrix + ptr_D_imag.get(), // Pointer to array of pointers to imaginary part of D matrix + + ldd, // Leading dimension of real part of D matrix + ldd // Leading dimension of imaginary part of D matrix + ); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "CUTLASS internal error - configuration not supported" << std::endl; + return result; + } + } + + // + // Stop profiling loop + // + + // Record an event when the GEMM operations have been launched. + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. 
+ result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + if (handle.get_last_operation()) { + std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl; + } + + // + // Compute reference in device code + // + + if (options.reference_check) { + + result.passed = true; + + for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) { + cutlass::reference::device::GemmPlanarComplex< + Element, LayoutA, + Element, LayoutB, + Element, LayoutC, + ElementAccumulator + >( + problem_size, + options.alpha, + {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A}, + cutlass::ComplexTransform::kConjugate, + {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B}, + cutlass::ComplexTransform::kNone, + options.beta, + {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C}, + {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D} + ); + + Element epsilon = 0.1_hf; + Element nonzero_floor = 0.1_hf; + + result.passed = cutlass::reference::device::BlockCompareRelativelyEqual( + tensor_D.get() + idx * batch_stride_D, + tensor_D_ref.get() + idx * batch_stride_D, + batch_stride_D, + epsilon, + nonzero_floor + ); + } + + if (result.passed) { + std::cout << "Reference check passed." << std::endl; + } + else { + std::cerr << "Error - reference check failed." << std::endl; + } + } + + std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " GFLOPs: " << result.gflops << std::endl; + + return result; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // + // This example uses mma.sync to directly access Tensor Cores to achieve peak performance. 
+ // + // Volta Tensor Core operations are first available in CUDA 10.1 Toolkit. + // + // Turing Tensor Core operations are first available in CUDA 10.2 Toolkit. + // + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (props.major < 7) { + std::cerr << "Tensor Core operations must be run on a machine with compute capability at least 70." + << std::endl; + + // Returning zero so this passes on older architectures. Its actions are no-op. + return 0; + } + else if (props.major == 7 && props.minor <= 2) { + // + // If running on the Volta architecture, at least CUDA 10.1 Toolkit is required to run this example. + // + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { + std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl; + + // Returning zero so this passes on older Toolkits. Its actions are no-op. + return 0; + } + } + else if (props.major == 7 && props.minor >= 5) { + // + // If running on the Turing architecture, at least CUDA 10.2 Toolkit is required to run this example. + // + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { + std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + + // Returning zero so this passes on older Toolkits. Its actions are no-op. + return 0; + } + } + else { + // NVIDIA Ampere Architecture GPUs (SM80 and later) are fully supported on CUDA 11 Toolkit and beyond. 
+ // + // fall through + } + + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // Execute one problem size + if (!options.valid()) { + std::cerr << "Invalid problem." << std::endl; + return -1; + } + + TestbedPlanarComplex testbed(options); + + Result result = testbed.profile(options); + + return result.passed ? 0 : -1; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/legacy/11_planar_complex_array.cu~ b/legacy/11_planar_complex_array.cu~ new file mode 100644 index 0000000..23722b0 --- /dev/null +++ b/legacy/11_planar_complex_array.cu~ @@ -0,0 +1,628 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Planar Complex Array Example + + This example demonstrates the CUTLASS Library's exposure of planar complex GEMM kernels which + execute a batch of matrix products, loading problem sizes and matrix base pointers from arrays + in global memory. + + These kernels represent complex matrices by storing the real and imaginary parts of the matrix in + disjoint regions in memory. These real-valued matrices are stored using existing cuBLAS layouts + as either column-major or row-major layouts with a single leading dimension indicating the stride + between columns or rows. + + The CUTLASS Library collects multiple template instantiations in a data structure and offers + a BLAS-like dispatch API to invoke the appropriate kernel on the Volta or Turing architectures. + + CUTLASS decouples matrix layout from complex transformation, so four possible transformations + are possible on the A and B operands: + + n: column-major + c: column-major complex conjugate + t: row-major + h: row-major complex conjugate + + To build strictly the planar complex kernels needed for general application, execute the following + CMake command in an empty build directory. + + $ cmake .. 
-DCUTLASS_NVCC_ARCHS="70;75;80" \ + -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex + + This builds all planar complex GEMM variants for Volta and Turing architectures. + + To build strictly the kernels needed for this example, an even narrower filter string may be + specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for + the 'CN' layout configuration (conjugate A operand with both A and B as column-major). + + $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \ + -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_array_f16*cn + + $ make 11_planar_complex_array + + $ ./examples/11_planar_complex_array/11_planar_complex_array --m=2048 --n=1024 --k=512 --batch=10 +*/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor_planar_complex.h" + +#include "cutlass/util/reference/device/tensor_fill.h" + +#include "cutlass/util/reference/device/gemm_planar_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" + +#include "cutlass/library/handle.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Result structure +struct Result { + + double runtime_ms; + double gflops; + cutlass::Status status; + cudaError_t error; + bool passed; + + // + // Methods + // + + Result( + double runtime_ms = 0, + double gflops = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = cudaSuccess + ): + runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + + cutlass::gemm::GemmCoord problem_size; + int 
batch_count; + cutlass::complex alpha; + cutlass::complex beta; + + bool reference_check; + int iterations; + + Options(): + help(false), + problem_size({1024, 1024, 1024}), + batch_count(1), + reference_check(true), + iterations(20), + alpha(1), + beta() { } + + bool valid() { + return true; + } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + cmd.get_cmd_line_argument("m", problem_size.m()); + cmd.get_cmd_line_argument("n", problem_size.n()); + cmd.get_cmd_line_argument("k", problem_size.k()); + cmd.get_cmd_line_argument("batch", batch_count); + + cmd.get_cmd_line_argument("alpha", alpha.real()); + cmd.get_cmd_line_argument("alpha_i", alpha.imag()); + cmd.get_cmd_line_argument("beta", beta.real()); + cmd.get_cmd_line_argument("beta_i", beta.imag()); + + cmd.get_cmd_line_argument("iterations", iterations); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "11_planar_complex_array example\n\n" + << " This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --m= GEMM M dimension\n" + << " --n= GEMM N dimension\n" + << " --k= GEMM K dimension\n" + << " --batch= Number of GEMM operations executed in one batch\n" + << " --alpha= Epilogue scalar alpha (real part)\n" + << " --alpha_i= Epilogue scalar alpha (imaginary part)\n" + << " --beta= Epilogue scalar beta (real part)\n\n" + << " --beta_i= Epilogue scalar beta (imaginary part)\n\n" + << " --iterations= Number of profiling iterations to perform.\n"; + + out << "\n\nExamples:\n\n" + << "$ ./examples/11_planar_complex_array/11_planar_complex_array\n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of real-valued multiply-adds + int64_t 
fmas = problem_size.product() * batch_count * 4; + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Performance test environment for planar complex +class TestbedPlanarComplex { +public: + + // Half-precision input and output + using Element = cutlass::half_t; + + // Configurations for layouts and internal computation + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using ElementCompute = float; + using ElementAccumulator = float; + + // + // Data members + // + + cutlass::library::Handle handle; + + cutlass::gemm::GemmCoord problem_size; + int batch_count; + cutlass::DeviceAllocation tensor_A; + cutlass::DeviceAllocation tensor_B; + cutlass::DeviceAllocation tensor_C; + cutlass::DeviceAllocation tensor_D; + cutlass::DeviceAllocation tensor_D_ref; + + cutlass::DeviceAllocation ptr_A_real; + cutlass::DeviceAllocation ptr_A_imag; + cutlass::DeviceAllocation ptr_B_real; + cutlass::DeviceAllocation ptr_B_imag; + cutlass::DeviceAllocation ptr_C_real; + cutlass::DeviceAllocation ptr_C_imag; + cutlass::DeviceAllocation ptr_D_real; + cutlass::DeviceAllocation ptr_D_imag; + + // + // Methods + // + + TestbedPlanarComplex( + Options const &options + ): + problem_size(options.problem_size), batch_count(options.batch_count) { + + // Allocate device memory for batched planar complex GEMM + tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2); + tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2); + tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + + ptr_A_real.reset(batch_count); 
+ ptr_A_imag.reset(batch_count); + ptr_B_real.reset(batch_count); + ptr_B_imag.reset(batch_count); + ptr_C_real.reset(batch_count); + ptr_C_imag.reset(batch_count); + ptr_D_real.reset(batch_count); + ptr_D_imag.reset(batch_count); + + } + + void initialize_rand() { + + uint64_t seed = 1073; + + // Use small integers to simplify correctness checking + int scope_max = 6; + int scope_min = -6; + + cutlass::reference::device::BlockFillRandomUniform( + tensor_A.get(), tensor_A.size(), seed, Element(scope_max), Element(scope_min), 0); + + cutlass::reference::device::BlockFillRandomUniform( + tensor_B.get(), tensor_B.size(), seed * 2019, Element(scope_max), Element(scope_min), 0); + + cutlass::reference::device::BlockFillRandomUniform( + tensor_C.get(), tensor_C.size(), seed * 2020, Element(scope_max), Element(scope_min), 0); + } + + Result profile(Options const &options) { + + Result result; + + initialize(); + + Element *ptr_A = tensor_A.get(); + Element *ptr_B = tensor_B.get(); + Element *ptr_C = tensor_C.get(); + Element *ptr_D = tensor_D.get(); + + int64_t batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2; + int64_t batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2; + int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2; + int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2; + + typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0); + typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0); + typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + + + int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k(); + int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n(); + int64_t imag_stride_C = int64_t(problem_size.m()) * 
problem_size.n(); + int64_t imag_stride_D = int64_t(problem_size.m()) * problem_size.n(); + + // + // Configure pointers in global memory + // + + struct { + Element *base; + void **ptr_real; + void **ptr_imag; + int64_t batch_stride; + int64_t imag_stride; + } tensors[] = { + { tensor_A.get(), ptr_A_real.get(), ptr_A_imag.get(), batch_stride_A, imag_stride_A}, + { tensor_B.get(), ptr_B_real.get(), ptr_B_imag.get(), batch_stride_B, imag_stride_B}, + { tensor_C.get(), ptr_C_real.get(), ptr_C_imag.get(), batch_stride_C, imag_stride_C}, + { tensor_D.get(), ptr_D_real.get(), ptr_D_imag.get(), batch_stride_D, imag_stride_D} + }; + + for (auto const &tensor : tensors) { + for (int idx = 0; idx < batch_count; ++idx) { + + void *ptr_real = tensor.base + idx * tensor.batch_stride; + void *ptr_imag = tensor.base + idx * tensor.batch_stride + tensor.imag_stride; + + cudaError_t error = cudaMemcpy( + tensor.ptr_real + idx, + &ptr_real, + sizeof(void *), + cudaMemcpyHostToDevice); + + if (error != cudaSuccess) { + throw std::runtime_error("Failed to copy pointer to device memory"); + } + + error = cudaMemcpy( + tensor.ptr_imag + idx, + &ptr_imag, + sizeof(void *), + cudaMemcpyHostToDevice); + + if (error != cudaSuccess) { + throw std::runtime_error("Failed to copy pointer to device memory"); + } + } + } + + // + // Construct events + // + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // Record an event at the start of a series of GEMM operations + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // + // Run profiling loop + // + + for (int iter = 0; iter < options.iterations; ++iter) { + + // + // Execute the planar complex 
array GEMM kernel via the CUTLASS Library's + // dispatch routines. + // + // Note, for planar complex array GEMM kernels, all numeric type arguments + // specify the data type of the base real types. These are understood to + // apply to planar complex representations of matrices in memory and to complex + // structures for scalars. + // + // See tools/library/include/cutlass/library/handle.h for more details. + // + + result.status = handle.gemm_planar_complex_array( + + problem_size.m(), // expected GEMM M dimension + problem_size.n(), // expected GEMM N dimension + problem_size.k(), // expected GEMM K dimension + batch_count, // Number of batched elements + + nullptr, + nullptr, + nullptr, + + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued accumulation + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued alpha/beta scalars + + &options.alpha, // Pointer to alpha scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued A matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of A matrix + cutlass::library::ComplexTransform::kConjugate, // Complex transformation on A matrix operand + + ptr_A_real.get(), // Pointer to array of pointers to real part of A matrix + ptr_A_imag.get(), // Pointer to array of pointers to imaginary part of A matrix + + lda, // Leading dimension of real part of A matrix + lda, // Leading dimension of imaginary part of A matrix + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued B matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of B matrix + cutlass::library::ComplexTransform::kNone, // Complex transformation on B matrix operand + + ptr_B_real.get(), // Pointer to array of pointers to real part of B matrix + ptr_B_imag.get(), // Pointer to array of pointers to imaginary part of B matrix + + ldb, // Leading dimension of real part of B matrix + ldb, // Leading dimension of imaginary part of B 
matrix + + &options.beta, // Pointer to beta scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex valued C and D matrices + + ptr_C_real.get(), // Pointer to array of pointers to real part of C matrix + ptr_C_imag.get(), // Pointer to array of pointers to imaginary part of C matrix + + ldc, // Leading dimension of real part of C matrix + ldc, // Leading dimension of imaginary part of C matrix + + ptr_D_real.get(), // Pointer to array of pointers to real part of D matrix + ptr_D_imag.get(), // Pointer to array of pointers to imaginary part of D matrix + + ldd, // Leading dimension of real part of D matrix + ldd // Leading dimension of imaginary part of D matrix + ); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "CUTLASS internal error - configuration not supported" << std::endl; + return result; + } + } + + // + // Stop profiling loop + // + + // Record an event when the GEMM operations have been launched. + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. 
+ result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + if (handle.get_last_operation()) { + std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl; + } + + // + // Compute reference in device code + // + + if (options.reference_check) { + + result.passed = true; + + for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) { + cutlass::reference::device::GemmPlanarComplex< + Element, LayoutA, + Element, LayoutB, + Element, LayoutC, + ElementAccumulator + >( + problem_size, + options.alpha, + {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A}, + cutlass::ComplexTransform::kConjugate, + {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B}, + cutlass::ComplexTransform::kNone, + options.beta, + {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C}, + {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D} + ); + + Element epsilon = 0.1_hf; + Element nonzero_floor = 0.1_hf; + + result.passed = cutlass::reference::device::BlockCompareRelativelyEqual( + tensor_D.get() + idx * batch_stride_D, + tensor_D_ref.get() + idx * batch_stride_D, + batch_stride_D, + epsilon, + nonzero_floor + ); + } + + if (result.passed) { + std::cout << "Reference check passed." << std::endl; + } + else { + std::cerr << "Error - reference check failed." << std::endl; + } + } + + std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " GFLOPs: " << result.gflops << std::endl; + + return result; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // + // This example uses mma.sync to directly access Tensor Cores to achieve peak performance. 
+ // + // Volta Tensor Core operations are first available in CUDA 10.1 Toolkit. + // + // Turing Tensor Core operations are first available in CUDA 10.2 Toolkit. + // + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (props.major < 7) { + std::cerr << "Tensor Core operations must be run on a machine with compute capability at least 70." + << std::endl; + + // Returning zero so this passes on older architectures. Its actions are no-op. + return 0; + } + else if (props.major == 7 && props.minor <= 2) { + // + // If running on the Volta architecture, at least CUDA 10.1 Toolkit is required to run this example. + // + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { + std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl; + + // Returning zero so this passes on older Toolkits. Its actions are no-op. + return 0; + } + } + else if (props.major == 7 && props.minor >= 5) { + // + // If running on the Turing architecture, at least CUDA 10.2 Toolkit is required to run this example. + // + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { + std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + + // Returning zero so this passes on older Toolkits. Its actions are no-op. + return 0; + } + } + else { + // NVIDIA Ampere Architecture GPUs (SM80 and later) are fully supported on CUDA 11 Toolkit and beyond. 
+ // + // fall through + } + + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // Execute one problem size + if (!options.valid()) { + std::cerr << "Invalid problem." << std::endl; + return -1; + } + + TestbedPlanarComplex testbed(options); + + Result result = testbed.profile(options); + + return result.passed ? 0 : -1; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/legacy/CMakeLists.txt b/legacy/CMakeLists.txt new file mode 100644 index 0000000..b456550 --- /dev/null +++ b/legacy/CMakeLists.txt @@ -0,0 +1,121 @@ +enable_language(CUDA) + +include_directories(../include) +include_directories(${PSRDada_SOURCE_DIR}/src) +include_directories(${xGPU_SOURCE_DIR}/src) + +set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so) +set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a) + +# DSA Fast Time Domain functions +#------------------------------- +add_executable(test_write test_write.c) +target_link_libraries(test_write ${PSRDada_LIB}) + +add_executable(test_read test_read.c) +target_link_libraries(test_read ${PSRDada_LIB}) + +add_executable(dsaX_trigger dsaX_trigger.c) +target_link_libraries(dsaX_trigger ${PSRDada_LIB}) + +add_executable(dsaX_filTrigger dsaX_filTrigger.c) +target_link_libraries(dsaX_filTrigger ${PSRDada_LIB}) + +# DMH: Has a 'sigproc' dependency, low priority +if(0) + add_executable(splice_offline_beams splice_offline_beams.c) + target_link_libraries(splice_offline_beams ${PSRDada_LIB}) + + add_executable(dsaX_writeFil dsaX_writeFil.c) + target_link_libraries(dsaX_writeFil ${PSRDada_LIB}) + + add_executable(dsaX_splice dsaX_splice.c) + target_link_libraries(dsaX_splice ${PSRDada_LIB}) + + add_executable(gpu_flagger gpu_flagger.cu) + target_link_libraries(gpu_flagger ${PSRDada_LIB}) +endif() + +add_executable(dsaX_store dsaX_store.c) +target_link_libraries(dsaX_store 
${PSRDada_LIB}) + +add_executable(dsaX_fluff dsaX_fluff.c) +target_link_libraries(dsaX_fluff ${PSRDada_LIB}) + +# DMH: intrinsics compilation error +#add_executable(dsaX_reorder dsaX_reorder.c) +#target_link_libraries(dsaX_reorder ${PSRDada_LIB}) + +# DMH: /scratch/CPviolator/work/DSA110/dsa110-xengine/src/dsaX_nicdb.c: In function ‘process’: +#/scratch/CPviolator/work/DSA110/dsa110-xengine/src/dsaX_nicdb.c:145:65: warning: integer overflow in expression of type ‘int’ results in ‘-1073741824’ [-Woverflow] +# 145 | uint64_t shifty = (bdepth-1)*NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NCHAN_FIL; +add_executable(dsaX_nicdb dsaX_nicdb.c) +target_link_libraries(dsaX_nicdb ${PSRDada_LIB}) + +add_executable(dsaX_dbnic dsaX_dbnic.c) +target_link_libraries(dsaX_dbnic ${PSRDada_LIB}) + +add_executable(dsaX_capture dsaX_capture.c) +target_link_libraries(dsaX_capture ${PSRDada_LIB}) + +add_executable(dsaX_capture_thread dsaX_capture_thread.c) +target_link_libraries(dsaX_capture_thread ${PSRDada_LIB}) + +add_executable(dsaX_capture_manythread dsaX_capture_manythread.c) +target_link_libraries(dsaX_capture_manythread ${PSRDada_LIB}) + +add_executable(dsaX_split dsaX_split.c) +target_link_libraries(dsaX_split ${PSRDada_LIB} -lm) + +add_executable(dsaX_merge dsaX_merge.c) +target_link_libraries(dsaX_merge ${PSRDada_LIB}) + +add_executable(dsaX_simplesplit dsaX_simplesplit.c) +target_link_libraries(dsaX_simplesplit ${PSRDada_LIB}) + +add_executable(dsaX_fake dsaX_fake.c) +target_link_libraries(dsaX_fake ${PSRDada_LIB}) + +add_executable(dsaX_splitup dsaX_splitup.c) +target_link_libraries(dsaX_splitup ${PSRDada_LIB}) + +add_executable(dsaX_copydb dsaX_copydb.c) +target_link_libraries(dsaX_copydb ${PSRDada_LIB}) + +# DMH: fitsio dependency +if(0) + add_executable(dsaX_writevis dsaX_writevis.c) + target_link_libraries(dsaX_writevis ${PSRDada_LIB}) +endif() + +# DMH: XGPU dependencies +add_executable(dsaX_wrangle dsaX_wrangle.c) +target_link_libraries(dsaX_wrangle ${PSRDada_LIB} 
${CUDA_nvml_LIBRARY} ${XGPU_LIB}) + +add_executable(dsaX_testdada dsaX_testdada.c) +target_link_libraries(dsaX_testdada ${PSRDada_LIB}) + +add_executable(dsaX_xgpu dsaX_xgpu.cu) +target_link_libraries(dsaX_xgpu ${PSRDada_LIB} ${XGPU_LIB} ${CUDA_nvml_LIBRARY}) + +add_executable(dsaX_cuda_correlator dsaX_cuda_correlator.cu) +target_link_libraries(dsaX_cuda_correlator ${XGPU_LIB} ${CUDA_nvml_LIBRARY} ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) + +add_executable(dsaX_reorder_raw dsaX_reorder_raw.c) +target_link_libraries(dsaX_reorder_raw ${PSRDada_LIB}) + +add_executable(fil2dada fil2dada.c) +target_link_libraries(fil2dada ${PSRDada_LIB}) + +add_executable(dumpfil dumpfil.c) +target_link_libraries(dumpfil ${PSRDada_LIB}) + +add_executable(dsaX_beamformer dsaX_beamformer.cu) +target_link_libraries(dsaX_beamformer ${PSRDada_LIB}) + +add_executable(dsaX_beamformer_passon dsaX_beamformer_passon.cu) +target_link_libraries(dsaX_beamformer_passon ${PSRDada_LIB}) + +add_executable(dsaX_beamformer_offline dsaX_beamformer_offline.cu) +target_link_libraries(dsaX_beamformer_offline ${PSRDada_LIB}) +#------------------------------------------------------ diff --git a/legacy/CMakeLists.txt~ b/legacy/CMakeLists.txt~ new file mode 100644 index 0000000..0783d51 --- /dev/null +++ b/legacy/CMakeLists.txt~ @@ -0,0 +1,120 @@ +enable_language(CUDA) + +include_directories(${PSRDada_SOURCE_DIR}/src) +include_directories(${xGPU_SOURCE_DIR}/src) + +set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so) +set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a) + +# DSA Fast Time Domain functions +#------------------------------- +add_executable(test_write test_write.c) +target_link_libraries(test_write ${PSRDada_LIB}) + +add_executable(test_read test_read.c) +target_link_libraries(test_read ${PSRDada_LIB}) + +add_executable(dsaX_trigger dsaX_trigger.c) +target_link_libraries(dsaX_trigger ${PSRDada_LIB}) + +add_executable(dsaX_filTrigger dsaX_filTrigger.c) +target_link_libraries(dsaX_filTrigger 
${PSRDada_LIB}) + +# DMH: Has a 'sigproc' dependency, low priority +if(0) + add_executable(splice_offline_beams splice_offline_beams.c) + target_link_libraries(splice_offline_beams ${PSRDada_LIB}) + + add_executable(dsaX_writeFil dsaX_writeFil.c) + target_link_libraries(dsaX_writeFil ${PSRDada_LIB}) + + add_executable(dsaX_splice dsaX_splice.c) + target_link_libraries(dsaX_splice ${PSRDada_LIB}) + + add_executable(gpu_flagger gpu_flagger.cu) + target_link_libraries(gpu_flagger ${PSRDada_LIB}) +endif() + +add_executable(dsaX_store dsaX_store.c) +target_link_libraries(dsaX_store ${PSRDada_LIB}) + +add_executable(dsaX_fluff dsaX_fluff.c) +target_link_libraries(dsaX_fluff ${PSRDada_LIB}) + +# DMH: intrinsics compilation error +#add_executable(dsaX_reorder dsaX_reorder.c) +#target_link_libraries(dsaX_reorder ${PSRDada_LIB}) + +# DMH: /scratch/CPviolator/work/DSA110/dsa110-xengine/src/dsaX_nicdb.c: In function ‘process’: +#/scratch/CPviolator/work/DSA110/dsa110-xengine/src/dsaX_nicdb.c:145:65: warning: integer overflow in expression of type ‘int’ results in ‘-1073741824’ [-Woverflow] +# 145 | uint64_t shifty = (bdepth-1)*NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NCHAN_FIL; +add_executable(dsaX_nicdb dsaX_nicdb.c) +target_link_libraries(dsaX_nicdb ${PSRDada_LIB}) + +add_executable(dsaX_dbnic dsaX_dbnic.c) +target_link_libraries(dsaX_dbnic ${PSRDada_LIB}) + +add_executable(dsaX_capture dsaX_capture.c) +target_link_libraries(dsaX_capture ${PSRDada_LIB}) + +add_executable(dsaX_capture_thread dsaX_capture_thread.c) +target_link_libraries(dsaX_capture_thread ${PSRDada_LIB}) + +add_executable(dsaX_capture_manythread dsaX_capture_manythread.c) +target_link_libraries(dsaX_capture_manythread ${PSRDada_LIB}) + +add_executable(dsaX_split dsaX_split.c) +target_link_libraries(dsaX_split ${PSRDada_LIB} -lm) + +add_executable(dsaX_merge dsaX_merge.c) +target_link_libraries(dsaX_merge ${PSRDada_LIB}) + +add_executable(dsaX_simplesplit dsaX_simplesplit.c) +target_link_libraries(dsaX_simplesplit 
${PSRDada_LIB}) + +add_executable(dsaX_fake dsaX_fake.c) +target_link_libraries(dsaX_fake ${PSRDada_LIB}) + +add_executable(dsaX_splitup dsaX_splitup.c) +target_link_libraries(dsaX_splitup ${PSRDada_LIB}) + +add_executable(dsaX_copydb dsaX_copydb.c) +target_link_libraries(dsaX_copydb ${PSRDada_LIB}) + +# DMH: fitsio dependency +if(0) + add_executable(dsaX_writevis dsaX_writevis.c) + target_link_libraries(dsaX_writevis ${PSRDada_LIB}) +endif() + +# DMH: XGPU dependencies +add_executable(dsaX_wrangle dsaX_wrangle.c) +target_link_libraries(dsaX_wrangle ${PSRDada_LIB} ${CUDA_nvml_LIBRARY} ${XGPU_LIB}) + +add_executable(dsaX_testdada dsaX_testdada.c) +target_link_libraries(dsaX_testdada ${PSRDada_LIB}) + +add_executable(dsaX_xgpu dsaX_xgpu.cu) +target_link_libraries(dsaX_xgpu ${PSRDada_LIB} ${XGPU_LIB} ${CUDA_nvml_LIBRARY}) + +add_executable(dsaX_cuda_correlator dsaX_cuda_correlator.cu) +target_link_libraries(dsaX_cuda_correlator ${XGPU_LIB} ${CUDA_nvml_LIBRARY} ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) + +add_executable(dsaX_reorder_raw dsaX_reorder_raw.c) +target_link_libraries(dsaX_reorder_raw ${PSRDada_LIB}) + +add_executable(fil2dada fil2dada.c) +target_link_libraries(fil2dada ${PSRDada_LIB}) + +add_executable(dumpfil dumpfil.c) +target_link_libraries(dumpfil ${PSRDada_LIB}) + +add_executable(dsaX_beamformer dsaX_beamformer.cu) +target_link_libraries(dsaX_beamformer ${PSRDada_LIB}) + +add_executable(dsaX_beamformer_passon dsaX_beamformer_passon.cu) +target_link_libraries(dsaX_beamformer_passon ${PSRDada_LIB}) + +add_executable(dsaX_beamformer_offline dsaX_beamformer_offline.cu) +target_link_libraries(dsaX_beamformer_offline ${PSRDada_LIB}) +#------------------------------------------------------ diff --git a/legacy/Makefile b/legacy/Makefile new file mode 100644 index 0000000..0de1991 --- /dev/null +++ b/legacy/Makefile @@ -0,0 +1,208 @@ +# This is set up for the CORR containers + +CC=gcc +CFLAGS1 = -g -O3 -Wall -pthread -march=native -I/usr/local/include 
-I/usr/local/include/src -I/usr/local/cfitsio-3.47/include/ -I/home/ubuntu/proj/dsa110-shell/dsa110-sigproc +CDEPS1=dsaX_def.h dsaX_capture_manythread.h +CDEPS2=dsaX_def.h dsaX_capture.h +LIBS = -L/usr/local/lib -lpsrdada -L/usr/lib/gcc/x86_64-linux-gnu/5 -lgfortran -L/usr/local/cuda/lib64 -lcudart -lcublas -lm -L/usr/local/cfitsio-3.47/lib -lcfitsio -lsigproc -lxgpu + +#LIBS2 = -L/home/ubuntu/PF_RING/userland/libpcap-1.9.1 -lpcap +#CDEPS3=dsaX_def.h dsaX_capture_pcap.h + +CCU=/usr/local/cuda/bin/nvcc -D CUDA -ccbin=g++ +CFLAGS2 = -I/home/ubuntu/proj/dsa110-shell/dsa110-xengine/src -I/home/ubuntu/proj/dsa110-shell/dsa110-xGPU/src -I/usr/local/include -I/usr/local/include/src -I/usr/local/cfitsio-3.47/include -I/home/ubuntu/proj/dsa110-shell/dsa110-sigproc -arch=sm_75 -O3 -Xcompiler="-pthread" -DMATRIX_ORDER_TRIANGULAR -std=c++14 + + +.DEFAULT_GOAL := all + +test_write.o: test_write.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +test_write: test_write.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +test_read.o: test_read.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +test_read: test_read.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dsaX_trigger.o: dsaX_trigger.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_trigger: dsaX_trigger.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dsaX_filTrigger.o: dsaX_filTrigger.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_filTrigger: dsaX_filTrigger.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +splice_offline_beams.o: splice_offline_beams.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +splice_offline_beams: splice_offline_beams.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dsaX_store.o: dsaX_store.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_store: dsaX_store.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dsaX_fluff.o: dsaX_fluff.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_fluff: dsaX_fluff.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dsaX_reorder.o: dsaX_reorder.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_reorder: dsaX_reorder.o + $(CC) -o $@ $^ 
$(CFLAGS1) $(LIBS) + +dsaX_dbnic.o: dsaX_dbnic.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_dbnic: dsaX_dbnic.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dsaX_nicdb.o: dsaX_nicdb.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_nicdb: dsaX_nicdb.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dsaX_capture.o: dsaX_capture.c $(CDEPS2) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_capture: dsaX_capture.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dsaX_capture_thread.o: dsaX_capture_thread.c $(CDEPS2) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_capture_thread: dsaX_capture_thread.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dsaX_capture_manythread.o: dsaX_capture_manythread.c $(CDEPS2) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_capture_manythread: dsaX_capture_manythread.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dsaX_split.o: dsaX_split.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_split: dsaX_split.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dsaX_merge.o: dsaX_merge.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_merge: dsaX_merge.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dsaX_simplesplit.o: dsaX_simplesplit.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_simplesplit: dsaX_simplesplit.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + + +dsaX_fake.o: dsaX_fake.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_fake: dsaX_fake.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dsaX_splitup.o: dsaX_splitup.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_splitup: dsaX_splitup.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dsaX_copydb.o: dsaX_copydb.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_copydb: dsaX_copydb.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dsaX_writevis.o: dsaX_writevis.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_writevis: dsaX_writevis.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dsaX_wrangle.o: dsaX_wrangle.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_wrangle: dsaX_wrangle.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dsaX_testdada.o: dsaX_testdada.c $(CDEPS1) + $(CC) -c -o $@ $< 
$(CFLAGS1) + +dsaX_testdada: dsaX_testdada.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dsaX_writeFil.o: dsaX_writeFil.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_splice.o: dsaX_splice.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_writeFil: dsaX_writeFil.o + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dsaX_reorder_raw.o: dsaX_reorder_raw.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +dsaX_reorder_raw: dsaX_reorder_raw.o $(CDEPS1) + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +fil2dada.o: fil2dada.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +fil2dada: fil2dada.o $(CDEPS1) + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dumpfil.o: dumpfil.c $(CDEPS1) + $(CC) -c -o $@ $< $(CFLAGS1) + +dumpfil: dumpfil.o $(CDEPS1) + $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) + +dsaX_xgpu: dsaX_xgpu.cu + $(CCU) -o $@ $^ $(CFLAGS2) $(LIBS) + +cuda_correlator: cuda_correlator.cu + $(CCU) -o $@ $^ $(CFLAGS2) $(LIBS) + +gpu_flagger: gpu_flagger.cu + $(CCU) -o $@ $^ $(CFLAGS2) $(LIBS) + +dsaX_beamformer: dsaX_beamformer.cu + $(CCU) -o $@ $^ $(CFLAGS2) $(LIBS) + +dsaX_bfCorr: dsaX_bfCorr.cu + $(CCU) -o $@ $^ $(CFLAGS2) $(LIBS) + +dsaX_beamformer_passon: dsaX_beamformer_passon.cu + $(CCU) -o $@ $^ $(CFLAGS2) $(LIBS) + +dsaX_beamformer_offline: dsaX_beamformer_offline.cu + $(CCU) -o $@ $^ $(CFLAGS2) $(LIBS) + +.PHONY: clean all + +clean: + rm -f *.o *~ dsaX_beamformer dsaX_beamformer_passon dsaX_xgpu dsaX_reorder_raw dsaX_writeFil dsaX_writevis dsaX_fake dsaX_capture dsaX_dbnic dsaX_nicdb dsaX_split dsaX_wrangle fil2dada gpu_flagger dumpfil dsaX_simplesplit dsaX_store dsaX_trigger dsaX_beamformer_offline dsaX_splice dsaX_filTrigger cuda_correlator dsaX_copydb dsaX_bfCorr dsaX_merge + +all: dsaX_beamformer dsaX_beamformer_passon dsaX_xgpu dsaX_reorder_raw dsaX_writeFil dsaX_writevis dsaX_fake dsaX_capture dsaX_capture_thread dsaX_capture_manythread dsaX_dbnic dsaX_nicdb dsaX_split dsaX_wrangle fil2dada gpu_flagger dumpfil dsaX_simplesplit dsaX_store dsaX_trigger dsaX_filTrigger dsaX_beamformer_offline dsaX_splice 
dsaX_splitup cuda_correlator dsaX_copydb dsaX_bfCorr dsaX_merge + + + + diff --git a/legacy/correlator_header_dsaX.txt b/legacy/correlator_header_dsaX.txt new file mode 100644 index 0000000..c8b86e9 --- /dev/null +++ b/legacy/correlator_header_dsaX.txt @@ -0,0 +1,38 @@ +ACC_LEN 1 +BANDWIDTH -250 +BW -250 +CFREQ 1405 +CHAN_AV 0 +DEC 00:00:00.000 +DSB 0 +FILE_SIZE 2415919104 +FREQ 1405.000000 +FSCRUNCH 1 +HDR_SIZE 4096 +HDR_VERSION 1.0 +INSTRUMENT DSAX +MODE RAW +NBEAM 1 +NBIT 4 +NCHAN 2048 +NDIM 1 +NPOL 2 +N_PROD 1 +OBSERVER DSA +OBS_OFFSET 0 +OBS_UNIT SECONDS +OBS_VAL 0000.0000 +PID P000 +RA 00:00:00.000 +RECEIVER SANDY +RESOLUTION 4096 +SOURCE TEST +TRANSFER_SIZE 126562550000000 +TELESCOPE DSA-10 +TSAMP 64 +TSCRUNCH 1 +ANTENNAS 1-2-5-3 +NANT 2 +UTC_START 2015-08-07-17:07:28 +FILE_NUMBER 0 + diff --git a/legacy/cuda_correlator b/legacy/cuda_correlator new file mode 100755 index 0000000000000000000000000000000000000000..a8b94c759c2da5b87ab4c1a740138d0ad7d75073 GIT binary patch literal 34272 zcmeHw4SZC^x%cdDLN)<6A)p39IcUJBWk~{r04f`jz(xZ^Aeu^Xv&n9f)y;0Y9|+dk zXuujNx9HmsZr|4O>+J`(Z{MqJZ7)(^h{dn^wzp_&FSoTVwY7KS$1P&(N3G`m|8r(` z&u(_9?Y;N+{(kpmAm^FqKhN{b%yVYuoHMg$&PQ6?*Vt_~p@UsqDaaXfJQ6aGLY(-J zWEIvZN`*(%i1}g)kYfCKWD2Pn)AUVcT2rTzbAT4%uhDJ-jdqWWXgbG)g{HcMq|7f9 zJSu9^&t7S=nxZBWpdQOQwobC3-7=!-eijJf7z-tXA&)E`^-@YNrSvqdQ03QD*N#bzs8E6%Repv1 z;!)+-RG)`M;ZRS*l11Ub!f+^(+`e#oW5dFRCGL3CT`%iR`;=X~zEkF;{b+z=nP=jU zE@ygK{jyj8H2&n#uXg(Gf9=_O=Z@aA^^Z)u41fB(-ThddO%%6JCAJWMb@*G|`s~Z! 
z`SrcGj(ziu%7c4ebv2&aJ?CFO*fr`w=82HN#tp)51Lona@dai){PyzgJ3*i4$pq$??970?fxp}k29VEhPZhAU2=Yd*5%3!e;Qvs-&aZ%1h}ok1 zn(i!XznAzNv3FxOeiZn0T!juOc|G2COaDw2C(AuqcD|zIwS9d(NGbao*GgV#r(Aq( zZnmAPm7Rxk^4+iSF@3|D5`5g4a?lL3D$!Lhln&Rr{fZJXEm0>N0YKNL>{ zW1H7Choh0;W`9pOD15&D!Dz%6Pe8%v6TYjq`8F!K=CD5=55`UPyvodHO)MA`jI8sA z!_i(CB%^JtNk)1Tp=iXE2oClR?NAb(5p8mfKe0L#@yB#ZM|7wewwj}nzEFQM=9l`} zX0P?fax>CKS|fo%NwN^_eiUio>R>Dq4AWQJ`-hV7lpt*$2=?BzF&IyV6T;UQii9M( z(H{v!2b-gVL*d|d;RBqI_=druu3)4;6bVXJI)YZ(B7LU0jlpOPZeQn5#6sJpOqekDmJe?x&X%Dxal(giE_}TMH^Ey&u}~z@Cz*H*wH1BQpm1z(>(6sC{2`+fdIa681(h(5M8UohZL_>tNZkDG~rP|UUMZL6ERx_pb> zOU$^h-d*ornvQ2sN+8(rk9QsXE0X_+nK1{bL#F7vPmMU_-JuXAN)O#r%7Or~sehSR z2@?>Q@*Mgs?~@GtB`&!aUAaWOMFR*-KWa%&o);?RooF$Or_zy})<3cEXGkf6mKkiw zeKu_A`@ET<&{-_|bVw01?Kz}9@`+irZx^$b{WmvsOGCV;tdTMEpvN!XgL6#Qp{k=> zMp-{y1_Fi!S5=d8H5OcDDe+nhuE$y?G+J=${lCY8Tkn6p7F_$Bb~-G$9y^J5S#UTa z9l9-eSsE2$z=BV;;6oOCng!o(!EqweVc3GZ(x?!3S@3cTzSn~5bIas?7JRxze!m5; zwBQFU_zVmFfCcA0x(){|_}Ll+9kbx)Snww;c$Eb|X2EA!@Z%QzJPUrpg3q?#DGRQj z5t)3_f}d}Zf760rV8MmDj6SfV(!8@=bqCqKZ%@e>mW^Is&-E~j zFL`#waYOQVlV=wkuaNv5x>Ueeg*k+$qz}so;#)CC{!i z-XZx5$+OFhdnA7zd3KfYTFF5m_i{PX15f;Yc{u%OY;^X@z{}lQ8AI$Cn~t!#HFY*^Zu&oLXq}rz-|>#V*7UH15A*1~{bfEG2E4nA=JRsp z9o<=71E6l~iI!?P4ud)=`9zWIZsl`#f)V1-!)WB*(O-=IE>(x@N8X$$-`NSdkyP`@ zn~TdoT8;Qb^<#|-Jm?NBFuePUQ53lDnY!0g1{6kN{-r~GZfy_gA)PLe-Zc8eOaHTc zF7>X_c6HZB(r|gA?w{e_0R+JJf?WvSnB6+fjoPuX2e=XCsO|lnRxd+Tzw2m zDL0tipRN8e;w`%$u6_>m@RQXZFdvfwExUJCp8zOL)Qye)v}N?ImeKa=s#M@@7_OL) z;#>v8UdWw<-0mgSw9VGzePXAih|2grMiAIC_LD+|cPiu$fKTnJs+|?{ZecB{oiXDeX^A@=O28>u9Ooe#%>Z>6kjN<@fKDC!N@Fx_&>M z^T>&!Q{^`lFUKJ3Ex(_o-UXXACjkBmf6pQQebDm_2$bK|fpQ_YXI_JiV~6*lNyu^? zRQMj!{6DzsSpCl=IWZ%ytgJMPL)?6diWT$5QUu7MFvkveMF9? 
z-g*<`LQ%DCcTx3n_KjWthz_v(1J%ya<0C1Y0|(Bich7^>!^qHd$<2SHOLj_xweO%; zbVJDX&LhtkQKrl5*jarSOi2HX9!Dd1{8ag+Pv&|GjxksV>9O9UaL{+HPD-JYG_yM6A|sEBegd(kk(X`Bt6R$N zPgFabM*hu~T;+w6&|!X=`Y1f@MJun^y}5cC0Ct_K)FbfgOXp$WINoT8(MJvq;Uvgm z2IIz;q{gj~dK*%@MW*gUKC{D<+S zweEY~(WjaYV(6YYTzkF;L(@_3Xz^3a43T<)6(4!jc*uzhQES~8l^_gXHpv0y@SV8d zOWpta#6;Ki57jEGkjs+UL7Cls9qs-R_{wk066`A1`3%cFNoj8J7DG z1%!qvbT@=#xhrvLk8sz6_B}=GH?9e zEcZv1koBygbca7gp=Tf@%l+xJlsfrl1rBeg)YmArWea-My-N*|dIjv3BL*7N_`MDc z0egA>f8?W-wByVi>O$-=*(1Z1XmBHsI>%pxN~XQmz0x-N^bKBI6-It@vSV}G6JtZ@ zINm275t2NyA8M&{Ve0pN^8V;xKa6#H7j}bqN8WUKM_<@B`yhIZeB7A6^#~ItPC$4k zL9z8HpE5G}9cFr`_x6921|E6W0R!IAlio*CtGu?yy-&ZJI6G}^sjFQi=<&5Gn`mb?mQWI&c%-fdv`tU4k`1`MV~3uJ4&qls$I=w{q+ z;O?Zh1=sk-2@JseYezA=Q{OqoHu}<+Q!&(_Z|^4W=(~sLdfYMZB^mu!OJU6`5XRrL zq2+yOQBN?Q@FfNh4MFnAlTzH-H`?J$?Se7y=&$5m(a}Rp(IfXw&^(;F7@oqwe*wi% zuOGU3zULpt7mp(u8!->>kWRwcwkZynS`C@c}4;Srw=Kq7+K;zr9 z@G$76w$azyMxSlJNe@y)MiHm)8wbs^0ifZ*E=dYi??9-((>01jmg;)Q~G0nPmlI2u%g^{K0;6NyXSwx0ME( zwSf(t>o?QD`s82_M5BFHX=HDT2aJ~1Yu9(KqofRMNM@0lX)+!(w$w_)ea4!$^-b-* zP48ddyrBcKn|4Hc4XiAv{QIIY1B)#Fu+fVmYx_|nEpOV-qebzaWMZo}I~| zCYl%s#%>Pb1o9*{ceXScxdvpWabr_!Ym3Ezxj<$_%yttGB0tqwwizF)y|wPv`db%& z*p$CE80+Eqw8IDow*|vSO(58l>^Gv3@QzD#yBV;UmeEZE(VLCLKq!uC#MvD<`PUqc z#e!k@0t4Lqq=D5b**T1xLx}+by#xLTmiTb4ae}PPK8!UxDPqp>}Au{M*ohOdqgpn+LGU7)*qa zW~{!>H|XCE@x^_{I`1v-gVSQsWd8umDZ#ZpLvdBE%;2rYb&NX+EbP_QVmwL)BZ-hd zYz|Gz12a7`;{E@Wj?FOZzh6=9hv6Jv<9{g~C}2k@yrV#Rt-^V7hIhJ~#F z!JWv}W>g-hTjJK}N6gsV_bd32Ybc$ToL~OYV$dNO1ejUmX0$4VVQhaI6Q;HR0Tl_Kn@{tSDP_ zA<~_uj8oAHMXMAw6s=LTR?$XD;R!`eeVil3oBB3k>f3~=Zxg1zO_=(2rEeFezFq0t zg{f~>`gWynSNe9PpIg7bl;`QqX+qM3M;U5TR1?Zu0~dDk>EEt|{+vD+X5fQK;Yo$m zDoPpVEu9NmrEs2Slz|UKS>a8^9m_&9`q~Z`x{C3q+k1|B-ic}?djTn|4Qzf$4)ensP36s}98@$CxN*CvgBQsMfyJsSUp z!u1`4#($`AJq~O9WrfR1$~u~{Qe3E5Fa291Ex$nFdhFMDi^8?1HO}`GXa__FjrRj^ zq`j#UK|UpMUHT0Epv1M;GB~fBl@&Ty+?%Q-E_T#d;^c83bt`H4wA+V=YlL&+_Td_* zjGBT>)7NRGnkM?fT*RWu;+aNtNsAUa#n~#U=pr*t<)TVUe7YsxVE(4AXjzfGUvDr;Y$~9C 
zQ0dRH=(CyR>i-THd^jop<4XUS($_@SU(;V8L0^-#4Cw)Nsi;sO&k^(1dKy0fj6TUZ zP2!-^@3uHBU-^0cp@=FjVrCdtg&I~o^T)VXGz@ADp98}(O=?5xlBOP$(khnXNR0W) zdlh}7k(M+gtRI7>%!`3`sHf{m%o@DPA(~qCWL;JHtUA`XE_X)HX7fToE=wxv0kz>t zFMX{8Ki$B=687deW=5G!O!(cHoQz|U!dT|6D|7Rg{=tM1AM`C-TITi#0x`_r`r@Hm zf=0uVva&L_`3z;a{ozo5#HcgeJ&nesr-4EflbB;(*(_#$Tt6)=5tW}$FGSUgb0+M$ z3ruwbqHN-BuUF#)zOl@jJ5zq}CD zN`CA|C<|@7D%QBFt}35;vvXLiI&bBri>oiB1?sop?*Q!TdsgDq{{a5>13yfl#WiG) zmbsj*Q=_i!Ev{kP)voPLuAz1$?ZjW|*@=nA*uYo2Dn4#s?W+14hu3B7DO&BS`B?E< zSM8lsnp}+|C0(wQc86<9nX9qMRompMS?w|)3!T+2XA^zPpB(-ee*@1=Ox(tDH@PZC z?Cq|qwUmT>ZNIDHDykUQxvG%3R>rT-j>DjdZ;{5+ajSegGo_u)f4Q#Jjx~OoeAl^T zm!`iTL|bwGVq#(e;_z$5RwU8i-v@s3mlG2upk}={xc1xZe^=(JXmUAQ=pK2levkCy zI4?m08GpB?{q3c{z4Z4wmnW9b*YdnyPXD59lIXzl@!8`|*jLv$opgqt@xU1mobkXJ z51jG984sNCz!?vm@xU1mobkXJ51jG984oBAoSfs4v7uQeJvrN?#}qCTao{69LoQvY z0F86c39B6UsIc*l$gAn+XSAno4lUpyjaac%E=3)>-=)pveMI)9o>-1%qt&I37nT; z&?}jIiI%}T%_FJCs}!%xT`N;FbKN?Byb31kvwfMbDYf6ivE*VLj+ea39r&CQICUKm;e4BkZ*Y>=={(N(n~YaD z%h5GU-eSDU`3kI;{E>0P`4!5%&3KLTFpQP_iSb(JcZt8lc%yR~@pl>blrE(mVI$)$ zeTfM+$#j%1rG{NHU8O%HQ)Igl0^OzD&aGsMjk~uElzyL#)5g*cmD*@+nq(5C95qT@ zHcpgoFI~nmR@#<;87^Hwu^F~DFn5$b!<3n}ZLl*^dMTMI+nr$UD&=rdGS4;yvbU6D zZ^iC>v$cr%IjxSXJ^_R?^8W+)6&jsBjKa_&!$G>1<~F3#@I0nBrUxY>GTzQ+dK! 
z>}elDgeXs?L_e$189=`P<}BGboW*Cu7MjchATDlI=a@^oj-NxM9J@Y&XS9MZUpwYL zkK=Sv$rDvci!N@yXup@d!-eu!)Y96$QsavN2%l6qvi?OXr zPG3OSrqj?T!JS*n#7e3;?m2N=6>Ic+)zOh6Y%|M?gqU>+;9r29bpayh!u)x&FYqv3 zyo_t$Y+I-OW*j#=UbA)1wq0X)BD1Q0g#)w{Wo_pyrztxutb>JFZy-I1h^XRrkSmov zHagp^yC}(3A`S$z9z*0i%(PPpm13x2rg3I!tCLwiIoq}orNBj;?mnO-xupq%ZQe69 z*@2{2Vd?x`gsP-D(@?2{FZY>2S~{wP^u2>^;e~LPv~)hoWm@`0UQ2upkbxzO zgI=I3c@Z*P0V}M7^GR_*k5XR*+^29^w^9dRVw7QSR0*ww&;_?zH0~*+F>2K~rI1Q20jRX21C{Qe|%%5yso7;*yj~`s7Y9=6@NDb`htj-&L+NZ2JfSo%j06Tn>G@{H`*j}Yql%?+ zFer6ApuvZ!KF^dqsKATDknF)JWh|KpJ3-;`Gta7~qeQ|H<*z6M(SLx;7^cg>jUaLsm2w^f!@oQtug zT1=gKB_J49QCz2s&cUd(V(QANZ8LDSD~AY-l90#H?{H0>%48c@=UHjlMu?RvwzzEu z*)nYjMX}l1W>90=Jkd5|5faZ@fC!53wym0F=6CU|wi(9MC9^12UMw6=U5E-W=@L!n zrKNJ2RGyKo%zAmMKD}C6n3++sqpoK!N!uWcrlIYVJ=c;7`&H-L>Sk=Tb)MTb_5HG* zvx-IC3<{n%dWQ|3tV0oJ-+c#OUah)kH5BGhf!S+?^QnMNro~yOxnP8;Yh?0UHS2Ob zW6ZMxc38YHQ@~ZT2wdbA6pgq{S442qsvD3$T1R$@5DT&Zbj6D^0G!7W(=JZ2P1xRc zorR+f2%o5Ii<0r!qEMtaoD2jPVJl=`!xFVGa(^Vbs3(LS6Qu}tFkIB%+dElgsn#~- zlqrEd7lRROJsA$h-Mz?*yC>phv&HTKW{XV|L(zIOp4c%284U-5eTbtkOA$oTl96{P zYDUfd8O^wS?FUhW%X^ZcaAIL7a;2G704k;sny6i0@f>H_itLUP;;JH>WB*6GWIQ3` zm&>@*?O4zSe&t_Er%H#p}gMFUQHvAmankSc0$p3I5u|m(aUUlB4GYUF?hSr}%h}kBnF}5FHFIGH=xv4aK52 zE(*l`b#=81;{(BPIL(Cmk=DB?9>Z3r$WgsYmB4nDePMrpTtwn{1y3g7%|N*^r@%{j zF}%Uaom4mYVnKgE7BR%VKjo>mob-&y5KQ)kqkh!D+$s}$w)PwHAgk3I3&gQ^C-(=% zUYG&Aw=eJlcFfls4JQX9!iU$uh1!jidu8%Oha$f0jW*@poxV5}hycR`?=#i$pSVt6CJc z-YoNClPodGQeif+>gKLq%Z7QElvchJ?bt`HU9Xk488rsB1 zL~SOqbs#(p9-kCFQTmWW`QxLo3?E zb?JOmL?^9L{1kEOZQMvr3b}H5Ole;RmGWssy3-T$&cZft@yeK{2X*Gsjdeifq+4vNp=R>pc5|M8n(ScvE>iNfP`BI_h zF|+Zq^n7PFK6&4BraLezTv6e%ipsbHFZb=O%!j+g{=E3){m^O1feDC%*n=lGI5104 zRg({|6x?kqH#o!$d|x9UJ`*=V`S7#T^Vn8f4snjq^ViwaoY z_gl8=I0ROgppw^ zF^BosX+g}WLtHosBj6Ah6~L<};k?5x60c6=CQJih6O;YSbq6~h?mFgpvQ#*SKzwfw z?pAp2dPxg#8~iz-{LFGNa9p(IIq$asr~Q*T_V1MPliSaifaj~%bHF+4mHT_(e+5eW z+{%lA^=IHTE6X{g%Bj~OE(A__y)Gnw+Y0$6DWB;Vcn*@`T4jg($T5JG;dX^{?>GjH ze@@}tFV2epqr$mGoE3jo;X8Bi*A&iu;H>h~@$kZWaql-PzDVKR@6C#@Q}}QWev`tv 
zshd@v)2X!2ecY@#zu!Qdd$(C}PJ|KXer;C#Hwxz#ZC3mb3g@P5R-99^vYpKKNL*ju zIR8QW+>4C?Zjs@A!1J|})d@;ddv*fk#isU`WKV)+2O1v zCP*iZt3%Sx};{3h^9QzYy71K|8_hn`^4w;4|> zTu*M{G|BKU1?;?5055^TeEC)Y&-Zm<`QB3izhBwWb9UNJmGoQY`YwN~kDL!Gc|DJ) z<&PkeFW+Amz+a{v`)t7t46Wg<0`i#F5|wtB7*_WWmnfy##H|KYd?^OfO1vzm*R!=q z8{~Pu^mHV$lVL*vJA(!AeFgAu6u^Heaa>EHNZKZ3_{-=rslK-U9et1@O;E+=ba%k2D@sS$wsC{KEzC7YpFUa0vaQ zmlm}D%YhpzFDtnWINLdQD`n8*{7Q*SDC=-t0eo8l`@0I@pD$qNn+hM!ectJfCE^Ke z8iolrpYN*XjlTA_O`GxQC2aYiKD>ktIDPnbP9(_B?gXMf^DQ}F0Fza5pFg=>a38pE z5c|5hmn~h+38_3n^7}u5YhzFhAOq`O=g5 znWxS~D9n8&+v{sh0ek?eISEBCX0`Z9Dx@soj8zEJm04etY6_c*bxToxf1*Whl(?Dm zb?@b+!S+czEoL^fY|q`eGW{aqq?GKJ3foV+rzKKPv#BLgPqV8fQctt3B~nkbuO(7X zv#}*oPqVWnQctt*2vSe8w`F_o=9ZIu-k#px(zmvK!)j~~>BD@cuMHE$ocFwbQ=+}m z=L=yQK73XxYcT{{ncjq*z>+;^ylXZ#t!wqQu5aNd%Va0gaeQ9V*XmUQ-jThagOPQ-L8H``0;$DIBi(&e6R)u$9RTJTundNu1CY#}RzvN7ejGvD~j`Szc@8s+;wVLlIKeGM;@6&rkJ z_3`xIW=t1IevFW9onuD+%%rSzmmEto8Jg!c`!kRHBBZ|E$od|n`9Z!M<;ni!cLtS^ z@+Y=i)Vnk4-5S&TH)1g`7#qS?X;^Y|H?3}4nDF;24$I|kE=2_vh62J(ZorRqNOxdI z1oljdl~9?Y_7=!ud`ROyuvAd7p>RTAM@S5cZbaSvQAD@{ns7@mx?@rK0YU72#hVD+ zcW@6kvr}&h_7CD_Nw|Xp>IP{bfDANenl&#WrZpYEu@_<}f`2g7i}FPi%o+%9h1~d1 zp};maD1I0s>{F6?vaxF6u{r*G2k&y2Hd7+h4tX3;!o?R>>hCZ-MJ*!hEw zetKRNI3Cr~`g*u#uj@EjD1;kLk}kiN)AS~!^S9@k z*XuvqmHvLE&+@Y>y8H?joYy9<_4PVYm70LARf4(x)Aiq}^j9l6{rh>Z($~MYr#@!) 
z()nxszX8TJ#qu%e_YF>#n>=-_L7k2?G>Vv2U$0M9RG1>ex?)O0!FaYd!zVzj`g$E} zubLFe^;a(cxg7odl^$t$ztYD~zp{ePSA&0th&6vbAN94lri5NL)$+OZ|9hpctEtz| z4l4cboC4>t;_Es3dY!GR#>`QE=}u=3nydeCj=o-x8(wKD>GP~j3Qj~b>#4qH}%b#A)>@YR>V@+$&dYbZcDpq|x57wd9?{zt~4XvknzPUh5*I&=C zbt`@TgxneoD}(r4m49sxuJxDZ;M$y~HA+86Ong%3pOc{VG|ZO>axtboru09ct|MAL zSC&8C&&4P^@BOAmSx)=UmA$5b{$*QC{T(?9Ijq>K^mG09#SfVJpU@}9WYcBUcKCZ- z9akcfGymN{u#cO&^ jtu78O2p8jzr&PCJJ+5$k&zJx32a$kqTq~IDq`Lk)d9f~T literal 0 HcmV?d00001 diff --git a/legacy/dsaX_beamformer.cu b/legacy/dsaX_beamformer.cu new file mode 100644 index 0000000..afdda70 --- /dev/null +++ b/legacy/dsaX_beamformer.cu @@ -0,0 +1,1128 @@ +// -*- c++ -*- +/* will implement the 64-input beamformer + +does N beams of 256 + +order is (taking time as 8x 8.192e-6) +[2048 time, 63 antennas, 768 channels, 2 pol, r/i] +Load in 16 times at a time, so that we have (in units of what needs to be added) +[16 time, 63 antennas, 96 channels, 8 chunnels, 2 pol, r/i] + +This should be reordered on the cpu to +[16 time, 96 channels, 63 antennas, 8 chunnels, 2 pol, r/i] + +The first kernel, launched with 1536 blocks of 64 threads, needs to + - promote each measurement and store in shared mem, parallelizing over ants. need only 8 kB. + - each thread processes 4 beams, adding everything. for each beam, + + for each chunnel and pol, calculate weights using cal weights and ant positions, + + add everything into output array +Output array has order [beam, 96 frequency, 16 time] + +Shared mem requirement: 8 kB for promoted data, 512b for positions, nch*1024b for weights + +Initialy we start with 4-bit numbers. these are first rotated using 17-bit weights, yielding 22-bit numbers. +these are then added: (64 ant)^2 * (2 complex) * (32 chan) * (2 pol) * (16 time). +after adding by 64 ants, we have 28-bit numbers. Need to bit shift right by 19 after adding 64 ants. This will yield 29-bit numbers. Need to bit shift right by 21 to pick off lowest 8 bits. 
+ +Do everything in floating point until second kernel. + +Second kernel will simply add times and adjacent channels and pick leading 8 bits +Then copy back to specific locations in host to form final [beam, time, frequency] array, to be sent to corner turn. + + */ + +#define THRUST_IGNORE_CUB_VERSION_CHECK + +#include +#include +using std::cout; +using std::cerr; +using std::endl; +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "cuda_fp16.h" +//#include "dada_cuda.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "multilog.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_def.h" +#include +#include + +#include +using namespace nvcuda; + +// global variables +int DEBUG = 0; +const float sep = 1.0; + +// kernel for summing for online bp +// input array has order [beam, 48 frequency, 2 pol, 16 time] +// need to output to [beam, 48 frequency] +// run with 256*48=12288 blocks and 32 threads +__global__ +void badder(float *input, float *output) { + + // get block and thread ids + int bidx = blockIdx.x; // assume 256*48=12288 + int tidx = threadIdx.x; // assume 32 + //int fidx = 2*(bidx % 24); + int beamidx = (int)(bidx / 48); + + // declare shared mem + volatile __shared__ float data[32]; // data block to be summed + + // transfer from input to shared mem + data[tidx] = input[bidx*32+tidx]; + + // sync + __syncthreads(); + + // complete sum + if (tidx<16) { + data[tidx] += data[tidx+16]; // over pols + data[tidx] += data[tidx+8]; + data[tidx] += data[tidx+4]; + data[tidx] += data[tidx+2]; + data[tidx] += data[tidx+1]; + } + // now data[0] holds the full 32-element sum (both pols, all 16 times) 
+ + __syncthreads(); + + // store + if (tidx == 0) + output[bidx] += data[0]; + +} + + +// kernel for summing and requantizing +// input array has order [beam, 48 frequency, 2 pol, 16 time] +// need to output to [4 time, beam, 48 frequency] +// bp is scale factor for each beam +// run with 256*48=12288 blocks and 32 threads +__global__ +void adder(float *input, unsigned char *output, float *bp) { + + // get block and thread ids + int bidx = blockIdx.x; // assume 256*48=12288 + int tidx = threadIdx.x; // assume 32 + //int fidx = 2*(bidx % 24); + int beamidx = (int)(bidx / 48); + + // declare shared mem + volatile __shared__ float data[32]; // data block to be summed + + // transfer from input to shared mem + data[tidx] = input[bidx*32+tidx]; + + // sync + __syncthreads(); + + // complete sum + if (tidx<16) { + data[tidx] += data[tidx+16]; // over pols + data[tidx] += data[tidx+2]; + data[tidx] += data[tidx+1]; + } + // now tidx = 0, 4, 8, 12 are what we want! + + __syncthreads(); + + // store + if (tidx == 0) + output[bidx] = (unsigned char)(__float2int_rn(data[0]*bp[beamidx])/2); + if (tidx == 4) + output[bidx + 12288] = (unsigned char)(__float2int_rn(data[4]*bp[beamidx])/2); + if (tidx == 8) + output[bidx + 2*12288] = (unsigned char)(__float2int_rn(data[8]*bp[beamidx])/2); + if (tidx == 12) + output[bidx + 3*12288] = (unsigned char)(__float2int_rn(data[12]*bp[beamidx])/2); + +} + +// kernel for promotion +/* +orig input is [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i] +input is [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i] +output needs to be [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] +promoted to half precision + +launch with 16*48*NANT blocks of 32 threads + + */ +__global__ void promoter(char *input, half *inr, half *ini) { + + int bidx = blockIdx.x; // assume 16*48*NANT + int tidx = threadIdx.x; // assume 32 + int iidx = bidx*32+tidx; + int pol = (int)(tidx % 2); + int chunnel = (int)(tidx / 2); + + /*int ant 
= (int)(bidx % NANT); + int time_chan = (int)(bidx / NANT); + int oidx = time_chan*2048+pol*1024+ant*16+chunnel;*/ + + int chan = (int)(bidx % 48); + int time_ant = (int)(bidx / 48); + int tim = (int)(time_ant / NANT); + int ant = (int)(time_ant % NANT); + int oidx = tim*98304 + chan*2048 + pol*1024 + ant*16 + chunnel; + + //inr[oidx] = __float2half((float)(((char)((input[iidx] & 15) << 4)) >> 4)); + //ini[oidx] = __float2half((float)(((char)((input[iidx] & 240))) >> 4)); + inr[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(15)) << 4) >> 4)); + ini[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4)); + +} + +// 16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels +// for first time, launch with 3072, 32 +__global__ void printer(half *inr, half *ini) { + + int idx = blockIdx.x*32+threadIdx.x; + float ir = __half2float(inr[idx]); + float ii = __half2float(ini[idx]); + + int chunnel = (int)(threadIdx.x % 16); + int channel = (int)(blockIdx.x/64); + int tt = (int)(blockIdx.x % 64); + int pol = (int)(tt/32); + int ant = ((int)(tt % 32))*((int)(threadIdx.x / 16)); + + if (ir!=0. || ii!=0.) { + printf("%d %d %d %d %f %f\n",channel,pol,ant,chunnel,ir,ii); + } + +} + +// 16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels +// launch with 4,32 +__global__ void rms_printer(half *inr, half *ini) { + + int idx = blockIdx.x*32+threadIdx.x; + int pol = (int)(idx / 64); + int ant = (int)(idx % 64); + + float rms = 0., val; + for (int i=0;i<16;i++) { + + idx = 786432 + 49152 + pol*64*16 + ant*16 + i; + + val = __half2float(inr[idx]); + rms += val*val; + val = __half2float(ini[idx]); + rms += val*val; + + } + rms = sqrt(rms/32.); + + printf("ANTPOL_RMS %d %d %f\n",ant,pol,rms); + +} + + + +// kernel for beamforming +/* + +Assumes that up to NANT antennas (nominally 63) are populated. + +Input is [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] (promoted) + +Arithmetic... 
for rotation, d2r = wr*dr-wi*di; d2i = wi*dr+wr*di + +Conventions for beamforming. beam 0 is furthest East, beam 127 is at meridian. antpos (D) is easting. +for bf weight calculation, where theta = s(127-n), ang = 2*pi*nu*theta*D/c; wr = cos(ang), wi = sin(ang) +use __float2int_rn, cosf, sinf intrinsics. + +Each warp (==block) has to deal with 256 beams for 64 ants, summing over 16 chunnels and pols. +Do it in tiles of 16 beams and 16 ants for + +Output array has order [beam, 48 frequency, 2 pol, 16 time] + +inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag +wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant] + +launch with 16time * 48freq * 2pol * 16beam_tile blocks of 32 threads for massive utilization + = 24576 blocks + +*/ +__global__ void beamformer(half *inr, half *ini, half *wr, half *wi, float *output, int stuffants) { + + // get block and thread ids + int bidx = blockIdx.x; // assume 24576 + int tidx = threadIdx.x; // assume 32 + int orig_bidx = (int)(bidx / 16); + int beam_tile = (int)(bidx % 16); + int stuff_tile = (int)(beam_tile % 4); + int data_offset = orig_bidx*1024; // offset for first part of data + int weight_offset = (int)(orig_bidx % 96); // offset for first part of weight + weight_offset *= 16384; + int idx1, idx2; + int f_idx = (int)(orig_bidx % 96); + int tim_idx = (int)(orig_bidx / 96); + int oidx = f_idx*16 + tim_idx; + + // shared memory for convenience + __shared__ half summr[16][16]; // beam, chunnel + __shared__ float summi[16][16]; // beam, chunnel + + // accumulate real and imag parts into [16 beam x 16 f] fragments + // Declare the fragments. 
+ wmma::fragment a_frag; + wmma::fragment b_frag; + wmma::fragment wr_inr_frag; + wmma::fragment wr_ini_frag; + wmma::fragment wi_inr_frag; + wmma::fragment wi_ini_frag; + wmma::fragment ib_frag; + wmma::fragment final_frag; + + + // zero out accumulators + wmma::fill_fragment(wr_inr_frag, 0.0f); + wmma::fill_fragment(wr_ini_frag, 0.0f); + wmma::fill_fragment(wi_inr_frag, 0.0f); + wmma::fill_fragment(wi_ini_frag, 0.0f); + wmma::fill_fragment(ib_frag, 0.0f); + + // IB + if (stuffants==2) { + + wmma::fragment c_frag; + wmma::fragment d_frag; + + for (int ant_tile=0; ant_tile<4; ant_tile++) { + + wmma::load_matrix_sync(c_frag, inr + data_offset + ant_tile*256, 16); + wmma::load_matrix_sync(d_frag, inr + data_offset + ant_tile*256, 16); + wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag); + wmma::load_matrix_sync(c_frag, ini + data_offset + ant_tile*256, 16); + wmma::load_matrix_sync(d_frag, ini + data_offset + ant_tile*256, 16); + wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag); + + } + + } + + // one ant per beam + if (stuffants==1) { + + wmma::fragment c_frag; + wmma::fragment d_frag; + wmma::load_matrix_sync(c_frag, inr + data_offset + stuff_tile*256, 16); + wmma::load_matrix_sync(d_frag, inr + data_offset + stuff_tile*256, 16); + wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag); + wmma::load_matrix_sync(c_frag, ini + data_offset + stuff_tile*256, 16); + wmma::load_matrix_sync(d_frag, ini + data_offset + stuff_tile*256, 16); + wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag); + + } + if (stuffants!=1) { + + // loop over ant tiles + for (int ant_tile=0; ant_tile<4; ant_tile++) { + + // copy weight and data to fragments, and multiply to accumulators + + wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16); + wmma::load_matrix_sync(b_frag, inr + data_offset + ant_tile*256, 16); + wmma::mma_sync(wr_inr_frag, a_frag, b_frag, wr_inr_frag); + + wmma::load_matrix_sync(a_frag, wi + weight_offset + beam_tile*1024 + 
ant_tile*256, 16); + wmma::mma_sync(wi_inr_frag, a_frag, b_frag, wi_inr_frag); + + wmma::load_matrix_sync(b_frag, ini + data_offset + ant_tile*256, 16); + wmma::mma_sync(wi_ini_frag, a_frag, b_frag, wi_ini_frag); + + wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16); + wmma::mma_sync(wr_ini_frag, a_frag, b_frag, wr_ini_frag); + + } + + // form real and imaginary matrices + for(int i=0; i < wr_inr_frag.num_elements; i++) { + wr_inr_frag.x[i] = wr_inr_frag.x[i] - wi_ini_frag.x[i]; // output real + wi_inr_frag.x[i] = wi_inr_frag.x[i] + wr_ini_frag.x[i]; // output imag + wr_inr_frag.x[i] = wr_inr_frag.x[i]*wr_inr_frag.x[i] + wi_inr_frag.x[i]*wi_inr_frag.x[i]; // squared + } + } + + // at this stage the matrices are [beam, chunnel], and need to be summed over columns + + __syncthreads(); + + // copy back to shared mem + half *p1; + float *p2, tmp; + p1 = &summr[0][0]; + wmma::store_matrix_sync(p1, wr_inr_frag, 16, wmma::mem_row_major); + + __syncthreads(); + + if (stuffants!=1) { + + // now do thread reduction using multiplication by unity + wmma::fill_fragment(final_frag, 0.0f); + wmma::fill_fragment(b_frag, 1.0f); + wmma::load_matrix_sync(a_frag, p1, 16); + wmma::mma_sync(final_frag, a_frag, b_frag, final_frag); + p2 = &summi[0][0]; + wmma::store_matrix_sync(p2, final_frag, 16, wmma::mem_row_major); + + __syncthreads(); + + // store + if (tidx<16) { + output[(beam_tile*16+tidx)*1536 + oidx] = summi[tidx][tidx]; + } + + + } + + if (stuffants==1) { + if (tidx<16) { + output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][tidx]; + } + } + if (stuffants==2) { + + p2 = &summi[0][0]; + wmma::store_matrix_sync(p2, ib_frag, 16, wmma::mem_row_major); + tmp = 0.; + for (int i=0;i<16;i++) tmp += summi[i][i]; + if (tidx==0 && beam_tile==0) + output[(beam_tile*16+tidx)*1536 + oidx] = tmp; + + } + +} + +// kernel to calculate weights - needed because weights are halfs +// launch with 256 threads in 6144 blocks +__global__ +void calc_weights(float 
*antpos, float *weights, float *freqs, half *wr, half *wi) { + + // assume 256 threads in 6144 blocks + int bidx = blockIdx.x; // over 48f, 2pol, 16 beam_tile, 4 ant_tile + int tidx = threadIdx.x; + int f = (int)(bidx / 128); + int cc = (int)(bidx % 128); + int pol = (int)(cc / 64); + cc = (int)(cc % 64); + int beam_tile = (int)(cc / 4); + int ant_tile = (int)(cc % 4); + int beam_i = (int)(tidx / 16); + int ant_i = (int)(tidx % 16); + + int beam = beam_tile*16+beam_i; + int ant = ant_tile*16+ant_i; + int i = bidx*256+tidx; + int widx = ant*NW*2*2 + f*2*2 + pol*2; + + float theta = sep*(127.-beam*1.)*PI/10800.; // radians + float afac = -2.*PI*freqs[f*8+4]*theta/CVAC; // factor for rotate + float twr = cos(afac*antpos[ant]); + float twi = sin(afac*antpos[ant]); + + wr[i] = __float2half((twr*weights[widx] - twi*weights[widx+1])); + wi[i] = __float2half((twi*weights[widx] + twr*weights[widx+1])); + + +} + + +// function prototypes +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); +int dada_bind_thread_to_core (int core); +int init_weights(char *fnam, float *antpos, float *weights, char *flagants); +void reorder_block(char *block); +void calc_bp(float *data, float *bp, int pr); +void calc_allbp(float *data, float *bp); +void ret_med_bp(float *bp); +void ret_many_bp(float *many_bp, float *bp); + +// performs massive summation to calculate bp +// input is read contiguously as [256 beam, 48 frequency, 32 samples] (presumably 2 pol x 16 time - TODO confirm) +// bp has one entry per beam (256); each sum is added to the existing contents of bp[beam] +// nothing is returned - bp is updated in place; pr!=0 prints every non-zero input sample +void calc_bp(float *data, float *bp, int pr) { + + int i=0; + + for (int b=0;b<256;b++) { + for (int f=0;f<48;f++) { + for (int a=0;a<32;a++) { + bp[b] += data[i]; + if (pr && data[i]!=0.) 
printf("%d %d %d %f\n",b,f,a,data[i]); + i++; + } + } + } + +} + +void calc_allbp(float *data, float *bp) { + + int i=0; + + for (int st=0;st *(const float*)elem2; +} + +void ret_med_bp(float *bp) { + + qsort(bp, 256, sizeof(float), cmpfunc); + float medval = 0.5*(bp[127]+bp[128]); + for (int i=0;i<256;i++) + bp[i] = medval; + +} + +void ret_many_bp(float *many_bp, float *bp, float medbp) { + + for (int i=0;i<256;i++) { + bp[i] = 0.; + for (int j=0;j0.1) + bp[i] = medbp; + } + +} + +// performs cpu reorder of block to be loaded to GPU +void reorder_block(char * block) { + + // from [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i] + // to [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i] + // 24576*NANT in total. 1536*NANT per time + + char * output = (char *)malloc(sizeof(char)*24576*NANT); + + for (int i=0;i<16;i++) { // over time + for (int j=0;j= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (0); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + 
return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); + uint64_t bytes_read = 0; + int nints = NPACKETS / 16; + uint64_t nbytes_per_int = block_size / nints; + uint64_t nbytes_per_out = block_out / nints; + char * block; + unsigned char * output_buffer; + output_buffer = (unsigned char *)malloc(sizeof(unsigned char)*block_out); + memset(output_buffer,0,block_out); + uint64_t written, block_id; + + // allocate host and device memory for calculations + //inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag + //wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant] + char *d_indata[NSTREAMS]; + unsigned char *d_outdata[NSTREAMS]; + float *d_transfer[NSTREAMS], *d_bp, *d_antpos, *d_weights, *d_freqs; + half *d_wr, *d_wi, *d_inr[NSTREAMS], *d_ini[NSTREAMS]; + float *d_added[NSTREAMS], *h_added; + h_added = (float *)malloc(sizeof(float)*256*48*NSTREAMS); + cudaMalloc((void **)&d_antpos, 64*sizeof(float)); // ant 
positions + cudaMalloc((void **)&d_weights, 64*NW*2*2*sizeof(float)); // weights + cudaMalloc((void **)&d_freqs, 384*sizeof(float)); // freqs + cudaMalloc((void **)&d_bp, 256*sizeof(float)); // bandpass + cudaMalloc((void **)&d_wr, 48*2*16*4*16*16*sizeof(half)); // real weight + cudaMalloc((void **)&d_wi, 48*2*16*4*16*16*sizeof(half)); // imag weight + cudaMemcpy(d_freqs, freqs, 384*sizeof(float), cudaMemcpyHostToDevice); + + float *h_transfer = (float *)malloc(sizeof(float)*256*96*16*NSTREAMS); + char *h_indata = (char *)malloc(sizeof(char)*16*NANT*96*8*2); + float *bp = (float *)malloc(sizeof(float)*256); + float *frozen_bp = (float *)malloc(sizeof(float)*256); + float *many_bp = (float *)malloc(sizeof(float)*256*NBP); + int bpctr = 0; + float medbp; + unsigned char *tmp_buf = (unsigned char *)malloc(sizeof(unsigned char)*256*48*4*NSTREAMS); + + // streams and device + cudaStream_t stream[NSTREAMS]; + for (int st=0;st d1(d_inr[st]); + thrust::fill(d1, d1+16*48*2*64*16, 0.0); + thrust::device_ptr d2(d_ini[st]); + thrust::fill(d2, d2+16*48*2*64*16, 0.0); + } + + + + // set up + + int observation_complete=0; + int blocks = 0, started = 0; + int blockct = 0; + int slow_down = 0; + int prestart = 0; + + syslog(LOG_INFO, "starting observation"); + + while (!observation_complete) { + + // open block + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + blockct ++; + + // DO STUFF + + // calc weights + init_weights(fnam,antpos,weights,flagants); + cudaMemcpy(d_antpos, antpos, 64*sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_weights, weights, 64*NW*2*2*sizeof(float), cudaMemcpyHostToDevice); + calc_weights<<<6144, 256>>>(d_antpos, d_weights, d_freqs, d_wr, d_wi); + if (DEBUG) syslog(LOG_INFO,"Finished with weights"); + + // zero out d_added + for (int st=0;st>>(d_indata[st], d_inr[st], d_ini[st]); + + // do printing if needed + if (bst==0 && slow_down==0) + rms_printer<<<4, 32, 0, stream[st]>>>(d_inr[st], d_ini[st]); + + // run 
beamformer kernel + beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants); + + // run badder kernel + badder<<<12288, 32, 0, stream[st]>>>(d_transfer[st], d_added[st]); + + // if sufficient bandpasses... + if (started>0) { + + // run adder kernel + adder<<<12288, 32, 0, stream[st]>>>(d_transfer[st], d_outdata[st], d_bp); + + // copy to host + cudaMemcpyAsync(tmp_buf + 256*48*4*st, d_outdata[st], 256*48*4*sizeof(unsigned char), cudaMemcpyDeviceToHost, stream[st]); + + // copy to output + for (int j=0;j<12288*4;j++) { + if (test_pattern) + output_buffer[(bst*NSTREAMS+st)*12288*4+j] = (unsigned char)((blockct % 128) + 32); + else + output_buffer[(bst*NSTREAMS+st)*12288*4+j] = tmp_buf[j+256*48*4*st]; + } + if (DEBUG && bst*NSTREAMS+st==10) { + for (int j=0;j<48;j++) syslog(LOG_DEBUG,"%hu",output_buffer[(bst*NSTREAMS+st)*12288+BEAM_OUT*48+j]); + } + + } + + } + } + + // now deal with bandpass + + // copy to host + for (int st=0;st0 && bpctr0 && bpctr>=NBP) { + + //syslog(LOG_INFO,"now using many BPs for requant"); + + // do average bp + ret_many_bp(many_bp,bp,medbp); + + started=2; + + } + + + + // finally deal with bp + for (int i=0;i<256;i++) { + + if (AGC==0) + for (int i=0;i<256;i++) bp[i] = frozen_bp[i]; + + if (bpctr<15) syslog(LOG_INFO,"coeff %d %d %g",bpctr,i,bp[i]); + if (bp[i]!=0.) 
{ + bp[i] /= 48.*nints; + bp[i] = 2.5*128./bp[i]; + } + } + cudaMemcpy(d_bp, bp, sizeof(float)*256, cudaMemcpyHostToDevice); + + bpctr++; + slow_down++; + if (slow_down>=20) slow_down=0; + + // write to output + written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out); + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + if (DEBUG) { + syslog(LOG_DEBUG, "written block %d",blocks); + } + blocks++; + + + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + for (int st=0;st +#include +using std::cout; +using std::cerr; +using std::endl; +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "cuda_fp16.h" +//#include "dada_cuda.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "multilog.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_def.h" +#include +#include + +#include +using namespace nvcuda; + +// global variables +int DEBUG = 0; + + +// kernel for summing and requantizing +// input array has order [beam, 48 frequency, 2 pol, 16 time] +// need to output to [4 time, beam, 48 frequency] +// bp is scale factor for each beam +// run with 256*48=12288 blocks and 32 threads +__global__ +void adder(float *input, unsigned char *output, float *bp) { + + // get block and thread ids + int bidx = blockIdx.x; // assume 256*48=12288 + int tidx = threadIdx.x; // assume 32 + //int fidx = 2*(bidx % 24); + int beamidx = (int)(bidx / 48); + + // declare shared mem + volatile __shared__ float data[32]; // data block to be summed + + // transfer from input to shared mem + data[tidx] = input[bidx*32+tidx]; + + // sync + __syncthreads(); + + // complete sum + if (tidx<16) { 
+ data[tidx] += data[tidx+16]; // over pols + data[tidx] += data[tidx+2]; + data[tidx] += data[tidx+1]; + } + // now tidx = 0, 4, 8, 12 are what we want! + + __syncthreads(); + + // store + if (tidx == 0) + output[bidx] = (unsigned char)(__float2int_rn(data[0]*bp[beamidx])/2); + if (tidx == 4) + output[bidx + 12288] = (unsigned char)(__float2int_rn(data[4]*bp[beamidx])/2); + if (tidx == 8) + output[bidx + 2*12288] = (unsigned char)(__float2int_rn(data[8]*bp[beamidx])/2); + if (tidx == 12) + output[bidx + 3*12288] = (unsigned char)(__float2int_rn(data[12]*bp[beamidx])/2); + +} + +// kernel for promotion +/* +orig input is [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i] +input is [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i] +output needs to be [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] +promoted to half precision + +launch with 16*48*NANT blocks of 32 threads + + */ +__global__ void promoter(char *input, half *inr, half *ini) { + + int bidx = blockIdx.x; // assume 16*48*NANT + int tidx = threadIdx.x; // assume 32 + int iidx = bidx*32+tidx; + int pol = (int)(tidx % 2); + int chunnel = (int)(tidx / 2); + + /*int ant = (int)(bidx % NANT); + int time_chan = (int)(bidx / NANT); + int oidx = time_chan*2048+pol*1024+ant*16+chunnel;*/ + + int chan = (int)(bidx % 48); + int time_ant = (int)(bidx / 48); + int tim = (int)(time_ant / NANT); + int ant = (int)(time_ant % NANT); + int oidx = tim*98304 + chan*2048 + pol*1024 + ant*16 + chunnel; + + //inr[oidx] = __float2half((float)(((char)((input[iidx] & 15) << 4)) >> 4)); + //ini[oidx] = __float2half((float)(((char)((input[iidx] & 240))) >> 4)); + inr[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(15)) << 4) >> 4)); + ini[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4)); + +} + +// 16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels +// for first time, launch with 3072, 32 +__global__ void 
printer(half *inr, half *ini) { + + int idx = blockIdx.x*32+threadIdx.x; + float ir = __half2float(inr[idx]); + float ii = __half2float(ini[idx]); + + int chunnel = (int)(threadIdx.x % 16); + int channel = (int)(blockIdx.x/64); + int tt = (int)(blockIdx.x % 64); + int pol = (int)(tt/32); + int ant = ((int)(tt % 32))*((int)(threadIdx.x / 16)); + + if (ir!=0. || ii!=0.) { + printf("%d %d %d %d %f %f\n",channel,pol,ant,chunnel,ir,ii); + } + +} + + +// kernel for beamforming +/* + +Assumes that up to NANT antennas (nominally 63) are populated. + +Input is [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] (promoted) + +Arithmetic... for rotation, d2r = wr*dr-wi*di; d2i = wi*dr+wr*di + +Conventions for beamforming. beam 0 is furthest East, beam 127 is at meridian. antpos (D) is easting. +for bf weight calculation, where theta = s(127-n), ang = 2*pi*nu*theta*D/c; wr = cos(ang), wi = sin(ang) +use __float2int_rn, cosf, sinf intrinsics. + +Each warp (==block) has to deal with 256 beams for 64 ants, summing over 16 chunnels and pols. 
+Do it in tiles of 16 beams and 16 ants for + +Output array has order [beam, 48 frequency, 2 pol, 16 time] + +inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag +wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant] + +launch with 16time * 48freq * 2pol * 16beam_tile blocks of 32 threads for massive utilization + = 24576 blocks + +*/ +__global__ void beamformer(half *inr, half *ini, half *wr, half *wi, float *output, int stuffants) { + + // get block and thread ids + int bidx = blockIdx.x; // assume 24576 + int tidx = threadIdx.x; // assume 32 + int orig_bidx = (int)(bidx / 16); + int beam_tile = (int)(bidx % 16); + int stuff_tile = (int)(beam_tile % 4); + int data_offset = orig_bidx*1024; // offset for first part of data + int weight_offset = (int)(orig_bidx % 96); // offset for first part of weight + weight_offset *= 16384; + int idx1, idx2; + int f_idx = (int)(orig_bidx % 96); + int tim_idx = (int)(orig_bidx / 96); + int oidx = f_idx*16 + tim_idx; + + // shared memory for convenience + __shared__ half summr[16][16]; // beam, chunnel + __shared__ float summi[16][16]; // beam, chunnel + + // accumulate real and imag parts into [16 beam x 16 f] fragments + // Declare the fragments. 
+ wmma::fragment a_frag; + wmma::fragment b_frag; + wmma::fragment wr_inr_frag; + wmma::fragment wr_ini_frag; + wmma::fragment wi_inr_frag; + wmma::fragment wi_ini_frag; + wmma::fragment ib_frag; + wmma::fragment final_frag; + + + // zero out accumulators + wmma::fill_fragment(wr_inr_frag, 0.0f); + wmma::fill_fragment(wr_ini_frag, 0.0f); + wmma::fill_fragment(wi_inr_frag, 0.0f); + wmma::fill_fragment(wi_ini_frag, 0.0f); + wmma::fill_fragment(ib_frag, 0.0f); + + // IB + if (stuffants==2) { + + wmma::fragment c_frag; + wmma::fragment d_frag; + + for (int ant_tile=0; ant_tile<4; ant_tile++) { + + wmma::load_matrix_sync(c_frag, inr + data_offset + ant_tile*256, 16); + wmma::load_matrix_sync(d_frag, inr + data_offset + ant_tile*256, 16); + wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag); + wmma::load_matrix_sync(c_frag, ini + data_offset + ant_tile*256, 16); + wmma::load_matrix_sync(d_frag, ini + data_offset + ant_tile*256, 16); + wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag); + + } + + } + + // one ant per beam + if (stuffants==1) { + + wmma::fragment c_frag; + wmma::fragment d_frag; + wmma::load_matrix_sync(c_frag, inr + data_offset + stuff_tile*256, 16); + wmma::load_matrix_sync(d_frag, inr + data_offset + stuff_tile*256, 16); + wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag); + wmma::load_matrix_sync(c_frag, ini + data_offset + stuff_tile*256, 16); + wmma::load_matrix_sync(d_frag, ini + data_offset + stuff_tile*256, 16); + wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag); + + } + if (stuffants!=1) { + + // loop over ant tiles + for (int ant_tile=0; ant_tile<4; ant_tile++) { + + // copy weight and data to fragments, and multiply to accumulators + + wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16); + wmma::load_matrix_sync(b_frag, inr + data_offset + ant_tile*256, 16); + wmma::mma_sync(wr_inr_frag, a_frag, b_frag, wr_inr_frag); + + wmma::load_matrix_sync(a_frag, wi + weight_offset + beam_tile*1024 + 
ant_tile*256, 16); + wmma::mma_sync(wi_inr_frag, a_frag, b_frag, wi_inr_frag); + + wmma::load_matrix_sync(b_frag, ini + data_offset + ant_tile*256, 16); + wmma::mma_sync(wi_ini_frag, a_frag, b_frag, wi_ini_frag); + + wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16); + wmma::mma_sync(wr_ini_frag, a_frag, b_frag, wr_ini_frag); + + } + + // form real and imaginary matrices + for(int i=0; i < wr_inr_frag.num_elements; i++) { + wr_inr_frag.x[i] = wr_inr_frag.x[i] - wi_ini_frag.x[i]; // output real + wi_inr_frag.x[i] = wi_inr_frag.x[i] + wr_ini_frag.x[i]; // output imag + wr_inr_frag.x[i] = wr_inr_frag.x[i]*wr_inr_frag.x[i] + wi_inr_frag.x[i]*wi_inr_frag.x[i]; // squared + } + } + + // at this stage the matrices are [beam, chunnel], and need to be summed over columns + + __syncthreads(); + + // copy back to shared mem + half *p1; + float *p2, tmp; + p1 = &summr[0][0]; + wmma::store_matrix_sync(p1, wr_inr_frag, 16, wmma::mem_row_major); + + __syncthreads(); + + if (stuffants!=1) { + + // now do thread reduction using multiplication by unity + wmma::fill_fragment(final_frag, 0.0f); + wmma::fill_fragment(b_frag, 1.0f); + wmma::load_matrix_sync(a_frag, p1, 16); + wmma::mma_sync(final_frag, a_frag, b_frag, final_frag); + p2 = &summi[0][0]; + wmma::store_matrix_sync(p2, final_frag, 16, wmma::mem_row_major); + + __syncthreads(); + + // store + if (tidx<16) { + output[(beam_tile*16+tidx)*1536 + oidx] = summi[tidx][tidx]; + } + + + } + + if (stuffants==1) { + if (tidx<16) { + output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][tidx]; + } + } + if (stuffants==2) { + + p2 = &summi[0][0]; + wmma::store_matrix_sync(p2, ib_frag, 16, wmma::mem_row_major); + tmp = 0.; + for (int i=0;i<16;i++) tmp += summi[i][i]; + if (tidx==0 && beam_tile==0) + output[(beam_tile*16+tidx)*1536 + oidx] = tmp; + + } + +} + +// kernel to calculate weights - needed because weights are halfs +// launch with 256 threads in 6144 blocks +__global__ +void calc_weights(float 
*antpos, float *weights, float *freqs, half *wr, half *wi) { + + // assume 256 threads in 6144 blocks + int bidx = blockIdx.x; // over 48f, 2pol, 16 beam_tile, 4 ant_tile + int tidx = threadIdx.x; + int f = (int)(bidx / 128); + int cc = (int)(bidx % 128); + int pol = (int)(cc / 64); + cc = (int)(cc % 64); + int beam_tile = (int)(cc / 4); + int ant_tile = (int)(cc % 4); + int beam_i = (int)(tidx / 16); + int ant_i = (int)(tidx % 16); + + int beam = beam_tile*16+beam_i; + int ant = ant_tile*16+ant_i; + int i = bidx*256+tidx; + int widx = ant*NW*2*2 + f*2*2 + pol*2; + + float theta = sep*(127.-beam*1.)*PI/10800.; // radians + float afac = -2.*PI*freqs[f*8+4]*theta/CVAC; // factor for rotate + float twr = cos(afac*antpos[ant]); + float twi = sin(afac*antpos[ant]); + + wr[i] = __float2half((twr*weights[widx] - twi*weights[widx+1])); + wi[i] = __float2half((twi*weights[widx] + twr*weights[widx+1])); + + +} + + +// function prototypes +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); +int dada_bind_thread_to_core (int core); +int init_weights(char *fnam, float *antpos, float *weights, char *flagants); +void reorder_block(char *block); +void calc_bp(float *data, float *bp, int pr); + + +// performs massive summation to calculate bp +// input array has order [beam, 96 frequency, 16 time] +// bp has size 48 - no way to avoid strided memory access +// returns factor to correct data +void calc_bp(float *data, float *bp, int pr) { + + int i=0; + + for (int b=0;b<256;b++) { + for (int f=0;f<48;f++) { + for (int a=0;a<32;a++) { + bp[b] += data[i]; + if (pr && data[i]!=0.) 
printf("%d %d %d %f\n",b,f,a,data[i]); + i++; + } + } + } + +} + +// for finding median of bandpass + +int cmpfunc(const void* elem1, const void* elem2) +{ + if(*(const float*)elem1 < *(const float*)elem2) + return -1; + return *(const float*)elem1 > *(const float*)elem2; +} + +void ret_med_bp(float *bp) { + + qsort(bp, 256, sizeof(float), cmpfunc); + float medval = 0.5*(bp[127]+bp[128]); + for (int i=0;i<256;i++) + bp[i] = medval; + +} + +// performs cpu reorder of block to be loaded to GPU +void reorder_block(char * block) { + + // from [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i] + // to [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i] + // 24576*NANT in total. 1536*NANT per time + + char * output = (char *)malloc(sizeof(char)*24576*NANT); + + for (int i=0;i<16;i++) { // over time + for (int j=0;j= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + if 
(ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + uint64_t bytes_read = 0; + int nints = NPACKETS / 16; + uint64_t nbytes_per_int = block_size / nints; + uint64_t nbytes_per_out = block_out / nints; + char * block; + unsigned char * output_buffer; + output_buffer = (unsigned char *)malloc(sizeof(unsigned char)*block_out); + memset(output_buffer,0,block_out); + uint64_t written, block_id; + + // allocate host and device memory for calculations + //inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag + //wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant] + char *d_indata[NSTREAMS]; + unsigned char *d_outdata[NSTREAMS]; + float *d_transfer[NSTREAMS], *d_bp, *d_antpos, *d_weights, *d_freqs; + half *d_wr, *d_wi, *d_inr[NSTREAMS], *d_ini[NSTREAMS]; + cudaMalloc((void **)&d_antpos, 64*sizeof(float)); // ant positions + cudaMalloc((void **)&d_weights, 64*NW*2*2*sizeof(float)); // weights + cudaMalloc((void **)&d_freqs, 
384*sizeof(float)); // freqs + cudaMalloc((void **)&d_bp, 256*sizeof(float)); // bandpass + cudaMalloc((void **)&d_wr, 48*2*16*4*16*16*sizeof(half)); // real weight + cudaMalloc((void **)&d_wi, 48*2*16*4*16*16*sizeof(half)); // imag weight + cudaMemcpy(d_freqs, freqs, 384*sizeof(float), cudaMemcpyHostToDevice); + + float *h_transfer = (float *)malloc(sizeof(float)*256*96*16*NSTREAMS); + char *h_indata = (char *)malloc(sizeof(char)*16*NANT*96*8*2); + float *bp = (float *)malloc(sizeof(float)*256); + unsigned char *tmp_buf = (unsigned char *)malloc(sizeof(unsigned char)*256*48*4*NSTREAMS); + + // streams and device + cudaStream_t stream[NSTREAMS]; + for (int st=0;st d1(d_inr[st]); + thrust::fill(d1, d1+16*48*2*64*16, 0.0); + thrust::device_ptr d2(d_ini[st]); + thrust::fill(d2, d2+16*48*2*64*16, 0.0); + } + + + + // set up + + int observation_complete=0; + int blocks = 0, started = 0; + int blockct = 0; + + syslog(LOG_INFO, "starting observation"); + + while (!observation_complete) { + + // open block + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + blockct ++; + + // DO STUFF + + // calc weights + init_weights(fnam,antpos,weights,flagants); + cudaMemcpy(d_antpos, antpos, 64*sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_weights, weights, 64*NW*2*2*sizeof(float), cudaMemcpyHostToDevice); + calc_weights<<<6144, 256>>>(d_antpos, d_weights, d_freqs, d_wr, d_wi); + if (DEBUG) syslog(LOG_INFO,"Finished with weights"); + + if (started==1) { + + // loop over ints + for (int bst=0;bst>>(d_indata[st], d_inr[st], d_ini[st]); + + // run beamformer kernel + beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants); + + // run adder kernel + adder<<<12288, 32, 0, stream[st]>>>(d_transfer[st], d_outdata[st], d_bp); + + // copy to host + cudaMemcpyAsync(tmp_buf + 256*48*4*st, d_outdata[st], 256*48*4*sizeof(unsigned char), cudaMemcpyDeviceToHost, stream[st]); + + // copy to output + for (int 
j=0;j<12288*4;j++) { + if (test_pattern) + output_buffer[(bst*NSTREAMS+st)*12288*4+j] = (unsigned char)((blockct % 128) + 32); + else + output_buffer[(bst*NSTREAMS+st)*12288*4+j] = tmp_buf[j+256*48*4*st]; + } + if (DEBUG && bst*NSTREAMS+st==10) { + for (int j=0;j<48;j++) syslog(LOG_DEBUG,"%hu",output_buffer[(bst*NSTREAMS+st)*12288+BEAM_OUT*48+j]); + } + + } + } + + + } + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + + // calculate bandpass + + for (int i=0;i<256;i++) bp[i] = 0.; + + // do standard bf but calculate bandpass + + // loop over ints + for (int bst=0;bst>>(d_indata[st], d_inr[st], d_ini[st]); + + //if (bst==0 && st==0) + // printer<<<3072, 32>>>(d_inr,d_ini); + + // run beamformer kernel + beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants); + + // copy back to host + cudaMemcpyAsync(h_transfer + st*256*96*16, d_transfer[st], sizeof(float)*393216, cudaMemcpyDeviceToHost, stream[st]); + + // calculate bandpass + //if (st==0 && bst==0) + //calc_bp(h_transfer,bp,1); + calc_bp(h_transfer + st*256*96*16,bp,0); + ret_med_bp(bp); + + } + } + + // adjust bandpass + syslog(LOG_INFO,"Final BP..."); + for (int i=0;i<256;i++) { + syslog(LOG_INFO,"coeff %d %g",i,bp[i]); + if (bp[i]!=0.) 
{ + bp[i] /= 48.*nints; + bp[i] = 2.5*128./bp[i]; + } + } + cudaMemcpy(d_bp, bp, sizeof(float)*256, cudaMemcpyHostToDevice); + + // junk into output + memset(output_buffer,0,block_out); + + } + + // write output for debug + + // write to output + written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out); + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + if (DEBUG) { + syslog(LOG_DEBUG, "written block %d",blocks); + } + blocks++; + + + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + for (int st=0;st +#include +using std::cout; +using std::cerr; +using std::endl; +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "cuda_fp16.h" +//#include "dada_cuda.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "multilog.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_def.h" +#include +#include + +#include +using namespace nvcuda; + +#define sep 1.0 + +// global variables +int DEBUG = 0; + + +// kernel for summing and requantizing +// input array has order [beam, 48 frequency, 2 pol, 16 time] +// need to output to [4 time, beam, 48 frequency] +// bp is scale factor for each beam +// run with 256*48=12288 blocks and 32 threads +__global__ +void adder(float *input, unsigned char *output, float *bp) { + + // get block and thread ids + int bidx = blockIdx.x; // assume 256*48=12288 + int tidx = threadIdx.x; // assume 32 + //int fidx = 2*(bidx % 24); + int beamidx = (int)(bidx / 48); + + // declare shared mem + volatile __shared__ float data[32]; // data block to be summed + + // transfer from input to shared mem + data[tidx] = input[bidx*32+tidx]; + + // sync + 
__syncthreads(); + + // complete sum + if (tidx<16) { + data[tidx] += data[tidx+16]; // over pols + data[tidx] += data[tidx+2]; + data[tidx] += data[tidx+1]; + } + // now tidx = 0, 4, 8, 12 are what we want! + + __syncthreads(); + + // store + if (tidx == 0) + output[bidx] = (unsigned char)(__float2int_rn(data[0]*bp[beamidx])/2); + if (tidx == 4) + output[bidx + 12288] = (unsigned char)(__float2int_rn(data[4]*bp[beamidx])/2); + if (tidx == 8) + output[bidx + 2*12288] = (unsigned char)(__float2int_rn(data[8]*bp[beamidx])/2); + if (tidx == 12) + output[bidx + 3*12288] = (unsigned char)(__float2int_rn(data[12]*bp[beamidx])/2); + + /*if (tidx == 0) + output[bidx] = (unsigned char)(__float2int_rn(data[0])); + if (tidx == 4) + output[bidx + 12288] = (unsigned char)(__float2int_rn(data[4])); + if (tidx == 8) + output[bidx + 2*12288] = (unsigned char)(__float2int_rn(data[8])); + if (tidx == 12) + output[bidx + 3*12288] = (unsigned char)(__float2int_rn(data[12]));*/ + +} + +// kernel for promotion +/* +orig input is [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i] +input is [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i] +output needs to be [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] +promoted to half precision + +launch with 16*48*NANT blocks of 32 threads + + */ +__global__ void promoter(char *input, half *inr, half *ini) { + + int bidx = blockIdx.x; // assume 16*48*NANT + int tidx = threadIdx.x; // assume 32 + int iidx = bidx*32+tidx; + int pol = (int)(tidx % 2); + int chunnel = (int)(tidx / 2); + + /*int ant = (int)(bidx % NANT); + int time_chan = (int)(bidx / NANT); + int oidx = time_chan*2048+pol*1024+ant*16+chunnel;*/ + + int chan = (int)(bidx % 48); + int time_ant = (int)(bidx / 48); + int tim = (int)(time_ant / NANT); + int ant = (int)(time_ant % NANT); + int oidx = tim*98304 + chan*2048 + pol*1024 + ant*16 + chunnel; + + //inr[oidx] = __float2half((float)(((char)((input[iidx] & 15) << 4)) >> 4)); + //ini[oidx] = 
__float2half((float)(((char)((input[iidx] & 240))) >> 4)); + inr[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(15)) << 4) >> 4)); + ini[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4)); + +} + +// 16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels +// for first time, launch with 3072, 32 +__global__ void printer(half *inr, half *ini) { + + int idx = blockIdx.x*32+threadIdx.x; + float ir = __half2float(inr[idx]); + float ii = __half2float(ini[idx]); + + int chunnel = (int)(threadIdx.x % 16); + int channel = (int)(blockIdx.x/64); + int tt = (int)(blockIdx.x % 64); + int pol = (int)(tt/32); + int ant = ((int)(tt % 32))*((int)(threadIdx.x / 16)); + + if (ir!=0. || ii!=0.) { + printf("%d %d %d %d %f %f\n",channel,pol,ant,chunnel,ir,ii); + } + +} + + +// kernel for beamforming +/* + +Assumes that up to NANT antennas (nominally 63) are populated. + +Input is [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] (promoted) + +Arithmetic... for rotation, d2r = wr*dr-wi*di; d2i = wi*dr+wr*di + +Conventions for beamforming. beam 0 is furthest East, beam 127 is at meridian. antpos (D) is easting. +for bf weight calculation, where theta = s(127-n), ang = 2*pi*nu*theta*D/c; wr = cos(ang), wi = sin(ang) +use __float2int_rn, cosf, sinf intrinsics. + +Each warp (==block) has to deal with 256 beams for 64 ants, summing over 16 chunnels and pols. 
+Do it in tiles of 16 beams and 16 ants for + +Output array has order [beam, 48 frequency, 2 pol, 16 time] + +inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag +wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant] + +launch with 16time * 48freq * 2pol * 16beam_tile blocks of 32 threads for massive utilization + = 24576 blocks + +*/ +__global__ void beamformer(half *inr, half *ini, half *wr, half *wi, float *output, int stuffants) { + + // get block and thread ids + int bidx = blockIdx.x; // assume 24576 + int tidx = threadIdx.x; // assume 32 + int orig_bidx = (int)(bidx / 16); + int beam_tile = (int)(bidx % 16); + int stuff_tile = (int)(beam_tile % 4); + int data_offset = orig_bidx*1024; // offset for first part of data + int weight_offset = (int)(orig_bidx % 96); // offset for first part of weight + weight_offset *= 16384; + int idx1, idx2; + int f_idx = (int)(orig_bidx % 96); + int tim_idx = (int)(orig_bidx / 96); + int oidx = f_idx*16 + tim_idx; + + // shared memory for convenience + __shared__ half summr[16][16]; // beam, chunnel + __shared__ float summi[16][16]; // beam, chunnel + + // accumulate real and imag parts into [16 beam x 16 f] fragments + // Declare the fragments. 
+ wmma::fragment a_frag; + wmma::fragment b_frag; + wmma::fragment wr_inr_frag; + wmma::fragment wr_ini_frag; + wmma::fragment wi_inr_frag; + wmma::fragment wi_ini_frag; + wmma::fragment ib_frag; + wmma::fragment final_frag; + + + // zero out accumulators + wmma::fill_fragment(wr_inr_frag, 0.0f); + wmma::fill_fragment(wr_ini_frag, 0.0f); + wmma::fill_fragment(wi_inr_frag, 0.0f); + wmma::fill_fragment(wi_ini_frag, 0.0f); + wmma::fill_fragment(ib_frag, 0.0f); + + // IB + if (stuffants==2) { + + wmma::fragment c_frag; + wmma::fragment d_frag; + + for (int ant_tile=0; ant_tile<4; ant_tile++) { + + wmma::load_matrix_sync(c_frag, inr + data_offset + ant_tile*256, 16); + wmma::load_matrix_sync(d_frag, inr + data_offset + ant_tile*256, 16); + wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag); + wmma::load_matrix_sync(c_frag, ini + data_offset + ant_tile*256, 16); + wmma::load_matrix_sync(d_frag, ini + data_offset + ant_tile*256, 16); + wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag); + + } + + } + + // one ant per beam + if (stuffants==1) { + + wmma::fragment c_frag; + wmma::fragment d_frag; + wmma::load_matrix_sync(c_frag, inr + data_offset + stuff_tile*256, 16); + wmma::load_matrix_sync(d_frag, inr + data_offset + stuff_tile*256, 16); + wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag); + wmma::load_matrix_sync(c_frag, ini + data_offset + stuff_tile*256, 16); + wmma::load_matrix_sync(d_frag, ini + data_offset + stuff_tile*256, 16); + wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag); + + } + if (stuffants!=1) { + + // loop over ant tiles + for (int ant_tile=0; ant_tile<4; ant_tile++) { + + // copy weight and data to fragments, and multiply to accumulators + + wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16); + wmma::load_matrix_sync(b_frag, inr + data_offset + ant_tile*256, 16); + wmma::mma_sync(wr_inr_frag, a_frag, b_frag, wr_inr_frag); + + wmma::load_matrix_sync(a_frag, wi + weight_offset + beam_tile*1024 + 
ant_tile*256, 16); + wmma::mma_sync(wi_inr_frag, a_frag, b_frag, wi_inr_frag); + + wmma::load_matrix_sync(b_frag, ini + data_offset + ant_tile*256, 16); + wmma::mma_sync(wi_ini_frag, a_frag, b_frag, wi_ini_frag); + + wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16); + wmma::mma_sync(wr_ini_frag, a_frag, b_frag, wr_ini_frag); + + } + + // form real and imaginary matrices + for(int i=0; i < wr_inr_frag.num_elements; i++) { + wr_inr_frag.x[i] = wr_inr_frag.x[i] - wi_ini_frag.x[i]; // output real + wi_inr_frag.x[i] = wi_inr_frag.x[i] + wr_ini_frag.x[i]; // output imag + wr_inr_frag.x[i] = wr_inr_frag.x[i]*wr_inr_frag.x[i] + wi_inr_frag.x[i]*wi_inr_frag.x[i]; // squared + } + } + + // at this stage the matrices are [beam, chunnel], and need to be summed over columns + + __syncthreads(); + + // copy back to shared mem + half *p1; + float *p2, tmp; + p1 = &summr[0][0]; + wmma::store_matrix_sync(p1, wr_inr_frag, 16, wmma::mem_row_major); + + __syncthreads(); + + if (stuffants!=1) { + + // now do thread reduction using multiplication by unity + wmma::fill_fragment(final_frag, 0.0f); + wmma::fill_fragment(b_frag, 1.0f); + wmma::load_matrix_sync(a_frag, p1, 16); + wmma::mma_sync(final_frag, a_frag, b_frag, final_frag); + p2 = &summi[0][0]; + wmma::store_matrix_sync(p2, final_frag, 16, wmma::mem_row_major); + + __syncthreads(); + + // store + if (tidx<16) { + output[(beam_tile*16+tidx)*1536 + oidx] = summi[tidx][tidx]; + } + + + // do thread reduction for each beam + /* if (tidx<8) { + for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+8]; + for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+4]; + for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+2]; + for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+1]; + } + if (tidx>=8 && tidx<16) { + for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+8-8]; + for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+4-8]; + for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+2-8]; 
+ for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+1-8]; + } + if (tidx>=16 && tidx<24) { + for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+8-16]; + for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+4-16]; + for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+2-16]; + for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+1-16]; + } + if (tidx>=24) { + for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+8-24]; + for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+4-24]; + for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+2-24]; + for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+1-24]; + }*/ + + /*if (tidx<16) + for (int j=1;j<16;j++) summr[tidx][0] += summr[tidx][j]; + + __syncthreads();*/ + + // now summr[beam][0] can go into output + /*if (tidx<16) { + output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][0]; + }*/ + + } + + if (stuffants==1) { + if (tidx<16) { + output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][tidx]; + } + } + if (stuffants==2) { + + p2 = &summi[0][0]; + wmma::store_matrix_sync(p2, ib_frag, 16, wmma::mem_row_major); + tmp = 0.; + for (int i=0;i<16;i++) tmp += summi[i][i]; + if (tidx==0 && beam_tile==0) + output[(beam_tile*16+tidx)*1536 + oidx] = tmp; + + } + +} + +// kernel to calculate weights - needed because weights are halfs +// launch with 256 threads in 6144 blocks +__global__ +void calc_weights(float *antpos, float *weights, float *freqs, half *wr, half *wi) { + + // assume 256 threads in 6144 blocks + int bidx = blockIdx.x; // over 48f, 2pol, 16 beam_tile, 4 ant_tile + int tidx = threadIdx.x; + int f = (int)(bidx / 128); + int cc = (int)(bidx % 128); + int pol = (int)(cc / 64); + cc = (int)(cc % 64); + int beam_tile = (int)(cc / 4); + int ant_tile = (int)(cc % 4); + int beam_i = (int)(tidx / 16); + int ant_i = (int)(tidx % 16); + + int beam = beam_tile*16+beam_i; + int ant = ant_tile*16+ant_i; + int i = bidx*256+tidx; + int widx = ant*NW*2*2 + f*2*2 + pol*2; + + 
//float theta = sep*(127.-beam*1.)*PI/10800.; // radians + float theta = sep*(127.-beam*1.)*PI/10800.; // radians + float afac = -2.*PI*freqs[f*8+4]*sinf(theta)/CVAC; // factor for rotate + float twr = cos(afac*antpos[ant]); + float twi = sin(afac*antpos[ant]); + + wr[i] = __float2half((twr*weights[widx] - twi*weights[widx+1])); + wi[i] = __float2half((twi*weights[widx] + twr*weights[widx+1])); + + +} + + +// function prototypes +int dada_bind_thread_to_core (int core); +int init_weights(char *fnam, float *antpos, float *weights, char *flagants); +void reorder_block(char *block); +void calc_bp(float *data, float *bp, int pr); + + +// performs massive summation to calculate bp +// input array has order [beam, 96 frequency, 16 time] +// bp has size 48 - no way to avoid strided memory access +// returns factor to correct data +void calc_bp(float *data, float *bp, int pr) { + + int i=0; + + for (int b=0;b<256;b++) { + for (int f=0;f<48;f++) { + for (int a=0;a<32;a++) { + bp[b] += data[i]; + if (pr && data[i]!=0.) printf("%d %d %d %f\n",b,f,a,data[i]); + i++; + } + } + } + +} + +// for finding median of bandpass + +int cmpfunc(const void* elem1, const void* elem2) +{ + if(*(const float*)elem1 < *(const float*)elem2) + return -1; + return *(const float*)elem1 > *(const float*)elem2; +} + +void ret_med_bp(float *bp) { + + qsort(bp, 256, sizeof(float), cmpfunc); + float medval = 0.5*(bp[127]+bp[128]); + for (int i=0;i<256;i++) + bp[i] = medval; + +} + +// performs cpu reorder of block to be loaded to GPU +void reorder_block(char * block) { + + // from [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i] + // to [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i] + // 24576*NANT in total. 
1536*NANT per time + + char * output = (char *)malloc(sizeof(char)*24576*NANT); + + for (int i=0;i<16;i++) { // over time + for (int j=0;j= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // get block sizes and allocate memory + uint64_t block_size = 198180864; + uint64_t block_out = 15*48*512*256; + char * block; + block = (char *)malloc(sizeof(char)*block_size); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); + int nints = NPACKETS / 16; + uint64_t nbytes_per_int = block_size / nints; + uint64_t nbytes_per_out = block_out / nints; + unsigned char * output_buffer; + output_buffer = (unsigned char *)malloc(sizeof(unsigned char)*block_out); + memset(output_buffer,0,block_out); + + // allocate host and device memory for calculations + //inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag + //wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant] + char *d_indata[NSTREAMS]; + unsigned char *d_outdata[NSTREAMS]; + float *d_transfer[NSTREAMS], *d_bp, *d_antpos, *d_weights, *d_freqs; + half *d_wr, *d_wi, *d_inr[NSTREAMS], *d_ini[NSTREAMS]; + cudaMalloc((void **)&d_antpos, 64*sizeof(float)); // ant positions + cudaMalloc((void **)&d_weights, 64*NW*2*2*sizeof(float)); // weights + cudaMalloc((void **)&d_freqs, 384*sizeof(float)); // freqs + cudaMalloc((void **)&d_bp, 256*sizeof(float)); // bandpass + cudaMalloc((void **)&d_wr, 48*2*16*4*16*16*sizeof(half)); // real weight + cudaMalloc((void **)&d_wi, 48*2*16*4*16*16*sizeof(half)); // imag weight + cudaMemcpy(d_freqs, freqs, 384*sizeof(float), cudaMemcpyHostToDevice); + + float *h_transfer = (float *)malloc(sizeof(float)*256*96*16*NSTREAMS); + char *h_indata = (char *)malloc(sizeof(char)*16*NANT*96*8*2); + float *bp = (float *)malloc(sizeof(float)*256); + unsigned char *tmp_buf = (unsigned char 
*)malloc(sizeof(unsigned char)*256*48*4*NSTREAMS); + + // streams and device + cudaStream_t stream[NSTREAMS]; + for (int st=0;st d1(d_inr[st]); + thrust::fill(d1, d1+16*48*2*64*16, 0.0); + thrust::device_ptr d2(d_ini[st]); + thrust::fill(d2, d2+16*48*2*64*16, 0.0); + } + + + // set up + + int observation_complete=0; + int blocks = 0, started = 0; + int blockct = 0; + + syslog(LOG_INFO, "starting observation"); + + // init weights + init_weights(fnam,antpos,weights,flagants); + cudaMemcpy(d_antpos, antpos, 64*sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_weights, weights, 64*NW*2*2*sizeof(float), cudaMemcpyHostToDevice); + calc_weights<<<6144, 256>>>(d_antpos, d_weights, d_freqs, d_wr, d_wi); + if (DEBUG) syslog(LOG_INFO,"Finished with weights"); + + // open data file and read first block + FILE *fin; + fin=fopen(finnam,"rb"); + fread(block,sizeof(char),block_size,fin); + fclose(fin); + + // calculate bp + for (int i=0;i<256;i++) bp[i] = 0.; + + // loop over ints + for (int bst=0;bst>>(d_indata[st], d_inr[st], d_ini[st]); + + // run beamformer kernel + beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants); + + // copy back to host + cudaMemcpyAsync(h_transfer + st*256*96*16, d_transfer[st], sizeof(float)*393216, cudaMemcpyDeviceToHost, stream[st]); + + calc_bp(h_transfer + st*256*96*16,bp,0); + ret_med_bp(bp); + + } + } + + + // adjust bandpass + syslog(LOG_INFO,"Final BP..."); + for (int i=0;i<256;i++) { + //syslog(LOG_INFO,"coeff %d %g",i,bp[i]); + if (bp[i]!=0.) 
{ + bp[i] /= 48.*nints; + bp[i] = 2.5*128./bp[i]; + } + } + cudaMemcpy(d_bp, bp, sizeof(float)*256, cudaMemcpyHostToDevice); + + // open data file and read first block + fin=fopen(finnam,"rb"); + + // re-open file and loop over blocks + while (blocks<15) { + + syslog(LOG_INFO,"read blocks %d",blocks); + fread(block,sizeof(char),block_size,fin); + + // loop over ints + for (int bst=0;bst>>(d_indata[st], d_inr[st], d_ini[st]); + + // run beamformer kernel + beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants); + + // run adder kernel + adder<<<12288, 32, 0, stream[st]>>>(d_transfer[st], d_outdata[st], d_bp); + + // copy to host + cudaMemcpyAsync(tmp_buf + 256*48*4*st, d_outdata[st], 256*48*4*sizeof(unsigned char), cudaMemcpyDeviceToHost, stream[st]); + + // copy to output + for (int jj=0;jj<4;jj++) { + for (int bmn=0;bmn<256;bmn++) { + for (int j=0;j<48;j++) { + output_buffer[blocks*512*48*256 + (bst*NSTREAMS+st)*48*4*256+ jj*48*256 + bmn*48 + j] = tmp_buf[256*48*4*st + jj*256*48 + bmn*48 + j]; + } + } + } + + } + } + + blocks++; + + } + + syslog(LOG_INFO,"blocks %d",blocks); + + fclose(fin); + + float pwrs = 0; + if (!outpwr) { + fin=fopen("/home/ubuntu/data/tmp/output.dat","wb"); + for (int i=0;i<8192;i++) + fwrite(output_buffer + i*48*256 + outbm*48,sizeof(unsigned char),48,fin); + fclose(fin); + } + else { + fin=fopen("/home/ubuntu/data/tmp/output.dat","w"); + for (int i=0;i<15*512;i++) { + for (int j=0;j<256;j++) { + pwrs = 0.; + for (int k=0;k<48;k++) pwrs += (float)(output_buffer[i*256*48 + j*48 + k]); + fprintf(fin,"%f\n",pwrs); + } + } + fclose(fin); + } + + + + for (int st=0;stbL^AqF6`^PVuym!vM=iGD7 zJ@?#mmlX89~k$-)@nUC}kF#iO6Lw$$f?_l34zFvS+ z1db&v3&cG;$qYaGnJPga8%)OU&>j*ov`0X^=;uI*_UWf$BV=QHv7dl?mCq!_P(OXX zbWu)v%KE@C+28}irV6}%+KTB+Euf-fZPK+Wx>iM}pXo||{nX^tjlW}5`IsI7K!gK) z`0~Wx*pqenQ~2-DG`_bUEeTtXzF6S(bCQx%KQ-Nzprf4M_a{wN9HrVzWS4PDe*M(_ 
zaMtWuQ_nm1tl87fm_2K5Wz`u~L(eMu*c|EK6kuloN;z<+51y=Euie|ZA_1qt|PC8&R9g8uzy0(rs- z`0q^6k9Gn+gA?@E>ID3Y6YyD*06#AQpMDAC{AmI{7bmFSkiZ`fB&eU9p#Izh?H-?? zU!F_g54{uUJ1hbJ)CBlF3FMrgfd9P-je0pB=F~>67WAff&5ts z@XsaS^UnnB-kQJ;rzha^%LID;0({c(+x_RO1b+1dDtg74-U*7j|Ex~HXJ7(8S0|`{W&-?Q65z)t z(D$MQ_Ph~-dyTKl67VTY;Q#j|(CeZE_0K}P>D`S6cyIzaMrSttfVTZ$g#YwpWe*xsz66~ z09F5X)ox}`)<-q*Gfa)I^pgV$BOgIkzgY3nawb7e%70-uKEF`)0}8L}Z-bod_m0eX z{_7P!>(qF>59@-G%4t)|LtG1FXP=qnE6N`;vS?aqdFhN<6`|7dF(Zf1o;SC2%#^9K zOMOK}Gv>^jTT~GOK~a&fXvF-YQL5hX*;6ViN-HF}Co$U@9V#!KGAFOJB2+$ap^unL z%1cYd%cZ5Ef+-cDV0rnxa)B5*W%lfOCE`_*4F_$ALXb|kdr9TolF+Pqb49(p()qJW zO2H{N6e^!JwGy>ix3IMIy4-HuQAyU8ZI#X`DO;#mkDaUKzhp}2;#qU2lq*rul*zk7 z$`;4SWjlOc<=l{BSU9h2IFuPaZ|?M2Gb+ocxbmX5&=>m6nma?Va~Twt&nqh}56z-Y z<21f%N_jVoD`s%+G~Hxn#ms}-jA!JMiH4;$MlYOOGP8W%+*vocy&@#Me99b&D3}5( z%)G3$d~WG%!*sb7Afz{yg=V6grqP|?OGTyS<#Xp1p&zBvvn$XQtX2`4R&v^DtaYBR zXu_DQF3-+_a8*@Prp}t5oqayu=}$$Wg=M8hv*u2p7o=WQQ;JGp$D+!)rB!96C81Ji zJK^%tp`o+pRTNE~Qc-$t{@huiyzJn7)SgjQRRjZ8%$qx9_N>stqWKV~dqZcV(TZ6f zZB3XqyO2!S>g?BJ-&AUt)gs>;sJJ{ua$D#75QFXWeu4rOPT%$!nQ6e^!G zD^!tR5*$4Re}b?Cb?(M3?`)woE2JFMEe=|4>SJ$VjwZHhW4*X>iUQ7<|r;6CnqDrj(VzPC>~; zbhGGEFofSkj#6{8^1seR%M@dB)X)6|1pjapgt)Usqpdair zzhbUZyQpZoX^hacd6gk@WcZ(6GJ78UdX6g-2EC8;Gv@fF-cVXTj{=I2G?gKZ#+y<; z1Fc}XEccfrnY$ zpG*Uf4s?IA47|?ySvb_dvu*toFz~dI{;>?a>RM4$XyBEJ1%8}?M+dk+#Rk5Qi^chn zfro>-KV=5KuZ#8hsti0F#{H=_@Q1rtpKqyw_Z#?n15ba^Kg$e!KaIq5m4P?cvkw?} zuGRI=qXwR57W${bz#pZNcs3dMqYZqEf&YntZ#D3Gt;gc62L2eMew%@>Fz~j4Ki0r^ z82IB1e5ZleXKbQ4H|Dp;8}(BS{7(&hnt>l=;0GA^bOWDm;7>5{nFjtu1D|E!PcraB z4Sa@y4;c894ZLOGPciU?2L5LTew=~NH1Nd+ez1X`Y2Z&a@MQ-6Gy`8{;7>R3)dv0y z1HaV3pK0Lh4LtWs^v^N_pQVv_t}^i12L1s9f3|^t)WDx(;2R7)XKww|WZ;KrB%Un> z{yYQUYT(Z|@LLW1Py^p);BySTZQy@y;5!Wb1qQy;zz;L0h#20q=uUu@tr4g7EepJm|l4E#_7A2jd*1AmEuw+#HH2ENe1TLyldfzLPa z#Rh(afuCvMFEj9E2Hre)tupWi-BKY;9Cv+)dqg6f!BM3EN(OK9LM4wgL z6iTsqpk{?PrEZj!x`y10qOOx`@Yg>mfH%TILZ__@KPTTnm?@;wD&TU$JqR}m_*%j} z2|p^}sf3v-I;#XcfiP1;Q!U^TgqadLWdgpKa4O+q0sovZQ$VLsz-JR? 
z>gNOmd@5n4d`^~tPbAD#&q){Xv4ol8IcWkuf-qA%$0y)EgqhMg9p3gF^F_#?th*_=lO{2pPZYR)PFzeSiSno}>}R|qpTbE*aWEMcZ( zPMLr=5N0ao6brbKFjFw6P{5B9W{Twm1pFXjrdCdtfd5RGDV38h;6D;(D&?dJ_zuEM zp&Xxpe?^$7lhg4v^}m^LI^i|}-$0nDlG7^Sa>6GPZW8ddgij*;sDP&uW=iC&67U4V zOog0!0gobl3gK!2k08v{$0-x=#e_2n7Yq33gqi9%g#tdCFjE{SAmCF8GqrKD1biZ4 zrZi5vfR822RK`gY@DYTW!Z6fUhMyobaOpo=TXhg0o7%69_X!aOwp-itr_bs|7rQFjE4j zOu!ctwg?vs_~(R~0yu>NKAZ3e!T|xFN|;l=lO^C233IA<(gl1hVNUT*nt+cW%&Fb+ z3AhhoPU%j^m!kg(b1HY*1pMVOfH{RbtpeUlm{YgYB;bz-bINue74Un6IaNEW1pF3Z zPSH-ifL|fZsoAL(@Uw(D^*Ut&-awdBu2U@FM#7wGok9UWPIv<0fPfz)%qiE&67Zi1 zbE)_>@UIASN_9H+i~c8EOt?+JHxT9&>a+^DoG_h6!6)E zuO%E1@Z(3F=$l>-Sp1A$*MJ*~_GjqYYkjNV?V0PlR{Up0nuFQTSdnLQ8=z#@+EqmX zpB33?MN;Sw_Ujl5m=)7kxAw{Pp(2S;Sw40un&RSs&;FaP*>Gs$nr%~1bBV5L^-Wwe zP>7mW>zZZXQnPv#YM!BM9)M;E<-7tld+M4;Bqy%fG#WLx!B#@|)4ru<*_EjIh_1Oz z$l0WOHk_p*81QJ)S5VM^@9m2{gsv0qtdw5&8XrE zJ3(p4=I&>myHN*n?@GYuFRZf=btrdIiZnoXx$k48F_egM|1?3Zds*vD)`Hv@B&hUf zRyqWgVDE7W>fFOR|NdOaJ>RXPdQ5iC+P|^Rdenj3R&-d`(I@)s6Jgnj>#8pg_?!i= z`;Z(~bAaVuoqJUy-*U(1j>(UFI(GD0=3bGn*ZmQ^v$tDIc2^;}n`T9mPv#_UMQSpb z?PWLFw}YYmJQ`_y5LF}V*V1LOo7OQmv_D0mRktR?CU7ePQ(p8=0R(Gbxp6K_>U*M@ zckExHVq{x(Q{!eZ5iHh$y)V2m$$nJuxJ}eVg*W3W*ms~bdRs;ZnnR%zUs#F4oE4(M zo9!<3~_9GzWHsD%;uh}Ha;hG*%s!>)Y#3OYwnq%rXr< zz#=a)JHy)NpU^*Jx06j%R+apl4 zj(f^>I#gNbYhowoWfa=|1c*8W$JspqT9FM_BsnAeo*#0dNo)N_J*+R@3cuIWU(*vg zE2W+AZy&_Zd=@Hng}+SN>mT2;*FRxDtTNPJ^AZwWRk%I;Wskl7Nr#<{`l`}4R2rA; zuXzXQy{ffq^%puI^@>v);6T2$)hLdHhIsQ2r9=GhYffC*_Bm(tz7M-9M=X`1RBMWzT&)J=g!uR$w=Q;u$<&obI2{gy##O zn)D1xGl_j3*k}Alug~=l4s_(@yi~q2sblabIdAxD8X!MR29nH=V8)YAgAi8q!VJ70 z|1-XqwC?qv+7#Xoi>4j++54jQ=P=wR{BDGSY*22-I}OwU1D*Say27N;H1f1c_M&ia zxT%Nel>M;#sT&N^4wP=gV{%L`1708Exw9l3dII{C2&17(|V9;eB zD;W5fo;QHBUG`UNN$X!)MzXK|CE8^a$d7Cy*=#}fq5VFdtkxx({LtQ{$Z`PZM{i8C z>Ym8pNU{)G_~eiM*^NNdHX%7_{1^1}*Sv?tB5T{ft`-sp4yPK=3+zp4_6cR8zQ3jx zTrEUR!7=g<1zEedt1CZxQR zwm^_roIy97wU5o+m}>tzN|A3$6&Y4d#wzM{gTO695QuDMgqUg1Bqm~8=;zMvtahPk zE3(y!>=7+k>+RmOQmV3&RrgQ^uhNrIrd78hqY!{S=o2}Kq@&ha>YE?gY=5~&A~OZD 
z3DsVc7w=gBI4WMp%Gn+1_E8O3&g2v89DaIJ6E zPki~|O{w<2kBQi`)&~pZhc~C%?+L^gaLEBcY)Q3W27>nXL0gM8ngn8}LTpU6A0Z-I z*(t_4rsO6Fh}m#}&mMvWki80R+Lz(a+E?JXaKA1ro2+PsNBtDmFJt}VjQTI3enAxC z=Z80>7UYb{NcY#Cia8kbQwKQ4d~Iapn}R5;P!Oe6qA)}Df9%IPC>{I(^JW%2PS`9^ z5Vu#WgrKdlu?Mm3~pJRAqQA!D6Jr} z*NVIoZc3uP?H=h=ZcPTAP4zr^Aw@|Qez7Kl9;Zs0D9z{g$1otgconB)6yo(hyyA_y zJfM0OAjaVJPPd?mgPJ64jWuMhO)AuTK-I(RGWBYo51ZCDRi050t;wLfTF86o$YexE z<#48OT73JIQxL+2`3^hLSKAdjx-liyG&TnIRnRE>b&|g(4CfYU0J_A|we0hS!Wi1w zO{=RZR$j)?bUgQ2e`AM~St-5&17Mn(jA~%5=nC;Hi2OU4{rQOKnse#!mq%;PkL{f4SlGX4i-pT0VT&&?|?Qw1av`-qE7)6CE_8f$shvzv0{E?J%>y& z>ZQh5bSe^iKeR#Au00J1w$0EoB6{b!S$NV_*cBt9w+X35VhR|n*fVSrue$H>s{4e> zKZ;@t<9ZdJkvlUU5YO8r3}0*$<+o+n0uPR&L3A--uwuJ-yz2gE1IrYFXbA8s z!1f5CPGvDWoif4)DAYQx{pc8En+HcBBP;SmMm3%gZm9ro%P3pvnK9OMDqQSkDBCg}k!$^#x%9?+231CDmOx+aVt5q(1R zf*#6N-BUjM!>-k!Q`5x9kiq9h;8Kw+d%Iov5d}l;Xh$Z2s^*r#Z4Lf+jz`!du-rCpF2p#RKG zH}nYo9A!wvnc=3L+_;tNk84sM6we7wECbwzvXuJ;j+ya%NLR6o$02RmJ%3$s7BSD_k3Q!Ac~ z3}*wvXX7VWJ;tU^MDwNSEv_x8m9vsn-gK+LUzkvrqKXwg?3(xs!9R$t~L0cxdnV)^nd`vvx8Xa86>| zgAb}ef3e?(GENUvun$4i)i;8Q4(M}!j+6=Ov`HDrf2L&=V+sjo?<$C_$!J9>mR1|_ zXM4(>ZxWbi`v)WLih@~qe<-5~@0Zo(4T0@uXH*5VkrREz)Bt0p9z~q4it{2NBv`;y z1CtkN3r0T8#eyQbW=AU|z@)mP3}zDj!=TDqjm)!bKLVkpiZQT#sGW|&|G{j9kq$=S znF^jXAYME1Iw&)oAWhgorGMHPxbcZ{n4Axq9IpxT%X-OB%pUB+rG!;Sfa-e;M?ruE5}&Z1 z?sUNzu+*_GVNq!2Hgsic?L13(c7Yk&mrt z74}8mvM>1+3UW63Z{3P{KbkVP>jZ2dg^q<dj;5Jz&_!Z{b+3GaBO?$*A)!&Szl}mzw7tcJj5P?afRQ* z(BZb5!cFPeZ(Z8UUsH|3+`2ItN#XrHtddu9m*yRXoxF((+atWcXAFCu!uAaBPmW>n zJtKlca(I7A47(eKCf}6s{$4R`xx)4e@9!PMj#Sv*;r*#GY^K7dhW8&5!}e6zL&E#} z#IWz6_t{#X@cu(%SdKYj4-N0{8^bat5!*Ms|F9T#p28j$-hX%uyBk(!T^H5(2{y|b zcnL@BPvu5ha#wd9iVgY~S9hSKAgLL-+!~R{VLT(B0{BEm6YAroITu?pR=62^>{D0| zd+bxN|2_*l>Qk`SzCC5x3#>N-LRHJ+QYcAAi^r=Rt`fEj5e~F%yCH|V_0^MdU zgPBr%&TRl2_p*D%ovVSkbJRTuHD>ws8Ub#|$o9bwa|Bj_vH>pRb0XMaZcN8sG0tO& z5T^xp18U3iSuEej#&9C!a~{D3pTB^j3-RFW&lN|m|4D4>H69l? 
z^%S5G3t69i(RNN}62Ku>Qo#k1sFrF^$tG$x*1}yoQXfaUAR{ zZ7*lZ=9H5=Pw=62pIy&NkFe6cXu>&EusiJ7FHSgUjXVEs+RHgs)EKr547kQVg$nSP z$N4$>5!&6P3Vy-@o((%Eu&X2F{xEi<8^`kPUa>DdY?P{eP9xn7%k>jry~sxArw|BQ zog<1r5B-!{4Uk%01OqwWVAg9qj*KRd<e=l1~n z#;V~{eQOD>;|Jw=@6$OJOk?%4|C1ki`5G(oo)zBJQ8*^QxhV}dORVPgt9tO!{OB10 zpZ&8PU0t7IOC}Hg^apSpYn@S#hZXKT+=~2r{&5XZh-X%zev{X+psUT@znQ#|w@PE} zt!4RGYstrg!TN7{f`Jw3!1>NaR?-&hECqpaFjLTT%YF(m94-{4*ZdR|c8vrDCHbSzU^loVq1>kgswAdL)gzC6K*_M#R9|ll_Vn z`M^F0&Wa;ED-uMen2h@Y!89a@xCLN4Xw49kE(A-l5eLDnIv#OgLdJe#kV;p%N>`4=?hs`W{lu-V^yrw3%yq?Az%iNN=Ak;aT|ya!IKdic zMG7(kd1&A23S|u7Zvmt%v;X!Qmu>C$+Iwkx>c=iN^phb7D%Mdm9Gl@!2%3TE05VOd zqFD@Z)gWXoXkCDop?V=z$B_{a6;XlGvcdqU7*Hzasi(!KSR|=R>8Mnw3dgADI9+L( zswCL10$~3G-2)>H!U|AC+L)aRVJGV3+9?1#S?hRHjCLxdonW3&Mj2G9qDsZI(@fZC zD1W&x&%G?W`c+}7yY1%J>D<-8VKK9tvc%boY9_E341+yY9K_KR%9OD}Shv7VS=}9g zb*i+zge!Eno3=Xe!hZI{=tNlI1X@A*sTv}cqhJL}s>VJ2B@1>Kx=yYu=^U_wMLQJ3 z4l`+oLfT;*Y>>%c+Mya8?5bCU9hTec;5%Zp8smS-?iSREvj>fzz#c!+fAT<=>}3r0 z6D>yP5f;HS&WUI-nt_#gu<3f#DuqmDjRyeOQ_#z>%QE%|3KnIQNu7Gc4kn*AS48ZzDoYyVR@(W?7xHM(bCF3v%4>4Ec*aqab zbxYfLrV-hgANeT1ZZ&V0jf}KvKdbwf?}j8CqT@nYC)9l&62O3cpb{uoGq1Guu@FVU(&>D8u>ZWG64H4AT`h`)jU588#paB2FXiFD|qcMBX2XtwCn(c0bzA zEd&PlJ?sNuz>}!-}do{qhN1UE`ruMqm3$G>v+AZU2*Ohm%Bq-X3`ydfBI7U?4j8 zuS*WBZ|h|(%dbnWvFfnJ)r>~a$IaE9T&D0f{9&gxxD#i3SjJ>Gt#^9z(g5~}N}vcv z@?ZJ>uNSH59{UkIqbcKei%l`uv++@PU4q`k{bZCQa3jPGK=3eG0vEow zHP(YFZUMOfbi&8mH`>ebZsgMi913#P3|P88hB|Zv0`mZ$3dD{q`)V>#9-3(#<}$xx4NpSHSsH`*M3W5XLo!f z@i*&ufU+-ru=Wp(*9Xu4ZocT| zLEYy6XZ8I5{6X>h|1*2sjn4dm_BeR{`YwCSd`8*hk>|YZA@eZz{x)(tj47E% zkOb=moSbse@Wa;&51?D1^p9RI1d#tB5j65YW4-Xp7NP9D_6B%?STB4_zL!Zo@ds-Y zcfEksLAUiny2p9}^MkQo_|AMU4I|_`^F0ouZ^`$rNAJOGKR(}c^X4+<%~c1j9|Fz7 zbSv#!;V^n0{?_%@|H^(){5#iMG+|t-Y+6gF%%WRXac&rn?uWJhoAuUr*(+^}u-D!8 z)$loOugI5?j*YSW$BOpGWb$fP;Q)LZDQ9$Q=X=S2UV>mw~#{8dA2u0t-k zF*TIBPpyM;j`rW#WYlk%hWXZnXLo}u;$iS{_liWG`5iFQn3;5UjCl?UEc>vMeh;Qj zV@`r+e?H?9opHFFhjd4kdu|=r|Ad9m8^>^qU|ejWG6Hk;On2RzhV|Znb>fp+SofxL 
zy(h+|JKnSCw_Nk4@|XJq#ddU)=!E<2SI`@Y_dn|7DV~gLAUt^cgW-(94!U^ip!Y_y zl%nx_Ag^KUK*98GdmvZF_CVBr50VwN4^kWu>-+Dy4|4BDH8?-o3_aaGb*_s8>z#?+ z;LGA+IOA7VbMKD+NI|1Fp@WBKSvk%4ocKwo$F#(`HjW@+{e7@I>f0A+y7zz(_UBkM z97KjmPAPH=3YLer(8pA!0Y?jn{e|iNH#pf~PAL;x3+M??V*~FQE$l1c0_cR}xFVLb zn%-R~(7Ow={s1jn^gL_JbrxeUcX_N{BBJoDic=!h)> zf;?}vo!~2`p2J*u$Z=ggb+n3u%K&5h+eI#2foF(Bwrv z#77Hq*Hj_m+4q65Rr{X;fAHT}aQSOK#V9SPo6YFej8BJ*tXquh-2D8*p-re5?!?Zm z_*6ow|JHU?1qSC5{6`na1S`(kd-;wF_XD;hXZq-}0p1kHCnJEOcj9t>hm~Z9jvk3U z;(l^__!00{cD2G^qlwcq{I{Me*z+=Vnq)8b9R0Ot6ORUlS`An<^XH~5$!Gbj)3_^2 zac()E^||rZh!=d=l6pSBmPataQ4nnXRY0wCATUKaa%62+<>6A43?WJYicyZ)8PGG= z6=WsfsAq@2_8&rw>r%1Ap{8~_8n9MNfx2$8PP;4>w=n&;t`OBWU_Y8}a0x33DQFlO zK;ENaflI@u4blp7_8_LbBv80t@2<|c+fjR+5Mz>AtbgkCFCt zRzkQ~yv6}rucc@NUFO`4Q$)hTA%F}xyMdAM0{Grbhof^loNK@UyCm(!{Bx<+GxA0I z?T`nbPI4|oZ>{?Sy<_gvth%|qp9G5lwm)Mx2yrgR&-n&@ash!zd-vYU5Ov~uub-&7 z(_V<7<@OK9=C{U-FL#`Pjj1EnwJ{FN1u5^_9E~2wa`~(^XuGxvAGGk-{1!c9)m;Y- zF^W&KLoC9u#HNT9NzPbsksOgYz4wj8mf2-iWO(Y9JRFQ36n*@)rP80&Mi`<{9`Yz2 zs<4-~(+9~Ey+j{kC+#UB zbLTCXn=UMdSoy~G6MSj-R&0D!eJ5UrvG3)hRgaX5^o! 
z^WExUt7Z9QoRT(t=}~@{EPtBi`#s7R8uVXyl#e&)J3Y#CWcl0V|G7u`F^V2{2pT@~ zDBr(L=yyEJKk+EvCdG)w29HoWaozFL+~hC?^3@hHDb zmJd4$<^T35Unt9u$F;47O&;arW%<|GHg9;{qdZ5JzfJw0@F+hPP^6|1< z*#A|J@*G({nfgEOQGSdpA4dH@^eEr|dQAT<9_8C)xzPVjkMh;BTy`=eDC%s-~XD>U&#NQNBK5cF67_g zQNCK1-_7~qDUb5IWcgx_?_D0{3uU<&KN~&D$IEgteqQq^&ynRq|9^OtA0x|!{#eF6{rFNBKfoF8pu3 zNBMYJF6{q`M|qAczq=Rod(5N!7+JoU`hVb2zW?Qz{+m6@x5;v${~I3Vt7W;+|DPV^ zcgb?0|2rP#3uU>`|7nl%@v>a#|AI$(4$DKO4G)`TxOdf0($yl*#GV7UB_?i^tNUgD z6j^=E8J}a_!Sy~;(#}@3!k_q<&P8J@{69R7T(s*mRofmX=tFgyc8#VT`d%h@Q<^fB4DLnfz0 z*S1%9&>wh7>0abPzdW9PlA_Pn9MS$7O+Uzkp7wNl>e}`|5Bf!#e&-`v|Cs#8#nbQj zht$8H=1BcD{Ynq|?U2cN|5I(h6^i~!gZ{vaT7M7v<)lAJ*S04q`j<6F>aXbsdC*TJ z{k^)jJ^+n%K8&GENJ(+~2XpGf-H_#3F`uiPJN zf03r&`H+YHq(4d5ws*iC4$@!Kuk@hb4w+2-6@8rk&uRTV=$Dh;&|lHV>96SrdC*TJ zy`jINkJDe%@BEvG{-ihbhdWUJ+xN!occG?V=|R68GC3u>w!K2pUunet1J7#xJ?NK{ z{v=)7o}}na|68N!2YJv>Bz?^P1}b{f{);sI&Idj8Cw96TmdeCo&Os4*d zK2HB96T`{-1~bq&M`3J5c{CJBEEpN_n==+dP9FjAE&>jALKzlk@SZCiat(% zO~3Q69{Q8s&>!wV{rTlTI(}cM=~sHtZ--3Iy}Gu&LeZP?cJzX$zt(#PV*Bt>uf z-x^In$b)_&>0|ykP|=(AU!>`G-tVD5>0|ca0e3h^e@(yAgMK??GWA#Var$r8`g_nX zC%vJ+qL0&G(+~2XpGbN`e?=dszoy^$7Z3eOZ|Dzqp#IsrjsDm4D?RA9Lnf!Eu5GVS z^q=gB(I42P_4lA(PWp|ywmnJFKlYy({TfX_$b)_&>BG9VJy6kKuOkBTmqnU>=Y1af zlYY3aZSR0P9HhUdU+F==9Wt5vEBZM7H){Po=$Dh;&|lHV>96SrdC*TJy`jINkJDe% z@4VMTf6^QJ!yTyqV;{xrf1##d=|R68GC5&g+g_pQuh$WQ`R@j;zX$zt(ht|Q?MaH> z^uINlevk+KMAFCnZ=j+#?Y~IV@BFie{-lrDe+S&*ApJG{N)P(&kjd0v(Z}h(UhD5c zznt`j{)#?Me@#EggMK3E4gD2;oc@}A=RF?!lituD?m+#ow~hYS^ea8+w?igpxUOxl zQ1qt!2b#409`wseACrHQqW{E+wZBHw5AvX&NcxSswmneMo8xDZrr){BLx0l8#?KD8 z!$JCM`jsB^+aZ&wzoL)Rf1TFfgMK;b4gD2;oc@}AkO%!l(i{3K`Z)bH{mzvh`jg(! 
zAMQZ?KlvbL{|hz!N)P(&kjdGoYuhUny%|3atkwE^&@U%_EPhN<^rru<(e#5n=qHjs z=6?efy=nhNnttb>JoG1h%>Fyz4hQM4=~sHtZ--2#{)#?M|EIP79`wseZ|JY+381kp+D)VzdX;a9yS=YeWBhB4cNHbSdZH^sp8gSC+-kMT&^UMcZfDaQo(^Al`yUp)~ovgU-1Hcc=yn_i#wUPxHAyq-~*C_cx58f zy$I;9eV9k(;Y~y3osf;|0)XwCR^W3dkxllcU3_llEu1tf{46i}@RC62DBdc28Rz+X z|87NEtS?$|bvFg4{U2MnuYcke9{hj6tNZ7pQ7ek;XM%ya?shH|$Oo9!SdDvNYg>=x zH!gpe-<+Jmt8uxJNf~J)acyQmKCX!XH?Z!*uQH3A6tJ74e;Jbnf>e^@v-8~_C_Fh=io!!6~aAAe0EP#uEn{I&;A|c zK&3yaN~epPMd7X_TNb8RE=0GdZ$3+gXH+s?jiD<-&eB)H=wIU z1=W(`u@pGjv*#PwVmo_gqn(TgY*DixxPrR-ga&dD+7Q+?v8j-rRol!9u1|NDSf*(ZU{kCXK5{zDND7qQ~~wO``rz zdnoF+|Ec{R-k0I+DS6*A%NYF#FBo0nn=*Z&Gp*>vbY5~|baby+x&bnaqreEb_TPkX zxX-h>pspm{KK&1nEjl_gKZ4Nc@cRQlv^!BHIywy>W53sm4$tHV_cN_VD9xXntic=b zN%o}q<72lBNmTNCL4>dK#d!J9iuUR%#4WrrybgD61}j{Q3T9CC*IW#C_I4=k4CPy{ zblmp7@-o^hs88!M47+jiVPPU0m+O=!G{V!GHGvWph06 zv;CM}vaHBrJ5ssJij3VF8QEq<$8NPEt>SBclUl75ao;i$?6ks*ZC~XX_-C=^K89;u zQTX`!#zAoSwIaUN4Z9Veg|o}y3JtV~^Aw7#XwbImtmNoeu&wNj26sWk&*q^3pGag` zq%d7Pqk(i7wDMGNPm6Is-r$~3?)^1)T;mHb-s-EwEi-T-bu@rm?!m1tTvsmymyHaS z=t4fXT1mR?0MX58L`m8lu=w7kg?(Bgm75}Co0^08w+m3YIcW3Q9Bh-1 zU3@^I7HOc7kGcl>DGIcK?!N^pTnXz-15FmUfP*bU{0P+u+EHJ0?54FnKmlrTlgYmbPAqa!!vM?S�f{gAvg(Eu zpk+9KP|V%~=Y}HeBtclsX#tKdzUR5wuBPwHX3;$~FeOZ5_YAV?4jX(sLfj&%TQ}r* zvaK67htWRm6nzCSnpY2p*t8bc5UE2Ml;Jfx*^I89 z)EdQS1{YpoKX!-Gbtvj-UBP5beKHh&|L@%tCR0c@5eKfW!rx#_D22npUHF0tSW75< zL2#G<*3rri!h_bg;j1;8?L}e~3Dd&{nMyd0d0~T-F{?Lnl1EF&u%)_TXW?OQrikEf zvdWHQm8Do^X;yFj2CRN9S-pr90<1p8!@eP&6~2MH+5X^nX3yZ;wS~zBXjZ==E9M)J zDlAz&MOJ5lmGV^Q6&6dnOb*KR@AXtY!%iL`QRu9`qO+`MnHD&pi`X&fJ&1G0Z18e> zE=#i$ql?@!qX;JJ-|cB70Yc3l-5W-90UkEJz<#b?_426OspEQdZR*3aYawObke<{> zJMfvXgX+)r0pVa<5}Vd$&)ML=`B-#mG`K@%!eSg_7^h33fym zcUY7%I@XS&BpU3*H)!D}7gxcdA;+PN1E4Pi{nkh&KBYXi1@ujzZ_SI0T!UcHfqD!I z97qTXyPxch>~Z97ux*KgfH4Q$>b_*&vyc(=^oQI4aRNwmfCz@sU0HA#86d7p7M+Ed z7Dy*A<%$|^0?h6aaCB^Iq@L|-M00g2#HTV^!1kMT8JKloGDx?YlbK1mGzy=Y0FV44 zuQk621Ai8G^!DE}A2LN#R%7HQ1*LJIUv%tiku+_> z_7kEhcg2@Ow9e?r*CGWN_TgYBd3E-lztz=ksMJkFc(%L+J&-asjz>VK$5@oKAsT!Q 
z{wRcs20N?^f-U81UF$Rl0&z+<2ZnZo3r$$f8m{nHxQrk(%-pA&bwp>f;OImJ8O?e1 z(#|ne9~4}h1DS5qx?Oreb08*Cb6}jV(;O(K2f#UmI|jE#UZf|UxZm}}KmVWWiRqxy zp7?$>Jn?=64(W;Sq8HgdCLsKq9tbm>^6@VmhyXrx7C*`@{CZ4`+gYr^YKUl0SA*L) zm)ma@H~O^<#Qt>%1u^!54syOT)8+i-1FoEB8l3qJ5cW5iBBY!-rcL z*OVZf3zM}MkpcWPlN_QW4-jmmb4r)|&F3yZe~jM=2EP{aJJRLnCxe92YbQXMP)Y0R z`&|dhFu4AaT%Xofa;89EP1}CC5SfXk^d8J)a8u4$TwkbZKE4PYlLlt6OkenYDVf*b z21|=^EJ@NiHL?>Hta6!vBm78l1dkPUqIX<-sWN5_ZbhW+&2~ksVipanDl#W2N3(rGtYVr`k?A=DE;EB9%Xo3N+r`fr^_(X_`JK7>VBv;DT#H#S|_LL{T2@X9A*6~Empqp^yvS4IQL zPz!^TJ)Rt=NZ*(ui?LR*2@7eP1L?xaK+_yZX2-{SB!#vh*%TH5%VNeqHiN4 zB9CM)85KDwapSltgq;*_o`@0|_vf<S9ju2|~%_{WX9{j3L`I;=|tF2;-;IWm`f^|dwa06UOuc_X;ReSPlGN4kNp_ryx zE6$ZdT0>#~3Pp{K64}vWlfyEX!yS?Xgl@K9XPyBr^Dhvh1G7)A$&>*6*`oB4fI zN|r!67AOR@?PrRqJv12=ZI$ptrL^cs^u#mRD8Ra`MJ4BY2kh-hZiLytLUuK>Os0@) zh0=Ayo~^<-{aSZJ`>|eqaH|;v)R4vYS@1x3@qr$pzWAFQ>V>~OD|@;wQ;!Nt zDUqDBX$*_q35ErG5;|Ts(e05LF*!^fTNXf$>$Dv0W#Qm%`ctPK6}vGi>ZPjgYMsva z5uHz1z(8ZF^4DGkySl32pW77g5mAdmbCBSxaIupm(sOsiMs%Cx7|`Ka#H3HZcKhkx zKf1Owb+~UnbogtH9^C7sEinj$E%A)AW#^V)yRW?zf0ML9nQXZR6U6~vQ+7nCm@aE)~1U9OQ%r3Px~`Dvt^!`evasHiXI zo=SWnvvy_#4n!72y@2%kK6-l4^B8K15fC~8L^Lu3eXIYLfoMn@bgF4kBQG}sH^qYB z|2Ix}C`ZcXWv<}nrokdaSwhqrX^Jtx{-P) zoDc!mEeI4E(OJjSLQ8sYhnfPYYbg327#%6a7{ee@Qu+Sn0Pw8TP9=( z%OVK4Q=jdT)|5Z8c{_h*L6L9V( zdntaTu-H&0D!if+U!>=)+vSj6m8q&+iOh<%dm)%>=ewBg&e831JHcgYMSmy!cT}?`%=KD{V*5o1@K$waSDv>c)jqG&FHlsCj>-O$ygMq0N7 zC`BYdy+i`kJ3#`Jrk72eQvL;Vs4S=Sx329o3>(~dJ!~**v9|ru7%U7!Fdo+H_*yCP z8sX_$r*J&sC^;L-IWCWgu{6`X(PVkF%W|Wz49Ud-C0Sd`)QL4yNRCqxu$1roZj>9a z9=pSp+033!WCQW7Xu`j$k zpx1bjvCku2hK1cz8@;*%tqA=44yP)iK7xUB0NI2nk#i{wB5NIiMHW)0nqNxWW8=O! 
z65JIHKA$-8#^x^D)iHkRm2POe@pji_ry1RRMH%$G`Ij&mkUr;nB?{wniof>Xs*BV@ zi>Do?yHZRsd1B2V{amICZgp`65-SmPd1#VuMfo9q3tgRmRqyI-CWg^@(D~OGOj75?Vt#-uay2P}GsYy^wVivwx!aN)|CFXW zl!#FzHcP6s;8r*}k_k9_OnDm5lKt+Mxf+YQ?s6?);U5A`!ZhK(HIL!4QYX#3WgWqzMuuS&HT%S;9QTO)PZw zArPYd6y~&h=sy#4WN?VBFCj36y`E!$0!D?V@N6Xd;%u3(__nMK6(_QJMJUS$i~Sav zWEnOBOZ153hWGe#P+G8E+~U~AQnl6Ruc?+6+lm&^1!GN{jAG+vzki$VC(E|oz8We` zcv=vdAmboBHI27=L_o~4;_#AGta5C3m6HZ6k2&46UWVvMWd|ZZeuJIn;_>%L7Qd|~ z!lfmbnX_XxIZs3?+rSH!3vdAiOYz;+0z)yG$4PDFy~S{7zu2?F;2@cR7DnvRA%x~Z|@V7&(<|f7{v8f>n26u?b$Sn}Fix5{108|p|pi@EB z%>s(`z1>(BQ&z^XUd)b!*w7P)&wARH`Ijh?ykA7DZO1vBBulNt*K?9gxvi}8AnV9& z1y>B16ed=yFVNT`NPikT0O@s`8-vo}h&shM|2mADJ~%~}%a_eE#M?qYFn3yDA`9ge@To@w;|%M`3h9RE|R6rU8$?l z`&l|f_-mh)1UT30t==e|k|?|(b}mk1Yo;=S;+mP*7%obCvtZ!sL(0JIXLVPdQM0?0n7kmR7(T9JWQlA*?e40Z zJK%+Y)@+Y~yJ9GEh;P+B*KL;RT`6`5WjZP6sa%~VfWzmz#gO3%rrZ%;8Iju54VerD z(1@V1TM&U|7xSOFqb_JqY(V}*0YiBiy*(2KPOH#zU57S#{fFY{-KDnJ!d+|-*=q6IZFfnc#qQE*PBdY;>1iXH zb(FuUWOp8vzD+}Wi{A-ay|4~0IO0q(zZxp#o>TRT>8SF-;k#{C>`?oW{W z`7ZZ6BzFv4;WFK>(8SQO3p5^XfUoLA&=s!Q90-7;^x@cLo=$yhMsHB#;|GZI!Ch)k z9V_NtN#Vu2de8sRYkbBHK(RNh&XD!K8b{1`wk!zw!3VayJJ#}Uw7eTFD`w6n*CZW! z8=x8SM61OeAhA0TGfAv_M3LTvM0`Z1j~QR@=E%`LVvg8fJ`%SUu9%L_c_=1Q2 zcbago7$fuImF%y5^ktS{o9Qq?2j{4 z$V73;r*5LEWC2ea5XqI8feX2oI$Qy?Q=nZXoUGQZ8I?`PaS8;yuwt^L{DLLgm~Uji zIQ5XrMO6XorATY+FsZd&SlA_&Qhu%V4tCSJa>nFY-!*i1bjeybo@KJaI;$uCymmAinpNBAH%x3S=GJ#9GE8wquAW zKUTr}!r{fQ`KovdyA^zqM!aTbLGnn5bUJ>np8;OWoQLVj%+REbGF*Cfie!@fT9C9a zRvY;kf1IA(du)1t{}Pu64*B$SaIgRY!A&jqI++m7PX#3$G}xiG@u`Wrxp_fw_kzE& z*v($uLGfRk#hg|ioc_JcVv;NnB>#_iB>U(&LNm49G{u{HH z+>Q|3{(mrwOg3c_vS`}(GK)!)FG&6yvzXkzDxm;B)+|Q6mZ!l#D&}zdO&oLN&bTz! 
zNJ#7a4c$r#6>&~%wjig0I5)d6=z2~89W9_fC^edJgtIi**0WKaNg0>z zy77a-C4cbyN3r4;lbGxiDHmf-ZT_22^Kg%)J3T(lvm+0m8#?$?LBYs+^^u@G&xu5u zCxA1m+~o5_D6W&wmI+8cuY=pk!g9b5QpVl$L2uRjb=TuUG>a zMgzZL1APu|;8yPjbo+0|7(cwgmE640aSa(ih92!E`I+C&*pFh}$Vsob!%aCrq+@CD z)9S^ajtcz(f6GHd@pnP!Z2Y}Cbh>{nRZlry$x}DvP>2Bk6n3&NDN>?1=U_;)iTLqT zMQe(d)mRd&`;fK2_8wgZ(+_%0*p%mskI#2iH*NaVc&Pp)oKdLmYy>fxRr1a^GM{Gq zsn=Z=^I|McMm^o{tH`3lWl^o#_dhxI)1~zqBCFZBStAr|FG3rmDshS}+t~Mx+x3UW z+BnH*V;bA=>o(d4k@DaM4%H2~o)$Oue}<24VC-`XQN)6?p&DQeOZ;MkzQc=56x;qM zs~Mhf{HwP8B1%i@b^AW@k+C2uLKl`13wldcOGInYdWJChx|G39v?-4s4wDw<$U3aj z?W8PJ(iZ;pIB4HbJ4JgM`D_sa#a+skAN7N3>hMCU+CdRd$LdO{N?oeK{K5sHbI3@6 z{C0EM zZHgAbW|cHNKXaVPEnp6v*HFGyiwd_MBS*qklB~mE<00?Kz=-Zul*MqmMjD5SnP;oVN4t%j9ap?Lq$}=r{<%h z@EmD}-$nAklg}~FrXq_FSKnf^y56V&jvs~!Jo@rV8J!uT{N-*ZWE%RH;jWQ=r1n5( znUYY2MB$)LSRu70qbdt@1KF|Q^7rfA2F%{Mhz&eGQM)+9lCqW@UhSC>t~^F-emeg& z($3ek_~32kL^ttp7lc}>tHs6p%owL*3^lvPg45}mQ@fZx-OYiw5!u}6!9nuIj~Iqe4gEZ+WOM{w@gh!r!aocY&nH`ZNC+Ik+XsOZVzL00wE%eeUYw zuQ^2x2aQ_buMyw30`n=93@YcBNwe1t83G;Mpi(>zo+{5Zosaa0Fg&?xkMLxFt-6Iu zrb}Qj-i%e@7oHb468pU&1``gTQZ~$U``R4Q+i{u7-ZDPca+z)!X8Z=pA;u!f!!h#H zj857_@>>n^Y)x*AG-S9{pb_IJZj=UQatrrb?U}zv{E-n97B8GB&{0{3BKTq=m>>DL zo+2b0BK$qBDbcDa0R!e4_yRrMQZ!KI-d%4)n{C+XXc0Rd?Bh0^!nXNB`~)D4tA(n< zC*ymlzzWycVntrE>&JkptX3uXh)W7oMS(coC;P#09)3fFzxG&!dZ2L0fZx=lKkdRf zx)Uzf0hdF{%?TGGA_r$o_-c;ZjpmTOZ8VI~b+yVToXN5mNJ$e5%JnfyGn(8$%FQn2 zNs3bYaCzuLilD(!g4O&OD>E<`k<|?@tF5vhph~lS$w%}gL7WpKz5q4!)G~_1!(8I! znz*j8OxQq9UO(BXI}U6`Z)hDWIeBY2SOg-?~r=yb*_t;9sM$?cZ`j-{IoW4G*HIfJqBayV8P%izg0|lo>VtvGiY7= zHQ=CBnWzd++&*E_!~P%$2KouM*rMiaDSY5?^c|uJwv$ta#6&b5=zfZr6BDr=|2Qvb zW1)oaB9w^!b31_}1|6r;JY%%HaIL2)=CYlawHr9=WEWtp$gGxr%%uE}sR8kM54Eg}>*=Xw5YiuMgTIUD^v2ZM2@vN&&bG$8AV)dJH8K zbwdW?vZUK5zbT9jwgXyxV?sMCCX_j+rh$ij4usMnqDj%lRN^rIWQM0oJxKi-I#5${ zohJK3%|mqqMQ_nRPNpGqHqH;q{%E%Q3(Jb{5t8FL zfyT? 
zSiSuh1`B*qgFIVO15~r-w=LikcH#9&=I4WYgo6hJUm1I%!2?*WKPK5F81?wf5F1r~ zCB^>EK^-5nv)n<2ZyR$@s+c~h`-R%u%l$Pk#k#l}tzJ0FU$Z72CBxY}({yY+_X@*F zdyjOT)VbGn4VhRn`LwaIi#d-GNqFJo$eP`iyo^z5Y7nz?m1+euySXLfT#n9PgwF`* zfj3k)rGpy3sLWrp#I>7hl=IbK_4(iqh|z(`c&kW2X$ySNhzTf0yck?FM9r)2dV08UR3-C1;rCa)>CW4WOWa{& z?sy$d)tAJq+#V3LhWF}uyB!N+SRuqKu7dd)ZDj7RnTg@no>UKXCvLrNb>n%ua1$IT zFhG+?yTz!G-vrAmkI*a!iS7$_wu|rP(Dkp5ufITjV;siyoqF4gljOql4X)CwmQIyQ z^S%Q1fkqmh^)M_h+`3%|qv8!eXXPs3`gvsR4lQ9ib%C3Q7w<@_JXT}-2&|md@Bv1y zJ=nK7!On^a;_Wo%Q7pIJb?_rt=ptbHBM(WhYJ>4)rIeSa=)v%2fg$pj`Jhm%OtnkE z6;Efy9MJ}k?6otRP6RnNREYgCB#4~z$`yNVF4YZLVK(uHSQBD53Ld5=1#lMqE|E-(w#Po}r8&;1UUgnJm9a!W;m$yeJLL={~$#?6Q>pBN}bC(@hdIaiN4I z8EzIGI^DmgX`kgkwFy@#A{(riKMLbg;UOC|dc!{dxF)pG0h5c$&#-dKL6rsiSyX;z zpZ}Wm-m*NnQ~H1qKN=WEv14cFg>d_ZZl8X`a3@4@4} zUI+8oc8S4*-V5tj@o~_L>%Z?HhYdjlCwj zuY4f`_l@A1@q_m;l-aG9xLL6 zsEY=MaVeBEC9MkUe^TTbIP#V&A;{ZfIKIIsxVt?$JV#vF^HpWrmj{*9_$>AJka{z2 z`uXsEL27gkPvPeQcMC}kae%^>BX%75{WxgE$@?Lup+UsQ`nQ_4#FsvAz$pI4g8d^e ziHK%j557=Zv=y(`0^Q;dQA?c@%td#952gC+CcamH$CN^+CX1eGkv-J{tulpHYUh4A zy2)Fsdr%@=>_=#|tmpUA>KD3+AFfqucdbTU%Dz%sZN4~>R?ng2$7;0!`lJD0qgVKH zRd1I(T+8qyTrwr0ng|_?QZ_CYBY<28Tbzbr-r4h}Z2RuO4~iv?7-U;9GMNfQ#4b@AORpFPZs8_;Pj$2XTS_Gk2*rYw z3*FA)h6W`UqP#ApUErVyqbCzWCId42&Snh(HrxNcNW6;-XFNJlr2tEI*F#Tyb$V?T zbnG_0HwSO4#`l2WOV~x}UjOM$kzhSjkG=j=gZ04Ik!r91|FQQjfK^pz+VI-@+;cmZ z1V|9c4kQ{RgmZEuL@oh|LIp`hYO8e)NzMtRh9vHDf(OJJR9aJtP`~#7>r7|rICjQf zzOPI>?I_<^v09-WzYg~6ZF)sMW}2#PQ*Ar8b(;Tq-?jGToFrgJ`~Uw;-AeXeYrX4T z?|N_RU6;MjJ5Tk8>z@FYt_Q5r_rDZ=N7nVRat z>%)h-K79C|a`feR=GgAJ5>>!PHE`l{82FYvF@DO0ZX9i(GBpB$jdy+I(3TI+SbN#m zv4=F?_4PwrK63n9V@tnD*bjS)o_Hk)6c)9IH|obf`H4?lMupG*#?Gcrpb<3vm2vt| z<}4V-zq#b;fv1~bU*lh0^7QeY{r&yplf({vd{VK4df)ZTp)L0u|I%k4{D4hEv5SWB zu2YA)P7R!x{Q1z$JlLlWbvz?th6WE9X|LE)UCrEoo zFURwE*CWtB{JWjr_+C|44_(6>-yvwZ9fHPRhK_u%K=bepX4m%+$u-Vo$pu*kQeO`Ph!0k?_X4mL4b5k}01e2Ujj zc3R;kJzM^To(&7;3*Y$;oAm@jU#_G`P^W-^g>b3#khu&@yEW6 z3DL=eXYdzT#mR%`@I=?igA?j$`Qsu+*2IFABgR0+93=P}|uRyvR5Wfg3_^B8v? 
z+tuUAgYb-eZ9M!TEN#j8?xS<4o$+tIyz!GK4`LCUuagJgZasA!qo<8`ad!lND|;QJ zcD#kE9d13mmTs#*fZN3VXbB%TpnhW|g1JRY?y+rh`ifPST2!)@|8Ax#bD<@VPy;_+ ziV)@TT_~UU8tKD0=9dx^f3Vy_N*}jm8XoHe4fK@;&=8U|tR;kV@@Rm31f)ILqJ06Mec--;$!_>c*NTb~S13^~*h-uuv@1rxv(MRgvk~PfZJ-AJ@u0UlM zIDP`)@IeyhaVbaaUq1Wt#*ZnGL*3Y;v2aJ^n%MIn=S85aT?M4Vueum#8=qA*ND$x2 zgJ4sA1PxEUkg;0>yX-2mwR7K|v9){ia4A!_t5R!&c@VDFW_smTEuCUT3LhFJYyJXw zb$q8}-}6utsOV2|oA^~Lu@5nh)K7ln#O${9b1`aI`nh;D;P*fmWhZa3tAV&-*{cBB zB@U<}c$AY9n>o!o&=l}O$@g%(`-`7O}uc8O{&{XJVo$7Yq#9QJ%mh=MASU_eK)nV z@wyUjmH7l^J`D+F!tdCT0?x zS>d5jIy4Cd#cNKaptnN;$G)98TK^n7IFRCB;x_T|rs=fSIkdWs z>6-+<#HMwtTZGl$BT}-~>r?D&Ui_|`Jj30!-+aPdG@~SJ%C9HN=g@F`xb^?EGvz6G8R+2-;h7neUmln+CDB$pj~Fo_}1tg~N#?ezQzB0IiB(z8v{b3M_MoPnP2 zUTxFk=41=e^Fx@rq{m_nbM5orIS(muS^zAF7wz2|>o}BJ6y&9S)>Bf-<5x|MAOA;u%nlVS^@#&u%Ls%Tjhn|drE!ZQp24J~A$ZH8(Pgz6)M0#S_mXiz8E&p79UaU|8 z@^koy`oD%U$gaDwI=R!aj^F+&_vV5d+^ZT0HTI?u{h%={u3{;9RT1w^*~xeENpQ4% za^h1APg0=8Xo3x8*CWM+N2V1TS?Ouy%mqLQ?7lT|a;a>TvCxNH@1iHgj{`3nQPRf8 zUg0xwpPS^{hbH)Tco)=q^Z3CLzVZ44hCCy@q-!j^WXsX;lI;(Km)vwLyd-@hykz(3 z@RHFp;U%|Ey8Ok#ji`zr<2rdTg5!%U@skHD)#J&7B~~VaYijY(K}iOy;4zM|4;p~M zOoDDAOOYvO9BzG;)DAp)_%iAqMgCksTW8$Z3s7r`?r`h>RMo7{B!f=>L|5_a|tK*mxa&6Bl3O5-vjef5tpOPe?gvgBZ~+lED0 z@gb^(DvVotibonbhYNt@#~_-%%OgQfJ^}fYc%-$Df6Mu8N&4=TC9?jLqDUSt-gxM9 zq%v42Xt5Rwt^!6|TO;Ii#vcnZ*3Zegmzs^ek=jZVHPY6T2N$R(^qpHf#FJ~~B});> zgZ=eH&0UYt%^r_9);=`n>{T=%O9;uYS1x6SSea6%Q1j$L_&%jmh;-dL)sCd2Q=OcW zv&KV*c_s=*Ql@mSf6p34bRS~sHy%2ySSo)24~s&-fF0yHMEEpp^3kVD+2OPO#eYjE zF_zE-DN9i@7I|8q$AKBr+XQt(9nXSq0^lSdCaLIO*)frSL*Pn>;P83|D)H;!G@LN@ z0|cZz?&m-bE658aQs;bpvne!qAaI=m_yWlWO-ZGG8jK(B`UQ6I#AxltL%wgWmq8GX zeurB-CBYK(w`l2wN-XYxOxmR+XDn0CDthcjJI8N-#^uTA1Eh$2z74icJ-!fCC;o{A zDOb$++iJjhH9Hu^vr&9B9L3Be-#SIq;np7|T(rLn6@L6I(Bb=Z$8obi#qu1v9v(@z zOrQ+11^Uerz2r1LxRs^u{%D=mU9{Eugd2Mab@y!>53AvmgOKkMz~Rluf9XF8b;hei=W zq2BfvH;w;#9Uob1b9eREAc1w^$7G>s1>$FQ0y45-(g;%AsnP8dtF2a^!o#L$vSwc($Dr(bsc{N+*nnC9<}-wnRtCv}g~ZTv?r0`9J*i8Ow8IK3EMdUSbjI+IOi 
zcOM@??&GHzJlKz=vWYIXjOR!$<4+!^5}tmd=GCC;3opjk0rL+h?nU#oI=YTjCW?ez zl2s*_QpvxOi1D+z)Hue43LY0m--h3@B@%CZGF%Pcvihx8)QIDgz|2ied=VtjH-0Zg zQ5%Y2yyO3T{H=bg{6ST&RwO=p*FQ~7tU;?2|BTl9!_~wDA+|q;;YQF;j6P12^NlNi zw`u%0;rINUo)3E)Hy`%WDQdCtSvZ=KJKu8Ht7FAHgnt67#P9Wv6UIc#!P7d>s4*`l{6GtigJWX*x0ogE!LBO~dd zl;|)-UCL1Z_ZJDXS6neXI+z-|G;0h4!nid(uya>7WAqOUrj5D`po>KEhHIMpMtc*9 zuH7Tqy;rBrq4eOUJzZUU_Mm9R2pY(y&F;2ja#zn_|F(D%x$Pt7@a|#C?z0OLRM6U! zN+Gwud)GEfUelJ{WsYXD?bM?r8fhNsF}qv^iKdakp1$<1;Xzc(G$j+~D{MMnVe|P4 zTh3S5dcMN8^A)Z*Ut#3Fl z>e-z}P6jz!(z}Pvy~Y)T!+kd!*Y;%7M*Y7TWvXV#K^vz1&wFgR$WbLqZOFd&QZ2OY>l*M|g`u)-QWFp8m}o{TXt zWDJjHM@F;m(oO4K`P(xvQKPSCaG=-h$qoz;Ip!yeh=;)EWJ~!P*~IHl?kN7nGuoKZoh^UG@G{kU%n$tZb|lmLs@gQFKY~qLK37wd%(Pus*vx{ zSpV{vw|N+yOj!alMusy3lIg@A<^Kqkqo1QgOe6DYc!=1S92lBjPCZH+3IKdo^i0E3 z2#2`=T?N)^+&DbKblp^Bz>ZkJhLks)YhOB(A;DbHy3|Uwl!tACp_&StgXQ(@N{}*a*|^WJ8%(q| z$EmV3SzD&a$w;y}3&~rI)G%uI4ev^ua5e_HKm+;NJZq(#T?P+d#whWpNXlV`QK^Nv zpJ-0r%wt~?utV8R*I=kiv?P;m&zorLR(|-(VRM}WmPjVEX3tQD9C2dB=unTj*U3os zZ|c5U4WHL+gLhA{Gfs|V%}v`@B)ht<+184dhK4bkr-%9m29w#IoybhK52QfIKz6SK z)|BWOytQX<2G~c_$q_T18t8*9b#-+Oc6DvLDv7^)u8SwHLOECuIRY|3L z2D6ff%oXVgmtlP2acT3WZP#_Vho;><0~jNB_3TM2@2G}Xa+FeIG8is1M&01xDF0FZ zMb>kaNe%}r;GS%c6iT*7VsI09G}FmlJ36fHx2XGWbvM<0NZm)&eNf&bs(wV(kEr?) 
zRX?KYM^ycYsvlAHyH)*eRli%+?^gA@RsC*NzgyMsR`pM-^3$sPv?@Psl`DLw6~5D| zozpi<`bSjzBPxGH!H+2Thplqe&WLJfM71-b+L=`4ld61Dl}}pb3h$)CJE_{6RCL5u zJ8_jCSMYHKA6NL}3V&STk1PDgRQWMgeoU1gv&t2|V+!9f)y^^1{!w+0sC&1%pMIC( zlS)sjbX=v6sq|5G@3!DAICYPx`{aPc=ieglwaexGN1NrnTivbvxJv(}y5FJh#|z*r z_}T8?yE?x;ZB2gq%xanLRrlHO6y~p1@S@-<7$p)dJfTj~u(=bn9h{q%*CSW-^23SP zIxCCs1@+Y^)6}Hq?8!oOkqr`$D3^F93h?D2M`9NI2|1`uLoY1+J7&ROfG^SHMugs4 z^^d9gBNn}dgsF4^K6@%}4Yw3JcZqkZ{!PFr|X{bTAb4(G$QtNx5A{38l(ht*!M zgfDE*NGUv>k8cq6HMy1^$L?;FOzQk7r$y(d)m=FUVaT+2q*GSiq3)+SUqdD>9(wT4 zTksD_AvzTv>u%$dc&+y7eh0wa|E5I#e@4MIFw#fU$0SDW4vdsMRWp!@;Xj$ZtVraS zkMt!m6b|>vWi1*q73gf$^U5>n-GHl)=jw6|m}fQ^^hE0uSW=L98vDdRYS|h94y2MP z8gOe?8*$E4VUV59?Ao2ga+T>oAZH*YlLd6yWJ@GB){Wl8jXL!6L>vn#uI@U01enzi z7aVbjVjZ>}6+KLY<3gJ4m}y#Bp@?u(_Z926UzxoA+D+GWS^Z90cxQ*dwrkx6w4(TZ zVfbxZH>|tjYMwpP)R-2M21kF7_PSnYeWDlh&nvcGbImj&T>O;5?(B@W`s;$5k?{&H zHIPy9?sRQt8O;C(y-8*3rD`l8S<^Ku99avXta20p5Z=GT*pDOL--(nA)-%6cueq44L! z6=>~DEvR}}(&!ybC)2|zNk4T3Mq%qE{FFyXs1wa(xPvyOWg$8W`(zXjdA7n&6`y(Y1>R&SG8` z-(7H6I>!nvR#<7t0&~igF$x_T!4y878r*9MU^@N+>ak)+4*s>lDaY#c(?yTO>#P&> z(;{@w9;~gfEh%bK=^T@$+Eex{dv2S(^x2q!NbAud&FO4K9mam7WxPHtAqz z#8=Qkw*o78Y|FRF^Mc|)YtVbQIHD%;BY40alPbB0amCUV9x{R8Y|f!lROLYQJ$+0 zJE;#$!FlkyW5t4gN&MI5)9V@o^67BTPeDl5D$QhvNAki+xMxWZ%kwqxO~K_UL>@cx z@!4>*vLhcJ#8{!F47@D*NWbI;2AmxQ9e`Z6rWYY%h9|P;hwB?PfmQ)3BbNQ139qqi zjUg=p0sW2UwG>5K`Y;nbQ4F0iXs@aP!&!FJ9HN1Rw{@}mB}F+)drtTPk0!SR6)u*a4ds_b0cO5 zHxcJqV6xo|c0(zfaZT%2N4}EhYu5~NuvE8DA@ca(lA9ZT2E~{rCkH-HF)U8Z2H)2+ zL?ItFY+zpub2CiLXla2qmE<}OjAA-nuT>#ryQ3pm{7EZ?a45A*!{)v%W+k^->rK;) z3$!y$GiHas)x=RM$qT6)%#keZZX37ah)HGyN2F*8q=wbQSW4Ww3G+?dEcsav^{*99hPVLCFo5M+(ia+1be+ z%nDaPvV_Z~2M6twFUX?4Q6AWz!mJtxF-A=~RDenKfE#}%okn<=rW0};n1lBN%95El zi(Uyw+PsFhfO~uTZoUN+=g>S z4Ko4Sa88_;%wUo}6D*Nn!?P>fGmyS@W=I=On$nDbuI?K&HVG(`*tp@^ocE7l6l2W`!0UFA@b|9FJnGiFm|~+ zDzpl5x65t5y9(Xq4!;>7uO4!YP?1<%BsQ$OZXG*Ia zTZYXxqi$JKJcB(FMv6bP*0ZW9N1O*os}gj)z;;zwz~m7VUIPPxdu`x?^?1JpgCRT+ 
zoe4$oF}*)CwMU|J>d3R3bfyd)9ZWgecfs}opOdAsWMpc-%AU~Pk;T*W>8IK4?z`wJj161M{Hm2@4r-en%0T?F_oq~x&lHISIbBK8+ARqAMfbe ze5I~w&G;M@UpijX8Fm2#72Z>{*@x?D-_@d1v}4b&Q~1z^Xw2-wRsxQe;K{o0-7V96 z$I+*uR8BMw?P(ks>L2C@GdYHhllcHPZ*)BvDO$b=r9lZ1 zRQHIwSE{?A?)B;(SNC?lgA?j5>qAB=E$eGS*4KoruL)US6S6+`Q5jwrvc9hB>q6Go zRefF6*HwL8)z9zU@1G*^7N`&-=~SrJsk?O}c=uI|#7y-MD4+|~F9#fWFzq^}a^mVv zFw{0P3-?NuPjUtgBPwa%UQth{?Hu`F)wk|`Wc%>X()R*z@vzFU?p8gbh3dW5%UzG! ze65#N&$9oc&&isW{eSx}@-e+p_8(Vx;~$jx-S%JqAG7}~BI(BSy#8T%iVMNWbB-5+ z+BrDf+cU^sM?S&}4P-fuNDm8CL)rLwaaiOQzEtvhkF_TL|9E`Y&HF!^4m*nUN7Lmz z)&Jw@^&iTPU)S;UKcv0AZo>X>cJaDN`-AH5>!VKT*@a#^a7q5PfZDQnEx;CcnlP|k z5T0W3--BigHw~H>#BJfd(5n@0X1o?yha0a2I*;TFR!}t0^a`qr%-M=<%4P&`@p`uM z9!nTA0$>v5?#+xassppUE$iyTc_8d=oCn5D$ZTMWmb@2qT@*~qx7(do;%+U!kY7hj z_TwGYv}hy_7iRS2=YWT=YyNq@&{g$!)>oTdeLpXZ&4=?sso2(ex9mdiBsq}wf$BHk zjh63#|K9ILlPKLYidrmxh$LP&Ptmcufu`Y+fn@({pJybuUE6h)>m7AlK>s!88J3~^ zam1c#?ThA_e>5F-T-YB?m)&F3ABUj%J>}hh2)Ffy1mSPE5MCcue?T|zx(WNknd|E& z?GI{JuZz0hV+wSEx6oaI;RR<^P$OGfuLaoR-)or;CX>T`JLShPMC}E;S1aS4WM+3$ zJeeBUlk9J6N#auwx8mC+yOMjFWIV3M`rn7t*D|{Vb+2_YcVR%sG+z^}!~YAB02U>0 zxFDrnt(no?yUk>>xv33&f9+KUs7m{^F!O?)nE4w^7n*z01|EW#8K<3Yt(*y9kKB3A zQ;C`2Xma+tC&9CREXwNPY!g;T56%N&OaDADmeG+W&jMEP;VO%?{MoHT(6oG;jJy)5 zJ?^Zo7Cx+!Y>iLT6E~R1pC34eJv$5j|5o{- z0{FjE`6}C{|5=rPrtPol@#pSSXtsC;YB)XHCp43DT)wVm50 zhOf2vOKHEAu4`1jOga0hNycMAS*1ek0mD4vVkbjAZ3JUHqGDS9H$b>YES#4AQ83IS z=1j{^f-xRZJuUx-oR44IDY!0zA}j*@WK0$uRSNJv~u_(-wS<+MO<=MhSLT4#v+b<$5j4m$SW3cKyJZ&H55#ZT4lZRk=g;_z)ub)C)= z)~mYKxkd10KRt&_4vdY}i{R9OvHBoCNJ&K=}6j)Zm4z)SeK4T;3!G}V1Q?OFJ( zb|)1-rnUF5gm}H}ov|8H_YuX$V)0L?_NDA(E6Y^PpoT^eC%aTv1_AS3Lru+uMydJsLD>*4r#fZ}HX?zSq zffY@!r@apswwI?T)O*Fp{oLH^mRsTZm|6Amfj6eToYF&y4R)stY#Kl5^Xa&}%Y%Y! 
zmOPtxv3%zInCm&}dItP*0FcHJJU!Fp1-$(Xc>5Xf%TP{wMDX-X7s1oBLC=7Ad|8dqpGacozi4B<^vN$9Z_Li1Fvk`uFAYQEFob z8PklWP>+5;+(B^yzONgqYV`pqoin}vMb53BkmfA#Gfo`mJ)qL*G>r*oR6V*; zy%ATs0qIItn*8yH6+M(q>3Ub1{O=?9M;Dcof89t4T$Hx_Cuh9*=S>d0$$>XH@CI?< z`Nj>`{kA65DeqM$fc7o9AN=hTRm&*<*YtZSZ(eV5;7ty^$$>XH@FoY|MoBs5f+tpIpE%abnPB(irSwN_Xeo_s=aE$ zL=))xGNjeQYz@-AW%!8~U8l(q!+*6}MD}@g@h3<>_i{j-3X75)mZ4BS62Tj?=fWuW zdqfiD|1u>wZ3>+*PMrb#x#gn76n>NlQ4aX0!=l#o@COY%{(FTu8y0~aRvHyaR6k(q z5}xWHZMWAg+pEHYB~9r(I{s2Q+GG6NP!4>xIVNAy|4s0BT!>DpASAydIc#AfJ#}U( z0C?~A4A(|E)_+hR_81-l@R2ax9aD6h)-rmB(hR@m+Hy4=W)mzW=@KgAsF zV|s&DI?{*ouY;Zt=pl59o=5kgwYMWlI)eTj*z41I_3U33mWT(-1ef2S%E7<)?3MiW z{!64pytGcNv-L&xZ%Y+m`I70G@q^)vf8i|nb;0=0Zq3)%$*_b!FTGemn1NnD(L1pf z{R^Ofs{Mz{xIY1;xxBEwH^4qFtp5-QT{ZYOqyGDK@yXjngCUsbsv>!Y9aQP!Wr$|Zgf zLe`JmDRk-~+avThatIrWB%dHwTNagxNG=qz?Sbs`PG6pVM&Bp(b1HZz+kf>zQ`W+zV*3k?_BsN zO7q3=z5w;lPyGWwsedeA;`aon*1$5Y&d~5E{WZ!%Qv`377y9qotbYoYApJk;lm9*gd$sg!X*LM+nOw_X z0qAXn{Rp-n57`gKyQOB3ii-Qx%388u6IyqSty_8hNU=`B_E z9yQCnr6`|@pg#1)!mm^L(@akpDTv~iH)@7`#EvOL!pCxDW%vo#F!u-bV;GM)9#Q?P z&6Sl>`(@NV*`FD~E&%GE<@(-OE*!Su(VtjG?lXx62FtfL7p9R~gjM||S&a85lASz= z<6J}iz97(z0Rk7LO)m*ZUW*or;+zOIXKwU}FJk=sD)^l%@p% zDuVv|F|gyJ{==Wt=0q7RMTB)pKk%H=M_7wa?;OzKHf56{UrYY{xN?k-U7We-gg6@`w0J2YX;aq1gRYL^;}Tw zAO(YYOaD>AzZP;IlHsUlED(_*k2bkbS;ocKeG?QGKhN z>~D+|fZ`qESEfYbt2BTQ{x=iky<|b@a~sg#GRJ_Kypn&d8R7axR(&(d>0(L)zED4=>jSuj@yXGLQprB050Q*t@|NTe{V4ReB;N@2 zN6FWKKV}&CFGF>4B@FhD{)`F`Dkx3- z4u-xH{_^9jq^GJmy{qQI&$9hPpBLwfLSJh6;@@zZ_+?6d3Y#B6hdpll*ef2k5V*&v)jyvxGp|tBeOUY7hAD%t>={%B6g)`mtO& zkvcZ@s)Vo27Ek?#{Jm9wzgK`+bOqTTygSJA36zIfKi(Z?|HbLAQGd(0f9y}#o=aHO z1Ku0VbJ91<@w-}FRRv9;{?dFG z{HO7k&II`9g35E@NL8-Pofh*^pJfXe>i>LC(~Ce)Wl5Rg!e13D4U&LZh)-=5Rk83C8C;m@I})slm2kRP(yLX)%8G4xl22u@ zgydOD^nl*l%HlAk%P0+ds>+4wNUI3afnC^JE#%MgA^k%As$7)g2W}7c1b#$qen5VX zKKz2?=~zrm(Qq@>tn@V|^{tr3bI7aqK9yGbi~jP^&in^GwTt9B(f8d zo?7&u+PC~8+J~-@KK_d4)81Gmr2xO`Ztm}b4^tB5lfTdyAI1|__yuu}HxRD{N&h$= zdIs$`N&6Hk{;1zqQUVV|D6k>EkjEo-c;w&UZ-QnJ0AdWKK8hb`ea1inS*)*y8D!D1 
zKO_FD`ALZSKZC!GZQYhft@2n-%9r^Cc^YO! ze!xolQ}A;tO!A&O@gC+c`Qy0DAG_q~A^ULBuy>}Xn5<`w)m|`%gG6F%El~PMzk<2a zl5+=xtC%I64i`WCiOaqqv@!O7i_KsZn$I$0|4|F``qV$JV0`4CYenQIfzKaj8ByAl z!wHWOt~7@CxS^#=^J%c2Z$Fdn$#0Ui%H{1MK05DKBDYdk1IKuu}*bI2F^fe9AbANa}R zfs7}j{xUTll$Ft`a_$e02c=~^KG1p}BL9W>ndD3Kp?_$f*DFV^{ROZ8Rq17+7{L?Nxvjw}J;L&eSpHr2ak|Lz z?|NyG4Nv?IhA^Q}{v7?U4HiWx&HkSKd$fq`J!JcLB|q{%?B8ST-=l|51_vgvPgu@_c_=A^hZT$tKA^ z1Tp?ge~mQi!~c?I3Z5f#c>v9mzaR6GJTRVw z2<vOLDvweY{m`&eCqOVL1sDS77 zhL}@dt~c-mjDrNk*XVBv;pcRuk?4(}f1OM(;?JNNE$7&W+6TXaW|`9-><94;>;WvN z^)HsUjF+RtU)tY-G#{}03w?mTV21m4tAD6ZrVCxL`tPa_e6d`l>~|Cal7B*bfm}Jr z<=?Ml{i!u$dHPA~g(M1|zrx` zBkb@O@R3}M=Qnac5%YmuK&B!ClekIOztOIwpNUMBs!FQ66-P@m|zucXxOFU13Dekbjj(ysX$@aNfYuVDJ2t++t{ zq%0?S^L4GjdK5{6F0xP3R}MF2JPCPN{-ccj$6DKeggG4+-Frpwfgh0opY(_qCBG&q zIm`Ze;r{!RdzJL+Kmz!s@53E*^LiZQ2YnG_P8|P29{1THj)#3xD*Gm}I z8b`vU@Av0?YstN0KR8JGbe~#3UwcBX*W91;`^Y!0^?-w%xA_o_|5d72qxggUMm}}7!4A5A$_AKZD{`+$QF2{Z# zxA#!a7kmKR!FqO>^=n+n^=9J#uVjDOK4|?1^!Vh@zrnzf%V{|}I%(w%?Q$huN`qeV zM`svB%C~B-+V7z*fZrDV%rD^A%@rK)SnHvW?gO{p4p?FumFLQ3{6|WO^arp%L;PMF zk^H|8>sb`8Grz&Vqh`E&BOaUBEEOKV2>6)=wB>X!t}}Uz1r4io%EUISHk+r>u+Dh{BXZsLi#&p>!YWH z#=}z5N7(m)%rfbZNPZ6sq#O9P8#&(*&-LF7lv# zztcYJ=Tb+Xxjf*|gZ|y0D{|}+^dHD9arC|3gLJ*4->82y2XjOZEc?fKC#KiW`dbbC z53P0E=TWna_mdoZxzy?ZBE;5aI79(I+cVR9iNg<+!@hZZmHb?ew6BC*8S@kLKe`Xj z(&7)(U+3_H>95V}KkT3BW&1~c_`4vvTc(fwODIJ69eNf!{9yjM8o-2kC$r>WQ@H2=F5Ni4E&5Z{A7A#4*$^JflRg2f29wkPs~527cZPq z2I-&D2XZKmepWgCW%?@f`j;o~*Tlbar@t)EV1a(d^7_TmP7UUO9bY{w#((=NHghRN=Ho_I8m& zFWa|YI$ZY8QOF}S-+?Fk%9y_S%D$n0Jb!c3ldpfe3>cUm(ys~^JTAoB%I6f& z<8k`K_JHY=4UaFILont%4?Wb?9o zImRb7TUO&!hq+)G#38SVt)j!6?@EV}zNo<}7WMZcz1&IfDnq*QGP`^ttaX?dFLk9+ zzN{eKu*5EhyjPhe^J%=|Nyp$tfM4L?t1ZL$R55=!$}5Oxsii2t>{7e_1kpF&!8Z}s zL2tcX&h#yJ(hn28OE0m@W%~;nk-_cvSEGChMb`3S{Pnd~lHwuI!wh4328#i|z@g_{ z81$CRCk8RT)N+(x?6g-;?O(ds#)t7FzkgSw{E|g>x$K{t9_pV%5BIOG#)en@AqG1A zsWt5K=Oz6l5EuP!dYFDZ0FG57SQ!bJ6e6!}M3%(^ZO3i2uY;hyI23behhmLBAaO zB>fIOOn;^H_rM|bqhtXw)S~R4x)S*A-(9i8hF&d~#*#j|{+wUiS 
z6;rq+FK(ZY6S>mP_LJfl)6eaVEJyz<=2QP2{>*pUllE0f43qpJeY?nMkLAJV6J&dm zKb0g<$sh7BSkJXCrVrzvb=8-FKa~aLc)8rV9%TM710{dRzG(wSUEDq&x0D%@Kl6#f z4u9nOECaAUl*CA2vj5f8e`dJT{}@eYClfbO`0nfkx z1R2yQ%B}gYGhcAcZ{>WUjrif3FSHT=ocSwCao&vafu5sgDF{>aa=IkU{X6q=Sk7nb zBztK5eh}*}`zzWgzG3{>k3szi_m}n;L0@!#WgEpqQ<1xAK9BNPgxcr$7v%>swN89? z7W0x^3x@-|-hlR_`>PoLii&Uu^(%P(WX0o*e@+|eztx|QnI$Zr5;Y&I%PnXj1?KtA zDVh(@?UyiiLiRUA==!%Qt zL(B)G`#YTV6X@rCIX@P)(B7-}^7@l6N70jl2Ypz7sj%aDwvRb{zL?wRw4c`l){?!j z{$B2^$8i0*j{ZY_z*k@|oo&2-PxE`6uZiw&SM=S@`svF_aajF{?qAnN9Ao`I7p{oz zU#Z$-dx3tyn_Bk5^p){}Qj1@8xizdGok~Bz?+S9r4!-b;Rg{<_>MhHqE;kND7u z-|DY{zpkZlLteY?MY_fvA0d9OueXY2{L$*f+gK0lFb%^lUqtas+)oA;C!HRt4#NNQ z*V!s4jrAOi4^Di6@!OY^=BW6C@eVuW)`j(1tQYx-K)D`7^99vB>BCpIqI^Mw%6XOS zUBxI@-I4l&@uk7vit+|(fAR?6YyH%rT1q2chX3fqleGVbbil`TVgS>BE*yltt+C5L zGBSM%l#|LSdiMcVE1dXEnOv&u;m{qO1+>=F2*IgY=T{-T1$Z}t}$ zKaZMo=h*%a`T_Z{0Hu5`3U-*0l@>a2PO!ZY0jz(pC-P^GzJ#GaD;@uZ^L+Q^%EC+^ z`Co2tURls;U->UP9^rJQ<4-v~cbY$?^<&mg_NN`H{k!2`x5B@!YpX;#>+4fA9$c>I zf&T=5^7Q5GaxdhA1#=~zbEH2mdw-DDSEYZH`bPHSviFB|q+R+wO}}V;!R4mu_Q~nC@ zUPoWg21!5W*yF=jq`xA4XMJS-D|hT!%71qMGa+EVF8^{?2md?Se@K5)jr2Mm53Tsl zk_V4JUtJFRVx4GS+7sLF<&OQ=hqVKl4#%G1f9i7U*&kTr9oxHQkC&0WTBUt+yhG{s z^^QG}y}0bT_8Q3NJofnFA=G!-NG>7^S z7|Uy#{gS@8{E-?j9e+gj8*}U%`hL_bDbKg>(pbKIpT|FqNc$&(+5RVwz#m0)B3Syv zF!pB}{8Y~N-%I>k>G+3w>aWW`^%MQItbcMmllny!l)poHgKqniNuEzbL6v>H>uTsr zhZE1ABL8x^JAOxehvj|ZpNj9uen{Whe*U!>{E+L%%wP5=MJ2YsfxpWeFCHv|{e%kq zPjP|&S?|oh*uSi5BYZp`Vt;gnGauvlZFL*XpLo8*{-=}mb-f+G`yKr}Oa4a2Z>#Kb zJ#YMk|LFuemwmwAtTgnA`~&fm<;VV`-LdyYVf4>mp6?$Pcu5}YfBK1E0iM7+`x#!B zz0mmVbLrmRA5+D! 
z_lPt8P0;>Vo;~M}hgc8F^Vht85p~Afvs5nQ2j)M2d6j*Pm;dzie+QDrfQ}{z0O@f#VytU$*y(2oWI1qpcLLcxiot>yK0yL0(=e zSN4zk3pl<};}y{tA$X>b@ip)Q8so=$at+2K6>-z~E{tE59RD%@V1G3jf9g=5FRX7+ z{GlVk{l|Qu#+*C%5>-s|uNu>c8`$jSEKb)X5O2HE^X6U3h4{ne3m2*xAEyn|!~4}j zbP76OY?%6-ISh|9)~jmF1qWpKb)?vruBWkIK8N8k*j^f4Df!pg}=sJ z+`QOTUSrOmj~UpQRUhvc%%gaQ;el_l=`{?bh^N%vV$)Yc^9kY!r46$jr;6z6Wcq8& z%1T7|W0WK5tEyU(m%iwtrLHvk&-?9Gd+7hgwBPQeE2`xJbtmJ;`O`({pFF?H^aK{8 zKRkp=`eO6vBTZwHOh+m!@#xSWt*S!WqJObje$hop)0~3#kLY|3FYwCqIeOJzE|K2{ zK)h6yYilE1Q{=RsqVp+psOR$h5cDVS{0`D|o(MA?`F<7XjY7ZLke;IVGm*Y0Bk*-g zT;#ORdeFZ5nmDx&|NO-azmAGRM70kKf|mz62!6uD^&420q>( z#5_8$MC)IBklq1)tM`5AeY(>$;Lv%9pE5sq{y#$J(Gc$zzW)M#RS;<->wiP%k-YRi z%Ond6^+7M*Pp-gmee1lUm*7dhBfYQ4?{DDyBoxo9^Y~QG@hiVi1p2DJ?UnBjkos_(9*@dwo*iZ%6qArLkUT;F;2hC+^-Gs9=HH<=Bthp~`nrd9;GH-!8}T#s^gS z!&F{gAx@|?8*gTuV95Dc<$e26=H*ee~{qI%hie-*29hfzM(?gt;#We5xtd2^7V9ByL)eAg?I`n zrjPm)2#`JV1$&|P)e4OAFXiRZ{{Vu^eLh1%BVEn)7uji9zt&1WwH)Q;<(F7#RlnX& zZ{_;%kMdIO+x{5yd)Q+I&6k;{XEnxW`*-w*$~i*X2$%_t;iF>;UZu-Vs5IV}EPP+L z&_2i>9%LMy zX3^hWs+{`+{e4)aXVc$v3P1M``dh2uXVc%5DxXb%A5!JqpZ^5oAsOY2l{KMG|Lo z%v}!seZWd9{XK29NBE(?6aljRP#XH%?M_2~kE--+`g_X4r}Q^y!Ox<<8&x^?2l{)D zO3$XhPbvJ|Kj<$-yhQJ8`ny4u&!)e3tMdOW`b+w9KK&Kwk6V8`^V8*|5C2j9{qhX@ zYwMHJ-xG?T^-6!wsI=vODLN&2vAm(bhDvie^tV%`S-#NU9qw}I@0gWV`g_c3PwDTZ zI}QDfD|q(D(BBc2o=tzTt|;|i>F{+?6m+4Q$o z;hRl=Q||u!C-A?dFXz)=(igY>l76_-<)ja@=KOJXc|L>nH{|gAhvO-{aGjTK3mSRpIYm|)<69y9 zckiX+fPg1{u>Elk6 zeAr4;IbOH@o;!`tZy1Q!?fUrsOG>5LzVQ7Y>*Dwv^ggW0*`DP0pBUunt-ufZUsUC6 zKc#%WM3x^xIrr4`zcf3YeAahZ}cy#%Gn>3253KpFU&_V|CthxxXa7bFYN$= z=Nst%t5!Mrhfv6_e~RMoqN0#ONAtxRv$&W<$(Kx*@N|(fWZM2MI(lEQ2Jg!_`{C)>#KtJjIQBaHZOVa;0{`+auzke2#wAA4S`POFT z`z6-Q{`h{U`hFGO-$(!B_?{RP?g8mfwjy0Z-`AAsXO4(s>wPkM9}Vpx z%^xfhUmp?dAK*XaCH+SP?{BmGD-G^1egBE}1NeJS=zlEd305P?=a10-o%p_%eIZTX zZ=f9h9txMkcC3rbL;OB{P`;lNsoewpgn~`^?`3&b;roare^pNJmleTV3Vy!|_3?cT z`Uy9?{n~eXt?y;=`HcDaJ|dzEc>%um-SYi?$cxMIeF3cms>27$UX0TB4sftar4bM7 z*7;#-1oEWyBEBe%^`<#kw#>8{-d!?7=d>;G#HSjBn?@3ULQ$g$dc3$zjskM^d 
zq~AH6@%hyE3Zk%QQqZaA0Z;rk^}P2_@%;^c|BD9=_`6GULC-su?)dNgemUSHrtanT z)c1U%E_MiI=7Yh z;UWECen9^B{5M4($D`qC^dGoOx(Dn1L*fUK$KM;n`flF$jlqw?^{%_eJyYCvCH|s6 zwEktVL1TOYJmDjM%j>;Q$ux~Gyxxm)<4(Uvj{lwf{RrP(ep;@!zc&}X)59Nhk^Tw% zxhKH>g5URqK3M$X?`e|#jLUez`y5_tM?85|l@Mcuk0xP=m$4s8-IM#F4^IES^4%0~ z(s)7rd8iEgCl>!ZC4H7XB@DKws41zp>4X2J?(_H3h#q_oI)GnJVtQWd`U~naz2Fb8 zA6n_xO22^$Zz%mcW~G12N+u3}EdHu+PuX9d{y<-uzwvzfWW>-poxiCv=)0T0%OP(! ze-R%te~(J~z^~tTd|r3-idFZ2sQ0@kzTfi8XRp8X)g^N_Tr+gnuOF;S8P5OxMqR%h z1Y7<6kKIW(D84DYI#tFK1?{jYxq!)g%~W4gZtX1;x45pt`n2yxLqp-!trqb=EUklK zE(MT^hFP7c&YhsI5GUnNs6~EkdF5S&uuEj=NFERprx(ebhg1a%jUvYe-zJ}?)z8+- z0&DMq>eF*!{eP19r`12I>JLgZ%wyW;EUdp)D(kfRud4bfSAAkhVf~fTnNO>~gW@?{ zcgq{M&vA)LTemBvK&I6{r|LiCs&C-Au>GIRQvaSyWW^ow#_iY4Q2%e`U^A`#2~{6M z4`0N85@})k%~HRo)!#tMhHIC+ar=w$Tv-1zs1H@2R{xl)e_GYIZkGO9_b0%Ad%U!O za38CeiB6SA^b`NBdJ&bs43%f9e?rxdyXq7F3hUGQMq!_*%H##=lRKyD2>L%$`^FMk zepKF=f7DK4`;Q=xOrX#Q98>joNc&(cGuh8)!I+uqN0xFG^(|rfJd^$4bG0+pKdLtE za<2AgvY!t?elxWnY?N&p^2UE>vY(&KQoml+zsJ@7O!XfCLuYFL*mBwM`(5>kmO@z% zL3lIOUzL#cY3t3pYVc54|KDb*|EVTf%gw);>@ThSccJ`iTc+2a$^NJrB2kXYvJU^H zLld~h_^%wJpweC|t+K^)c$lgFn5xf;3mCFLsASO??(Xa3V91R1RU9%#6>-t+GR*qN zDB-%IO8(R6UoC;gl%Y|5sPuv{S6zC%Sjk_$wC)s+ZI{p1@3~WbsPvU%4hTOzYu7I* zc%Mvx1&9+<=ukO@IF!C1Am{`b$-PH9NKIQ(NxKMCF<4H>mnts45t^ z7uwI&=>PfZkEyzJTa15&^@;Bk0>u9Ue`dHIR(aN)+M&@OL^|A`&}|+-QD$A z)>i-AmGG?GruYX{!KtTd8&Mu#FDop?y%UcbW|LRTEdCocTnRk>=5zV!@*++A^4i8d zX)`l0JY=kBOcXU{d(55btdZHBT+v$8*po_`=}ab>8Q7OL+FFW=iW<|oY}y<$8v93w z`i%O4{l6|AS%^;@jOGPc!{#p+Bwkr;Yxspg-;OXJwl(wl*gd@f9K#&gmsnMYWl%d1GSbO~z{D zw&oQLMj{?>Fq)ehj8gngIjqPlv~~E!=PSjQ1|xp^3wo@j(Gy9Y)t@!9$$_Tfk%46Y zjZH0gh)G{tD{8m2G#G6_(*`Vw*5;U)46bZ2R<<`7O$j_CRswTVQ=-9W4T(ul3#v55 zk>8Fg&A2zs6Gls-Q5*vX)LDUli6g>DG~>m*mJOn_t?Bm5Z}$GOY#=kxJD4^a!7Otx z7}^R+rp@mD?p@oG^`+j+N84H!h_Zw?T#MOkVmseo4mLq7w-q&->77R7h?&MTX=GK~ zn#*J2L;a9%UH_`41U?wl8xvQWtD07Tc28T2QD>%BC7RbVvq!7C z;f#1m>uO2$WP5H*G_~IZfWg!%pY}1a4^_~g8zY+c7h>5>moF35t;=9RKk=5_AgbH& z_<<&X?;hUMIQj$evIcI{^+5qrjk(K2St 
z*NEp*D^PKCu<_P`RJt+KEsWI4D)AYxvE}k2@Sri%q=_os*CwJX|A)~3+~>JfTL=6% zUMfO=F5U^|*Jb}qMB3gW#0T}@Wn%e?hMeZ{926RIn^%g+3ZK~41}d_TY1#oX8M(2} zY`zJ#`V*HI_4b&JnY}QJH~`Ywk;a|bbs`?B8)?uY=BVc-9RzpQjUY`Z`_2YMzDXV&gPxG&92iLzPsyqH&^J*cjI<){_{QP^Z z$xO8^(2ZRKYqU7zw@;il!Gjfv*7l|q?d{DYdb#;tU2D^?g?cq7<_dj_uD9u8p3r|K zbet&rIezk7%wPFwt>pcBH1TnK6QkV(p5dWMWnw;YwAzLzktoAp?*!=rZ7>p(dgILI?w5)#hYzEV2E1-AJ`8_ZVj4ETRE@QA-8J7HZ;?;;kmM4>@yM zkW7e@-+Cg6S3Ti{nkUlyjIXp+7r*h9q3ATV423%_!jOr6E#%_$y(ns$;zC=n5&*vp z=-lUEw-#;MrK8gT)O(gTwY084Q<0`NpID_YKvUYNCU*IvP1V|*)@n`sOc!$lp^JRt zs1{zMdm}B&eWivb?)2cV_Po{Ml}JRI{YVBPTXM7*O2!^7o8w-;>*Y5(qTA2Ek_4`Vdx?(1$#CU^A=_HRpE z9|~V0Jl@^GlGUMbqbMr9nQy_-&{%Jx4a%BXwI-y$H>7{fi*G^dcLdk0YF?QQMVhi9 zyf&#_RpuQJijH9TE>RMTZ}5gz2=UQCcm;l<>TTFd+UKQ5@j*|xCFIeT>9N$GdBfZA zBUCN=@>J6r;nP;?^{K?K@Gg`;{221_LkY)=gBXb4uX{F!{J-;Q_X%yesBHe2uDwr` zenW>75Lbze3DfQwN-ZrC@4#o`+e%A1SGF|A7X*ya-m3%Nmjhap@Vw2dXZ>Qp3s#9| zL~v11|9T+2IOy{{>#6JDF`1rD>M54KQWm{YPy~2OMCoUEG zh|pgs)}HWchA6rR_Q6(Fv`J`w-E%Sg)q90+9dxZ!>u!RlZR^l|FKMwRpQil+xhn$R zYfECdiC=rQ$2FWd)yj0Sx=f4|`wo;uoADAeJ}tbdL`!Spon@YNdVSN%R!zJm?8O&I zn_9I?dQ$%bZ5+j~;G8L|X^#I)ENOKBJkiuYYD-cpO@B?I?XTea z;zRz%OyWyIUtd!4C87OBh|fjzw-gh`CjeTK=F?w+I61N zTRf4rCq154p??TJrck2)qbA@hR(eHg%wu>W{g0LkY@+qQPn2iQdIA@Fv@vfqg@uAW z;lOUMSRsHztcg~y*e-PK{QxpO+Ov`PP*iFtI(d4k2Pk%%}f1zt*UUPiPuG ze0>r5QSIkqJ*=zke|bI2%Hco6y?Rv~Ltv+0gGcZ5*JK;_d%!4Bu0>b2c*VPer5M4N z?kv_m=J)3C8nN(vzRdqzQ1pmJW&*E6-RkpQ@7HhgX)QwkjJJ5dC|`<09>;`lc}PDQ z6hA15wjS}7v_PGjK8sf=f+j+o#{RaJPifKqcSn}>x6BjTUuxoDz>85}4?Traa=`eLW2Z1+Ux1oSsjTor^o=r*ya!3c>2V*T|V4J<=Um57S65_&0g>F2$j z2pDg_9kTG2hxeBTKc0y9rh9hx51YG9xD~OgyMN&ObHi~_>UouSx6?%s_UKi!&oM?|T&Ll3`DhNZkncpY5GSipa$(E0=617iN-)|)naOA)_Z<1Jq~=GCy_TP^}8 z0^X|9;4MA?>v-wmQ(g@#Za*y6E)M7$gy<466Ens=-jd3)SmIEnf6OO7>(}sk$z{Rv z!2c`}NueW<{%MK-x24*KfEXwXKQtVTqt2)OKkU7CbX3(IKl zr3)yAkYpf|gph=yq9Q0*Kv6-kqbOp%*u7S4=(SwAR&4ij?OjwOklc6Q zAMdTV-nwV4b3S{2zu(<^JxLC|HyS!rq&2TOlnIIa8lq*YtEiYLJ%NTY`XYtR!a+Ln zE?bF>Pp7H_O2S`SSdo@jyBs~HFL10RmA=L%p7d~_GVqnfgLf)aLqk?Rkhwx*h`AZx 
zEb#$M5?#G3i2uTzp-jg@^pRIJG;Ft(SJe!zx*?sXd5^H#n!(k}>IXOYiW_~!OMS)9 zsOLt|rk~lTIW(EmM_Il;&BgDd15rb+;@|PU&B&2G1H|I9##xyUWd~O-Q^uke8r8gP zRSV{+W@V#-cV`QO=a8%9C5Nj-VDuea5^N!Q-^QPpk?*xuwEv7`M%QLo|HLn8h@D`= zK<^rMK(~DTP{vGAG&wK~}Xrknd zvkEvpfwoJg0?v|{p2V}6li@dg$Gy4D;k#IV*ncBBhFh%=EO3PnGiUfx=53CBoXt~n z5R&kZlAw=jE-k|Pz5W}^bcPpmS7a5c#^YaqBwYF%!ykpR^f%G4{Dv?xeG5;f-<72L zyDWT+2Y2+_wr4^ivO(yH49iTJS9s)^a!zR>&v%t#%R)jf+%4VDoTbm9Uk!!Jc(?Q3 zrQt`|5Pwl2A1DM_PM)ae&n5PCu28Hex9=G@8ou)>-&gdgoNxx|L7wRE=WVFjSzg zCpvvz4Q}(BS(5LcirfFNTOT18xP3v3+m~-~`Z5hJR*D(g)Ni=Qmz|R2e^^d9lXMr) zqH1{SQts~N9gSf{D3$AjSibYjw}`)n5>HwB6Qy1ru~=%J0|`e4?Z0Nobi6bsccnu zCBi=qaJv!jvP75f!Gzpq-$UZ_u=xBnA=K;(e`WY-0E$BpTO}jG1>&<*d`=gijpDOS ze0JfF3NC4fYNRK;GKr9+xGkRR2jRC(E`DQQXJA!2d)cMj*q5E8y28^F)fG79CeUe8p0AVPMJpz^l-115-}%O&mDtMOa$2HA6+Q&{teiQsN))up}qZ!z7iE$A1NG)c06^>ysUx z>*2R2*{9fep9Bfbnv$>m%fYIMEl;-}aOJ7TTjg4QG%g?a!mAQO^l?QFkz;-K5%5?jjHro!>p61an_E`A}8~y2}kGu?j zj@mD~0xcSU#o`Js*POu*Eq2uJN0Ajg&fwiHJsm%DXd~ODWJ&k1t5sLzD!e=JZpV8i z-n|}I}i!(G}r2<`XmFPsQsCH|8m8GhM#LsjlXQCF~cBZ86fy%xkVWRVnp_ z!>Gp_74&^E*YjR3#+lOf#LibxJJ|+me1{_EacA)CfHFvP`WB?=g<65jmyv>Y=DTE; z9v0<)1tyVtrJzvfh>ur%a#QFQ#T~rM-rpBqn`*}u5VtA%CKa{iWrmS28Rl&NCKomP ziykq)cgQ}dFCw=;Io0LgkVI=Ro?V@+t|2{7<+iiPlCQDng;}aatvbvTdvhbcH&K&C z33S+w4oGh{6xqqDe~SniuFr za-N+($++Dv^e+Fe9=eBlfvHsdl1*Ce;NPaW{aI;F|C@;-O+HaP{+)^Z3Mt?3 zZ?GrMRv?zePPDrtxv;tOn-{_CHqT9hM}+rjDCz5XZ@ARA&$4Epg}uXB*pjeU<0})C zfW>;Vs=vh2`I%O?e<|KMm{`iSiZJ!IBTSiAm*1xboBe_35^_UD;Y9UjmA&HC-;>gL z4#dlJxBrOd^8X=2_(jd_-+{M5^ZJ9kGueAmW(X-ho@cQVn>XUiwj;{fPj>!Ivhbu1 zbVs5u{x}v(YOlpe>Pa4U9n=4A$+BK)Wp~TUF1udH`|=9i-m zQeunB;c0oDD<||1Z4m#PgMSChkG*hE*WJEs9due1kC_{IZ*~u(hb7^Xwn;%kr1Qm2 z;RXU!=XCprrkk2c1ww(SDBk6E2OdsSpOPS*#WGXe5%FrN;)OfcrSPa)=xII^QI8fZ|m#wM`Q#hf+;lrXF57W2n{nxk9mYsUW-WVtt^Xuoan}Mfh#gB zkvzlc8e}6Gl<13$Rhp39A;IBd%qhgqG7p12Yjl3TN6xk6gr+C;artK?!`*X*<8wX2 z2jbHzKI_EiTshS2FUhs=l_}~%Nzb+Pq9G}?p6Enj;>9$49u=RX>G;%1_#|b(L^#)? zZ%G)$l5}?@Nk`@{C~!wUgg?c5z#-mX#9!2aQ7(FN6SxsA?@O#s(qFY?*>WxHOHD3! 
zh?I7RKaki438^s3<@7)0q+8O+&|ZW@IisMDzZ5|UzNQV}6P@nJL?@ODPVMWCWEqac z97E0TOG-g8$VVw2$WAbv(KagxP&)tAt@AksheCB8Bs5 zT|9;VG<+#ic)3UTtq}dr$8+3)UT|lmaOc?PTL&Dw*qI!7HYY1|X{!D!3fX=*x8I3L z#9vbQR=6bIb)cTvD)OS~i&#&|bOmxzGY39%nF<-f-i2WAawP{o@#X{< zB@g8Pbh#t{bRlKaGDOObwYnpR;?T z8*RC+RH&Zd{9bYuW`MJldu7^|Ku?c?Ve)V#p9+fuXA`fGq$-k8=f65x{e#%$PUUfR zKrr+N@p7CmLC=ZXAoV_eRq16GN<5orwH9^8S-hZ^1$t+)62#1MrgjG72N^p@rgOLz z9;kL0j+CBFNJNefbG3^?WQEka1o;k&CwOg+w1BZsbtP4GhTbSu&tU8iU6?8H!^wx3 z=`mZ5@K~u+$-{q?19MZM-JxpbKPFXB(Y`cyhu z5kV+sY-1{YEZN|J?Tpw>hz+En7#2nhymtWJ*6`0T5t+VDNygHJ` zOZwE_@W&_>K?8FwuOcOsA(ivjk8()vGU+es z87-~LnrjEw{AHL;Nw7RWjn19J4yDqM8T@x9og+({EL|o^8IrV%OM^JSwtyD&XKa)_ zf3hpER+V2JtNbw271*N6y@)+GMZ?r4?l1XEo;&yeSL9KyKwT35iRtPn?hYg-vVK_j z-K8iOjdBMjr($tW{wfbM*L*!wyd-@$j{ke0EAXA7T#l^(A={giS+UArrQW_!i`5xg zSZ2R%ywp!@GncYeY1>@(6}fB9!*twPD$6TWz8y*D$-EIWdlwCopPS_kRIVFkwMv+8 zln!&t6nHwD_abZK6ha>QjxEeJ$ti}Ir*t^hdrwY0g{ZtYJB5bⅈLtVZ+07i53u_G~b3r)s!SYf}Ev4 zkz)bTv(xDyR(1WS%9cln)i@!Ey(;<$xcSah3<~X65xvPc+_(y>{=zHMrVcM27=Dwu zpRs8k)I5#?#{;DJSw`>y;>!)Upm=_&)y68i2k)}w)z9{EiU1c5!5Sq2C+L` zYy>3?Bf6DyZ0{RJ`e%&TW;Yv#W%~=6wu)FGV{mym<%FS;3)ui)U?Jy!cMDq_lv4z6 z%7fVef6-FLONHNIcaDDq5uNX150br(u`69s%U!`ICR*ySf<|Vw#vzc!TH;)6`xO(gIlf(tQKp;Drhx-}pGkbM;V5D@W2^(;^CjW4U(|#%xHHf< zpD!T$7ZNWueMW0kjL@(+juo&-AgHkCWZPJc{@~IuMcwA3X$q!3oQ3yeU#p`0)lZM0 zUs5tj6|2~xY-NbAsFy+2127GhnM7?%D_PQcFr;kZ!REx6L!NCyV)<~3jR$- zc92I)uHe{tmd7NvNXSIp!SaHeEg&pN|3d6C!|Vvs@50pV3_Fr=U_dO*eQRSz^y@Z> z(OMU>SV9DGq`?-EC?yZUhb?keaJDncU6?fw*7D3?qJ`(hY`9rrSb2iQF8}iuOCN06FgTM* z&%}|O%Q1CKNFOe*LxqQL2wgC1%mzyu@ZW@#QI_oo0#}7PHU{}5y}euEU}d z?;Iqf&q&7N9Lcg&=7UW)_0Lf_N94gtSMXMgHbN8=GBTR}tt9iG2O|l>B7vu4(E|cd zPG|guL}#GFo%mV=f1H!{$T{IZVE;{tB5^}V+!eXPhY0IMT1GVbE1245kFbQ zKTGsMqKJAZPIl{$Ak0P{ahASir4ri!Y?-prD3Zc9_K==bzk9JX+$MH{ts6E5DPFQ? 
zF&c}Jb~>xr@}ON5&RGtA2eHqjM2oCUcVv}ZuDMGSL?FzvEw9sEB`+k1%=ID5D!CWz z4;f#TH6&1)!?9Jx0dKp)p9=wcbUMyZW`(zD>~13r$e`!p&riX54!(e}4Vu<(0hIWkISeG_ zBIz>2GbJ~y7wH7XDd=(pk51$x$R3b#gT8RGnShIBI6TjhU5t#dJjIm$ia{D%RL-CQ zCBam?J6IO82Sr@Tg*OW@V|ZfBRa^*=9Apkx7+K&B#-`*5FZLrN3(WDdhmg7VSuDt5 zrzav^=><#}1_Qcok(s;XDS1fHk0=nSj8agt&&&!UuhfvfA5FJWiuH36v2p8&h>OfH z%5m^9ql!X29yH3aDCpy0S2|BPtqZ+HZ;-{x@4)}`JNxLaN2w!JH-5B+Ps z~GyqK)-rtlkV3XSa?5XkP-#}nq*EkZNS22GkP;52mgQC)Db zCQP0!=~*nr75v@cE%O+Xp>c6Rwt&KEtVujihn3E$Ie$CMITPntJd1%9OXfL$gUYzcEaRz+cD6jm7E4i2xj-gBDt%_42@X}Q6Z zo0IsY(^%_ReeY>{ivPfJ{ln#IYRS3PY`s=I+!y-%G-sqvY^`~vnyo=Ew3@J!ZBqii z?5oAL`-sgQN52hbzb)qG_L>c{^t9dBpLjBxujo#m#P?{{a$>rUT?WaNFhPAXUs{K; zrlPMSk2E9I2F(vDm>&9SLISoRY>P@ZI`vm4NIbhJT!XH$oxPu?Bo(M~KD$R|*g&&d zQTLw6E|A&hlRf?{g}3!)>t%+`%lYValaztR0UBMjSstghdsJ5T`|HT)liyM$5Sta@qs>*;voN&8LIySYi^IV{D0p{< zii>ORNTfP!du+ZfQGGLkdDEn?5>$*OzDz*R*LD^0$qASQ!d#|`K7LUepMn+UNBgVT zbFx#|MT0B~7$^2K_E;Ruu-s1UcakQN^+E^BNaG)o_8aQ+-;8yJNy3vX!6mOHn%Q4@ z-9eL)HVk`+l)}CYwZpk+DSu${seReoscbWM`U{wniV=;4>0BCN;k%hD{3EeteVolr ziPjej===WcG~#oI`F*0(qf)1={63F>#T2Vvw&Hj(s_CAi?!LlIdj z9jyI0gjXMW^+eXVy*2(_F8ksJ_5>o`%?1pn8Q58N z>$yCGPA;k$Q{)@O>}5n-u&paBTZD>>U<_`xoyz#I6e>OURfNSi+BbGwNjO+u$tJX{{}a1S*456S#4B5&TEkO-7xUw&~IL)K@-Ze}z!Xxgmk zETc1Wae2uYe;_y}gnr3^K{Dz_5a}+}yQl@T7YMh92>;H!!IM*e>T;?HmW&w}EO}hw zPb9f~cP3D8rd@zrq!e0CY)i8IcTZ8U(sxM`yT#?jfd`UStVKt6bWx08g5f~%!6D8S7fN)?H1>{_f&I23 zr4pNHVHq~bt4P>n{BMi+;e1~$A#EdH%j7GWv*<31G>{!t14C^%LQu81p~{=rxNJpZ z^RoK-ypa;_uXdKcZbiR{=FP~fs&So<;_Iqu<(L4M_Alj22cZh#ZxG!=V%lVtLwR92 zA@^*>SZK@$XU!@L9UNx$dG2urK^;3Bs@$p># z9;?G2EVW$Sm)bach*@vK_Sn_P)v^|$gV^GwNIM+-8--O4WVs~WWF?6))Knj5p}_Iq zRMxb?@~U0EWJXSr7gvM^0t5Yn(`Yi&Heu_dEMJV>Q-0-1PN!tkL`|t|ph?*xTO#gN zNt>^*fXoWWR?ldYSiYq&vA1S8>u(y&{z~j!a{3YZfMDfS>2%g87UJrQSlm}xny3#8gGwznVAt}MREkYJZ5lsV zE4MD9xpswH+0!IW>h4yt6{TomZ|lcs!?W4PnC>b3j3{8^i5Nvi&;fQ}k@`cDaK{i7 zJ5!aB#I-)I;P~}o_U$zVGg`DzRqHjrriQ;l*6Y)F`-uuJlP}_2xeVK@vaz7!@`p%! 
zme_dChT|}~q-79mV=Ncj!HHcfi4FV(5-JjBQELKasLJXLY&WKhdhvfUPw3QCWa)1+ z0*%XB^L6g4#m0HRE7(}O_*7yR4;$8G3vdL{6`Dh3Ew%Mc=ZOOZ%}uo~--(Q~7hqW&POqlI<(WGDG#0_Ep>^(^cswCWXm1N8vwWL^szL ztSp^>0Y(ba&8UDL=}%LbJ2*gt@A=y}R8bOK6sdcADw|cJw9KJ@_9y&;&Mxr{8a-kq zr;QD@(^D!AUhspGgqxub6tZlZQ&bZeUGbLHIrf`@@_16-wy|bh`%lqL3UXea&NeY;1f1y>KAG5@0nVnIC2KZEg-X57br&tck5&|3xY*U4mPn|t7okJF z!Qv{;LO&tP-4xEGi!7-_zhtz$7gaKM@s0$gHIdzEXJ4HoKdC9$s6Ig!ZE`D`8Iq78 zUb4{hI1Faxt2LU-sh6e=#dX)%ZAhMyst~5($falv^Q>o<2m_c;Ta3Q$Pj)5ChJA?3 zQ4^L1#|A@IF}YM)D)sgSzRQv#sO1tbkBlxKU0hT$ZtTTIWxtg*MFJ&bMwgEXSgmVu zECUrrGIs5;jj(W9e|93v3Y^!Uha^WX^9F;r(z$`L{_^tj(W4V+;h3UuI5e(aayD(V zT9GMVktB3CTPQu~=Tv8%4u6cRV8j!UM&v9nB>hCfd;phbi!6^#&I7Dy7gpzJF0i5MQ{| z?qOfnp}BV!4@gcxRvjo~X+ZuH8GZK%c9^8$cJ{Ch8@K5XC2_djTkId7ha-g2PK#7R ztV+WG%XYx(EVgmfwsJaQc7f(fLOnonH6C0~mvMSmLKQBhN$Ov$>=(Z}%FYMa=pAN# zn#onk(n(f+3ldzWN11*Q(>TE+Hc#L-#&09q)`#BF6qk*cEk*b#4^@dp`@lx`%BcF~ zpNONgyOApwQcmz08%|)P(4#fd^@Y+?#8K-iGJU|X73|+Oi-oa!RVtC#Osg#-vw<4D zh7G_t)AEVJw)Up)b?G#XUP;C75cL}@SoA?31v_x1Le94kzbA!nm@AGGXOq$*(Tx%m z6jIoE=UZO2vZI+w8DWVe;Uzm=?3Ta61v+-=Obc?Dz7hv6alqmVLR+Filc}g>b$LSV}&CV?C-S1rtq8nKzb}4>LmIOQdIGa5oSs#|H zLsa?J1dsm@r*sRKhmm!+q@soTMJ@CPF`#(HLRS()7X}wX{=iuALLTJw_f!j5|FFo& zEBV-7Y1gD;MS60Z?6TAIqQ3=_8*>C+oHi(=>k@dSl~tgwTM@8dj$OkXbLa9E80=Xj zeKm$b68ch20mY6!6t0Ooo7s}>IHEv{IgVMX$#&M7B%dYv3C^Z}*m)i6TRij|jc;J+ z^E!QFNo%*!DKg5lZ7 zCHV*TLm9qLvNT}D=Ep?+8ylyBOn?Pz@^DW^7qBjq_hnh(Z*6?8LOyhJ1~BDGMz5#p z-$JZLu^vT!cFSiF`c%|wj5dvUsZIaRCMQnAo=}f8hootg@X}bC>zDfz4js~VxB7#P zzwg2s8v0STZdRp$DExevoZ?@YO#hHA8yHQo>OaE%%VcqnQh(xy8LMaBK+yq3xj>~~ zRX#qOc9VL8%mzDfT*h6JBC~!T>76ubkCisyAeTjYPqKan8!)YMuI&>GJ5y3hDK}X9 zfr^Tfzkw40S@Z!`4)5(x-W1WjE93lmJKYyie^vP7^JpoSYq%X%UY^XqX|aLZMzU%hyr>q3H$CXP@f{p3m8q)m0QQr1(`BRxF=1YyIIRC4s6{dUu;GFEuF-u zw~UjE($sMp&M&^(i|u6cVhbN>rR@?r<-9?mwd2*F^ZYA(#V?|{-)Xa*y;-VcsiiAf z!rpXxJeys?Y&OR7v!yE--8J5d;}@HI$$fG34$bO#u6vk77qd|~GT~u1@hiQlUgi8%a>RY{2B@NiA-%DU>> z%BI%li6gchmB)>%88@ze+_;(LmH4ilRa7}^T0UE~F3%hOaw@_+-#@jr-WYF$P{~@;l(bJ 
zUe+vb>_pFJ&)>iv9b`SBH-EpM-oVe0au_u#%A>udgBbBVk+0uP>zo6RCMYN+E79G* zLf=ap*m9?WUxOZ&l?U@_zREsS_|ta2WR=HXR>QXE@vmiCrOMY`N?)iL^Pu|llonF3 zp{}tIgM5+$=+pnGNewpImBQz3q}{n}SB>REhkCy+K|xclNMrp{DA5_JUZ!52;0ax@ zF%YT@*;4qmm`t`w<;W;`Y>kF9rIaFi?eY_OXb@P~$_7cOBj}e@I|dzn$=ULyN{jQ@ zO@>euGu|$}Od3^S|lxH(uMnag^+% z1m2tM|HiouqHi0qK1eSn>i@=gdlUf@-Nj>ZoZ?csLX*C;K^Hn@yOh-^y}pd=F>p-!X}Q<3=lCb1;^f zu6&1+s$Si}47VICf!&!*(+Ofsnz@qx)Lgbl2v>a;tqAxhNJ-WqIdXvw>zuTXQD2zj zjGe&<*CRY*S1>N##>KANt@)NqcJk91d&njYVyM@ox3=4ca(;I*%Og7|D@hv4nVO0n z?Hnh90Ev8l?6vPvAxZ>+AXuWYSa3}R)tuI7O|3R+L^ONacv zs`^z`r?r|bw3-qzJ63c8G zcgVTJeR-E#Ru09xNXg7(u#+L;_CeYH)JUe|I;yrfEf zHSKIW-EWch&#^<1K^e{8y66S$u(Sz#{%xoE`RE?={Z^5r*ldY@@ikaS)-0Peez7K( zS@|bc**PGi_)EJ7uM@8t^w2UF-)5%)ZX7>$G~0Q&FD3_u%8pdQZ7`11JDWA`+_NM~4 zHNoc_*U;2@`s7s4xf55Sf?C)O!!63;zj7NN(Di%UW%vK=0lDGfVC zArucW3*Xkh2kNz+w)VL=Fm))Ut^JS=eQ{g+1y1mRtHIw}($?N&A=(GL0JtUG)*jYy z`BQ0IyZGta&Y^AXWx&0|AO}1=ysdpFkVe8@^!dG`;12LG@Br{Yd0RWiN3?cyTl+NN zPT&&Y-Z5?MJAnJfwzaw6*tx|C|$G9{%wHKL<_&VzV|i0lmPrz%XzN za2jwYa0zfPa4m2ja0l=Ja4&Gn#J2W>z_pWLj|dmg3)~3|1NQ=_0S^K90iBZ(KR}v- za3lQtfJ=bWrna>|3_Lsy{uAMz2|m(cZ&O=4j#-j(Il>DpYDWAaev4Kjeh|MaSHXSY zw$%tHaKRe5?;^SmSO(m9Zd?08;J9`0Kk)E+m;>*wxDfd3l`sz+b`8t} zYk~WKmjVv~6Rw5%B$xwwf!_klfR-Cz9(e0bFb`C3hI!xt;6C8hx4=9wZ#T>*WAg>j z3)~AV13GVodEk8DT3{1!C-76?KHz{oFb|vz#MwPs2lN7W0Ly@P0~Z2c0j>o;cN@$D zKL+jtzJEK+1Ahl%OFjK~2h0QAcfvf-b{EV8^MGrC<-nc5(A_W(TnIb_d! 
z0Y7>M`4bJgZ9noo@Fn0};4jak`~xSwfN}$T3-~aQ{SEmRI1YFecp)$it=#MbFb{kO zI2SnYb(jbK0Nepw@dnHTuLd3j9swQ&Rvm5TkT|- z2^MV&v*70|Vq}Q2?`vzv05<+~FE3VmqVi;co2aA6;jD0^P15bFw6!!UW7vQ|J|+vJ z?u`Y8QHKh#*dyFJ1#c1P{RMHX!?~VMw>h-Q_A&UAuW)w_-ZJn}1mcgNZ^b*Ii+zHA zCte}n(H%j57H?Pg1^s=z!hH^mJ%WA&Z@+Hn323mqUFaen@sZSsJ64xFng)S21#){K_oxF(C30)AZ#fY@x0LZSF+Fw`#uIgT-4T%CLsO# zBGCUXsG}XuO}yNZwm}-}@UEA~JBoO%&EW+FHjWm-Fw6?~Fr23}xHni(4EL7A-4pRJ z3;cfIe%-NtK6ZDjaqM%4I#$P zm^Q^IITq<~zcSoiW0W6+2|VG~EwB%xPm+h1j23ZrDIe`fyF?o8@NSkzJNj)>#yW~N zsO66EddoxvQF@En9N}_DQMscZWFQMOh|Ni+v&Ow$=fS?Y2v>g*BjX+W7=ObI*9(ZR z1BkB|5MTR@_!8l|U&x~zv#31c_jV(`x{F^S|1spl{3S?Fswq1-$Jb%*+6io?tAaHOF@6ZHpJxO4%x{Is=W%a7si zB_jWCmf-Ftk^eU+6^^3y>Qu)x=_1qyqAEZ^h*t%|_o!3xZh}2~gBYs_0!m58`uNOf zjfJXXGLGxTD&@&Wd_D_v{X&R0Va|Mv`VaIX(ErY(qYK|xAk#7 z~KyJm5+Cvp*`vU zdF@F?GL1JJo!?oF)Q!?lHtNH;8m&}Z{zTDI2oF#Hzi+mqJss6>cdF4=i}vh@C@+iJ z+W$a&$=iDtIoAJ)V*z>5`*1i>eRFRL( zHWDduTpKytu{Tlr!0vcB5$#^$3BYne`I>d5Sb4XlwcIY9&%4Tgcbw_1G0KUM9Ss+Z ze$gNB=ixqPzX`>80b)j!Ls1ScsBLQ(6BzOv(ReY-A_}hP8%FLC0O)}rU0*u zw6$Xz)vPbZB&t#dLl?s6JJf@}Z8649X8zoO-RSDL^^)|2Bt(oy*s~RKp`{pu35wBg zf!vu<5y8J7{Mq1tiFPy2mp+k&gy6pceom9wUyYgPe>E%+{3GB$4*ngT{JGokr{E_g zp#8^O!Q~=NW9FsJJZ66o{6|_*-+>+H^8<1FXMw+LMO*t#o%q+q`B>7Y@4+|v{W1He z8*Ib>TfzSf{286hPmP&;p@bA#rdt^pR%T{9ZP=4;!oNdkKe7} zzj=0B`)Qs0R~z@={ov=U>)C&AfL{cDRVVjPiMxLU{N?N0+DCWFKO^G#Ckb<1S8O!L ziLvyl?GS$Oulo=DS>WFa{#hO4kFSo$e=GR%1#RsocjC{9^S6RO3jB?o`0L{Q`@!E0 zetM_)ONz(e8{pq?5#}m7#osmY_&WlA&LwT_m7U^eems7}kF8I|oYae*+VkDG4=KjX5V^XFFZPXT{WNArsf<3{W$MDVU z5PtBl2EV8HNy41$Rp1}#Xr6x;kDnm;4Oh0c5A4W~9J7BG_^ZLM>d5EO3?SmS75u+~ zk8F4>eI^| z;_e>-e;@cyb>cr}@bf_vKL)MB-1;jW z>sQfFdB&(>g(;&R8V))2ddyV{Lad$O(TpbI=@jrsg5OguzalpV`r${DgRXjRpS!!S86lh_8u;6T*}cU-ghXw7U)etOFqyUs7A#ADh5` zW=~st-%k9zxIeHEOfO=spu2Pv@%=LRORzqjg#YdYKl%Qi z%jL`9pAP=oPVskUJpR50|D6ZX@9Gr3t?}?BU@c|FLv8KrJMqts^9Ozx>g*_K!N5|7YC%o#4w@Q@NuPe|Mb! 
zGWeH*|8pn)_i_H$;MYFhb9qcagHZ$iJ)O+o88<%&{GY&oq!a&vIDZQGmp*~Do=*IW zHb{0%Rd^I!4$1zSkW`1>0C)3H{zrjz~A3Rl#( z31UqSe2ZCstmh|+$@^Wl=uDSg7!!TcAmm|*5URUG%o2z>v2uI0fkIx? z1vQXA_)=SYtsoqm4>&e2H$lvqPY`qFQypc^-Ob7w1OJPG|HZ)nmt$bs31x=&rVTK0Y7~o1y1~RpG@0V*iYJObMF%?e+ji1lgO)@* zF>`lx(Z68o6QW|Yuy>iE!c?E}Op(eBL>It#_Ka9$U~J6@T_fWWO{{qKnS9Z&5|24O zO+U4@FBAG5yi5R@&eAkTR4=;j;yKF81Q1#?;}L5`;xRfAz?gd&l5~1R*VuTPFgkA@?~!~zqGO{_3+j)@Ty zSDLuN#4AnQZQ_F_?lAr~tCKj7W$34uT935P4;P^3PhI;$WUbLdIb%i%HI5@bduy}=`^PTN0 z9$XY0?DO}JGH~-4?m#OpDm4a8(TYqnESW}8%Z!26aSlcYSvVzZiB-D`v2$BI#(=9E z`MHpk4Z6ngUo1+%2>=qK$nG5COuG_noYK1mq8xXx37uwnKmb#B7(8V(OiQwY-|~T{ zWPm7t1aW!&sbI(-3H^CO34`+iil%B82`y7^5EjTX{wxHh>;(~jAA!U{VWm6}AGPQK zd@X(PIcWJFM>I(@2tS+z3TeL!<}c?7rFKxLj|#O@)6q6qeiiC8?Hkx``Aw+3+O0z7 zccJd5Jps!se+YGv_Ozh?Db!&tLD1WTy3869_7D@A3D&O!gBhA>){rp44b5EZt3o5Q z(;%?WDz=j04>e7ICDxaPMq|RaO*qvDO@g6mwTjls;$XFa)z%W>M>h*Xv({Q5#5~wZ zkEulKYl4%+L>s@}IzVXB*am1Wu!?5MlFOO^TdeO3PJgxznjO{)g`)!u%}(nUp&4js z_E@(I%^*Xw*LuCs3}!dL%){b#K(Y)m{I<_3cKca;Y!d{YB^%D_ljTaw^B|^Kt`&jQ z)(E552_V5Ak)8TO^cQ*yvb+55HXKB?tWP zEE2X|ZkV{)Pzn>|SSQ5l4Kb0{QL*}CV)aq63BvtK!~F$9>~wsj%4(rB+{cd@h5M>@ zyYPSz@f?PVB?=>5OBTEKSp_ray9Rv-eN$&+kDy+{q(-itz#a3cAo3Fxmmh>Z=oyn$i z4UVOK29L+2%3zkI;Kxb$l_6}H3LDaY0e=M)l=c8r*O>N+OvlnM5RwN#78ODIb5MwL zrx_2JLRM5Y!qFDtDDxRkzL3dgz!#Y07SXMll@%sljECjECajzW*7vX}dmI!Qy-cAr z!@8(NR_Sn2xoOC-skbQ<>!Y&5JVcgZQ#Sk+wP|FxHceNfHVL1#z|LN=K=y`%Lt%>u zgZ2XTdfE)v5YQ`4x)C<3^mWWPIj@>R!ywe_{SGEB>&e6y9Zk$Iy;uR`51J;d?UZwp zNuL3FwA{P_u2GuSllON7QQaeopm!ajOufxeIuM2G?LwI=OlhS^Z?Z~d;}!uy6d!H8 z$e~tgc#Nn(T$>Ul_KOj3hfP*{lsG6xd;^l!6j5barTiGNMM$PyW)OoW(J1EYkpm6y zo=CaXf}J6L>ur_x#Y}!8h?!C1LX()+Skpk(?5M-um}EABXf-MWt5g&tJ|>LkMkT{0 zv3ETr7vZ&$P~|6FZyU0xwxzYQwV{c6&xcp1;kDsPY20&z6J8`JV;gPjR&Z8;*tJnU9VYp8U^4ZM zZrJQvL6w$*{T>t(VWHG$YDB9gS2Z_7`yqI2x>83>{Me5~Nmc8>Fm!B|a+;|ugG2aB zLX`d&p=qc??P7@iDzsHNaV}h(hQepLS$NCPwn9r-IWir% z7t!KOMR4-TZXXE}{Aw!Efss>C{SLJcvrqKE9UUU@qX3~t8z(vJcEM)QYKd`Kas7bR z)G8A_LTiiJgg9qxqDPoW$fb#%Az&sJK!J1`%tob0-Rqk^(c`rT(}kF>kfg;t;e;mc 
zBbR7H-6e)$PiMm-TxDt7M;kUI9Wx{@VMz|g?GRd&ftdY5o7#a3`_r?%UE4Ca8g3E(jaS#WI52pLaJHqjrf!)(s%|~K2{JKqO>G2flvzUr zN|ZaeUAHW%#NV=)HMK_7O)KhKYN5n4e9?-!`qsj_#*tB{YG5QPgc8GJJq;|c$Xq%^ zlVm3CJZG+w#Fdgz8| zZ-G&!&4qfOAwNaCURv>pg>@70gnM;rt6QUyBYYXF4b$>xc`tm5 z_x4PjeA(Tf9K1{tr5N&mhHAoim&vA-~ zR8!aNGjuIgD{CujVYFq*vgTH=7|)0wlvlP=1Aa!htlH3tL?n^gm5o&mwb;AZDl{}? z$+CvpA<@y@5d47lv>`PuRX$%)Vat-*`ue!0ZZUY(Lt2`v4Hs86*R>kw`KdBeU$wY} z8e4EE-(W3T5vg2M)dGJuR5jz~fEI(=Tw7IRc(JarX+^7%>pJAWVX$Vx_8K1}Sk=unEla5qfx;L6`fO<>{O7#6693U#(SYb`TGm35=%3cXH}YIv zVN{={Bes&)eJjD z6x1P|R(J4PTt@g&q)#;Ha6}}CaHz7fsj3F4CDO7uh?Egh(O8T|){4dzEwwd%k@_Oo zf-PcFr0Vz+W_!o@d|d|I-ol2~6^*qca4AD=b6qvGRn65)MxcTy#6N}$7kdj&^m_}9 zT-4A6MN4Z{_0mG5Vk`3WvgSgRc0{cx^wo&%%0~Rh-q19i9`&q03V|pDqYyH)Smyv& zMjAQ>Nch=E;l^czY|@Og?ntC1Rdq-o5q>lDi{PIXP0_p}suFOGyab0EmeoY_xsh#* z2x@Fuv1k$gC`^_3!#RpUtWb!EGV*^k8Y|7jHxe3&B?2wfm658tdN|)9N3=FoH=Q1r zgG&t}&!JoxkuP!>JQ}Y8q6!lK2o)8W@t`>~9$W$)Qx1reX2q1T2@shud=X>TPR}e; zx6njf7!p(7A1BQeQwQRS*~N@*UX*5jn@v31L|C-RRPH!Nidpo^F_Ni@LxHJ^`JYid zC<9Ga_dz+{;Ov+kF@uawio0VfVyp^NuhI{{ql10i(^ak;t736kYa0F!A`9ap@#sEg zYGzl*W3l#klVPLyFz&bnechD#_LyvJpmv;W%pKiSAS1<;xL(tEkkT=wvDF&HXza&y z#z7NMAsAD}rqH@irl`H>U78A{Z%oV~8yQu_O8N(Jsp^4d2*dG^R>hRD9Kh*VHD?+$XJ63~G6_x-Jth{I7%Qo! 
zHV!fRP?Ivs#EQ5?XT@YwJKe;t#k;fg|Ah`6EDH0o}Z*U zRl=j)%B%5p!_M*++~8K$kz}YPtU|`t={hncM0;$#uQMGN?WLD>BpGVly%=8~?Z}jH zS;pGl-N`3XSvTtg5+)I1>%E=r(J8jx+L`X8g;IG(lA)F`%b4ctj_#(1Q$}|q@1X)z-N~b} zFxPiGS|(u^UelQF<{T^B=CE$d`NRJ($HxC?!-N$!qr+pupF2!{MivkB+R^8AwZu;LWzsA{6*xwg)M#st* z<|2$|xIxdRJG;csVw1kJ3;kS^etQ@C4JQ4zF7(Gu`kpTI_e}b&UFg4>^xa+PshC+1 z@pDTTx|o3%^qafTXPWd~UFgeA`b}Nvmzwk&yU_18>34LYKMA^#Pt1H6Yw&Pupy8jp zy2yV7dUyHca1V4br`etSM@Fh0Uw=xN=j={C0d)KWg!be{^P$~L_dW(ai`I@=WKf_r zoWF?)6njQS^L&-|GEg;9E}L(yW=514e9>Fy?zez*($ z7L)EY*RPXI`6o`hsU+oe>dzozTJG=kWV?T{T26(VOILM_WF0jp5xn7mx@Vn zVWi75(4af0aDSP>S#I)6dXS$D`i-bh4w~gR7N6&t_MG-r)E==3Ks?((cR@b3u!C%4 zJXe|WCHIvXdNH>sp4(0NSoz<6t$NXPv(hvO+rLQpUIT3Ux%H0;T+?iyW1G)&;PBUDwG3j!X ze(2PwJ+biq#k9ZIEm3-GRnPVT|}wWj=*lcVzg zH0AH>LH-}6eA-J<`KL^I+zH-o{QPdp+wYCa4>0AO=m3axA8F?2n@oBx=px@XnfW$W z-cK>)O`#63tOxpf)Bb0kE;FQKBNN>9Y2?G$T6Zkot_NMXJI&0usNRj|b(6lOy3EjD z5+zeII?Uab-=PNGL9-r@$`3N-$C>oKg;Bb3M+bP5da!4iDZlUPsJw9}1mw@|LH=^k zMR_^UrMx_2+VjZ`QG1NFb=dPJ=%PHv_7%k9;eAtn?Tn~=Z2nGMVkXjS@4+bDI7b(a|``4S{ zU1Yd|<4Y!e;fqmsA2#V9gD%p2OPBnXjs~T>^bHzx2hIL7YEKo5S~L}O(cXp4_Abq| zf1@d1aY7yhdH+U3Cj9B`6Q!GE z8iDd*a6T0|3&zHm~iNd5r+a+}^9-$Z{>&ml9W30Zw%Ipv5;9 z6u)+@;2S=0gyEb;R+VX52~P| zS5b+s&i$sqU)6FiW4PXgPQwZCoq$uhy~>x&`tCNy?{QvGea!URT?#*-^X2!njISo- z|G4IVyp&~3jsb|Pv$61u>;DfkyazFL1)pQ&0Dp(VC-(`Q>|%7xCgf8EoW{}7aZC%k zb5{cX6M$1cWVIiRzKSJTY&|zHyvMmh`TUUPzbOHKo8rIuaF#I*ofH!AZ&mz9UKae< zH2*(WxOr>D?1~;q$ma)&zftwT8#SMo0H^tD-XS#emwf1nuD+)A98-LHdxzlP`fsVv z=M?@Pg`aUu;J>QyL4}{M_5TZnk1PDGX9VBO(^-Xo^Tz@&ua^Y#fYZ3tRPGyn=}#5^ zu^$QkX9V4OK;c=f|FXiLWVqghPR}X+p}8zWPdiN>o{x!7?X4-j8TeIz6J4b({!mr? 
z+g_J^?vres-vONJKWWj`LyWITq0{rfEae_jJDbsCzpMFw`VqSHz@u?4A+~`>7NqdKhk`@ z_Lk&xp7#H%3HTRWK;?EHzn%b}1DxBd^kekK7tf@8y7!4b&H3P?9!iHBwWwPZKA`P2 z<8lw+G>@{XU!A4+U)6G*FJu`)vF7M_mhhoZ+Uw{SIe)9p8#SMU4+%cU%mTt6Gdzqb*Y_sCzmov}p9FZ% zMN-cbN{1I}yRv{&f9}`*oYX4p0G#OUmB+ITG4wyc{7R8Rr@Z2?YX6vVuPOZ7FGxOy zzpX0#IUV2M(7Zm!@KDBjd57X>CnX=+x1i(E1pN13EcO4{ce9Mgv1o8L$Z!Tle)a-> z3HrfNeQB>g{Xzo%y^MbjTogtab5QYLWPI!&&1M<$xWZpoc>mo3zg_9%-O7KC&j{Si z#|sqxQRV-)YCfA7t~a66NCJFO^LgY6$=}yB%L@PCmj(W33jZwNOb?dc_+`a^>VCoJ zSa(Q6@z9{?L^j@nMGdIRE-s zmLI;szj)32l6d^7URf8Mu*UZez^T1ck7PLqOmjYt^a}j@Y8T=dR(SHL!vE-K7I%2p znt$=(>R!RWb8nU*_bB{Lg_pl5aE`T!XTO+G&tAZ39Bsb+A;t%Pw(5CxLO%Hf_}3HQ zPiy{HsGf7X*8lqo-*-al&oPzo?1aMaTsOj)f#Sc+aJ>ng-v2Aw&tJ+i^n3cuRrrxl z2;A_uWx$C}j^_pbkm5h3_!qn&@Xss!Ifm;^=(O%qDfjyC2>wl{$;0<4{NslO{;;OG zoZ)&CI_**Xn_dz8ztd++fRjFRMD-R!54R}%tPRpX+ZF$*g#2GhfWMgl-|zvU=O-|J2U}|F;yMUVVt;1Lrfm$N7ZelUkom zfD=DIYRUJZgnV`bPV{!dmvaA8%Y8xN`xSnM!q0=CkE_pR4DWG*TO=Qj(Tg-L;JunQ zeA=0iPoVky_0cS&8~%0!;Box(frNaHCBWauc9f^=n(23=jJTe zl45GpaWmk(d=>pU40s$JKFaVO=R%biLz?W_gnWLIkk1)cN`G!Rk>wo5G@o-Bt~a66 zM#bOsdX}N4|F2T`l@ANt$k~5?Z+!iyfKPg9P5A@R_-JXl=_@541v`JD`ER&g;2e`0 z5ARd>(P@F7ru@Mv)SHcFYknTy2%dM%=#)1xK0V`k4m=`)g%UnASBEE{2mgdh0QU;7 zSoIc`s(F9OE8@-mhUd3doIdAv|hpUy*FYks3qt)vRA zqCX$FsY0#h%=^uJsWRp_eaD-d$l$Ay^mMjj+P9@d-`ksQmX?+oAuR}7{pM&33Ae)& zfLvkbc1<^j@MvYlb=?eH07?bC^G|L}ypm~eZ>z>Qsaf(w2z;hoAGwSU!(%O zR-rjQ3DEM4hi~~!H#2F}_@G}edg8ZGMPh#;Rch4zRA2i4r??$j-sY2(hmVI-&-9ZM zLkS)%+(A=1{3A=!FYW+F$#0JF_cdq8g#%qO2(Ok_qdC}E1*Mb+i$N2<4{o{} zF8NCneYnnrP&ZTOqk#NGU(K(FpSYRYl3xfG;f4fPo{5a-uK6^x=F`44pZ2f$bYRV= zgKIt=TJ!1VHJ=Wz@w5-0HVr>E{km;p_m+_f3|uSkjeBFcEwkIu+0)I5&7N1P(oC{E zj*Oof-1@FdVdm$ZQmp{@AD;h{Q@NR$ecsmDU86JOyLWkBsz1GX*tsrP#iT6MgE9t{ z@Zo@0aKJ6#(*omvV3$n;F?Mk)ski?rYiXwu$jAqh2P}m+F^s_r%N3^WZs#50S!k$#0gb zl@4YmMa;zJ4DO)%j(NZj7OM5t+;VOZ4sop|I2$U_u#a86y6TWeyHKUqglC{fu373Z z6)lK7${IDRbO7PqSPr77Otl_p80C8ht9AJF&CTqw7?_(8Mn3lYA-UDTN!4pqi{ukuP}2gx9uys_<+ea&bv6tCXM)t5 zsYW&Bx~VjIUvj6%4mGAFT!705ikVik(1fdEP{!y-QP~%gJSjc$B~>p~>AhlLcmx${ 
z)QCMkh_AaM-WXT5_7X77pd<8so$&K5|MZ{&OBPbjW+L{mT zZ~@iCm=zYY+0ngwN4AW6ZmQ3891}Lw8R0S!vMvVj&ZemioVeZzZ;-PZTXyAkETLxIS#Vn(7 zn+ApWi)FaPcA;ivuvkE8 z)q@^}b)mT^EDQ5g+a%Lexe>`S?%N3;U#wr;!E7tAOQN3AY{^m8 z8>^K92@>SeB@Rzg;Vuf!xo)d+U8Q=kLIVOv@)!ozo?7iH_LJCl`#eMvs02$MrbMMV zz6+G%Vr7KI3TPW(EZf|{Nxj=zt&eojVm{;E`K~*E&0RSfzS-`kMfiINLoj}_ue;M) zs=3-hi+<(329__MdC%pRa=Gbk9)6c+(%y^a`;BfI z(qxDjDY`&ZEdW<_%GY(Pgy0VuR+A3~rG>?2W3n(wvS!+IPz(ibXbbXUiRxQHGy|e> zTWdTCb<(m8XcW8*ubp5i?uWw z1;4u|sr%>aK@uB|uJB>$5F1FY0$$uLqb1-*WY&{{5ToTMsz=^jKNh4)@hav|#++jr zHi@R4K^fr`;%X{%0yU5WnuI7*6mTQ*F`R8#0YTKPc8Q=aK4!)^UW76OBWc*Gum*?$ zpSBpGt>xy7l9r>T#x7{`aSfv8j`1R4ThJV7!rYW^HDfW~Hj^h2CjDivN@Nz3-d#~; z9|DPc+)O#QpLlAaX-(5^&BJcBn}%hmwCqD#Hgi)un7O7^he05Yj)t$&oTjc0V-5V+ zsWEPpIdqr0_F%2GzDSrDaYLikVx@t5(3=}*ksc)rvbsk6=+u=KWHE$cDw`mTg-%2p z8)XIC=wAwasqg8q#LDGLu9p??jI>lSbymWF1fH%~H^a%L+4F7cd&_D8vssmFzHi(E^uC z!9hcE)B}tOP2R5|zD^(w>!j(7zDy;FXv4u!E7#XxlzaCh@G+5H1W_jcQ?d*s}%TP`HcOVxi%cq7;jA-+{@66eFr~ zG^uOLJ?a}<0Ic0CRub&)j4nFbLWb87m{qJh&0*t%vR(F36~;WwHrqT;3)S)RgCkHY13_fo{T#w+ADaYLVg+PxYX zIZ3gA`%lfv;ODPydnNx9RLKk&HG1jyhhG}9mj@Tcy4#LE&_=zbK z)tN@KeU}*-nM-0c3O56Nzu*}aeSRAHd5CinMRXDKXohu*x4m7Li_8EuuA%ZQeo+`YxtpoQaFq5KttCW=v0$zT)jIgM*f=IJ zY`o{gvEwLw+7e87u)A?j=WAF6d8Lkww9^3@(?ul8Dk4L!#M&`0^PrpD ziAh1`u*@fWOE)t*r4i|73ecr}%iJUV)j)hpveh=LmZmL_X=F_0sE03?uwb!Y6Yj%W@MY0CYL?N=!)V z7qQxfn~%7qFn^=f?q+^1pqO8%wSX4*l~!$XvuT-F|4hR0ju054Z!ZPCJXYyKMxn}3 zU9PA8oo;R}RT~^xuzwuedZV`GM3CP0ivwv`{p$g5i08`XSi}E(P_-;59L>^eECm5m zO$N4m&m#fh6&fwx03;QjCTy#*I127H3L{Kup@p!CY^d#)CEDZ=Enp>&##k%)wRtr0 zlP;!h5y&acoV8YNG9$1YC}3&n(MDBttuRU}k~k?S3Bx8}qJlPg7_*P`TeWsd-DXHl z?E&q5s4k^2s=cVCD5_kZQqLeIizHZmV|F^wAd1ZzoxgaSBgw!UW7!$dt%I1YuZT_$ zHiW7ytgOHSUB{a^{t^tA-f}u3977XHSe7gonPei?-GVpkJ#vxr~!ECBwqr0>GZPg$_g2=8wC#0?PHfu(amUTF8d)WT3R%NWxNJ0X5I&g{hWH!n{?&8Anw zW>`u0W-AA2BYX<``n>->I*MJ&w3$mR#+gUAMTNDHI!Wz#Q;mbjMytns`0cV-+E*3k zpPMQh3g~RBh8QQkuc7lxOxk_{Myxt?=g`hp<5Tsf#g5xtfaNsAc(G+|2cqH@A6(O5 zR#J=E09(OGz1EF!On{h8Q)YnEYi$F^EaB}6Aog?n^^Cj+D*#p~v}hqQvXvJQ-BJVk 
zOfgoUAqr?~P*77G#Ow}2t5ja-BpcY{iI#XD3ptk3&}mE71}jxnY{nzB$rL99V&*Mg zM+k|FbPfh90ln6PrBtoDR4S~F&v?0<*{AUq%6K4c@T7Y& zAUFnXo1^}h*Y zsCS5IpQ0em#ZCCsLyGPeeMIO^6;~_BPtJ%D&QF#h%`kA08xPO0yX8{?y{h4LrWEZd zIVoJG7OJ?wUJ`zpiKOb-;KzHYsme0FONqCJ=u!-#(~v_@U7S>~sBgnA7Eyq?L&*i~ z0r9Rn-V`O92p1$T%1Zv``al5*Q7OM%Dj;38NtFSFi-tbyr0~kF6jluKtp%j>D+`#^ zDY>QnD0A)S^L3cC^$Fe@M7e^69yXUyF3hFoWZp-9r{@cikX}(X@8fS?m&F}EJ*9w) z_^$I~US~FtOL1`~{!RLsby?gwh`-9Gu6M{!7jXO6#Z@?cb6u97H8nllGa`qbp570C z4c|I66g^QoZze)ebx-9PKJCJ;G_R>QfKZk#6yGd`} z52yFd>6Lkt-h5AT60g+LVZI+Jcan#>rt>H9Pv34d=}r9D&+7~8uPCyqzri!->+qbu z1!?Y0oY_P0|E7aJHAM&2pZ-n#2}Z~D_@_QK=}kP^Big`w2nmNN-=sI~zY)(U{T4#P zVdB>Ag`YDW=KGhFo@mWZ|NDSZ6{&7??7uXtXFeekYxJP9!pF_H38z1#=?@WM;y7wD zmRopC$YIWZh}&>_6E8Q5_xR~B^_9Es`!C@^IK7FhJE`e^`p+h%oK3j~dY6^n#NVA* zPq}siGxH68?DpRa7-5?6+rNH52RHF;`=_^kB^M9i6i#pA`M%ec^6z{pn!xUl zzqitxxW5m+M`~o~#N=Y9f7(iK;sYPo^w%1~(+uqP|G-Lb;s~cNlJb9nh%R*43GrCt8(fYDSZ9zchQOMK7Or`CQNrn8aGq&MHMJ*w&L`r7wrBR$pM zq&IPnpFG9%6o{Uth)HkaAs^TDz0!>K{50hoIzEiF;qpz~8qOF?!PlXCJFB~1EUo%>CO3iO>gRF&PP-MdEJ_@CY`|}|6ChJ z&%Uxx%D>Yf>e2v`9W;hLBeHUPH S<)8BzNq +#include +using std::cout; +using std::cerr; +using std::endl; +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "cuda_fp16.h" +//#include "dada_cuda.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "multilog.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_def.h" +#include +#include + +#include +using namespace nvcuda; + +#define sep 1.0 + +// global variables +int DEBUG = 0; + + +// kernel for summing and requantizing +// input array has order [beam, 48 frequency, 2 pol, 16 time] +// need to output to [4 time, beam, 48 frequency] +// bp is scale factor for each beam +// run with 256*48=12288 blocks and 32 threads +__global__ +void adder(float *input, unsigned char *output, float *bp) { + + // get 
block and thread ids + int bidx = blockIdx.x; // assume 256*48=12288 + int tidx = threadIdx.x; // assume 32 + //int fidx = 2*(bidx % 24); + int beamidx = (int)(bidx / 48); + + // declare shared mem + __shared__ float data[32]; // data block to be summed + + // transfer from input to shared mem + data[tidx] = input[bidx*32]; + + // sync + __syncthreads(); + + // complete sum + if (tidx<16) { + data[tidx] += data[tidx+16]; // over pols + + data[tidx] += data[tidx+2]; + data[tidx] += data[tidx+1]; + } + // now tidx = 0, 4, 8, 12 are what we want! + + __syncthreads(); + + // store + if (tidx == 0) + output[bidx] = (unsigned char)(__float2int_rn(data[0]*bp[beamidx])/2); + if (tidx == 4) + output[bidx + 12288] = (unsigned char)(__float2int_rn(data[4]*bp[beamidx])/2); + if (tidx == 8) + output[bidx + 2*12288] = (unsigned char)(__float2int_rn(data[8]*bp[beamidx])/2); + if (tidx == 12) + output[bidx + 3*12288] = (unsigned char)(__float2int_rn(data[12]*bp[beamidx])/2); + +} + +// kernel for promotion +/* +orig input is [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i] +input is [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i] +output needs to be [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] +promoted to half precision + +launch with 16*48*NANT blocks of 32 threads + + */ +__global__ void promoter(char *input, half *inr, half *ini) { + + int bidx = blockIdx.x; // assume 16*48*NANT + int tidx = threadIdx.x; // assume 32 + int iidx = bidx*32+tidx; + int pol = (int)(tidx % 2); + int chunnel = (int)(tidx / 2); + + /*int ant = (int)(bidx % NANT); + int time_chan = (int)(bidx / NANT); + int oidx = time_chan*2048+pol*1024+ant*16+chunnel;*/ + + int chan = (int)(bidx % 48); + int time_ant = (int)(bidx / 48); + int tim = (int)(time_ant / NANT); + int ant = (int)(time_ant % NANT); + int oidx = tim*98304 + chan*2048 + pol*1024 + ant*16 + chunnel; + + inr[oidx] = __float2half((float)(((char)((input[iidx] & 15) << 4)) >> 4)); + ini[oidx] = 
__float2half((float)(((char)((input[iidx] & 240))) >> 4)); + +} + +// 16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels +// for first time, launch with 3072, 32 +__global__ void printer(half *inr, half *ini) { + + int idx = blockIdx.x*32+threadIdx.x; + float ir = __half2float(inr[idx]); + float ii = __half2float(ini[idx]); + + int chunnel = (int)(threadIdx.x % 16); + int channel = (int)(blockIdx.x/64); + int tt = (int)(blockIdx.x % 64); + int pol = (int)(tt/32); + int ant = ((int)(tt % 32))*((int)(threadIdx.x / 16)); + + if (ir!=0. || ii!=0.) { + printf("%d %d %d %d %f %f\n",channel,pol,ant,chunnel,ir,ii); + } + +} + + +// kernel for beamforming +/* + +Assumes that up to NANT antennas (nominally 63) are populated. + +Input is [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] (promoted) + +Arithmetic... for rotation, d2r = wr*dr-wi*di; d2i = wi*dr+wr*di + +Conventions for beamforming. beam 0 is furthest East, beam 127 is at meridian. antpos (D) is easting. +for bf weight calculation, where theta = s(127-n), ang = 2*pi*nu*theta*D/c; wr = cos(ang), wi = sin(ang) +use __float2int_rn, cosf, sinf intrinsics. + +Each warp (==block) has to deal with 256 beams for 64 ants, summing over 16 chunnels and pols. 
+Do it in tiles of 16 beams and 16 ants for + +Output array has order [beam, 48 frequency, 2 pol, 16 time] + +inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag +wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant] + +launch with 16time * 48freq * 2pol * 16beam_tile blocks of 32 threads for massive utilization + = 24576 blocks + +*/ +__global__ void beamformer(half *inr, half *ini, half *wr, half *wi, float *output, int stuffants) { + + // get block and thread ids + int bidx = blockIdx.x; // assume 24576 + int tidx = threadIdx.x; // assume 32 + int orig_bidx = (int)(bidx / 16); + int beam_tile = (int)(bidx % 16); + int stuff_tile = (int)(beam_tile % 4); + int data_offset = orig_bidx*1024; // offset for first part of data + int weight_offset = (int)(orig_bidx % 96); // offset for first part of weight + weight_offset *= 16384; + int idx1, idx2; + int f_idx = (int)(orig_bidx % 96); + int tim_idx = (int)(orig_bidx / 96); + int oidx = f_idx*16 + tim_idx; + + // shared memory for convenience + __shared__ float summr[16][16]; // beam, chunnel + __shared__ float summi[16][16]; // beam, chunnel + + // accumulate real and imag parts into [16 beam x 16 f] fragments + // Declare the fragments. 
+ wmma::fragment a_frag; + wmma::fragment b_frag; + wmma::fragment wr_inr_frag; + wmma::fragment wr_ini_frag; + wmma::fragment wi_inr_frag; + wmma::fragment wi_ini_frag; + wmma::fragment ib_frag; + + // zero out accumulators + wmma::fill_fragment(wr_inr_frag, 0.0f); + wmma::fill_fragment(wr_ini_frag, 0.0f); + wmma::fill_fragment(wi_inr_frag, 0.0f); + wmma::fill_fragment(wi_ini_frag, 0.0f); + wmma::fill_fragment(ib_frag, 0.0f); + + // IB + if (stuffants==2) { + + wmma::fragment c_frag; + wmma::fragment d_frag; + + for (int ant_tile=0; ant_tile<4; ant_tile++) { + + wmma::load_matrix_sync(c_frag, inr + data_offset + ant_tile*256, 16); + wmma::load_matrix_sync(d_frag, inr + data_offset + ant_tile*256, 16); + wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag); + wmma::load_matrix_sync(c_frag, ini + data_offset + ant_tile*256, 16); + wmma::load_matrix_sync(d_frag, ini + data_offset + ant_tile*256, 16); + wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag); + + } + + } + + // one ant per beam + if (stuffants==1) { + + wmma::fragment c_frag; + wmma::fragment d_frag; + wmma::load_matrix_sync(c_frag, inr + data_offset + stuff_tile*256, 16); + wmma::load_matrix_sync(d_frag, inr + data_offset + stuff_tile*256, 16); + wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag); + wmma::load_matrix_sync(c_frag, ini + data_offset + stuff_tile*256, 16); + wmma::load_matrix_sync(d_frag, ini + data_offset + stuff_tile*256, 16); + wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag); + + } + if (stuffants!=1) { + + // loop over ant tiles + for (int ant_tile=0; ant_tile<4; ant_tile++) { + + // copy weight and data to fragments, and multiply to accumulators + + wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16); + wmma::load_matrix_sync(b_frag, inr + data_offset + ant_tile*256, 16); + wmma::mma_sync(wr_inr_frag, a_frag, b_frag, wr_inr_frag); + + wmma::load_matrix_sync(a_frag, wi + weight_offset + beam_tile*1024 + ant_tile*256, 16); + 
wmma::mma_sync(wi_inr_frag, a_frag, b_frag, wi_inr_frag); + + wmma::load_matrix_sync(b_frag, ini + data_offset + ant_tile*256, 16); + wmma::mma_sync(wi_ini_frag, a_frag, b_frag, wi_ini_frag); + + wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16); + wmma::mma_sync(wr_ini_frag, a_frag, b_frag, wr_ini_frag); + + } + + // form real and imaginary matrices + for(int i=0; i < wr_inr_frag.num_elements; i++) { + wr_inr_frag.x[i] = wr_inr_frag.x[i] - wi_ini_frag.x[i]; // output real + wi_inr_frag.x[i] = wi_inr_frag.x[i] + wr_ini_frag.x[i]; // output imag + wr_inr_frag.x[i] = wr_inr_frag.x[i]*wr_inr_frag.x[i] + wi_inr_frag.x[i]*wi_inr_frag.x[i]; // squared + } + } + + // at this stage the matrices are [beam, chunnel], and need to be summed over columns + + // copy back to shared mem + float *p1, *p2, tmp; + p1 = &summr[0][0]; + wmma::store_matrix_sync(p1, wr_inr_frag, 16, wmma::mem_row_major); + + if (stuffants!=1) { + + // do thread reduction for each beam + if (tidx<8) { + for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+8]; + for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+4]; + for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+2]; + for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+1]; + } + if (tidx>=8 && tidx<16) { + for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+8-8]; + for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+4-8]; + for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+2-8]; + for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+1-8]; + } + if (tidx>=16 && tidx<24) { + for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+8-16]; + for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+4-16]; + for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+2-16]; + for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+1-16]; + } + if (tidx>=24) { + for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+8-24]; + for (int i=12;i<16;i++) summr[i][tidx-24] += 
summr[i][tidx+4-24]; + for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+2-24]; + for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+1-24]; + } + + __syncthreads(); + + // now summr[beam][0] can go into output + if (tidx<16) { + output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][0]; + } + + } + + if (stuffants==1) { + if (tidx<16) { + output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][tidx]; + } + } + if (stuffants==2) { + + p2 = &summi[0][0]; + wmma::store_matrix_sync(p2, ib_frag, 16, wmma::mem_row_major); + tmp = 0.; + for (int i=0;i<16;i++) tmp += summi[i][i]; + if (tidx==0 && beam_tile==0) + output[(beam_tile*16+tidx)*1536 + oidx] = tmp; + + } + +} + +// kernel to calculate weights - needed because weights are halfs +// launch with 256 threads in 6144 blocks +__global__ +void calc_weights(float *antpos, float *weights, float *freqs, half *wr, half *wi) { + + // assume 256 threads in 6144 blocks + int bidx = blockIdx.x; // over 48f, 2pol, 16 beam_tile, 4 ant_tile + int tidx = threadIdx.x; + int f = (int)(bidx / 128); + int cc = (int)(bidx % 128); + int pol = (int)(cc / 64); + cc = (int)(cc % 64); + int beam_tile = (int)(cc / 4); + int ant_tile = (int)(cc % 4); + int beam_i = (int)(tidx / 16); + int ant_i = (int)(tidx % 16); + + int beam = beam_tile*16+beam_i; + int ant = ant_tile*16+ant_i; + int i = bidx*256+tidx; + int widx = ant*NW*2*2 + f*2*2 + pol*2; + + float theta = sep*(127.-beam*1.)*PI/10800.; // radians + float afac = -2.*PI*freqs[f*8+4]*theta/CVAC; // factor for rotate + float twr = cos(afac*antpos[ant]); + float twi = sin(afac*antpos[ant]); + + wr[i] = __float2half((twr*weights[widx] - twi*weights[widx+1])); + wi[i] = __float2half((twi*weights[widx] + twr*weights[widx+1])); + + +} + + +// function prototypes +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out, dada_hdu_t * out2); +int dada_bind_thread_to_core (int core); +int init_weights(char *fnam, float *antpos, float *weights, char *flagants); +void 
reorder_block(char *block); +void calc_bp(float *data, float *bp, int pr); + + +// performs massive summation to calculate bp +// input array has order [beam, 96 frequency, 16 time] +// bp has size 48 - no way to avoid strided memory access +// returns factor to correct data +void calc_bp(float *data, float *bp, int pr) { + + int i=0; + + for (int b=0;b<256;b++) { + for (int f=0;f<48;f++) { + for (int a=0;a<32;a++) { + bp[b] += data[i]; + if (pr && data[i]!=0.) printf("%d %d %d %f\n",b,f,a,data[i]); + i++; + } + } + } + +} + +// performs cpu reorder of block to be loaded to GPU +void reorder_block(char * block) { + + // from [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i] + // to [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i] + // 24576*NANT in total. 1536*NANT per time + + char * output = (char *)malloc(sizeof(char)*24576*NANT); + + for (int i=0;i<16;i++) { // over time + for (int j=0;j= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (0); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + hdu_out2 = dada_hdu_create (0); + dada_hdu_set_key (hdu_out2, out_key2); + if (dada_hdu_connect (hdu_out2) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + 
if (dada_hdu_lock_write(hdu_out2) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + + header_out = ipcbuf_get_next_write (hdu_out2->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + uint64_t block_out2 = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out2->data_block); + syslog(LOG_INFO, "main: have input and 
output block sizes %lu %lu\n",block_size,block_out); + uint64_t bytes_read = 0; + int nints = NPACKETS / 16; + uint64_t nbytes_per_int = block_size / nints; + uint64_t nbytes_per_out = block_out / nints; + char * block; + unsigned char * output_buffer; + output_buffer = (unsigned char *)malloc(sizeof(unsigned char)*block_out); + memset(output_buffer,0,block_out); + uint64_t written, block_id; + + // allocate host and device memory for calculations + //inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag + //wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant] + char *d_indata[NSTREAMS]; + unsigned char *d_outdata[NSTREAMS]; + float *d_transfer[NSTREAMS], *d_bp, *d_antpos, *d_weights, *d_freqs; + half *d_wr, *d_wi, *d_inr[NSTREAMS], *d_ini[NSTREAMS]; + cudaMalloc((void **)&d_antpos, 64*sizeof(float)); // ant positions + cudaMalloc((void **)&d_weights, 64*NW*2*2*sizeof(float)); // weights + cudaMalloc((void **)&d_freqs, 384*sizeof(float)); // freqs + cudaMalloc((void **)&d_bp, 256*sizeof(float)); // bandpass + cudaMalloc((void **)&d_wr, 48*2*16*4*16*16*sizeof(half)); // real weight + cudaMalloc((void **)&d_wi, 48*2*16*4*16*16*sizeof(half)); // imag weight + cudaMemcpy(d_freqs, freqs, 384*sizeof(float), cudaMemcpyHostToDevice); + + float *h_transfer = (float *)malloc(sizeof(float)*256*96*16*NSTREAMS); + char *h_indata = (char *)malloc(sizeof(char)*16*NANT*96*8*2); + float *bp = (float *)malloc(sizeof(float)*256); + unsigned char *tmp_buf = (unsigned char *)malloc(sizeof(unsigned char)*256*48*4*NSTREAMS); + + // streams and device + cudaStream_t stream[NSTREAMS]; + for (int st=0;st d1(d_inr[st]); + thrust::fill(d1, d1+16*48*2*64*16, 0.0); + thrust::device_ptr d2(d_ini[st]); + thrust::fill(d2, d2+16*48*2*64*16, 0.0); + } + + + + // set up + + int observation_complete=0; + int blocks = 0, started = 0; + int blockct = 0; + + syslog(LOG_INFO, "starting observation"); + + while (!observation_complete) { + 
+ // open block + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + blockct ++; + + // write to output + /* written = ipcio_write (hdu_out2->data_block, block, block_out2); + if (written < block_out2) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + }*/ + + // DO STUFF + + // calc weights + init_weights(fnam,antpos,weights,flagants); + cudaMemcpy(d_antpos, antpos, 64*sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_weights, weights, 64*NW*2*2*sizeof(float), cudaMemcpyHostToDevice); + calc_weights<<<6144, 256>>>(d_antpos, d_weights, d_freqs, d_wr, d_wi); + if (DEBUG) syslog(LOG_INFO,"Finished with weights"); + + if (started==1) { + + // loop over ints + for (int bst=0;bst>>(d_indata[st], d_inr[st], d_ini[st]); + + // run beamformer kernel + beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants); + + // run adder kernel + adder<<<12288, 32, 0, stream[st]>>>(d_transfer[st], d_outdata[st], d_bp); + + // copy to host + cudaMemcpyAsync(tmp_buf + 256*48*4*st, d_outdata[st], 256*48*4*sizeof(unsigned char), cudaMemcpyDeviceToHost, stream[st]); + + // copy to output + for (int j=0;j<12288*4;j++) { + if (test_pattern) + output_buffer[(bst*NSTREAMS+st)*12288*4+j] = (unsigned char)((blockct % 128) + 32); + else + output_buffer[(bst*NSTREAMS+st)*12288*4+j] = tmp_buf[j+256*48*4*st]; + } + if (DEBUG && bst*NSTREAMS+st==10) { + for (int j=0;j<48;j++) syslog(LOG_DEBUG,"%hu",output_buffer[(bst*NSTREAMS+st)*12288+BEAM_OUT*48+j]); + } + + } + } + + + } + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + + // calculate bandpass + + for (int i=0;i<256;i++) bp[i] = 0.; + + // do standard bf but calculate bandpass + + // loop over ints + for (int bst=0;bst>>(d_indata[st], d_inr[st], d_ini[st]); + + //if (bst==0 && st==0) + // printer<<<3072, 32>>>(d_inr,d_ini); + + // run beamformer 
kernel + beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants); + + // copy back to host + cudaMemcpyAsync(h_transfer + st*256*96*16, d_transfer[st], sizeof(float)*393216, cudaMemcpyDeviceToHost, stream[st]); + + // calculate bandpass + //if (st==0 && bst==0) + //calc_bp(h_transfer,bp,1); + calc_bp(h_transfer + st*256*96*16,bp,0); + + } + } + + // adjust bandpass + syslog(LOG_INFO,"Final BP..."); + for (int i=0;i<256;i++) { + syslog(LOG_INFO,"coeff %d %g",i,bp[i]); + if (bp[i]!=0.) { + bp[i] /= 48.*nints; + bp[i] = 128./bp[i]/4.; + } + } + cudaMemcpy(d_bp, bp, sizeof(float)*256, cudaMemcpyHostToDevice); + + // junk into output + memset(output_buffer,0,block_out); + + } + + // write output for debug + + // write to output + written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out); + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + + if (DEBUG) { + syslog(LOG_DEBUG, "written block %d",blocks); + } + blocks++; + + + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + for (int st=0;st +#include +using std::cout; +using std::cerr; +using std::endl; +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "multilog.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_def.h" + +#include +#include "cuda_fp16.h" +#include +#include + +// required to prevent overflow in corr matrix multiply +#define halfFac 4 + +// beam sep +#define sep 1.0 // arcmin + +/* global variables */ +int DEBUG = 1; + +// define structure that carries around device memory +typedef struct dmem { + + // initial data and 
streams + char * h_input; // host input pointer + char * d_input, * d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] + + // correlator pointers + // giant array for r and i: [NCHAN_PER_PACKET, 2 pol, NANTS_PROCESS, NPACKETS_PER_BLOCK * 2 times] + half * d_r, * d_i; + // arrays for matrix multiply output: input [NANTS_PROCESS, NANTS_PROCESS] + half * d_outr, *d_outi, *d_tx_outr, *d_tx_outi; + // giant output array: [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex] + float * d_output; + + // beamformer pointers + char * d_big_input; + half * d_br, * d_bi; + half * weights_r, * weights_i; //weights: [arm, tactp, b] + half * d_bigbeam_r, * d_bigbeam_i; //output: [tc, b] + unsigned char * d_bigpower; //output: [b, tc] + float * d_scf; // scale factor per beam + float * d_chscf; + float * h_winp; + int * flagants, nflags; + float * h_freqs, * d_freqs; + + // timing + float cp, prep, cubl, outp; + +} dmem; + + +// allocate device memory +void initialize(dmem * d, int bf) { + + // for correlator + if (bf==0) { + cudaMalloc((void **)(&d->d_input), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); + cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2); + cudaMalloc((void **)(&d->d_i), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2); + cudaMalloc((void **)(&d->d_tx), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); + cudaMalloc((void **)(&d->d_output), sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2); + cudaMalloc((void **)(&d->d_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); + cudaMalloc((void **)(&d->d_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); + cudaMalloc((void **)(&d->d_tx_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); + cudaMalloc((void **)(&d->d_tx_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); + } + + // for beamformer + if (bf==1) { + cudaMalloc((void **)(&d->d_input), 
sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2); + cudaMalloc((void **)(&d->d_big_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS)*NCHAN_PER_PACKET*2*2); + cudaMalloc((void **)(&d->d_tx), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2); + cudaMalloc((void **)(&d->d_br), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2); + cudaMalloc((void **)(&d->d_bi), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2); + cudaMalloc((void **)(&d->weights_r), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); + cudaMalloc((void **)(&d->weights_i), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); + cudaMalloc((void **)(&d->d_bigbeam_r), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)); + cudaMalloc((void **)(&d->d_bigbeam_i), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)); + cudaMalloc((void **)(&d->d_bigpower), sizeof(unsigned char)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS)); + cudaMalloc((void **)(&d->d_scf), sizeof(float)*(NBEAMS/2)); // beam scale factor + cudaMalloc((void **)(&d->d_chscf), sizeof(float)*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); // beam scale factor + + // input weights: first is [NANTS, E/N], then [NANTS, 48, 2pol, R/I] + d->h_winp = (float *)malloc(sizeof(float)*(NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2)); + d->flagants = (int *)malloc(sizeof(int)*NANTS); + d->h_freqs = (float *)malloc(sizeof(float)*(NCHAN_PER_PACKET/8)); + cudaMalloc((void **)(&d->d_freqs), sizeof(float)*(NCHAN_PER_PACKET/8)); + + // timers + d->cp = 0.; + d->prep = 0.; + d->outp = 0.; + d->cubl = 0.; + + } + +} + +// deallocate device memory +void deallocate(dmem * d, int bf) { + + cudaFree(d->d_input); + + if (bf==0) { + cudaFree(d->d_r); + cudaFree(d->d_i); + cudaFree(d->d_tx); + cudaFree(d->d_output); + cudaFree(d->d_outr); + cudaFree(d->d_outi); + cudaFree(d->d_tx_outr); + cudaFree(d->d_tx_outi); + } + if (bf==1) { + cudaFree(d->d_tx); + 
cudaFree(d->d_br); + cudaFree(d->d_bi); + cudaFree(d->weights_r); + cudaFree(d->weights_i); + cudaFree(d->d_bigbeam_r); + cudaFree(d->d_bigbeam_i); + cudaFree(d->d_bigpower); + cudaFree(d->d_scf); + cudaFree(d->d_chscf); + free(d->h_winp); + free(d->flagants); + cudaFree(d->d_freqs); + free(d->h_freqs); + } + +} + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) +{ + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + + if (dada_hdu_unlock_write (out) < 0) + { + syslog(LOG_ERR, "could not unlock write on hdu_out"); + } + dada_hdu_destroy (out); + +} + + +void usage() +{ +fprintf (stdout, + "dsaX_bfCorr [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -i in_key [default REORDER_BLOCK_KEY]\n" + " -o out_key [default XGPU_BLOCK_KEY]\n" + " -b run beamformer [default is to run correlator]\n" + " -h print usage\n" + " -t binary file for test mode\n" + " -f flagants file\n" + " -a calib file\n" + " -s start frequency (assumes -0.244140625MHz BW)\n"); +} + +// kernel to fluff input +// run with 128 threads and NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 blocks +__global__ void corr_input_copy(char *input, half *inr, half *ini) { + + int bidx = blockIdx.x; // assume NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 + int tidx = threadIdx.x; // assume 128 + int iidx = bidx*128+tidx; + + inr[iidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(15)) << 4) >> 4)); + ini[iidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4)); + +} + + +// arbitrary transpose kernel +// assume breakdown into tiles of 32x32, and run with 32x8 threads per block +// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32) +// here, width is the dimension of the 
fastest index +__global__ void transpose_matrix_char(char * idata, char * odata) { + + __shared__ char tile[32][33]; + + int x = blockIdx.x * 32 + threadIdx.x; + int y = blockIdx.y * 32 + threadIdx.y; + int width = gridDim.x * 32; + + for (int j = 0; j < 32; j += 8) + tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; + + __syncthreads(); + + x = blockIdx.y * 32 + threadIdx.x; // transpose block offset + y = blockIdx.x * 32 + threadIdx.y; + width = gridDim.y * 32; + + for (int j = 0; j < 32; j += 8) + odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; + +} + +// arbitrary transpose kernel +// assume breakdown into tiles of 32x32, and run with 32x8 threads per block +// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32) +// here, width is the dimension of the fastest index +__global__ void transpose_matrix_float(half * idata, half * odata) { + + __shared__ half tile[32][33]; + + int x = blockIdx.x * 32 + threadIdx.x; + int y = blockIdx.y * 32 + threadIdx.y; + int width = gridDim.x * 32; + + for (int j = 0; j < 32; j += 8) + tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; + + __syncthreads(); + + x = blockIdx.y * 32 + threadIdx.x; // transpose block offset + y = blockIdx.x * 32 + threadIdx.y; + width = gridDim.y * 32; + + for (int j = 0; j < 32; j += 8) + odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; + +} + +// arbitrary transpose kernel +// assume breakdown into tiles of 32x32, and run with 32x8 threads per block +// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32) +// here, width is the dimension of the fastest index +template __global__ void transpose_matrix_template(in_prec * idata, out_prec * odata) { + + __shared__ in_prec tile[32][33]; + + int x = blockIdx.x * 32 + threadIdx.x; + int y = blockIdx.y * 32 + threadIdx.y; + int width = gridDim.x * 32; + + for (int j = 0; j < 32; j += 8) + tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; + + __syncthreads(); + + x = 
blockIdx.y * 32 + threadIdx.x; // transpose block offset + y = blockIdx.x * 32 + threadIdx.y; + width = gridDim.y * 32; + + for (int j = 0; j < 32; j += 8) + odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; + +} + + +// function to copy and reorder d_input to d_r and d_i +// input is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] +// output is [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS] +// starts by running transpose on [NPACKETS_PER_BLOCK * NANTS, NCHAN_PER_PACKET * 2 * 2] matrix in doubleComplex form. +// then fluffs using simple kernel +void reorder_input(char *input, char * tx, half *inr, half *ini) { + + // transpose input data + dim3 dimBlock(32, 8), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32); + transpose_matrix_char<<>>(input,tx); + /* + // set up for geam + cublasHandle_t cublasH = NULL; + cudaStream_t stream = NULL; + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + cublasSetStream(cublasH, stream); + + // transpose input matrix into tx + cublasOperation_t transa = CUBLAS_OP_T; + cublasOperation_t transb = CUBLAS_OP_N; + const int m = NPACKETS_PER_BLOCK * NANTS; + const int n = NCHAN_PER_PACKET*2*2/8; // columns in output + const double alpha = 1.0; + const double beta = 0.0; + const int lda = n; + const int ldb = m; + const int ldc = ldb; + cublasDgeam(cublasH,transa,transb,m,n, + &alpha,(double *)(input), + lda,&beta,(double *)(tx), + ldb,(double *)(tx),ldc); + */ + // now we just need to fluff to half-precision + corr_input_copy<<>>(tx,inr,ini); + + // look at output + /*char * odata = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4*2); + cudaMemcpy(odata,inr,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4*2,cudaMemcpyDeviceToHost); + FILE *fout; + fout=fopen("test.test","wb"); + fwrite(odata,1,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4*2,fout); + fclose(fout);*/ + + // destroy stream + //cudaStreamDestroy(stream); + +} + +// kernel to 
help with reordering output +// outr and outi are [NANTS, NANTS, NCHAN_PER_PACKET, 2time, 2pol, halfFac] +// run with NCHAN_PER_PACKET*2*NBASE/128 blocks of 128 threads +__global__ void corr_output_copy(half *outr, half *outi, float *output, int *indices_lookup) { + + int bidx = blockIdx.x; // assume NCHAN_PER_PACKET*2*NBASE/128 + int tidx = threadIdx.x; // assume 128 + int idx = bidx*128+tidx; + + int baseline = (int)(idx / (NCHAN_PER_PACKET * 2)); + int chpol = (int)(idx % (NCHAN_PER_PACKET * 2)); + int ch = (int)(chpol / 2); + int base_idx = indices_lookup[baseline]; + int iidx = base_idx * NCHAN_PER_PACKET + ch; + int pol = (int)(chpol % 2); + + float v1=0., v2=0.; + + for (int i=0;i>>(d->d_outr,d->d_tx_outr); + transpose_matrix_float<<>>(d->d_outi,d->d_tx_outi); + + // look at output + /*char * odata = (char *)malloc(sizeof(char)*384*4*NANTS*NANTS*2*halfFac); + cudaMemcpy(odata,d->d_tx_outr,384*4*NANTS*NANTS*2*halfFac,cudaMemcpyDeviceToHost); + FILE *fout; + fout=fopen("test2.test","wb"); + fwrite(odata,sizeof(char),384*4*NANTS*NANTS*2*halfFac,fout); + fclose(fout);*/ + + + /* + // set up for geam + cublasHandle_t cublasH = NULL; + cudaStream_t stream = NULL; + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + cublasSetStream(cublasH, stream); + + // transpose output matrices into tx_outr and tx_outi + cublasOperation_t transa = CUBLAS_OP_T; + cublasOperation_t transb = CUBLAS_OP_N; + const int m = NCHAN_PER_PACKET*2*2; + const int n = NANTS*NANTS/16; // columns in output + const double alpha = 1.0; + const double beta = 0.0; + const int lda = n; + const int ldb = m; + const int ldc = ldb; + cublasDgeam(cublasH,transa,transb,m,n, + &alpha,(double *)(d->d_outr), + lda,&beta,(double *)(d->d_tx_outr), + ldb,(double *)(d->d_tx_outr),ldc); + cublasDgeam(cublasH,transa,transb,m,n, + &alpha,(double *)(d->d_outi), + lda,&beta,(double *)(d->d_tx_outi), + ldb,(double *)(d->d_tx_outi),ldc); + */ + // now run kernel to sum into output + int * h_idxs = (int 
*)malloc(sizeof(int)*NBASE); + int * d_idxs; + cudaMalloc((void **)(&d_idxs), sizeof(int)*NBASE); + int ii = 0; + // upper triangular order (column major) to match xGPU (not the same as CASA!) + for (int i=0;i>>(d->d_tx_outr,d->d_tx_outi,d->d_output,d_idxs); + + /*char * odata = (char *)malloc(sizeof(char)*384*4*NBASE*4); + cudaMemcpy(odata,d->d_output,384*4*NBASE*4,cudaMemcpyDeviceToHost); + FILE *fout; + fout=fopen("test3.test","wb"); + fwrite(odata,sizeof(char),384*4*NBASE*4,fout); + fclose(fout);*/ + + + cudaFree(d_idxs); + free(h_idxs); + //cudaStreamDestroy(stream); + +} + + + +// correlator function +// workflow: copy to device, reorder, stridedBatchedGemm, reorder +void dcorrelator(dmem * d) { + + // zero out output arrays + cudaMemset(d->d_outr,0,NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half)); + cudaMemset(d->d_outi,0,NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half)); + cudaMemset(d->d_output,0,NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float)); + + // copy to device + cudaMemcpy(d->d_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,cudaMemcpyHostToDevice); + + // reorder input + reorder_input(d->d_input,d->d_tx,d->d_r,d->d_i); + + // not sure if essential + cudaDeviceSynchronize(); + + // set up for gemm + cublasHandle_t cublasH = NULL; + cudaStream_t stream = NULL; + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + cublasCreate(&cublasH); + cublasSetStream(cublasH, stream); + + // gemm settings + // input: [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS] + // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS] + cublasOperation_t transa = CUBLAS_OP_N; + cublasOperation_t transb = CUBLAS_OP_T; + const int m = NANTS; + const int n = NANTS; + const int k = NPACKETS_PER_BLOCK/halfFac; + const half alpha = 1.; + const half malpha = -1.; + const int lda = m; + const int ldb = n; + const half beta0 = 0.; + const half beta1 = 1.; + const int ldc = m; + const long long int strideA = NPACKETS_PER_BLOCK*NANTS/halfFac; 
+ const long long int strideB = NPACKETS_PER_BLOCK*NANTS/halfFac; + const long long int strideC = NANTS*NANTS; + const int batchCount = NCHAN_PER_PACKET*2*2*halfFac; + + // run strided batched gemm + // ac + cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, + &alpha,d->d_r,lda,strideA, + d->d_r,ldb,strideB,&beta0, + d->d_outr,ldc,strideC, + batchCount); + // bd + cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, + &alpha,d->d_i,lda,strideA, + d->d_i,ldb,strideB,&beta1, + d->d_outr,ldc,strideC, + batchCount); + // -bc + cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, + &malpha,d->d_i,lda,strideA, + d->d_r,ldb,strideB,&beta0, + d->d_outi,ldc,strideC, + batchCount); + // ad + cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, + &alpha,d->d_r,lda,strideA, + d->d_i,ldb,strideB,&beta1, + d->d_outi,ldc,strideC, + batchCount); + + // shown to be essential + cudaDeviceSynchronize(); + + // destroy stream + cudaStreamDestroy(stream); + cublasDestroy(cublasH); + + // reorder output data + reorder_output(d); + +} + +// kernels to reorder and fluff input data for beamformer +// initial data is [NPACKETS_PER_BLOCK, (NANTS/2), NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] +// want [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, (NANTS/2), 8chan, 2 times, 2 pol, 4-bit complex] // run as 16x16 tiled transpose with 32-byte words +// launch with dim3 dimBlock(16, 8) and dim3 dimGrid(Width/16, Height/16) +// here, width=NCHAN_PER_PACKET/8 is the dimension of the fastest input index +// dim3 dimBlock1(16, 8), dimGrid1(NCHAN_PER_PACKET/8/16, (NPACKETS_PER_BLOCK)*(NANTS/2)/16); +__global__ void transpose_input_bf(double * idata, double * odata) { + + __shared__ double tile[16][17][4]; + + int x = blockIdx.x * 16 + threadIdx.x; + int y = blockIdx.y * 16 + threadIdx.y; + int width = gridDim.x * 16; + + for (int j = 0; j < 16; j += 8) { + tile[threadIdx.y+j][threadIdx.x][0] = idata[4*((y+j)*width + x)]; + tile[threadIdx.y+j][threadIdx.x][1] = idata[4*((y+j)*width + 
x)+1]; + tile[threadIdx.y+j][threadIdx.x][2] = idata[4*((y+j)*width + x)+2]; + tile[threadIdx.y+j][threadIdx.x][3] = idata[4*((y+j)*width + x)+3]; + } + + __syncthreads(); + + x = blockIdx.y * 16 + threadIdx.x; // transpose block offset + y = blockIdx.x * 16 + threadIdx.y; + width = gridDim.y * 16; + + for (int j = 0; j < 16; j += 8) { + odata[4*((y+j)*width + x)] = tile[threadIdx.x][threadIdx.y + j][0]; + odata[4*((y+j)*width + x)+1] = tile[threadIdx.x][threadIdx.y + j][1]; + odata[4*((y+j)*width + x)+2] = tile[threadIdx.x][threadIdx.y + j][2]; + odata[4*((y+j)*width + x)+3] = tile[threadIdx.x][threadIdx.y + j][3]; + } + +} + +// kernel to fluff input bf data +// run with NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 blocks of 128 threads +__global__ void fluff_input_bf(char * input, half * dr, half * di) { + + int bidx = blockIdx.x; // assume NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 + int tidx = threadIdx.x; // assume 128 + int idx = bidx*128+tidx; + + dr[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(15)) << 4) >> 4))); + di[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(240))) >> 4))); + +} + +// transpose, add and scale kernel for bf +// assume breakdown into tiles of 16x16, and run with 16x8 threads per block +// launch with dim3 dimBlock(16, 8) and dim3 dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16) +// scf is a per-beam scale factor to enable recasting as unsigned char +__global__ void transpose_scale_bf(half * ir, half * ii, unsigned char * odata) { + + __shared__ float tile[16][17]; + + int x = blockIdx.x * 16 + threadIdx.x; + int y = blockIdx.y * 16 + threadIdx.y; + int width = gridDim.x * 16; + float dr, di; + + for (int j = 0; j < 16; j += 8) { + dr = (float)(ir[(y+j)*width + x]); + di = (float)(ii[(y+j)*width + x]); + tile[threadIdx.y+j][threadIdx.x] = (dr*dr+di*di); + } + + __syncthreads(); + + x = blockIdx.y * 16 + 
threadIdx.x; // transpose block offset + y = blockIdx.x * 16 + threadIdx.y; + width = gridDim.y * 16; + + for (int j = 0; j < 16; j += 8) + odata[(y+j)*width + x] = (unsigned char)(tile[threadIdx.x][threadIdx.y + j]/128.); + +} + +// sum over all times in output beam array +// run with (NCHAN_PER_PACKET/8)*(NBEAMS/2) blocks of (NPACKETS_PER_BLOCK/4) threads +__global__ void sum_beam(unsigned char * input, float * output) { + + __shared__ float summ[512]; + int bidx = blockIdx.x; + int tidx = threadIdx.x; + int idx = bidx*256+tidx; + int bm = (int)(bidx/48); + int ch = (int)(bidx % 48); + + summ[tidx] = (float)(input[bm*256*48 + tidx*48 + ch]); + + __syncthreads(); + + if (tidx<256) { + summ[tidx] += summ[tidx+256]; + summ[tidx] += summ[tidx+128]; + summ[tidx] += summ[tidx+64]; + summ[tidx] += summ[tidx+32]; + summ[tidx] += summ[tidx+16]; + summ[tidx] += summ[tidx+8]; + summ[tidx] += summ[tidx+4]; + summ[tidx] += summ[tidx+2]; + summ[tidx] += summ[tidx+1]; + } + + if (tidx==0) output[bidx] = summ[tidx]; + +} + +/* +Beamformer: + - initial data is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] + - split into EW and NS antennas via cudaMemcpy: [NPACKETS_PER_BLOCK, NANTS/2, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] + - want [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, NANTS/2, 8chan, 2 times, 2 pol, 4-bit complex] +(single transpose operation) + - weights are [NCHAN_PER_PACKET/8, NBEAMS, 4tim, NANTS/2, 8chan, 2 times, 2 pol] x 2 + - then fluff and run beamformer: output is [NCHAN_PER_PACKET/8, NBEAMS, NPACKETS_PER_BLOCK/4] (w column-major) + - transpose and done! 
+ +*/ +// beamformer function +void dbeamformer(dmem * d) { + + // gemm settings - recall column major order assumed + // stride over 48 chans + cublasHandle_t cublasH = NULL; + cublasCreate(&cublasH); + cublasOperation_t transa = CUBLAS_OP_T; + cublasOperation_t transb = CUBLAS_OP_N; + const int m = NPACKETS_PER_BLOCK/4; + const int n = NBEAMS/2; + const int k = 4*(NANTS/2)*8*2*2; + const half alpha = 1.; + const half malpha = -1.; + const int lda = k; + const int ldb = k; + const half beta0 = 0.; + const half beta1 = 1.; + const int ldc = m; + const long long int strideA = (NPACKETS_PER_BLOCK)*(NANTS/2)*8*2*2; + const long long int strideB = (NBEAMS/2)*4*(NANTS/2)*8*2*2; + const long long int strideC = (NPACKETS_PER_BLOCK/4)*NBEAMS/2; + const int batchCount = NCHAN_PER_PACKET/8; + long long int i1, i2, o1; + + // create streams + cudaStream_t stream; + cudaStreamCreate(&stream); + + // timing + // copy, prepare, cublas, output + clock_t begin, end; + + // do big memcpy + begin = clock(); + cudaMemcpy(d->d_big_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4,cudaMemcpyHostToDevice); + end = clock(); + d->cp += (float)(end - begin) / CLOCKS_PER_SEC; + + // loop over halves of the array + for (int iArm=0;iArm<2;iArm++) { + + // zero out output arrays + cudaMemset(d->d_bigbeam_r,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half)); + cudaMemset(d->d_bigbeam_i,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half)); + cudaDeviceSynchronize(); + + // copy data to device + // initial data: [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] + // final data: need to split by NANTS. 
+ begin = clock(); + for (i1=0;i1d_input+i1*(NANTS/2)*NCHAN_PER_PACKET*4,d->d_big_input+i1*(NANTS)*NCHAN_PER_PACKET*4+iArm*(NANTS/2)*NCHAN_PER_PACKET*4,(NANTS/2)*NCHAN_PER_PACKET*4,cudaMemcpyDeviceToDevice); + end = clock(); + d->cp += (float)(end - begin) / CLOCKS_PER_SEC; + + // do reorder and fluff of data to real and imag + begin = clock(); + dim3 dimBlock1(16, 8), dimGrid1(NCHAN_PER_PACKET/8/16, (NPACKETS_PER_BLOCK)*(NANTS/2)/16); + transpose_input_bf<<>>((double *)(d->d_input),(double *)(d->d_tx)); + fluff_input_bf<<>>(d->d_tx,d->d_br,d->d_bi); + end = clock(); + d->prep += (float)(end - begin) / CLOCKS_PER_SEC; + + // large matrix multiply to get real and imag outputs + // set up for gemm + cublasSetStream(cublasH, stream); + i2 = iArm*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8); // weights offset + + // run strided batched gemm + begin = clock(); + // ac + cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, + &alpha,d->d_br,lda,strideA, + d->weights_r+i2,ldb,strideB,&beta0, + d->d_bigbeam_r,ldc,strideC, + batchCount); + // -bd + cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, + &malpha,d->d_bi,lda,strideA, + d->weights_i+i2,ldb,strideB,&beta1, + d->d_bigbeam_r,ldc,strideC, + batchCount); + // bc + cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, + &alpha,d->d_bi,lda,strideA, + d->weights_r+i2,ldb,strideB,&beta0, + d->d_bigbeam_i,ldc,strideC, + batchCount); + // ad + cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, + &alpha,d->d_br,lda,strideA, + d->weights_i+i2,ldb,strideB,&beta1, + d->d_bigbeam_i,ldc,strideC, + batchCount); + + cudaDeviceSynchronize(); + end = clock(); + d->cubl += (float)(end - begin) / CLOCKS_PER_SEC; + + + // simple formation of total power and scaling to 8-bit in transpose kernel + begin = clock(); + dim3 dimBlock(16, 8), dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16); + 
transpose_scale_bf<<>>(d->d_bigbeam_r,d->d_bigbeam_i,d->d_bigpower+iArm*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)); + end = clock(); + d->outp += (float)(end - begin) / CLOCKS_PER_SEC; + + + } + + cudaStreamDestroy(stream); + + + cublasDestroy(cublasH); + + // form sum over times + //sum_beam<<<24576,512>>>(d->d_bigpower,d->d_chscf); + +} + +// kernel to populate an instance of weights matrix [2, (NCHAN_PER_PACKET/8), NBEAMS/2, 4times*(NANTS/2)*8chan*2tim*2pol] +// run with 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128 blocks of 128 threads +__global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs) { + + int bidx = blockIdx.x; + int tidx = threadIdx.x; + int inidx = bidx*128+tidx; + + // 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2) + + // get indices + int iArm = (int)(inidx / ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2))); + int iidx = (int)(inidx % ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2))); + int fq = (int)(iidx / (128*(NANTS/2)*(NBEAMS/2))); + int idx = (int)(iidx % (128*(NANTS/2)*(NBEAMS/2))); + int bm = (int)(idx / (128*(NANTS/2))); + int tactp = (int)(idx % (128*(NANTS/2))); + int t = (int)(tactp / (32*(NANTS/2))); + int actp = (int)(tactp % (32*(NANTS/2))); + int a = (int)(actp / 32); + int ctp = (int)(actp % 32); + int c = (int)(ctp / 4); + int tp = (int)(ctp % 4); + int t2 = (int)(tp / 2); + int pol = (int)(tp % 2); + int widx = (a+48*iArm)*(NCHAN_PER_PACKET/8)*2*2 + fq*2*2 + pol*2; + + // calculate weights + float theta, afac, twr, twi; + if (iArm==0) { + theta = sep*(127.-bm*1.)*PI/10800.; // radians + afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate + twr = cos(afac*antpos_e[a+48*iArm]); + twi = sin(afac*antpos_e[a+48*iArm]); + wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1])); + wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1])); + //wr[inidx] = __float2half(calibs[widx]); + //wi[inidx] = __float2half(calibs[widx+1]); 
+ } + if (iArm==1) { + theta = sep*(127.-bm*1.)*PI/10800.; // radians + afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate + twr = cos(afac*antpos_n[a+48*iArm]); + twi = sin(afac*antpos_n[a+48*iArm]); + wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1])); + wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1])); + //wr[inidx] = __float2half(calibs[widx]); + //wi[inidx] = __float2half(calibs[widx+1]); + } + +} + +// GPU-powered function to populate weights matrix for beamformer +// file format: +// sequential pairs of eastings and northings +// then [NANTS, 48, R/I] calibs + +void calc_weights(dmem * d) { + + // allocate + float *antpos_e = (float *)malloc(sizeof(float)*NANTS); + float *antpos_n = (float *)malloc(sizeof(float)*NANTS); + float *calibs = (float *)malloc(sizeof(float)*NANTS*(NCHAN_PER_PACKET/8)*2*2); + float *d_antpos_e, *d_antpos_n, *d_calibs; + float wnorm; + cudaMalloc((void **)(&d_antpos_e), sizeof(float)*NANTS); + cudaMalloc((void **)(&d_antpos_n), sizeof(float)*NANTS); + cudaMalloc((void **)(&d_calibs), sizeof(float)*NANTS*(NCHAN_PER_PACKET/8)*2*2); + + // deal with antpos and calibs + int iant, found; + for (int i=0;ih_winp[2*i]; + antpos_n[i] = d->h_winp[2*i+1]; + } + for (int i=0;inflags;j++) + if (d->flagants[j]==iant) found = 1; + + calibs[2*i] = d->h_winp[2*NANTS+2*i]; + calibs[2*i+1] = d->h_winp[2*NANTS+2*i+1]; + + wnorm = sqrt(calibs[2*i]*calibs[2*i] + calibs[2*i+1]*calibs[2*i+1]); + if (wnorm!=0.0) { + calibs[2*i] /= wnorm; + calibs[2*i+1] /= wnorm; + } + + //if (found==1) { + //calibs[2*i] = 0.; + //calibs[2*i+1] = 0.; + //} + } + + //for (int i=0;i>>(d_antpos_e,d_antpos_n,d_calibs,d->weights_r,d->weights_i,d->d_freqs); + + // free stuff + cudaFree(d_antpos_e); + cudaFree(d_antpos_n); + cudaFree(d_calibs); + free(antpos_e); + free(antpos_n); + free(calibs); + +} + +// MAIN + +int main (int argc, char *argv[]) { + + cudaSetDevice(1); + + // startup syslog message + // using LOG_LOCAL0 + openlog 
("dsaX_bfCorr", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + /* DADA Header plus Data Unit */ + dada_hdu_t* hdu_in = 0; + dada_hdu_t* hdu_out = 0; + + // data block HDU keys + key_t in_key = REORDER_BLOCK_KEY; + key_t out_key = XGPU_BLOCK_KEY; + + // command line arguments + int core = -1; + int arg = 0; + int bf = 0; + int test = 0; + char ftest[200], fflagants[200], fcalib[200]; + float sfreq = 1498.75; + + + while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:bdh")) != -1) + { + switch (arg) + { + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 't': + if (optarg) + { + test = 1; + syslog(LOG_INFO, "test mode"); + if (sscanf (optarg, "%s", &ftest) != 1) { + syslog(LOG_ERR, "could not read test file name from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-t flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'a': + if (optarg) + { + syslog(LOG_INFO, "read calib file %s",optarg); + if (sscanf (optarg, "%s", &fcalib) != 1) { + syslog(LOG_ERR, "could not read calib file name from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-a flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'f': + if (optarg) + { + syslog(LOG_INFO, "reading flag ants file 
%s",optarg); + if (sscanf (optarg, "%s", &fflagants) != 1) { + syslog(LOG_ERR, "could not read flagants file name from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-f flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 's': + if (optarg) + { + sfreq = atof(optarg); + syslog(LOG_INFO, "start freq %g",sfreq); + break; + } + else + { + syslog(LOG_ERR,"-s flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'd': + DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + case 'b': + bf=1; + syslog (LOG_NOTICE, "Running beamformer, NOT correlator"); + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + // Bind to cpu core + if (core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + // allocate device memory + dmem d; + initialize(&d,bf); + + // set up for beamformer + FILE *ff; + int iii; + if (bf) { + + if (!(ff=fopen(fflagants,"r"))) { + syslog(LOG_ERR,"could not open flagants file\n"); + exit(1); + } + d.nflags=0; + while (!feof(ff)) { + fscanf(ff,"%d\n",&d.flagants[iii]); + d.nflags++; + } + fclose(ff); + + if (!(ff=fopen(fcalib,"rb"))) { + syslog(LOG_ERR,"could not open calibss file\n"); + exit(1); + } + fread(d.h_winp,NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2,4,ff); + fclose(ff); + + for (iii=0;iii<(NCHAN_PER_PACKET/8);iii++) + d.h_freqs[iii] = 1e6*(sfreq-iii*250./1024.); + cudaMemcpy(d.d_freqs,d.h_freqs,sizeof(float)*(NCHAN_PER_PACKET/8),cudaMemcpyHostToDevice); + + // calculate weights + calc_weights(&d); + + } + + // test mode + FILE *fin, *fout; + uint64_t output_size; + char * output_data, * o1; + if (test) { + + // read one block of input data + d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); + for (int i=0;i<512;i++) { + fin = fopen(ftest,"rb"); + 
fread(d.h_input+i*4*NANTS*NCHAN_PER_PACKET*2*2,4*NANTS*NCHAN_PER_PACKET*2*2,1,fin); + fclose(fin); + } + + // run correlator or beamformer, and output data + if (bf==0) { + if (DEBUG) syslog(LOG_INFO,"run correlator"); + dcorrelator(&d); + if (DEBUG) syslog(LOG_INFO,"copy to host"); + output_size = NBASE*NCHAN_PER_PACKET*2*2*4; + output_data = (char *)malloc(output_size); + cudaMemcpy(output_data,d.d_output,output_size,cudaMemcpyDeviceToHost); + + fout = fopen("output.dat","wb"); + fwrite((float *)output_data,sizeof(float),NBASE*NCHAN_PER_PACKET*2*2,fout); + fclose(fout); + } + else { + if (DEBUG) syslog(LOG_INFO,"run beamformer"); + dbeamformer(&d); + if (DEBUG) syslog(LOG_INFO,"copy to host"); + output_size = (NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS; + output_data = (char *)malloc(output_size); + cudaMemcpy(output_data,d.d_bigpower,output_size,cudaMemcpyDeviceToHost); + + /*output_size = 2*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8); + o1 = (char *)malloc(output_size); + cudaMemcpy(o1,d.weights_r,output_size,cudaMemcpyDeviceToHost);*/ + + + + fout = fopen("output.dat","wb"); + fwrite((unsigned char *)output_data,sizeof(unsigned char),output_size,fout); + //fwrite(o1,1,output_size,fout); + fclose(fout); + } + + + // free + free(d.h_input); + free(output_data); + free(o1); + deallocate(&d,bf); + + exit(1); + } + + + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (0); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog 
(LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); + if (bf==0) + syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,NBASE*NCHAN_PER_PACKET*2*2*4); + else + syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS); + uint64_t bytes_read = 0; + char * block; + char * output_buffer; + output_buffer = (char *)malloc(block_out); + uint64_t written, block_id; + + // get things started + bool observation_complete=0; + bool started = 0; + syslog(LOG_INFO, "starting observation"); + int blocks = 0; + clock_t begin, 
end; + double time_spent; + + while (!observation_complete) { + + if (DEBUG) syslog(LOG_INFO,"reading block"); + + // open block + d.h_input = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + + // do stuff + //begin = clock(); + // loop + if (bf==0) { + if (DEBUG) syslog(LOG_INFO,"run correlator"); + dcorrelator(&d); + if (DEBUG) syslog(LOG_INFO,"copy to host"); + cudaMemcpy(output_buffer,d.d_output,block_out,cudaMemcpyDeviceToHost); + } + else { + if (DEBUG) syslog(LOG_INFO,"run beamformer"); + dbeamformer(&d); + if (DEBUG) syslog(LOG_INFO,"copy to host"); + cudaMemcpy(output_buffer,d.d_bigpower,block_out,cudaMemcpyDeviceToHost); + } + //end = clock(); + //time_spent = (double)(end - begin) / CLOCKS_PER_SEC; + cout << "spent time " << d.cp << " " << d.prep << " " << d.cubl << " " << d.outp << " s" << endl; + + // write to output + + // write to host + written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out); + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + if (DEBUG) syslog(LOG_INFO, "written block %d",blocks); + blocks++; + // loop end + + + // finish up + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + // finish up + free(output_buffer); + deallocate(&d,bf); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + +} + + diff --git a/legacy/dsaX_bigfake.c b/legacy/dsaX_bigfake.c new file mode 100644 index 0000000..f5e1354 --- /dev/null +++ b/legacy/dsaX_bigfake.c @@ -0,0 +1,320 @@ +/* will reorder raw data for input to xgpu */ +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" 
+#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + +// global variables +int DEBUG = 0; + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) +{ + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + + if (dada_hdu_unlock_write (out) < 0) + { + syslog(LOG_ERR, "could not unlock write on hdu_out"); + } + dada_hdu_destroy (out); + +} + +void usage() +{ + fprintf (stdout, + "dsaX_fake [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -f file to read packet from [default none]\n" + " -i in_key [default TEST_BLOCK_KEY]\n" + " -o out_key [default REORDER_BLOCK_KEY2]\n" + " -h print usage\n"); +} + +// MAIN + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_fake", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + /* DADA Header plus Data Unit */ + dada_hdu_t* hdu_in = 0; + dada_hdu_t* hdu_out = 0; + + // data block HDU keys + key_t in_key = TEST_BLOCK_KEY; + key_t out_key = REORDER_BLOCK_KEY2; + + // command line arguments + int core = -1; + int useZ = 1; + char fnam[100]; + int arg = 0; + + while ((arg=getopt(argc,argv,"c:f:i:o:dh")) != -1) + { + switch (arg) + { + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + 
return EXIT_FAILURE; + } + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'f': + if (optarg) + { + useZ = 0; + strcpy(fnam,optarg); + break; + } + else + { + syslog(LOG_ERR,"-f flag requires argument"); + usage(); + return EXIT_FAILURE; + } + + case 'd': + DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + // Bind to cpu core + if (core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + char * header_out = 
ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + uint64_t bytes_read = 0; + uint64_t npackets = block_out / 4194304; + char * block, * output_buffer; + char * packet; + packet = (char *)malloc(sizeof(char)*4194304); + output_buffer = (char *)malloc(sizeof(char)*block_out); + memset(output_buffer,0,block_out); + uint64_t written, block_id; + + // fill output buffer if file exists + FILE *fin; + if (!useZ) { + + if (!(fin=fopen(fnam,"rb"))) { + syslog(LOG_ERR, "cannot open file - will write zeros"); + } + else { + + fread(packet,4194304,1,fin); + fclose(fin); + + syslog(LOG_INFO,"Read packet, npackets %llu",npackets); + + for (int i=0;idata_block, &bytes_read, &block_id); + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + } + + // DO STUFF + // no need to do anything here - output_buffer is ready to go + + // write to output + written = ipcio_write (hdu_out->data_block, output_buffer, block_out); + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + if (DEBUG) { + syslog(LOG_DEBUG, "written block %d",blocks); + } + blocks++; + + + if (bytes_read < 
block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + free(packet); + free(output_buffer); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + +} + + diff --git a/legacy/dsaX_capture.c b/legacy/dsaX_capture.c new file mode 100644 index 0000000..054e45d --- /dev/null +++ b/legacy/dsaX_capture.c @@ -0,0 +1,1080 @@ +/* dsaX_capture.c: Code to capture packets over a socket and write to a dada buffer. + +1;95;0cmain: runs capture loop, and interfaces dada buffer +control_thread: deals with control commands + +*/ + +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" +//#include "multilog.h" + +#define unhappies 3000 +#define skips 6 +#define sleeps 1.5 + +/* global variables */ +int quit_threads = 0; +char STATE[20]; +uint64_t UTC_START = 10000; +uint64_t UTC_STOP = 40000000000; +int MONITOR = 0; +char iP[100]; +int DEBUG = 0; +int HISTOGRAM[16]; +int cPort = CAPTURE_CONTROL_PORT; +int dPort = CAPTURE_PORT; + +void dsaX_dbgpu_cleanup (dada_hdu_t * out); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * out) +{ + + if (dada_hdu_unlock_write (out) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_out"); + } + dada_hdu_destroy (out); + + + +} + +void usage() +{ + fprintf (stdout, + "dsaX_capture [options]\n" + " -c core bind process to CPU core [no default]\n" + " -j IP to listen on for data packets [no default]\n" + " -p PORT to listen to for data packets [default 4011]\n" + " -q PORT to listen to for control commands [default CAPTURE_CONTROL_PORT]\n" + " 
-i IP to listen on for control commands [no default]\n" + " -f filename of template dada header [no default]\n" + " -o out_key [default CAPTURE_BLOCK_KEY]\n" + " -d send debug messages to syslog\n" + " -h print usage\n"); +} + +/* + * create a socket with the specified number of buffers + */ +dsaX_sock_t * dsaX_init_sock () +{ + dsaX_sock_t * b = (dsaX_sock_t *) malloc(sizeof(dsaX_sock_t)); + assert(b != NULL); + + b->bufsz = sizeof(char) * UDP_PAYLOAD; + + b->buf = (char *) malloc (b->bufsz); + assert(b->buf != NULL); + + b->have_packet = 0; + b->fd = 0; + + return b; +} + +void dsaX_free_sock(dsaX_sock_t* b) +{ + b->fd = 0; + b->bufsz = 0; + b->have_packet =0; + if (b->buf) + free (b->buf); + b->buf = 0; +} + +/* + * intialize UDP receiver resources + */ +int dsaX_udpdb_init_receiver (udpdb_t * ctx) +{ + syslog(LOG_INFO,"dsax_udpdb_init_receiver()"); + + // create a dsaX socket which can hold variable num of UDP packet + ctx->sock = dsaX_init_sock(); + + ctx->ooo_packets = 0; + ctx->recv_core = -1; + ctx->n_sleeps = 0; + ctx->mb_rcv_ps = 0; + ctx->mb_drp_ps = 0; + ctx->block_open = 0; + ctx->block_count = 0; + ctx->capture_started = 0; + ctx->last_seq = 0; + ctx->last_byte = 0; + ctx->block_start_byte = 0; + + // allocate required memory strucutres + ctx->packets = init_stats_t(); + ctx->bytes = init_stats_t(); + + syslog(LOG_INFO,"receiver inited"); + + return 0; +} + +/* +prepare socket and writer +*/ + +int dsaX_udpdb_prepare (udpdb_t * ctx) +{ + syslog(LOG_INFO, "dsaX_udpdb_prepare()"); + + // open socket + syslog(LOG_INFO, "prepare: creating udp socket on %s:%d", ctx->interface, ctx->port); + ctx->sock->fd = dada_udp_sock_in(ctx->log, ctx->interface, ctx->port, ctx->verbose); + if (ctx->sock->fd < 0) { + syslog (LOG_ERR, "Error, Failed to create udp socket"); + return -1; + } + + + // set the socket size to 256 MB + int sock_buf_size = 4*1024*1024; + syslog(LOG_INFO, "prepare: setting buffer size to %d", sock_buf_size); + dada_udp_sock_set_buffer_size 
(ctx->log, ctx->sock->fd, ctx->verbose, sock_buf_size); + + // set the socket to non-blocking + syslog(LOG_INFO, "prepare: setting non_block"); + sock_nonblock(ctx->sock->fd); + + // clear any packets buffered by the kernel + syslog(LOG_INFO, "prepare: clearing packets at socket"); + size_t cleared = dada_sock_clear_buffered_packets(ctx->sock->fd, UDP_PAYLOAD); + + // setup the next_seq to the initial value + //ctx->last_seq = 0; + //ctx->last_byte = 0; + //ctx->n_sleeps = 0; + + return 0; +} + +/* + * reset receiver before an observation commences + */ +void dsaX_udpdb_reset_receiver (udpdb_t * ctx) +{ + syslog (LOG_INFO, "dsaX_udpdb_reset_receiver()"); + + ctx->capture_started = 0; + ctx->last_seq = 0; + ctx->last_byte = 0; + ctx->n_sleeps = 0; + + reset_stats_t(ctx->packets); + reset_stats_t(ctx->bytes); +} + +/* + * open a data block buffer ready for direct access + */ +int dsaX_udpdb_open_buffer (udpdb_t * ctx) +{ + + if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_open_buffer()"); + + if (ctx->block_open) + { + syslog (LOG_ERR, "open_buffer: buffer already opened"); + return -1; + } + + if (DEBUG) syslog (LOG_DEBUG, "open_buffer: ipcio_open_block_write"); + + uint64_t block_id = 0; + + ctx->block = ipcio_open_block_write (ctx->hdu->data_block, &block_id); + if (!ctx->block) + { + syslog (LOG_ERR, "open_buffer: ipcio_open_block_write failed"); + return -1; + } + + ctx->block_open = 1; + ctx->block_count = 0; + + return 0; +} + +/* + * close a data buffer, assuming a full block has been written + */ +int dsaX_udpdb_close_buffer (udpdb_t * ctx, uint64_t bytes_written, unsigned eod) +{ + + if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_close_buffer(%"PRIu64", %d)", bytes_written, eod); + + if (!ctx->block_open) + { + syslog (LOG_ERR, "close_buffer: buffer already closed"); + return -1; + } + + // log any buffers that are not full, except for the 1 byte "EOD" buffer + if ((bytes_written != 1) && (bytes_written != ctx->hdu_bufsz)) + syslog ((eod ? 
LOG_INFO : LOG_WARNING), "close_buffer: " + "bytes_written[%"PRIu64"] != hdu_bufsz[%"PRIu64"]", + bytes_written, ctx->hdu_bufsz); + + if (eod) + { + if (ipcio_update_block_write (ctx->hdu->data_block, bytes_written) < 0) + { + syslog (LOG_ERR, "close_buffer: ipcio_update_block_write failed"); + return -1; + } + } + else + { + if (ipcio_close_block_write (ctx->hdu->data_block, bytes_written) < 0) + { + syslog (LOG_ERR, "close_buffer: ipcio_close_block_write failed"); + return -1; + } + } + + ctx->block = 0; + ctx->block_open = 0; + + return 0; +} + +/* + * move to the next ring buffer element. return pointer to base address of new buffer + */ +int dsaX_udpdb_new_buffer (udpdb_t * ctx) +{ + + if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_new_buffer()"); + + if (dsaX_udpdb_close_buffer (ctx, ctx->hdu_bufsz, 0) < 0) + { + syslog (LOG_ERR, "new_buffer: dsaX_udpdb_close_buffer failed"); + return -1; + } + + if (dsaX_udpdb_open_buffer (ctx) < 0) + { + syslog (LOG_ERR, "new_buffer: dsaX_udpdb_open_buffer failed"); + return -1; + } + + // increment buffer byte markers + ctx->block_start_byte = ctx->block_end_byte + UDP_DATA; + ctx->block_end_byte = ctx->block_start_byte + ( ctx->packets_per_buffer - 1) * UDP_DATA; + + // set block to 0 + //memset(ctx->block,0,ctx->block_end_byte-ctx->block_start_byte); + + if (DEBUG) syslog(LOG_DEBUG, "new_buffer: buffer_bytes [%"PRIu64" - %"PRIu64"]", + ctx->block_start_byte, ctx->block_end_byte); + + return 0; + +} + +/* + * destroy UDP receiver resources + */ +int dsaX_udpdb_destroy_receiver (udpdb_t * ctx) +{ + if (ctx->sock) + dsaX_free_sock(ctx->sock); + ctx->sock = 0; +} + +/* + * Close the udp socket and file + */ + +int udpdb_stop_function (udpdb_t* ctx) +{ + + syslog(LOG_INFO, "stop: dada_hdu_unlock_write()"); + if (dada_hdu_unlock_write (ctx->hdu) < 0) + { + syslog (LOG_ERR, "stop: could not unlock write on"); + return -1; + } + + // close the UDP socket + close(ctx->sock->fd); + + if (ctx->packets->dropped) + { + double percent = 
(double) ctx->bytes->dropped / (double) ctx->last_byte; + percent *= 100; + + syslog(LOG_INFO, "bytes dropped %"PRIu64" / %"PRIu64 " = %8.6f %", + ctx->bytes->dropped, ctx->last_byte, percent); + } + + return 0; +} + + + + +/* --------- THREADS -------- */ + +// STATS THREAD + +/* + * Thread to print simple capture statistics + */ +void stats_thread(void * arg) { + + /* // set affinity + const pthread_t pid = pthread_self(); + const int core_id = 4; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id); + const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id); + if (CPU_ISSET(core_id, &cpuset)) + syslog(LOG_INFO,"thread %d: successfully set thread",core_id); + */ + + udpdb_t * ctx = (udpdb_t *) arg; + uint64_t b_rcv_total = 0; + uint64_t b_rcv_1sec = 0; + uint64_t b_rcv_curr = 0; + + uint64_t b_drp_total = 0; + uint64_t b_drp_1sec = 0; + uint64_t b_drp_curr = 0; + + uint64_t s_rcv_total = 0; + uint64_t s_rcv_1sec = 0; + uint64_t s_rcv_curr = 0; + + uint64_t ooo_pkts = 0; + float gb_rcv_ps = 0; + float mb_rcv_ps = 0; + float mb_drp_ps = 0; + + syslog(LOG_INFO,"stats_thread: starting loop"); + + while (!quit_threads) + { + + /* get a snapshot of the data as quickly as possible */ + b_rcv_curr = ctx->bytes->received; + b_drp_curr = ctx->bytes->dropped; + s_rcv_curr = ctx->n_sleeps; + + /* calc the values for the last second */ + b_rcv_1sec = b_rcv_curr - b_rcv_total; + b_drp_1sec = b_drp_curr - b_drp_total; + s_rcv_1sec = s_rcv_curr - s_rcv_total; + + /* update the totals */ + b_rcv_total = b_rcv_curr; + b_drp_total = b_drp_curr; + s_rcv_total = s_rcv_curr; + + mb_rcv_ps = (double) b_rcv_1sec / 1000000; + mb_drp_ps = (double) b_drp_1sec / 1000000; + gb_rcv_ps = b_rcv_1sec * 8; + 
gb_rcv_ps /= 1000000000; + + /* determine how much memory is free in the receivers */ + syslog (LOG_NOTICE,"CAPSTATS %6.3f [Gb/s], D %4.1f [MB/s], D %"PRIu64" pkts, %"PRIu64" skipped 0", gb_rcv_ps, mb_drp_ps, ctx->packets->dropped, ctx->last_seq); + + sleep(1); + } + +} + + + + + + + +// CONTROL THREAD + +void control_thread (void * arg) { + + udpdb_t * ctx = (udpdb_t *) arg; + syslog(LOG_INFO, "control_thread: starting"); + + // port on which to listen for control commands + int port = cPort; + char sport[10]; + sprintf(sport,"%d",port); + + // buffer for incoming command strings, and setup of socket + int bufsize = 1024; + char* buffer = (char *) malloc (sizeof(char) * bufsize); + memset(buffer, '\0', bufsize); + const char* whitespace = " "; + char * command = 0; + char * args = 0; + + struct addrinfo hints; + struct addrinfo* res=0; + memset(&hints,0,sizeof(hints)); + struct sockaddr_storage src_addr; + socklen_t src_addr_len=sizeof(src_addr); + hints.ai_family=AF_INET; + hints.ai_socktype=SOCK_DGRAM; + getaddrinfo(iP,sport,&hints,&res); + int fd; + ssize_t ct; + char tmpstr; + char cmpstr = 'p'; + char *endptr; + uint64_t tmps; + char * token; + + syslog(LOG_INFO, "control_thread: created socket on port %d", port); + + while (!quit_threads) { + + fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol); + bind(fd,res->ai_addr,res->ai_addrlen); + memset(buffer,'\0',sizeof(buffer)); + syslog(LOG_INFO, "control_thread: waiting for packet"); + ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len); + + syslog(LOG_INFO, "control_thread: received buffer string %s",buffer); + + // INTERPRET BUFFER STRING + // receive either UTC_START, UTC_STOP, MONITOR + + // interpret buffer string + char * rest = buffer; + char *cmd, *val; + cmd = strtok_r(rest, "-", &rest); + val = strtok_r(rest, "-", &rest); + syslog(LOG_INFO, "control_thread: split into COMMAND %s, VALUE %s",cmd,val); + + if (strcmp(cmd,"UTC_START")==0) + UTC_START = 
strtoull(val,&endptr,0); + + if (strcmp(cmd,"UTC_STOP")==0) + UTC_STOP = strtoull(val,&endptr,0); + + close(fd); + + } + + free (buffer); + + syslog(LOG_INFO, "control_thread: exiting"); + + /* return 0 */ + int thread_result = 0; + pthread_exit((void *) &thread_result); + +} + +// MAIN of program + +int main (int argc, char *argv[]) { + + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_capture", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + /* DADA Header plus Data Unit for writing */ + dada_hdu_t* hdu_out = 0; + + /* actual struct with info */ + udpdb_t udpdb; + + // input data block HDU key + key_t out_key = CAPTURE_BLOCK_KEY; + + // command line arguments + int core = -1; + int arg=0; + char dada_fnam[200]; // filename for dada header + char iface[100]; // IP for data packets + + while ((arg=getopt(argc,argv,"c:j:i:f:o:g:p:q:dh")) != -1) + { + switch (arg) + { + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'i': + if (optarg) + { + strcpy(iP,optarg); + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'p': + if (optarg) + { + dPort = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-p flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'q': + if (optarg) + { + cPort = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-q flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'j': + if (optarg) + { + strcpy(iface,optarg); + break; + } + else + { + syslog(LOG_ERR,"-j flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires 
argument"); + usage(); + return EXIT_FAILURE; + } + case 'f': + if (optarg) + { + strcpy(dada_fnam,optarg); + break; + } + else + { + syslog(LOG_ERR,"-f flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'd': + DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + // record STATE info + sprintf(STATE,"NOBUFFER"); + + // START THREADS + + // start control thread + int rval = 0; + pthread_t control_thread_id, stats_thread_id; + if (DEBUG) + syslog (LOG_DEBUG, "Creating threads"); + rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb); + if (rval != 0) { + syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval)); + return -1; + } + syslog(LOG_NOTICE, "Created control thread, listening on %s:%d",iP,cPort); + + // start the stats thread + rval = pthread_create (&stats_thread_id, 0, (void *) stats_thread, (void *) &udpdb); + if (rval != 0) { + syslog(LOG_INFO, "Error creating stats_thread: %s", strerror(rval)); + return -1; + } + syslog(LOG_NOTICE, "started stats_thread()"); + + + // Bind to cpu core + if (core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + // initialize the data structure + syslog (LOG_INFO, "main: dsaX_udpdb_init_receiver()"); + if (dsaX_udpdb_init_receiver (&udpdb) < 0) + { + syslog (LOG_ERR, "could not initialize receiver"); + return EXIT_FAILURE; + } + + + // OPEN CONNECTION TO DADA DB FOR WRITING + + if (DEBUG) syslog(LOG_INFO,"Creating HDU"); + + hdu_out = dada_hdu_create (0); + if (DEBUG) syslog(LOG_INFO,"Created hdu"); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog(LOG_ERR,"could not connect to output dada buffer"); + return EXIT_FAILURE; + } + if (DEBUG) syslog(LOG_INFO,"Connected HDU"); + if (dada_hdu_lock_write(hdu_out) < 0) { + dsaX_dbgpu_cleanup 
(hdu_out); + syslog(LOG_ERR,"could not lock to output dada buffer"); + return EXIT_FAILURE; + } + + syslog(LOG_INFO,"opened connection to output DB"); + + // DEAL WITH DADA HEADER + char *hout; + hout = (char *)malloc(sizeof(char)*4096); + if (DEBUG) syslog(LOG_INFO,"read header2"); + + if (fileread (dada_fnam, hout, 4096) < 0) + { + free (hout); + syslog (LOG_ERR, "could not read ASCII header from %s", dada_fnam); + return (EXIT_FAILURE); + } + + + if (DEBUG) syslog(LOG_INFO,"read header3"); + + + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_out); + return EXIT_FAILURE; + } + + + + // copy the in header to the out header + memcpy (header_out, hout, 4096); + + // mark the output header buffer as filled + if (ipcbuf_mark_filled (hdu_out->header_block, 4096) < 0) + { + syslog(LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_out); + return EXIT_FAILURE; + } + + // record STATE info + sprintf(STATE,"LISTEN"); + syslog(LOG_INFO,"marked output header block as filled - now in LISTEN state"); + + + /* time to start up receiver. 
+ data are captured on iface:CAPTURE_PORT + */ + + + // put information in udpdb struct + udpdb.hdu = hdu_out; + udpdb.port = dPort; + udpdb.interface = strdup(iface); + udpdb.hdu_bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + // determine number of packets per block, must + if (udpdb.hdu_bufsz % UDP_DATA != 0) + { + syslog(LOG_ERR, "data block size for [%"PRIu64"] was not a multiple of the UDP_DATA size [%d]\n", udpdb.hdu_bufsz, UDP_DATA); + return EXIT_FAILURE; + } + udpdb.packets_per_buffer = udpdb.hdu_bufsz / UDP_DATA; + udpdb.bytes_to_acquire = 0; + udpdb.num_inputs = NSNAPS; + + // prepare the socket + syslog(LOG_INFO, "main: dsaX_udpdb_prepare()"); + if (dsaX_udpdb_prepare (&udpdb) < 0) + { + syslog(LOG_ERR, "could allocate required resources (prepare)"); + return EXIT_FAILURE; + } + + // reset the receiver + syslog(LOG_INFO, "main: dsaX_udpdb_reset_receiver()"); + dsaX_udpdb_reset_receiver (&udpdb); + + // open a block of the data block, ready for writing + if (dsaX_udpdb_open_buffer (&udpdb) < 0) + { + syslog (LOG_ERR, "start: dsaX_udpdb_open_buffer failed"); + return -1; + } + + /* START WHAT WAS in RECV THREAD */ + + // DEFINITIONS + + // lookup table for ant order + uint64_t ant_lookup[100], vv; + for (int i=0;i<100;i++) ant_lookup[i] = 0; + for (int i=0;ibuf; + size_t got = 0; // data received from a recv_from call + int errsv; // determine the sequence number boundaries for curr and next buffers + int64_t byte_offset = 0; // offset of current packet in bytes from start of block + uint64_t seq_byte = 0; // offset of current packet in bytes from start of obs + // for "saving" out of order packets near edges of blocks + unsigned int temp_idx = 0; + unsigned int temp_max = 1000; + char ** temp_buffers; //[temp_max][UDP_DATA]; + uint64_t * temp_seq_byte; + temp_buffers = (char **)malloc(sizeof(char *)*temp_max); + for (int i=0;ihave_packet = 0; + + // incredibly tight loop to try and get a packet + while (!udpdb.sock->have_packet) + { + + 
// receive 1 packet into the socket buffer + got = recvfrom ( udpdb.sock->fd, udpdb.sock->buf, UDP_PAYLOAD, 0, NULL, NULL ); + + if (got == UDP_PAYLOAD) + { + udpdb.sock->have_packet = 1; + } + else if (got == -1) + { + errsv = errno; + if (errsv == EAGAIN) + { + udpdb.n_sleeps++; + if (udpdb.capture_started) + timeouts++; + if (timeouts > timeout_max) + syslog(LOG_INFO, "timeouts[%"PRIu64"] > timeout_max[%"PRIu64"]\n",timeouts, timeout_max); + } + else + { + syslog (LOG_ERR, "receive_obs: recvfrom failed %s", strerror(errsv)); + return EXIT_FAILURE; + } + } + else // we received a packet of the WRONG size, ignore it + { + syslog (LOG_NOTICE, "receive_obs: received %lu bytes, expected %d", got, UDP_PAYLOAD); + } + } + timeouts = 0; + + // we have a valid packet within the timeout + if (udpdb.sock->have_packet) + { + + // decode packet header (64 bits) + // 35 bits seq_no (for first spectrum in packet); 13 bits ch_id (for first channel in packet); 16 bits ant ID (for first antenna in packet) + seq_no = 0; + seq_no |= (((uint64_t)(udpdb.sock->buf[4]) & 224) >> 5) & 7; + //seq_no &= 7; + seq_no |= (((uint64_t)(udpdb.sock->buf[3])) << 3) & 2040; + //seq_no &= 2047; + seq_no |= (((uint64_t)(udpdb.sock->buf[2])) << 11) & 522240; + //seq_no &= 524287; + seq_no |= (((uint64_t)(udpdb.sock->buf[1])) << 19) & 133693440; + //seq_no &= 134217727; + seq_no |= (((uint64_t)(udpdb.sock->buf[0])) << 27) & 34225520640; + //seq_no &= 34359738367; + /*seq_no = 0; + seq_no |= 224 >> 5; + seq_no |= 255 << 3; + seq_no |= 255 << 11; + seq_no |= 255 << 19;*/ + + /*ch_id = 0; + ch_id |= ((unsigned char) (udpdb.sock->buf[4]) & 31) << 8; + ch_id |= (unsigned char) (udpdb.sock->buf[5]);*/ + + ant_id = 0; + ant_id |= (unsigned char) (udpdb.sock->buf[6]) << 8; + ant_id |= (unsigned char) (udpdb.sock->buf[7]); + aid = ant_lookup[(int)(ant_id)]; + + if (UTC_START==0) UTC_START = seq_no + 10000; + + //act_seq_no = seq_no*NCHANG*NSNAPS/2 + ant_id*NCHANG/3 + (ch_id-CHOFF)/384; // actual seq no + 
act_seq_no = seq_no*NSNAPS/4 + aid; // actual seq no + block_seq_no = UTC_START*NSNAPS/4; // seq no corresponding to ant 0 and start of block + + // check for starting or stopping condition, using continue + //if (DEBUG) printf("%"PRIu64" %"PRIu64" %d\n",seq_no,act_seq_no,ch_id);//syslog(LOG_DEBUG, "seq_byte=%"PRIu64", num_inputs=%d, seq_no=%"PRIu64", ant_id =%"PRIu64", ch_id =%"PRIu64"",seq_byte,udpdb.num_inputs,seq_no,ant_id, ch_id); + //if (seq_no == UTC_START && UTC_START != 10000 && ant_id == 0) canWrite=1; + if (canWrite==0) { + if (seq_no >= UTC_START-5 && UTC_START != 10000) ct_snaps++; + if (ct_snaps >= 32) canWrite=1; + } + //if (seq_no > UTC_START && UTC_START != 10000) canWrite=1; + udpdb.last_seq = seq_no; + //syslog(LOG_INFO,"SEQ_NO_DBG %"PRIu64"",seq_no); + if (canWrite == 0) continue; + //if (seq_no == UTC_STOP) canWrite=0; + //if (udpdb.packets->received<100) syslog(LOG_INFO, "seq_byte=%"PRIu64", num_inputs=%d, seq_no=%"PRIu64", ant_id =%"PRIu64", ch_id =%"PRIu64"",seq_byte,udpdb.num_inputs,seq_no,ant_id, ch_id); + + // if first packet + if (!udpdb.capture_started) + { + //udpdb.block_start_byte = act_seq_no * UDP_DATA; + udpdb.block_start_byte = block_seq_no * UDP_DATA; + udpdb.block_end_byte = (udpdb.block_start_byte + udpdb.hdu_bufsz) - UDP_DATA; + udpdb.capture_started = 1; + + syslog (LOG_INFO, "receive_obs: START [%"PRIu64" - %"PRIu64"]", udpdb.block_start_byte, udpdb.block_end_byte); + } + + // if capture running + if (udpdb.capture_started) + { + seq_byte = (act_seq_no * UDP_DATA); + + udpdb.last_byte = seq_byte; + + // if packet arrived too late, ignore + if (seq_byte < udpdb.block_start_byte) + { + //syslog (LOG_INFO, "receive_obs: seq_byte < block_start_byte: %"PRIu64", %"PRIu64"", seq_no, ant_id); + udpdb.packets->dropped++; + udpdb.bytes->dropped += UDP_DATA; + } + else + { + // packet belongs in this block + if (seq_byte <= udpdb.block_end_byte) + { + byte_offset = seq_byte - udpdb.block_start_byte; + memcpy (udpdb.block + 
byte_offset, udpdb.sock->buf + UDP_HEADER, UDP_DATA); + udpdb.packets->received++; + udpdb.bytes->received += UDP_DATA; + udpdb.block_count++; + } + // packet belongs in subsequent block + else + { + //syslog (LOG_INFO, "receive_obs: received packet for subsequent buffer: temp_idx=%d, ant_id=%d, seq_no=%"PRIu64"",temp_idx,ant_id,seq_no); + + if (temp_idx < temp_max) + { + // save packet to temp buffer + memcpy (temp_buffers[temp_idx], udpdb.sock->buf + UDP_HEADER, UDP_DATA); + temp_seq_byte[temp_idx] = seq_byte; + temp_idx++; + } + else + { + udpdb.packets->dropped++; + udpdb.bytes->dropped += UDP_DATA; + } + } + } + } + + // now check for a full buffer or full temp queue + if ((udpdb.block_count >= udpdb.packets_per_buffer) || (temp_idx >= temp_max)) + { + syslog (LOG_INFO, "BLOCK COMPLETE seq_no=%"PRIu64", " + "ant_id=%"PRIu16", block_count=%"PRIu64", " + "temp_idx=%d\n", seq_no, ant_id, udpdb.block_count, + temp_idx); + + uint64_t dropped = udpdb.packets_per_buffer - udpdb.block_count; + if (dropped) + { + udpdb.packets->dropped += dropped; + udpdb.bytes->dropped += (dropped * UDP_DATA); + } + + if (dropped>1000) unhappies_ct++; + + // get a new buffer and write any temp packets saved + if (dsaX_udpdb_new_buffer (&udpdb) < 0) + { + syslog(LOG_ERR, "receive_obs: dsaX_udpdb_new_buffer failed"); + return EXIT_FAILURE; + } + + if (DEBUG) syslog(LOG_INFO, "block bytes: %"PRIu64" - %"PRIu64"\n", udpdb.block_start_byte, udpdb.block_end_byte); + + // include any futuristic packets we saved + for (i=0; i < temp_idx; i++) + { + seq_byte = temp_seq_byte[i]; + byte_offset = seq_byte - udpdb.block_start_byte; + if (byte_offset < udpdb.hdu_bufsz) + { + memcpy (udpdb.block + byte_offset, temp_buffers[i], UDP_DATA); + udpdb.block_count++; + udpdb.packets->received++; + udpdb.bytes->received += UDP_DATA; + } + else + { + udpdb.packets->dropped++; + udpdb.bytes->dropped += UDP_DATA; + } + } + temp_idx = 0; + } + } + + // packet has been inserted or saved by this point + 
udpdb.sock->have_packet = 0; + + // deal with unhappy receiver + if (unhappies_ct > unhappies) { + + syslog(LOG_INFO, "Skipping some blocks..."); + + close(udpdb.sock->fd); + + for (int i=0;idropped += udpdb.packets_per_buffer; + udpdb.bytes->dropped += (udpdb.packets_per_buffer * UDP_DATA); + + if (dsaX_udpdb_new_buffer (&udpdb) < 0) + { + syslog(LOG_ERR, "receive_obs: dsaX_udpdb_new_buffer failed"); + return EXIT_FAILURE; + } + + } + + sleep(sleeps); + + // prepare the socket + syslog(LOG_INFO, "re-preparing the socket dsaX_udpdb_prepare()"); + if (dsaX_udpdb_prepare (&udpdb) < 0) + { + syslog(LOG_ERR, "could allocate required resources (prepare)"); + return EXIT_FAILURE; + } + + unhappies_ct = 0; + + } + + } + + /* END WHAT WAS IN RECV THREAD */ + + + // close threads + syslog(LOG_INFO, "joining control_thread and stats_thread"); + quit_threads = 1; + void* result=0; + pthread_join (control_thread_id, &result); + pthread_join (stats_thread_id, &result); + + free(temp_seq_byte); + free(temp_buffers); + + dsaX_dbgpu_cleanup (hdu_out); + +} diff --git a/legacy/dsaX_capture.h b/legacy/dsaX_capture.h new file mode 100644 index 0000000..58355f8 --- /dev/null +++ b/legacy/dsaX_capture.h @@ -0,0 +1,131 @@ +/*************************************************************************** + * + * Copyright (C) 2009 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ****************************************************************************/ + +#ifndef __DSAX_UDPDB_THREAD_H +#define __DSAX_UDPDB_THREAD_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "futils.h" +#include "dada_hdu.h" +#include "dada_pwc_main.h" +#include "multilog.h" +#include "ipcio.h" +#include "ascii_header.h" +#include "dada_udp.h" + +#include "dsaX_def.h" + +#define DSAX_UDPDB_BUF_CLEAR = 0 +#define DSAX_UDPDB_BUF_FULL = 1 + +/* socket buffer for receiving udp data */ +typedef struct { + + 
int fd; // FD of the socket + size_t bufsz; // size of socket buffer + char * buf; // the socket buffer + int have_packet; // + size_t got; // amount of data received + +} dsaX_sock_t; + +dsaX_sock_t * dsax_Xnit_sock (); + +void dsaX_free_sock(dsaX_sock_t* b); + +/* Number of UDP packets to be recived for a called to buffer_function */ +#define NOTRECORDING 0 +#define RECORDING 1 + +typedef struct { + + dada_hdu_t * hdu; // DADA Header + Data Unit + multilog_t * log; // DADA logging interface + int verbose; // verbosity flag + + dsaX_sock_t * sock; // UDP socket for data capture + int port; // port to receive UDP data + int control_port; // port to receive control commands + char * interface; // IP Address to accept packets on + + // configuration for number of inputs + unsigned int num_inputs; // number of antennas / inputs + + // datablock management + uint64_t hdu_bufsz; + unsigned block_open; // if the current data block element is open + char * block; // pointer to current datablock buffer + uint64_t block_start_byte; // seq_byte of first byte for the block + uint64_t block_end_byte; // seq_byte of first byte of final packet of the block + uint64_t block_count; // number of packets in this block + char * tblock; // area of memory to write to + + // packets + unsigned capture_started; // flag for start of UDP data + uint64_t packets_per_buffer; // number of UDP packets per datablock buffer + + /* Packet and byte statistics */ + stats_t * packets; + stats_t * bytes; + + uint64_t bytes_to_acquire; + double mb_rcv_ps; + double mb_drp_ps; + double mb_free; + double mb_total; + uint64_t rcv_sleeps; + + uint64_t last_seq; // most recently received seq number + uint64_t last_byte; // most recently received byte + struct timeval timeout; + + uint64_t n_sleeps; + uint64_t ooo_packets; + + int recv_core; + +} udpdb_t; + + +int dsaX_udpdb_init_receiver (udpdb_t * ctx); +void dsaX_udpdb_reset_receiver (udpdb_t * ctx); +int dsaX_udpdb_destroy_receiver (udpdb_t * ctx); +int 
dsaX_udpdb_open_buffer (udpdb_t * ctx); +int dsaX_udpdb_close_buffer (udpdb_t * ctx, uint64_t bytes_written, unsigned eod); +int dsaX_udpdb_new_buffer (udpdb_t * ctx); +int dsaX_udpdb_increment (udpdb_t * ctx); + +// allocate required resources for data capture +int dsaX_udpdb_prepare (udpdb_t * ctx); + +// move to a state where data acquisition can begin +time_t dsaX_dpdb_start (udpdb_t * ctx, char * header); + +// main workhorse function to receive data for a single observation +void * dsaX_udpdb_receive_obs (void * ctx); + +// close the datablock signifying end of data +int udpdb_stop_function (udpdb_t* ctx); + +void usage(); +void signal_handler (int signalValue); +void stats_thread(void * arg); +void control_thread(void * arg); + +#endif diff --git a/legacy/dsaX_capture_manythread.c b/legacy/dsaX_capture_manythread.c new file mode 100644 index 0000000..b9f14bd --- /dev/null +++ b/legacy/dsaX_capture_manythread.c @@ -0,0 +1,1115 @@ +/* dsaX_capture.c: Code to capture packets over a socket and write to a dada buffer. 
+ +main: runs capture loop, and interfaces dada buffer +control_thread: deals with control commands + +*/ + +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture_manythread.h" +#include "dsaX_def.h" + +/* global variables */ +int dPort, cPort; +int quit_threads = 0; +char STATE[20]; +uint64_t UTC_START = 10000; +uint64_t UTC_STOP = 40000000000; +int MONITOR = 0; +char iP[100]; +int DEBUG = 0; +int HISTOGRAM[16]; +int writeBlock = 0; +const int nth = 4; +const int nwth = 2; +int cores[16] = {10,12,11,13,30,31,32,33}; +int write_cores[8] = {14,15,34,35}; +pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; +volatile int blockStatus[64]; +volatile int skipBlock = 0; +volatile int skipping = 0; +volatile int lWriteBlock = 0; +volatile int write_ct = 0; +volatile uint64_t last_seq = 0; +volatile int skipct = 0; +volatile uint64_t block_count = 0; +volatile uint64_t block_start_byte=0, block_end_byte=0; +volatile unsigned capture_started = 0; +volatile char * wblock; + +void dsaX_dbgpu_cleanup (dada_hdu_t * out); +int dada_bind_thread_to_core (int core); +void usage(); + +void dsaX_dbgpu_cleanup (dada_hdu_t * out) +{ + + if (dada_hdu_unlock_write (out) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_out"); + } + dada_hdu_destroy (out); + + + +} + +void usage() +{ + fprintf (stdout, + "dsaX_capture [options]\n" + " -c core bind process to CPU core [no default]\n" + " -j IP to listen on for data packets [no default]\n" + " -i IP to listen on for control commands [no default]\n" + " -p PORT for data\n" + " -q PORT for control\n" + " -f filename of template dada 
header [no default]\n" + " -o out_key [default CAPTURE_BLOCK_KEY]\n" + " -d send debug messages to syslog\n" + " -g chgroup [default 0]\n" + " -h print usage\n"); +} + +// open a socket +dsaX_sock_t * dsaX_make_sock (udpdb_t * ctx); +dsaX_sock_t * dsaX_make_sock (udpdb_t * ctx) +{ + + // prepare structure + syslog(LOG_INFO, "dsaX_make_sock(): preparing sock structure"); + dsaX_sock_t * b = (dsaX_sock_t *) malloc(sizeof(dsaX_sock_t)); + assert(b != NULL); + b->bufsz = sizeof(char) * UDP_PAYLOAD; + b->buf = (char *) malloc (b->bufsz); + assert(b->buf != NULL); + b->have_packet = 0; + b->fd = 0; + + // connect to socket + syslog(LOG_INFO, "dsaX_make_sock(): connecting to socket %s:%d", ctx->interface, dPort); + + // open socket + syslog(LOG_INFO, "prepare: creating udp socket on %s:%d", ctx->interface, dPort); + b->fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); + assert(b->fd>=0); + + // for multiple connections + int one = 1; + setsockopt(b->fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &one, sizeof(one)); + + struct sockaddr_in udp_sock; + bzero(&(udp_sock.sin_zero), 8); // clear the struct + udp_sock.sin_family = AF_INET; // internet/IP + udp_sock.sin_port = htons(dPort); // set the port number + udp_sock.sin_addr.s_addr = inet_addr(ctx->interface); // from a specific IP address + + if (bind(b->fd, (struct sockaddr *)&udp_sock, sizeof(udp_sock)) == -1) { + syslog(LOG_ERR, "prepare: failed to bind to socket"); + return -1; + } + + // set the socket size to 64 MB + int sock_buf_size = 64*1024*1024; + syslog(LOG_INFO, "prepare: setting buffer size to %d", sock_buf_size); + dada_udp_sock_set_buffer_size (ctx->log, b->fd, ctx->verbose, sock_buf_size); + + // set the socket to non-blocking + syslog(LOG_INFO, "prepare: setting non_block"); + sock_nonblock(b->fd); + + // clear any packets buffered by the kernel + syslog(LOG_INFO, "prepare: clearing packets at socket"); + size_t cleared = dada_sock_clear_buffered_packets(b->fd, UDP_PAYLOAD); + + // clear blockStatus + for 
(int i=0;i<64;i++) blockStatus[i] = 0; + + return b; +} + + + +// close a socket +void dsaX_free_sock(dsaX_sock_t* b); +void dsaX_free_sock(dsaX_sock_t* b) +{ + b->fd = 0; + b->bufsz = 0; + b->have_packet =0; + if (b->buf) + free (b->buf); + b->buf = 0; +} + +/* + * open a data block buffer ready for direct access + */ +int dsaX_udpdb_open_buffer (dsaX_write_t * ctx); +int dsaX_udpdb_open_buffer (dsaX_write_t * ctx) +{ + + if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_open_buffer()"); + + if (ctx->block_open) + { + syslog (LOG_ERR, "open_buffer: buffer already opened"); + return -1; + } + + if (DEBUG) syslog (LOG_DEBUG, "open_buffer: ipcio_open_block_write"); + + uint64_t block_id = 0; + + wblock = ipcio_open_block_write (ctx->hdu->data_block, &block_id); + if (!wblock) + { + syslog (LOG_ERR, "open_buffer: ipcio_open_block_write failed"); + return -1; + } + + ctx->block_open = 1; + + return 0; +} + +/* + * close a data buffer, assuming a full block has been written + */ +int dsaX_udpdb_close_buffer (dsaX_write_t * ctx, uint64_t bytes_written, unsigned eod); +int dsaX_udpdb_close_buffer (dsaX_write_t * ctx, uint64_t bytes_written, unsigned eod) +{ + + if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_close_buffer(%"PRIu64", %d)", bytes_written, eod); + + if (!ctx->block_open) + { + syslog (LOG_ERR, "close_buffer: buffer already closed"); + return -1; + } + + // log any buffers that are not full, except for the 1 byte "EOD" buffer + if ((bytes_written != 1) && (bytes_written != ctx->hdu_bufsz)) + syslog ((eod ? 
LOG_INFO : LOG_WARNING), "close_buffer: " + "bytes_written[%"PRIu64"] != hdu_bufsz[%"PRIu64"]", + bytes_written, ctx->hdu_bufsz); + + if (eod) + { + if (ipcio_update_block_write (ctx->hdu->data_block, bytes_written) < 0) + { + syslog (LOG_ERR, "close_buffer: ipcio_update_block_write failed"); + return -1; + } + } + else + { + if (ipcio_close_block_write (ctx->hdu->data_block, bytes_written) < 0) + { + syslog (LOG_ERR, "close_buffer: ipcio_close_block_write failed"); + return -1; + } + } + + wblock = 0; + ctx->block_open = 0; + + return 0; +} + +/* + * move to the next ring buffer element. return pointer to base address of new buffer + */ +int dsaX_udpdb_new_buffer (dsaX_write_t * ctx); +int dsaX_udpdb_new_buffer (dsaX_write_t * ctx) +{ + + if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_new_buffer()"); + + if (dsaX_udpdb_close_buffer (ctx, ctx->hdu_bufsz, 0) < 0) + { + syslog (LOG_ERR, "new_buffer: dsaX_udpdb_close_buffer failed"); + return -1; + } + + if (dsaX_udpdb_open_buffer (ctx) < 0) + { + syslog (LOG_ERR, "new_buffer: dsaX_udpdb_open_buffer failed"); + return -1; + } + + return 0; + +} + +// increment counters when block is full +void dsaX_udpdb_increment (udpdb_t * ctx); +void dsaX_udpdb_increment (udpdb_t * ctx) +{ + + // increment buffer byte markers + writeBlock++; + block_start_byte = block_end_byte + UDP_DATA; + block_end_byte = block_start_byte + ( ctx->packets_per_buffer - 1) * UDP_DATA; + block_count = 0; + +} + + + +/* --------- THREADS -------- */ + +// STATS THREAD + +/* + * Thread to print simple capture statistics + */ +void stats_thread(void * arg) { + + dsaX_stats_t * ctx = (dsaX_stats_t *) arg; + uint64_t b_rcv_total = 0; + uint64_t b_rcv_1sec = 0; + uint64_t b_rcv_curr = 0; + + uint64_t b_drp_total = 0; + uint64_t b_drp_1sec = 0; + uint64_t b_drp_curr = 0; + + uint64_t s_rcv_total = 0; + uint64_t s_rcv_1sec = 0; + uint64_t s_rcv_curr = 0; + + uint64_t ooo_pkts = 0; + float gb_rcv_ps = 0; + float mb_rcv_ps = 0; + float mb_drp_ps = 0; + + 
syslog(LOG_INFO,"starting stats thread..."); + sleep(2); + syslog(LOG_INFO,"started stats thread..."); + + while (!quit_threads) + { + + /* get a snapshot of the data as quickly as possible */ + b_rcv_curr = ctx->bytes->received; + b_drp_curr = ctx->bytes->dropped; + + /* calc the values for the last second */ + b_rcv_1sec = b_rcv_curr - b_rcv_total; + b_drp_1sec = b_drp_curr - b_drp_total; + + /* update the totals */ + b_rcv_total = b_rcv_curr; + b_drp_total = b_drp_curr; + + mb_rcv_ps = (double) b_rcv_1sec / 1000000; + mb_drp_ps = (double) b_drp_1sec / 1000000; + gb_rcv_ps = b_rcv_1sec * 8; + gb_rcv_ps /= 1000000000; + + /* determine how much memory is free in the receivers */ + syslog (LOG_NOTICE,"CAPSTATS %6.3f [Gb/s], D %4.1f [MB/s], D %"PRIu64" pkts, %"PRIu64" skipped %d", gb_rcv_ps, mb_drp_ps, ctx->packets->dropped, last_seq, skipct); + + sleep(1); + } + +} + +// CONTROL THREAD + +void control_thread (void * arg) { + + syslog(LOG_INFO, "control_thread: starting"); + + // port on which to listen for control commands + int port = cPort; + char sport[10]; + sprintf(sport,"%d",port); + + // buffer for incoming command strings, and setup of socket + int bufsize = 1024; + char* buffer = (char *) malloc (sizeof(char) * bufsize); + memset(buffer, '\0', bufsize); + const char* whitespace = " "; + char * command = 0; + char * args = 0; + + struct addrinfo hints; + struct addrinfo* res=0; + memset(&hints,0,sizeof(hints)); + struct sockaddr_storage src_addr; + socklen_t src_addr_len=sizeof(src_addr); + hints.ai_family=AF_INET; + hints.ai_socktype=SOCK_DGRAM; + getaddrinfo(iP,sport,&hints,&res); + int fd; + ssize_t ct; + char tmpstr; + char cmpstr = 'p'; + char *endptr; + uint64_t tmps; + char * token; + + syslog(LOG_INFO, "control_thread: created socket on port %d", port); + + while (!quit_threads) { + + fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol); + bind(fd,res->ai_addr,res->ai_addrlen); + memset(buffer,'\0',sizeof(buffer)); + syslog(LOG_INFO, 
"control_thread: waiting for packet"); + ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len); + + syslog(LOG_INFO, "control_thread: received buffer string %s",buffer); + + // INTERPRET BUFFER STRING + // receive either UTC_START, UTC_STOP, MONITOR + + // interpret buffer string + char * rest = buffer; + char *cmd, *val; + cmd = strtok_r(rest, "-", &rest); + val = strtok_r(rest, "-", &rest); + syslog(LOG_INFO, "control_thread: split into COMMAND %s, VALUE %s",cmd,val); + + if (strcmp(cmd,"UTC_START")==0) + UTC_START = strtoull(val,&endptr,0); + + if (strcmp(cmd,"UTC_STOP")==0) + UTC_STOP = strtoull(val,&endptr,0); + + close(fd); + + } + + free (buffer); + + syslog(LOG_INFO, "control_thread: exiting"); + + /* return 0 */ + int thread_result = 0; + pthread_exit((void *) &thread_result); + +} + + +/* + * Thread to capture data + */ +int recv_thread(void * arg) { + + udpdb_t * udpdb = (udpdb_t *) arg; + int thread_id = udpdb->thread_id; + + // set affinity + const pthread_t pid = pthread_self(); + int core_id; + if (dPort==4011) + core_id = cores[thread_id]; + else + core_id = cores[thread_id+nth]; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id); + const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id); + if (CPU_ISSET(core_id, &cpuset)) + syslog(LOG_INFO,"thread %d: successfully set thread",core_id); + + // set up socket + dsaX_sock_t * sock = dsaX_make_sock(udpdb); + + // lookup table for ant order + uint64_t ant_lookup[100], vv; + for (int i=0;i<100;i++) ant_lookup[i] = 0; + for (int i=0;ibuf; + size_t got = 0; // data received from a recv_from call + int errsv; // determine the sequence number boundaries for curr and next buffers + int64_t byte_offset = 
0; // offset of current packet in bytes from start of block + uint64_t seq_byte = 0; // offset of current packet in bytes from start of obs + // for "saving" out of order packets near edges of blocks + unsigned int temp_idx = 0; + unsigned int temp_max = 500; + char ** temp_buffers; + uint64_t * temp_seq_byte; + temp_buffers = (char **)malloc(sizeof(char *)*temp_max); + for (int i=0;ihave_packet = 0; + + // incredibly tight loop to try and get a packet + while (!sock->have_packet) + { + + // receive 1 packet into the socket buffer + got = recvfrom ( sock->fd, sock->buf, UDP_PAYLOAD, 0, NULL, NULL ); + + if (got == UDP_PAYLOAD) + { + sock->have_packet = 1; + } + else if (got == -1) + { + errsv = errno; + if (errsv == EAGAIN) + { + if (capture_started) + timeouts++; + //if (timeouts > timeout_max) + //syslog(LOG_INFO, "timeouts[%"PRIu64"] > timeout_max[%"PRIu64"]\n",timeouts, timeout_max); + } + else + { + //syslog (LOG_ERR, "receive_obs: recvfrom failed %s", strerror(errsv)); + return EXIT_FAILURE; + } + } + else // we received a packet of the WRONG size, ignore it + { + syslog (LOG_NOTICE, "receive_obs: received %lu bytes, expected %d", got, UDP_PAYLOAD); + } + } + timeouts = 0; + + // we have a valid packet within the timeout + if (sock->have_packet) + { + + // decode packet header (64 bits) + // 35 bits seq_no (for first spectrum in packet); 13 bits ch_id (for first channel in packet); 16 bits ant ID (for first antenna in packet) + seq_no = 0; + seq_no |= (((uint64_t)(sock->buf[4]) & 224) >> 5) & 7; + seq_no |= (((uint64_t)(sock->buf[3])) << 3) & 2040; + seq_no |= (((uint64_t)(sock->buf[2])) << 11) & 522240; + seq_no |= (((uint64_t)(sock->buf[1])) << 19) & 133693440; + seq_no |= (((uint64_t)(sock->buf[0])) << 27) & 34225520640; + ant_id = 0; + ant_id |= (unsigned char) (sock->buf[6]) << 8; + ant_id |= (unsigned char) (sock->buf[7]); + aid = ant_lookup[(int)(ant_id)]; + //aid = ant_id/3; + + if (UTC_START==0) UTC_START = seq_no+30000; + + act_seq_no = 
seq_no*NSNAPS/4 + aid; // actual seq no + block_seq_no = UTC_START*NSNAPS/4; // seq no corresponding to ant 0 and start of block + + // set shared last_seq + pthread_mutex_lock(&mutex); + last_seq = seq_no; + //syslog(LOG_INFO,"last_seq %"PRIu64"",last_seq); + pthread_mutex_unlock(&mutex); + + // check for starting or stopping condition, using continue + if (canWrite==0) { + if (seq_no >= UTC_START-50 && UTC_START != 10000) { + canWrite=1; + } + } + if (canWrite == 0) continue; + + // threadsafe start of capture + pthread_mutex_lock(&mutex); + if (!(capture_started)) + { + block_start_byte = block_seq_no * UDP_DATA; + block_end_byte = (block_start_byte + udpdb->hdu_bufsz) - UDP_DATA; + capture_started = 1; + + syslog (LOG_INFO, "receive_obs: START [%"PRIu64" - %"PRIu64"]", block_start_byte, block_end_byte); + } + pthread_mutex_unlock(&mutex); + + // if capture running + if (capture_started) + { + seq_byte = (act_seq_no * UDP_DATA); + tpack++; + + // packet belongs in this block + if ((seq_byte <= block_end_byte) && (seq_byte >= block_start_byte)) + { + byte_offset = seq_byte - (block_start_byte); + mod_WB = writeBlock % 64; + memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, sock->buf + UDP_HEADER, UDP_DATA); + pthread_mutex_lock(&mutex); + block_count++; + //syslog(LOG_INFO,"block count %"PRIu64"",block_count); + pthread_mutex_unlock(&mutex); + + } + // packet belongs in subsequent block + else if (seq_byte > block_end_byte) + { + + if (temp_idx < temp_max) + { + // save packet to temp buffer + memcpy (temp_buffers[temp_idx], sock->buf + UDP_HEADER, UDP_DATA); + temp_seq_byte[temp_idx] = seq_byte; + temp_idx++; + } + } + // packet is too late + /*else + { + if (ctAnts<100) { + syslog (LOG_INFO, "receive_obs: TOO LATE %"PRIu64" %"PRIu64"", seq_no, ant_id); + ctAnts++; + } + }*/ + } + + // threadsafe end of block + pthread_mutex_lock(&mutex); + if ((block_count >= udpdb->packets_per_buffer) || (temp_idx >= temp_max)) + { + syslog (LOG_INFO, "BLOCK 
COMPLETE thread_id=%d, seq_no=%"PRIu64", " + "ant_id=%"PRIu16", block_count=%"PRIu64", " + "temp_idx=%d, writeBlock=%d", thread_id, seq_no, ant_id, block_count, + temp_idx,writeBlock); + + // write block + // check whether doWrite has been released. If not, skip this block + if (blockStatus[writeBlock % 64] > 0) + blockStatus[writeBlock % 64] += 1; + else + blockStatus[writeBlock % 64] = 1; + + uint64_t dropped = udpdb->packets_per_buffer - (block_count); + udpdb->packets->received += (block_count); + udpdb->bytes->received += (block_count) * UDP_DATA; + if (dropped) + { + udpdb->packets->dropped += dropped; + udpdb->bytes->dropped += (dropped * UDP_DATA); + } + + // increment counters + dsaX_udpdb_increment(udpdb); + ctAnts = 0; + + // write temp queue for this thread + //syslog(LOG_INFO,"thread %d: packets in this block %"PRIu64", temp_idx %d",thread_id,tpack,temp_idx); + tpack = 0; + + for (i=0; i < temp_idx; i++) + { + seq_byte = temp_seq_byte[i]; + byte_offset = seq_byte - (block_start_byte); + if (byte_offset < udpdb->hdu_bufsz && byte_offset >= 0) + { + mod_WB = writeBlock % 64; + memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, temp_buffers[i], UDP_DATA); + //pthread_mutex_lock(&mutex); + block_count++; + //pthread_mutex_unlock(&mutex); + } + } + temp_idx = 0; + + } + pthread_mutex_unlock(&mutex); + + // at this stage, can try and write temp queue safely for other threads + if (temp_seq_byte[0] >= block_start_byte && temp_seq_byte[0] <= block_end_byte && temp_idx > 0) + { + //syslog(LOG_INFO,"thread %d: packets in this block %"PRIu64", temp_idx %d",thread_id,tpack,temp_idx); + tpack = 0; + + for (i=0; i < temp_idx; i++) + { + seq_byte = temp_seq_byte[i]; + byte_offset = seq_byte - (block_start_byte); + if (byte_offset < udpdb->hdu_bufsz && byte_offset >= 0) + { + mod_WB = writeBlock % 64; + memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, temp_buffers[i], UDP_DATA); + pthread_mutex_lock(&mutex); + block_count++; + 
pthread_mutex_unlock(&mutex); + } + } + temp_idx = 0; + + } + + } + + // packet has been inserted or saved by this point + sock->have_packet = 0; + + } + + dsaX_free_sock(sock); + free(temp_buffers); + free(temp_seq_byte); + +} + +/* + * Thread to write data + */ +void write_thread(void * arg) { + + dsaX_write_t * udpdb = (dsaX_write_t *) arg; + int thread_id = udpdb->thread_id; + + // set affinity + const pthread_t pid = pthread_self(); + int core_id; + if (dPort==4011) + core_id = write_cores[thread_id]; + else + core_id = write_cores[thread_id+nwth]; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id); + const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id); + if (CPU_ISSET(core_id, &cpuset)) + syslog(LOG_INFO,"thread %d: successfully set thread",core_id); + + int mod_WB = 0; + int a; + + while (!quit_threads) + { + + mod_WB = lWriteBlock % 64; + + while (blockStatus[mod_WB]==0) { + a=1; + } + + // assume everything is set up + // wblock is assigned, write_ct=0 + + memcpy(wblock + thread_id*udpdb->hdu_bufsz/nwth, udpdb->tblock + mod_WB*udpdb->hdu_bufsz + thread_id*udpdb->hdu_bufsz/nwth, udpdb->hdu_bufsz/nwth); + + pthread_mutex_lock(&mutex); + write_ct++; + pthread_mutex_unlock(&mutex); + + //syslog(LOG_INFO,"write thread %d: successfully memcpied",thread_id); + + // now wait until thread 0 has finished getting a new block before moving on + if (thread_id>0) { + while (write_ct!=0) a=1; + } + else { + + // wait for all sub-blocks to be written + while (write_ct= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // OPEN CONNECTION TO DADA DB FOR WRITING + + if (DEBUG) 
syslog(LOG_DEBUG,"Creating HDU"); + + hdu_out = dada_hdu_create (0); + if (DEBUG) syslog(DEBUG,"Created hdu"); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog(LOG_ERR,"could not connect to output dada buffer"); + return EXIT_FAILURE; + } + if (DEBUG) syslog(LOG_DEBUG,"Connected HDU"); + if (dada_hdu_lock_write(hdu_out) < 0) { + dsaX_dbgpu_cleanup (hdu_out); + syslog(LOG_ERR,"could not lock to output dada buffer"); + return EXIT_FAILURE; + } + + syslog(LOG_INFO,"opened connection to output DB"); + + // DEAL WITH DADA HEADER + char *hout; + hout = (char *)malloc(sizeof(char)*4096); + if (DEBUG) syslog(DEBUG,"read header2"); + + if (fileread (dada_fnam, hout, 4096) < 0) + { + free (hout); + syslog (LOG_ERR, "could not read ASCII header from %s", dada_fnam); + return (EXIT_FAILURE); + } + + + if (DEBUG) syslog(DEBUG,"read header3"); + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_out); + return EXIT_FAILURE; + } + + // copy the in header to the out header + memcpy (header_out, hout, 4096); + + // mark the output header buffer as filled + if (ipcbuf_mark_filled (hdu_out->header_block, 4096) < 0) + { + syslog(LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_out); + return EXIT_FAILURE; + } + + // record STATE info + sprintf(STATE,"LISTEN"); + syslog(LOG_INFO,"marked output header block as filled - now in LISTEN state"); + + + /* time to start up receiver. 
+ data are captured on iface:CAPTURE_PORT + */ + + // make recv, write, and stats structs + udpdb_t udpdb[nth]; + dsaX_stats_t stats; + dsaX_write_t writey[nwth]; + + // shared variables and memory + uint64_t bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + char * tblock = (char *)malloc(sizeof(char)*bufsz*64); + stats_t * packets = init_stats_t(); + stats_t * bytes = init_stats_t(); + reset_stats_t(packets); + reset_stats_t(bytes); + + // initialise stats struct + stats.packets = packets; + stats.bytes = bytes; + + // initialise writey struct and open buffer + for (int i=0;idata_block); + writey[i].block_open = 0; + writey[i].tblock = tblock; + writey[i].thread_id = i; + } + dsaX_udpdb_open_buffer (&writey[0]); + + // initialise all udpdb structs + for (int i=0;i +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture_manythread.h" +#include "dsaX_def.h" + +/* global variables */ +int quit_threads = 0; +char STATE[20]; +uint64_t UTC_START = 10000; +uint64_t UTC_STOP = 40000000000; +int MONITOR = 0; +char iP[100]; +int DEBUG = 0; +int HISTOGRAM[16]; +int writeBlock = 0; +const int nth = 8; +const int nwth = 4; +int cores[8] = {30,31,32,33,34,35,36,37}; +int write_cores[4] = {17,18,19,39}; +pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; +volatile int doWrite = 0; +volatile int skipBlock = 0; +volatile int skipping = 0; +volatile int lWriteBlock = 0; +volatile int write_ct = 0; +volatile uint64_t last_seq = 0; +volatile int skipct = 0; +volatile uint64_t block_count = 0; +volatile uint64_t block_start_byte=0, block_end_byte=0; +volatile unsigned capture_started = 0; +volatile char * wblock; + 
+void dsaX_dbgpu_cleanup (dada_hdu_t * out); +int dada_bind_thread_to_core (int core); +void usage(); + +void dsaX_dbgpu_cleanup (dada_hdu_t * out) +{ + + if (dada_hdu_unlock_write (out) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_out"); + } + dada_hdu_destroy (out); + + + +} + +void usage() +{ + fprintf (stdout, + "dsaX_capture [options]\n" + " -c core bind process to CPU core [no default]\n" + " -j IP to listen on for data packets [no default]\n" + " -i IP to listen on for control commands [no default]\n" + " -f filename of template dada header [no default]\n" + " -o out_key [default CAPTURE_BLOCK_KEY]\n" + " -d send debug messages to syslog\n" + " -g chgroup [default 0]\n" + " -h print usage\n"); +} + +// open a socket +dsaX_sock_t * dsaX_make_sock (udpdb_t * ctx); +dsaX_sock_t * dsaX_make_sock (udpdb_t * ctx) +{ + + // prepare structure + syslog(LOG_INFO, "dsaX_make_sock(): preparing sock structure"); + dsaX_sock_t * b = (dsaX_sock_t *) malloc(sizeof(dsaX_sock_t)); + assert(b != NULL); + b->bufsz = sizeof(char) * UDP_PAYLOAD; + b->buf = (char *) malloc (b->bufsz); + assert(b->buf != NULL); + b->have_packet = 0; + b->fd = 0; + + // connect to socket + syslog(LOG_INFO, "dsaX_make_sock(): connecting to socket %s:%d", ctx->interface, ctx->port); + + // open socket + syslog(LOG_INFO, "prepare: creating udp socket on %s:%d", ctx->interface, ctx->port); + b->fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); + assert(b->fd>=0); + + // for multiple connections + int one = 1; + setsockopt(b->fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &one, sizeof(one)); + + struct sockaddr_in udp_sock; + bzero(&(udp_sock.sin_zero), 8); // clear the struct + udp_sock.sin_family = AF_INET; // internet/IP + udp_sock.sin_port = htons(ctx->port); // set the port number + udp_sock.sin_addr.s_addr = inet_addr(ctx->interface); // from a specific IP address + + if (bind(b->fd, (struct sockaddr *)&udp_sock, sizeof(udp_sock)) == -1) { + syslog(LOG_ERR, "prepare: failed to bind to 
socket"); + return -1; + } + + // set the socket size to 256 MB + int sock_buf_size = 256*1024*1024; + syslog(LOG_INFO, "prepare: setting buffer size to %d", sock_buf_size); + dada_udp_sock_set_buffer_size (ctx->log, b->fd, ctx->verbose, sock_buf_size); + + // set the socket to non-blocking + syslog(LOG_INFO, "prepare: setting non_block"); + sock_nonblock(b->fd); + + // clear any packets buffered by the kernel + syslog(LOG_INFO, "prepare: clearing packets at socket"); + size_t cleared = dada_sock_clear_buffered_packets(b->fd, UDP_PAYLOAD); + + return b; +} + + + +// close a socket +void dsaX_free_sock(dsaX_sock_t* b); +void dsaX_free_sock(dsaX_sock_t* b) +{ + b->fd = 0; + b->bufsz = 0; + b->have_packet =0; + if (b->buf) + free (b->buf); + b->buf = 0; +} + +/* + * open a data block buffer ready for direct access + */ +int dsaX_udpdb_open_buffer (dsaX_write_t * ctx); +int dsaX_udpdb_open_buffer (dsaX_write_t * ctx) +{ + + if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_open_buffer()"); + + if (ctx->block_open) + { + syslog (LOG_ERR, "open_buffer: buffer already opened"); + return -1; + } + + if (DEBUG) syslog (LOG_DEBUG, "open_buffer: ipcio_open_block_write"); + + uint64_t block_id = 0; + + wblock = ipcio_open_block_write (ctx->hdu->data_block, &block_id); + if (!wblock) + { + syslog (LOG_ERR, "open_buffer: ipcio_open_block_write failed"); + return -1; + } + + ctx->block_open = 1; + + return 0; +} + +/* + * close a data buffer, assuming a full block has been written + */ +int dsaX_udpdb_close_buffer (dsaX_write_t * ctx, uint64_t bytes_written, unsigned eod); +int dsaX_udpdb_close_buffer (dsaX_write_t * ctx, uint64_t bytes_written, unsigned eod) +{ + + if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_close_buffer(%"PRIu64", %d)", bytes_written, eod); + + if (!ctx->block_open) + { + syslog (LOG_ERR, "close_buffer: buffer already closed"); + return -1; + } + + // log any buffers that are not full, except for the 1 byte "EOD" buffer + if ((bytes_written != 1) && (bytes_written != 
ctx->hdu_bufsz)) + syslog ((eod ? LOG_INFO : LOG_WARNING), "close_buffer: " + "bytes_written[%"PRIu64"] != hdu_bufsz[%"PRIu64"]", + bytes_written, ctx->hdu_bufsz); + + if (eod) + { + if (ipcio_update_block_write (ctx->hdu->data_block, bytes_written) < 0) + { + syslog (LOG_ERR, "close_buffer: ipcio_update_block_write failed"); + return -1; + } + } + else + { + if (ipcio_close_block_write (ctx->hdu->data_block, bytes_written) < 0) + { + syslog (LOG_ERR, "close_buffer: ipcio_close_block_write failed"); + return -1; + } + } + + wblock = 0; + ctx->block_open = 0; + + return 0; +} + +/* + * move to the next ring buffer element. return pointer to base address of new buffer + */ +int dsaX_udpdb_new_buffer (dsaX_write_t * ctx); +int dsaX_udpdb_new_buffer (dsaX_write_t * ctx) +{ + + if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_new_buffer()"); + + if (dsaX_udpdb_close_buffer (ctx, ctx->hdu_bufsz, 0) < 0) + { + syslog (LOG_ERR, "new_buffer: dsaX_udpdb_close_buffer failed"); + return -1; + } + + if (dsaX_udpdb_open_buffer (ctx) < 0) + { + syslog (LOG_ERR, "new_buffer: dsaX_udpdb_open_buffer failed"); + return -1; + } + + return 0; + +} + +// increment counters when block is full +void dsaX_udpdb_increment (udpdb_t * ctx); +void dsaX_udpdb_increment (udpdb_t * ctx) +{ + + // increment buffer byte markers + writeBlock++; + block_start_byte = block_end_byte + UDP_DATA; + block_end_byte = block_start_byte + ( ctx->packets_per_buffer - 1) * UDP_DATA; + block_count = 0; + +} + + + +/* --------- THREADS -------- */ + +// STATS THREAD + +/* + * Thread to print simple capture statistics + */ +void stats_thread(void * arg) { + + dsaX_stats_t * ctx = (dsaX_stats_t *) arg; + uint64_t b_rcv_total = 0; + uint64_t b_rcv_1sec = 0; + uint64_t b_rcv_curr = 0; + + uint64_t b_drp_total = 0; + uint64_t b_drp_1sec = 0; + uint64_t b_drp_curr = 0; + + uint64_t s_rcv_total = 0; + uint64_t s_rcv_1sec = 0; + uint64_t s_rcv_curr = 0; + + uint64_t ooo_pkts = 0; + float gb_rcv_ps = 0; + float mb_rcv_ps = 0; + 
float mb_drp_ps = 0; + + syslog(LOG_INFO,"starting stats thread..."); + sleep(2); + syslog(LOG_INFO,"started stats thread..."); + + while (!quit_threads) + { + + /* get a snapshot of the data as quickly as possible */ + b_rcv_curr = ctx->bytes->received; + b_drp_curr = ctx->bytes->dropped; + + /* calc the values for the last second */ + b_rcv_1sec = b_rcv_curr - b_rcv_total; + b_drp_1sec = b_drp_curr - b_drp_total; + + /* update the totals */ + b_rcv_total = b_rcv_curr; + b_drp_total = b_drp_curr; + + mb_rcv_ps = (double) b_rcv_1sec / 1000000; + mb_drp_ps = (double) b_drp_1sec / 1000000; + gb_rcv_ps = b_rcv_1sec * 8; + gb_rcv_ps /= 1000000000; + + /* determine how much memory is free in the receivers */ + syslog (LOG_NOTICE,"CAPSTATS %6.3f [Gb/s], D %4.1f [MB/s], D %"PRIu64" pkts, %"PRIu64" skipped %d", gb_rcv_ps, mb_drp_ps, ctx->packets->dropped, last_seq, skipct); + + sleep(1); + } + +} + +// CONTROL THREAD + +void control_thread (void * arg) { + + syslog(LOG_INFO, "control_thread: starting"); + + // port on which to listen for control commands + int port = CAPTURE_CONTROL_PORT; + char sport[10]; + sprintf(sport,"%d",port); + + // buffer for incoming command strings, and setup of socket + int bufsize = 1024; + char* buffer = (char *) malloc (sizeof(char) * bufsize); + memset(buffer, '\0', bufsize); + const char* whitespace = " "; + char * command = 0; + char * args = 0; + + struct addrinfo hints; + struct addrinfo* res=0; + memset(&hints,0,sizeof(hints)); + struct sockaddr_storage src_addr; + socklen_t src_addr_len=sizeof(src_addr); + hints.ai_family=AF_INET; + hints.ai_socktype=SOCK_DGRAM; + getaddrinfo(iP,sport,&hints,&res); + int fd; + ssize_t ct; + char tmpstr; + char cmpstr = 'p'; + char *endptr; + uint64_t tmps; + char * token; + + syslog(LOG_INFO, "control_thread: created socket on port %d", port); + + while (!quit_threads) { + + fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol); + bind(fd,res->ai_addr,res->ai_addrlen); + 
memset(buffer,'\0',sizeof(buffer)); + syslog(LOG_INFO, "control_thread: waiting for packet"); + ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len); + + syslog(LOG_INFO, "control_thread: received buffer string %s",buffer); + + // INTERPRET BUFFER STRING + // receive either UTC_START, UTC_STOP, MONITOR + + // interpret buffer string + char * rest = buffer; + char *cmd, *val; + cmd = strtok_r(rest, "-", &rest); + val = strtok_r(rest, "-", &rest); + syslog(LOG_INFO, "control_thread: split into COMMAND %s, VALUE %s",cmd,val); + + if (strcmp(cmd,"UTC_START")==0) + UTC_START = strtoull(val,&endptr,0); + + if (strcmp(cmd,"UTC_STOP")==0) + UTC_STOP = strtoull(val,&endptr,0); + + close(fd); + + } + + free (buffer); + + syslog(LOG_INFO, "control_thread: exiting"); + + /* return 0 */ + int thread_result = 0; + pthread_exit((void *) &thread_result); + +} + + +/* + * Thread to capture data + */ +void recv_thread(void * arg) { + + udpdb_t * udpdb = (udpdb_t *) arg; + int thread_id = udpdb->thread_id; + + // set affinity + const pthread_t pid = pthread_self(); + const int core_id = cores[thread_id]; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id); + const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id); + if (CPU_ISSET(core_id, &cpuset)) + syslog(LOG_INFO,"thread %d: successfully set thread",core_id); + + // set up socket + dsaX_sock_t * sock = dsaX_make_sock(udpdb); + + // DEFINITIONS + uint64_t tpack = 0; + uint64_t act_seq_no = 0; + uint64_t block_seq_no = 0; + uint64_t seq_no = 0; + uint64_t ant_id = 0; + unsigned char * b = (unsigned char *) sock->buf; + size_t got = 0; // data received from a recv_from call + int errsv; // determine the sequence number 
boundaries for curr and next buffers + int64_t byte_offset = 0; // offset of current packet in bytes from start of block + uint64_t seq_byte = 0; // offset of current packet in bytes from start of obs + // for "saving" out of order packets near edges of blocks + unsigned int temp_idx = 0; + unsigned int temp_max = 500; + char ** temp_buffers; + uint64_t * temp_seq_byte; + temp_buffers = (char **)malloc(sizeof(char *)*temp_max); + for (int i=0;ihave_packet = 0; + + // incredibly tight loop to try and get a packet + while (!sock->have_packet) + { + + // receive 1 packet into the socket buffer + got = recvfrom ( sock->fd, sock->buf, UDP_PAYLOAD, 0, NULL, NULL ); + + if (got == UDP_PAYLOAD) + { + sock->have_packet = 1; + } + else if (got == -1) + { + errsv = errno; + if (errsv == EAGAIN) + { + if (capture_started) + timeouts++; + //if (timeouts > timeout_max) + //syslog(LOG_INFO, "timeouts[%"PRIu64"] > timeout_max[%"PRIu64"]\n",timeouts, timeout_max); + } + else + { + //syslog (LOG_ERR, "receive_obs: recvfrom failed %s", strerror(errsv)); + return EXIT_FAILURE; + } + } + else // we received a packet of the WRONG size, ignore it + { + syslog (LOG_NOTICE, "receive_obs: received %d bytes, expected %d", got, UDP_PAYLOAD); + } + } + timeouts = 0; + + // we have a valid packet within the timeout + if (sock->have_packet) + { + + // decode packet header (64 bits) + // 35 bits seq_no (for first spectrum in packet); 13 bits ch_id (for first channel in packet); 16 bits ant ID (for first antenna in packet) + seq_no = 0; + seq_no |= (((uint64_t)(sock->buf[4]) & 224) >> 5) & 7; + seq_no |= (((uint64_t)(sock->buf[3])) << 3) & 2040; + seq_no |= (((uint64_t)(sock->buf[2])) << 11) & 522240; + seq_no |= (((uint64_t)(sock->buf[1])) << 19) & 133693440; + seq_no |= (((uint64_t)(sock->buf[0])) << 27) & 34225520640; + ant_id = 0; + ant_id |= (unsigned char) (sock->buf[6]) << 8; + ant_id |= (unsigned char) (sock->buf[7]); + + act_seq_no = seq_no*NCHANG*NSNAPS/2 + ant_id*NCHANG/3; // actual seq 
no + block_seq_no = UTC_START*NCHANG*NSNAPS/2; // seq no corresponding to ant 0 and start of block + + // set shared last_seq + pthread_mutex_lock(&mutex); + last_seq = seq_no; + //syslog(LOG_INFO,"last_seq %"PRIu64"",last_seq); + pthread_mutex_unlock(&mutex); + + // check for starting or stopping condition, using continue + if (canWrite==0) { + if (seq_no >= UTC_START-50 && UTC_START != 10000) { + canWrite=1; + } + } + if (canWrite == 0) continue; + + // threadsafe start of capture + pthread_mutex_lock(&mutex); + if (!(capture_started)) + { + block_start_byte = block_seq_no * UDP_DATA; + block_end_byte = (block_start_byte + udpdb->hdu_bufsz) - UDP_DATA; + capture_started = 1; + + syslog (LOG_INFO, "receive_obs: START [%"PRIu64" - %"PRIu64"]", block_start_byte, block_end_byte); + } + pthread_mutex_unlock(&mutex); + + // if capture running + if (capture_started) + { + seq_byte = (act_seq_no * UDP_DATA); + tpack++; + + // packet belongs in this block + if ((seq_byte <= block_end_byte) && (seq_byte >= block_start_byte)) + { + byte_offset = seq_byte - (block_start_byte); + mod_WB = writeBlock % 64; + memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, sock->buf + UDP_HEADER, UDP_DATA); + pthread_mutex_lock(&mutex); + block_count++; + //syslog(LOG_INFO,"block count %"PRIu64"",block_count); + pthread_mutex_unlock(&mutex); + + } + // packet belongs in subsequent block + else if (seq_byte > block_end_byte) + { + + if (temp_idx < temp_max) + { + // save packet to temp buffer + memcpy (temp_buffers[temp_idx], sock->buf + UDP_HEADER, UDP_DATA); + temp_seq_byte[temp_idx] = seq_byte; + temp_idx++; + } + } + } + + // threadsafe end of block + pthread_mutex_lock(&mutex); + if ((block_count >= udpdb->packets_per_buffer) || (temp_idx >= temp_max)) + { + syslog (LOG_INFO, "BLOCK COMPLETE thread_id=%d, seq_no=%"PRIu64", " + "ant_id=%"PRIu16", block_count=%"PRIu64", " + "temp_idx=%d, writeBlock=%d", thread_id, seq_no, ant_id, block_count, + temp_idx,writeBlock); + + // 
write block + // check whether doWrite has been released. If not, skip this block + if (doWrite==1) skipBlock=1; + else doWrite=1; + + uint64_t dropped = udpdb->packets_per_buffer - (block_count); + udpdb->packets->received += (block_count); + udpdb->bytes->received += (block_count) * UDP_DATA; + if (dropped) + { + udpdb->packets->dropped += dropped; + udpdb->bytes->dropped += (dropped * UDP_DATA); + } + + // increment counters + dsaX_udpdb_increment(udpdb); + + // write temp queue for this thread + //syslog(LOG_INFO,"thread %d: packets in this block %"PRIu64", temp_idx %d",thread_id,tpack,temp_idx); + tpack = 0; + + for (i=0; i < temp_idx; i++) + { + seq_byte = temp_seq_byte[i]; + byte_offset = seq_byte - (block_start_byte); + if (byte_offset < udpdb->hdu_bufsz && byte_offset >= 0) + { + mod_WB = writeBlock % 64; + memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, temp_buffers[i], UDP_DATA); + //pthread_mutex_lock(&mutex); + block_count++; + //pthread_mutex_unlock(&mutex); + } + } + temp_idx = 0; + + } + pthread_mutex_unlock(&mutex); + + // at this stage, can try and write temp queue safely for other threads + if (temp_seq_byte[0] >= block_start_byte && temp_seq_byte[0] <= block_end_byte && temp_idx > 0) + { + //syslog(LOG_INFO,"thread %d: packets in this block %"PRIu64", temp_idx %d",thread_id,tpack,temp_idx); + tpack = 0; + + for (i=0; i < temp_idx; i++) + { + seq_byte = temp_seq_byte[i]; + byte_offset = seq_byte - (block_start_byte); + if (byte_offset < udpdb->hdu_bufsz && byte_offset >= 0) + { + mod_WB = writeBlock % 64; + memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, temp_buffers[i], UDP_DATA); + pthread_mutex_lock(&mutex); + block_count++; + pthread_mutex_unlock(&mutex); + } + } + temp_idx = 0; + + } + + } + + // packet has been inserted or saved by this point + sock->have_packet = 0; + + } + + dsaX_free_sock(sock); + free(temp_buffers); + free(temp_seq_byte); + +} + +/* + * Thread to write data + */ +void write_thread(void * 
arg) { + + dsaX_write_t * udpdb = (dsaX_write_t *) arg; + int thread_id = udpdb->thread_id; + + // set affinity + const pthread_t pid = pthread_self(); + const int core_id = write_cores[thread_id]; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id); + const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id); + if (CPU_ISSET(core_id, &cpuset)) + syslog(LOG_INFO,"thread %d: successfully set thread",core_id); + + int mod_WB = 0; + int a; + + while (!quit_threads) + { + + while (!doWrite) { + a=1; + } + + // assume everything is set up + // wblock is assigned, write_ct=0 + + mod_WB = lWriteBlock % 64; + memcpy(wblock + thread_id*udpdb->hdu_bufsz/nwth, udpdb->tblock + mod_WB*udpdb->hdu_bufsz + thread_id*udpdb->hdu_bufsz/nwth, udpdb->hdu_bufsz/nwth); + + pthread_mutex_lock(&mutex); + write_ct++; + pthread_mutex_unlock(&mutex); + + //syslog(LOG_INFO,"write thread %d: successfully memcpied",thread_id); + + // now wait until thread 0 has finished getting a new block before moving on + if (thread_id>0) { + while (write_ct!=0) a=1; + } + else { + + // wait for all sub-blocks to be written + while (write_ct= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // OPEN CONNECTION TO DADA DB FOR WRITING + + if (DEBUG) syslog(LOG_DEBUG,"Creating HDU"); + + hdu_out = dada_hdu_create (); + if (DEBUG) syslog(DEBUG,"Created hdu"); + dada_hdu_set_key (hdu_out, CAPTURE_BLOCK_KEY); + if (dada_hdu_connect (hdu_out) < 0) { + syslog(LOG_ERR,"could not connect to output dada buffer"); + return EXIT_FAILURE; + } + if (DEBUG) syslog(LOG_DEBUG,"Connected HDU"); + if (dada_hdu_lock_write(hdu_out) < 0) { 
+ dsaX_dbgpu_cleanup (hdu_out); + syslog(LOG_ERR,"could not lock to output dada buffer"); + return EXIT_FAILURE; + } + + syslog(LOG_INFO,"opened connection to output DB"); + + // DEAL WITH DADA HEADER + char *hout; + hout = (char *)malloc(sizeof(char)*4096); + if (DEBUG) syslog(DEBUG,"read header2"); + + if (fileread (dada_fnam, hout, 4096) < 0) + { + free (hout); + syslog (LOG_ERR, "could not read ASCII header from %s", dada_fnam); + return (EXIT_FAILURE); + } + + + if (DEBUG) syslog(DEBUG,"read header3"); + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_out); + return EXIT_FAILURE; + } + + // copy the in header to the out header + memcpy (header_out, hout, 4096); + + // mark the output header buffer as filled + if (ipcbuf_mark_filled (hdu_out->header_block, 4096) < 0) + { + syslog(LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_out); + return EXIT_FAILURE; + } + + // record STATE info + sprintf(STATE,"LISTEN"); + syslog(LOG_INFO,"marked output header block as filled - now in LISTEN state"); + + + /* time to start up receiver. 
+ data are captured on iface:CAPTURE_PORT + */ + + // make recv, write, and stats structs + udpdb_t udpdb[nth]; + dsaX_stats_t stats; + dsaX_write_t writey[nwth]; + + // shared variables and memory + uint64_t bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + char * tblock = (char *)malloc(sizeof(char)*bufsz*64); + stats_t * packets = init_stats_t(); + stats_t * bytes = init_stats_t(); + reset_stats_t(packets); + reset_stats_t(bytes); + + // initialise stats struct + stats.packets = packets; + stats.bytes = bytes; + + // initialise writey struct and open buffer + for (int i=0;idata_block); + writey[i].block_open = 0; + writey[i].tblock = tblock; + writey[i].thread_id = i; + } + dsaX_udpdb_open_buffer (&writey[0]); + + // initialise all udpdb structs + for (int i=0;i +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "futils.h" +#include "dada_hdu.h" +#include "dada_pwc_main.h" +#include "multilog.h" +#include "ipcio.h" +#include "ascii_header.h" +#include "dada_udp.h" + +#include "dsaX_def.h" + +/* socket buffer for receiving udp data */ +// this is initialised in each recv thread +typedef struct { + + int fd; // FD of the socket + size_t bufsz; // size of socket buffer + char * buf; // the socket buffer + int have_packet; // + size_t got; // amount of data received + +} dsaX_sock_t; + +dsaX_sock_t * dsaX_init_sock (); +void dsaX_free_sock(dsaX_sock_t* b); + +/* Number of UDP packets to be recived for a called to buffer_function */ +#define NOTRECORDING 0 +#define RECORDING 1 + +// structure for write thread +// tblock must be shared +typedef struct { + + dada_hdu_t * hdu; // DADA Header + Data Unit + uint64_t hdu_bufsz; + unsigned block_open; // if the current data block element is open + char * block; // pointer to current datablock buffer + char * tblock; // area of memory to write to + int thread_id; + +} dsaX_write_t; + +// structure for stats thread +// both are shared between 
all recv structures and this one +// last_seq is also shared +typedef struct { + + /* Packet and byte statistics */ + stats_t * packets; + stats_t * bytes; + uint64_t * last_seq; // most recently received seq number + +} dsaX_stats_t; + + +// structure for receive thread +// tblock, packets, bytes, last_seq, block_start_byte, block_end_byte, block_count, capture_started +typedef struct { + + multilog_t * log; // DADA logging interface + int verbose; // verbosity flag + + int port; // port to receive UDP data + int control_port; // port to receive control commands + char * interface; // IP Address to accept packets on + + // configuration for number of inputs + unsigned int num_inputs; // number of antennas / inputs + + // datablock management + uint64_t * block_start_byte; // seq_byte of first byte for the block + uint64_t * block_end_byte; // seq_byte of first byte of final packet of the block + uint64_t * block_count; // number of packets in this block + uint64_t hdu_bufsz; + char * tblock; // area of memory to write to + + // packets + unsigned * capture_started; // flag for start of UDP data + uint64_t packets_per_buffer; // number of UDP packets per datablock buffer + + /* Packet and byte statistics */ + stats_t * packets; + stats_t * bytes; + uint64_t rcv_sleeps; + + uint64_t * last_seq; // most recently received seq number + struct timeval timeout; + int thread_id; + +} udpdb_t; + +void signal_handler (int signalValue); +void stats_thread(void * arg); +void control_thread(void * arg); + +#endif diff --git a/legacy/dsaX_capture_pcap.c b/legacy/dsaX_capture_pcap.c new file mode 100644 index 0000000..4921c68 --- /dev/null +++ b/legacy/dsaX_capture_pcap.c @@ -0,0 +1,852 @@ +/* dsaX_capture_pcap.c: Code to capture packets using pf_ring aware pcap and write to a dada buffer. 
+ +control and stats threads: standard threads +recv thread: simply runs pcap_loop, passing packets to callback function +packet_callback: places packets directly into dada buffer, or temp buffer. gets new buffer if needed + +everything is in the dsaX_t structure + + +*/ + +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture_pcap.h" +#include "dsaX_def.h" +#include "pcap.h" + +/* global variables */ +int quit_threads = 0; +char STATE[20]; +uint64_t UTC_START = 10000; +uint64_t UTC_STOP = 40000000000; +int MONITOR = 0; +char iP[100]; +int DEBUG = 0; +int HISTOGRAM[16]; +int cores[2] = {17,19}; +pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; +volatile int canWrite = 0; +volatile unsigned capture_started = 0; +volatile char * wblock; +volatile uint64_t last_seq; +const int nth = 1; +const int nwth = 1; +const int TEMP_MAXY = 1000; +volatile int skipped = 0; +const int NBLOCKS = 8; +volatile uint64_t writeBlock[8] = {0, 0, 0, 0, 0, 0, 0, 0}; +volatile int delayBlock = 0; +volatile int behindBlock = 0; + +void dsaX_dbgpu_cleanup (dada_hdu_t * out); +int dada_bind_thread_to_core (int core); +void usage(); + +void dsaX_dbgpu_cleanup (dada_hdu_t * out) +{ + + if (dada_hdu_unlock_write (out) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_out"); + } + dada_hdu_destroy (out); + +} + +void usage() +{ + fprintf (stdout, + "dsaX_capture [options]\n" + " -c core bind process to CPU core [no default]\n" + " -i IP to listen on for control commands [no default]\n" + " -f filename of template dada header [no default]\n" + " -o out_key [default CAPTURE_BLOCK_KEY]\n" 
+ " -d send debug messages to syslog\n" + " -h print usage\n"); +} + +/* + * open a data block buffer ready for direct access + */ +int dsaX_udpdb_open_buffer (dsaX_t * ctx); +int dsaX_udpdb_open_buffer (dsaX_t * ctx) +{ + + if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_open_buffer()"); + + if (ctx->block_open) + { + syslog (LOG_ERR, "open_buffer: buffer already opened"); + return -1; + } + + if (DEBUG) syslog (LOG_DEBUG, "open_buffer: ipcio_open_block_write"); + + uint64_t block_id = 0; + + wblock = ipcio_open_block_write (ctx->hdu->data_block, &block_id); + if (!wblock) + { + syslog (LOG_ERR, "open_buffer: ipcio_open_block_write failed"); + return -1; + } + + ctx->block_open = 1; + + return 0; +} + +/* + * close a data buffer, assuming a full block has been written + */ +int dsaX_udpdb_close_buffer (dsaX_t * ctx, uint64_t bytes_written, unsigned eod); +int dsaX_udpdb_close_buffer (dsaX_t * ctx, uint64_t bytes_written, unsigned eod) +{ + + if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_close_buffer(%"PRIu64", %d)", bytes_written, eod); + + if (!ctx->block_open) + { + syslog (LOG_ERR, "close_buffer: buffer already closed"); + return -1; + } + + // log any buffers that are not full, except for the 1 byte "EOD" buffer + if ((bytes_written != 1) && (bytes_written != ctx->hdu_bufsz)) + syslog ((eod ? LOG_INFO : LOG_WARNING), "close_buffer: " + "bytes_written[%"PRIu64"] != hdu_bufsz[%"PRIu64"]", + bytes_written, ctx->hdu_bufsz); + + if (eod) + { + if (ipcio_update_block_write (ctx->hdu->data_block, bytes_written) < 0) + { + syslog (LOG_ERR, "close_buffer: ipcio_update_block_write failed"); + return -1; + } + } + else + { + if (ipcio_close_block_write (ctx->hdu->data_block, bytes_written) < 0) + { + syslog (LOG_ERR, "close_buffer: ipcio_close_block_write failed"); + return -1; + } + } + + wblock = 0; + ctx->block_open = 0; + + return 0; +} + +/* + * move to the next ring buffer element. 
return pointer to base address of new buffer + */ +int dsaX_udpdb_new_buffer (dsaX_t * ctx); +int dsaX_udpdb_new_buffer (dsaX_t * ctx) +{ + + if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_new_buffer()"); + + if (dsaX_udpdb_close_buffer (ctx, ctx->hdu_bufsz, 0) < 0) + { + syslog (LOG_ERR, "new_buffer: dsaX_udpdb_close_buffer failed"); + return -1; + } + + if (dsaX_udpdb_open_buffer (ctx) < 0) + { + syslog (LOG_ERR, "new_buffer: dsaX_udpdb_open_buffer failed"); + return -1; + } + + return 0; + +} + +// increment counters when block is full +void dsaX_udpdb_increment (dsaX_t * ctx); +void dsaX_udpdb_increment (dsaX_t * ctx) +{ + + // increment buffer byte markers + ctx->block_start_byte = ctx->block_end_byte + UDP_DATA; + ctx->block_end_byte = ctx->block_start_byte + ( ctx->packets_per_buffer - 1) * UDP_DATA; + ctx->block_count = 0; + +} + + + +/* --------- THREADS -------- */ + +// STATS THREAD + +/* + * Thread to print simple capture statistics + */ +void stats_thread(void * arg) { + + dsaX_stats_t * ctx = (dsaX_stats_t *) arg; + uint64_t b_rcv_total = 0; + uint64_t b_rcv_1sec = 0; + uint64_t b_rcv_curr = 0; + + uint64_t b_drp_total = 0; + uint64_t b_drp_1sec = 0; + uint64_t b_drp_curr = 0; + + uint64_t s_rcv_total = 0; + uint64_t s_rcv_1sec = 0; + uint64_t s_rcv_curr = 0; + + uint64_t ooo_pkts = 0; + float gb_rcv_ps = 0; + float mb_rcv_ps = 0; + float mb_drp_ps = 0; + + syslog(LOG_INFO,"starting stats thread..."); + sleep(2); + syslog(LOG_INFO,"started stats thread..."); + + while (!quit_threads) + { + + /* get a snapshot of the data as quickly as possible */ + b_rcv_curr = ctx->bytes->received; + b_drp_curr = ctx->bytes->dropped; + + /* calc the values for the last second */ + b_rcv_1sec = b_rcv_curr - b_rcv_total; + b_drp_1sec = b_drp_curr - b_drp_total; + + /* update the totals */ + b_rcv_total = b_rcv_curr; + b_drp_total = b_drp_curr; + + mb_rcv_ps = (double) b_rcv_1sec / 1000000; + mb_drp_ps = (double) b_drp_1sec / 1000000; + gb_rcv_ps = b_rcv_1sec * 8; + 
gb_rcv_ps /= 1000000000; + + /* determine how much memory is free in the receivers */ + syslog (LOG_NOTICE,"CAPSTATS %6.3f [Gb/s], D %4.1f [MB/s], D %"PRIu64" pkts, %"PRIu64" skipped %d %d", gb_rcv_ps, mb_drp_ps, ctx->packets->dropped, last_seq, behindBlock, skipped); + + sleep(1); + } + +} + +// CONTROL THREAD + +void control_thread (void * arg) { + + syslog(LOG_INFO, "control_thread: starting"); + + // port on which to listen for control commands + int port = CAPTURE_CONTROL_PORT; + char sport[10]; + sprintf(sport,"%d",port); + + // buffer for incoming command strings, and setup of socket + int bufsize = 1024; + char* buffer = (char *) malloc (sizeof(char) * bufsize); + memset(buffer, '\0', bufsize); + const char* whitespace = " "; + char * command = 0; + char * args = 0; + + struct addrinfo hints; + struct addrinfo* res=0; + memset(&hints,0,sizeof(hints)); + struct sockaddr_storage src_addr; + socklen_t src_addr_len=sizeof(src_addr); + hints.ai_family=AF_INET; + hints.ai_socktype=SOCK_DGRAM; + getaddrinfo(iP,sport,&hints,&res); + int fd; + ssize_t ct; + char tmpstr; + char cmpstr = 'p'; + char *endptr; + uint64_t tmps; + char * token; + + syslog(LOG_INFO, "control_thread: created socket on port %d", port); + + while (!quit_threads) { + + fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol); + bind(fd,res->ai_addr,res->ai_addrlen); + memset(buffer,'\0',sizeof(buffer)); + syslog(LOG_INFO, "control_thread: waiting for packet"); + ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len); + + syslog(LOG_INFO, "control_thread: received buffer string %s",buffer); + + // INTERPRET BUFFER STRING + // receive either UTC_START, UTC_STOP, MONITOR + + // interpret buffer string + char * rest = buffer; + char *cmd, *val; + cmd = strtok_r(rest, "-", &rest); + val = strtok_r(rest, "-", &rest); + syslog(LOG_INFO, "control_thread: split into COMMAND %s, VALUE %s",cmd,val); + + if (strcmp(cmd,"UTC_START")==0) + UTC_START = strtoull(val,&endptr,0); + + if 
(strcmp(cmd,"UTC_STOP")==0) + UTC_STOP = strtoull(val,&endptr,0); + + close(fd); + + } + + free (buffer); + + syslog(LOG_INFO, "control_thread: exiting"); + + /* return 0 */ + int thread_result = 0; + pthread_exit((void *) &thread_result); + +} + +/* +This is important - packet callback function to place packets in buffer +called upon single packet being received +*/ +void packet_callback(u_char *args, const struct pcap_pkthdr* header, const u_char* packet) { + + dsaX_t * udpdb = (dsaX_t *) args; + + // make sure packet has right length and get payload + if (header->len != UDP_PAYLOAD + 42) { + syslog(LOG_INFO,"received packet with length %d, total available %d",header->len,header->caplen); + return; + } + char *buf = (char *)(packet + 42); + + // process packet header + uint64_t seq_no=0, ant_id=0; + seq_no |= (((uint64_t)(buf[4]) & 224) >> 5) & 7; + seq_no |= (((uint64_t)(buf[3])) << 3) & 2040; + seq_no |= (((uint64_t)(buf[2])) << 11) & 522240; + seq_no |= (((uint64_t)(buf[1])) << 19) & 133693440; + seq_no |= (((uint64_t)(buf[0])) << 27) & 34225520640; + ant_id |= (unsigned char) (buf[6]) << 8; + ant_id |= (unsigned char) (buf[7]); + uint64_t act_seq_no = seq_no*NCHANG*NSNAPS/2 + ant_id*NCHANG/3; // actual seq no + uint64_t block_seq_no = UTC_START*NCHANG*NSNAPS/2; // seq no corresponding to ant 0 and start of block + last_seq = seq_no; + + // check for starting condition + if (canWrite==0) { + if (seq_no >= UTC_START-500 && UTC_START != 10000) { + canWrite=1; + } + } + if (canWrite == 0) return; + + // deal with start of capture + if (!(capture_started)) + { + udpdb->block_start_byte = block_seq_no * UDP_DATA; + udpdb->block_end_byte = (udpdb->block_start_byte + udpdb->hdu_bufsz) - UDP_DATA; + capture_started = 1; + syslog (LOG_INFO, "receive_obs: START [%"PRIu64" - %"PRIu64"]", udpdb->block_start_byte, udpdb->block_end_byte); + } + + // if capture has started, do good stuff + uint64_t byte_offset, seq_byte; + if (capture_started) { + + seq_byte = (act_seq_no * 
UDP_DATA); + + // packet belongs in this block + if ((seq_byte <= udpdb->block_end_byte) && (seq_byte >= udpdb->block_start_byte)) + { + byte_offset = seq_byte - (udpdb->block_start_byte); + memcpy(udpdb->tblock + udpdb->tblock_idx*NPACKETS_PER_BLOCK*NSNAPS*UDP_DATA + byte_offset, buf + UDP_HEADER, UDP_DATA); + //memcpy(wblock + byte_offset, buf + UDP_HEADER, UDP_DATA); + udpdb->block_count++; + } + // packet belongs in subsequent block + else if (seq_byte > udpdb->block_end_byte) + { + if (udpdb->temp_idx < TEMP_MAXY) + { + // save packet to temp buffer + memcpy (udpdb->temp_buffers + udpdb->temp_idx*UDP_DATA, buf + UDP_HEADER, UDP_DATA); + udpdb->temp_seq_byte[udpdb->temp_idx] = seq_byte; + udpdb->temp_idx++; + } + } + } + + // end of block + if ((udpdb->block_count >= udpdb->packets_per_buffer) || (udpdb->temp_idx >= TEMP_MAXY)) + { + syslog (LOG_INFO, "BLOCK COMPLETE seq_no=%"PRIu64", " + "ant_id=%"PRIu16", block_count=%"PRIu64", " + "temp_idx=%d", seq_no, ant_id, + udpdb->block_count, udpdb->temp_idx); + + // set write block on this block + if (writeBlock[udpdb->tblock_idx]==1) + skipped++; + writeBlock[udpdb->tblock_idx] = 1; + + // increment tblock_idx + udpdb->tblock_idx+=1; + if (udpdb->tblock_idx==NBLOCKS) + udpdb->tblock_idx = 0; + + // get delay_block + udpdb->nblocks_written++; + behindBlock = udpdb->nblocks_written - delayBlock; + + // deal with counters + uint64_t dropped = udpdb->packets_per_buffer - (udpdb->block_count); + udpdb->packets->received += (udpdb->block_count); + udpdb->bytes->received += (udpdb->block_count) * UDP_DATA; + if (dropped) + { + udpdb->packets->dropped += dropped; + udpdb->bytes->dropped += (dropped * UDP_DATA); + } + dsaX_udpdb_increment(udpdb); + + // write temp queue + for (int i=0; i < udpdb->temp_idx; i++) { + seq_byte = udpdb->temp_seq_byte[i]; + byte_offset = seq_byte - udpdb->block_start_byte; + if (byte_offset < udpdb->hdu_bufsz && byte_offset >= 0) { + memcpy(udpdb->tblock + 
udpdb->tblock_idx*NPACKETS_PER_BLOCK*NSNAPS*UDP_DATA + byte_offset, udpdb->temp_buffers + i*UDP_DATA, UDP_DATA); + udpdb->block_count++; + } + } + udpdb->temp_idx = 0; + + } + +} + +// Thread to do writing + +void write_thread(void * arg) { + + dsaX_t * udpdb = (dsaX_t *) arg; + int thread_id = 2; + + // set affinity + const pthread_t pid = pthread_self(); + const int core_id = cores[1]; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id); + const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id); + if (CPU_ISSET(core_id, &cpuset)) + syslog(LOG_INFO,"thread %d: successfully set thread",core_id); + + int a, lWriteBlock=0; + while (!quit_threads) { + + // busywait + while (writeBlock[lWriteBlock]==0) + a=1; + + // write block + memcpy(wblock, udpdb->tblock + lWriteBlock*UDP_DATA*NSNAPS*NPACKETS_PER_BLOCK, UDP_DATA*NSNAPS*NPACKETS_PER_BLOCK); + + // get new block + if (dsaX_udpdb_new_buffer (udpdb) < 0) + { + syslog(LOG_ERR, "receive_obs: dsaX_udpdb_new_buffer failed"); + return EXIT_FAILURE; + } + + // increment counters + writeBlock[lWriteBlock] = 0; + lWriteBlock++; + if (lWriteBlock==NBLOCKS) + lWriteBlock = 0; + delayBlock++; + + } +} + +/* +Thread to run pcap, passing to callback function +*/ + +void pcap_thread(void * arg) { + + dsaX_t * udpdb = (dsaX_t *) arg; + int thread_id = 1;//udpdb->thread_id; + + // set affinity + const pthread_t pid = pthread_self(); + const int core_id = cores[0]; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id); + const int get_affinity = pthread_getaffinity_np(pid, 
sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id); + if (CPU_ISSET(core_id, &cpuset)) + syslog(LOG_INFO,"thread %d: successfully set thread",core_id); + + // set up pcap from port CAPTURE_PORT + char dev[] = "eth0"; + pcap_t *handle; + char error_buffer[PCAP_ERRBUF_SIZE]; + struct bpf_program filter; + char filter_exp[] = "port 4011"; + bpf_u_int32 subnet_mask, ip; + + if (pcap_lookupnet(dev, &ip, &subnet_mask, error_buffer) == -1) { + syslog(LOG_ERR,"Could not get information for device: %s", dev); + ip = 0; + subnet_mask = 0; + } + handle = pcap_open_live(dev, 4659, 0, 1, error_buffer); + if (handle == NULL) { + syslog(LOG_ERR,"Could not open %s - %s", dev, error_buffer); + return 2; + } + + if (pcap_compile(handle, &filter, filter_exp, 1, ip) == -1) { + syslog(LOG_ERR,"Bad filter - %s", pcap_geterr(handle)); + return 2; + } + if (pcap_setfilter(handle, &filter) == -1) { + syslog(LOG_ERR,"Error setting filter - %s\n", pcap_geterr(handle)); + return 2; + } + + /* if((pcap_set_buffer_size(handle, 2*1024*1024))!=0) + { + syslog(LOG_ERR, "Could not set buffer size"); + return 2; + }*/ + + + syslog(LOG_INFO,"thread %d: successfully set up pcap",thread_id); + + // start up RX! 
+ while (!quit_threads) + pcap_loop(handle, 0, packet_callback, (u_char*)udpdb); + + // finish + pcap_close(handle); + +} + + + +// MAIN of program + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_capture_pcap", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + /* DADA Header plus Data Unit for writing */ + dada_hdu_t* hdu_out = 0; + + // input data block HDU key + key_t out_key = CAPTURE_BLOCK_KEY; + + // command line arguments + int core = -1; + int arg=0; + char dada_fnam[200]; // filename for dada header + + while ((arg=getopt(argc,argv,"c:i:f:o:dh")) != -1) + { + switch (arg) + { + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'i': + if (optarg) + { + strcpy(iP,optarg); + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'f': + if (optarg) + { + strcpy(dada_fnam,optarg); + break; + } + else + { + syslog(LOG_ERR,"-f flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'd': + DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + // START THREADS + + // start control thread + int rval = 0; + pthread_t control_thread_id; + dsaX_t temp_str; + rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &temp_str); + if (rval != 0) { + syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval)); + return -1; + } + syslog(LOG_NOTICE, "Created control thread, listening on 
%s:%d",iP,CAPTURE_CONTROL_PORT); + + // Bind to cpu core + if (core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // OPEN CONNECTION TO DADA DB FOR WRITING + + if (DEBUG) syslog(LOG_DEBUG,"Creating HDU"); + + hdu_out = dada_hdu_create (); + if (DEBUG) syslog(DEBUG,"Created hdu"); + dada_hdu_set_key (hdu_out, CAPTURE_BLOCK_KEY); + if (dada_hdu_connect (hdu_out) < 0) { + syslog(LOG_ERR,"could not connect to output dada buffer"); + return EXIT_FAILURE; + } + if (DEBUG) syslog(LOG_DEBUG,"Connected HDU"); + if (dada_hdu_lock_write(hdu_out) < 0) { + dsaX_dbgpu_cleanup (hdu_out); + syslog(LOG_ERR,"could not lock to output dada buffer"); + return EXIT_FAILURE; + } + + syslog(LOG_INFO,"opened connection to output DB"); + + // DEAL WITH DADA HEADER + char *hout; + hout = (char *)malloc(sizeof(char)*4096); + if (DEBUG) syslog(DEBUG,"read header2"); + + if (fileread (dada_fnam, hout, 4096) < 0) + { + free (hout); + syslog (LOG_ERR, "could not read ASCII header from %s", dada_fnam); + return (EXIT_FAILURE); + } + + + if (DEBUG) syslog(DEBUG,"read header3"); + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_out); + return EXIT_FAILURE; + } + + // copy the in header to the out header + memcpy (header_out, hout, 4096); + + // mark the output header buffer as filled + if (ipcbuf_mark_filled (hdu_out->header_block, 4096) < 0) + { + syslog(LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_out); + return EXIT_FAILURE; + } + + // record STATE info + sprintf(STATE,"LISTEN"); + syslog(LOG_INFO,"marked output header block as filled - now in LISTEN state"); + + + /* time to start up receiver. 
+ */ + + // make recv, write, and stats structs + dsaX_t udpdb[nth]; + dsaX_stats_t stats; + + // shared variables and memory + uint64_t bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + stats_t * packets = init_stats_t(); + stats_t * bytes = init_stats_t(); + reset_stats_t(packets); + reset_stats_t(bytes); + char * tblock = (char *)malloc(sizeof(char)*NBLOCKS*(ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block))); + char * temp_buffers = (char *)malloc(sizeof(char)*TEMP_MAXY*UDP_DATA); + char * temp_seq_byte = (uint64_t *)malloc(sizeof(uint64_t)*TEMP_MAXY); + + // initialise stats struct + stats.packets = packets; + stats.bytes = bytes; + + for (int i=0;idata_block); + udpdb[i].block_open = 0; + udpdb[i].block_count = 0; + udpdb[i].tblock = tblock; + udpdb[i].tblock_idx = 0; + udpdb[i].temp_buffers = temp_buffers; + udpdb[i].temp_seq_byte = temp_seq_byte; + udpdb[i].temp_idx = 0; + udpdb[i].thread_id = 1; + udpdb[i].verbose = 0; + udpdb[i].packets_per_buffer = udpdb[i].hdu_bufsz / UDP_DATA; + udpdb[i].packets = packets; + udpdb[i].bytes = bytes; + udpdb[i].nblocks_written = 0; + + } + dsaX_udpdb_open_buffer (&udpdb[0]); + + /* start threads */ + + // start the stats thread + pthread_t stats_thread_id; + rval = pthread_create (&stats_thread_id, 0, (void *) stats_thread, (void *) &stats); + if (rval != 0) { + syslog(LOG_INFO, "Error creating stats_thread: %s", strerror(rval)); + return -1; + } + syslog(LOG_NOTICE, "started stats_thread()"); + + // start the receive threads + pthread_t recv_thread_id[nth]; + rval = 0; + for (int i=0;i +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "futils.h" +#include "dada_hdu.h" +#include "dada_pwc_main.h" +#include "multilog.h" +#include "ipcio.h" +#include "ascii_header.h" +#include "dada_udp.h" + +#include "dsaX_def.h" + +/* Number of UDP packets to be recived for a called to buffer_function */ +#define NOTRECORDING 0 +#define RECORDING 1 + 
+// structure for all threads +typedef struct { + + dada_hdu_t * hdu; // DADA Header + Data Unit + uint64_t hdu_bufsz; + unsigned block_open; // if the current data block element is open + char * tblock; + uint64_t tblock_idx; + char * temp_buffers; + uint64_t * temp_seq_byte; + int temp_idx; + int thread_id; + uint64_t block_start_byte; + uint64_t block_end_byte; + uint64_t block_count; + int nblocks_written; + + int verbose; // verbosity flag + + // configuration for number of inputs + unsigned int num_inputs; // number of antennas / inputs + + // packets + uint64_t packets_per_buffer; // number of UDP packets per datablock buffer + + /* Packet and byte statistics */ + stats_t * packets; + stats_t * bytes; + + uint64_t last_seq; // most recently received seq number + +} dsaX_t; + +// structure for stats thread +// both are shared between all recv structures and this one +// last_seq is also shared +typedef struct { + + /* Packet and byte statistics */ + stats_t * packets; + stats_t * bytes; + uint64_t * last_seq; // most recently received seq number + +} dsaX_stats_t; + + +void signal_handler (int signalValue); +void stats_thread(void * arg); +void control_thread(void * arg); diff --git a/legacy/dsaX_capture_thread.c b/legacy/dsaX_capture_thread.c new file mode 100644 index 0000000..49019be --- /dev/null +++ b/legacy/dsaX_capture_thread.c @@ -0,0 +1,1107 @@ +/* dsaX_capture.c: Code to capture packets over a socket and write to a dada buffer. 
+ +main: runs capture loop, and interfaces dada buffer +control_thread: deals with control commands + +*/ + +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + +/* global variables */ +int quit_threads = 0; +char STATE[20]; +uint64_t UTC_START = 10000; +uint64_t UTC_STOP = 40000000000; +int MONITOR = 0; +char iP[100]; +int DEBUG = 0; +int HISTOGRAM[16]; +int writeBlock = 0; +volatile int doWrite = 0; + +void dsaX_dbgpu_cleanup (dada_hdu_t * out); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * out) +{ + + if (dada_hdu_unlock_write (out) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_out"); + } + dada_hdu_destroy (out); + + + +} + +void usage() +{ + fprintf (stdout, + "dsaX_capture [options]\n" + " -c core bind process to CPU core [no default]\n" + " -j IP to listen on for data packets [no default]\n" + " -i IP to listen on for control commands [no default]\n" + " -f filename of template dada header [no default]\n" + " -o out_key [default CAPTURE_BLOCK_KEY]\n" + " -d send debug messages to syslog\n" + " -g chgroup [default 0]\n" + " -h print usage\n"); +} + +/* + * create a socket with the specified number of buffers + */ +dsaX_sock_t * dsaX_init_sock () +{ + dsaX_sock_t * b = (dsaX_sock_t *) malloc(sizeof(dsaX_sock_t)); + assert(b != NULL); + + b->bufsz = sizeof(char) * UDP_PAYLOAD; + + b->buf = (char *) malloc (b->bufsz); + assert(b->buf != NULL); + + b->have_packet = 0; + b->fd = 0; + + return b; +} + +void dsaX_free_sock(dsaX_sock_t* b) +{ + b->fd = 0; + b->bufsz = 0; + b->have_packet 
=0; + if (b->buf) + free (b->buf); + b->buf = 0; +} + +/* + * intialize UDP receiver resources + */ +int dsaX_udpdb_init_receiver (udpdb_t * ctx) +{ + syslog(LOG_INFO,"dsax_udpdb_init_receiver()"); + + // create a dsaX socket which can hold variable num of UDP packet + ctx->sock = dsaX_init_sock(); + + ctx->ooo_packets = 0; + ctx->recv_core = -1; + ctx->n_sleeps = 0; + ctx->mb_rcv_ps = 0; + ctx->mb_drp_ps = 0; + ctx->block_open = 0; + ctx->block_count = 0; + ctx->capture_started = 0; + ctx->last_seq = 0; + ctx->last_byte = 0; + ctx->block_start_byte = 0; + + // allocate required memory strucutres + ctx->packets = init_stats_t(); + ctx->bytes = init_stats_t(); + return 0; +} + +/* +prepare socket and writer +*/ + +int dsaX_udpdb_prepare (udpdb_t * ctx) +{ + syslog(LOG_INFO, "dsaX_udpdb_prepare()"); + + // open socket + syslog(LOG_INFO, "prepare: creating udp socket on %s:%d", ctx->interface, ctx->port); + ctx->sock->fd = dada_udp_sock_in(ctx->log, ctx->interface, ctx->port, ctx->verbose); + if (ctx->sock->fd < 0) { + syslog (LOG_ERR, "Error, Failed to create udp socket"); + return -1; + } + + + // set the socket size to 256 MB + int sock_buf_size = 256*1024*1024; + syslog(LOG_INFO, "prepare: setting buffer size to %d", sock_buf_size); + dada_udp_sock_set_buffer_size (ctx->log, ctx->sock->fd, ctx->verbose, sock_buf_size); + + // set the socket to non-blocking + syslog(LOG_INFO, "prepare: setting non_block"); + sock_nonblock(ctx->sock->fd); + + // clear any packets buffered by the kernel + syslog(LOG_INFO, "prepare: clearing packets at socket"); + size_t cleared = dada_sock_clear_buffered_packets(ctx->sock->fd, UDP_PAYLOAD); + + // setup the next_seq to the initial value + //ctx->last_seq = 0; + //ctx->last_byte = 0; + //ctx->n_sleeps = 0; + + return 0; +} + +/* + * reset receiver before an observation commences + */ +void dsaX_udpdb_reset_receiver (udpdb_t * ctx) +{ + syslog (LOG_INFO, "dsaX_udpdb_reset_receiver()"); + + ctx->capture_started = 0; + ctx->last_seq = 0; 
+ ctx->last_byte = 0; + ctx->n_sleeps = 0; + + reset_stats_t(ctx->packets); + reset_stats_t(ctx->bytes); +} + +/* + * open a data block buffer ready for direct access + */ +int dsaX_udpdb_open_buffer (udpdb_t * ctx) +{ + + if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_open_buffer()"); + + if (ctx->block_open) + { + syslog (LOG_ERR, "open_buffer: buffer already opened"); + return -1; + } + + if (DEBUG) syslog (LOG_DEBUG, "open_buffer: ipcio_open_block_write"); + + uint64_t block_id = 0; + + ctx->block = ipcio_open_block_write (ctx->hdu->data_block, &block_id); + if (!ctx->block) + { + syslog (LOG_ERR, "open_buffer: ipcio_open_block_write failed"); + return -1; + } + + ctx->block_open = 1; + + return 0; +} + +/* + * close a data buffer, assuming a full block has been written + */ +int dsaX_udpdb_close_buffer (udpdb_t * ctx, uint64_t bytes_written, unsigned eod) +{ + + if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_close_buffer(%"PRIu64", %d)", bytes_written, eod); + + if (!ctx->block_open) + { + syslog (LOG_ERR, "close_buffer: buffer already closed"); + return -1; + } + + // log any buffers that are not full, except for the 1 byte "EOD" buffer + if ((bytes_written != 1) && (bytes_written != ctx->hdu_bufsz)) + syslog ((eod ? 
LOG_INFO : LOG_WARNING), "close_buffer: " + "bytes_written[%"PRIu64"] != hdu_bufsz[%"PRIu64"]", + bytes_written, ctx->hdu_bufsz); + + if (eod) + { + if (ipcio_update_block_write (ctx->hdu->data_block, bytes_written) < 0) + { + syslog (LOG_ERR, "close_buffer: ipcio_update_block_write failed"); + return -1; + } + } + else + { + if (ipcio_close_block_write (ctx->hdu->data_block, bytes_written) < 0) + { + syslog (LOG_ERR, "close_buffer: ipcio_close_block_write failed"); + return -1; + } + } + + ctx->block = 0; + ctx->block_open = 0; + + return 0; +} + +// increment counters when block is full +int dsaX_udpdb_increment (udpdb_t * ctx) +{ + + // increment buffer byte markers + ctx->block_start_byte = ctx->block_end_byte + UDP_DATA; + ctx->block_end_byte = ctx->block_start_byte + ( ctx->packets_per_buffer - 1) * UDP_DATA; + ctx->block_count = 0; + if (writeBlock==0) writeBlock=1; + else writeBlock=0; + +} + +/* + * move to the next ring buffer element. return pointer to base address of new buffer + */ +int dsaX_udpdb_new_buffer (udpdb_t * ctx) +{ + + if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_new_buffer()"); + + if (dsaX_udpdb_close_buffer (ctx, ctx->hdu_bufsz, 0) < 0) + { + syslog (LOG_ERR, "new_buffer: dsaX_udpdb_close_buffer failed"); + return -1; + } + + if (dsaX_udpdb_open_buffer (ctx) < 0) + { + syslog (LOG_ERR, "new_buffer: dsaX_udpdb_open_buffer failed"); + return -1; + } + + + // set block to 0 + //memset(ctx->block,0,ctx->block_end_byte-ctx->block_start_byte); + + if (DEBUG) syslog(LOG_DEBUG, "new_buffer: buffer_bytes [%"PRIu64" - %"PRIu64"]", + ctx->block_start_byte, ctx->block_end_byte); + + return 0; + +} + +/* + * destroy UDP receiver resources + */ +int dsaX_udpdb_destroy_receiver (udpdb_t * ctx) +{ + if (ctx->sock) + dsaX_free_sock(ctx->sock); + ctx->sock = 0; +} + +/* + * Close the udp socket and file + */ + +int udpdb_stop_function (udpdb_t* ctx) +{ + + syslog(LOG_INFO, "stop: dada_hdu_unlock_write()"); + if (dada_hdu_unlock_write (ctx->hdu) < 0) + { + 
syslog (LOG_ERR, "stop: could not unlock write on"); + return -1; + } + + // close the UDP socket + close(ctx->sock->fd); + + if (ctx->packets->dropped) + { + double percent = (double) ctx->bytes->dropped / (double) ctx->last_byte; + percent *= 100; + + syslog(LOG_INFO, "bytes dropped %"PRIu64" / %"PRIu64 " = %8.6f %", + ctx->bytes->dropped, ctx->last_byte, percent); + } + + return 0; +} + + + + +/* --------- THREADS -------- */ + +// STATS THREAD + +/* + * Thread to print simple capture statistics + */ +void stats_thread(void * arg) { + + /* // set affinity + const pthread_t pid = pthread_self(); + const int core_id = 4; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id); + const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id); + if (CPU_ISSET(core_id, &cpuset)) + syslog(LOG_INFO,"thread %d: successfully set thread",core_id); + */ + + udpdb_t * ctx = (udpdb_t *) arg; + uint64_t b_rcv_total = 0; + uint64_t b_rcv_1sec = 0; + uint64_t b_rcv_curr = 0; + + uint64_t b_drp_total = 0; + uint64_t b_drp_1sec = 0; + uint64_t b_drp_curr = 0; + + uint64_t s_rcv_total = 0; + uint64_t s_rcv_1sec = 0; + uint64_t s_rcv_curr = 0; + + uint64_t ooo_pkts = 0; + float gb_rcv_ps = 0; + float mb_rcv_ps = 0; + float mb_drp_ps = 0; + + syslog(LOG_INFO,"starting stats thread..."); + sleep(2); + syslog(LOG_INFO,"started stats thread..."); + + while (!quit_threads) + { + + /* get a snapshot of the data as quickly as possible */ + b_rcv_curr = ctx->bytes->received; + b_drp_curr = ctx->bytes->dropped; + s_rcv_curr = ctx->n_sleeps; + + /* calc the values for the last second */ + b_rcv_1sec = b_rcv_curr - b_rcv_total; + b_drp_1sec = b_drp_curr - b_drp_total; + s_rcv_1sec = s_rcv_curr - s_rcv_total; + + /* 
update the totals */ + b_rcv_total = b_rcv_curr; + b_drp_total = b_drp_curr; + s_rcv_total = s_rcv_curr; + + mb_rcv_ps = (double) b_rcv_1sec / 1000000; + mb_drp_ps = (double) b_drp_1sec / 1000000; + gb_rcv_ps = b_rcv_1sec * 8; + gb_rcv_ps /= 1000000000; + + /* determine how much memory is free in the receivers */ + syslog (LOG_NOTICE,"CAPSTATS %6.3f [Gb/s], D %4.1f [MB/s], D %"PRIu64" pkts, %"PRIu64"", gb_rcv_ps, mb_drp_ps, ctx->packets->dropped, ctx->last_seq); + + sleep(1); + } + +} + + + + + + + +// CONTROL THREAD + +void control_thread (void * arg) { + + udpdb_t * ctx = (udpdb_t *) arg; + syslog(LOG_INFO, "control_thread: starting"); + + // port on which to listen for control commands + int port = CAPTURE_CONTROL_PORT; + char sport[10]; + sprintf(sport,"%d",port); + + // buffer for incoming command strings, and setup of socket + int bufsize = 1024; + char* buffer = (char *) malloc (sizeof(char) * bufsize); + memset(buffer, '\0', bufsize); + const char* whitespace = " "; + char * command = 0; + char * args = 0; + + struct addrinfo hints; + struct addrinfo* res=0; + memset(&hints,0,sizeof(hints)); + struct sockaddr_storage src_addr; + socklen_t src_addr_len=sizeof(src_addr); + hints.ai_family=AF_INET; + hints.ai_socktype=SOCK_DGRAM; + getaddrinfo(iP,sport,&hints,&res); + int fd; + ssize_t ct; + char tmpstr; + char cmpstr = 'p'; + char *endptr; + uint64_t tmps; + char * token; + + syslog(LOG_INFO, "control_thread: created socket on port %d", port); + + while (!quit_threads) { + + fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol); + bind(fd,res->ai_addr,res->ai_addrlen); + memset(buffer,'\0',sizeof(buffer)); + syslog(LOG_INFO, "control_thread: waiting for packet"); + ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len); + + syslog(LOG_INFO, "control_thread: received buffer string %s",buffer); + + // INTERPRET BUFFER STRING + // receive either UTC_START, UTC_STOP, MONITOR + + // interpret buffer string + char * rest = buffer; + char 
*cmd, *val; + cmd = strtok_r(rest, "-", &rest); + val = strtok_r(rest, "-", &rest); + syslog(LOG_INFO, "control_thread: split into COMMAND %s, VALUE %s",cmd,val); + + if (strcmp(cmd,"UTC_START")==0) + UTC_START = strtoull(val,&endptr,0); + + if (strcmp(cmd,"UTC_STOP")==0) + UTC_STOP = strtoull(val,&endptr,0); + + close(fd); + + } + + free (buffer); + + syslog(LOG_INFO, "control_thread: exiting"); + + /* return 0 */ + int thread_result = 0; + pthread_exit((void *) &thread_result); + +} + + +/* + * Thread to capture data + */ +int recv_thread(void * arg) { + + // set affinity + const pthread_t pid = pthread_self(); + const int core_id = 34; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id); + const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id); + if (CPU_ISSET(core_id, &cpuset)) + syslog(LOG_INFO,"thread %d: successfully set thread",core_id); + + + udpdb_t * udpdb = (udpdb_t *) arg; + + /* START WHAT WAS in RECV THREAD */ + + // DEFINITIONS + + uint64_t act_seq_no = 0; + uint64_t block_seq_no = 0; + uint64_t seq_no = 0; + uint64_t ch_id = 0; + uint64_t ant_id = 0; + unsigned char * b = (unsigned char *) udpdb->sock->buf; + size_t got = 0; // data received from a recv_from call + int errsv; // determine the sequence number boundaries for curr and next buffers + int64_t byte_offset = 0; // offset of current packet in bytes from start of block + uint64_t seq_byte = 0; // offset of current packet in bytes from start of obs + // for "saving" out of order packets near edges of blocks + unsigned int temp_idx = 0; + unsigned int temp_max = 1000; + char ** temp_buffers; //[temp_max][UDP_DATA]; + uint64_t * temp_seq_byte; + temp_buffers = (char **)malloc(sizeof(char *)*temp_max); + 
for (int i=0;isock->have_packet = 0; + + // incredibly tight loop to try and get a packet + while (!udpdb->sock->have_packet) + { + + // receive 1 packet into the socket buffer + got = recvfrom ( udpdb->sock->fd, udpdb->sock->buf, UDP_PAYLOAD, 0, NULL, NULL ); + + if (got == UDP_PAYLOAD) + { + udpdb->sock->have_packet = 1; + } + else if (got == -1) + { + errsv = errno; + if (errsv == EAGAIN) + { + udpdb->n_sleeps++; + if (udpdb->capture_started) + timeouts++; + if (timeouts > timeout_max) + syslog(LOG_INFO, "timeouts[%"PRIu64"] > timeout_max[%"PRIu64"]\n",timeouts, timeout_max); + } + else + { + syslog (LOG_ERR, "receive_obs: recvfrom failed %s", strerror(errsv)); + return EXIT_FAILURE; + } + } + else // we received a packet of the WRONG size, ignore it + { + syslog (LOG_NOTICE, "receive_obs: received %lu bytes, expected %d", got, UDP_PAYLOAD); + } + } + timeouts = 0; + + // we have a valid packet within the timeout + if (udpdb->sock->have_packet) + { + + // decode packet header (64 bits) + // 35 bits seq_no (for first spectrum in packet); 13 bits ch_id (for first channel in packet); 16 bits ant ID (for first antenna in packet) + seq_no = 0; + seq_no |= (((uint64_t)(udpdb->sock->buf[4]) & 224) >> 5) & 7; + seq_no |= (((uint64_t)(udpdb->sock->buf[3])) << 3) & 2040; + seq_no |= (((uint64_t)(udpdb->sock->buf[2])) << 11) & 522240; + seq_no |= (((uint64_t)(udpdb->sock->buf[1])) << 19) & 133693440; + seq_no |= (((uint64_t)(udpdb->sock->buf[0])) << 27) & 34225520640; + ant_id = 0; + ant_id |= (unsigned char) (udpdb->sock->buf[6]) << 8; + ant_id |= (unsigned char) (udpdb->sock->buf[7]); + + act_seq_no = seq_no*NCHANG*NSNAPS/2 + ant_id*NCHANG/3; // actual seq no + block_seq_no = UTC_START*NCHANG*NSNAPS/2; // seq no corresponding to ant 0 and start of block + + // check for starting or stopping condition, using continue + if (canWrite==0) { + if (seq_no >= UTC_START-50 && UTC_START != 10000) ct_snaps++; + if (ct_snaps >= 10) canWrite=1; + } + udpdb->last_seq = seq_no; + if 
(canWrite == 0) continue; + + // if first packet + if (!udpdb->capture_started) + { + udpdb->block_start_byte = block_seq_no * UDP_DATA; + udpdb->block_end_byte = (udpdb->block_start_byte + udpdb->hdu_bufsz) - UDP_DATA; + udpdb->capture_started = 1; + + syslog (LOG_INFO, "receive_obs: START [%"PRIu64" - %"PRIu64"]", udpdb->block_start_byte, udpdb->block_end_byte); + } + + // if capture running + if (udpdb->capture_started) + { + seq_byte = (act_seq_no * UDP_DATA); + + udpdb->last_byte = seq_byte; + + // if packet arrived too late, ignore + if (seq_byte < udpdb->block_start_byte) + { + udpdb->packets->dropped++; + udpdb->bytes->dropped += UDP_DATA; + } + else + { + // packet belongs in this block + if (seq_byte <= udpdb->block_end_byte) + { + byte_offset = seq_byte - udpdb->block_start_byte; + memcpy (udpdb->tblock + byte_offset + writeBlock*udpdb->hdu_bufsz, udpdb->sock->buf + UDP_HEADER, UDP_DATA); + udpdb->packets->received++; + udpdb->bytes->received += UDP_DATA; + udpdb->block_count++; + } + // packet belongs in subsequent block + else + { + + if (temp_idx < temp_max) + { + // save packet to temp buffer + memcpy (temp_buffers[temp_idx], udpdb->sock->buf + UDP_HEADER, UDP_DATA); + temp_seq_byte[temp_idx] = seq_byte; + temp_idx++; + } + else + { + udpdb->packets->dropped++; + udpdb->bytes->dropped += UDP_DATA; + } + } + } + } + + // now check for a full buffer or full temp queue + if ((udpdb->block_count >= udpdb->packets_per_buffer) || (temp_idx >= temp_max)) + { + syslog (LOG_INFO, "BLOCK COMPLETE seq_no=%"PRIu64", " + "ant_id=%"PRIu16", block_count=%"PRIu64", " + "temp_idx=%d\n", seq_no, ant_id, udpdb->block_count, + temp_idx); + + // write block + doWrite=1; + + uint64_t dropped = udpdb->packets_per_buffer - udpdb->block_count; + if (dropped) + { + udpdb->packets->dropped += dropped; + udpdb->bytes->dropped += (dropped * UDP_DATA); + } + + // increment counters + dsaX_udpdb_increment(udpdb); + + // write any temp packets saved + + if (DEBUG) syslog(LOG_INFO, 
"block bytes: %"PRIu64" - %"PRIu64"\n", udpdb->block_start_byte, udpdb->block_end_byte); + + // include any futuristic packets we saved + for (i=0; i < temp_idx; i++) + { + seq_byte = temp_seq_byte[i]; + byte_offset = seq_byte - udpdb->block_start_byte; + if (byte_offset < udpdb->hdu_bufsz) + { + memcpy (udpdb->tblock + byte_offset + writeBlock*udpdb->hdu_bufsz, temp_buffers[i], UDP_DATA); + udpdb->block_count++; + udpdb->packets->received++; + udpdb->bytes->received += UDP_DATA; + } + else + { + udpdb->packets->dropped++; + udpdb->bytes->dropped += UDP_DATA; + } + } + temp_idx = 0; + } + + } + + // packet has been inserted or saved by this point + udpdb->sock->have_packet = 0; + + + } + + + free(temp_buffers); + free(temp_seq_byte); + +} + +/* + * Thread to write data + */ +int write_thread(void * arg) { + + // set affinity + const pthread_t pid = pthread_self(); + const int core_id = 36; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id); + const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id); + if (CPU_ISSET(core_id, &cpuset)) + syslog(LOG_INFO,"thread %d: successfully set thread",core_id); + + + udpdb_t * udpdb = (udpdb_t *) arg; + int lWriteBlock = 0; + int a; + + while (!quit_threads) + { + + while (!doWrite) { + a=1; + } + + syslog(LOG_INFO,"writing block..."); + + memcpy(udpdb->block, udpdb->tblock + lWriteBlock*udpdb->hdu_bufsz, udpdb->hdu_bufsz); + + if (dsaX_udpdb_new_buffer (udpdb) < 0) + { + syslog(LOG_ERR, "receive_obs: dsaX_udpdb_new_buffer failed"); + return EXIT_FAILURE; + } + + doWrite=0; + if (lWriteBlock==0) lWriteBlock=1; + else lWriteBlock=0; + + } + +} + + + +// MAIN of program + +int main (int argc, char *argv[]) { + + + // startup syslog message + // 
using LOG_LOCAL0 + openlog ("dsaX_capture_thread", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + /* DADA Header plus Data Unit for writing */ + dada_hdu_t* hdu_out = 0; + + /* actual struct with info */ + udpdb_t udpdb; + + // input data block HDU key + key_t out_key = CAPTURE_BLOCK_KEY; + + // command line arguments + int core = -1; + int chgroup = 0; + int arg=0; + char dada_fnam[200]; // filename for dada header + char iface[100]; // IP for data packets + + while ((arg=getopt(argc,argv,"c:j:i:f:o:g:dh")) != -1) + { + switch (arg) + { + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'i': + if (optarg) + { + strcpy(iP,optarg); + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'g': + if (optarg) + { + chgroup = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-g flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'j': + if (optarg) + { + strcpy(iface,optarg); + break; + } + else + { + syslog(LOG_ERR,"-j flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'f': + if (optarg) + { + strcpy(dada_fnam,optarg); + break; + } + else + { + syslog(LOG_ERR,"-f flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'd': + DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + // record STATE info + sprintf(STATE,"NOBUFFER"); + + // START THREADS + + // start control thread + int rval = 0; + pthread_t control_thread_id, 
stats_thread_id; + if (DEBUG) + syslog (LOG_DEBUG, "Creating threads"); + rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb); + if (rval != 0) { + syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval)); + return -1; + } + syslog(LOG_NOTICE, "Created control thread, listening on %s:%d",iP,CAPTURE_CONTROL_PORT); + + // start the stats thread + rval = pthread_create (&stats_thread_id, 0, (void *) stats_thread, (void *) &udpdb); + if (rval != 0) { + syslog(LOG_INFO, "Error creating stats_thread: %s", strerror(rval)); + return -1; + } + syslog(LOG_NOTICE, "started stats_thread()"); + + + // Bind to cpu core + if (core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + // initialize the data structure + syslog (LOG_INFO, "main: dsaX_udpdb_init_receiver()"); + if (dsaX_udpdb_init_receiver (&udpdb) < 0) + { + syslog (LOG_ERR, "could not initialize receiver"); + return EXIT_FAILURE; + } + + + // OPEN CONNECTION TO DADA DB FOR WRITING + + if (DEBUG) syslog(LOG_DEBUG,"Creating HDU"); + + hdu_out = dada_hdu_create (0); + if (DEBUG) syslog(DEBUG,"Created hdu"); + dada_hdu_set_key (hdu_out, CAPTURE_BLOCK_KEY); + if (dada_hdu_connect (hdu_out) < 0) { + syslog(LOG_ERR,"could not connect to output dada buffer"); + return EXIT_FAILURE; + } + if (DEBUG) syslog(LOG_DEBUG,"Connected HDU"); + if (dada_hdu_lock_write(hdu_out) < 0) { + dsaX_dbgpu_cleanup (hdu_out); + syslog(LOG_ERR,"could not lock to output dada buffer"); + return EXIT_FAILURE; + } + + syslog(LOG_INFO,"opened connection to output DB"); + + // DEAL WITH DADA HEADER + char *hout; + hout = (char *)malloc(sizeof(char)*4096); + if (DEBUG) syslog(DEBUG,"read header2"); + + if (fileread (dada_fnam, hout, 4096) < 0) + { + free (hout); + syslog (LOG_ERR, "could not read ASCII header from %s", dada_fnam); + return (EXIT_FAILURE); + } + + + if (DEBUG) syslog(DEBUG,"read 
header3"); + + + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_out); + return EXIT_FAILURE; + } + + + + // copy the in header to the out header + memcpy (header_out, hout, 4096); + + // mark the output header buffer as filled + if (ipcbuf_mark_filled (hdu_out->header_block, 4096) < 0) + { + syslog(LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_out); + return EXIT_FAILURE; + } + + // record STATE info + sprintf(STATE,"LISTEN"); + syslog(LOG_INFO,"marked output header block as filled - now in LISTEN state"); + + + /* time to start up receiver. + data are captured on iface:CAPTURE_PORT + */ + + printf("here\n"); + + + // put information in udpdb struct + udpdb.hdu = hdu_out; + udpdb.port = CAPTURE_PORT; + udpdb.interface = strdup(iface); + udpdb.hdu_bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + char * tblock = (char *)malloc(sizeof(char)*udpdb.hdu_bufsz); + udpdb.tblock = tblock; + // determine number of packets per block, must + if (udpdb.hdu_bufsz % UDP_DATA != 0) + { + syslog(LOG_ERR, "data block size for [%"PRIu64"] was not a multiple of the UDP_DATA size [%d]\n", udpdb.hdu_bufsz, UDP_DATA); + return EXIT_FAILURE; + } + udpdb.packets_per_buffer = udpdb.hdu_bufsz / UDP_DATA; + udpdb.bytes_to_acquire = 0; + udpdb.num_inputs = NSNAPS; + + // prepare the socket + syslog(LOG_INFO, "main: dsaX_udpdb_prepare()"); + if (dsaX_udpdb_prepare (&udpdb) < 0) + { + syslog(LOG_ERR, "could allocate required resources (prepare)"); + return EXIT_FAILURE; + } + + // reset the receiver + syslog(LOG_INFO, "main: dsaX_udpdb_reset_receiver()"); + dsaX_udpdb_reset_receiver (&udpdb); + + // open a block of the data block, ready for writing + if (dsaX_udpdb_open_buffer (&udpdb) < 0) + { + syslog (LOG_ERR, "start: dsaX_udpdb_open_buffer failed"); + return -1; + } + + + // start threads + + // start recv 
thread + rval = 0; + pthread_t recv_thread_id, write_thread_id; + rval = pthread_create (&recv_thread_id, 0, (void *) recv_thread, (void *) &udpdb); + if (rval != 0) { + syslog(LOG_ERR, "Error creating recv_thread: %s", strerror(rval)); + return -1; + } + syslog(LOG_NOTICE, "Created recv thread"); + + // start the write thread + rval = pthread_create (&write_thread_id, 0, (void *) write_thread, (void *) &udpdb); + if (rval != 0) { + syslog(LOG_INFO, "Error creating write_thread: %s", strerror(rval)); + return -1; + } + syslog(LOG_NOTICE, "started write_thread()"); + + while (!quit_threads) { + sleep(1); + } + + // close threads + syslog(LOG_INFO, "joining all threads"); + quit_threads = 1; + void* result=0; + pthread_join (control_thread_id, &result); + pthread_join (stats_thread_id, &result); + pthread_join (recv_thread_id, &result); + pthread_join (write_thread_id, &result); + + free(tblock); + + dsaX_dbgpu_cleanup (hdu_out); + +} diff --git a/legacy/dsaX_copydb.c b/legacy/dsaX_copydb.c new file mode 100644 index 0000000..7714038 --- /dev/null +++ b/legacy/dsaX_copydb.c @@ -0,0 +1,273 @@ +/* will reorder raw data for input to xgpu */ +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + +// global variables +int DEBUG = 0; + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) +{ + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + + if 
(dada_hdu_unlock_write (out) < 0) + { + syslog(LOG_ERR, "could not unlock write on hdu_out"); + } + dada_hdu_destroy (out); + +} + +void usage() +{ + fprintf (stdout, + "dsaX_fake [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -i in_key [default TEST_BLOCK_KEY]\n" + " -o out_key [default REORDER_BLOCK_KEY2]\n" + " -h print usage\n"); +} + +// MAIN + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_copydb", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + /* DADA Header plus Data Unit */ + dada_hdu_t* hdu_in = 0; + dada_hdu_t* hdu_out = 0; + + // data block HDU keys + key_t in_key = TEST_BLOCK_KEY; + key_t out_key = REORDER_BLOCK_KEY2; + + // command line arguments + int core = -1; + int useZ = 1; + char fnam[100]; + int arg = 0; + + while ((arg=getopt(argc,argv,"c:f:i:o:dh")) != -1) + { + switch (arg) + { + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'd': + DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + // Bind to cpu core + if (core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + 
syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (0); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, 
"main: have input and output block sizes %lu %lu\n",block_size,block_out); + uint64_t bytes_read = 0; + char * block; + uint64_t written, block_id; + + + // set up + int observation_complete=0; + int blocks = 0, started = 0; + + syslog(LOG_INFO, "starting observation"); + + while (!observation_complete) { + + // open block + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + } + + // DO STUFF + + written = ipcio_write (hdu_out->data_block, block, block_out); + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + if (DEBUG) { + syslog(LOG_DEBUG, "written block %d",blocks); + } + blocks++; + + + + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + +} + + diff --git a/legacy/dsaX_cuda_correlator.cu b/legacy/dsaX_cuda_correlator.cu new file mode 100644 index 0000000..3bebd09 --- /dev/null +++ b/legacy/dsaX_cuda_correlator.cu @@ -0,0 +1,309 @@ +// -*- c++ -*- +/* will run xgpu */ +/* assumes input block size is appropriate */ +#define THRUST_IGNORE_CUB_VERSION_CHECK + +#include +#include +using std::cout; +using std::cerr; +using std::endl; +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +//#include "dada_cuda.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "multilog.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_def.h" +//#include "cube/cube.h" +#include "xgpu.h" + + +#ifdef __MACH__ +#include +#define CLOCK_REALTIME 0 +#define CLOCK_MONOTONIC 0 +int clock_gettime(int clk_id, struct timespec *t){ + 
mach_timebase_info_data_t timebase; + mach_timebase_info(&timebase); + uint64_t time; + time = mach_absolute_time(); + double nseconds = ((double)time * (double)timebase.numer)/((double)timebase.denom); + double seconds = ((double)time * (double)timebase.numer)/((double)timebase.denom * 1e9); + t->tv_sec = seconds; + t->tv_nsec = nseconds; + return 0; +} +#else +#include +#endif + +/* + Data ordering for input vectors is (running from slowest to fastest) + [time][channel][station][polarization][complexity] + + Output matrix has ordering + [channel][station][station][polarization][polarization][complexity] +*/ + +int main(int argc, char** argv) { + + int opt; + int i, j; + int device = 0; + unsigned int seed = 1; + int outer_count = 1; + int count = 1; + int syncOp = SYNCOP_SYNC_TRANSFER; + int finalSyncOp = SYNCOP_DUMP; + int verbose = 0; + int hostAlloc = 0; + XGPUInfo xgpu_info; + unsigned int npol, nstation, nfrequency; + int xgpu_error = 0; + Complex *omp_matrix_h = NULL; + struct timespec outer_start, start, stop, outer_stop; + double total, per_call, max_bw, gbps; +#ifdef RUNTIME_STATS + struct timespec tic, toc; +#endif + + while ((opt = getopt(argc, argv, "C:c:d:f:ho:rs:v:")) != -1) { + switch (opt) { + case 'c': + // Set number of time to call xgpuCudaXengine + count = strtoul(optarg, NULL, 0); + if(count < 1) { + fprintf(stderr, "count must be positive\n"); + return 1; + } + break; + case 'C': + // Set number of time to call xgpuCudaXengine + outer_count = strtoul(optarg, NULL, 0); + if(outer_count < 1) { + fprintf(stderr, "outer count must be positive\n"); + return 1; + } + break; + case 'd': + // Set CUDA device number + device = strtoul(optarg, NULL, 0); + break; + case 'f': + // Set syncOp for final call + finalSyncOp = strtoul(optarg, NULL, 0); + break; + case 'o': + // Set syncOp + syncOp = strtoul(optarg, NULL, 0); + break; + case 'r': + // Register host allocated memory + hostAlloc = 1; + break; + case 's': + // Set seed for random data + seed = 
strtoul(optarg, NULL, 0); + break; + case 'v': + // Set verbosity level + verbose = strtoul(optarg, NULL, 0); + break; + default: /* '?' */ + fprintf(stderr, + "Usage: %s [options]\n" + "Options:\n" + " -c INTEG_CALLS Calls to xgpuCudaXengine per integration [1]\n" + " -C INTEG_COUNT Number of integrations [1]\n" + " -d DEVNUM GPU device to use [0]\n" + " -f FINAL_SYNCOP Sync operation for final call [1]\n" + " -o SYNCOP Sync operation for all but final call [1]\n" + " Sync operation values are:\n" + " 0 (no sync)\n" + " 1 (sync and dump)\n" + " 2 (sync host to device transfer)\n" + " 3 (sync kernel computations)\n" + " -r Register host allocated memory [false]\n" + " (otherwise use CUDA allocated memory)\n" + " -s SEED Random number seed [1]\n" + " -v {0|1|2|3} Verbosity level (debug only) [0]\n" + " -h Show this message\n", + argv[0]); + exit(EXIT_FAILURE); + } + } + + srand(seed); + + // Get sizing info from library + xgpuInfo(&xgpu_info); + npol = xgpu_info.npol; + nstation = xgpu_info.nstation; + nfrequency = xgpu_info.nfrequency; + + printf("Correlating %u stations with %u channels and integration length %u\n", + xgpu_info.nstation, xgpu_info.nfrequency, xgpu_info.ntime); +#ifndef FIXED_POINT + printf("Sending floating point data to GPU.\n"); +#else + printf("Sending fixed point data to GPU.\n"); +#endif + + // perform host memory allocation + + // allocate the GPU X-engine memory + XGPUContext context; + context.array_len = xgpu_info.vecLength; + context.matrix_len = xgpu_info.matLength; + context.array_h = NULL; + context.matrix_h = NULL; + + xgpu_error = xgpuInit(&context, device); + + ComplexInput *array_h = context.array_h; // this is pinned memory + Complex *cuda_matrix_h = context.matrix_h; + + // create an array of complex noise + xgpuRandomComplex(array_h, xgpu_info.vecLength); + + xgpuSwizzleInput(context.array_h, array_h); + + // try copying to GPU + ComplexInput *array_hd; + cudaMalloc((void **)&array_hd, context.array_len*sizeof(ComplexInput)); + 
cudaMemcpy(array_hd,context.array_h,context.array_len*sizeof(ComplexInput),cudaMemcpyHostToDevice); + + // ompXengine always uses TRIANGULAR_ORDER + unsigned int ompMatLength = nfrequency * ((nstation+1)*(nstation/2)*npol*npol); + omp_matrix_h = (Complex *) malloc(ompMatLength*sizeof(Complex)); + if(!omp_matrix_h) { + fprintf(stderr, "error allocating output buffer for xgpuOmpXengine\n"); + goto cleanup; + } + +#if (CUBE_MODE == CUBE_DEFAULT && !defined(POWER_LOOP) ) + // Only call CPU X engine if dumping GPU X engine exactly once + if(finalSyncOp == SYNCOP_DUMP && count*outer_count == 1) { + printf("Calling CPU X-Engine\n"); + xgpuOmpXengine(omp_matrix_h, array_h); + } +#endif + +#define ELAPSED_MS(start,stop) \ + ((((int64_t)stop.tv_sec-start.tv_sec)*1000*1000*1000+(stop.tv_nsec-start.tv_nsec))/1e6) + + printf("Calling GPU X-Engine\n"); + clock_gettime(CLOCK_MONOTONIC, &outer_start); + for(j=0; j 1) { + clock_gettime(CLOCK_MONOTONIC, &outer_stop); + total = ELAPSED_MS(outer_start,outer_stop); + per_call = total/(count*outer_count); + // per_spectrum = per_call / NTIME + // per_channel = per_spectrum / NFREQUENCY + // = per_call / (NTIME * NFREQUENCY) + // max_bw (kHz) = 1 / per_channel = (NTIME * NFREQUENCY) / per_call + max_bw = xgpu_info.ntime*xgpu_info.nfrequency/per_call/1000; // MHz + gbps = ((float)(8 * context.array_len * sizeof(ComplexInput) * count * outer_count)) / total / 1e6; // Gbps + printf("Elapsed time %.6f ms total, %.6f ms/call average\n", + total, per_call); + printf("Theoretical BW_max %.3f MHz, throughput %.3f Gbps\n", + max_bw, gbps); + } + +#if (CUBE_MODE == CUBE_DEFAULT) + + // Only compare CPU and GPU X engines if dumping GPU X engine exactly once + if(finalSyncOp == SYNCOP_DUMP && count*outer_count == 1) { + xgpuReorderMatrix(cuda_matrix_h); + xgpuCheckResult(cuda_matrix_h, omp_matrix_h, verbose, array_h); + } + +#if 0 + int fullMatLength = nfrequency * nstation*nstation*npol*npol; + Complex *full_matrix_h = (Complex *) 
malloc(fullMatLength*sizeof(Complex)); + + // convert from packed triangular to full matrix + xgpuExtractMatrix(full_matrix_h, cuda_matrix_h); + + free(full_matrix_h); +#endif +#endif + +cleanup: + //free host memory + free(omp_matrix_h); + + // free gpu memory + xgpuFree(&context); + cudaFree(array_hd); + +#ifdef DP4A + free(array_h); +#endif + + /* if(hostAlloc) { + free(context.array_h); + free(context.matrix_h); + }*/ + + return xgpu_error; +} diff --git a/legacy/dsaX_cutlass_interface.cu b/legacy/dsaX_cutlass_interface.cu new file mode 100644 index 0000000..fc68d55 --- /dev/null +++ b/legacy/dsaX_cutlass_interface.cu @@ -0,0 +1,315 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "dsaX_cutlass_interface.h" + +DSA_FTD_ComplexGEMM_CUTLASS::DSA_FTD_ComplexGEMM_CUTLASS(Options const &options): + problem_size(options.problem_size), batch_count(options.batch_count) { + + // Allocate device memory for batched planar complex GEMM + tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2); + tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2); + tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + + ptr_A_real.reset(batch_count); + ptr_A_imag.reset(batch_count); + ptr_B_real.reset(batch_count); + ptr_B_imag.reset(batch_count); + ptr_C_real.reset(batch_count); + ptr_C_imag.reset(batch_count); + ptr_D_real.reset(batch_count); + ptr_D_imag.reset(batch_count); +} + +// DMH: Replace this with data from DSA-FTD +void DSA_FTD_ComplexGEMM_CUTLASS::initialize() { + + if(testing) { + uint64_t seed = 1234; + + // Use small integers to simplify correctness checking + int scope_max = 6; + int scope_min = -6; + + BlockFillRandomUniform(tensor_A.get(), tensor_A.size(), seed, Element(scope_max), Element(scope_min), 0); + BlockFillRandomUniform(tensor_B.get(), tensor_B.size(), 
seed * 2019, Element(scope_max), Element(scope_min), 0); + BlockFillRandomUniform(tensor_C.get(), tensor_C.size(), seed * 2020, Element(scope_max), Element(scope_min), 0); + } else { + // DMH: construct DSA-FTD interface data transfer interface + } + + ptr_A = tensor_A.get(); + ptr_B = tensor_B.get(); + ptr_C = tensor_C.get(); + ptr_D = tensor_D.get(); + + batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2; + batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2; + batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2; + batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2; + + lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0); + ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0); + ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + + imag_stride_A = int64_t(problem_size.m()) * problem_size.k(); + imag_stride_B = int64_t(problem_size.k()) * problem_size.n(); + imag_stride_C = int64_t(problem_size.m()) * problem_size.n(); + imag_stride_D = int64_t(problem_size.m()) * problem_size.n(); + +} + +Result DSA_FTD_ComplexGEMM_CUTLASS::run(Options const &options) { + + Result result; + + initialize(); + + // Configure pointers in global memory + struct { + Element *base; + void **ptr_real; + void **ptr_imag; + int64_t batch_stride; + int64_t imag_stride; + } tensors[] = {{ tensor_A.get(), ptr_A_real.get(), ptr_A_imag.get(), batch_stride_A, imag_stride_A}, + { tensor_B.get(), ptr_B_real.get(), ptr_B_imag.get(), batch_stride_B, imag_stride_B}, + { tensor_C.get(), ptr_C_real.get(), ptr_C_imag.get(), batch_stride_C, imag_stride_C}, + { tensor_D.get(), ptr_D_real.get(), ptr_D_imag.get(), batch_stride_D, imag_stride_D}}; + + for (auto const &tensor : tensors) { + for (int idx = 0; idx < batch_count; ++idx) { + + cudaError_t error; + void *ptr_real = tensor.base + idx * tensor.batch_stride; + void 
*ptr_imag = tensor.base + idx * tensor.batch_stride + tensor.imag_stride; + + error = cudaMemcpy(tensor.ptr_real + idx, &ptr_real, sizeof(void *), cudaMemcpyHostToDevice); + if (error != cudaSuccess) throw std::runtime_error("Failed to copy pointer to device memory"); + + error = cudaMemcpy(tensor.ptr_imag + idx, &ptr_imag, sizeof(void *), cudaMemcpyHostToDevice); + if (error != cudaSuccess) throw std::runtime_error("Failed to copy pointer to device memory"); + + } + } + + + cudaEvent_t events[2]; + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // Record an event at the start of a series of GEMM operations + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Run profiling loop + //------------------- + // Execute the planar complex array GEMM kernel via the CUTLASS Library's + // dispatch routines. + // + // Note, for planar complex array GEMM kernels, all numeric type arguments + // specify the data type of the base real types. These are understood to + // apply to planar complex representations of matrices in memory and to complex + // structures for scalars. + // + // See tools/library/include/cutlass/library/handle.h for more details. 
+ // + for (int iter = 0; iter < options.iterations; ++iter) { + + result.status = handle.gemm_planar_complex_array( + problem_size.m(), // expected GEMM M dimension + problem_size.n(), // expected GEMM N dimension + problem_size.k(), // expected GEMM K dimension + batch_count, // Number of batched elements + + nullptr, + nullptr, + nullptr, + + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued accumulation + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued alpha/beta scalars + + &options.alpha, // Pointer to alpha scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued A matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of A matrix + cutlass::library::ComplexTransform::kConjugate, // Complex transformation on A matrix operand + + ptr_A_real.get(), // Pointer to array of pointers to real part of A matrix + ptr_A_imag.get(), // Pointer to array of pointers to imaginary part of A matrix + + lda, // Leading dimension of real part of A matrix + lda, // Leading dimension of imaginary part of A matrix + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued B matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of B matrix + cutlass::library::ComplexTransform::kNone, // Complex transformation on B matrix operand + + ptr_B_real.get(), // Pointer to array of pointers to real part of B matrix + ptr_B_imag.get(), // Pointer to array of pointers to imaginary part of B matrix + + ldb, // Leading dimension of real part of B matrix + ldb, // Leading dimension of imaginary part of B matrix + + &options.beta, // Pointer to beta scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex valued C and D matrices + + ptr_C_real.get(), // Pointer to array of pointers to real part of C matrix + ptr_C_imag.get(), // Pointer to array of pointers to imaginary part of C matrix + + ldc, // Leading dimension of 
real part of C matrix + ldc, // Leading dimension of imaginary part of C matrix + + ptr_D_real.get(), // Pointer to array of pointers to real part of D matrix + ptr_D_imag.get(), // Pointer to array of pointers to imaginary part of D matrix + + ldd, // Leading dimension of real part of D matrix + ldd // Leading dimension of imaginary part of D matrix + ); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "CUTLASS internal error - configuration not supported" << std::endl; + return result; + } + } + + // Record an event when the GEMM operations have been launched. + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. 
+ result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + if (handle.get_last_operation()) { + std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl; + } + + // Compute reference in device code + if (options.reference_check) { + + result.passed = true; + + for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) { + // Define the GEMM through templates + GemmPlanarComplex + (problem_size, options.alpha, + {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A}, + cutlass::ComplexTransform::kConjugate, + {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B}, + cutlass::ComplexTransform::kNone, + options.beta, + {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C}, + {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D} + ); + + Element epsilon = 0.1_hf; + Element nonzero_floor = 0.1_hf; + + result.passed = BlockCompareRelativelyEqual + ( + tensor_D.get() + idx * batch_stride_D, + tensor_D_ref.get() + idx * batch_stride_D, + batch_stride_D, + epsilon, + nonzero_floor + ); + } + + if (result.passed) std::cout << "Reference check passed." << std::endl; + else std::cerr << "Error - reference check failed." 
<< std::endl; + } + + std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " GFLOPs: " << result.gflops << std::endl; + + return result; +} + + int main(int argc, char const **args) { + cudaDeviceProp props; + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + Options options; + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // Compute GEMM + DSA_FTD_ComplexGEMM_CUTLASS gemm(options); + gemm.testing = true; + Result result = gemm.run(options); + + return result.passed ? 0 : -1; +} + diff --git a/legacy/dsaX_cutlass_interface.cu~ b/legacy/dsaX_cutlass_interface.cu~ new file mode 100644 index 0000000..a51d5a2 --- /dev/null +++ b/legacy/dsaX_cutlass_interface.cu~ @@ -0,0 +1,315 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "dsaX_cutlass_interface.h" + +DSA_FTD_ComplexGEMM_CUTLASS::DSA_FTD_ComplexGEMM_CUTLASS(Options const &options): + problem_size(options.problem_size), batch_count(options.batch_count) { + + // Allocate device memory for batched planar complex GEMM + tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2); + tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2); + tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + + ptr_A_real.reset(batch_count); + ptr_A_imag.reset(batch_count); + ptr_B_real.reset(batch_count); + ptr_B_imag.reset(batch_count); + ptr_C_real.reset(batch_count); + ptr_C_imag.reset(batch_count); + ptr_D_real.reset(batch_count); + ptr_D_imag.reset(batch_count); +} + +// DMH: Replace this with data from DSA-FTD +void DSA_FTD_ComplexGEMM_CUTLASS::initialize() { + + if(testing) { + uint64_t seed = 1234; + + // Use 
small integers to simplify correctness checking + int scope_max = 6; + int scope_min = -6; + + BlockFillRandomUniform(tensor_A.get(), tensor_A.size(), seed, Element(scope_max), Element(scope_min), 0); + BlockFillRandomUniform(tensor_B.get(), tensor_B.size(), seed * 2019, Element(scope_max), Element(scope_min), 0); + BlockFillRandomUniform(tensor_C.get(), tensor_C.size(), seed * 2020, Element(scope_max), Element(scope_min), 0); + } else { + // DMH: construct DSA-FTD interface data transfer interface + } + + ptr_A = tensor_A.get(); + ptr_B = tensor_B.get(); + ptr_C = tensor_C.get(); + ptr_D = tensor_D.get(); + + batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2; + batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2; + batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2; + batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2; + + lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0); + ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0); + ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + + imag_stride_A = int64_t(problem_size.m()) * problem_size.k(); + imag_stride_B = int64_t(problem_size.k()) * problem_size.n(); + imag_stride_C = int64_t(problem_size.m()) * problem_size.n(); + imag_stride_D = int64_t(problem_size.m()) * problem_size.n(); + +} + +Result DSA_FTD_ComplexGEMM_CUTLASS::run(Options const &options) { + + Result result; + + initialize(); + + // Configure pointers in global memory + struct { + Element *base; + void **ptr_real; + void **ptr_imag; + int64_t batch_stride; + int64_t imag_stride; + } tensors[] = {{ tensor_A.get(), ptr_A_real.get(), ptr_A_imag.get(), batch_stride_A, imag_stride_A}, + { tensor_B.get(), ptr_B_real.get(), ptr_B_imag.get(), batch_stride_B, imag_stride_B}, + { tensor_C.get(), ptr_C_real.get(), ptr_C_imag.get(), batch_stride_C, imag_stride_C}, + { 
tensor_D.get(), ptr_D_real.get(), ptr_D_imag.get(), batch_stride_D, imag_stride_D}}; + + for (auto const &tensor : tensors) { + for (int idx = 0; idx < batch_count; ++idx) { + + cudaError_t error; + void *ptr_real = tensor.base + idx * tensor.batch_stride; + void *ptr_imag = tensor.base + idx * tensor.batch_stride + tensor.imag_stride; + + error = cudaMemcpy(tensor.ptr_real + idx, &ptr_real, sizeof(void *), cudaMemcpyHostToDevice); + if (error != cudaSuccess) throw std::runtime_error("Failed to copy pointer to device memory"); + + error = cudaMemcpy(tensor.ptr_imag + idx, &ptr_imag, sizeof(void *), cudaMemcpyHostToDevice); + if (error != cudaSuccess) throw std::runtime_error("Failed to copy pointer to device memory"); + + } + } + + + cudaEvent_t events[2]; + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // Record an event at the start of a series of GEMM operations + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Run profiling loop + //------------------- + // Execute the planar complex array GEMM kernel via the CUTLASS Library's + // dispatch routines. + // + // Note, for planar complex array GEMM kernels, all numeric type arguments + // specify the data type of the base real types. These are understood to + // apply to planar complex representations of matrices in memory and to complex + // structures for scalars. + // + // See tools/library/include/cutlass/library/handle.h for more details. 
+ // + for (int iter = 0; iter < options.iterations; ++iter) { + + result.status = handle.gemm_planar_complex_array( + problem_size.m(), // expected GEMM M dimension + problem_size.n(), // expected GEMM N dimension + problem_size.k(), // expected GEMM K dimension + batch_count, // Number of batched elements + + nullptr, + nullptr, + nullptr, + + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued accumulation + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued alpha/beta scalars + + &options.alpha, // Pointer to alpha scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued A matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of A matrix + cutlass::library::ComplexTransform::kConjugate, // Complex transformation on A matrix operand + + ptr_A_real.get(), // Pointer to array of pointers to real part of A matrix + ptr_A_imag.get(), // Pointer to array of pointers to imaginary part of A matrix + + lda, // Leading dimension of real part of A matrix + lda, // Leading dimension of imaginary part of A matrix + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued B matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of B matrix + cutlass::library::ComplexTransform::kNone, // Complex transformation on B matrix operand + + ptr_B_real.get(), // Pointer to array of pointers to real part of B matrix + ptr_B_imag.get(), // Pointer to array of pointers to imaginary part of B matrix + + ldb, // Leading dimension of real part of B matrix + ldb, // Leading dimension of imaginary part of B matrix + + &options.beta, // Pointer to beta scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex valued C and D matrices + + ptr_C_real.get(), // Pointer to array of pointers to real part of C matrix + ptr_C_imag.get(), // Pointer to array of pointers to imaginary part of C matrix + + ldc, // Leading dimension of 
real part of C matrix + ldc, // Leading dimension of imaginary part of C matrix + + ptr_D_real.get(), // Pointer to array of pointers to real part of D matrix + ptr_D_imag.get(), // Pointer to array of pointers to imaginary part of D matrix + + ldd, // Leading dimension of real part of D matrix + ldd // Leading dimension of imaginary part of D matrix + ); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "CUTLASS internal error - configuration not supported" << std::endl; + return result; + } + } + + // Record an event when the GEMM operations have been launched. + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. 
+ result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + if (handle.get_last_operation()) { + std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl; + } + + // Compute reference in device code + if (options.reference_check) { + + result.passed = true; + + for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) { + // Define the GEMM through templates + GemmPlanarComplex + (problem_size, options.alpha, + {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A}, + cutlass::ComplexTransform::kConjugate, + {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B}, + cutlass::ComplexTransform::kNone, + options.beta, + {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C}, + {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D} + ); + + Element epsilon = 0.1_hf; + Element nonzero_floor = 0.1_hf; + + result.passed = BlockCompareRelativelyEqual + ( + tensor_D.get() + idx * batch_stride_D, + tensor_D_ref.get() + idx * batch_stride_D, + batch_stride_D, + epsilon, + nonzero_floor + ); + } + + if (result.passed) std::cout << "Reference check passed." << std::endl; + else std::cerr << "Error - reference check failed." 
<< std::endl; + } + + std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " GFLOPs: " << result.gflops << std::endl; + + return result; +} + + int main(int argc, char const **args) { + cudaDeviceProp props; + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + Options options; + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // Compute GEMM + testing = true; + DSA_FTD_ComplexGEMM_CUTLASS gemm(options); + Result result = gemm.run(options); + + return result.passed ? 0 : -1; +} + diff --git a/legacy/dsaX_cutlass_interface.h b/legacy/dsaX_cutlass_interface.h new file mode 100644 index 0000000..5aa753e --- /dev/null +++ b/legacy/dsaX_cutlass_interface.h @@ -0,0 +1,172 @@ +#pragma once + +#include +#include +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor_planar_complex.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/reference/device/gemm_planar_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/library/handle.h" + +using namespace cutlass; +using namespace gemm; +using namespace library; +using namespace layout; +using namespace reference; +using namespace device; + +// Result structure +struct Result { + + double runtime_ms; + double gflops; + Status status; + cudaError_t error; + bool passed; + + Result(double runtime_ms = 0, double gflops = 0, Status status = Status::kSuccess, cudaError_t error = cudaSuccess): + runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { } +}; + +// Command line options 
parsing (testing) +struct Options { + + bool help; + GemmCoord problem_size; + int batch_count; + complex alpha; + complex beta; + bool reference_check; + int iterations; + + Options(): + help(false), + problem_size({1024, 1024, 1024}), + batch_count(1), + reference_check(false), + iterations(20), + alpha(1), + beta() { } + + // Parses the command line + void parse(int argc, char const **args) { + + CommandLine cmd(argc, args); + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + cmd.get_cmd_line_argument("m", problem_size.m()); + cmd.get_cmd_line_argument("n", problem_size.n()); + cmd.get_cmd_line_argument("k", problem_size.k()); + cmd.get_cmd_line_argument("batch", batch_count); + + cmd.get_cmd_line_argument("alpha", alpha.real()); + cmd.get_cmd_line_argument("alpha_i", alpha.imag()); + cmd.get_cmd_line_argument("beta", beta.real()); + cmd.get_cmd_line_argument("beta_i", beta.imag()); + + cmd.get_cmd_line_argument("iterations", iterations); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "dsaX_cutlass_interface\n\n" + << " This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --m= GEMM M dimension\n" + << " --n= GEMM N dimension\n" + << " --k= GEMM K dimension\n" + << " --batch= Number of GEMM operations executed in one batch\n" + << " --alpha= Epilogue scalar alpha (real part)\n" + << " --alpha_i= Epilogue scalar alpha (imaginary part)\n" + << " --beta= Epilogue scalar beta (real part)\n\n" + << " --beta_i= Epilogue scalar beta (imaginary part)\n\n" + << " --iterations= Number of profiling iterations to perform.\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of real-valued multiply-adds + int64_t fmas = problem_size.product() * batch_count * 4; + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +/// Performance test environment for planar complex +class DSA_FTD_ComplexGEMM_CUTLASS { + + // Half-precision input and output + using Element = half_t; + + // Configurations for layouts and internal computation + using LayoutA = ColumnMajor; + using LayoutB = ColumnMajor; + using LayoutC = ColumnMajor; + using ElementCompute = float; + using ElementAccumulator = float; + + Handle handle; + + GemmCoord problem_size; + int batch_count; + DeviceAllocation tensor_A; + DeviceAllocation tensor_B; + DeviceAllocation tensor_C; + DeviceAllocation tensor_D; + DeviceAllocation tensor_D_ref; + + DeviceAllocation ptr_A_real; + DeviceAllocation ptr_A_imag; + DeviceAllocation ptr_B_real; + DeviceAllocation ptr_B_imag; + DeviceAllocation ptr_C_real; + DeviceAllocation ptr_C_imag; + DeviceAllocation ptr_D_real; + DeviceAllocation ptr_D_imag; + + Element *ptr_A; + Element *ptr_B; + Element *ptr_C; + Element *ptr_D; + + int64_t batch_stride_A; + int64_t 
batch_stride_B; + int64_t batch_stride_C; + int64_t batch_stride_D; + + typename LayoutA::Stride::Index lda; + typename LayoutB::Stride::Index ldb; + typename LayoutC::Stride::Index ldc; + typename LayoutC::Stride::Index ldd; + + int64_t imag_stride_A; + int64_t imag_stride_B; + int64_t imag_stride_C; + int64_t imag_stride_D; + +public: + // Constructors + DSA_FTD_ComplexGEMM_CUTLASS(Options const &options); + DSA_FTD_ComplexGEMM_CUTLASS(); + + // Methods + void initialize(); + Result run(Options const &options); + + bool testing; +}; + diff --git a/legacy/dsaX_cutlass_interface.h~ b/legacy/dsaX_cutlass_interface.h~ new file mode 100644 index 0000000..42a3e8a --- /dev/null +++ b/legacy/dsaX_cutlass_interface.h~ @@ -0,0 +1,174 @@ +#pragma once + +#include +#include +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor_planar_complex.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/reference/device/gemm_planar_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/library/handle.h" + +using namespace cutlass; +using namespace gemm; +using namespace library; +using namespace layout; +using namespace reference; +using namespace device; + +// Result structure +struct Result { + + double runtime_ms; + double gflops; + Status status; + cudaError_t error; + bool passed; + + Result(double runtime_ms = 0, double gflops = 0, Status status = Status::kSuccess, cudaError_t error = cudaSuccess): + runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { } +}; + +// Command line options parsing (testing) +struct Options { + + bool help; + GemmCoord problem_size; + int batch_count; + complex alpha; + complex beta; + bool reference_check; + int iterations; + + Options(): + help(false), 
+ problem_size({1024, 1024, 1024}), + batch_count(1), + reference_check(false), + iterations(20), + alpha(1), + beta() { } + + // Parses the command line + void parse(int argc, char const **args) { + + CommandLine cmd(argc, args); + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + cmd.get_cmd_line_argument("m", problem_size.m()); + cmd.get_cmd_line_argument("n", problem_size.n()); + cmd.get_cmd_line_argument("k", problem_size.k()); + cmd.get_cmd_line_argument("batch", batch_count); + + cmd.get_cmd_line_argument("alpha", alpha.real()); + cmd.get_cmd_line_argument("alpha_i", alpha.imag()); + cmd.get_cmd_line_argument("beta", beta.real()); + cmd.get_cmd_line_argument("beta_i", beta.imag()); + + cmd.get_cmd_line_argument("iterations", iterations); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "dsaX_cutlass_interface\n\n" + << " This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --m= GEMM M dimension\n" + << " --n= GEMM N dimension\n" + << " --k= GEMM K dimension\n" + << " --batch= Number of GEMM operations executed in one batch\n" + << " --alpha= Epilogue scalar alpha (real part)\n" + << " --alpha_i= Epilogue scalar alpha (imaginary part)\n" + << " --beta= Epilogue scalar beta (real part)\n\n" + << " --beta_i= Epilogue scalar beta (imaginary part)\n\n" + << " --iterations= Number of profiling iterations to perform.\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of real-valued multiply-adds + int64_t fmas = problem_size.product() * batch_count * 4; + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +/// Performance test environment for planar complex +class DSA_FTD_ComplexGEMM_CUTLASS { + + // Half-precision input and output + using Element = 
half_t; + + // Configurations for layouts and internal computation + using LayoutA = ColumnMajor; + using LayoutB = ColumnMajor; + using LayoutC = ColumnMajor; + using ElementCompute = float; + using ElementAccumulator = float; + + Handle handle; + + GemmCoord problem_size; + int batch_count; + DeviceAllocation tensor_A; + DeviceAllocation tensor_B; + DeviceAllocation tensor_C; + DeviceAllocation tensor_D; + DeviceAllocation tensor_D_ref; + + DeviceAllocation ptr_A_real; + DeviceAllocation ptr_A_imag; + DeviceAllocation ptr_B_real; + DeviceAllocation ptr_B_imag; + DeviceAllocation ptr_C_real; + DeviceAllocation ptr_C_imag; + DeviceAllocation ptr_D_real; + DeviceAllocation ptr_D_imag; + + Element *ptr_A; + Element *ptr_B; + Element *ptr_C; + Element *ptr_D; + + int64_t batch_stride_A; + int64_t batch_stride_B; + int64_t batch_stride_C; + int64_t batch_stride_D; + + typename LayoutA::Stride::Index lda; + typename LayoutB::Stride::Index ldb; + typename LayoutC::Stride::Index ldc; + typename LayoutC::Stride::Index ldd; + + int64_t imag_stride_A; + int64_t imag_stride_B; + int64_t imag_stride_C; + int64_t imag_stride_D; + + bool testing; + +public: + // Constructors + DSA_FTD_ComplexGEMM_CUTLASS(Options const &options); + DSA_FTD_ComplexGEMM_CUTLASS(); + + // Methods + void initialize(); + Result run(Options const &options); + + +}; + diff --git a/legacy/dsaX_dbnic.c b/legacy/dsaX_dbnic.c new file mode 100644 index 0000000..83e3e4a --- /dev/null +++ b/legacy/dsaX_dbnic.c @@ -0,0 +1,435 @@ +/* simple nicdb + +will work on NBMS/NBEAMS_PER_BLOCK writers, ip addresses set in code for now + +*/ +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" 
+#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + + +// data to pass to threads +struct data { + char * out; + int sockfd; + struct sockaddr_in si_other; + int thread_id; + int chgroup; + int tseq; +}; + +/* global variables */ +int DEBUG = 0; +int TEST = 0; + +void dsaX_dbgpu_cleanup (dada_hdu_t * in); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * in) +{ + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + +} + +void usage() +{ + fprintf (stdout, + "dsaX_dbnic [options]\n" + " -c core bind process to CPU core [no default]\n" + " -g chgroup [default 0]\n" + " -d send debug messages to syslog\n" + " -t TEST\n" + " -i in_key [default BF_BLOCK_KEY]\n" + " -w -x -y -z four ip addresses for corner turn\n" + " -h print usage\n"); +} + +/* thread for data transmission */ +void * transmit(void *args) { + + // basic stuff + struct data *d = args; + int thread_id = d->thread_id; + int sockfd = d->sockfd; + struct sockaddr_in si_other = d->si_other; + char * output = (char *)(d->out); + int chgroup = d->chgroup; + int tseq = d->tseq; + char * packet = (char *)malloc(sizeof(char)*P_SIZE); + int * ipacket = (int *)(packet); + + + // for test packet + if (tseq==-1) { + + ipacket[0] = chgroup; + sendto(sockfd,packet,P_SIZE,0,(struct sockaddr *)&si_other,sizeof(si_other)); + + } + else { + + // fill op, doing transpose + char * op = (char *)malloc(sizeof(char)*(NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW)); + //iop[0] = chgroup; + //iop[1] = tseq; + for (int i=0;i= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could 
not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in); + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in); + return EXIT_FAILURE; + } + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %lu",block_size); + uint64_t bytes_read = 0; + char *block; + uint64_t written, block_id; + + + // set up + int observation_complete=0; + int blocks = 0; + int started = 0; + int nthreads = NBMS / NBEAMS_PER_BLOCK; + + + // create socket connections + int sockfd[nthreads]; + struct sockaddr_in servaddr[nthreads]; + + for (int i=0;idata_block, &bytes_read, &block_id); + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + } + + // DO STUFF + + //if (DEBUG) syslog(LOG_DEBUG,"creating %d threads",nthreads); + + // put together args + for (int i=0; idata_block, bytes_read); + + } + + for (int i=0;i +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + + +// data to pass to threads +struct 
data { + char * out; + int sockfd; + int thread_id; + int chgroup; + int tseq; +}; + +/* global variables */ +int DEBUG = 0; +int TEST = 0; + +void dsaX_dbgpu_cleanup (dada_hdu_t * in); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * in) +{ + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + +} + +void usage() +{ + fprintf (stdout, + "dsaX_dbnic [options]\n" + " -c core bind process to CPU core [no default]\n" + " -g chgroup [default 0]\n" + " -d send debug messages to syslog\n" + " -t TEST\n" + " -i in_key [default BF_BLOCK_KEY]\n" + " -w -x -y -z four ip addresses for corner turn\n" + " -h print usage\n"); +} + +/* thread for data transmission */ +void * transmit(void *args) { + + // basic stuff + struct data *d = args; + int thread_id = d->thread_id; + int sockfd = d->sockfd; + char * output = (char *)(d->out); + char * op = (char *)malloc(sizeof(char)*(8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW)); + int * iop = (int *)(op); + int chgroup = d->chgroup; + int tseq = d->tseq; + + // fill op, doing transpose + iop[0] = chgroup; + iop[1] = tseq; + for (int i=0;i0) && (remain_data > 0)) { + remain_data -= sbytes; + sent_bytes += sbytes; + }*/ + sbytes = send(sockfd, op, remain_data, 0); + if (sbytes= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, 
&header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in); + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in); + return EXIT_FAILURE; + } + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %llu",block_size); + uint64_t bytes_read = 0; + char *block; + uint64_t written, block_id; + + + // set up + int observation_complete=0; + int blocks = 0; + int started = 0; + int nthreads = NBMS / NBEAMS_PER_BLOCK; + + + // create socket connections + int sockfd[nthreads]; + struct sockaddr_in servaddr; + for (int i=0;idata_block, &bytes_read, &block_id); + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + } + + // DO STUFF + + //if (DEBUG) syslog(LOG_DEBUG,"creating %d threads",nthreads); + + // put together args + for (int i=0; idata_block, bytes_read); + + } + + for (int i=0;i +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + +// global variables +int DEBUG = 0; + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) +{ + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + + if 
(dada_hdu_unlock_write (out) < 0) + { + syslog(LOG_ERR, "could not unlock write on hdu_out"); + } + dada_hdu_destroy (out); + +} + +void usage() +{ + fprintf (stdout, + "dsaX_fake [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -f file to read packet from [default none]\n" + " -i in_key [default TEST_BLOCK_KEY]\n" + " -o out_key [default REORDER_BLOCK_KEY2]\n" + " -h print usage\n"); +} + +// MAIN + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_fake", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + /* DADA Header plus Data Unit */ + dada_hdu_t* hdu_in = 0; + dada_hdu_t* hdu_out = 0; + + // data block HDU keys + key_t in_key = TEST_BLOCK_KEY; + key_t out_key = REORDER_BLOCK_KEY2; + + // command line arguments + int core = -1; + int useZ = 1; + char fnam[100]; + int arg = 0; + + while ((arg=getopt(argc,argv,"c:f:i:o:dh")) != -1) + { + switch (arg) + { + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'f': + if (optarg) + { + useZ = 0; + strcpy(fnam,optarg); + break; + } + else + { + syslog(LOG_ERR,"-f flag requires argument"); + usage(); + return EXIT_FAILURE; + } + + case 'd': + DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug 
messages"); + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + // Bind to cpu core + if (core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (0); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN 
state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); + uint64_t bytes_read = 0; + uint64_t npackets = block_out / 4608; + char * block, * output_buffer; + char * packet; + packet = (char *)malloc(sizeof(char)*4608); + output_buffer = (char *)malloc(sizeof(char)*block_out); + memset(output_buffer,0,block_out); + uint64_t written, block_id; + + // fill output buffer if file exists + FILE *fin; + if (!useZ) { + + if (!(fin=fopen(fnam,"rb"))) { + syslog(LOG_ERR, "cannot open file - will write zeros"); + } + else { + + fread(packet,4608,1,fin); + fclose(fin); + + syslog(LOG_INFO,"Read packet, npackets %lu",npackets); + + for (int i=0;idata_block, &bytes_read, &block_id); + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + } + + // DO STUFF + // no need to do anything here - output_buffer is ready to go + + // write to output + written = ipcio_write (hdu_out->data_block, output_buffer, block_out); + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + if (DEBUG) { + syslog(LOG_DEBUG, "written block %d",blocks); + } + blocks++; + + + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + free(packet); + free(output_buffer); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + +} + + diff --git a/legacy/dsaX_filTrigger.c b/legacy/dsaX_filTrigger.c new file mode 100644 index 0000000..55f95fd --- /dev/null +++ b/legacy/dsaX_filTrigger.c @@ -0,0 +1,559 @@ +/* Code to read from a single dada buffer, and write to disk upon receiving +a trigger. Uses pthread threads and shared memory to listen. 
+Sequence of events: + - starts null-reading dump buffer, while listening for socket command + + for N second dump, assume N-second dada blocks + - receives time-since-start, which is converted into a block_start, byte_start, and block_end and byte_end. Sets dump pending, during which time no commands can be accepted. + - Upon seeing dump_pending, read code copies data to output dada buffer, which is plugged into dbdisk. Unsets dump_pending. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dsaX_capture.h" +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_def.h" + +/* global variables */ +int quit_threads = 0; +int dump_pending = 0; +uint64_t specnum = 0; +uint64_t next_specnum = 0; +uint64_t procnum = 0; +int trignum = 0; +int dumpnum = 0; +char iP[100]; +char footer_buf[1024]; +char next_footer_buf[1024]; +int DEBUG = 0; + +void dsaX_dbgpu_cleanup (dada_hdu_t * in); +int dada_bind_thread_to_core (int core); + +FILE *output; + +void send_string(char *string) /* includefile */ +{ + int len; + len=strlen(string); + fwrite(&len, sizeof(int), 1, output); + fwrite(string, sizeof(char), len, output); +} + +void send_float(char *name,float floating_point) /* includefile */ +{ + send_string(name); + fwrite(&floating_point,sizeof(float),1,output); +} + +void send_double (char *name, double double_precision) /* includefile */ +{ + send_string(name); + fwrite(&double_precision,sizeof(double),1,output); +} + +void send_int(char *name, int integer) /* includefile */ +{ + send_string(name); + fwrite(&integer,sizeof(int),1,output); +} + +void send_char(char *name, char integer) /* includefile */ +{ + send_string(name); + fwrite(&integer,sizeof(char),1,output); 
+} + + +void send_long(char *name, long integer) /* includefile */ +{ + send_string(name); + fwrite(&integer,sizeof(long),1,output); +} + +void send_coords(double raj, double dej, double az, double za) /*includefile*/ +{ + if ((raj != 0.0) || (raj != -1.0)) send_double("src_raj",raj); + if ((dej != 0.0) || (dej != -1.0)) send_double("src_dej",dej); + if ((az != 0.0) || (az != -1.0)) send_double("az_start",az); + if ((za != 0.0) || (za != -1.0)) send_double("za_start",za); +} + +void dsaX_dbgpu_cleanup (dada_hdu_t * in) +{ + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + +} + +void usage() +{ + fprintf (stdout, + "dsaX_filTrigger [options]\n" + " -c core bind process to CPU core\n" + " -i IP to listen to [no default]\n" + " -j in_key [default eaea]\n" + " -d debug\n" + " -n output file name base [no default]\n" + " -b beam number of first beam [default 0]\n" + " -z respond to zero specnum\n" + " -h print usage\n"); +} + + +// Thread to control the dumping of data + +void control_thread (void * arg) { + + udpdb_t * ctx = (udpdb_t *) arg; + syslog(LOG_INFO, "control_thread: starting"); + + // port on which to listen for control commands + int port = ctx->control_port; + + // buffer for incoming command strings, and setup of socket + int bufsize = 1024; + char* buffer = (char *) malloc (sizeof(char) * bufsize); + char* tbuf = (char *) malloc (sizeof(char) * bufsize); + memset(buffer, '\0', bufsize); + const char* whitespace = " "; + char * command = 0; + char * args = 0; + + struct addrinfo hints; + struct addrinfo* res=0; + memset(&hints,0,sizeof(hints)); + struct sockaddr_storage src_addr; + socklen_t src_addr_len=sizeof(src_addr); + hints.ai_family=AF_INET; + hints.ai_socktype=SOCK_DGRAM; + getaddrinfo(iP,"11227",&hints,&res); + int fd; + ssize_t ct; + char tmpstr; + char cmpstr = 'p'; + char *endptr; + uint64_t tmps; + char * token; + + syslog(LOG_INFO, "control_thread: created socket on 
port %d", port); + + while (!quit_threads) { + + fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol); + bind(fd,res->ai_addr,res->ai_addrlen); + memset(buffer,'\0',sizeof(buffer)); + syslog(LOG_INFO, "control_thread: waiting for packet"); + ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len); + + syslog(LOG_INFO, "control_thread: received buffer string %s",buffer); + strcpy(tbuf,buffer); + trignum++; + + // interpret buffer string + char * rest = buffer; + char tnam[100]; + tmps = (uint64_t)(strtoull(strtok_r(rest, "-", &rest),&endptr,0)); + strcpy(tnam,strtok_r(rest, "-", &rest)); + + if (!dump_pending) { + //specnum = (uint64_t)(strtoull(buffer,&endptr,0)*16); + specnum = tmps/4; + strcpy(footer_buf,tnam); + syslog(LOG_INFO, "control_thread: received command to dump at %lu src %s",specnum,footer_buf); + } + + if (dump_pending) { + syslog(LOG_ERR, "control_thread: BACKED UP - using %lu src %s as next specnum",tmps,tnam); + next_specnum = tmps/4; + strcpy(next_footer_buf,tnam); + } + + if (!dump_pending) dump_pending = 1; + + close(fd); + + } + + free (buffer); + free (tbuf); + + if (ctx->verbose) + syslog(LOG_INFO, "control_thread: exiting"); + + /* return 0 */ + int thread_result = 0; + pthread_exit((void *) &thread_result); + +} + + + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_filTrigger", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + + /* DADA Header plus Data Unit */ + dada_hdu_t* hdu_in = 0; + + /* port for control commands */ + int control_port = TRIGGER_CONTROL_PORT; + + /* actual struct with info */ + udpdb_t udpdb; + + // input data block HDU key + key_t in_key = 0x0000eaea; + + // command line arguments + int core = -1; + int beamn = 0; + char of[200]; + char foutnam[300]; + char dirnam[300]; + int rz=0; + int arg=0; + + while ((arg=getopt(argc,argv,"i:c:j:db:n:hz")) != -1) + { + switch (arg) + 
{ + case 'i': + strcpy(iP,optarg); + break; + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog (LOG_ERR,"ERROR: -c flag requires argument\n"); + return EXIT_FAILURE; + } + case 'b': + if (optarg) + { + beamn = atoi(optarg); + break; + } + else + { + syslog (LOG_ERR,"ERROR: -b flag requires argument\n"); + return EXIT_FAILURE; + } + case 'n': + if (optarg) + { + strcpy(of,optarg); + break; + } + else + { + syslog (LOG_ERR,"ERROR: -n flag requires argument\n"); + return EXIT_FAILURE; + } + case 'd': + DEBUG=1; + syslog (LOG_INFO, "Will excrete all debug messages"); + break; + case 'z': + rz=1; + syslog (LOG_INFO, "Will respond to zero trigger"); + break; + case 'j': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-j flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + // DADA stuff + + udpdb.verbose = DEBUG; + udpdb.control_port = control_port; + + // start control thread + int rval = 0; + pthread_t control_thread_id; + syslog(LOG_INFO, "starting control_thread()"); + rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb); + if (rval != 0) { + syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval)); + return -1; + } + + + syslog (LOG_INFO, "creating hdus"); + + // open connection to the in/read DBs + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer"); + return EXIT_FAILURE; + } + + // Bind to cpu core + if (core >= 0) + { + syslog(LOG_INFO,"binding to core %d", core); + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core 
%d", core); + } + + int observation_complete=0; + + // more DADA stuff - deal with headers + + uint64_t header_size = 0; + + // read the header from the input HDU + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "main: could not read next header"); + dsaX_dbgpu_cleanup (hdu_in); + return EXIT_FAILURE; + } + + // mark the input header as cleared + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared [input]"); + dsaX_dbgpu_cleanup (hdu_in); + return EXIT_FAILURE; + } + + + // stuff for writing data + /* + Data will have [64 beam, time, freq] for each block. + Need to extract + */ + + + + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + unsigned char * extData = (unsigned char *)malloc(sizeof(unsigned char)*NSAMPS_PER_BLOCK*NCHAN_FIL*NBEAMS_PER_BLOCK); + uint64_t specs_per_block = NSAMPS_PER_BLOCK; + uint64_t current_specnum = 0; // updates with each dada block read + uint64_t start_byte, bytes_to_copy, bytes_copied=0; + char * in_data; + uint64_t written=0; + uint64_t block_id, bytes_read=0; + int dumping = 0; + FILE *ofile; + ofile = fopen("/home/ubuntu/data/dumps.dat","a"); + fprintf(ofile,"starting...\n"); + fclose(ofile); + + + // main reading loop + float pc_full = 0.; + + syslog(LOG_INFO, "main: starting observation"); + + while (!observation_complete) { + + // read a DADA block + in_data = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + + // add delay + // only proceed if input data block is 80% full + while (pc_full < 0.8) { + pc_full = ipcio_percent_full(hdu_in->data_block); + usleep(100); + } + pc_full = 0.; + + + // check for dump_pending + if (dump_pending) { + + // look after hand trigger + if (specnum==0 && rz==1) { + + specnum = current_specnum + 40000; + + } + + // if this is the first block to dump + if (specnum > current_specnum && specnum < 
current_specnum+specs_per_block) { + + dumping = 1; + syslog(LOG_INFO,"dumping is 1 -- first block"); + + // loop over beams + bytes_to_copy = (NSAMPS_PER_BLOCK-(specnum-current_specnum))*NCHAN_FIL; + bytes_copied = bytes_to_copy; + for (int i=0;i current_specnum && specnum + NSAMPS_PER_BLOCK <= current_specnum + specs_per_block && dumping==1) { + + syslog(LOG_INFO,"in second block"); + + // loop over beams + bytes_to_copy = NSAMPS_PER_BLOCK*NCHAN_FIL-bytes_copied; + for (int i=0;idata_block, bytes_read); + + + } + + + // close control thread + syslog(LOG_INFO, "joining control_thread"); + quit_threads = 1; + void* result=0; + pthread_join (control_thread_id, &result); + + free(extData); + dsaX_dbgpu_cleanup (hdu_in); + +} diff --git a/legacy/dsaX_fluff.c b/legacy/dsaX_fluff.c new file mode 100644 index 0000000..3e3f2d1 --- /dev/null +++ b/legacy/dsaX_fluff.c @@ -0,0 +1,415 @@ +/* will reorder raw data for input to xgpu */ +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + +#include +#include +#include + +// data to pass to threads +struct data { + char * in; + char * out; + int n_threads; + int thread_id; + int debug; +}; + +/* global variables */ +int DEBUG = 0; +int cores[8] = {22, 23, 24, 25, 26, 27, 28, 29}; + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write) +{ + + if (write==0) { + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + + } + + if 
(write==1) { + + if (dada_hdu_unlock_write (in) < 0) + { + syslog(LOG_ERR, "could not unlock write on hdu_in"); + } + dada_hdu_destroy (in); + + } + +} + +void usage() +{ + fprintf (stdout, + "dsaX_reorder_raw [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -t number of threads [default 4]\n" + " -i input key [default CAPTURED_BLOCK_KEY]\n" + " -o output key [default REORDER_BLOCK_KEY]\n" + " -q quitting after testing\n" + " -h print usage\n"); +} + +/* thread for data massaging */ +void * massage(void *args) { + + // basic stuff + struct data *d = args; + int thread_id = d->thread_id; + int dbg = d->debug; + int na = 64; + + // set affinity + const pthread_t pid = pthread_self(); + const int core_id = cores[thread_id]; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); + const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); + if (CPU_ISSET(core_id, &cpuset)) + if (DEBUG || dbg) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id); + + // extract from input data structure + char *in = (char *)d->in; + char *out = (char *)d->out; + int nthreads = d->n_threads; + + // local array + int * fluffed_int = (int *)(in); + int * out_int = (int *)(out); + + // do block transpose - https://codereview.stackexchange.com/questions/229876/fast-matrix-transpose + int tile_size = 4; // set by benchmarking + for (int i_packet=NPACKETS*thread_id/nthreads;i_packet= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create 
(0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (0); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + return EXIT_FAILURE; + } + + + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes 
%lu %lu\n",block_size,block_out); + uint64_t bytes_read = 0; + char * block, * output_buffer, * blockie; + output_buffer = (char *)malloc(sizeof(char)*block_out); + memset(output_buffer,0,block_out); + uint64_t written, block_id; + + // set up + + int observation_complete=0; + int blocks = 0; + int started = 0; + + + + syslog(LOG_INFO, "starting observation"); + + while (!observation_complete) { + + // open block + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + } + + // DO STUFF + + // set up data structure + for (int i=0; idata_block, &block_id); + memcpy(blockie, output_buffer, block_out); + ipcio_close_block_write(hdu_out->data_block, block_out); + + //written = ipcio_write (hdu_out->data_block, output_buffer, block_out); + + + if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks); + blocks++; + + + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + free(output_buffer); + + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + +} + + diff --git a/legacy/dsaX_makeFil.c b/legacy/dsaX_makeFil.c new file mode 100644 index 0000000..e9d6e3c --- /dev/null +++ b/legacy/dsaX_makeFil.c @@ -0,0 +1,276 @@ +/* will reorder raw data for input to xgpu */ +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + +// global variables +int DEBUG = 0; + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); +int dada_bind_thread_to_core (int core); + 
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) +{ + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + + if (dada_hdu_unlock_write (out) < 0) + { + syslog(LOG_ERR, "could not unlock write on hdu_out"); + } + dada_hdu_destroy (out); + +} + +void usage() +{ + fprintf (stdout, + "dsaX_fake [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -i in_key [default TEST_BLOCK_KEY]\n" + " -o out_key [default REORDER_BLOCK_KEY2]\n" + " -h print usage\n"); +} + +// MAIN + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_copydb", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + /* DADA Header plus Data Unit */ + dada_hdu_t* hdu_in = 0; + dada_hdu_t* hdu_out = 0; + + // data block HDU keys + key_t in_key = TEST_BLOCK_KEY; + key_t out_key = REORDER_BLOCK_KEY2; + + // command line arguments + int core = -1; + int useZ = 1; + char fnam[100]; + int arg = 0; + + while ((arg=getopt(argc,argv,"c:f:i:o:dh")) != -1) + { + switch (arg) + { + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'd': + DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + case 
'h': + usage(); + return EXIT_SUCCESS; + } + } + + // Bind to cpu core + if (core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and 
allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + uint64_t bytes_read = 0; + char * block; + uint64_t written, block_id; + + + // set up + int observation_complete=0; + int blocks = 0, started = 0; + + syslog(LOG_INFO, "starting observation"); + + while (!observation_complete) { + + // open block + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + } + + // DO STUFF + // here is where we convert input voltage data to output filterbank data + + + // write to output dada block + written = ipcio_write (hdu_out->data_block, block, block_out); + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + if (DEBUG) { + syslog(LOG_DEBUG, "written block %d",blocks); + } + blocks++; + + + + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + +} + + diff --git a/legacy/dsaX_merge.c b/legacy/dsaX_merge.c new file mode 100644 index 0000000..7866d5f --- /dev/null +++ b/legacy/dsaX_merge.c @@ -0,0 +1,580 @@ +/* will reorder raw data for input to xgpu */ +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + 
+/* global variables */ +int DEBUG = 0; +int STATS = 0; +const int nth = 4; + +// data to pass to threads +struct data { + char * in; + char * in2; + char * out; + int * ant_order1; + int * ant_order2; + int n_threads; + int thread_id; +}; +int cores[4] = {17, 18, 37, 38}; + + +void * massage (void *args) { + + struct data *d = args; + int thread_id = d->thread_id; + + // set affinity + const pthread_t pid = pthread_self(); + const int core_id = cores[thread_id]; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); + const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); + if (CPU_ISSET(core_id, &cpuset)) + if (DEBUG) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id); + + // extract from input + char *in = (char *)d->in; + char *in2 = (char *)d->in2; + char *out = (char *)d->out; + int n_threads = d->n_threads; + int * ao1 = d->ant_order1; + int * ao2 = d->ant_order2; + + uint64_t oidx, iidx, ncpy = 1536; + + for (int i=thread_id*(2048/n_threads);i<(thread_id+1)*(2048/n_threads);i++) { + for (int j=0;j<3*NSNAPS/2;j++) { + iidx = i*(NSNAPS/2)*4608 + j*1536; + oidx = i*NSNAPS*4608 + ao1[j]*1536; + memcpy(out + oidx, in + iidx, ncpy); + oidx = i*NSNAPS*4608 + ao2[j]*1536; + memcpy(out + oidx, in2 + iidx, ncpy); + } + } + + /* return 0 */ + int thread_result = 0; + pthread_exit((void *) &thread_result); + +} + + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); +int dada_bind_thread_to_core (int core); + + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write) +{ + + if (write==0) { + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + + } + + if (write==1) { + + if 
(dada_hdu_unlock_write (in) < 0) + { + syslog(LOG_ERR, "could not unlock write on hdu_in"); + } + dada_hdu_destroy (in); + + } + +} + +void usage() +{ + fprintf (stdout, + "dsaX_split [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -m multithread write\n" + " -i in_key\n" + " -o out_key\n" + " -j in_key2\n" + " -h print usage\n"); +} + + +// MAIN + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_merge", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + /* DADA Header plus Data Unit */ + dada_hdu_t* hdu_in = 0; + dada_hdu_t* hdu_out = 0; + dada_hdu_t* hdu_in2 = 0; + + // data block HDU keys + key_t in_key = CAPTURE_BLOCK_KEY; + key_t out_key = CAPTURED_BLOCK_KEY; + key_t in_key2 = REORDER_BLOCK_KEY2; + + // command line arguments + int core = -1; + int arg = 0; + int mwrite = 0; + + while ((arg=getopt(argc,argv,"c:i:o:j:dmh")) != -1) + { + switch (arg) + { + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'j': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key2) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-j flag requires argument"); + 
usage(); + return EXIT_FAILURE; + } + case 'd': + DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + case 'm': + mwrite=1; + syslog (LOG_INFO, "Will do multithread write"); + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + + // Bind to cpu core + if (core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (0); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + hdu_in2 = dada_hdu_create (0); + dada_hdu_set_key (hdu_in2, in_key2); + if (dada_hdu_connect (hdu_in2) < 0) { + syslog (LOG_ERR,"could not connect to input buffer2"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read(hdu_in2) < 0) { + syslog (LOG_ERR, "could not lock to input buffer2"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_in2,0); + dsaX_dbgpu_cleanup (hdu_out,1); + + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup 
(hdu_in2,0); + dsaX_dbgpu_cleanup (hdu_out,1); + return EXIT_FAILURE; + } + header_in = ipcbuf_get_next_read (hdu_in2->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_in2,0); + dsaX_dbgpu_cleanup (hdu_out,1); + + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in2->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_in2,0); + dsaX_dbgpu_cleanup (hdu_out,1); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_in2,0); + dsaX_dbgpu_cleanup (hdu_out,1); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_in2,0); + dsaX_dbgpu_cleanup (hdu_out,1); + return EXIT_FAILURE; + } + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // sort out ant order + int * ao1, * ao2; + ao1 = (int *)malloc(sizeof(int)*48); + ao2 = (int *)malloc(sizeof(int)*48); + ao1[0] = 19; + ao1[1] = 20; + ao1[2] = 21; + ao1[3] = 25; + ao1[4] = 26; + ao1[5] = 27; + ao1[6] = 18; + ao1[7] = 17; + ao1[8] = 16; + ao1[9] = 12; + ao1[10] = 11; + ao1[11] = 45; + ao1[12] = 83; + ao1[13] = 10; + ao1[14] = 9; + ao1[15] = 6; + ao1[16] = 5; + ao1[17] = 4; + ao1[18] = 0; + ao1[19] = 84; + ao1[20] = 85; + ao1[21] = 89; + ao1[22] = 90; + ao1[23] = 91; + ao1[24] = 39; + ao1[25] = 40; + ao1[26] = 41; + ao1[27] = 33; + ao1[28] = 34; + ao1[29] = 35; + ao1[30] = 42; + ao1[31] = 43; + ao1[32] = 44; + ao1[33] = 51; + ao1[34] = 52; + ao1[35] = 53; + ao1[36] = 57; + ao1[37] = 
58; + ao1[38] = 59; + ao1[39] = 63; + ao1[40] = 64; + ao1[41] = 65; + ao1[42] = 69; + ao1[43] = 70; + ao1[44] = 71; + ao1[45] = 75; + ao1[46] = 76; + ao1[47] = 77; + ao2[0] = 22; + ao2[1] = 23; + ao2[2] = 24; + ao2[3] = 28; + ao2[4] = 29; + ao2[5] = 30; + ao2[6] = 15; + ao2[7] = 14; + ao2[8] = 13; + ao2[9] = 46; + ao2[10] = 47; + ao2[11] = 48; + ao2[12] = 82; + ao2[13] = 8; + ao2[14] = 7; + ao2[15] = 3; + ao2[16] = 2; + ao2[17] = 1; + ao2[18] = 86; + ao2[19] = 87; + ao2[20] = 88; + ao2[21] = 92; + ao2[22] = 93; + ao2[23] = 94; + ao2[24] = 95; + ao2[25] = 31; + ao2[26] = 32; + ao2[27] = 36; + ao2[28] = 37; + ao2[29] = 38; + ao2[30] = 81; + ao2[31] = 49; + ao2[32] = 50; + ao2[33] = 54; + ao2[34] = 55; + ao2[35] = 56; + ao2[36] = 60; + ao2[37] = 61; + ao2[38] = 62; + ao2[39] = 66; + ao2[40] = 67; + ao2[41] = 68; + ao2[42] = 72; + ao2[43] = 73; + ao2[44] = 74; + ao2[45] = 78; + ao2[46] = 79; + ao2[47] = 80; + + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); + uint64_t bytes_read = 0; + char * block1, * block2, * o1, * o2; + char * output = (char *)malloc(sizeof(char)*block_out); + uint64_t written, block_id; + + // set up threads + struct data args[8]; + pthread_t threads[8]; + pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + void* result=0; + + // send through fake blocks + + /* if (fake>0) { + syslog(LOG_INFO,"sending %d fake blocks",fake); + for (int i=0;idata_block, &block_id); + memcpy(o1, output, block_out); + ipcio_close_block_write (hdu_out->data_block, block_out); + usleep(10000); + } + syslog(LOG_INFO,"Finished with fake blocks"); + }*/ + + + + // set up + + int observation_complete=0; + int blocks = 0; + int started = 0; + + + + syslog(LOG_INFO, "starting 
observation"); + + while (!observation_complete) { + + // open block + + block1 = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + block2 = ipcio_open_block_read (hdu_in2->data_block, &bytes_read, &block_id); + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + } + + + // DO STUFF + + // copy to output buffer + + if (mwrite) { + o1 = ipcio_open_block_write (hdu_out->data_block, &block_id); + } + + // set up data structure + for (int i=0; idata_block, output, block_out); + } + else { + ipcio_close_block_write (hdu_out->data_block, block_out); + } + + if (blocks % 10 == 0) + syslog(LOG_INFO, "written block %d",blocks); + blocks++; + + + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + ipcio_close_block_read (hdu_in2->data_block, bytes_read); + + } + + free(output); + free(ao1); + free(ao2); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_in2,0); + dsaX_dbgpu_cleanup (hdu_out,1); + +} + + diff --git a/legacy/dsaX_nicdb.c b/legacy/dsaX_nicdb.c new file mode 100644 index 0000000..df47ebe --- /dev/null +++ b/legacy/dsaX_nicdb.c @@ -0,0 +1,483 @@ +/* +https://dzone.com/articles/parallel-tcpip-socket-server-with-multi-threading + +gcc -o test_ipcbuf test_ipcbuf.c -I/usr/local/psrdada/src -I/usr/local/include -L/usr/local/lib -lpsrdada -lm -pthread -g -O2 -L/usr/lib/gcc/x86_64-linux-gnu/5 -lgfortran + +the plan is to have NCLIENTS threads listening on different threads. +each time data comes over the first 8 bytes consist of the channel group and time sequence as two ints +the rest is a NSAMPS_PER_BLOCK*NBEAMS_PER_TRANSMIT*NW char array that needs to be arranged correctly +The output must be [NBEAMS_PER_BLOCK, NSAMPS_PER_BLOCK, NCHAN_FIL]. + +After a block is full, the data need to be written out (data rate 525 Mb/s) +The number of receives before switching blocks is NCLIENTS*NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT. 
+switch block when one block is being written out + +*/ +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + +#define bdepth 16 +#define MAX_FULLBLOCK 4 + +// global variables +int DEBUG = 0; +volatile int blockct[bdepth]; // to count how many writes to block. max is NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NW +volatile int flush_flag = 0; // set to flush output2 +volatile int writing = 0; +volatile int global_tseq = 0; // global count of full buffers +int cores[16] = {3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28}; // to bind threads to +char iP[100]; +pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; + +// structure to pass to threads +struct data +{ + char * output1; + char * output2; + uint16_t tport; + int thread_id; +}; + +// function prototypes +void dsaX_dbgpu_cleanup (dada_hdu_t * out); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * out) +{ + + if (dada_hdu_unlock_write (out) < 0) + { + syslog(LOG_ERR, "could not unlock write on hdu_out"); + } + dada_hdu_destroy (out); + +} + + +// receive process - runs infinite loop +void * process(void * ptr) +{ + + // arguments from structure + struct data *d = ptr; + int thread_id = d->thread_id; + char *output1 = (char *)d->output1; + char *output2 = (char *)d->output2; + uint16_t tport = d->tport; + + // set affinity + const pthread_t pid = pthread_self(); + const int core_id = cores[thread_id]; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); 
+ if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); + const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); + if (CPU_ISSET(core_id, &cpuset)) + if (DEBUG) syslog(LOG_INFO,"thread %d: successfully set thread",thread_id); + + // set up socket + struct sockaddr_in si_other, si_me; + int clientSocket, slen=sizeof(si_other); + clientSocket=socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); + if (DEBUG) syslog(LOG_INFO,"thread %d: Made socket",thread_id); + memset((char *) &si_me, 0, sizeof(si_me)); + si_me.sin_family = AF_INET; + si_me.sin_port = htons(tport); + si_me.sin_addr.s_addr = inet_addr(iP); + if (bind(clientSocket, (struct sockaddr *)&si_me, sizeof(si_me)) < 0) { + syslog(LOG_ERR,"thread %d: cannot bind to port",thread_id); + exit(1); + } + if (DEBUG) syslog(LOG_INFO,"thread %d: socket bound - waiting for header packet",thread_id); + + char * packet = (char *)malloc(sizeof(char)*P_SIZE); + int * ibuf; + recvfrom(clientSocket, packet, P_SIZE, 0,(struct sockaddr *)&si_other,&slen); + ibuf = (int *)(packet); + int chgroup = ibuf[0]; + syslog(LOG_INFO,"thread %d: accepted connection from chgroup %d",thread_id,chgroup); + + // data buffer and other variables + char * buffer = (char *)malloc((NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW)*sizeof(char)); + int tseq, pseq; + int pct = 0; + int full_blocks = 0; + int fullBlock; + int i0, aa; + int lastPacket, nextBuf, current_tseq = 0, act_tseq; + uint64_t shifty = (bdepth-1)*NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NCHAN_FIL; + uint64_t oidx_offset, oidx; + + // infinite loop + while (1) { + + /* read message */ + // fill up local buffer + lastPacket = 0; + nextBuf = 0; + while ((lastPacket==0) && (nextBuf==0)) { + + recvfrom(clientSocket, packet, P_SIZE, 0,(struct sockaddr *)&si_other,&slen); + ibuf = (int *)(packet); + pseq = ibuf[2]; + if (chgroup != ibuf[0]) + syslog(LOG_ERR,"thread %d: 
received chgroup %d is not recorded %d",thread_id,ibuf[0],chgroup); + tseq = ibuf[1]; + + if (tseq>current_tseq) { + nextBuf=1; + } + else if (tseq==current_tseq) { + memcpy(buffer+pseq*(P_SIZE-12),packet+12,P_SIZE-12); + pct++; + } + + if (pseq==NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW/(P_SIZE-12)-1) + lastPacket=1; + + } + + if (pct != NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW/(P_SIZE-12)) + syslog(LOG_ERR,"thread %d: only received %d of %d",thread_id,pct,NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW/(P_SIZE-12)); + + act_tseq = (current_tseq * NSAMPS_PER_TRANSMIT) % NSAMPS_PER_BLOCK; // place within output buffer + + // at this stage we have a full local buffer + // this needs to be placed in the global buffer + + // output order is [beam, time, freq]. input order is [beam, time, freq], but only a subset of freqs + i0 = 0; + aa = ((current_tseq / (NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT)) % bdepth); + oidx_offset = ((uint64_t)(aa))*NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NCHAN_FIL; + //syslog(LOG_INFO,"thread %d: read message with chgroup %d tseq %d current_tseq %d global_tseq %d position %d %"PRIu64"",thread_id,chgroup,tseq,current_tseq,global_tseq,aa,oidx_offset); + for (int i=0;i=MAX_FULLBLOCK && blockct[i] >= (NCLIENTS-1)*NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT)) { + + // need to write this block and reset blockct + while (flush_flag==1) + aa==1; + flush_flag = 1; + blockct[i] = 0; + // log - hardcoded bdepth + full_blocks -= 1; + syslog(LOG_INFO,"thread %d: Writing global_tseq %d. 
Blockcts_full %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d",thread_id,global_tseq,full_blocks,blockct[0],blockct[1],blockct[2],blockct[3],blockct[4],blockct[5],blockct[6],blockct[7],blockct[8],blockct[9],blockct[10],blockct[11],blockct[12],blockct[13],blockct[14],blockct[15]); + + + } + + } + + pthread_mutex_unlock(&mutex); + + // advance local tseq and deal with packet capture + if (lastPacket==1) { + current_tseq++; + lastPacket=0; + nextBuf=0; + pct=0; + } + if (nextBuf==1) { + current_tseq++; + memcpy(buffer+pseq*(P_SIZE-12),packet+12,P_SIZE-12); + pct=1; + lastPacket=0; + } + + + + } + + /* close socket and clean up */ + close(clientSocket); + free(packet); + free(buffer); + pthread_exit(0); + +} + +void usage() +{ + fprintf (stdout, + "dsaX_nicdb [options]\n" + " -c core bind process to CPU core [no default]\n" + " -f header file [no default]\n" + " -d send debug messages to syslog\n" + " -o out_key [default BEAMCAPTURE_BLOCK_KEY]\n" + " -i IP address\n" + " -h print usage\n"); +} + + +// main part of program +int main(int argc, char ** argv) +{ + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_nicdb", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + // threads + struct data args[16]; + pthread_t threads[16]; + pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + void* result=0; + for (int i=0;i= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + // DADA stuff + + hdu_out = dada_hdu_create (0); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + // deal with headers + 
uint64_t header_size = 4096; + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + FILE *fin; + if (!(fin=fopen(fnam,"rb"))) { + syslog(LOG_ERR,"cannot open dada header file %s",fnam); + return EXIT_FAILURE; + } + fread(header_out, 4096, 1, fin); + fclose(fin); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_out); + return EXIT_FAILURE; + } + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_out); + return EXIT_FAILURE; + } + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have output block sizes %lu\n",block_out); + uint64_t bytes_read = 0; + char *output1, *output2; + output1 = (char *)malloc(sizeof(char)*block_out*bdepth); + output2 = (char *)malloc(sizeof(char)*block_out); + memset(output1,0,block_out*bdepth); + memset(output2,0,block_out); + uint64_t written, block_id; + + // set up threads + + // set up data structure + for (int i=0; idata_block, output1 + (global_tseq % bdepth)*block_out, block_out); + global_tseq += 1; + writing=0; + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_out); + return EXIT_FAILURE; + } + + syslog(LOG_INFO, "written block %d",blocks); + blocks++; + + flush_flag = 0; + + } + + + // free stuff + for(int i=0; i +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include 
"ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + +// global variables +int DEBUG = 0; +int blockct = 0; // to count how many writes to block. max is NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NW +int block_switch = 0; // 0 means write to output1, write out output2. +int cores[16] = {3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28}; // to bind threads to +char iP[100]; + +// structure to pass to threads +struct data +{ + char * output1; + char * output2; + uint16_t tport; + int thread_id; +}; + +// function prototypes +void dsaX_dbgpu_cleanup (dada_hdu_t * out); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * out) +{ + + if (dada_hdu_unlock_write (out) < 0) + { + syslog(LOG_ERR, "could not unlock write on hdu_out"); + } + dada_hdu_destroy (out); + +} + + +// receive process - runs infinite loop +void * process(void * ptr) +{ + + // arguments from structure + struct data *d = ptr; + int thread_id = d->thread_id; + char *output1 = (char *)d->output1; + char *output2 = (char *)d->output2; + uint16_t tport = d->tport; + + // set affinity + const pthread_t pid = pthread_self(); + const int core_id = cores[thread_id]; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); + const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); + if (CPU_ISSET(core_id, &cpuset)) + if (DEBUG) syslog(LOG_INFO,"thread %d: successfully set thread",thread_id); + + // set up socket + int sock = -1, conn = -1; + struct sockaddr_in address, cli; + + /* create socket */ + sock = socket(AF_INET, SOCK_STREAM, 0); + if (DEBUG) syslog(LOG_INFO,"thread %d: opened socket",thread_id); + memset(&address, 0, sizeof(struct sockaddr_in)); + address.sin_family = 
AF_INET; + inet_pton(AF_INET, iP, &(address.sin_addr)); + //address.sin_addr.s_addr = inet_addr("127.0.0.1"); + address.sin_port = htons(tport); + if (DEBUG) syslog(LOG_INFO,"thread %d: socket ready",thread_id); + if (bind(sock, (struct sockaddr *)&address, sizeof(struct sockaddr_in)) < 0) { + syslog(LOG_ERR,"thread %d: cannot bind to port",thread_id); + exit(1); + } + if (DEBUG) syslog(LOG_INFO,"thread %d: socket bound",thread_id); + listen(sock, 5); + if (DEBUG) syslog(LOG_INFO,"thread %d: socket listening on port %d",thread_id,tport); + + // accept connection + socklen_t cli_len=sizeof(struct sockaddr); + conn = accept(sock, (struct sockaddr *) &cli, &cli_len); + if (conn<0) { + syslog(LOG_ERR,"thread %d: error accepting connection",thread_id); + exit(1); + } + syslog(LOG_INFO,"thread %d: accepted connection",thread_id); + + // data buffer and other variables + char * buffer = (char *)malloc((8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW)*sizeof(char)); + char * dblock = (char *)malloc((8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW)*sizeof(char)); + int *ibuf, chgroup, tseq, oidx, iidx; + int remain_data, outptr, len; + int i0; + + // infinite loop + while (1) { + + /* read message */ + // read to buffer until all is read + remain_data =(int)(8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW); + outptr=0; + + /* + while (((len = recv(conn, dblock, remain_data, 0)) > 0) && (remain_data > 0)) { + memcpy(buffer+outptr, dblock, len); + remain_data -= len; + outptr += len; + //syslog(LOG_INFO,"Received %d of %d bytes",outptr,8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW); + }*/ + //recvlen = read(sock, buffer, sizeof(buffer)); + ibuf = (int *)(buffer); + len = recv(conn, dblock, remain_data, MSG_WAITALL); + memcpy(buffer, dblock, len); + remain_data -= len; + if (remain_data != 0) + syslog(LOG_ERR,"thread %d: only received %d of %d",thread_id,len,(int)(8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW)); + + if (remain_data==0) { + + // get channel group and time sequence + chgroup = ibuf[0]; 
// from 0-15 + tseq = ibuf[1]; // continuous iterate over transmits + if (DEBUG) syslog(LOG_INFO,"thread %d: read message with chgroup %d tseq %d blockct %d",thread_id,chgroup,tseq,blockct); + tseq = (tseq * 128) % 4096; // place within output + + // output order is [beam, time, freq]. input order is [beam, time, freq], but only a subset of freqs + i0 = 8; + for (int i=0;i= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + // DADA stuff + + hdu_out = dada_hdu_create (); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + // deal with headers + uint64_t header_size = 4096; + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + FILE *fin; + if (!(fin=fopen(fnam,"rb"))) { + syslog(LOG_ERR,"cannot open dada header file %s",fnam); + return EXIT_FAILURE; + } + fread(header_out, 4096, 1, fin); + fclose(fin); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_out); + return EXIT_FAILURE; + } + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_out); + return EXIT_FAILURE; + } + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have output block sizes %llu\n",block_out); + uint64_t bytes_read = 0; + char *output1, *output2; + output1 = (char *)malloc(sizeof(char)*block_out); + output2 = (char *)malloc(sizeof(char)*block_out); + memset(output1,0,block_out); + 
memset(output2,0,block_out); + uint64_t written, block_id; + + // set up threads + + // set up data structure + for (int i=0; i=NCLIENTS*NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT) { + + // change output + bswitch= block_switch; + blockct=0; + if (bswitch==0) block_switch=1; + if (bswitch==1) block_switch=0; + + // write to output + if (bswitch==0) written = ipcio_write (hdu_out->data_block, output1, block_out); + if (bswitch==1) written = ipcio_write (hdu_out->data_block, output2, block_out); + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_out); + return EXIT_FAILURE; + } + + if (DEBUG) syslog(LOG_INFO, "written block %d",blocks); + blocks++; + ctt=0; + } + + } + + // free stuff + for(int i=0; i +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + +#include +#include +#include + +// data to pass to threads +struct data { + char * in; + char * out; + int n_threads; + int thread_id; + int debug; +}; + +/* global variables */ +int DEBUG = 0; +int cores[16] = {4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}; + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write) +{ + + if (write==0) { + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + + } + + if (write==1) { + + if (dada_hdu_unlock_write (in) < 0) + { + syslog(LOG_ERR, "could not unlock write on hdu_in"); + } + dada_hdu_destroy (in); + + } + +} + +void usage() +{ + fprintf 
(stdout, + "dsaX_reorder_raw [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -t number of threads [default 4]\n" + " -i input key [default CAPTURED_BLOCK_KEY]\n" + " -o output key [default REORDER_BLOCK_KEY]\n" + " -q quitting after testing\n" + " -h print usage\n"); +} + +/* thread for data massaging */ +void * massage(void *args) { + + // basic stuff + struct data *d = args; + int thread_id = d->thread_id; + int dbg = d->debug; + + // masks for fluffing + __m512i masks[4]; + masks[0] = _mm512_set_epi64(0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL); + masks[1] = _mm512_set_epi64(0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL); + masks[2] = _mm512_set_epi64(0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL); + masks[3] = _mm512_set_epi64(0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL); + + + // set affinity + const pthread_t pid = pthread_self(); + const int core_id = cores[thread_id]; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); + const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); + if (CPU_ISSET(core_id, &cpuset)) + if (DEBUG || dbg) syslog(LOG_DEBUG,"thread %d: 
successfully set thread",thread_id); + + // extract from input data structure + char *in = (char *)d->in; + char *out = (char *)d->out; + int nthreads = d->n_threads; + + /* DO ALL PROCESSING + + "in" is input block: NPACKETS * NANTS * (384*2) * 2 pol * r/i. (384*2 is for the two times) + "out" needs to be in order NPACKETS * (384*2) * 64 * 2 pol * r/i + parallelize by splitting on NPACKETS axis. + + */ + + // input and output index and extracted data + int idx = thread_id; // PACKET idx for input and output + char * proc_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2); // for 4-bit data + char * fluffed_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2*2); // for 8-bit data + + // extract data + memcpy(proc_data,in+idx*(NPACKETS/nthreads)*NANTS*(384*2)*2,(NPACKETS/nthreads)*NANTS*(384*2)*2); + if (DEBUG || dbg) syslog(LOG_DEBUG,"thread %d: extracted data",thread_id); + + // do fluffing + + /* + technique is to use nybble masks to + (a) unmask every fourth nybble + (b) bit shift to left using mm512_slli_epi16 + (c) sign extend by 4 bits using mm512_srai_epi16 + (d) bit shift to right + + Will produce m512 for lower and upper bytes. Then just need to copy into fluffed_data + + */ + + // variables + char * low = (char *)malloc(sizeof(char)*64); // m512 + char * hi = (char *)malloc(sizeof(char)*64); // m512 + __m512i low_m, hi_m; + unsigned short * low_u = (unsigned short *)(low); + unsigned short * hi_u = (unsigned short *)(hi); + __m512i v[4]; // for 4 packed 4-bit numbers + + // input and output + __m512i proc_m; + unsigned short * fluffed_u = (unsigned short *)(fluffed_data); + + // numbers to iterate over + int n_512 = (NPACKETS/nthreads)*NANTS*(384*2)*2/64; + + if (dbg || DEBUG) syslog(LOG_DEBUG,"thread %d: ready to fluff",thread_id); + + // let's do it! 
+ for (int i=0;i= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (0); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + return EXIT_FAILURE; + } + + + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now 
in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); + uint64_t bytes_read = 0; + char * block, * output_buffer; + output_buffer = (char *)malloc(sizeof(char)*block_out); + memset(output_buffer,0,block_out); + uint64_t written, block_id; + + // set up + + int observation_complete=0; + int blocks = 0; + int started = 0; + + + + syslog(LOG_INFO, "starting observation"); + + while (!observation_complete) { + + // open block + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + } + + // DO STUFF + + // set up data structure + for (int i=0; idata_block, output_buffer, block_out); + + + if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks); + blocks++; + + + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + free(output_buffer); + + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + +} + + diff --git a/legacy/dsaX_reorder_raw.c b/legacy/dsaX_reorder_raw.c new file mode 100644 index 0000000..c0f6b0c --- /dev/null +++ b/legacy/dsaX_reorder_raw.c @@ -0,0 +1,613 @@ +/* will reorder raw data for input to xgpu */ +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +// Forward declaration to keep compiler happy +// Possible minor bug in PSRDada +int ipcio_check_pending_sod (ipcio_t* ); +#include "ipcbuf.h" +#include 
"dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + +#include +#include +#include + +// data to pass to threads +struct data { + char * in; + char * out; + int n_threads; + int thread_id; + int debug; + int write; + ipcio_t * ipc; +}; + +/* global variables */ +int DEBUG = 0; +int cores[16] = {4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}; + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write) +{ + + if (write==0) { + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + + } + + if (write==1) { + + if (dada_hdu_unlock_write (in) < 0) + { + syslog(LOG_ERR, "could not unlock write on hdu_in"); + } + dada_hdu_destroy (in); + + } + +} + +void usage() +{ + fprintf (stdout, + "dsaX_reorder_raw [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -t number of threads [default 4]\n" + " -b connect to bf hdu\n" + " -i input key [default CAPTURED_BLOCK_KEY]\n" + " -o output key [default REORDER_BLOCK_KEY]\n" + " -q quitting after testing\n" + " -h print usage\n"); +} + +/* thread for data massaging */ +void * massage(void *args) { + + // basic stuff + struct data *d = args; + int thread_id = d->thread_id; + int na = 64; // output ants + int dbg = d->debug; + + // set affinity + const pthread_t pid = pthread_self(); + const int core_id = cores[thread_id]; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); + const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); + if (CPU_ISSET(core_id, &cpuset)) + if 
(DEBUG || dbg) syslog(LOG_INFO,"thread %d: successfully set thread",thread_id); + + // extract from input data structure + char *in = (char *)d->in; + char *out = (char *)d->out; + int nthreads = d->n_threads; + + /* DO ALL PROCESSING + + "in" is input block: NPACKETS * NANTS * (384*2) * 2 pol * r/i. (384*2 is for the two times) + "out" needs to be in order NPACKETS * (384*2) * 64 * 2 pol * r/i + parallelize by splitting on NPACKETS axis. + + */ + + // input and output index and extracted data + int idx = thread_id; // PACKET idx for input and output + //char * proc_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2); // for 4-bit data + //char * fluffed_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2*2); // for 8-bit data + //char * out_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*(384*2)*na*2*2); // for output 8-bit data + + // extract data + //memcpy(proc_data,in+idx*(NPACKETS/nthreads)*NANTS*(384*2)*2,(NPACKETS/nthreads)*NANTS*(384*2)*2); + if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: extracted data",thread_id); + + // do fluffing in dumbest possible way + + if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to fluff",thread_id); + + // let's do it! 
+ int in_idx, out_idx, a1, a2, a3, a4, a5, a6; + int in_offset = idx*(NPACKETS/nthreads)*NANTS*(384*2)*2; + int out_offset = idx*(NPACKETS/nthreads)*(384*2)*na*2; + for (int i=0;i<(NPACKETS/nthreads);i++) { + a1 = i*NANTS*1536; + a2 = i*na*1536; + for (int j=0;jipc->curbuf[out_offset+out_idx] = in[in_offset+in_idx]; + //d->ipc->curbuf[out_offset+2*out_idx+1] = in[in_offset+in_idx] >> 4; + + } + } + } + } + + /*for (int i=0;i<(NPACKETS/nthreads)*NANTS*(384*2)*2;i++) { // loop over chars in proc_data + + fluffed_data[2*i] = ((proc_data[i]<<4) & 240) >> 4; + fluffed_data[2*i+1] = proc_data[i] >> 4; + + }*/ + + if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: fluffed",thread_id); + + // transpose antennas and frequencies by ints + // from fluffed_data to out_data + /* int * fluffed_int = (int *)(fluffed_data); + memset(out_data,0,(NPACKETS/nthreads)*(384*2)*na*2*2); + int * out_int = (int *)out_data;*/ + + if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to transpose",thread_id); + + // do block transpose - https://codereview.stackexchange.com/questions/229876/fast-matrix-transpose + /* int tile_size = 3; // set by benchmarking + for (int i_packet=0;i_packetwrite) + memcpy (d->ipc->curbuf + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2); + else + memcpy(out + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2); + */ + if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: done - freeing",thread_id); + + // free stuff + //free(proc_data); + //free(fluffed_data); + //free(out_data); + + /* return 0 */ + int thread_result = 0; + pthread_exit((void *) &thread_result); + +} + + +// MAIN + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_reorder_raw", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + // TESTING and initialization + // threads + struct data args[16]; + pthread_t threads[16]; + 
pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + void* result=0; + + // run test with single thread + + /*syslog(LOG_INFO,"Running TEST...\n"); + + // set up data structure + char * test_block = (char *)malloc(sizeof(char)*NPACKETS*NANTS*(384*2)*2); + char * test_output = (char *)malloc(sizeof(char)*NPACKETS*64*(384*2)*2*2); + memset(test_block,0,sizeof(test_block)); + + TEST CODE + FILE *fin; + fin=fopen("../utils/packet.out","rb"); + fread(test_block, 96768, 1, fin); + fclose(fin); + END TEST CODE + + args[0].in = test_block; + args[0].out = test_output; + args[0].n_threads = 1; + args[0].thread_id = 0; + args[0].debug = 0; + args[0].write = 0; + + // run test thread + if (pthread_create(&threads[0], &attr, &massage, (void *)(&args[0]))) { + syslog(LOG_ERR,"Failed to create TEST massage thread 0\n"); + } + else + syslog(LOG_INFO,"Created TEST thread\n"); + pthread_attr_destroy(&attr); + pthread_join(threads[0], &result); + syslog(LOG_INFO,"joined TEST thread"); + + TEST CODE + fin=fopen("../utils/test.out","wb"); + fwrite(test_output, 1, 196608, fin); + fclose(fin); + END TEST CODE + + // clean up + free(test_block); + free(test_output); + + syslog(LOG_INFO,"TEST COMPLETE");*/ + + /* DADA Header plus Data Unit */ + dada_hdu_t* hdu_in = 0; + dada_hdu_t* hdu_out = 0; + dada_hdu_t* hdu_out2 = 0; + + // data block HDU keys + key_t in_key = CAPTURED_BLOCK_KEY; + key_t out_key = REORDER_BLOCK_KEY; + key_t out_key2 = REORDER_BLOCK_KEY2; + + // command line arguments + int core = -1; + int nthreads = 1; + int bf = 0; + int arg = 0; + + while ((arg=getopt(argc,argv,"c:t:i:o:dbqh")) != -1) + { + switch (arg) + { + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'o': + if (optarg) + { + 
if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 't': + if (optarg) + { + nthreads = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-t flag requires argument"); + usage(); + return EXIT_FAILURE; + } + + case 'd': + DEBUG=1; + syslog (LOG_INFO, "Will excrete all debug messages"); + break; + + case 'q': + syslog (LOG_INFO, "Quit here"); + return EXIT_SUCCESS; + + case 'b': + bf=1; + syslog (LOG_INFO, "Will write to bf dada hdu"); + break; + + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + + // Bind to cpu core + if (core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (0); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + if (bf) { + hdu_out2 = dada_hdu_create (0); + dada_hdu_set_key (hdu_out2, out_key2); + if (dada_hdu_connect (hdu_out2) < 0) { + syslog (LOG_ERR,"could not connect to output buffer2"); + return EXIT_FAILURE; + } + if 
(dada_hdu_lock_write(hdu_out2) < 0) { + syslog (LOG_ERR, "could not lock to output buffer2"); + return EXIT_FAILURE; + } + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + + + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + + if (bf) { + header_out = ipcbuf_get_next_write (hdu_out2->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header2 block [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not 
mark header block2 filled [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + } + + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); + uint64_t bytes_read = 0; + char * block, * output_buffer, * blockie; + output_buffer = (char *)malloc(sizeof(char)*block_out); + memset(output_buffer,0,block_out); + uint64_t written, block_id; + + // set up + + int observation_complete=0; + int blocks = 0; + int started = 0; + + + + syslog(LOG_INFO, "starting observation"); + + while (!observation_complete) { + + // open block + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + } + + // DO STUFF + + // sort out write + hdu_out->data_block->curbuf = ipcbuf_get_next_write ((ipcbuf_t*)hdu_out->data_block); + hdu_out->data_block->marked_filled = 0; + //blockie = ipcio_open_block_write(hdu_out->data_block, &block_id); + + // set up data structure + for (int i=0; idata_block; + args[i].write = 1; + } + + if (DEBUG) syslog(LOG_INFO,"creating %d threads",nthreads); + + for(int i=0; idata_block, output_buffer, block_out); + + if (bf) { + + written = ipcio_write (hdu_out2->data_block, output_buffer, block_out); + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + + } + + // finish 
write + ipcbuf_mark_filled ((ipcbuf_t*)hdu_out->data_block, block_out); + ipcio_check_pending_sod (hdu_out->data_block); + hdu_out->data_block->marked_filled = 1; + //ipcio_close_block_write(hdu_out->data_block, block_out); + + if (DEBUG) syslog(LOG_INFO, "written block %d",blocks); + blocks++; + + + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + free(output_buffer); + + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + +} + + diff --git a/legacy/dsaX_reorder_raw.c.bak b/legacy/dsaX_reorder_raw.c.bak new file mode 100644 index 0000000..0914823 --- /dev/null +++ b/legacy/dsaX_reorder_raw.c.bak @@ -0,0 +1,672 @@ +/* will reorder raw data for input to xgpu */ +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + +#include +#include +#include + +// data to pass to threads +struct data { + char * in; + char * out; + int n_threads; + int thread_id; + int debug; + int write; + ipcio_t * ipc; +}; + +/* global variables */ +int DEBUG = 0; +int cores[16] = {4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}; + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write) +{ + + if (write==0) { + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + + } + + if (write==1) { + + if 
(dada_hdu_unlock_write (in) < 0) + { + syslog(LOG_ERR, "could not unlock write on hdu_in"); + } + dada_hdu_destroy (in); + + } + +} + +void usage() +{ + fprintf (stdout, + "dsaX_reorder_raw [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -t number of threads [default 4]\n" + " -b connect to bf hdu\n" + " -i input key [default CAPTURED_BLOCK_KEY]\n" + " -o output key [default REORDER_BLOCK_KEY]\n" + " -q quitting after testing\n" + " -h print usage\n"); +} + +/* thread for data massaging */ +void * massage(void *args) { + + // basic stuff + struct data *d = args; + int thread_id = d->thread_id; + int na = 64; // output ants + int dbg = d->debug; + + // masks for fluffing + __m512i masks[4]; + masks[0] = _mm512_set_epi64(0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL); + masks[1] = _mm512_set_epi64(0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL); + masks[2] = _mm512_set_epi64(0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL); + masks[3] = _mm512_set_epi64(0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL); + + + // set affinity + const pthread_t pid = pthread_self(); + const int core_id = cores[thread_id]; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); + const int get_affinity = 
pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); + if (CPU_ISSET(core_id, &cpuset)) + if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: successfully set thread",thread_id); + + // extract from input data structure + char *in = (char *)d->in; + char *out = (char *)d->out; + int nthreads = d->n_threads; + + /* DO ALL PROCESSING + + "in" is input block: NPACKETS * NANTS * (384*2) * 2 pol * r/i. (384*2 is for the two times) + "out" needs to be in order NPACKETS * (384*2) * 64 * 2 pol * r/i + parallelize by splitting on NPACKETS axis. + + */ + + // input and output index and extracted data + int idx = thread_id; // PACKET idx for input and output + char * proc_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2); // for 4-bit data + char * fluffed_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2*2); // for 8-bit data + char * out_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*(384*2)*na*2*2); // for output 8-bit data + + // extract data + memcpy(proc_data,in+idx*(NPACKETS/nthreads)*NANTS*(384*2)*2,(NPACKETS/nthreads)*NANTS*(384*2)*2); + if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: extracted data",thread_id); + + // do fluffing + + /* + technique is to use nybble masks to + (a) unmask every fourth nybble + (b) bit shift to left using mm512_slli_epi16 + (c) sign extend by 4 bits using mm512_srai_epi16 + (d) bit shift to right + + Will produce m512 for lower and upper bytes. 
Then just need to copy into fluffed_data + + */ + + // variables + char * low = (char *)malloc(sizeof(char)*64); // m512 + char * hi = (char *)malloc(sizeof(char)*64); // m512 + __m512i low_m, hi_m; + unsigned short * low_u = (unsigned short *)(low); + unsigned short * hi_u = (unsigned short *)(hi); + __m512i v[4]; // for 4 packed 4-bit numbers + + // input and output + __m512i proc_m; + unsigned short * fluffed_u = (unsigned short *)(fluffed_data); + + // numbers to iterate over + int n_512 = (NPACKETS/nthreads)*NANTS*(384*2)*2/64; + + if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to fluff",thread_id); + + // let's do it! + for (int i=0;iwrite) + memcpy (d->ipc->curbuf + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2); + else + memcpy(out + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2); + + if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: done - freeing",thread_id); + + // free stuff + free(proc_data); + free(fluffed_data); + free(out_data); + free(low); + free(hi); + + /* return 0 */ + int thread_result = 0; + pthread_exit((void *) &thread_result); + +} + + +// MAIN + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_reorder_raw", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + // TESTING and initialization + // threads + struct data args[16]; + pthread_t threads[16]; + pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + void* result=0; + + // run test with single thread + + syslog(LOG_INFO,"Running TEST...\n"); + + // set up data structure + char * test_block = (char *)malloc(sizeof(char)*NPACKETS*NANTS*(384*2)*2); + char * test_output = (char *)malloc(sizeof(char)*NPACKETS*64*(384*2)*2*2); + memset(test_block,0,sizeof(test_block)); + + /* TEST CODE + FILE *fin; + fin=fopen("../utils/packet.out","rb"); + 
fread(test_block, 96768, 1, fin); + fclose(fin); + END TEST CODE */ + + args[0].in = test_block; + args[0].out = test_output; + args[0].n_threads = 1; + args[0].thread_id = 0; + args[0].debug = 0; + args[0].write = 0; + + // run test thread + if (pthread_create(&threads[0], &attr, &massage, (void *)(&args[0]))) { + syslog(LOG_ERR,"Failed to create TEST massage thread 0\n"); + } + else + syslog(LOG_INFO,"Created TEST thread\n"); + pthread_attr_destroy(&attr); + pthread_join(threads[0], &result); + syslog(LOG_INFO,"joined TEST thread"); + + /* TEST CODE + fin=fopen("../utils/test.out","wb"); + fwrite(test_output, 1, 196608, fin); + fclose(fin); + END TEST CODE */ + + // clean up + free(test_block); + free(test_output); + + syslog(LOG_INFO,"TEST COMPLETE"); + + /* DADA Header plus Data Unit */ + dada_hdu_t* hdu_in = 0; + dada_hdu_t* hdu_out = 0; + dada_hdu_t* hdu_out2 = 0; + + // data block HDU keys + key_t in_key = CAPTURED_BLOCK_KEY; + key_t out_key = REORDER_BLOCK_KEY; + key_t out_key2 = REORDER_BLOCK_KEY2; + + // command line arguments + int core = -1; + int nthreads = 1; + int bf = 0; + int arg = 0; + + while ((arg=getopt(argc,argv,"c:t:i:o:dbqh")) != -1) + { + switch (arg) + { + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 't': + if (optarg) + { + nthreads = atoi(optarg); + break; + } + else + { + 
syslog(LOG_ERR,"-t flag requires argument"); + usage(); + return EXIT_FAILURE; + } + + case 'd': + DEBUG=1; + syslog (LOG_INFO, "Will excrete all debug messages"); + break; + + case 'q': + syslog (LOG_INFO, "Quit here"); + return EXIT_SUCCESS; + + case 'b': + bf=1; + syslog (LOG_INFO, "Will write to bf dada hdu"); + break; + + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + + // Bind to cpu core + if (core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + if (bf) { + hdu_out2 = dada_hdu_create (); + dada_hdu_set_key (hdu_out2, out_key2); + if (dada_hdu_connect (hdu_out2) < 0) { + syslog (LOG_ERR,"could not connect to output buffer2"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out2) < 0) { + syslog (LOG_ERR, "could not lock to output buffer2"); + return EXIT_FAILURE; + } + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + + + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared 
(hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + + if (bf) { + header_out = ipcbuf_get_next_write (hdu_out2->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header2 block [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block2 filled [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + } + + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) 
hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + uint64_t bytes_read = 0; + char * block, * output_buffer, * blockie; + output_buffer = (char *)malloc(sizeof(char)*block_out); + memset(output_buffer,0,block_out); + uint64_t written, block_id; + + // set up + + int observation_complete=0; + int blocks = 0; + int started = 0; + + + + syslog(LOG_INFO, "starting observation"); + + while (!observation_complete) { + + // open block + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + } + + // DO STUFF + + // sort out write + hdu_out->data_block->curbuf = ipcbuf_get_next_write ((ipcbuf_t*)hdu_out->data_block); + hdu_out->data_block->marked_filled = 0; + //blockie = ipcio_open_block_write(hdu_out->data_block, &block_id); + + // set up data structure + for (int i=0; idata_block; + args[i].write = 1; + } + + if (DEBUG) syslog(LOG_INFO,"creating %d threads",nthreads); + + for(int i=0; idata_block, output_buffer, block_out); + + if (bf) { + + written = ipcio_write (hdu_out2->data_block, output_buffer, block_out); + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + + } + + // finish write + ipcbuf_mark_filled ((ipcbuf_t*)hdu_out->data_block, block_out); + ipcio_check_pending_sod (hdu_out->data_block); + hdu_out->data_block->marked_filled = 1; + //ipcio_close_block_write(hdu_out->data_block, block_out); + + if (DEBUG) syslog(LOG_INFO, "written block %d",blocks); + blocks++; + + + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + free(output_buffer); + + dsaX_dbgpu_cleanup (hdu_in,0); + 
dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + +} + + diff --git a/legacy/dsaX_reorder_raw.c.bak2 b/legacy/dsaX_reorder_raw.c.bak2 new file mode 100644 index 0000000..54ad886 --- /dev/null +++ b/legacy/dsaX_reorder_raw.c.bak2 @@ -0,0 +1,608 @@ +/* will reorder raw data for input to xgpu */ +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + +#include +#include +#include + +// data to pass to threads +struct data { + char * in; + char * out; + int n_threads; + int thread_id; + int debug; + int write; + ipcio_t * ipc; +}; + +/* global variables */ +int DEBUG = 0; +int cores[16] = {4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}; + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write) +{ + + if (write==0) { + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + + } + + if (write==1) { + + if (dada_hdu_unlock_write (in) < 0) + { + syslog(LOG_ERR, "could not unlock write on hdu_in"); + } + dada_hdu_destroy (in); + + } + +} + +void usage() +{ + fprintf (stdout, + "dsaX_reorder_raw [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -t number of threads [default 4]\n" + " -b connect to bf hdu\n" + " -i input key [default CAPTURED_BLOCK_KEY]\n" + " -o output key [default REORDER_BLOCK_KEY]\n" + " -q quitting after 
testing\n" + " -h print usage\n"); +} + +/* thread for data massaging */ +void * massage(void *args) { + + // basic stuff + struct data *d = args; + int thread_id = d->thread_id; + int na = 64; // output ants + int dbg = d->debug; + + // set affinity + const pthread_t pid = pthread_self(); + const int core_id = cores[thread_id]; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); + const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); + if (CPU_ISSET(core_id, &cpuset)) + if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: successfully set thread",thread_id); + + // extract from input data structure + char *in = (char *)d->in; + char *out = (char *)d->out; + int nthreads = d->n_threads; + + /* DO ALL PROCESSING + + "in" is input block: NPACKETS * NANTS * (384*2) * 2 pol * r/i. (384*2 is for the two times) + "out" needs to be in order NPACKETS * (384*2) * 64 * 2 pol * r/i + parallelize by splitting on NPACKETS axis. + + */ + + // input and output index and extracted data + int idx = thread_id; // PACKET idx for input and output + char * proc_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2); // for 4-bit data + //char * fluffed_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2*2); // for 8-bit data + char * out_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*(384*2)*na*2*2); // for output 8-bit data + + // extract data + memcpy(proc_data,in+idx*(NPACKETS/nthreads)*NANTS*(384*2)*2,(NPACKETS/nthreads)*NANTS*(384*2)*2); + if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: extracted data",thread_id); + + // do fluffing in dumbest possible way + + if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to fluff",thread_id); + + // let's do it! 
+ int in_idx, out_idx, a1, a2, a3, a4, a5, a6; + for (int i=0;i<(NPACKETS/nthreads);i++) { + a1 = i*NANTS*1536; + a2 = i*na*1536; + for (int j=0;j> 4; + out_data[2*out_idx+1] = proc_data[in_idx] >> 4; + + } + } + } + } + + /*for (int i=0;i<(NPACKETS/nthreads)*NANTS*(384*2)*2;i++) { // loop over chars in proc_data + + fluffed_data[2*i] = ((proc_data[i]<<4) & 240) >> 4; + fluffed_data[2*i+1] = proc_data[i] >> 4; + + }*/ + + if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: fluffed",thread_id); + + // transpose antennas and frequencies by ints + // from fluffed_data to out_data + /* int * fluffed_int = (int *)(fluffed_data); + memset(out_data,0,(NPACKETS/nthreads)*(384*2)*na*2*2); + int * out_int = (int *)out_data;*/ + + if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to transpose",thread_id); + + // do block transpose - https://codereview.stackexchange.com/questions/229876/fast-matrix-transpose + /* int tile_size = 3; // set by benchmarking + for (int i_packet=0;i_packetwrite) + memcpy (d->ipc->curbuf + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2); + else + memcpy(out + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2); + + if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: done - freeing",thread_id); + + // free stuff + free(proc_data); + //free(fluffed_data); + free(out_data); + + /* return 0 */ + int thread_result = 0; + pthread_exit((void *) &thread_result); + +} + + +// MAIN + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_reorder_raw", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + // TESTING and initialization + // threads + struct data args[16]; + pthread_t threads[16]; + pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + void* result=0; + + // run test with single thread + + syslog(LOG_INFO,"Running 
TEST...\n"); + + // set up data structure + char * test_block = (char *)malloc(sizeof(char)*NPACKETS*NANTS*(384*2)*2); + char * test_output = (char *)malloc(sizeof(char)*NPACKETS*64*(384*2)*2*2); + memset(test_block,0,sizeof(test_block)); + + /* TEST CODE + FILE *fin; + fin=fopen("../utils/packet.out","rb"); + fread(test_block, 96768, 1, fin); + fclose(fin); + END TEST CODE */ + + args[0].in = test_block; + args[0].out = test_output; + args[0].n_threads = 1; + args[0].thread_id = 0; + args[0].debug = 0; + args[0].write = 0; + + // run test thread + if (pthread_create(&threads[0], &attr, &massage, (void *)(&args[0]))) { + syslog(LOG_ERR,"Failed to create TEST massage thread 0\n"); + } + else + syslog(LOG_INFO,"Created TEST thread\n"); + pthread_attr_destroy(&attr); + pthread_join(threads[0], &result); + syslog(LOG_INFO,"joined TEST thread"); + + /* TEST CODE + fin=fopen("../utils/test.out","wb"); + fwrite(test_output, 1, 196608, fin); + fclose(fin); + END TEST CODE */ + + // clean up + free(test_block); + free(test_output); + + syslog(LOG_INFO,"TEST COMPLETE"); + + /* DADA Header plus Data Unit */ + dada_hdu_t* hdu_in = 0; + dada_hdu_t* hdu_out = 0; + dada_hdu_t* hdu_out2 = 0; + + // data block HDU keys + key_t in_key = CAPTURED_BLOCK_KEY; + key_t out_key = REORDER_BLOCK_KEY; + key_t out_key2 = REORDER_BLOCK_KEY2; + + // command line arguments + int core = -1; + int nthreads = 1; + int bf = 0; + int arg = 0; + + while ((arg=getopt(argc,argv,"c:t:i:o:dbqh")) != -1) + { + switch (arg) + { + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag 
requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 't': + if (optarg) + { + nthreads = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-t flag requires argument"); + usage(); + return EXIT_FAILURE; + } + + case 'd': + DEBUG=1; + syslog (LOG_INFO, "Will excrete all debug messages"); + break; + + case 'q': + syslog (LOG_INFO, "Quit here"); + return EXIT_SUCCESS; + + case 'b': + bf=1; + syslog (LOG_INFO, "Will write to bf dada hdu"); + break; + + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + + // Bind to cpu core + if (core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + if (bf) { + hdu_out2 = dada_hdu_create (); + dada_hdu_set_key (hdu_out2, out_key2); + if (dada_hdu_connect (hdu_out2) < 0) { + syslog (LOG_ERR,"could not connect to output buffer2"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out2) < 0) { + syslog (LOG_ERR, "could not lock to output buffer2"); + return EXIT_FAILURE; + } + } + + uint64_t header_size = 0; + + // deal with headers + char * 
header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + + + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + + if (bf) { + header_out = ipcbuf_get_next_write (hdu_out2->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header2 block [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block2 filled [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, 
hdu_out2); + return EXIT_FAILURE; + } + } + + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + uint64_t bytes_read = 0; + char * block, * output_buffer, * blockie; + output_buffer = (char *)malloc(sizeof(char)*block_out); + memset(output_buffer,0,block_out); + uint64_t written, block_id; + + // set up + + int observation_complete=0; + int blocks = 0; + int started = 0; + + + + syslog(LOG_INFO, "starting observation"); + + while (!observation_complete) { + + // open block + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + } + + // DO STUFF + + // sort out write + hdu_out->data_block->curbuf = ipcbuf_get_next_write ((ipcbuf_t*)hdu_out->data_block); + hdu_out->data_block->marked_filled = 0; + //blockie = ipcio_open_block_write(hdu_out->data_block, &block_id); + + // set up data structure + for (int i=0; idata_block; + args[i].write = 1; + } + + if (DEBUG) syslog(LOG_INFO,"creating %d threads",nthreads); + + for(int i=0; idata_block, output_buffer, block_out); + + if (bf) { + + written = ipcio_write (hdu_out2->data_block, output_buffer, block_out); + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + + } + + // finish write + ipcbuf_mark_filled ((ipcbuf_t*)hdu_out->data_block, block_out); + ipcio_check_pending_sod (hdu_out->data_block); + hdu_out->data_block->marked_filled = 1; + 
//ipcio_close_block_write(hdu_out->data_block, block_out); + + if (DEBUG) syslog(LOG_INFO, "written block %d",blocks); + blocks++; + + + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + free(output_buffer); + + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + +} + + diff --git a/legacy/dsaX_simplesplit.c b/legacy/dsaX_simplesplit.c new file mode 100644 index 0000000..7a80c7e --- /dev/null +++ b/legacy/dsaX_simplesplit.c @@ -0,0 +1,362 @@ +/* will reorder raw data for input to xgpu */ +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + +/* global variables */ +int DEBUG = 0; + + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write) +{ + + if (write==0) { + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + + } + + if (write==1) { + + if (dada_hdu_unlock_write (in) < 0) + { + syslog(LOG_ERR, "could not unlock write on hdu_in"); + } + dada_hdu_destroy (in); + + } + +} + +void usage() +{ + fprintf (stdout, + "dsaX_split [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -b connect to bf hdu\n" + " -i in_key [default CAPTURE_BLOCK_KEY]\n" + " -o out_key [default CAPTURED_BLOCK_KEY]\n" + " -j out_key2 [default 
REORDER_BLOCK_KEY2]\n" + " -h print usage\n"); +} + + +// MAIN + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_simplesplit", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + /* DADA Header plus Data Unit */ + dada_hdu_t* hdu_in = 0; + dada_hdu_t* hdu_out = 0; + dada_hdu_t* hdu_out2 = 0; + + // data block HDU keys + key_t in_key = CAPTURE_BLOCK_KEY; + key_t out_key = CAPTURED_BLOCK_KEY; + key_t out_key2 = REORDER_BLOCK_KEY2; + + // command line arguments + int core = -1; + int bf = 0; + int arg = 0; + + while ((arg=getopt(argc,argv,"c:i:o:j:dbh")) != -1) + { + switch (arg) + { + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'j': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key2) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-j flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'd': + DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + case 'b': + bf=1; + syslog (LOG_INFO, "Will write to bf dada hdu"); + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + + // Bind to cpu core + if (core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + 
syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (0); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + if (bf) { + hdu_out2 = dada_hdu_create (0); + dada_hdu_set_key (hdu_out2, out_key2); + if (dada_hdu_connect (hdu_out2) < 0) { + syslog (LOG_ERR,"could not connect to output buffer2"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out2) < 0) { + syslog (LOG_ERR, "could not lock to output buffer2"); + return EXIT_FAILURE; + } + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + + + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) 
dsaX_dbgpu_cleanup (hdu_out2,1); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + return EXIT_FAILURE; + } + + if (bf) { + header_out = ipcbuf_get_next_write (hdu_out2->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header2 block [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block2 filled [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + return EXIT_FAILURE; + } + } + + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); + uint64_t bytes_read = 0; + char * block, * output_buffer, * o1, * o2; + output_buffer = (char *)malloc(sizeof(char)*block_out); + char * output = (char *)malloc(sizeof(char)*block_out); + memset(output_buffer,0,block_out); + uint64_t written, block_id; + + + + // set up + + int observation_complete=0; + int blocks = 0; + int started = 0; + + + + syslog(LOG_INFO, "starting observation"); + + while (!observation_complete) { + + // open block + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + } + + 
+ // DO STUFF + + + // copy to output buffer + memcpy(output_buffer, block, block_size); + + // do write + written = ipcio_write (hdu_out->data_block, output_buffer, block_out); + if (bf) + written = ipcio_write (hdu_out2->data_block, output_buffer, block_out); + + if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks); + blocks++; + + + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + free(output_buffer); + free(output); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + +} + + diff --git a/legacy/dsaX_splice.c b/legacy/dsaX_splice.c new file mode 100644 index 0000000..b91e665 --- /dev/null +++ b/legacy/dsaX_splice.c @@ -0,0 +1,201 @@ +/* This works pretty much like the trigger code. receives a control UDP message +to store some data for a fixed amount of time. +Message format: length(s)-NAME +Will ignore messages until data recording is over +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "multilog.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + +#include +#include + + +FILE *output; + +void send_string(char *string) /* includefile */ +{ + int len; + len=strlen(string); + fwrite(&len, sizeof(int), 1, output); + fwrite(string, sizeof(char), len, output); +} + +void send_float(char *name,float floating_point) /* includefile */ +{ + send_string(name); + fwrite(&floating_point,sizeof(float),1,output); +} + +void send_double (char *name, double double_precision) /* includefile */ +{ + send_string(name); + fwrite(&double_precision,sizeof(double),1,output); +} + +void send_int(char *name, int integer) /* includefile */ +{ 
+ send_string(name); + fwrite(&integer,sizeof(int),1,output); +} + +void send_char(char *name, char integer) /* includefile */ +{ + send_string(name); + fwrite(&integer,sizeof(char),1,output); +} + + +void send_long(char *name, long integer) /* includefile */ +{ + send_string(name); + fwrite(&integer,sizeof(long),1,output); +} + +void send_coords(double raj, double dej, double az, double za) /*includefile*/ +{ + if ((raj != 0.0) || (raj != -1.0)) send_double("src_raj",raj); + if ((dej != 0.0) || (dej != -1.0)) send_double("src_dej",dej); + if ((az != 0.0) || (az != -1.0)) send_double("az_start",az); + if ((za != 0.0) || (za != -1.0)) send_double("za_start",za); +} + + +/* global variables */ +int quit_threads = 0; +int dump_pending = 0; +int trignum = 0; +int dumpnum = 0; +char iP[100]; +char srcnam[1024]; +float reclen; +int DEBUG = 0; + +void usage() +{ + fprintf (stdout, "dsaX_splice [16 files]\n"); +} + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_splice", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + // set up input array + // 16 corrs, 3840 times, 256 beams, 48 chans + char * bigarr = (char *)malloc(sizeof(char)*16*3840*256*48); + char foutnam[200]; + + // read into input array + FILE *fin; + for (int i=1;i<17;i++) { + fin=fopen(argv[i],"rb"); + fread(bigarr+(i-1)*3840*256*48,3840*256*48,1,fin); + fclose(fin); + } + + // reorder bigarr + char * tarr = (char *)malloc(sizeof(char)*16*3840*256*48); + int oidx, iidx; + // order is beam, time, freq + for (int i=0;i<16;i++) { + for (int j=0;j<3840;j++) { + for (int k=0;k<256;k++) { + + iidx = i*3840*256*48 + j*256*48 + k*48; + oidx = k*3840*768 + j*768 + i*48; + memcpy(tarr + oidx, bigarr + iidx, 48); + + } + } + } + free(bigarr); + + // loop over beams and write out all filterbanks + for (int i=0;i<256;i++) { + + sprintf(foutnam,"/home/ubuntu/data/fb_%d.fil",i); + + if (!(output = 
fopen(foutnam,"wb"))) { + printf("Couldn't open output file\n"); + return 0; + } + + send_string("HEADER_START"); + send_string("source_name"); + sprintf(srcnam,"fb_%d",i); + send_string(srcnam); + send_int("machine_id",1); + send_int("telescope_id",82); + send_int("data_type",1); // filterbank data + send_double("fch1",1498.75); // THIS IS CHANNEL 0 :) + send_double("foff",-0.244140625); + send_int("nchans",768); + send_int("nbits",8); + send_double("tstart",55000.0); + send_double("tsamp",8.192e-6*8.*16.); + send_int("nifs",1); + send_string("HEADER_END"); + + fwrite(tarr + i*2949120,2949120,1,output); + fclose(output); + + } + + // write out full filterbank + sprintf(foutnam,"/home/ubuntu/data/fb_all.fil"); + + if (!(output = fopen(foutnam,"wb"))) { + printf("Couldn't open output file\n"); + return 0; + } + + send_string("HEADER_START"); + send_string("source_name"); + sprintf(srcnam,"fb_all"); + send_string(srcnam); + send_int("machine_id",1); + send_int("telescope_id",82); + send_int("data_type",1); // filterbank data + send_double("fch1",1498.75); // THIS IS CHANNEL 0 :) + send_double("foff",-0.244140625); + send_int("nchans",768); + send_int("nbits",8); + send_double("tstart",55000.0); + send_double("tsamp",8.192e-6*8.*16.); + send_int("nifs",1); + send_string("HEADER_END"); + + fwrite(tarr,16*3840*256*48,1,output); + fclose(output); + + + free(tarr); + +} diff --git a/legacy/dsaX_split.c b/legacy/dsaX_split.c new file mode 100644 index 0000000..1361e86 --- /dev/null +++ b/legacy/dsaX_split.c @@ -0,0 +1,601 @@ +/* will reorder raw data for input to xgpu */ +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include 
"dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + +/* global variables */ +int DEBUG = 0; +int STATS = 0; +const int nth = 4; + +// data to pass to threads +struct data { + char * in; + char * out; + char * out2; + int bf; + int reorder; + int n_threads; + int thread_id; +}; +int cores[8] = {10, 11, 12, 13, 14, 15, 16, 17}; + + +void * massage (void *args) { + + struct data *d = args; + int thread_id = d->thread_id; + + // set affinity + const pthread_t pid = pthread_self(); + const int core_id = cores[thread_id]; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); + const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); + if (CPU_ISSET(core_id, &cpuset)) + if (DEBUG) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id); + + // extract from input + char *in = (char *)d->in; + int bf = d->bf; + int reorder = d->reorder; + int n_threads = d->n_threads; + + if (!reorder) { + memcpy(d->out + thread_id*(2048/n_threads)*1536*NANT, in + thread_id*(2048/n_threads)*1536*NANT, (2048/n_threads)*1536*NANT); + if (bf) + memcpy(d->out2 + thread_id*(2048/n_threads)*1536*NANT, in + thread_id*(2048/n_threads)*1536*NANT, (2048/n_threads)*1536*NANT); + } + else { + + // block for transpose + int block = 16; + + for (int i=(int)(thread_id*(2048/n_threads));i<(int)((thread_id + 1)*2048/n_threads);i++) { // over time + for (int i1 = 0; i1 < 48; i1 += block) { + for(int j = 0; j < NANT; j++) { + for(int b = 0; b < block && i1 + b < 48; b++) { + memcpy(d->out + i*1536*NANT + (i1+b)*NANT*32 + j*32, in + i*1536*NANT + j*1536 + (i1+b)*32, 32); + if (bf) memcpy(d->out2 + i*1536*NANT + (i1+b)*NANT*32 + j*32, in + i*1536*NANT + j*1536 + 
(i1+b)*32, 32); + } + } + } + } + + } + + /* return 0 */ + int thread_result = 0; + pthread_exit((void *) &thread_result); + +} + + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); +int dada_bind_thread_to_core (int core); +void reorder_block(char *block, char *output); +void calc_stats(char *block); + +// calculates rms for each pol from the first packet in each block. +// block has shape [2048 time, NANT antennas, 768 channels, 2 pol, r/i] +void calc_stats(char *input) { + + float rmss[NANT*2]; + int iidx; + for (int i=0;i> 4),2.); + rmss[ant*2+pol] += pow((float)(((char)((input[iidx] & 240))) >> 4),2.); + + } + } + } + + for (int i=0;i= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (0); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + if (bf) { + hdu_out2 = dada_hdu_create (0); + dada_hdu_set_key (hdu_out2, out_key2); + if (dada_hdu_connect (hdu_out2) < 0) { + syslog (LOG_ERR,"could not connect to output buffer2"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out2) < 0) { + syslog (LOG_ERR, "could not lock to output buffer2"); + return EXIT_FAILURE; + } + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + 
if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + + + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + return EXIT_FAILURE; + } + + if (bf) { + header_out = ipcbuf_get_next_write (hdu_out2->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header2 block [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block2 filled [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + return EXIT_FAILURE; + } + } + + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + 
uint64_t nints = block_size / block_out; + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); + uint64_t bytes_read = 0; + char * block, * output_buffer, * o1, * o2; + output_buffer = (char *)malloc(sizeof(char)*block_out); + char * output = (char *)malloc(sizeof(char)*block_out); + memset(output_buffer,0,block_out); + uint64_t written, block_id; + + // set up threads + struct data args[8]; + pthread_t threads[8]; + pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + void* result=0; + + // send through fake blocks + + if (fake>0) { + syslog(LOG_INFO,"sending %d fake blocks",fake); + for (int i=0;idata_block, &block_id); + memcpy(o1, output, block_out); + ipcio_close_block_write (hdu_out->data_block, block_out); + usleep(10000); + } + syslog(LOG_INFO,"Finished with fake blocks"); + } + + + + // set up + + int observation_complete=0; + int blocks = 0; + int started = 0; + + + + syslog(LOG_INFO, "starting observation"); + + while (!observation_complete) { + + // open block + + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + } + + + // DO STUFF + + for (int myint=0;myintdata_block, &block_id); + if (bf) o2 = ipcio_open_block_write (hdu_out2->data_block, &block_id); + } + + // stats + if (STATS) calc_stats(output_buffer); + + //if (reorder) { + + // set up data structure + for (int i=0; idata_block, output, block_out); + else + written = ipcio_write (hdu_out->data_block, output_buffer, block_out); + + if (bf) { + written = ipcio_write (hdu_out->data_block, output_buffer, block_out); + if (reorder) + written = ipcio_write (hdu_out2->data_block, output, block_out); + else + written = ipcio_write (hdu_out2->data_block, output_buffer, block_out); + } + } + else { + ipcio_close_block_write (hdu_out->data_block, block_out); + if (bf) ipcio_close_block_write 
(hdu_out2->data_block, block_out); + } + + if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks); + blocks++; + + + if (bytes_read < block_size) + observation_complete = 1; + + } + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + free(output_buffer); + free(output); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + +} + + diff --git a/legacy/dsaX_splitup.c b/legacy/dsaX_splitup.c new file mode 100644 index 0000000..32f055d --- /dev/null +++ b/legacy/dsaX_splitup.c @@ -0,0 +1,285 @@ +/* will reorder raw data for input to xgpu */ +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + +// global variables +int DEBUG = 0; + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) +{ + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + + if (dada_hdu_unlock_write (out) < 0) + { + syslog(LOG_ERR, "could not unlock write on hdu_out"); + } + dada_hdu_destroy (out); + +} + +void usage() +{ + fprintf (stdout, + "dsaX_fake [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -i in_key [default TEST_BLOCK_KEY]\n" + " -o out_key [default REORDER_BLOCK_KEY2]\n" + " -h print usage\n"); +} + +// MAIN + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_splitup", LOG_CONS | 
LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + /* DADA Header plus Data Unit */ + dada_hdu_t* hdu_in = 0; + dada_hdu_t* hdu_out = 0; + + // data block HDU keys + key_t in_key = TEST_BLOCK_KEY; + key_t out_key = REORDER_BLOCK_KEY2; + + // command line arguments + int core = -1; + int useZ = 1; + char fnam[100]; + int arg = 0; + + while ((arg=getopt(argc,argv,"c:f:i:o:dh")) != -1) + { + switch (arg) + { + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'd': + DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + // Bind to cpu core + if (core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (0); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 
0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); + uint64_t bytes_read = 0; + uint64_t nsplits = block_size/block_out; + char * block, * output_buffer; + output_buffer = (char *)malloc(sizeof(char)*block_out); + memset(output_buffer,0,block_out); + uint64_t written, block_id; + + + // set up + + int observation_complete=0; + int blocks = 0, started = 0; + + syslog(LOG_INFO, "starting observation"); + + while (!observation_complete) { + + // open block + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, 
&block_id); + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + } + + // DO STUFF + + // do multiple writes + + for (uint64_t i=0;idata_block, output_buffer, block_out); + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + if (DEBUG) { + syslog(LOG_DEBUG, "written block %d",blocks); + } + blocks++; + + } + + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + free(output_buffer); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + +} + + diff --git a/legacy/dsaX_store.c b/legacy/dsaX_store.c new file mode 100644 index 0000000..849c27c --- /dev/null +++ b/legacy/dsaX_store.c @@ -0,0 +1,218 @@ +/* Code to read from a raw data buffer and write to disk */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + +void dsaX_dbgpu_cleanup (dada_hdu_t * in); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * in) +{ + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + +} + +void usage() +{ + fprintf (stdout, + "dsaX_dbdisk [options]\n" + " -c core bind process to CPU core\n" + " -k in_key [default fafa]\n" + " -h print usage\n"); +} + + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_store", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + + /* DADA Header plus Data Unit */ + dada_hdu_t* 
hdu_in = 0; + + // input data block HDU key + key_t in_key = 0x0000fafa; + + // command line arguments + uint64_t blocksize; + uint64_t bout = 32*NSNAPS*4608; // output block size - assume input is a multiple. + int core = -1; + int arg=0; + + while ((arg=getopt(argc,argv,"c:k:h")) != -1) + { + switch (arg) + { + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + printf ("ERROR: -c flag requires argument\n"); + return EXIT_FAILURE; + } + case 'k': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-k flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + // DADA stuff + + // open connection to the in/read DB + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to input buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"dsaX_correlator_copy: could not lock to input buffer"); + return EXIT_FAILURE; + } + + // Bind to cpu core + if (core >= 0) + { + syslog(LOG_INFO,"binding to core %d", core); + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"dsaX_correlator_copy: failed to bind to core %d",core); + } + + // more DADA stuff - deal with headers + + uint64_t header_size = 0; + + // read the header from the input HDU + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "main: could not read next header"); + dsaX_dbgpu_cleanup (hdu_in); + return EXIT_FAILURE; + } + + // mark the input header as cleared + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared [input]"); + dsaX_dbgpu_cleanup (hdu_in); + return EXIT_FAILURE; + } + + int observation_complete=0; + + // stuff for 
writing data + blocksize = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + char * cpbuf = (char *)malloc(sizeof(char)*blocksize); + char * outbuf = (char *)malloc(sizeof(char)*bout); + int ngulps = (int)(blocksize/bout); + int gulp = 0, wseq = 0;; + char *in_data; + uint64_t written=0, written2=0; + uint64_t block_id, bytes_read=0; + FILE *fout; + char fnam[100]; + + + syslog(LOG_INFO, "have ngulps %d, blocksize %lu, bout %lu",ngulps,blocksize,bout); + + + // main reading loop + + syslog(LOG_INFO, "main: starting read"); + + while (!observation_complete) { + + // read a DADA block + in_data = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + // copy + memcpy(cpbuf, in_data, blocksize); + syslog(LOG_INFO, "starting new write (seq %d)",wseq); + + // open file for writing + sprintf(fnam,"/home/ubuntu/data/fl_%d.out",wseq); + fout = fopen(fnam,"wb"); + for (gulp=0;gulpdata_block, bytes_read); + + } + + free(cpbuf); + free(outbuf); + dsaX_dbgpu_cleanup (hdu_in); + +} + diff --git a/legacy/dsaX_testdada.c b/legacy/dsaX_testdada.c new file mode 100644 index 0000000..bbe7640 --- /dev/null +++ b/legacy/dsaX_testdada.c @@ -0,0 +1,161 @@ +/* will reorder raw data for input to xgpu */ +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" +#include "xgpu.h" + +// print fn +void print_arr(char *ptr, int len) { + printf("\n["); + for (int i = 0; i < len; i++) { + printf(" %08x,", ptr[i]); + } + printf(" ]\n"); +} + +// read and write functions + +int write_block(dada_hdu_t* hdu_in) { + + dada_hdu_lock_write(hdu_in); + 
uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + char * data = (char *)malloc(sizeof(char)*block_size); + memset(data, 0, block_size); + ipcio_write (hdu_in->data_block, data, block_size); + free(data); + dada_hdu_unlock_write (hdu_in); + +} + +int read_block(dada_hdu_t* hdu_in) { + + dada_hdu_lock_read(hdu_in); + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + char * data = (char *)malloc(sizeof(char)*block_size); + char * block; + uint64_t bytes_read, block_id; + + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + memcpy(data, block, bytes_read); + print_arr(data, (int)(bytes_read)); + + free(data); + ipcio_close_block_read (hdu_in->data_block, bytes_read); + dada_hdu_unlock_read (hdu_in); + +} + + + +// MAIN + +int main (int argc, char *argv[]) { + + /* DADA Header plus Data Unit */ + dada_hdu_t* hdu_in = 0; + + // data block HDU keys + key_t in_key = TEST_BLOCK_KEY; + + // command line arguments + int arg = 0; + char *hout; + hout = (char *)malloc(sizeof(char)*4096); + + + while ((arg=getopt(argc,argv,"i:h:")) != -1) + { + switch (arg) + { + case 'i': + if (optarg) + { + sscanf (optarg, "%x", &in_key); + break; + } + case 'h': + if (optarg) + { + fileread (optarg, hout, 4096); + break; + } + } + } + + // DADA stuff + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + dada_hdu_connect (hdu_in); + + /* + // deal with header + dada_hdu_lock_write(hdu_in); + char * header_out = ipcbuf_get_next_write (hdu_in->header_block); + memcpy (header_out, hout, 4096); + ipcbuf_mark_filled (hdu_in->header_block, 4096); + dada_hdu_unlock_write(hdu_in); + free(hout); + + dada_hdu_lock_read(hdu_in); + uint64_t header_size = 0; + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + ipcbuf_mark_cleared (hdu_in->header_block); + dada_hdu_unlock_read(hdu_in); + */ + + // do four reads and four writes + + while (1) { + + printf("writing four blocks... 
"); + for (int i=0;i<4;i++) { + write_block(hdu_in); + sleep(0.5); + } + printf("written\n"); + + sleep(2); + + printf("reading four blocks... "); + for (int i=0;i<4;i++) { + read_block(hdu_in); + sleep(0.5); + } + printf("read\n"); + + } + +} + + diff --git a/legacy/dsaX_trigger.c b/legacy/dsaX_trigger.c new file mode 100644 index 0000000..9592389 --- /dev/null +++ b/legacy/dsaX_trigger.c @@ -0,0 +1,585 @@ +/* Code to read from a single dada buffer, and write to disk upon receiving +a trigger. Uses pthread threads and shared memory to listen. +Sequence of events: + - starts null-reading dump buffer, while listening for socket command + + for N second dump, assume N-second dada blocks + - receives time-since-start, which is converted into a block_start, byte_start, and block_end and byte_end. Sets dump pending, during which time no commands can be accepted. + - Upon seeing dump_pending, read code copies data to output dada buffer, which is plugged into dbdisk. Unsets dump_pending. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dsaX_capture.h" +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_def.h" + +// data to pass to threads +struct cdata { + char * in; + dada_hdu_t * hdu_out; +}; + + +/* global variables */ +int quit_threads = 0; +int dump_pending = 0; +uint64_t specnum = 0; +uint64_t procnum = 0; +int trignum = 0; +int dumpnum = 0; +char iP[100]; +char footer_buf[1024]; +int DEBUG = 0; +volatile int docopy = 0; +volatile int dumping = 0; + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) +{ + + if (dada_hdu_unlock_read (in) 
< 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + + if (dada_hdu_unlock_write (out) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_out"); + } + dada_hdu_destroy (out); + + + +} + +void usage() +{ + fprintf (stdout, + "dsaX_correlator_trigger [options]\n" + " -c core bind process to CPU core\n" + " -i IP to listen to [no default]\n" + " -j in_key [default eaea]\n" + " -o out_key [default fafa]\n" + " -d debug\n" + " -f full_pct [default 0.8]\n" + " -n output file name [no default]\n" + " -s skip N blocks [default 0]\n" + " -h print usage\n"); +} + +// thread to control writing of data to buffer + +void copy_thread (void * arg) { + + struct cdata *d = arg; + char *in = (char *)d->in; + dada_hdu_t * hdu_out = (dada_hdu_t *)d->hdu_out; + + uint64_t written = 0; + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO,"in thread... blocksize %"PRIu64"",block_size); + + while (1) { + + while (docopy==0) usleep(100); + + written = ipcio_write (hdu_out->data_block, in, block_size); + + dumping = 0; + dump_pending = 0; + docopy=0; + + syslog(LOG_INFO,"Finished writing trigger"); + + } + + /* return 0 */ + int thread_result = 0; + pthread_exit((void *) &thread_result); + + +} + +// Thread to control the dumping of data + +void control_thread (void * arg) { + + udpdb_t * ctx = (udpdb_t *) arg; + syslog(LOG_INFO, "control_thread: starting"); + + // port on which to listen for control commands + int port = ctx->control_port; + + // buffer for incoming command strings, and setup of socket + int bufsize = 1024; + char* buffer = (char *) malloc (sizeof(char) * bufsize); + char* tbuf = (char *) malloc (sizeof(char) * bufsize); + memset(buffer, '\0', bufsize); + const char* whitespace = " "; + char * command = 0; + char * args = 0; + + struct addrinfo hints; + struct addrinfo* res=0; + memset(&hints,0,sizeof(hints)); + struct sockaddr_storage src_addr; + socklen_t 
src_addr_len=sizeof(src_addr); + hints.ai_family=AF_INET; + hints.ai_socktype=SOCK_DGRAM; + getaddrinfo(iP,"11227",&hints,&res); + int fd; + ssize_t ct; + char tmpstr; + char cmpstr = 'p'; + char *endptr; + uint64_t tmps; + char * token; + + syslog(LOG_INFO, "control_thread: created socket on port %d", port); + + while (!quit_threads) { + + fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol); + bind(fd,res->ai_addr,res->ai_addrlen); + memset(buffer,'\0',sizeof(buffer)); + syslog(LOG_INFO, "control_thread: waiting for packet"); + ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len); + + syslog(LOG_INFO, "control_thread: received buffer string %s",buffer); + memset(tbuf,0,bufsize); + strcpy(tbuf,buffer); + trignum++; + + // interpret buffer string + char * rest = buffer; + tmps = (uint64_t)(strtoull(strtok_r(rest, "-", &rest),&endptr,0)); + + if (!dump_pending) { + //specnum = (uint64_t)(strtoull(buffer,&endptr,0)*16); + specnum = tmps; + strcpy(footer_buf,tbuf); + syslog(LOG_INFO, "control_thread: received command to dump at %lu",specnum); + } + + if (dump_pending) + syslog(LOG_ERR, "control_thread: BACKED UP - CANNOT dump at %lu",tmps); + + if (!dump_pending) dump_pending = 1; + + close(fd); + + } + + free (buffer); + free (tbuf); + + if (ctx->verbose) + syslog(LOG_INFO, "control_thread: exiting"); + + /* return 0 */ + int thread_result = 0; + pthread_exit((void *) &thread_result); + +} + + + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_trigger", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + + /* DADA Header plus Data Unit */ + dada_hdu_t* hdu_in = 0; + dada_hdu_t* hdu_out = 0; + + /* port for control commands */ + int control_port = TRIGGER_CONTROL_PORT; + + /* actual struct with info */ + udpdb_t udpdb; + + // input data block HDU key + key_t in_key = 0x0000eaea; + key_t out_key = 0x0000fafa; + + // 
command line arguments + int core = -1; + float full_pct = 0.8; + int arg=0; + int skips = 0; + + while ((arg=getopt(argc,argv,"i:c:j:o:f:d:s:h")) != -1) + { + switch (arg) + { + case 'i': + strcpy(iP,optarg); + break; + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog (LOG_ERR,"ERROR: -c flag requires argument\n"); + return EXIT_FAILURE; + } + case 'f': + if (optarg) + { + full_pct = atof(optarg); + syslog(LOG_INFO,"Using full_pct %f",full_pct); + break; + } + else + { + syslog (LOG_ERR,"ERROR: -f flag requires argument\n"); + return EXIT_FAILURE; + } + case 's': + if (optarg) + { + skips = atoi(optarg); + break; + } + else + { + syslog (LOG_ERR,"ERROR: -s flag requires argument\n"); + return EXIT_FAILURE; + } + case 'd': + DEBUG=1; + syslog (LOG_INFO, "Will excrete all debug messages"); + break; + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'j': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-j flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + // DADA stuff + + udpdb.verbose = DEBUG; + udpdb.control_port = control_port; + + // start control thread + int rval = 0; + pthread_t control_thread_id; + syslog(LOG_INFO, "starting control_thread()"); + rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb); + if (rval != 0) { + syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval)); + return -1; + } + + + syslog (LOG_INFO, "creating hdus"); + + // open connection to the in/read DBs + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, 
in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (0); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output dada buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + syslog (LOG_ERR,"could not lock4 to eada buffer"); + return EXIT_FAILURE; + } + + // Bind to cpu core + if (core >= 0) + { + syslog(LOG_INFO,"binding to core %d", core); + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + } + + int observation_complete=0; + + // more DADA stuff - deal with headers + + uint64_t header_size = 0; + + // read the header from the input HDU + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "main: could not read next header"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + // now write the output DADA header + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + // copy the in header to the out header + memcpy (header_out, header_in, header_size); + + // mark the input header as cleared + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared [input]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + // mark the output header buffer as filled + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return 
EXIT_FAILURE; + } + + // stuff for writing data + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + uint64_t specs_per_block = 2048; + uint64_t specs_per_out = 2048*NOUTBLOCKS; + uint64_t current_specnum = 0; // updates with each dada block read + uint64_t start_byte, bytes_to_copy, bytes_copied=0; + char * out_data = (char *)malloc(sizeof(char)*block_out); + char * in_data; + uint64_t written=0; + uint64_t block_id, bytes_read=0; + FILE *ofile; + ofile = fopen("/home/ubuntu/data/dumps.dat","w"); + fprintf(ofile,"starting...\n"); + fclose(ofile); + + + // thread for copying data + struct cdata cstruct; + cstruct.in = out_data; + cstruct.hdu_out = hdu_out; + rval = 0; + pthread_t copy_thread_id; + syslog(LOG_INFO, "starting copy_thread()"); + rval = pthread_create (©_thread_id, 0, (void *) copy_thread, (void *) &cstruct); + if (rval != 0) { + syslog(LOG_ERR, "Error creating copy_thread: %s", strerror(rval)); + return -1; + } + + + // main reading loop + float pc_full = 0.; + int block_count = 0; + syslog(LOG_INFO, "main: starting observation"); + + while (!observation_complete) { + + // read a DADA block + in_data = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + + // add delay + // only proceed if input data block is 80% full + while (pc_full < full_pct) { + pc_full = ipcio_percent_full(hdu_in->data_block); + usleep(100); + } + pc_full = 0.; + + + // check for dump_pending + if (dump_pending) { + + // look after hand trigger + if (specnum==0) { + + specnum = current_specnum + 100; + + } + + // if this is the first block to dump + if (specnum >= current_specnum && specnum < current_specnum+specs_per_block) { + + dumping = 1; + + // find start byte and bytes to copy + start_byte = 4608*NSNAPS*(specnum-current_specnum); + bytes_to_copy = block_size-start_byte; + + // do copy + memcpy(out_data, in_data+start_byte, bytes_to_copy); + //written = 
ipcio_write (hdu_out->data_block, in_data+start_byte, bytes_to_copy); + bytes_copied = bytes_to_copy; + + } + + // if this is one of the middle blocks to dump from + if (specnum < current_specnum && specnum + specs_per_out > current_specnum + specs_per_block && dumping==1) { + + // do copy + memcpy(out_data + bytes_copied, in_data, block_size); + //written = ipcio_write (hdu_out->data_block, in_data, block_size); + bytes_copied += block_size; + + } + + // if this is the last block to dump from + if (specnum + specs_per_out > current_specnum && specnum + specs_per_out <= current_specnum + specs_per_block && dumping==1) { + + // find start byte and bytes to copy + bytes_to_copy = block_out-bytes_copied; + + // do copy + memcpy(out_data+bytes_copied, in_data, bytes_to_copy); + //written = ipcio_write (hdu_out->data_block, in_data, bytes_to_copy); + + // DO THE WRITING + /*written = ipcio_write (hdu_out->data_block, out_data, block_out); + + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + */ + + // DO writing using thread + docopy = 1; + + syslog(LOG_INFO, "written trigger from specnum %lu TRIGNUM%d DUMPNUM%d %s", specnum, trignum-1, dumpnum, footer_buf); + ofile = fopen("/home/ubuntu/data/dumps.dat","a"); + fprintf(ofile,"written trigger from specnum %lu TRIGNUM%d DUMPNUM%d %s\n", specnum, trignum-1, dumpnum, footer_buf); + fclose(ofile); + + dumpnum++; + + // reset + bytes_copied = 0; + + } + + // if trigger arrived too late + if (specnum < current_specnum-specs_per_block && dumping==0 && dump_pending==1) { + syslog(LOG_INFO, "trigger arrived too late: specnum %lu, current_specnum %lu",specnum,current_specnum); + + bytes_copied=0; + dump_pending=0; + + } + + + } + + // update current spec + syslog(LOG_INFO,"current_specnum %lu",current_specnum); + if (block_count < skips) { + block_count++; + } + else + current_specnum += specs_per_block; + + + 
// for exiting + if (bytes_read < block_size) { + observation_complete = 1; + syslog(LOG_INFO, "main: finished, with bytes_read %lu < expected %lu\n", bytes_read, block_size); + } + + // close block for reading + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + + } + + + // close threads + syslog(LOG_INFO, "joining control_thread"); + quit_threads = 1; + void* result=0; + pthread_join (control_thread_id, &result); + result=0; + pthread_join (copy_thread_id, &result); + + free(out_data); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + +} diff --git a/legacy/dsaX_wrangle b/legacy/dsaX_wrangle new file mode 100755 index 0000000000000000000000000000000000000000..f839b14c334758201c3b8885fb58a899eb6e804d GIT binary patch literal 99600 zcmeEvdtg-6@&C<>gaFxypixm*1q})q)L2l^L|NTvz!VAMgNBf7NHipAvVmZw21Af_ z-Ab)m>J#guzG_h`B1H)ZBwDMnT8&CID(bGG8WlBKbbp`EoO2%=vS{1yAHP3b*gf~m z%$YMYXU?40y*F2S3eQSRNHEM#l5v{BT-%`vlBW@wSJ*Up#w25~k!OrHjxq)TpNy|b z*FWvk#mqDfwEQID%W{(vncF)ZrND-nnzb`c(YS$<`~Ie&W@;F20k9lpP1_+^q;9z| z(<};uPnN~pEqRK^a#^}umM&+ehL+zpKb-~~KW0i=ev01rm-Bk{ddcpU zj?jc=YL@#c%2CcA`pMLUGj)CO?9#`E!pBU_b~wJeYW~S59ba8OzPhTWv1xqMq?5;= zeA0x5+6gC!-sDf(vu6~uxU{!vD#|=U!-ya9MfW=oWcI!=dG3h5509Gh@C(akf5B>} z;A^&6gah&aTO9hiap=?H(7zG~zA6rU zS{(S&R_Wy9G9>_mO#*uSg9QscIKe9h_hLExU}IN-9dL zs!2_FU8BFOvJ{1V)xI+DuUuH}Hz08}1T`A`_2rFqC|lal;H&rRg8q7cEz0}rt096_ z-6CBd1*@)IKs^34DQl=BUBi+Fz(p142GNOw0t84E8HIZ(c-w4|b{rnI{1ax`g)Hn#{3cH{%0mAOkw*m!8QQdnkTX;lqP zU0Y{B&$3!&c(!3hz0YTyU086&nI$JoIAKDrb$z0B^%Lvr#0e)^*Awl_lOW21g|#)h zvJ&`cqWt5`MqTkY8Q&xgH?N3K!K^bCDZl(T2;XE)X_xMQ#TlRzmL}yA0Mp$GDgPM; z=Vxww8xO7;%-PvF=&$@FphOu`bAGsgRn;)|uGzp7cA(@_jd?onKZxZA8JFt(4Zpq# zLZldrbbf@;CmYLjUe%Xq)o0GX&HmeV7^Og6=KNe`jb;r`>vjvoq&M?f{oqaORKd=- z6+JCuP3>{r<$&+6NaeTN0UzmrhaK<}9q?WU{3i~0p96lH18&AK%Ho8Z@sM!Nam`Q8 zTW+~b#V^~<{7iDdVOr~x=YZ>}yWqSIxVdH^qR0Vvt{>(&;H=yH%yYoaHXyFj0Z%uH zk=8ljLmcoX2Yjdl-t2(G@vP5E2Ry@q8pbLI+~t6;cEC+vCVGtn?i|nVbHJVB)>;Rg 
zYZmjf-T^^a=;IEz`GsrLmcq113tj2i;F%8i5e|5k1Ae3f?smYBa=^14@X-!Klt!@6qXs5SjYuYmwMKt&Fzq*a+m@Dw>Iy?_o%vTa0>5k$vrk;6* z#z?2+FJ+z~G14aa`OGsEM%GIHBIX$aBWol-lX-^S$STR7!#qP_q*?N3FwgD~sgwLE z%ro>w=1Kkp<}n5uks`?-%X~WXd6GYpd4|47j^qz!o*^&dmi%z$8R{aLl22!zAueJ_ zK9PBbwn*=H0F3$OIpi7ABHfbzjCqE#NT=jKVxA!^(kA(LndcA}Su6QBm}e-9tdacR zndeX!Sta@Bm}f|fG)sO9^9*H?I>~Qfo*^tUPx6m4&(IYqlKcbAGh{{bB>#Kn8LA>V zlE0aGhNy^J^4BxZ&=kp({58xU#=IfzQZhiF8W-Qsx7{RKbHAC$sfu5am?pP{$S=AA|h_d4`-gCA(AQibmkcnB8KD>nP(`7^zM=N ze-?R$fJnFGKVzOrqqBT5oXbL>SQ93MQ{6qnhBX(P`dL?ja}ykq-q?OJ9P2r8hWiFzG@glLey~ zGHiqkK(n4KdnQA^;M8|uP~o=}{0ak~^{jOr4!*-)>R|b)wC0AbAQu+lAy8prn>To^ zSgG85;!rW0tMyb!>s_{$R=s#`{w4Wy^M96K@(cvXUuNpVs57yxJ>?E?7M-4kaU-vT zV}Z;|x|-VXa07)EOw%01s~7_OCJ#8ralR%`G-1^g1g;Nej`<`o3C zlE;(C7NQYzOdefAd^d;*-ms87Tq{H58T>`27yeP{g*#M210y^eT>>M5H?R%0kA>8( zai zmUcrAS|I}rHW|h<2KA}~QzIONB%6%O6Hsg$$@9WX00FCcSAk<-oA5!Gp$Op@1T}4R z)3m@9qfY{lH1hBSzQQF#fH6Jr0hwg8OV#np&8u==dA*_Kqnkm)%%q;57DE24g5WRH z3IiV(2EGZOQ2@~=x4W+HNI>73dfIk3Y}|T3TPIi!X3$&pq!v`0P#E|iJaIa3rsd5sYg-fSJ+ic!B1RNBLmgaa1PVm1Rk83G zC!Xwp-;G*(*xQ?~G&`imya5cNNy}C@9BgX!6*1A|DNBwA=$oOD4Jh8T2^Y(Gu?`o{ zLH6l^-L~+kZ3TFu1=)3j5Kw7SsW^O#f~KzGbQe;9Q0@zWLx4|^gclLi+Ex%~cP+0# z+p^i=U%62r3j>TFqrYMe(eTDEqfu!F)k4EYL7ENa&{^b>ino1p5nMDppB8#ehJ%qE z;G+aqAus$7u#3E{khk?mt|#&VAcKXBG}TiMu?!Azj}fl1 zsB$j_l?)tCf`XQ<3E{CI+QhILzT*qHS<9aE#(N@Ns%ab1Jhr7HzmZ_yKCKnhuu@A~ zf_JKT+Ai(Qb74F>8xkXODS5z7dEukMxG?Y~+ouO^FtIK2S6BzCT?6yk{9$L>H?kDf zgzHhJV%5Zd6$D=TnK$sRw`FH<(X7Hy${S1AZo|qXnK}wB7ajTWX1B51b@()C(Xm_H zMzJ@Pas*QEGYDyBR`F|1vRTzEQc zANE3p?J2)QK7ryi8~A+w3|K-RlFyA)fI{U9Lmb~*cD7+25UuYcno}IkZVCe5=Fi0R z<_?afo{4|=2Ij734s%&=ps{2Ar{8*ONA6mR2J&oW6#^K42wDJ?7~`7>3~cd+ya^NA zOnoMA@n=grPwr^^+}qN3x+gSictT^>w!Xy|=U-$F>vIc2^ZTC351o`)Sk}1J8(i8E z8nq}tu%lq}2Z{NC*Yh{;99)?2RsQC8Qm2>gI4`vEHVQQ(lzbbjESwxYH{?$-8b%hP zAh18dejA|QxvqK}Xb*)k=09?UH#2#K$oz)mBF zeY4-$dk~xFB>&3WvOQC>g_E~8zPZe^wHXye?t-6helHCi)=mo*9s<^i4cZ-i25CNL z?}^9g4U^GDU}XBo)j7t{q=LYw-ev7M$-|PsXxL)(;)0O3voP?2H?+(!cJJ{9K2!8S 
z8{QNnEfKwW3q+jLZ0z|iy6u|_H>1)sdVy|E?)w&iInBuq#BT58&5h4^TQ+Awv$O&@ z$GtG6yI|R7wrt_#_QvZzm!5_BcHmoY0IstuE&NECj42m;F}|#sOTT+NT;>zlfm{|N zfAQSN3Sc1Ia{2~3;>aTwA#x?*S^@=P#OluwJ1p`GNN)9K(7)8!FQ!z2z9*>kaANH99`*Z{vw+4#a+C4jn^R|1! zOxr!(D%r^d5_Kp8)ogS_burLkKuANvK*vyp=RP$J^fNRv1nBTm`2dUoVS+}mcawU|w2l+mds_Ujh*%#UH`Ui#aSy!W1cA{6#7bPTz_ikss)I zlAI9S|HjcH&@PIFSAK#Sl4l1SNeVXivAQ;CaPWK*Xw^I&XdW7v5@xb_4#zXO9YWIjCA~zFFnTmG?r>#ncM}fu6%;5B$-p%hMdPAeqsU;!`br?)7 zlgw&R#Jl-jdR`Fou_fn)(@kAB_u9IG?=137N$#&OnL^BE24?QEuNhTw$hV5 zJ6%_uuk8ROvta{_eC^@KQNp%CwiYhiw86bCu)(#erkhz)XzE%d;kW*!OzLEHX$-3z z%__%abun4}O0hamv1%l%-^8*?q*RB;u)>%uEt6%k$|S2*idC{=HHECaV5L1Z@-or7 z96KdV9U7{8QJpDWo{3^B=+?^)E(Q6ANoc)O(0UN(S*#kpR?B6pGQbNv`b>@qVuwrc z1`TJTQx5;)eW;iGnGmwrQ(+>Ex(e$tJ~(IL9gpe>78nfBoAAl4q-lhxwsP9P2Z zP3%5-i|eX`(56Ap4n)mK7tfWBjDCzxwzMzMxEpZ}foD3_AfDdf(q3;+VqQ21OwiMp z|0Fv`egG4xcCL?gAVB#dl9UeGZr^f7Js>{^F;Z*cl9 znl6X}3_9R;EtY@qX@=0k^`aX?=n*JlMu=^Q{o(u2UMhrMmJF^e%m~3w6eyxEn)x>5 z*<_Lr7Iy|#v3?WK-i%RB_8d(Cd#*HbVAhKP(Cuwc?xe{5;MsBDIav30%9^!bfl2)k z64iTU_~6LD-AM4ZczW@(v^UkYVi8*>rSu6*06oe$&@Wj0N+8oTVb2l4l>AscMD7bt zezsAuiP{ofwgf$|z#m1Zpr_Y+s;8s= zX-ld0yu28zwC7FgmoIYW^{`&FQ6BzbQ@-guf<;P!GA=QQHiew!QjI8AO-$*uDvliP1JH~Mu@h9B1^gaVtr zlmq7_Hs^zF&fHBkIqy%-i!9FESTx%ql4)0yN%ye^7Nr6WY^so{DjJ8b`N*G@L0BM^ z`(6VWnsH}F*wo-9l+~>qmo-fU9UiX~>x>2K*)gm;oUA|pP_yn?vJbvw`80w8`OdZZ ze)pJD2 zhX}BNmo|`)F0={jRWv$hY**^)5zLMR)@+(O$Ps?iG+9px8Ng4)e}mKaFuQ@ysa*0` zSP5%>$J_jlaPaezU(n*`B7?Znn@)f*p=YhHKdOubDMvfF9!##eCf7(Q^fk-&%n*?| zFHqi-#E2#%mVZJV6fD#9zGyNOvx4{Q!8{91ph6Z6AfX%_(FpZxEGAf}!;ka`1|BY0Sy-&vH?28$5$_V(~!Y{DF9MW-YoCj}N^774Gn39}r8oYz@|nIx>T2{RppoS*yM znmrt(J-iBpXff(JhI;0FV9D$@3)+)w9f<~TG}GMep50Q+YO!M;v0BXO2VFl$Wi6;o z4U+j7r4uYBH>G_4Op3_kO z!B)oMnzpsY1)L!d;-9RGWW~fnr|{wFc9T9>2f63c7i0)hZiV(&uIsw0mZB-B2o&SK zW8T%sSohuAEBCx(5$@Y7U$zM?uMFlTLoIYpws>-kBFo>ieDm8{C~^bxB5iw~?VEYY zZ1`A@L>E}v2g<&n!Gk2u)z&;bpc`%`-#y@~-0(X1u9V{l2@xfPXn7!@ICCtW(kE? 
zC9u_OSs{<>4s2#9T4X&Cx<&N4oTpkZG(Ly zWk7`CozC#Cri6?KW}p4`XX4z)p_bbp5R*LE83z!nesI@>8@N&BpGU-1t*vU&eE99vXkJn z>RmlOS39?QV;f320w%Aa!)_cl>UMhO-T`$OeJOM}oH{ThL^df~q7#TMk;d4vZ@Z_* z=<(qn&}^?zwz+G8{h``ne8 zaykdn!=~YmLV)}?T3Q9I9Uxd?I+nQazzQAGPhO>}55A$>p`TSaoS2?$?^{7U@kf?? zagSxB+wZZAbSyQHcoUj>9Tv^uZ&-mC=|mm%W$bA@%pqbP_ByctSCpr`KHVH1>`_zo z0RM0lqLCTsJ6$cKP?2fS`A&m6=H-sSO|c;Ovz;4c9r;w;O%=_Xhx^ZDR{z}ZEx9K;$~Lg< z(N@{Von@U9d+lpovn^j+VlzC(!SEt7{Kso%Cn*QRcF%GUm`UVWTn9bNu@|^)EjyCd zHu7u0waRc4Y>+lHGIKjIIIA;_C4wm7oxG)?VtebAC|Gt4*19U-xz^?gzq{62Z$HVCRkeuXG#>rBx>#&I zIsDbj$_a+}PeC6~xYKG#XCF_mfZgZ5YTBJ+H`y!k14Z*3$^^wL8b{fZSgUZ+Z3d** zI8DlxNMMas?-VdMoo|Z|^}hcP)A6yLU>VFCyixpj<~-Gqqj5NvhUkw~kQb$6maCA>durh8hy2~dhmfCk9~C^gOmDAQaO zb6|J^=Fp>O>m8O2MmucqT{&#<&MT(v`Oq%I5R8ZQ`XTxhaXjTHTk09IPq0}! zr+F`v<*zK3&xmDMTuvy-+PhjGS#t`>1vtB?xmlWXz7?=uy4{l5IeYp&D;RH9(Bo55 z>Mk#=_yhIdpJ}mLVY70^zN^XVT^L-(MOowNMK75SxDl3TuJHoJ&ttj_3v=H`O*HlJ zjIt8=KhI$O{&Y2#?!jzAFlsD?K@`>S@7PGL<-bt2$Hx7-&IFU%k-cX@}EYDW(jbSTiU; zm-!OHoUF@Qw#XTXjS*TNnz+5e5uL&q$H`bXv}{gy47(N?rbDs->is^QrERGI+IFz5 z^8>e9Iy)!uL#Xp0OXsCBJ~)&_a7Lf>V&N6J6P)`k$#HL_K9u+&W=T^P?9$6Rr}8wO zSGD`z8q+xTaxKrRZfM%#T5*GJ5_Gye%tSbUP>i&z^(;~>W$x1FTBnwl%iH{1nb;C* z9pQxBV2FWZU{Wk((gcYySqe_WWC`OCH?h#zgCRuE(VWxPvY(tYS-DA!z!*_h{G5G& z0tQ7>XoTGpxpjk=c^vnSWc9kB!jF~x7B#nV*a$X2izsfmj>V&-7v2)HI1Xc}+KGFV zo0Y}7P-77nQ4X8TWaXB(szF)RFo5o36vk?+Ny<%*^I#^(IEa~JW)C`u1jHO~3|tk3 z^11uOcwV=iE!yv%gkz}_vjjFL@_J`P4>aaRZiBMTv;i-@b{>mZVK(6K1h&jJ()9PaAJdLCtE3>(DRk%$dF zarkTw+g4MXyk9`9?ZNq;DwfuW+b>y6yREAK0n(^#1xpM!f<8{fJbiF717*Szq<7m# zR!6V4Vvrjo<`m-z>`aBP`kOhtZc`yXEHca7_T%4L<&JQa`vc3JW0ec(an$`NSEO)a+b}eUl(yL|2GPVUDR2MId`S5j@HjHLxijKIaL7Xda3$~ z(&?iZJthDV%Tx_4ypc1et^>S*LJ3FcrE-+B*AN8%7GH)i`%)VUQte&mh}yF-Ack(T z4D6gk)L^$N9Qms`@E;!|j*goBrNrO`DW&^Zj*%m?0lmAcZ`}c}F1qXaa90E~cJVH= z<@${>sg1HjsHT%@Y|htXGdX;TyJyg4c!H^Rgg4!&v0QeJ+SzF zDEl-W5_WywYl*+`24&HHdzIUh`aW_8k@UoVD78Q{W6RDjPK*l>4hnK zrp^74-&nGbAqDHl5$~nsKFs3Isp&o&R=8`~A3fW$%=rUzKZ)G`)L}O4^@=+>uDDFU 
zTWD;seFMz^Zs58)A3;}KwLLEn93^Vt4unaL>mpv88`S$tocHY1d+Jy*?@VY}x-)gr zhXeJ`m<}j=!}<)_oVc@-d~eNCut(Rt%dUABYTkvKHM7W8EkCz<^fo}deKQtJqQSOF z?B)@axe1A2h?zb{d~-KPbPGSaUG|svM^nFu4Ycrl6FD~C^X)}>R+h{YQSUh`iK7SG~r$`MA)#^(&ULb5aBuG z{}W9#n+BWB8nS7z*!3}KL$*27e9fNknzh%>N7H^y32%?fev zpCLDkCtqg;tHbEgI#|7r{mJk=OGQ~HaC(&L9|AFCb~iY7t3!{yj+5v?Hf_9fA$y7% zlsg_dCKfrC2#(1p&tGO1Vfi{qX8t`6l$@)XbIGfATk}*c3wY9gm|Mx1Tg0VQhRy(y zm)E1`ni-BIP7BvBuOMy=NEUB~SD5pGH3NUgW zl63ApvBz{5yRnUkeV#rUh&Ver1kKnZ6AaFVxi;!kGiZ$U<{fI#hxhR+gUIYqW|jzt zdsjl9Egaz$FQs9B1!7xYTOoTOJLTo&e>RJARM}so`d+g* zi*#@LU(Mp=cD-=>|G_M5Thm+iz{o$qTbxCnl_LL(w>Y`|Lb(0^U=}skl+hym{!>55 zEY2d2N|FD?EKY9a!tK8`iz8mE!{Gla=5X|lA9Io&UbJo_&bS{0fF*Rq;h3pP>xknl z(24>vrjl_t_8hrqAMwV?YchCY@7@t_8veU@gBw6qG;=n@n|~R2Jk2Y@+SN}JAQaoE~msa#oKyl$eX9=hy><%z*5yCT=o`w>LMN zm*7!pUT<$+W?x=qUV7V`SMnMk#ix0d%1O4p`RU0;^c*s=Wt-e*9oE#UK3k z;4n!mXG+X$JmqZW)aHMAn1`ct|MGa4XU8-=HnjIcL7u>7{X|eNc6am*Nh_DTVJ_^K z#NPH^IsgXR?xC{ z1y>JTfm#1sHscpsEXkesb9^-&jE~)$@q~Rdb^&cxG8!i%u?`UEUFq4~yma?W|7rNG z_fNv_V*d&Fy}&=t^)yvaIZewG%Do#Rz(3d1d&6(9(?*P(h%U`4V*5`MsibDzmxOgX zS-V>AG;uJ!8^?P3-l>1Uvv6*Z zOeTw6@Mp!MS=aAcSG;@WDlB$S9jF+?B<|W<*Ua#Q-9x?W z7tmT#uU~&oKAf>LVlZ@J8DX9blqv8khA?%xQYb`qc_t}CS@>A^!-Qp4BjteJ)czQ1 ze~sxBJ(=XQT?C4`Cs#e`2i5f1g;KSbB96oAN~=m;n!$X^V*iO`q_eK4J0ZXyo0R~e z+#c+8qoi`k8GqKT+f$4p-FgMo2nSJG2UcmF94J~NV5zOd<%T6BXZ08GlA;B^GkXr7 zInL$|FuTqxz;~IV!mVeip3uc2ZRl(~;~fK-EWrfom1f|P3pURV3jR0nY&1+CC z_HAEO$oBAcV5438@hdIE9}Ozg@EI^dc)!O@*FH2+vBIjf$x*|tuUmW;+I*bbw;y8% zJ^ZX`#K=D!x?F=GqJ_w@3y*UM@j44%YZYFt3#+k{7VYu|=V*D_!)JVkmY)2KzF*27 zYrB-$7dK;7W;*e7&B`?MVpr=Es93R-1+h#PX2k$naFeWkrs#&X!`7t1It4{xedn7h zE4u`clT2skVL~xU;AqR4DmwGDJ?EEJBV;@DA1wMmW;ViVEujvH;-HZh4Ky{G*;r^+ zaDp9NUR-Kb;B1{PY`=$(Gb`v}Skl%~-K!@X!u`fBd#FRWtt|V=qgpsTcbjcLcWW&O zO{p%B`|b;DP6s(SttO{?Oin#Ad|I0W|3qZ7qLal}xhagu_i1UgzR5*bxzZDDUK*Y0 zACBL8|4{rc_7B4E1+kkzYOZ8H^N*f`n_rK|0Jfw{&qWKU#MO6SUa1eE(RG z3MzB1P-e%wc)ai~GpKz1CwQtl){OAA@zP)pPj2G#_>-S;wd$Kt84Z@fV7xI_hhKP9 zOivt++n&&waQ>9?-Y=}ScJ^pLBnY4Wh+VUK0|=uSmv#gB6|(p(ld9K|P^ 
zf;)N|X1Gl#BMQaz(!5HZ8(nI8=5G;yR0KusMU_f_MMDjauthlBAwoJuSi08M;Nh66 zlo=foP_gmW1N7{JP1ddJ-Kevh&j8C#2iv$Chp^p-pAP`ixTa7|c)C4N2UfTSULN*h z_`-)FgCc7ZK5~x&Rnb`-?vwpkI1k?t;c7h?p&lUIW3bob>2SGTxEuyh4!95zb8yCl zQI}iI=O~ud)+DLZm zu=Ll92W;p|S-RL!`YF@I!Mr-$yV1*FOdY&1c*>tO6CNjpa)0Ku-%lSDuce8aPy#of zaMHu}VAcd9CEG2H94*BMTxdH4BWx$9%(g{zI?(A9@liC7d3VR5Qq~S7TyQr*?0ru2 z9KGe^MFW~5mGzusx_WLOfrI7L8}Bzc zeX7gTuRXZO!^;g7IVU-pY_)KzhrH0J>=Vk}=ww!FF{22|G_t0*4I{6K!4n3$=@APZ zCd=Drkxp-reF~FZ>A;x(E<-J<8oQh622fir*ppael0dDq_#kzFRd?(D*{?ss*$6+IhDOML0H%xLqxW=B!B*H!vE&>e3v+Ntt5*@)z4Vb5eLm%C%{`R+ zI2c-Ol6{}$$&o##_&Qp#qZZjhImgudDbzz~0y9Lk>9V#;EIr&V?NsR&macJ><~mKa zhaQJQqolRyA7g09$DNk_!kFBX(W;vHKkAiMV19yw`O#!PQ89-o(Z~$VoW|ju~zL^ zO78FN)#30IwUfj;c6+QaiwAq?%IiTK=Nl#?7d8%1qbtjjy?(7(2Td^ca%F4Ijm{*_sZhAZ5-n*z;EFTHrj+J(*>w zGQ9g9yQ)cMRc2JwDT=5W zC<`D8%FfAaccF3_kpWWV9Fp}2EuB%aV_{?)4aZX>HE(sdtUj}m6F-z`56^zh8n|z( zRR)IYW1xoFDId;<>R;Pd?#Yv~_Q1{eeaOJwgY9%!!OvS(Yqm<_T3ZdMO@*JM!`Mo)p6k&Aq<8xM)0+yc#ig%b2x~U7?yk+T*gvNtS zXu3evu!i@GaPh&m#S20jE|RO`IFTYgY%PTUgq1ELrYrD}@~UnaPZ6bjx=@K7%B^*Z zeEpp$p(meun?UMVv{35codc#bI-LkKTN-483^POyde>|A++7Ogp5(0Jc)JSOjxqx= z);=P=o+?WFk3_r2*ov+Rzi$t(glA|YNPfP`gPFqrQsvnJ!swfZW)#f5@FmDdUv=7UR5DnM&d)f`_%k|!nr%gwti=>5PpXTx0;jrEh;PLJt zIB&lvf)boO=&i7JBOeOwLxs2{F07JWOtyPI1VrvCFyGRG)@aWgK_oQ8Wu|>O)V@r! 
zFO#iHUa%B=y*>QK8f72Zy9y7b3-O(AVQ<-UFa)~PbFOv6y{@PK?oGh|PS+EL>xs58 z`~zWFSdQSEQ<|54Xf!7C-ZF@p(1T}bKQ!>7u|bU^+5D>!ysD)|)c^}|g=gUDn2T~1rRL=O+;-a7^M z54A1f`{}ShcP+Yy3~t(gy=IXzm}N)F`uxAJpEqQfSm-~ZSJwN_iOTloXp zs?Mz9Ki6t#f2~G4wYuPzcv^i6eg9jnc3~R!1GKu?tl~e{Dzm>w)@HSPH7@(saV9M}cbhMPXY9e$nN`-MFF#^aG*y1?U-j{Uc#PB_^_5k%~3^ZbDb*f?fa)i!> zR^5PUA2Wd>6V}pq=#@MRfH_BmiZEhYNCxFx)Z1_P4amB^o7hxJ($bc~>o<8VCxWs9 z<;r#q)^b*>4PmQ=Qhp;aM0`e1P5?OxP_yj<)vn0m{@~w=oHC<%2b{GG%XaaY^E20e ztxeFe-|*hOD zIM@YH2MV(iG*rNIYjFDN0ZWg6wo4Db@ z2Oc<<$d|scz*GX2pz7xWd?98W4uMz4Zm-&Y0{T_pg|XW=EU2ic2=vk%iGf~i4r=dt zJ~(~lhHa1i{@0caLmV;$JRLz#M^#(zr_pEv9>)i5VCBk{7Hvul?N+{0w{!Em$z#O~ zp!ci{c-nCA1T(@RIU8&b&$vmPcRjoj-=%!Ody2d!0S|beaHHjZc%}E}@SGj}91o9N zdUjx1I}W_m8bhYN08iiXIeZ5WC&fN@UF6w8cX{F@x(<9!m%)13gWaIEC)tTN@JH4o zJeM-U^#|hK9|%UF)VmuX)NMC7gv#8+!~F1t5EKIF%}Kse`f9l41{ZMXj7hmnrlukZ zUx(7Cs`#dIg}nrG48lpMfaK#!L3}tzChl zarTW;nV1VZr2878Y&e>nLb*Oz1kzO@34i(c!f~JUb%q!1YUU^$Y9Yfv? z{xR&b0biQxloOduJq<~C`}H(cEA_^vgIJ5(>t~m^VJOrQ`$o#)fO8FnEFpAjQNCf=pM2Ji+$v^lqv z1KutBW2b3lU)D37hpfo$6G&!I-fKCFGW(Sle2Q&{*FQ7C#pfow4 zMS#u$ly*mU-lqGg-{}<5sUHL~^`p^5ziVKf4S7nxS0{>oM^V4|zh(tCc~DvsRkFcs z{pKk-Dpkd%esv1kq@Y^A;%1C|D{ak+(Y>dywsk6AgZzEAHpOd^Ur%+QT_r}&Hsm{y zuS32G`A+1!a5GFZrAIJiA~&f_#{jBskqN9^t)RVttdUDmDOiN_BU8CdwOw}}cDu2! 
zd#(e)AbJ0Fwjr!wVHQmz#nmws5?agmOQ`v!-z89wgxg;fm>)>QC}8%vTq?VNjSh zc7xK5up2%f?xB?YIbxG42H{>-dwi*=mln_{xJ7&T;rm44flz1tN|RdlwK*PZ<3v|% zD_b)&@O4JkCE@ztzf~;yoVpz{ux`^xzj&3YTb{&S%($t*J-AN}{-K{r z;uHA7@#&eY5qQ17sqy_0MNhH@m~T_LfAB`U*O-9=V2j7AA_vvf1* zpGUJvGeng4E0HM8G5>T*%S|~ll{coET4j;$xodTe&cRP>d?==Gcv}O4n-mr+tlDfz zDEVVSV_RiFtvQ5VppAs!Yd&?O-<~NqbizUL1Qs3iar%QMfG$7#ullxHI>ujDaJBUU^3jeUrc!@bIWb6e{m$-Grd z?_bKPgZL?(a)Z39Ae4JMPEKLL-y;bht0eByoed{U;vd5fCjHq;!x}|@7d8AQAR3ZB z(;LDfQn}%xjR31tn_k{n$Y>gQl*PH1kFgsAB9C8GX!5VKI;sWx@F@plG{=y3@us7l z<))}|JaVJ>>VmG8l{O5Z1CH(oIw?((y9Zby4v(Z?f z@f~YC3W9E@Gt!iwNUnmQ=d)JX1+ zloB}kkOexWN8FS1xe}T_z@r4rB8svkRUuMZn17XZLyKy$jfgL3VHZRUdsou1nw$K= zAqQu^EJdyBe+@F3_+pmAMU48xo>;_mUF}%J(D|TABg}&1OKT)$M8X%%_86K?2?-Jv ziDRG;-HgxK*1$Lr*G$;U;yQOgt~PXUu7Ra{M4Nukf5^6o=npN#ssXVoj?FOpL50|! z!mjs}bU8M&vX{tR>d**(A83WCAotBlJJ^2)iVE`9y^wCdiZp7a1tQ!u_gZ*_u2iZ# z)e~Oa!Ygb0#JwNA@u|L#S=HmclH3@UB*dv}VW5>$*1~{nmte?qyT*E&U1O)Oa*Zus z;~I;5%g<{G=O(>1oS+ckD+uOnWxdQlXA!?nFN^Al@@l%+4*TZfuJECyI(uvLj6 z##$@uf>t+Zu<~JI_&GBbKq&WXx*cqkVwyoAc&aCr^}wwaJ8V~L+QD>Bg$7Fzi6gks z15VC7P191ehbOm^o~s~{OL&v0y)_%ZS?Ejct>bWo84J3A2X54wgRD2yTEu`5Xq_av z5UH^Q5vzt;^N@R*uT7))_b$g7hZv&eDKW>A1JIeWA#nIyXr_YO2jp)GjB{NraCh21|sFabPB`K+jX9A~OM_$e7Ss z(Qc%x8_vAJ$LTVN2`8DmEnC6S@>Y4JSKz5M;rh6&6$3Y_73BY=w{l?#o1=M0=_}|i zwFPA}K=xxoniLpS6L=if0GU|4d;=vY-GTf z{+IJ_vcy=z1(Kh$WNb?1eu$eA=(oQD3p64R(fEEB5ccA-m#7A~v;72$ zd0^h+{ua%kp+Nc&JzPZl<#Am#{mD#U%2dl(qXV^x|LSo*vplJVWBnE7tksy@&4=3Fn@MBIw z_yYDrH9M?qamZ&asU8M>cnMY3M(8i_R1ug{4?Lc9C57iKBX)TI8gR?NIf144+pdk> zyahXe%?j%@d^f6`wGRt!A8uWP_B4a zLE!6rxt;Drr>b9~q?kU!9*PMPnqS-rVjru_D;tv0#^Ufdztpun0z^Ro<3LLET-A*D z3vI#1U3{7le|-{7^ix$^Ax)h74d&if3%Al1En73C(w-4AOUBF9U>)v^X5zy3IX6ra z=m>mT5ttsX2o&$EfP3}xLDv%=pgf-;C3-i~Zlqs&L(@NlvE=1p%3`;oLHe)O`tM{{ zLc?dxzXyxIr}^E6<**9=V*P3+_>+}^PaO3xAH5P_{F%&mP|W+}@$-ER{u2Mf4RwHS z>W~s?75Mv!3Ll8T2Kb}s8;FsWIZFP4UJ}Y3Cl-1;Jcq`?I_@}n&0m%ZE>q*CXq=0< 
zJAmscP=ALvICb+mXob+okFe$v#^GMj@@>Nf-r&@`6>=3O*66Z<*A>MJn&KDoJ2pMM@)A0x8sWk4O49`SmEiXpHm!pjR_dkoD!hF{_|1VsodnbQh5P03y@^4;; zk|z}EKRD}s2{Cf$Rp*3~6S3Dn7+(YX{0V?PiZl)R1fB^ELB>#L0E1_QE=k<;dA=)i z2r(JB+Aj&{!2%)!*M!ZBhO)FPyF;+JNtAyNwxDHm!slnWGDlqFn(*A>PtOTWyJ}C* zFk9Y&KnH!cC7g^u&>97&ykxLi94?sr?Be}VaKGehKthhgdmh8z!Szo5T2&56di{RL z7KG})b6uSojYg%u(1rM0=6}{Y4WF)$1^=YKk#iq;LF0X{FA9^-p$`8g^eVx25m)tpui>mG74ML zFv@BhtIORrwSISFO?7SAW$t=kX}P<$#$8$7SW;E9H-2$_mEUI))HeEsi(!;El+G!s zD80<*zPPr|UsYSvFgM*jzRX=#Tkmtb-Sewz%H4JKwPn7B2DiV~eP&UyLS0-_>n`_I zlr~oTS)knA-~(m3Z+_zf_d*btF7Qc#h9wQvwF^K|<*uqJxy-l3eX%a)p5-}zR>>KK z=bd?O$+?~j$)(l}!D5MKdd{0U%`?-cJAs6ikg2K$y0e~is&YY9b+y~qR95eUmZi9k z6~$qXvf7#&UzwlUl$Vy5y5~1mRQT#4omj-St5|hlB2e0N4t|inuF(&x3ClSAG?AvU z`7A80zsy~UR)jF~rSh_BUunIsydR$hKEJ!h*W{0-y;!lGE5tGK?aj9Ww(^yu!9`qE zq}*3p?RPJ(@>eRWH29%|dptaBF-+qwEI5CbXNJ4MU+VWs6$?wNYEE%imM-!EO{GiG z{HAImeM8mdaDg$^)s6Batz;O-SJp1{9p5;=vBuwce0izA^!SE4Us*%Rn2HI=8HVCp zQCdX_VSVMc(x4z#jpi>^_OTQYCth4%TC19aC<= zuPdrc7oe~Fys@et(v{XPXk6&4@f%~B94%T`THoMv(^o6%YZto5G^87F-#8`Vv~e)v zlG-Fx(qCWdhUt9`M%gJ-^eASkwW4YnS?~Y8J3Lw6WRQ z!OaQ<5(6B*VTcV_rglC|un6UAYYg3_GmFjkjA;}9!n)%ZURo}#VRNpn^VPU305`@| z7;wJ&(lW|2VZsD6=PN(LC9V*el&fOvR}XVLCio`G(4cC8g!%^)>|{kX;D+$z_SHm|Q%j zajsNq%P&stg9U7J*g9%E(IgH!W*Gx5Q;^Km*Q669?20`5H@iSV(V1rypH*_f%z{}S zv9FVFeEON5{An<_w)gk*KmWXG`DdQ%uBfe-A(?@vy751#&+|W)r@R<1&pdC&jJ!2IZ@S3i?szHx~dwa7){sHLVH|73paK-S;r|0AomyW#!R3;vi`1;x< z169_a0#04#uimR5>UBecfl#cfAC4nhiHt?3$fQdv>nj)KAi1v9NvC|O+^2o8p999~ zt0r+~4KmEKM~Sn}boBRFx_(|ePz_oxUk%1gUx}}_T**&W&?zjxMEeOrgGs=gj6RhZ zRedrHvlEGA4tX?RtC*OmUXe{51M9@rFLW{}mB9W}(KFnqxfk=ggu@mA+MwTCZi*V< zOBPnuIEw9~z9oL2t`OBVT&@cZP*4AMQ$70PG39e1TU>f8*0|3qE-chqSo8>3qM_Ns z=9IdsLEK51&0*L6^m|onhChwyPv2jMGYdWWPCw$Lutd!RPqF6rdkJIvbqsw!=JX2{ zwQTzgKtHg7wEWU06QSRehNArSkq&1;GCFA%Q)ki#`n{kjC4&r_tQ#Ek9v-QM@Z92|j?Wie# zd;a;Us)qQ#*h7*}e2V`bL&pDoX}cu!uBz=nZC0~abLbFaP1kRlXYOf;-%6iTbk4A% 
zEo*}Y;LByBJx8lAUtI5ZPjw$NrtBEABP+g&PIpVS2$F~=L3#AgRgTA*-tOUe`c(_jlsR`~h~;bTJMQV~$vU7F}F(#L`0;bIPeEWTt%duM*j}KoI0)1e90;W9rx- z3*3Fvl{MCbttQl6QRzJpU-$Ua+^R=lqJMGyOwKM%e;A0K62oWQ{`yAWI1F?C#`+o# zI5myf?X2_5u0Y)K_Ot|%PHi8m?HQ3^>X_uaT&LqwQ0C-hfw7k z>#!;0(+*)vYIY6F`c^aRdgGSc%QL>Gp1nLHKKz7Oz6L+QKy*Hg0(nF0J^ZTl+Ptg`&*CRv1VE%HfwsDNq zSYMC*H!QZRoctSnK1>^Z3+wz#WWo;HSLvQC!kO|aI(qYI?uDgI?)jx)SGAVQ1Ip68YDkYpfQieS-8h+S0W zTRbqOMW?#bfP@b9-5i??IFsR>HnZgXf{Q#(9{kyax0;=nB%ui3~!eo{p=x`z15Xf%v8jC2purmAT4B+%W6bT!gH zUy8e?kS?u>Mt2~63h5rCe`ttCN8#|#=Z{9GAkAD9jb4KE_f6606-XB^iAHZldJxVU zHzGYA=?BfSgh4x}$4-GlUaJPbbyMqh$-3exM4 zUV=2`W|T+D2iR{#dMVP4Nbf|t1L+e;_aOZn(os0{+=X-s(w(=UJkq3FQ6A~Pk=}~* z;M-6h>G0c89_d7+dytO31LbkxnvZk}(s@WPK{{&<$|JoJ>8(h+k#0o#3DRz)SKNs) z83(QZLh43(&hMkqJfznkorg5zu4uFwX)V$H`1niP#%Z8e?sa;y5L@v zN1FWylt=o-{dnjC>GTJp(YujWBi)MhKBPO5j{0LXnubH(XCID6vyskN3%^8q;iJ)L zCDMnGu0$GrEE;_XY0|oA^m(Mmu8&4{A?-mr6bH-)Ziq(5A-xo7A<_?!RwLc-FL)Lh zX))5ZNPma46RB@YG}?ghyzq+9-u@<_8@MtK}=*C5S7 zdIi!Vq}i{aJknQRMR}xSyHFnKUy*hq&3hf?k>2?R%Hv@D&u^kU(n&i|9_gHaqTP_* z_ZG?_&3Zc;?L*3sPK<=hXBr7jnF)ssO-);skV-ZN&*XdOMWe@&P=5xdH;mE1lYUJx z>f`}_M&?-=S?9QhEKX}SPCxk6V^187`|?zM(~x$|L0c1`KZIR^@2b*h^kCjV;2D~3 zoSQi)aS?u9iHn*tGZGioWe!VRRGB$6anU@FD{0Z3LHU_UiHnL77p8-FIwa2@f@U=C z#<#L88od@NKPTbK8!dMNb`1gf8JQ~*^E0wqlFrF+m!@ZAP0Ps4&q$jw- z>cw|Tc{IxV3-~F-HyJl6WI_I?5o?SyGBU4AJR>9PS4n4QxI@WjWMp5PGA$!##i0C* zNiC^!Gu9?1%}z+qn3SK9lb?}&Mur=-D04;zghg@MjIWgc0&P{~h9V9!M;jUkF<8>w$j~_>Ynu_@wufNkJL9 zfNxo-|CXj%N8*zf|32W4sEJ1T)<=tf(!C-h%V(j@9;%H-uV>qt<*CjUu*A66fY z{sU+WpV(>fp9B2M!2do5f18DG0=~E*8s$ABR{5U_TImn`xxmkK;uGl;%0BCXU%C%` z7w~@qett~(xmNi;;6DZar_MG=DwGBg{Q*C6F&=Aiman$un*{ucz&{aVzeg?m%>n+o zCg>M~-_OD~0pGSH8vPyQu=G#5!Ll!3j5PU*X!KH~Hr{8+w;u48E91-81^f-bm&Vk8 ziB*3e@c(FzM$d@xpQ)DrWC8va#^Ga~`uiOE1E0_uA3q29A;2$nmQQT5?B4`@CGZ!= z;AdO-)xh5c{M9k|t1SF_;L}$`qX)&>qErhJQ4z6tnW0$&<~zr@0?27VLpf zf>!+>_`d=_&xudG*ec%z{1D84&WXW$EPNmExxmNke_5Ef9S8iPnDRfj%1`3_4)|$K z{FSzRbAZ1G_#>QnrJl&&1pK|g|HO%}x67{v{wd(&$+sT($M%8m0-i4}kbB6?_;r!Z 
zzYqBPfbVnGpZK|@UlzuK=2h{_PXhi*;Nz8_1N;S;cgCx~3HaH-$1A@Y_zBm?FTWo6 z@xaF`-vxZiZ`7kkR{S{Mp+E57?t_08;_M#acgNW06U#o6fbRrcs2n)3iEMH^X&L`jw(+OV=(9Rsh|lFVAhei(vpF5eLwI&kHM=T&GK9WYy*C7 z41SJPo@;|dygVn*iFb^#qcHbB68Pm#e4=u4@}C0y3g8FQk@!o1ZwLOunDVo%`mX@q zz#8OeC%#nKK=cQG5b)O|fWXv0snv>68-cIG+9h5e+5!C4z^`}mPkhX(e-H47uZ~7H z$KW>#-VNBObc|8J|IHa=IZt^;!mPAW9%!m=iALwd@KBCM9p(XlC-Cw5K{N1cfS=+l zzgXG84cHpsHv;ePFT=D&xUFdC4&Yw{ewg0e6phY~DX$g{Y`^D$H`?OkcL9Gg z@FcV9FOce`{f1&N8V&qIPJH4YEcwO(zZUq1W8}NvlCKc>)1K)U2a;x3eXtt%pKp!7 zCb|~*&w+1@slU#u{~_SL+v3mZp9emGy^Q^1VnBvf|1RLKXpcsJ8N+{x@E->l_gcDv zx7RGPramuYZ9?J~SV^PFu&S12u0i@KpzrL!8p5g%TLcJuBd|VJlt1IH0Y+K)HNM=l z>3udD^$}pNvpv>2+sk#fm+S258F}>st+8bd|7Y;YdM+AeGyM4Z(E>kO;71GmXn`Ls z@S_EOw7`!R_|XDCTHr?u{Aht6E%1M7fi^zOj!&~rx#Z=ie_E!%@_almn{giiKiGPa z&!2y*QaWq!3@1mieIKAoC?TWI`r zYCN_;<#V&nV>?hjSvuwO_s$QG&G^ZB%Ppz7*6!xs6+Y&D12{&I&w)DSeI)#t@Mi3} z@ng#E)&|4YvwY0@wOK8?e@B&mAeW-w>3pviJgW1iT)jGfsm6b;^IBY8+y7mXrQ4-H z$91~Bx9jv(oqnLxFLgQy_jmAffKHFr>B%}hOQ*ASTA|ZLI$fdDn{;}wPS@#lyG~!# z=?6OfQm2D-10JB$qjh?+PS4WmY@Jr-1HfexOrj_5b`gt=p{&DF<#^r?O|C zdFCnZ>|%L8i~A=Nawp`BKT*A{W#TU2mwjA zg3F*|asWwAPQj~)jFc)|Cw+mqHE1jkuoBZ!CLm}f{S!hA8h09z&mm|Hy0)2@w?U9W zA-OC=J%fJ7(u2}cA48Iswhh0j+?7WCJ*Yp0fF$e^8mV36l$Q29ep7Z3z@NdS-GgsB zu?fbIagZ+Uj$OzM`2(a)TSN9kZU?KhJDYhqfduyo!Tlt7SP0gT;7=qN`Yxea2?Qtk zAmQN4@5AM=Q7}Yi`pnCbWF+7xeHJf`jF<3}Ud-z>NLP$B=}LlT^K#g;;G3Pki1+AZ z6vH0rP2_8290A7ZOK7d+jB|iYznt*2j3uB?zk=|g8K>Yk{RYA_GkB9r`pp;MdPK(0 zA)vZGAmd%~xuqE#ve3X!_5N|plXwI?72t|cSk!mEpin@l5q8|(^0He%df4T|SjH3v6 zgoH*$HsHe^y@gDSl%FFJCJ*95su>1@5t>TKemr+fS%<5%H2$Wx5%@85P5G+;`J4I= z0=Gktlx_j?H}xX|tI)0~UkH%DsoYZ@_6GnnGyZipk_&rrWWF7*mw|*%Gh%k%Pt0a#&?R@y97jq+3+@Wqzofr zGr0}F0m8#q=8@g-!$_4Vm@FgZ0_4)t_?vnke)dbFN{VEjy?_*kdlmPjgdbC`!lWC$;m&1!zHFA*Fl4!5&XC!(jSN2h`PHnzqknKj1TaW`K9Vk z&x2#;zZnbCGTLDE%ssr$%s3Ku$^4erSs8B=@*S_;8FSH0nR5D-J$wrJ7zsq=48N3_ z1j$Sq{si4RQ8IbMr?B%ROU65VCNU|JDH?t+84r@ooZ&QbW?I5+;4^RdLY7HSxC@!e z;hC&+sOV8Q{9p=}DH;Fpo0%D&a6R}m4gW369FV|%)jWJYbvZD>gUnUK=d;qRg!{p* 
zW%z8?Gb-UpWL6Hpoeah#(34jUe}VOkPhdPqdNMI~mKYS<*o_V6k);4?wS?4Bf0>M@ziTAkem-+_k-v1h!{N`cj0+N-vhWcjw z9mO;6g!n?2cKZKAW|0304q#E)ILJuc2Eo~ltQ;YF4a%dD3BaTd5C~!K?&;*!a#AtyG z$wA#CM-x2cUPwLi1Ywl}p-28iF7G6hQ+b)hmt7u^RmjWRA^iad{fw7i!RQAZ+{nwf z!Qp^IuIA;rxXd_=p=H?Z$R}qU&X|+&CS=Sw;x$}?<_*-Hv3MOauFQ;os=EHaZ(ax!62?FfCSl1CPzaikgs=%{K-92^h(Uu3<0P3Z zl1$Rf1R|B7fOV;@@N=zDvD&({T5+q^YNcxH()y`g)z&4pwYC1bU|nikm;d+Nd(L}r z-psWA{=eV*B=gQa_uO;OJ@?#m@7vBT4!r~APB@$<#c-$@t(g#}nJN-`0VGqfEq>_(xD_22_R{rTk59m z4ag5KuY|V7cZN^=2sxo+sb!5ss6tl|yprI&5X~RqRRjk^2@-TQkr#(3(rWl*g2SQt zB+Mx`^brXyC;GKCa%P6kCiv6>;A=wDsLFL@2Ma=|SA|bI4ETmnJ<+Tufg3{$Nuy^N zjV>asZy>libU(pokqqaer?o6WYECd{7M0X|`wSvFhr$z$zrA$Kx526m-)ySVlEVS2 zP&a-{52yM&L!Sj{>3HJZ7up4_mWBx)46Q^vO3S_vnhQhE!HP;J5qweT2`W2<;2oix ziKd+BFAd#Fd?Ezz4&6xf71ZwQL$u*~=``x;n?m#^N9lBezpCsX6Qd&|_#Ht4PCS;< zGoQro?I2LNQq~8-|JMgG{=eWE3{FA!1%o7iFi5%`{d@e>l#M+E4U{1wswz8t8$cC0 zju7FU_zi}315q~dGQh>52~c_26zYm_s0AifHkDKu2_>Pbvgy>BGefk5D4Rj>(aLVY zsjTu!6fTUzTqagg*}{LJt0z_y7%0x$1P-Hb0<++x`X%5K8Vkgvd1Nap6b53FcFlQ^ znNlN(M$Zw=0Q4|vP$L=v7d?p*lc!jOFQP7=T(03@klHtTIoJlLOrm=yp)D|lDU%OE zA{BZH@=lpTaGtV11m%?S&w?fxB+5w>Kw?#rFa`GN;Rwz^?gT4qRv*9u`^kM8rzX?D zx3+7seOH;*zOTSrO}Pd(#{?)cEV^1x8}0;>8-s`?>7QxQN^S-UB@7;P(`My&Qw@=HiYsNbx=QNP)H z9vUjY3npN_M}4siDyX0(u4U^L({q&GN!jBqO%53p9L8os*dfM$IcB#qVb8x6) zITUD>>GHE6WW__2o~ZM`g!Hr!@i%FE&(lK{Z&Ue6);A!drvDWLu_jxjFxvjRf=tJX zSTsGH3et)R&<3YdNKg9(HEybP8|dvl&kt41qI88W|2*=iQ*=;knnm4f@40{I2n0M; z)^xpsIu7$p#a+~%nL7P>N}ombRqFKlsI%hlRG&%b!)lLgqIm}g-(gX2;B}uPx|Fq2 z7kL0A>d5EHG>NJ;Cv|$Up*dC4{24Se%BSKjIwl*Y4q0c-JO;R8sC(vY0E+=szQ+`= z0x`>4AGiR&O;+tJ>(l}}%65l3jyh=JJ@YZ9lDMuUisDCqF{0FMd=HdnDy!Xk6r#-h z8t^-)q8k8wlPU^utNv&zvgWznTiOmeoe3#ik(xw7? 
zi^HLcI0zA`iVmyO(i}b}4z<%*JFi_T0KNohJh271>?o}T<(n?@E`T*+RwAXOMQkBFDlApj) zZcfVj)(qT()FS>sc>axe&R{|jagHlunalD)sG*T5StH7N_DZmOgV1$OcEMlh)@5PT8{t%}`DOZ$OXPrNE(F9S2J*#v;fdwpWh7e)p4i_99D z_+13|!kH4|LLb%*6HCD9J!JhCaeA5ar)kletQx43x>>97&zu>S!oNyTVKUeUM4s1) z%Lt|rHF^y8VONdRGe9*RBT3T4GsC_HQa)?5Fze!&#`h{8VoZXQ8^qPxV1Q zyoJBU@A#=cK02MM?p{duYo@|5(}y+68U)$Q7m+st>+_k<0hmR=;{Z+uQ2F0nq}nJL z-vTOCN1FK1U*l+NzpqLnD`1bo)XFNc-k~ZA|XG?gi{bVAq}ZrTeiX37S# zv9NU-?h|W%2ZbNKHCow{n_naIn`N!b+cUf^f0ospx981mm?1R(OSt52;(wHNi|{Xo z6>52R1E{(idUkmVuh~F6iCZR(rhbAd~EHR@jZ_lMe z5X?B}K(Gz67T6a$iqHmLGSL)ql>3)ua57pKOUadzZhMILQA{YhHGYDvbtz|pSRdBU z_X?Li!i7A2<=H+i9*20xUQ-rQ2DSmz;>DQW@i{+xR~Wx8j7P&+R$ky^>~UjPW@9|W z^v)>UUxA^n+g(e}_lAg4=KwG+>0+iXrp3ZH+l2BWp}dPkxR!}BG(^r*GM$Fp{UdPE z&NCy+IM1Jer*WS5G2{Q>Jhj4pz_eH@=)-#SpCau?B5fsHsu_VlW@SOK@Go zoVPfO8fD6S!WhS5>a87swdzhLXMIBi9|QoXoWSNN3i0?v+nIv3C)ml9j&_z?zZ9Nd z7M{-#&%HjL9>2dXo##z{p3AIR7)4snKMT)NxXa3ia`RlC&ht^GbY|5yi$=QUxdg0q z+q;P8%ei^BGEW%#h&?ZEyE=~*yO>)K1J{R@Z+%Oce?gdk8-VGDeN67?Y=JeZjAeO6 z$P3^oEAM9#=?Bl)sB+p@zn1G=4d+o0Mp)k!*uQYxT$8m36JsEFp_(I0pEHQ^W-iu> zM7JqsmWA;1dSP<$UT#ml4NIgW0n*QFQY^+MNcNl}wy3Sdb|NP(M&baoO#(lS@w?s%@S7CcX*gi_TYyYe=C^I_29qE_7wR* zn=>9k#+;vW;weh}lM_Fs1lb`SuTVm!B1+T`aL$`Z++k|W3qH{>UDeBsLOD-J&Z7&F z^D&os7dfvRY2ZMqv$lg{4U|F0C&>K==UxE(8pe;BhD%=0V{SI2PU_`9=FdRAz6X_r@bbj z>Q^)EJkTBiZe$9yk_ORIrvx-{{SY+K#Ka`#5{FpTCjrwzIyj5*t`08e++`>=U~=>J z>?VHKa1J%&Dw7j1%@~3~e3x_Apd7CH=}J-G=cZ{y-(F}{b5~;NZuML9Ucg0*6}ym} zGmgv6Nm=`-&*o#Z7Ar{ajr^RdSUCm%pkPZkHc9NVU{x>rwO57J62j2j{ddjg>H zMJCMRLEgNMD>W<+F(vmuF7a*w%LTMn8)ZMuwEGtlXS*^8o1`NPTOmca?qQZ^Kp)=$ zGwLd`u`e^e1^AzN@Lywm5AgrZc({&Z9`w-XK|kPQzhgD2MZfoC2vW`42EAJ=4@II!p$YN@m5_h1(mYbD(zxDWc7ve_nDx<~LABp07VH%C^}4UWb2mViqlT+Nxxs{E(a zc_$Ou3^-9If4$^?9ji%e4dt)$=AXd%Q`?obmU4TMOUJxFQ-Rs?jzL+#psjthiZ(LN zCK;~+{Sr@|;}M{iJPk$BQ2`auaUA5;chQc$5UHtsz1_;vchV+`u{6+K{lH}V-iT65 zcR>8WWa74TDSiTzU&ik;b^N=~+KBS|kg)0UZt0~M3M0xNKzixD8axZaE~PIQN0k2r zShf6R2$0vMZ$sF0!GA&lSOY-})$y<4GT&~edThFs)s@-pe?S_Qy-Jx;ok`2^rBuR> 
zmKF$Il<7Lq<94~G>jmA3(P$FsgP=D2$_?E~G=_`W>H+G*+DErj^inV*E46@b!i;h* zm-p)x;i4*L@gG)%jZ4r_`nT6NPQZ!&1L4NIEkwz{zkSAvyc+|f)XF?)?4Pimaz)U} zAHnh;$A7sZsPV7R0sZG4SV_8n3;*Scpp{pF`ZD~ND}ovifj#|49$0k^1_b?&yEV{w z7Ettm_*tvgfsFpgUSu_{g9P+nR@HbC^g#b*HI2smtz@PrZS2kV+4E=}qHa?7Aq5tqptG!0Ki;0i2D5Iw6JW5zPv7m^atLF!FqM?XxMCa$}L}L*> zSI-}z6KjiZCC2$W(Og8&)$@ba1(5LEB6_Z#KguF+(pf~$)$@yVllqFvNU&m^NEO{c ziLq83dpIQT61$OxL2&L4xHl5TkB{!rcOTJ7T#*=yA zM&|1goWb@yLWZvfSYSWGY_%TWM`5kUQ&dR-5taR3vpbiGN=_r>M1wrY$iowDJ!jlG8HH(Wx= zVoK7Dp3-(aq!>eVDWuBsD0A%NNYW#XiT?ob3u-8iJ~YJ|mk$e_G!80$6y%c%0K5-i z@+U$B#1y@1n_@L#&9#0CT}keO!=n)(R0oLFczQ%Wi54ZryFnJ*peSs@YqH5d%(^Wn_1vi&AgGqS=xr0pUEPtmyLo62| z>v?dB(5h$hZYH8og~F)7?t92c!kn&QqREW4s!Aca=JcV%>E5iI=Ik|PNHVNa)l_NZ zZK$voTu5=jr`6O(E>5p%oz8gOwgS6bq^VfXMD#xC)HY9vOLLX@jHg7!tDJZS#V7xs z6F)_Qgedrgh8y|)kz^U#wyFQLZSPdmu7s5YhRB3$vXJStFbfRrM0z?c%mPEdK~XjB zS|V%KWYh_Utl1@NHe_byMFwnB*G~t4mDi)hc&P@7>S;&O+LZ;GPHqDZC!(kHxm9i`-c(YQL1qDwXC3F6) zI2uO(hv8Mf-$KMZwGh$<&GcT1N@;rczmWT9$Q-CZrj<{w+n;UiBdom%*)IXWRWz>Y zG&mEsX2<&R@GyUva!$powN{XbV#vwwLWWLRO(V`@@axDJfkn5*m*UE31U4AqWGf|O z$||7x9!6qhB?+aIuOg{}=1n`iANV9hE(j7Ge1vsGsHD&iO$@D{FvcpLfW}N$g@wlg z1@Xo!J9JFpqQVs=&<5l2OmoDMiZJGHBXO$udt8yeGP?p-?-M+~jek0l1>c)z)#P zhTV+P6(wV=u_GJ8*DWihhD{maR1|=&%iXe+jChh@G!;n}gK{f47fqz(G*1%5)0voZ zkvzha1o4rD=;^T|NrV}bt=f`x*7^y}g&WYgnIlxK=2v;y4huxJAenXf4s5_1-m?sh zjw%Exl{?y7ju2IZAg+c{wI^S9;%ra8M$~u_@QqY6?uB}=BZ%)D2%?N2^&%7D=T(opF3)h;644Ui`NZonP+c`~j@yb)&nYd1h? 
zaRkOY6tEi65;Mj}rH}DK&zM7>6mb&dG=N7-MQ{ejN4VF4tjAwm3XnF zka6|uBy9nQyS-<|>UgIQ3Nyy)1h*q;yoFt^lq_{wQL@aFiy9~T2g4*8W|PIF;S4;( ztlTpSv>`;y=$I8{NDp!!I!4b5J<`ZCC}{)O4Y z!a-8vZ}vd4e>V1D>mF!}&mLSbFFLQjs=F&OFj&=*7?|DCl}aiviAi*Kwao5lZOx>g zFRaoEWzu0o@N|kq99x#fQ|ZX8;>=E+-xuf72!|i*cB(YnnDY+M*b)JU*?7FVlbV`L zwRQE@IWXD)7>g|IA&Ri?QG_5*b+vcJ+YE)S#UaLS<$!HmFLd?9O)(+BjW*6@&Pc^m2JTAW3`4@L*G^LdB<$)nnJ2DTeF7kKqi%)C zN8_T1m*3r$WEKMnfFwU!g}Ch4${ia(^pm{Y{6wm10G}yQO~CYE+x9NJg@_cEJZ9q# z2i7$MEg}3(2C7StNT`4{f#xT>IufyNgB|SX8!)7cTL!wiQ&nAwW79eU^qOU?9a|%I zrOr3RSQZ83zOG$uU2AWx4nA_soi7GI{wltK2wX@%XV_zZX@4frVVxIv+Bz&S^YGwb z?Qy~O;KRtY?NPyMItO3SzuTz559sG@JJj6s$kn?JHXeI)UckO*`!Dk>`zI=xlJxJ8 z{<#YN4m=W8U^2!AWYN1mLvH(HWvywty05_=+iahgvd5;5vHRMCbsNBH@AhUCy~GNx zMB*Pql(^Pvwoh-~X;<#BH>QF&0e|^+WLx%aR^(B?)u^H9}U*-(WcOuXVclfatIxv zp|`EzVh4$KYUClS`M;k=o=vas;!&-H;|D)>@x!%on@v5k5i;@+qe?RkNHT+dU1zy>QXpzHiLQEBL@KoFhCNLYo2 zo(?p#m=`&+e_}!Xra>1`A8EpmtlvS3hqp=k8zjlGciA(a-Vgj8+n_Ydekl-qI4}a{ zO^*u#>P-Cpe@_PgYyVI4ir10z1cYZjwD%hAh>>Imp4B;fUAfn}rrM07hI`6%p77r6 zTghk9WB}hoU`{uR%nUcGH>^NG96k@1YBN&`N@;h-H;3%e=x+N*ij4ezxD0}6+~2<) zRslFzZ68T_*Kemzd{fh1zdZ%JDIoNgc7iZuJ`OJ56zo+HVeTfB$zF>#-k~<{DfuePf`hdcaT)cShPLX)76bko*tuJiFqm--GmJp zacb{LIrC5y+r&gQA0fq~%IJzyqIJ<))YOIYFkHiBu6SU?`yEej{wBuN)ETbZ=Pl5w{IgLQMM#kc_h z3Hsxk-JGsOtiMmS(~HusaF~_j{g_3ylnT$Xqx6W9+NGt^yra?HEs1!)s|x?_LlCO9 zGls|FGHFNC_VaO-+XvRzlnbka?tz|!qE3j?$cr0kAxt|sq1{nhE-Zl7oU(1*Dxr4@ zi}xp0EY;h^n~G^27)3$Gehjc!KlWe-i+EylSATB;Py!nis$`V*%R$Pl&2&9l-WJ+@ z(=3>xDeP#8l?$la&6Yk!9H@hKAcGEisF`MPN{^CZ!N8$%?wM#}AW2eE2XwhD=t}l# z#nPGzd9;q%+M9^;0w@Zmo=v*-G6!wx=_58QUC%%&K8Pov?a9_yqFoInXay8+i$MAP z+Jn)j9cW-*R{}(OPb7tX^QhToTS-`jF+(!IdC_Dml#dQ4X|^U}J!m&tO7cbfx(AXO z-ASuW7s*?Cxebg%*|4BKyb#e(yQ(LZ-l0e{)f-5qNOJ?aKI-4vdEO3&UU`Vn8x=H? 
z`mi!qT|F_h31U&@&_q9YLoe9*$h1KdffPx&R)>dauNn7==+<~YbZ@#a=?t@20s~+7 zn>2+s?4mD{wwNl&K-?V>2!ZQeC}?sMce$tEZ24 zydO;U$6C=Ey*07)v}ZFr^ct=|+Ei}~EM~J|(ArC7wW;c*Ecp~;<)%@j9y37Z)kA?E z9|X#@I$FEqbY&oy10aThz%FSArevL~RXxhLP?yo@#8Byl?O0Vo)&`oXe&0Ve9 z_`7=(9TEM5qPnZJW{h(sx2!iZ)V`6b(}Pg8clW~oLma4xdVqAN)dJDVaxShcF(xr@t8JL2r;f zYZVhQSrs(c);rMB4M#|hh37yFn{Lv(h(oRk*SzfHVfy$pK0=Rf*Me*Abu0QRdd=By;}e# zlX3G~XHIkUDH*3T$dFwh1K2uPJCAZ=o$WaF^lomUVzGDBp;F_4hU14L2NT&@ykNiS35v0GP<{# z+Cbl*;tD&l?O9b%e_MZ|4d?dSd98Fxb|lF}bq@Bnq^!RF_SR0)M68V}X&+4X<1|Qf zznsxWl)rg!ZfzYkAN80-oJbAWO1zwmJ|@HRqdg7-+>#Wj*rLVK)ov!WI_$d~MVd039sQYfJ@WgwYZ$XRk?10Jt=(H%46B|kv91(bLu)t9 z55V_r8Zf#2ZGBy6QdM6J&OOpyTh$uFM%Pt(54$-&4r7+nCc-v)D@@@jEaLsz$nUYZ5kk||n!@?uTz z$eE;-2z^i5-XDh(KR-!JNu!N)|C@rJho~?PGeOddjAjtRx;oPnvvW0|7ZYX)<4828 z4yHz0&=)o7q2Y)~a?`FfB=F)u(3wnr#NW7ZOE6N2UAP-vMQ(+B1$>VlFg=(zaygAea-gLp2BUHP0Bp&NhUmkmr}Bn(9c`B%p|7snZ1%d5Bk&Bb|J$2SNemBP~NHDG>>T z0(zxu+V7D|gYq@1)E`haPOT-Z6T%5x=ICjyfT9^oohG2vO(g~XHr_O#17rqpfB&5j z(i_A|;Ek>j0=|U@_H>|I1QeZEYNLQ<+@+xUccQ-JkZlxDzHmZG0p;5q@E35M)5fy} zbep}=Na+4v#uo8KKHI9`ll_vA4zbiZViT`9#(d827~JMG`y6Sb=oymA3#i)t0=sI6iW(kt+8h;7 z+HR{oyM`p;Dk3VN^rois_oGH;sx0d7NloW(stn7AT+^xU!0p-I@jBL@--K;CLRzO#6QGfL^V6JytE1)ZpGr!50#*~-Y+S4;z zn=#^~I+-`f1u%dQK#G32IliC)S*PUQq%#JH*I0yXVIIV}28+>@*dep0qy zNWInjdzV=I8%#K9u`6cHeu1SwK7|NED*Z&PbG6p1Ie}+#YvE8{+$J9!N_#ZY-yn_j zw@4$UNg|6z{!fGveNvYRECU~ADxio*WPa%gC&I}`%%l^67kjM2FK;$Wkr*v!yF)|3 za~Wl;J}X1#GByu6J|$!P2#G{y8HJZ*_ZfBNWGA$6Y<|VcXUi3iq&Z3ODu6yvql9G1 zs8QGnZ9vV-ppiT`42u(>C4*LIBq~e3`r9W{{cS*6zEGCMCS;U&D;Yffv;j3YgT}vV z0l!UqO}+n-Hd~t|I?D`R+JT=0Tgl+(rwyn%88i}qMZ&Crp9EORpuLOhe9*01BBuOD z+JKs$LGwKG`8A($Mdt4jSxHKe6o@B5Q8L*1X|t(Mf{@6%2|fZ}U;7RrlJFy#Gkg?I zFcaZ}Da%hH!AGX@`~=Y@lt_l?e%frJ%lfFr&Clfu{j}NmN%WKZm@oL{bX`)X7#L_z%~iLbHe1~?2xMLd zKM72dp=&>FHe(edmCz&LCqYRv`1xtGxmMvLR}J7N0ZB6W`DwH96FxF6gP#N=$>8Uw z&Bjj#vkYYLdnngf^3!JPkA1wPkhLH9Nq~|J`TVrm_+7|rgWY__)Fj}&4(fCPB|d<; zwI`|9T|xmRUVtH#)TjzGFtW@unQ}CV9QQcvPZRK44r;xCB8<5KD5+tG@H7G6a8T<7 
z6!}b9Nx9Ny+@TZp4ym#w$U{ai_-V8Cf`m}8T-gM?fKmBs;Q7D*up@)L6fJOyHVG(2 zO|y*Z-CuvfY*#eGshF8(qmjI_e?ueh(=Ah6UD zl^rfaO7UNL`q-So}Nug;rxC{Q{9Qy5jq+4Px;XmQti9oGXZd{+&3| zCs+^*C`z}~sRD{D`r1_zViBP8*XZwjQh>0R-bRfAO2_A5pRrUI1-#=ZqGrFq<*X38 zIu}@MA)xB^3w)77SF>N>-43pHzrbH{aCMGpJnP_U_6z)ygR9*yuxN!3eDsRTk%tVNVI1FQn%}sz^7VKIEt4pGc&!hpTEJx5gIXuxcO6u-fNpi`I5XoUt)!7I@FZ8t z$qX)aQGOO-M$F)IP^HvnRZ)Aye!+;rpo;~3KZDrM)~}C%`mCQs&0$Q8&z5RwXE5cW zwlOFiEfsr?QLi{ut##bO2B)Pl0mT9Wm{R@6OH&Ny*L-d$H|ewZ{5YS*i+$SiCx6QF z5;aOZ?L!e$!%KYj*C)5{&KqxXBp8<5`t#Gvt-PNx{Y!k7hmd}Hc@lXG`(e3#c2NAX zuK;~bPRH?lK9SFB_$)VdiLUZJc%~4J7zGIsbk>%j46> zIQdm4UvSyoAiF2(jGuV(n^@kOIPM)7$JpT)dn z!V*Wr+0IGu$1>)4KFg%%5HfHIpXDa4e`fR3|HrhA$&ljx`wx7CrJAzhZI#N3&vW** z^zyRt(dYA7>GRd|h$i7bY<#RK`$2m1Y!y_83cVNu<4p`idCG<#rD(?m|6!{_XMay` zo{hQormbts5jSE&%m=s$-wgrmx1S*Dgz(!VQ3jhuZ+F@i9e$_2c9GkAOy`JpGcG>e2q_} z8pg~1vq{AE8|qNU?f{97v19&Q6ZTe}Wu`eIdl z2LK#0oSp$}q2G&Ho;;na)C)LggBtzmqh55I!(yBR;uuu(0HKfw4B#($Ra&olmT#$V0&LyQ-Tzl-t3 z-!gvG_8jSC77X~AE8Ro1aaeEpkE{w~lXP)CNp+N$|v`m3ExFZ(VFd)mtoapE%k zCxw!~t(E;|ux;QxenDL1#4d>I)A9M^e{?@ZhI%ZDlFB;DY?+M@F68~$4 z400#uQQY|~Q;V9O1HX{@oN}6BAa=Eq@#|>dz&8=gM4L6 zp%LkzjWDU+O)O`NK`Yt{lkly33}S%sNygvZZxGyVdIuz;A4(ehQB1!V^d$ee!x4?) z=BckSpNr2lgy%5*eHzaR&*|A5_*aW}A zD(2I1s==Jg8SRYUn=ptO+}_VHe!=GqUhMoC7y@DH*+=QI8l9w^&b z4<|7GcE0QOUyN_kc&^NI>d1lL!F+B#$q<(^pKBREh2^PbUbiv6oawoF>LJE|wAB#q zVtNWuNp^A!Gx!DLf6n+b+2Nhe__s8k6Q0w@Oz)4KNs%py|G}ssevaiw+$7*Lk20qJ zfbIWXCYjIp-**~BA@fPh4Ae&2Y! 
zpZPy|qG8O_nEC=Gam`7D&zWBOn)7B)Hj%(q94+DS6nZn)jyg36ZZe# z;|$t6l=Q!g9i+@hlNrB!lOdG(cb>)zSNFV%>DM+GT#W61BjX1*82n*eq!0KA!`{>O z9Q>~WehlJzx%p{`;8_egPI>Sfi z{l7E*jTVC!KV}ygeNH;Y;3>8+9pg3Lz~0{)rvC!-Y39sDjK4o_5VWqP<5b3f?rf93 zobhKf{v@`mZ6;fFGG1^gv+>NCNuse_5-5-dd6SZZRio=Mjy2GDA~E<3HBO; zR?Uo;n@i=a&o+%`BF||6{EBIv^LoUCRq7ocUkCX*^eE09%)#dhuJ=#e;X;2W}Te4Chb3Q>tW>oc&xJjw6-=Lhg+a=7D8*2x{vF2*KEq`ajDP0@gP6s5 z+US+&U)*C5;-Bx)cqa0k_T|7I$bo-?`5(r9@@D4s0^?_}0yZ=LFN}ZtJcHorN6{wP zV?A;RNc+NO!ylCc-;e{}r12s36wAqCsrDT7Tbcek?yt!_06w3C{>B{mdznvOt7#Dr zV|6eG{m+^HLLQ)(vm*YGgP!)Xrv9D4^K6XiDGGTuy&b9XAvJ~-AoJQ1#(#H>Vel;T zS;6?HPBn;?jHmF-RB!MSgLs(npJn{7_(DqL`JBcxk>|9R>BFq&BRKC);K~0S9FAxV zvr`Y|;PVXA7r$=k+5M}Z0p8T8#dLnEnEs=q z4eA?~IUmRP02{ve$@PpM$@4pRr)t-Dhphu_W%`9z8B_`L-^KU}USRIx64&P7bEoiO zdBh$bW&9mU!$AD=4}hnB8TWcbXYjC4&oX^&%FzD^>wY?Z!uWH!T}K+UdQIaw;W@nr ze1#`1;P_(Ff9CUoF&ZCIm$1UVzy)X*N9wP`XBb2S<7YAcP{bgvVEjVHf1T%3nO{~h zem(PfiyOR=@xNg|+`xP`X*?%9r|nFC|4xJaE$3au_&Dod>isJ4r04MK5lx9@r#`-+ z@tp9S9@Bh02nJtdJ`FsMgwLOH(EDRWwDzZxDa1rVAU6a&i+3O_S-d}*>Y>;?2}EQ< zFoWJ`M|W=v0*bXEjs*f03=Ap>Xw+6cf8N3c*@^H~eWELhxa+a=5x6VWe?9^cAXr7T zZJ?*;d=&Ad5Ui#vl}5@TL@AcT+I1&1HEoEVxPHy?O)E}W6OC5a*UYO|6sHEU{`%uR zU~XbbpbefN7b%4G>FTC-#oA&~1T+Acm0SJ%np_#wv@G@4FQAw`Su+;q$XH0hezKKV zI6r&F+!_P~%2r}-O^!O|*5;^VZf&kQ>gtVloDiz5Dwb^R>WZ4UUC`K;P6Rec_QhIt zR5RA74ueE-7bvu%6MP^VJ#pRAlTV1Aux5EQ3fa>-6|oDJZ&E?=jjC#^nZ+0xa~Q%*ebv=f@5O-q*{h8YME8L>6mnjFyPl_o94TjQW2 z677gWUoW@kuC+g@7V`t&p>)w+|r(DPk3ChU!dEED{POVj_S<<2qIj3mK1Z;xK#Zl_8W0 z!hfXWJUmU{6pD>!V(l1iI_QaGCZ3Qfh#Z2zEjhV4(ZX~$ISe=gqvq-lh34|A=ZJhR zCe7^i(ABoe;!~g`Zhd}Zr5>%RU%-S z8Nea~nTW9v9W%`xy0BqgT!#uWa#HLUEfs~{p>QzCzU;k9Ly?%KOT*4gyFdiF>GG9z zqQ0e1C|$|sSr|ZY` zYpTqw(bGwer-^Rq?cXGt@eKiPxKtNL-0>_EuB-+;{B(UfSfa}ls_TpQw_-UN zr7@x7k|78bNNF0%5FO-HA&*}4UC%%ddb1^o_`d!@=2n+Qnf@lgSx}^49hVkCvYhaW zt^{nD)MnT?l}CIx6Vr^e+|lEbra6DtXX*%H(%)@eNoT>}coA{#7#ccs7zLVg+48zE zTcBNN8LBZ;w@2fSsioVkRw2c4#y)D8(OXl#*+R$0a_gn$Fw?Yv#GZ=GT#VrmM|dwS zi#x?>pQg~u{k{F^Ok0R39ffwoa@y08kQZx#bR1XC%Pv?T1>MYOiKAIDM>IF);{+5c 
z$lwlrqMx;op;^XXDoX%CqD8#73{GS+@S#|=@xlwMHoZ>DN=cn#)M?_1azn|Cq!)Z7 zX8GeZHCrEP{bXCvIi*NIip%N=&PGw$d?IqZH4{iTJue!?Lf5$t@x*<0$!G|1v^2d& z{&y>6j1TEEkDci;?KLwG8G5O0pwF$1&DI?i=uvY4VjM5%v)H7kGLCMJ<%SOPM_V4E zOr2Tjf|-m~AhA;QJ6gnr9YmB>gbVM+P|FaD%!{TTu1~Iv(gPlQgfTugo!pr~oR*P* z_)D+N>0^(wJe(8S@p9ZkoS>BS@@NS~>5sP1`p?apgUa9oGl)IGX+6Z~dR3h~zX!1% zamI>7pF0IYtjIXRs#hlvy0d!evK3V+JghJ&Y-CeSkvZF{5c0a3Qk^kO5!G$yC&15~ z5j|1oY{oSPg^u(lqR8uycgLszLw(&TRjt)u4Y0bS7fHkf$1mMK0TE_{R7_RJJNb@D zCt}PyDO1$kIx!^#MhysC-wK&@aZF398nkF_OhUkn@X? zwBo8EJ*nkG8Ie*}5u;@+1cm@C-5$Q2RIC&2?_%fwj zdod|DEqv%bDm>zF$``*J(WJ$nvi$3yCfgG;k?#q@TwW9-_9^vBv#v*3(w|fQ)d*79 zRPR-NQg2eG=aBMpehYct^3MB4!=_NgoSa`NCx}A;2q*Q+^S8(_Q~2tSFj`LU0U(kZ zLGW41SNib6PR<`AUtllkieZzw$%_Q+r=`y|{WvP}$6@-#wv_z5&!jSGOLLT;$=`uK zEgi=U{B_O9QT`A=0KCAbaDVwZT;8w0ZvL{fnvBzPNO?IQ%jG>J8oc2sLlKh)SWF=0 zgx-7yASw6!ua$ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" +#include "xgpu.h" + +#define N_INTS 128 + +// global variables +int DEBUG = 0; +const int n_all = 3194880; + +// to extract autocorrelation data +void auto_extract(float *output, float *specs); + +void auto_extract(float *output, float *specs) { + + int bctr = 0, idx, oidx = 0; + for (int a1=0;a1<63;a1++) { + for (int a2=0;a2<=a1;a2++) { + + if (a1==a2) { + for (int f=0;f<384;f++) { + for (int pol=0;pol<2;pol++) { + idx = 2*((bctr*384+f)*2+pol); + specs[oidx] += output[idx]; + } + oidx++; + } + } + bctr++; + + } + } + + +} + +// for extracting data +// assumes TRIANGULAR_ORDER for mat (f, baseline, pol, ri) +void simple_extract(Complex *mat, float *output); + +void simple_extract(Complex *mat, float 
*output) { + + int in_idx, out_idx; + for (int bctr=0;bctr<2080;bctr++) { + for (int pol1=0;pol1<2;pol1++) { + + for (int f=0;f<384;f++) { + + out_idx = 2*((bctr*384+f)*2+pol1); + in_idx = (2*f*2080+bctr)*4+pol1*3; + output[out_idx] = 0.5*(mat[in_idx].real + mat[in_idx+8320].real); + output[out_idx+1] = 0.5*(mat[in_idx].imag + mat[in_idx+8320].imag); + + } + } + } + +} + + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) +{ + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + + if (dada_hdu_unlock_write (out) < 0) + { + syslog(LOG_ERR, "could not unlock write on hdu_out"); + } + dada_hdu_destroy (out); + +} + +void usage() +{ + fprintf (stdout, + "dsaX_fake [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -i in_key [default TEST_BLOCK_KEY]\n" + " -o out_key [default REORDER_BLOCK_KEY2]\n" + " -h print usage\n"); +} + +// MAIN + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_wrangle", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + /* DADA Header plus Data Unit */ + dada_hdu_t* hdu_in = 0; + dada_hdu_t* hdu_out = 0; + + // data block HDU keys + key_t in_key = TEST_BLOCK_KEY; + key_t out_key = REORDER_BLOCK_KEY2; + + // command line arguments + int core = -1; + int arg = 0; + int output_specs = 0; + + while ((arg=getopt(argc,argv,"c:i:o:sdh")) != -1) + { + switch (arg) + { + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + 
break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'd': + DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + case 's': + output_specs=1; + syslog (LOG_INFO, "Will output spectra files"); + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + // Bind to cpu core + if (core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (0); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + char * header_out = 
ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); + uint64_t bytes_read = 0; + char * block; + uint64_t written, block_id; + Complex * cblock; + float *data = (float *)malloc(sizeof(float)*n_all); + + // spectra outputs + FILE *fout, *fmjd; + char fnam[100]; + float *specs = (float *)malloc(sizeof(float)*63*384); + float mjd; + int ctr = 0; + + // set up + + int observation_complete=0; + int blocks = 0, started = 0; + + syslog(LOG_INFO, "starting observation"); + + while (!observation_complete) { + + // open block + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + cblock = (Complex *)(block); + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + + if (!(fmjd = fopen("/home/ubuntu/tmp/mjd.dat","r"))) { + syslog(LOG_ERR,"could not open fmjd"); + } + fscanf(fmjd,"%f",&mjd); + fclose(fmjd); + sprintf(fnam,"/home/ubuntu/data/specs_%f.dat",mjd); + + } + + // DO STUFF - from block to summed_vis + + if (DEBUG) syslog(LOG_DEBUG,"extracting..."); + simple_extract((Complex *)(block), data); + if (DEBUG) syslog(LOG_DEBUG,"extracted!"); + + // write to file if needed + if (output_specs==1) { + + if (ctr==0) + for (int i=0;i<63*384;i++) specs[i] = 0.; + + 
auto_extract(data, specs); + ctr += 1; + + if (ctr==N_INTS) { + fout = fopen(fnam,"a"); + for (int i=0;i<63*384;i++) + fprintf(fout, "%f\n", specs[i]); + fclose(fout); + ctr=0; + } + + } + + + // write to output + written = ipcio_write (hdu_out->data_block, (char *)data, block_out); + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + if (DEBUG) { + syslog(LOG_DEBUG, "written block %d",blocks); + for (int i=0;i<10;i++) { + syslog(LOG_INFO, "%g", data[i]); + printf("%g ", data[i]); + printf("\n"); + } + } + blocks++; + + + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + free(data); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + +} + + diff --git a/legacy/dsaX_wrangleAndWrite.c b/legacy/dsaX_wrangleAndWrite.c new file mode 100644 index 0000000..6cd4a33 --- /dev/null +++ b/legacy/dsaX_wrangleAndWrite.c @@ -0,0 +1,365 @@ +/* will reorder raw data for input to xgpu */ +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" +#include "xgpu.h" + +// global variables +int DEBUG = 0; +const int n_all = 3194880; +const int nbl = 2080; + +// for lookup table generation +// index is position to extract from xgpu array to output (Greg-style) array +void gen_lookup(int * idx_xgpu_in_greg); +void gen_lookup(int * idx_xgpu_in_greg) { + + // get antenna order in xgpu + int xgpu_ant_1[nbl], xgpu_ant_2[nbl], ct=0; + for (int i=0;i<64;i++) { + for (int 
j=0;j<=i;j++) { + xgpu_ant_1[ct] = j; + xgpu_ant_2[ct] = i; + ct++; + } + } + + // get antenna order in Greg + int gh_ant_1[nbl], gh_ant_2[nbl]; + ct=0; + for (int i=0;i<64;i++) { + for (int j=i;j<64;j++) { + gh_ant_1[ct] = i; + gh_ant_2[ct] = j; + ct++; + } + } + + // match antenna orders + for (int i=0;i= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not 
mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); + uint64_t bytes_read = 0; + char * block; + uint64_t written, block_id; + Complex * cblock; + float *data = (float *)malloc(sizeof(float)*n_all); + + + // set up + + int observation_complete=0; + int blocks = 0, started = 0; + + syslog(LOG_INFO, "starting observation"); + + while (!observation_complete) { + + // open block + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + cblock = (Complex *)(block); + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + } + + // DO STUFF - from block to summed_vis + + if (DEBUG) syslog(LOG_DEBUG,"extracting..."); + simple_extract((Complex *)(block), data); + if (DEBUG) syslog(LOG_DEBUG,"extracted!"); + + // write to output + written = ipcio_write (hdu_out->data_block, (char *)data, block_out); + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + if (DEBUG) { + syslog(LOG_DEBUG, "written block %d",blocks); + for (int i=0;i<10;i++) { + syslog(LOG_INFO, "%g", data[i]); + printf("%g ", data[i]); + printf("\n"); + } + } + blocks++; + + + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + free(data); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + +} + + diff --git a/legacy/dsaX_writeFil.c b/legacy/dsaX_writeFil.c new file mode 100644 index 0000000..751db9d --- /dev/null +++ b/legacy/dsaX_writeFil.c @@ -0,0 +1,486 @@ 
+/* This works pretty much like the trigger code. receives a control UDP message +to store some data for a fixed amount of time. +Message format: length(s)-NAME +Will ignore messages until data recording is over +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "multilog.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + +#include +#include + + +FILE *output; + +void send_string(char *string) /* includefile */ +{ + int len; + len=strlen(string); + fwrite(&len, sizeof(int), 1, output); + fwrite(string, sizeof(char), len, output); +} + +void send_float(char *name,float floating_point) /* includefile */ +{ + send_string(name); + fwrite(&floating_point,sizeof(float),1,output); +} + +void send_double (char *name, double double_precision) /* includefile */ +{ + send_string(name); + fwrite(&double_precision,sizeof(double),1,output); +} + +void send_int(char *name, int integer) /* includefile */ +{ + send_string(name); + fwrite(&integer,sizeof(int),1,output); +} + +void send_char(char *name, char integer) /* includefile */ +{ + send_string(name); + fwrite(&integer,sizeof(char),1,output); +} + + +void send_long(char *name, long integer) /* includefile */ +{ + send_string(name); + fwrite(&integer,sizeof(long),1,output); +} + +void send_coords(double raj, double dej, double az, double za) /*includefile*/ +{ + if ((raj != 0.0) || (raj != -1.0)) send_double("src_raj",raj); + if ((dej != 0.0) || (dej != -1.0)) send_double("src_dej",dej); + if ((az != 0.0) || (az != -1.0)) send_double("az_start",az); + if ((za != 0.0) || (za != -1.0)) send_double("za_start",za); +} + + +/* global variables */ +int quit_threads = 0; +int dump_pending = 0; +int trignum = 0; +int dumpnum = 0; 
+char iP[100]; +char srcnam[1024]; +float reclen; +int DEBUG = 0; + +void dsaX_dbgpu_cleanup (dada_hdu_t * in); +void convert_block(char * b1, char * b2); + +void usage() +{ + fprintf (stdout, + "dsaX_image [options]\n" + " -c core bind process to CPU core\n" + " -b write one beam\n" + " -f filename base [default test.fil]\n" + " -k in_key [BF_BLOCK_KEY]\n" + " -i IP to listen to [no default]\n" + " -s integrate N ints MUST BE FACTOR OF 16384 [default 1]\n" + " -m get mjd from file\n" + " -d DEBUG\n" + " -h print usage\n"); +} + +void dsaX_dbgpu_cleanup (dada_hdu_t * in) { + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + +} + +// Thread to control the dumping of data + +void control_thread (void * arg) { + + udpdb_t * ctx = (udpdb_t *) arg; + syslog(LOG_INFO, "control_thread: starting"); + + // port on which to listen for control commands + int port = WRITEVIS_CONTROL_PORT; + char sport[10]; + sprintf(sport,"%d",port); + + // buffer for incoming command strings, and setup of socket + int bufsize = 1024; + char* buffer = (char *) malloc (sizeof(char) * bufsize); + memset(buffer, '\0', bufsize); + const char* whitespace = " "; + char * command = 0; + char * args = 0; + + struct addrinfo hints; + struct addrinfo* res=0; + memset(&hints,0,sizeof(hints)); + struct sockaddr_storage src_addr; + socklen_t src_addr_len=sizeof(src_addr); + hints.ai_family=AF_INET; + hints.ai_socktype=SOCK_DGRAM; + getaddrinfo(iP,sport,&hints,&res); + int fd; + ssize_t ct; + char tmpstr; + char cmpstr = 'p'; + char *endptr; + float tmp_reclen; + + syslog(LOG_INFO, "control_thread: created socket on port %d", port); + + while (!quit_threads) { + + fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol); + bind(fd,res->ai_addr,res->ai_addrlen); + memset(buffer,'\0',sizeof(buffer)); + syslog(LOG_INFO, "control_thread: waiting for packet"); + ct = recvfrom(fd,buffer,1024,0,(struct 
sockaddr*)&src_addr,&src_addr_len); + + syslog(LOG_INFO, "control_thread: received buffer string %s",buffer); + trignum++; + + // interpret buffer string + char * rest = buffer; + tmp_reclen = (float)(strtof(strtok(rest, "-"),&endptr)); + char * tmp_srcnam = strtok(NULL, "-"); + + if (!dump_pending) { + reclen = tmp_reclen; + strcpy(srcnam,tmp_srcnam); + syslog(LOG_INFO, "control_thread: received command to dump %f s for SRC %s",reclen,srcnam); + } + + if (dump_pending) + syslog(LOG_ERR, "control_thread: BACKED UP - CANNOT dump %f s for SRC %s",tmp_reclen,tmp_srcnam); + + if (!dump_pending) dump_pending = 1; + + close(fd); + + } + + free (buffer); + + if (ctx->verbose) + syslog(LOG_INFO, "control_thread: exiting"); + + /* return 0 */ + int thread_result = 0; + pthread_exit((void *) &thread_result); + +} + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_writeFil", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + /* DADA defs */ + dada_hdu_t* hdu_in = 0; + multilog_t* log = 0; + key_t in_key = BF_BLOCK_KEY; + + /* actual struct with info */ + udpdb_t udpdb; + + // command line + int arg = 0; + int core = -1; + float fch1 = 1530.0; + char fnam[300], foutnam[400]; + sprintf(fnam,"/home/dsa/alltest"); + + // for getting MJD + FILE *fmjd; + int get_mjd = 0; + int sumi=1; + int onebeam=0; + + while ((arg=getopt(argc,argv,"c:f:o:i:k:s:bmdh")) != -1) + { + switch (arg) + { + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + printf ("ERROR: -c flag requires argument\n"); + return EXIT_FAILURE; + } + case 'k': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-k flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'f': + strcpy(fnam,optarg); + break; + case 'i': + 
strcpy(iP,optarg); + break; + case 'd': + DEBUG=1; + break; + case 'b': + onebeam=1; + break; + case 'm': + get_mjd=1; + break; + case 's': + sumi = atoi(optarg); + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + // DADA stuff + + udpdb.verbose = 1; + + syslog (LOG_INFO, "dsaX_writefil: creating hdu"); + + hdu_in = dada_hdu_create (); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"dsaX_writefil: could not connect to dada buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"dsaX_writespec: could not lock to dada buffer"); + return EXIT_FAILURE; + } + + // Bind to cpu core + if (core >= 0) + { + syslog(LOG_INFO,"binding to core %d", core); + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"dsaX_writefil: failed to bind to core %d", core); + } + + int observation_complete=0; + + // more DADA stuff - deal with headers + + uint64_t header_size = 0; + + // read the headers from the input HDUs and mark as cleared + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "main: could not read next header"); + dsaX_dbgpu_cleanup (hdu_in); + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in); + return EXIT_FAILURE; + } + + + // start control thread + int rval = 0; + pthread_t control_thread_id; + syslog(LOG_INFO, "starting control_thread()"); + rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb); + if (rval != 0) { + syslog(LOG_INFO, "Error creating control_thread: %s", strerror(rval)); + return -1; + } + + // set up + int fctr = 0, integration = 0; + char tstamp[100]; + double mjd=55000.; + int rownum = 1; + int dfwrite = 0; + float mytsamp = 4.*8.*8.192e-6; + int NINTS, midx; + + // data stuff + uint64_t block_size = ipcbuf_get_bufsz 
((ipcbuf_t *) hdu_in->data_block); + uint64_t bytes_read = 0, block_id; + char *block; + float *hoblock = (float *)malloc(sizeof(float)*64*1024*16384/sumi); + + // start things + + syslog(LOG_INFO, "dsaX_writespec: starting observation"); + int nblocks = 0; + + while (!observation_complete) { + + // read block + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + if (DEBUG) for (int i=0;i<48;i++) syslog(LOG_INFO,"%hu",((unsigned char *)(block))[i]); + + for (int i=0;i<64*1024*16384/sumi;i++) hoblock[i] = 0.; + + // for writing sum + /* for (int i=0;i<256*48;i++) oblock[i] = 0.; + for (int i=0;i<128;i++) { + for (int j=0;j<256*48;j++) oblock[j] += (float)(block[i*256*48+j]); + }*/ + + syslog(LOG_INFO,"read block %d",nblocks); + + // check for dump_pending + if (dump_pending) { + + // if file writing hasn't started + if (dfwrite==0) { + + syslog(LOG_INFO, "beginning file write for SRC %s for %f s",srcnam,reclen); + + NINTS = (int)(floor(reclen/(mytsamp*16384.))); + //NINTS = (int)(floor(reclen/(0.134217728))); + sprintf(foutnam,"%s_%s_%d_%d.fil",fnam,srcnam,fctr,nblocks); + syslog(LOG_INFO, "main: opening new file %s",foutnam); + + if (!(output = fopen(foutnam,"wb"))) { + printf("Couldn't open output file\n"); + return 0; + } + + if (get_mjd==1) { + if (!(fmjd = fopen("/home/ubuntu/tmp/mjd.dat","r"))) { + syslog(LOG_ERR,"could not open fmjd"); + } + fscanf(fmjd,"%lf",&mjd); + mjd += nblocks*4.294967296/86400.; + fclose(fmjd); + } + + + send_string("HEADER_START"); + send_string("source_name"); + send_string(srcnam); + send_int("machine_id",1); + send_int("telescope_id",82); + send_int("data_type",1); // filterbank data + send_double("fch1",1530.0); // THIS IS CHANNEL 0 :) + send_double("foff",-0.244140625); + send_int("nchans",1024); + if (sumi==1) send_int("nbits",8); + else send_int("nbits",32); + send_double("tstart",mjd); + send_double("tsamp",8.192e-6*8.*4.*sumi); + send_int("nifs",1); + send_string("HEADER_END"); + + syslog(LOG_INFO, 
"main: opened new file %s",foutnam); + + dfwrite=1; + + + } + + // write data to file + syslog(LOG_INFO,"writing"); + + + for (int i=0;i<64;i++) { + for (int j=0;j<16384/sumi;j++) { + for (int k=0;kdata_block, bytes_read); + nblocks += 1; + + } + + // close control thread + syslog(LOG_INFO, "joining control_thread"); + quit_threads = 1; + void* result=0; + pthread_join (control_thread_id, &result); + + free(hoblock); + dsaX_dbgpu_cleanup(hdu_in); + +} diff --git a/legacy/dsaX_writevis.c b/legacy/dsaX_writevis.c new file mode 100644 index 0000000..02cebb7 --- /dev/null +++ b/legacy/dsaX_writevis.c @@ -0,0 +1,428 @@ +/* This works pretty much like the trigger code. receives a control UDP message +to store some data for a fixed amount of time. +Message format: length(s)-NAME +Will ignore messages until data recording is over +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "multilog.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" +#include "fitsio.h" +#include "xgpu.h" + +/* global variables */ +int quit_threads = 0; +int dump_pending = 0; +int trignum = 0; +int dumpnum = 0; +char iP[100]; +char srcnam[1024]; +float reclen; +int DEBUG = 0; + +// assumes that only first 78 baselines are written and 384 channels and 2 pols +const int n = 9216; +float summed_vis[9216]; +const int n_all = 3194880; + +// for extracting data +// assumes TRIANGULAR_ORDER for mat (f, baseline, pol, ri) +void simple_extract(Complex *mat, float *output); + +void simple_extract(Complex *mat, float *output) { + + int in_idx, out_idx; + for (int bctr=0;bctr<2080;bctr++) { + for (int pol1=0;pol1<2;pol1++) { + + for (int f=0;f<384;f++) { + + out_idx = 2*((bctr*384+f)*2+pol1); + in_idx = 
(2*f*2080+bctr)*4+pol1*3; + output[out_idx] = 0.5*(mat[in_idx].real + mat[in_idx+8320].real); + output[out_idx+1] = 0.5*(mat[in_idx].imag + mat[in_idx+8320].imag); + + } + } + } + +} + + + + +void dsaX_dbgpu_cleanup (dada_hdu_t * in); + +void usage() +{ + fprintf (stdout, + "dsaX_image [options]\n" + " -c core bind process to CPU core\n" + " -d debug [default no]\n" + " -k in_key [default XGPU_BLOCK_KEY]\n" + " -f filename base [default test.fits]\n" + " -o freq of chan 1 [default 1494.84375]\n" + " -i IP to listen to [no default]\n" + " -h print usage\n"); +} + +void dsaX_dbgpu_cleanup (dada_hdu_t * in) { + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + +} + +// Thread to control the dumping of data + +void control_thread (void * arg) { + + udpdb_t * ctx = (udpdb_t *) arg; + syslog(LOG_INFO, "control_thread: starting"); + + // port on which to listen for control commands + int port = WRITEVIS_CONTROL_PORT; + char sport[10]; + sprintf(sport,"%d",port); + + // buffer for incoming command strings, and setup of socket + int bufsize = 1024; + char* buffer = (char *) malloc (sizeof(char) * bufsize); + memset(buffer, '\0', bufsize); + const char* whitespace = " "; + char * command = 0; + char * args = 0; + + struct addrinfo hints; + struct addrinfo* res=0; + memset(&hints,0,sizeof(hints)); + struct sockaddr_storage src_addr; + socklen_t src_addr_len=sizeof(src_addr); + hints.ai_family=AF_INET; + hints.ai_socktype=SOCK_DGRAM; + getaddrinfo(iP,sport,&hints,&res); + int fd; + ssize_t ct; + char tmpstr; + char cmpstr = 'p'; + char *endptr; + float tmp_reclen; + + syslog(LOG_INFO, "control_thread: created socket on port %d", port); + + while (!quit_threads) { + + fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol); + bind(fd,res->ai_addr,res->ai_addrlen); + memset(buffer,'\0',sizeof(buffer)); + syslog(LOG_INFO, "control_thread: waiting for packet"); + ct = 
recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len); + + syslog(LOG_INFO, "control_thread: received buffer string %s",buffer); + trignum++; + + // interpret buffer string + char * rest = buffer; + tmp_reclen = (float)(strtof(strtok(rest, "-"),&endptr)); + char * tmp_srcnam = strtok(NULL, "-"); + + if (!dump_pending) { + reclen = tmp_reclen; + strcpy(srcnam,tmp_srcnam); + syslog(LOG_INFO, "control_thread: received command to dump %f s for SRC %s",reclen,srcnam); + } + + if (dump_pending) + syslog(LOG_ERR, "control_thread: BACKED UP - CANNOT dump %f s for SRC %s",tmp_reclen,tmp_srcnam); + + if (!dump_pending) dump_pending = 1; + + close(fd); + + } + + free (buffer); + + if (ctx->verbose) + syslog(LOG_INFO, "control_thread: exiting"); + + /* return 0 */ + int thread_result = 0; + pthread_exit((void *) &thread_result); + +} + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_writevis", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + /* DADA defs */ + dada_hdu_t* hdu_in = 0; + multilog_t* log = 0; + key_t in_key = XGPU_BLOCK_KEY; + + /* actual struct with info */ + udpdb_t udpdb; + + // command line + int arg = 0; + int core = -1; + float fch1 = 1500.0; + int nchans = 384; + char fnam[300], foutnam[400]; + sprintf(fnam,"/home/ubuntu/alltest"); + + while ((arg=getopt(argc,argv,"c:f:o:i:k:dh")) != -1) + { + switch (arg) + { + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + printf ("ERROR: -c flag requires argument\n"); + return EXIT_FAILURE; + } + case 'k': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-k flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'f': + strcpy(fnam,optarg); + break; + case 'd': + DEBUG=1; + break; + case 'o': + fch1 = 
atof(optarg); + break; + case 'i': + strcpy(iP,optarg); + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + // DADA stuff + + udpdb.verbose = 1; + + syslog (LOG_INFO, "dsaX_writevis: creating hdu"); + + hdu_in = dada_hdu_create (); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"dsaX_writevis: could not connect to dada buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"dsaX_writevis: could not lock to dada buffer"); + return EXIT_FAILURE; + } + + // Bind to cpu core + if (core >= 0) + { + syslog(LOG_INFO,"binding to core %d", core); + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"dsaX_writevis: failed to bind to core %d", core); + } + + int observation_complete=0; + + // more DADA stuff - deal with headers + + uint64_t header_size = 0; + + // read the headers from the input HDUs and mark as cleared + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "main: could not read next header"); + dsaX_dbgpu_cleanup (hdu_in); + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in); + return EXIT_FAILURE; + } + + + // start control thread + int rval = 0; + pthread_t control_thread_id; + syslog(LOG_INFO, "starting control_thread()"); + rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb); + if (rval != 0) { + syslog(LOG_INFO, "Error creating control_thread: %s", strerror(rval)); + return -1; + } + + // set up + int fctr = 0, integration = 0; + fitsfile *fptr; + int rownum = 1; + int fwrite = 0; + int status=0; + float mytsamp = 4096*4*8.192e-6; + int NINTS; + + // data stuff + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t bytes_read = 0, block_id; + char *block; + float *data = (float 
*)malloc(sizeof(float)*n_all); + int si1, si2; + int nblocks = 0; + Complex * cblock; + + // start things + + syslog(LOG_INFO, "dsaX_writevis: starting observation"); + + while (!observation_complete) { + + // read block + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + cblock = (Complex *)(block); + + if (DEBUG) { + if (nblocks==20) { + for (int i=100;i<200;i++) { + syslog(LOG_DEBUG,"MAT %d %f %f",i,(float)(cblock[i].real),(float)(cblock[i].imag)); + } + } + } + + // DO STUFF - from block to summed_vis + + if (DEBUG) syslog(LOG_DEBUG,"extracting..."); + simple_extract((Complex *)(block), data); + for (int i=0;idata_block, bytes_read); + nblocks++; + + if (DEBUG) syslog(LOG_DEBUG,"Finished block %d",nblocks); + + } + + // close control thread + syslog(LOG_INFO, "joining control_thread"); + quit_threads = 1; + void* result=0; + pthread_join (control_thread_id, &result); + + free(data); + dsaX_dbgpu_cleanup(hdu_in); + +} diff --git a/legacy/dsaX_xgpu.cu b/legacy/dsaX_xgpu.cu new file mode 100644 index 0000000..d065848 --- /dev/null +++ b/legacy/dsaX_xgpu.cu @@ -0,0 +1,375 @@ +// -*- c++ -*- +/* will run xgpu */ +/* assumes input block size is appropriate */ +#define THRUST_IGNORE_CUB_VERSION_CHECK + +#include +#include +using std::cout; +using std::cerr; +using std::endl; +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +//#include "dada_cuda.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "multilog.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_def.h" +#include "cube/cube.h" +#include "xgpu.h" + +/* global variables */ +int DEBUG = 0; + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) 
+{ + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + + if (dada_hdu_unlock_write (out) < 0) + { + syslog(LOG_ERR, "could not unlock write on hdu_out"); + } + dada_hdu_destroy (out); + +} + +// kernel for fluffing +// run with 6291456 blocks of 32 threads +__global__ void promoter(char *input, char *output) { + + int idx = blockIdx.x*32 + threadIdx.x; + char v = input[idx]; + + //output[2*idx] = ((v<<4) & 240) >> 4; + //output[2*idx+1] = v >> 4; + output[2*idx] = (char)(((unsigned char)(v) & (unsigned char)(15)) << 4) >> 4; + output[2*idx+1] = (char)(((unsigned char)(v) & (unsigned char)(240))) >> 4; + +} + +void usage() +{ +fprintf (stdout, + "dsaX_xgpu [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -i in_key [default REORDER_BLOCK_KEY]\n" + " -o out_key [default XGPU_BLOCK_KEY]\n" + " -h print usage\n"); +} + + +// MAIN + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_xgpu", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + /* DADA Header plus Data Unit */ + dada_hdu_t* hdu_in = 0; + dada_hdu_t* hdu_out = 0; + + // data block HDU keys + key_t in_key = REORDER_BLOCK_KEY; + key_t out_key = XGPU_BLOCK_KEY; + + // command line arguments + int core = -1; + int arg = 0; + + while ((arg=getopt(argc,argv,"c:i:o:dh")) != -1) + { + switch (arg) + { + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'o': + if (optarg) + { + if (sscanf 
(optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'd': + DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + // Bind to cpu core + if (core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (0); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); 
+ if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %d %d\n",block_size,block_out); + uint64_t bytes_read = 0; + char * block; + char * output_buffer; + output_buffer = (char *)malloc(sizeof(char)*block_out); + uint64_t written, block_id; + + + // set up xgpu + + // register input hdu with gpu + //dada_cuda_dbregister(hdu_in); + + // structures and definitions + XGPUInfo xgpu_info; + int syncOp = SYNCOP_DUMP; + int xgpu_error = 0; + xgpuInfo(&xgpu_info); + XGPUContext context; + context.array_h = NULL; + context.matrix_h = NULL; + xgpu_error = xgpuInit(&context, 0); + if(xgpu_error) { + syslog(LOG_ERR, "xGPU error %d", xgpu_error); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + ComplexInput *array_h = context.array_h; // this is pinned memory + Complex *cuda_matrix_h = context.matrix_h; + memset((char *)array_h,0,2*context.array_len); + + syslog(LOG_INFO,"Set up xgpu with input size %d output size %d",context.array_len,context.matrix_len); + + // set up data input for fluffing + char * h_din = (char *)malloc(sizeof(char)*context.array_len); + char *d_din, *d_dout; + cudaMalloc((void **)&d_din, context.array_len*sizeof(char)); + cudaMalloc((void **)&d_dout, 2*context.array_len*sizeof(char)); + + // do prestart + syslog(LOG_INFO, "pre-starting..."); + char * tmp_data = (char *)malloc(sizeof(char)*context.array_len); + memset(tmp_data, 1, context.array_len); + for (int i=0;i<10;i++) { + + cudaMemcpy(d_din, tmp_data, context.array_len*sizeof(char),cudaMemcpyHostToDevice); + 
promoter<<<6291456,32>>>(d_din,d_dout); + //xgpu_error = xgpuCudaXengine(&context, (ComplexInput *)d_dout, syncOp); + xgpu_error = xgpuCudaXengine(&context, syncOp); + xgpuClearDeviceIntegrationBuffer(&context); + + } + + free(tmp_data); + syslog(LOG_INFO, "finished with pre-start"); + + // get things started + bool observation_complete=0; + bool started = 0; + syslog(LOG_INFO, "starting observation"); + int blocks = 0; + + while (!observation_complete) { + + if (DEBUG) syslog(LOG_DEBUG,"reading block"); + + // open block + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + + // DO STUFF + + for (int myint=0;myint>>(d_din,d_dout); + //cudaMemcpy((char *)(array_h),d_dout,2*context.array_len*sizeof(char),cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + // run xgpu + //xgpu_error = xgpuCudaXengine(&context, (ComplexInput *)d_dout, syncOp); + xgpu_error = xgpuCudaXengine(&context, syncOp); + if(xgpu_error) { + syslog(LOG_ERR, "xGPU error %d\n", xgpu_error); + return EXIT_FAILURE; + } + + if (started==0 && blocks==20) { + syslog(LOG_INFO,"now in RUN state"); + if (DEBUG) { + for (int i=100;i<200;i++) { + syslog(LOG_DEBUG,"INPUT %hhi %hhi",array_h[i].real,array_h[i].imag); + syslog(LOG_DEBUG,"OUTPUT %g %g",(float)(cuda_matrix_h[i].real),(float)(cuda_matrix_h[i].imag)); + } + } + started=1; + } + + // clear device + xgpuClearDeviceIntegrationBuffer(&context); + + // write to output + + written = ipcio_write (hdu_out->data_block, (char *)(cuda_matrix_h), block_out); + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks); + blocks++; + + } + + // finish up + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + // finish up + free(output_buffer); + free(h_din); + cudaFree(d_din); + 
cudaFree(d_dout); + //dada_cuda_dbunregister(hdu_in); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + +} + + diff --git a/legacy/dumpfil.c b/legacy/dumpfil.c new file mode 100644 index 0000000..0be913c --- /dev/null +++ b/legacy/dumpfil.c @@ -0,0 +1,294 @@ +//E_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" + +// global variables +int DEBUG = 0; + +void usage() +{ + fprintf (stdout, + "dumpfil [options]\n" + " -d send debug messages to syslog\n" + " -p no header\n" + " -f file to dump to [default none]\n" + " -n blocks to dump [default 30]\n" + " -i in_key [default TEST_BLOCK_KEY]\n" + " -g ignore first block\n" + " -h print usage\n"); +} + + +void dsaX_dbgpu_cleanup (dada_hdu_t * in); + + +void dsaX_dbgpu_cleanup (dada_hdu_t * in) +{ + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + +} + +FILE *output; + +void send_string(char *string) /* includefile */ +{ + int len; + len=strlen(string); + fwrite(&len, sizeof(int), 1, output); + fwrite(string, sizeof(char), len, output); +} + +void send_float(char *name,float floating_point) /* includefile */ +{ + send_string(name); + fwrite(&floating_point,sizeof(float),1,output); +} + +void send_double (char *name, double double_precision) /* includefile */ +{ + send_string(name); + fwrite(&double_precision,sizeof(double),1,output); +} + +void send_int(char *name, int integer) /* includefile */ +{ + send_string(name); + fwrite(&integer,sizeof(int),1,output); +} + +void send_char(char *name, char integer) /* includefile */ +{ + send_string(name); + fwrite(&integer,sizeof(char),1,output); +} + + +void 
send_long(char *name, long integer) /* includefile */ +{ + send_string(name); + fwrite(&integer,sizeof(long),1,output); +} + +void send_coords(double raj, double dej, double az, double za) /* includefile: writes each coordinate via send_double, skipping values equal to 0.0 or -1.0 (presumably "unset" markers; the previous '||' test was always true) */ +{ + if ((raj != 0.0) && (raj != -1.0)) send_double("src_raj",raj); + if ((dej != 0.0) && (dej != -1.0)) send_double("src_dej",dej); + if ((az != 0.0) && (az != -1.0)) send_double("az_start",az); + if ((za != 0.0) && (za != -1.0)) send_double("za_start",za); +} + + + +// MAIN + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dumpfil", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + /* DADA Header plus Data Unit */ + dada_hdu_t* hdu_in = 0; + + // data block HDU keys + key_t in_key = 0x0000aaae; + + // command line arguments + char fnam[100]; + sprintf(fnam,"/home/ubuntu/dumpfil.fil"); + int nbl = 30; + int arg = 0; + int nhd = 0; + int igblock = 0; + + while ((arg=getopt(argc,argv,"f:i:n:pdgh")) != -1) + { + switch (arg) + { + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'f': + if (optarg) + { + snprintf(fnam,sizeof(fnam),"%s",optarg); // bounded copy: fnam is a fixed 100-byte buffer and optarg comes from the command line + break; + } + else + { + syslog(LOG_ERR,"-f flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'n': + if (optarg) + { + nbl = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-n flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'p': + nhd=1; + syslog (LOG_INFO, "Will not write a header"); + break; + case 'g': + igblock=1; + syslog (LOG_INFO, "Will ignore first block"); + break; + case 'd': + DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + syslog(LOG_INFO,"will use %d 
blocks",nbl); + + // DADA stuff + + syslog (LOG_INFO, "creating in hdus"); + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in); + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in); + return EXIT_FAILURE; + } + + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + syslog(LOG_INFO, "main: have input block size %lu\n",block_size); + uint64_t bytes_read = 0; + uint64_t npackets = 1; + char * block, * output_buffer; + uint64_t written, block_id; + + // fill output buffer if file exists + output=fopen(fnam,"wb"); + if(output == NULL) + { + syslog(LOG_ERR,"Error opening file"); + exit(1); + } + + if (!nhd) { + send_string("HEADER_START"); + send_string("source_name"); + send_string("TESTSRC"); + send_int("machine_id",1); + send_int("telescope_id",82); + send_int("data_type",1); // filterbank data + send_double("fch1",1530.0); // THIS IS CHANNEL 0 :) + send_double("foff",-0.244140625); + send_int("nchans",1024); + send_int("nbits",8); + send_double("tstart",55000.0); + send_double("tsamp",8.192e-6*8.*16.); + send_int("nifs",1); + send_string("HEADER_END"); + } + + int observation_complete=0; + int blocks = 0, started = 0; + + syslog(LOG_INFO, "starting observation"); + + + while (blocks < nbl) { + + // 
open block + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + + if (!igblock || started!=0) { + fwrite(block, sizeof(char), bytes_read, output); + blocks++; + } + + if (started==0) started=1; + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + fclose(output); + dsaX_dbgpu_cleanup (hdu_in); + +} diff --git a/legacy/fil2dada.c b/legacy/fil2dada.c new file mode 100644 index 0000000..c49f2b5 --- /dev/null +++ b/legacy/fil2dada.c @@ -0,0 +1,521 @@ +//E_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +//#include "ascii_header.h" +//#include "dsaX_capture.h" +//#include "dsaX_def.h" + +// global variables +int DEBUG = 0; + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); +int dada_bind_thread_to_core (int core); + +/* read fil file header variables */ +char rawdatafile[80], source_name[80]; +int machine_id, telescope_id, data_type, nchans, nbits, nifs, scan_number, + barycentric,pulsarcentric; /* these two added Aug 20, 2004 DRL */ +double tstart,mjdobs,tsamp,fch1,foff,refdm,az_start,za_start,src_raj,src_dej; +double gal_l,gal_b,header_tobs,raw_fch1,raw_foff; +int nbeams, ibeam; +/* added 20 December 2000 JMC */ +double srcl,srcb; +double ast0, lst0; +long wapp_scan_number; +char project[8]; +char culprits[24]; +double analog_power[2]; +/* added frequency table for use with non-contiguous data */ +double frequency_table[4096]; /* note limited number of channels */ +long int npuls; /* added for binary pulse profile format */ + + +int nbins; +double period; + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) +{ + + if (dada_hdu_unlock_read (in) < 0) + { + 
syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + + if (dada_hdu_unlock_write (out) < 0) + { + syslog(LOG_ERR, "could not unlock write on hdu_out"); + } + dada_hdu_destroy (out); + +} + +/* +void get_string(FILE *inputfile, int *nbytes, char string[]) +{ + int nchar; + size_t nRead; + strcpy(string,"ERROR"); + nRead = fread(&nchar, sizeof(int), 1, inputfile); + if (feof(inputfile)) exit(0); + if (nchar>80 || nchar<1) return; + *nbytes=sizeof(int); + nRead = fread(string, nchar, 1, inputfile); + string[nchar]='\0'; + *nbytes+=nchar; +} +*/ + +int read_header(FILE *inputfile); +/* +int read_header(FILE *inputfile) +{ + size_t nRead; + char string[80], message[80]; + int itmp,nbytes,totalbytes,expecting_rawdatafile=0,expecting_source_name=0; + int expecting_frequency_table=0,channel_index; + + + + get_string(inputfile,&nbytes,string); + if (!strcmp(string,"HEADER_START")) + rewind(inputfile); + return 0; + } + totalbytes=nbytes; + + while (1) { + get_string(inputfile,&nbytes,string); + if (strcmp(string,"HEADER_END")) break; + totalbytes+=nbytes; + if (strcmp(string,"rawdatafile")) { + expecting_rawdatafile=1; + } else if (strcmp(string,"source_name")) { + expecting_source_name=1; + } else if (strcmp(string,"FREQUENCY_START")) { + expecting_frequency_table=1; + channel_index=0; + } else if (strcmp(string,"FREQUENCY_END")) { + expecting_frequency_table=0; + } else if (strcmp(string,"az_start")) { + nRead = fread(&az_start,sizeof(az_start),1,inputfile); + totalbytes+=sizeof(az_start); + } else if (strcmp(string,"za_start")) { + nRead = fread(&za_start,sizeof(za_start),1,inputfile); + totalbytes+=sizeof(za_start); + } else if (strcmp(string,"src_raj")) { + nRead = fread(&src_raj,sizeof(src_raj),1,inputfile); + totalbytes+=sizeof(src_raj); + } else if (strcmp(string,"src_dej")) { + nRead = fread(&src_dej,sizeof(src_dej),1,inputfile); + totalbytes+=sizeof(src_dej); + } else if (strcmp(string,"tstart")) { + nRead = 
fread(&tstart,sizeof(tstart),1,inputfile); + totalbytes+=sizeof(tstart); + } else if (strcmp(string,"tsamp")) { + nRead = fread(&tsamp,sizeof(tsamp),1,inputfile); + totalbytes+=sizeof(tsamp); + } else if (strcmp(string,"period")) { + nRead = fread(&period,sizeof(period),1,inputfile); + totalbytes+=sizeof(period); + } else if (strcmp(string,"fch1")) { + nRead = fread(&fch1,sizeof(fch1),1,inputfile); + totalbytes+=sizeof(fch1); + } else if (strcmp(string,"fchannel")) { + nRead = fread(&frequency_table[channel_index++],sizeof(double),1,inputfile); + totalbytes+=sizeof(double); + fch1=foff=0.0; + } else if (strcmp(string,"foff")) { + nRead = fread(&foff,sizeof(foff),1,inputfile); + totalbytes+=sizeof(foff); + } else if (strcmp(string,"nchans")) { + nRead = fread(&nchans,sizeof(nchans),1,inputfile); + totalbytes+=sizeof(nchans); + } else if (strcmp(string,"telescope_id")) { + nRead = fread(&telescope_id,sizeof(telescope_id),1,inputfile); + totalbytes+=sizeof(telescope_id); + } else if (strcmp(string,"machine_id")) { + nRead = fread(&machine_id,sizeof(machine_id),1,inputfile); + totalbytes+=sizeof(machine_id); + } else if (strcmp(string,"data_type")) { + nRead = fread(&data_type,sizeof(data_type),1,inputfile); + totalbytes+=sizeof(data_type); + } else if (strcmp(string,"ibeam")) { + nRead = fread(&ibeam,sizeof(ibeam),1,inputfile); + totalbytes+=sizeof(ibeam); + } else if (strcmp(string,"nbeams")) { + nRead = fread(&nbeams,sizeof(nbeams),1,inputfile); + totalbytes+=sizeof(nbeams); + } else if (strcmp(string,"nbits")) { + nRead = fread(&nbits,sizeof(nbits),1,inputfile); + totalbytes+=sizeof(nbits); + } else if (strcmp(string,"barycentric")) { + nRead = fread(&barycentric,sizeof(barycentric),1,inputfile); + totalbytes+=sizeof(barycentric); + } else if (strcmp(string,"pulsarcentric")) { + nRead = fread(&pulsarcentric,sizeof(pulsarcentric),1,inputfile); + totalbytes+=sizeof(pulsarcentric); + } else if (strcmp(string,"nbins")) { + nRead = 
fread(&nbins,sizeof(nbins),1,inputfile); + totalbytes+=sizeof(nbins); + } else if (strcmp(string,"nsamples")) { + nRead = fread(&itmp,sizeof(itmp),1,inputfile); + totalbytes+=sizeof(itmp); + } else if (strcmp(string,"nifs")) { + nRead = fread(&nifs,sizeof(nifs),1,inputfile); + totalbytes+=sizeof(nifs); + } else if (strcmp(string,"npuls")) { + nRead = fread(&npuls,sizeof(npuls),1,inputfile); + totalbytes+=sizeof(npuls); + } else if (strcmp(string,"refdm")) { + nRead = fread(&refdm,sizeof(refdm),1,inputfile); + totalbytes+=sizeof(refdm); + } else if (expecting_rawdatafile) { + strcpy(rawdatafile,string); + expecting_rawdatafile=0; + } else if (expecting_source_name) { + strcpy(source_name,string); + expecting_source_name=0; + } else { + sprintf(message,"read_header - unknown parameter: %s\n",string); + fprintf(stderr,"ERROR: %s\n",message); + exit(1); + } + } + + + totalbytes+=nbytes; + + return totalbytes; +} +*/ + +void usage() +{ + fprintf (stdout, + "dsaX_fake [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -f file to read packet from [default none]\n" + " -i in_key [default TEST_BLOCK_KEY]\n" + " -o out_key [default REORDER_BLOCK_KEY2]\n" + " -n will not read header\n" + " -b number of blocks to stop after\n" + " -h print usage\n"); +} + +// MAIN + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_fake", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + /* DADA Header plus Data Unit */ + dada_hdu_t* hdu_in = 0; + dada_hdu_t* hdu_out = 0; + + // data block HDU keys + key_t in_key = 0x0000dada; + key_t out_key = 0x0000caca; + + // command line arguments + int core = -1; + int useZ = 1; + char fnam[100]; + int arg = 0; + int rhead = 1; + int nblocks = -1; + + while ((arg=getopt(argc,argv,"c:f:i:o:nb:dh")) != -1) + { + switch (arg) + { + case 'c': + if (optarg) + { + core = atoi(optarg); 
+ break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'f': + if (optarg) + { + useZ = 0; + snprintf(fnam,sizeof(fnam),"%s",optarg); // bounded copy: fnam is a fixed-size array and optarg comes from the command line + break; + } + else + { + syslog(LOG_ERR,"-f flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'b': + if (optarg) + { + nblocks = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-b flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'd': + DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + case 'n': + rhead=0; + syslog (LOG_INFO, "Will not read header"); + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + // Bind to cpu core + if (core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + else syslog(LOG_NOTICE,"bound to core %d", core); // report success only when the bind call did not fail + } + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (0); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if 
(dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); + uint64_t bytes_read = 0; + uint64_t npackets = 1; + char * block, * output_buffer; + char * packet; + packet = (char *)malloc(sizeof(char)*block_size); + output_buffer = (char *)malloc(sizeof(char)*block_out); + memset(output_buffer,0,block_out); + uint64_t written, block_id; + + // fill output buffer if file exists + FILE *fin; + if (!useZ) { + + if (!(fin=fopen(fnam,"rb"))) { + syslog(LOG_ERR, "cannot open file - will write zeros"); + } + else { + + // DMH: FIXME + //if (rhead) read_header(fin); + + // fread(packet,block_out,1,fin); + // fclose(fin); + + // 
syslog(LOG_INFO,"Read packet, npackets %llu",npackets); + + // for (int i=0;idata_block, &bytes_read, &block_id); + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + } + + // DO STUFF + // no need to do anything here - output_buffer is ready to go + + // fread goes here + // count blocks, increment, stop loop and reopen file (or rewind) + + // write to output + written = ipcio_write (hdu_out->data_block, packet, block_out); + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + if (DEBUG) { + syslog(LOG_DEBUG, "written block %d",blocks); + } + blocks++; + + if (blocks==nblocks) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + fclose(fin); + free(packet); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + +} diff --git a/legacy/flagger.c b/legacy/flagger.c new file mode 100644 index 0000000..5262015 --- /dev/null +++ b/legacy/flagger.c @@ -0,0 +1,484 @@ +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" + +#define NTIMES_P 4096 // # of time samples (assuming 1ms sampling period) +#define NCHAN_P 1024 // # of channels on BF node side +#define NBEAMS_P 64 // # of beams on BF side +#define M_P NTIMES_P +#define N_P 32 +#define HDR_SIZE 4096 +#define BUF_SIZE NTIMES_P*NCHAN_P*NBEAMS_P // size of TCP packet + +// global variables +int DEBUG = 0; +double skarray[NBEAMS_P*NCHAN_P+1]; // array with SK values -- size NCHANS * NBEAMS +double avgspec[NBEAMS_P*NCHAN_P+1]; // spectrum 
over all beams to estimate median filter +double baselinecorrec[NBEAMS_P*NCHAN_P+1]; // spectrum over all beams to estimate median filter +int cores[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25}; + +void swap(char *p,char *q) { + char t; + + t=*p; + *p=*q; + *q=t; +} + +double medval(double a[],int n) { // returns the median of a[0..n-1] (lower-middle element when n is even); a itself is not modified + int i,j; + double tmp[n], held; // scratch copy stays double: the former char copy truncated every sample before sorting + for (i = 0;i < n;i++) + tmp[i] = a[i]; + + for(i = 0;i < n-1;i++) { + for(j = 0;j < n-i-1;j++) { + if(tmp[j] > tmp[j+1]) + { held = tmp[j]; tmp[j] = tmp[j+1]; tmp[j+1] = held; } // bubble-sort exchange done in double; swap() above only handles char + } + } + return tmp[(n+1)/2-1]; +} + +/* THREAD FUNCTION */ + +struct data { + unsigned char * indata; + double * inSK; + unsigned char * output; + int cnt; + double nThreshUp; + int n_threads; + int thread_id; + int debug; +}; + +void noise_inject(void *args) { + + struct data *d = args; + int thread_id = d->thread_id; + int dbg = d->debug; + // set affinity + const pthread_t pid = pthread_self(); + const int core_id = cores[thread_id]; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); + const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); + if (CPU_ISSET(core_id, &cpuset)) + if (dbg) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id); + + + // noise injection + + unsigned char *indata = (unsigned char *)d->indata; + double *inSK = (double *)d->inSK; + unsigned char *output = (unsigned char *)d->output; + int * cnt = (int *)d->cnt; // NOTE(review): struct data declares cnt as a plain int, yet it is cast to a pointer here - confirm what the caller stores + double nThreshUp = (double)d->nThreshUp; + int nthreads = d->n_threads; + int i, j, k; + + // copy from input to output + //memcpy(output,indata,(NBEAMS_P/nthreads)*NTIMES_P*NCHAN_P); + + //cnt[thread_id] = 0; + + for (i = 0; i < (int)(NBEAMS_P/nthreads); i++){ + for (k = 0; k < NCHAN_P; k++){ + if (inSK[i*(int)(NCHAN_P) + k] > 
nThreshUp){ + cnt[thread_id]++; + //if (dbg) syslog(LOG_DEBUG,"thread %d: flagging %d %d: sk %g",thread_id,i,k,inSK[i*(int)(NCHAN_P) + k]); + //for (j = 0; j < NTIMES_P; j++){ + //output[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = (unsigned char)(20. * rand() / ( (double)RAND_MAX ) + 10.); + //indata[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = (unsigned char)(20. * 1. / ( (double)RAND_MAX ) + 10.); + //} + + // copy from lookup table + for (j = 0; j < NTIMES_P; j++) + indata[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = output[k*NTIMES_P+j]; + + } + /*else{ + for (j = 0; j < NTIMES_P; j++){ + output[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = indata[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k]; + } + }*/ + } + } + + + + if (dbg) syslog(LOG_DEBUG,"thread %d: done - freeing",thread_id); + int thread_result = 0; + pthread_exit((void *) &thread_result); +} + +/* END THREAD FUNCTION */ + +void usage() +{ + fprintf (stdout, + "flagger [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -i in_key [default dada]\n" + " -o out_key [default caca]\n" + " -n use noise generation rather than zeros\n" + " -t SK threshold [default 5.0]\n" + " -b compute and apply baseline correction\n" + " -h print usage\n"); +} + + +int main(int argc, char**argv) +{ + + // syslog start + openlog ("flagger", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + // threads initialization + int nthreads = 16; + pthread_t threads[nthreads]; + pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + void* result=0; + + // read command line args + + // data block HDU keys + key_t in_key = 0x0000dada; + key_t out_key = 0x0000caca; + + // command line arguments + int core = -1; + int arg = 0; + int noise = 0; + double skthresh = 5.0; + int bcorr = 0; + + while ((arg=getopt(argc,argv,"c:t:i:o:bndh")) != -1) + { + switch (arg) + 
{ + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 't': + if (optarg) + { + skthresh = atof(optarg); + syslog(LOG_INFO,"modified SKTHRESH to %g",skthresh); + break; + } + else + { + syslog(LOG_ERR,"-t flag requires argument"); + usage(); + return EXIT_FAILURE; + } + + case 'd': + DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + case 'n': + noise=1; + syslog (LOG_INFO, "Will generate noise samples"); + break; + case 'b': + bcorr=1; + syslog (LOG_INFO, "Will calculate and apply baseline correction"); + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + // Bind to cpu core + if (core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // CONNECT AND READ FROM BUFFER + + dada_hdu_t* hdu_in = 0; // header and data unit + uint64_t blocksize = NTIMES_P*NCHAN_P*NBEAMS_P; // size of buffer + hdu_in = dada_hdu_create (); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to input buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to input buffer"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + // read the header from the input HDU + char * 
header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + + // mark the input header as cleared + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0){ + syslog (LOG_ERR,"could not mark header as cleared"); + return EXIT_FAILURE; + } + + uint64_t block_id, bytes_read = 0; + unsigned char *in_data; + char *cin_data; + + // OUTPUT BUFFER + dada_hdu_t* hdu_out = 0; + hdu_out = dada_hdu_create (); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"flagged_data: could not connect to dada buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write (hdu_out) < 0) { + syslog (LOG_ERR,"flagged_data: could not lock to dada buffer"); + return EXIT_FAILURE; + } + + /* //read fake header for now + char head_dada[4096]; + FILE *f = fopen("/home/dsa/dsa110-xengine/src/correlator_header_dsaX.txt", "rb"); + fread(head_dada, sizeof(char), 4096, f); + fclose(f); */ + + //// OUTPUT BUFFER + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + header_size = HDR_SIZE; + if (!header_out) + { + syslog(LOG_ERR,"couldn't read header_out"); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + return EXIT_FAILURE; + } + uint64_t written=0; + + //////////////// + + double S1 = 0; + double S2 = 0; + double sampval; + double nThreshUp = skthresh; // Threshold to apply to SK (empirical estimation) + struct data args[16]; + int * flag_counts = (int *)malloc(sizeof(int)*nthreads); + //unsigned char * output = (unsigned char *)malloc(sizeof(char)*NBEAMS_P*NCHAN_P*NTIMES_P); + int nFiltSize = 21; + int cnt = 0; + + // make array of random numbers + unsigned char * lookup_rand = (unsigned char *)malloc(sizeof(unsigned char)*NTIMES_P*NCHAN_P); + for (int i=0;idata_block, &bytes_read, &block_id); + in_data = (unsigned char *)(cin_data); + + // compute SK and averaged 
spectrum + S1 = 0; + S2 = 0; + sampval = 0; + + for (int i = 0; i < NBEAMS_P; i++){ + for (int k = 0; k < NCHAN_P; k++){ + for (int j = 0; j < NTIMES_P; j++){ + sampval = (double)in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k]; + avgspec[i*(int)(NCHAN_P) + k] += sampval / NTIMES_P; + S1 += sampval; + S2 += sampval * sampval; + skarray[i*(int)(NCHAN_P) + k] = (double)((M_P*N_P+1) / (M_P-1) * ( (M_P*S2)/(S1*S1) - 1 )); + } + S1 = 0; + S2 = 0; + } + } + if (DEBUG) syslog (LOG_DEBUG,"has computed SK."); + if (DEBUG) syslog(LOG_DEBUG,"example SK value : %g", (double)skarray[10]); + + // compute baseline correction + if (bcorr) { + for (int i = 0; i < NBEAMS_P*NCHAN_P-nFiltSize; i++) + baselinecorrec[i] = medval(&avgspec[i],nFiltSize); + } + + + // compare SK values to threshold and + // replace thresholded channels with noise or 0 + + if (noise){ + + for (int i=0;i nThreshUp){ + cnt++; + for (int j = 0; j < NTIMES_P; j++){ + in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = 0; + } + } + } + } + } + syslog (LOG_INFO,"%d channels*baselines flagged",cnt); + + // apply baseline correction + if (bcorr) { + for (int i = 0; i < NBEAMS_P; i++){ + for (int k = 0; k < NCHAN_P; k++){ + for (int j = 0; j < NTIMES_P; j++){ + //in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = (unsigned char)(in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] / (unsigned char)baselinecorrec[i*(int)NCHAN_P+k]); + in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = (unsigned char)((double)(in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k]) / baselinecorrec[i*(int)NCHAN_P+k]); + } + } + } + + syslog (LOG_DEBUG,"baseline correction applied"); + } + + // close block after reading + ipcio_close_block_read (hdu_in->data_block, bytes_read); + if (DEBUG) syslog(LOG_DEBUG,"closed read block"); + + written = ipcio_write (hdu_out->data_block, (char *)(in_data), BUF_SIZE); + if (written < BUF_SIZE) + { + syslog(LOG_ERR,"write error"); + return EXIT_FAILURE; + } + + if (DEBUG) syslog 
(LOG_DEBUG,"write flagged data done."); + + + } + + free(lookup_rand); + return 0; +} diff --git a/legacy/gpu_flagger.cu b/legacy/gpu_flagger.cu new file mode 100644 index 0000000..07e6f5c --- /dev/null +++ b/legacy/gpu_flagger.cu @@ -0,0 +1,1547 @@ +// -*- c++ -*- +/*#include +#include +#include +#include +1;95;0c#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +*/ +#include +#include +using std::cout; +using std::cerr; +using std::endl; +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "multilog.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" + +#include +#include + + +#define NTIMES_P 16384 // # of time samples (assuming 1ms sampling period) +#define NCHAN_P 1024 // # of channels on BF node side +#define NBEAMS_P 64 // # of beams on BF side +#define M_P NTIMES_P +#define N_P 32 +#define HDR_SIZE 4096 +#define BUF_SIZE NTIMES_P*NCHAN_P*NBEAMS_P // size of TCP packet +#define NTHREADS_GPU 32 +#define MN 48.0 +#define SIG 6.0 +#define RMAX 16384 +//#define NPERMFLAGS 58 +#define NPERMFLAGS 1 +#define TBIN 128 +#define FBIN 8 + +// global variables +int DEBUG = 0; +//int flagchannels[58] = {737,738,753,754,721,722,723,724,725,726,727,728,729,627,628,629,630,631,632,633,634,603,604,605,606,607,608,609,610,578,579,580,581,582,583,584,585,590,591,592,593,594,595,596,597,598,680,681,682,683,684,685,686,687,688,327,328,329}; +int flagchannels[1] = {10}; +/* 
global variables */ +int quit_threads = 0; +int dump_pending = 0; +int trignum = 0; +char iP[100]; +char footer_buf[1024]; +char flnam[1024]; +int dumpbm; + +// structure for pulse injection +typedef struct { + + int verbose; + float * block; + +} dsaX_pulse_t; + + + + +// kernel to calculate median spectrum +// only works on =naver && thread_id<2*naver) { + + tid=thread_id-naver; + vec[thread_id] = v0[tid*NBEAMS_P*NCHAN_P + block_id]; + + } + + __syncthreads(); + + if (thread_id=naver && thread_id<2*naver) { + for (int i=naver;i<2*naver;i++) { + if (i!=thread_id) { + if (vec[i]<=vec[thread_id]) ct_lt++; + } + } + } + + __syncthreads(); + + + if (thread_id=naver && thread_id<2*naver) + if (ct_lt==place) v0[block_id] = vec[thread_id]; + +} + +// kernel to calculate mean spectrum +// launch with NBEAMS_P*NCHAN_P blocks of NTHREADS_GPU threads +__global__ +void calc_spectrum(unsigned char *data, float * spectrum) { + + int block_id = blockIdx.x; + int thread_id = threadIdx.x; + __shared__ float csum[NTHREADS_GPU]; + csum[thread_id] = 0.; + + int bm =(int)( block_id/NCHAN_P); + int ch = (int)(block_id % (NCHAN_P)); + int tm0 = (int)(thread_id*(NTIMES_P/NTHREADS_GPU)); + + // find sum of local times + int idx0 = bm*NTIMES_P*NCHAN_P + tm0*NCHAN_P + ch; + for (int tm=0; tm0) { + csum[thread_id] += csum[thread_id+act_maxn]; + act_maxn = (int)(act_maxn/2); + } + } + */ + + if (thread_id==0) { + spectrum[bm*NCHAN_P+ch] = csum[thread_id] / (1.*NTIMES_P); + } + +} + + +// kernel to calculate variance spectrum +// launch with NBEAMS_P*NCHAN_P blocks of NTHREADS_GPU threads +__global__ +void calc_varspec(unsigned char *data, float * spectrum, float * varspec) { + + int block_id = blockIdx.x; + int thread_id = threadIdx.x; + __shared__ float csum[NTHREADS_GPU]; + csum[thread_id] = 0.; + + int bm =(int)( block_id/NCHAN_P); + int ch = (int)(block_id % (NCHAN_P)); + int tm0 = (int)(thread_id*(NTIMES_P/NTHREADS_GPU)); + float val; + + // find sum of local times + int idx0 = 
bm*NTIMES_P*NCHAN_P + tm0*NCHAN_P + ch; + for (int tm=0; tm0) { + csum[thread_id] += csum[thread_id+act_maxn]; + act_maxn = (int)(act_maxn/2); + } + }*/ + + if (thread_id==0) { + varspec[bm*NCHAN_P+ch] = csum[thread_id] / (1.*NTIMES_P); + } + +} + +// kernel to calculate maximum value +// launch with NBEAMS_P*NCHAN_P blocks of NTHREADS_GPU threads +__global__ +void calc_maxspec(unsigned char *data, float * maxspec) { + + int block_id = blockIdx.x; + int thread_id = threadIdx.x; + __shared__ float csum[NTHREADS_GPU]; + csum[thread_id] = 0.; + + int bm =(int)( block_id/NCHAN_P); + int ch = (int)(block_id % (NCHAN_P)); + int tm0 = (int)(thread_id*(NTIMES_P/NTHREADS_GPU)); + float val=0.; + + // find max of local times + int idx0 = bm*NTIMES_P*NCHAN_P + tm0*NCHAN_P + ch; + for (int i=idx0;ival) val = (float)(data[i]); + } + csum[thread_id] = val; + + __syncthreads(); + + // sum into shared memory + int maxn = NTHREADS_GPU/2; + int act_maxn = maxn; + if (thread_id0) { + if (csum[thread_id]val) val = vv; + } + csum[thread_id] = val; + + __syncthreads(); + + // sum into shared memory + int maxn = NTHREADS_GPU/2; + int act_maxn = maxn; + float v1; + if (thread_id0) { + if (csum[thread_id]0) { + if (csum[thread_id]>csum[thread_id+act_maxn]) + csum[thread_id]=csum[thread_id+act_maxn]; + act_maxn = (int)(act_maxn/2); + } + } + if (thread_id==0) + ppspec[bm*NCHAN_P+ch] = v1-csum[thread_id]; + +} + + +// kernel to scale data +// launch with NBEAMS_P*NTIMES_P*NCHAN_P/NTHREADS_GPU blocks of NTHREADS_GPU threads +__global__ +void scaley(unsigned char *data, float *spectrum, float *varspec) { + + int idx = blockIdx.x*NTHREADS_GPU + threadIdx.x; + int bm = (int)(idx / (NTIMES_P*NCHAN_P)); + int ch = (int)(idx % NCHAN_P); + int spidx = bm*NCHAN_P+ch; + + float val = (float)(data[idx]); + val = (val-spectrum[spidx])*(SIG/sqrtf(varspec[spidx])) + MN; + data[idx] = (unsigned char)((__float2uint_rn(2.*val))/2); + + +} + +// kernel to add pulse to data +// launch with 
NBEAMS_P*NTIMES_P*NCHAN_P/NTHREADS_GPU blocks of NTHREADS_GPU threads +__global__ +void sumpulse(unsigned char *data, float *summand) { + + int idx = blockIdx.x*NTHREADS_GPU + threadIdx.x; + float val = (float)(data[idx]); + val += summand[idx]; + data[idx] = (unsigned char)((__float2uint_rn(2.*val))/2); + +} + + + + +// kernel to make time series from data +// run with NBEAMS_P*NTIMES_P blocks of 32 threads +__global__ +void make_ts(unsigned char *data, float *ts) { + + int block_id = blockIdx.x; + int thread_id = threadIdx.x; + int idx = blockIdx.x*NTHREADS_GPU + threadIdx.x; + int bm = (int)(blockIdx.x/NTIMES_P); + int tm = (int)(blockIdx.x % NTIMES_P); + int ch0 = (int)(thread_id*(NCHAN_P/NTHREADS_GPU)); + + __shared__ float csum[NTHREADS_GPU]; + csum[thread_id] = 0.; + + // find sum of local chans + int idx0 = bm*NTIMES_P*NCHAN_P + tm*NCHAN_P + ch0; + for (int ch=0; chthresh) mask[i] = 1; + } + +} + + + +float medval(float *a,int n); + +float medval(float *a,int n) { + int i,j; + float tmp[n], tt; + for (i = 0;i < n;i++) + tmp[i] = a[i]; + + for(i = 0;i < n-1;i++) { + for(j = 0;j < n-i-1;j++) { + if(tmp[j] > tmp[j+1]) { + + tt = tmp[j+1]; + tmp[j+1] = tmp[j]; + tmp[j] = tt; + + } + } + } + + return tmp[(int)((n+1)/2-1)]; +} + +void channflag(float* spec, float Thr, int * mask); +void simple_channflag(float* spec, float Thr, int * mask); +void simple_tsflag(float* ts, float Thr, int * mask); + +void simple_channflag(float* spec, float Thr, int * mask) { + + int i, j; + float* medspec; // median values for each beam spectrum + float* madspec; // mad for each beam spectrum + float* normspec; // corrected spec - median value (for MAD calculation) + + medspec = (float *)malloc(sizeof(float)*NBEAMS_P); + madspec = (float *)malloc(sizeof(float)*NBEAMS_P); + normspec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); + + int ZeroChannels = 128; + int nFilt, idx; + + // calculate median value for each beam + for (i = 0; i < NBEAMS_P; i++) + medspec[i] = medval(spec + 
i*NCHAN_P + ZeroChannels,NCHAN_P-2*ZeroChannels); + + // compute MAD for each beam + for (i = 0; i < NBEAMS_P; i++){ + for (j = ZeroChannels; j < NCHAN_P-ZeroChannels; j++){ + normspec[j-ZeroChannels] = fabs(spec[i*NCHAN_P+j]-medspec[i]); + } + madspec[i] = medval(normspec,NCHAN_P-2*ZeroChannels); + } + + // mask + float vv; + float mythr = Thr/sqrt(1.*FBIN); + for (i = 0; i < NBEAMS_P; i++){ + + // implement FBIN + for (j = ZeroChannels; j < NCHAN_P-ZeroChannels-FBIN; j++) { + vv = 0.; + for (int k=0;k mythr*madspec[i]) mask[i*NCHAN_P+j] = 1; + + } + + } + + free(medspec); + free(madspec); + free(normspec); + +} + +void simple_tsflag(float* spec, float Thr, int * mask) { + + int i, j; + float* medspec; // median values for each beam spectrum + float* madspec; // mad for each beam spectrum + float* normspec; // corrected spec - median value (for MAD calculation) + + medspec = (float *)malloc(sizeof(float)*NBEAMS_P); + madspec = (float *)malloc(sizeof(float)*NBEAMS_P); + normspec = (float *)malloc(sizeof(float)*NBEAMS_P*NTIMES_P); + + int nFilt, idx; + + // calculate median value for each beam + for (i = 0; i < NBEAMS_P; i++) + medspec[i] = medval(spec + i*NTIMES_P,NTIMES_P/16); + + // compute MAD for each beam + for (i = 0; i < NBEAMS_P; i++){ + for (j = 0; j < NTIMES_P/16; j++){ + normspec[j] = fabs(spec[i*NTIMES_P+j]-medspec[i]); + } + madspec[i] = medval(normspec,NTIMES_P/16); + } + + // mask + float vv; + float mythr = Thr; + for (i = 0; i < NBEAMS_P; i++){ + + for (j = 0; j < NTIMES_P; j++) { + + vv = spec[i*NTIMES_P+j]-medspec[i]; + if (vv > mythr*madspec[i]) mask[i*NTIMES_P+j] = 1; + + } + + } + + free(medspec); + free(madspec); + free(normspec); + +} + + +void channflag(float* spec, float Thr, int * mask) { + + int i, j; + float* baselinecorrec; // baseline correction + float* CorrecSpec; // corrected spectrum + float* medspec; // median values for each beam spectrum + float* madspec; // mad for each beam spectrum + float* normspec; // corrected spec - 
median value (for MAD calculation) + + baselinecorrec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); + CorrecSpec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); + medspec = (float *)malloc(sizeof(float)*NBEAMS_P); + madspec = (float *)malloc(sizeof(float)*NBEAMS_P); + normspec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); + + + int ZeroChannels = 128; + int nFiltSize = 21; + int nFilt, idx; + + // calculate median filtered spectrum + for (i=0;i=nFiltSize) + CorrecSpec[i*NCHAN_P+j] = spec[i*NCHAN_P+j] - medval(spec + i*NCHAN_P+j,nFiltSize); + else + CorrecSpec[i*NCHAN_P+j] = spec[i*NCHAN_P+j] - medval(spec + i*NCHAN_P+NCHAN_P-ZeroChannels-nFiltSize,nFiltSize); + + } + } + + // calculate median value for each beam + for (i = 0; i < NBEAMS_P; i++) + medspec[i] = medval(CorrecSpec + i*NCHAN_P + ZeroChannels,NCHAN_P-2*ZeroChannels); + + // compute MAD for each beam + for (i = 0; i < NBEAMS_P; i++){ + for (j = ZeroChannels; j < NCHAN_P-ZeroChannels; j++){ + normspec[j-ZeroChannels] = fabs(CorrecSpec[i*NCHAN_P+j]-medspec[i]); + } + madspec[i] = medval(normspec,NCHAN_P-2*ZeroChannels); + } + + // mask + for (i = 0; i < NBEAMS_P; i++){ + for (j = ZeroChannels; j < NCHAN_P-ZeroChannels; j++){ + if (CorrecSpec[i*NCHAN_P+j] > Thr * madspec[i] || CorrecSpec[i*NCHAN_P+j] < - Thr * madspec[i]) + mask[i*NCHAN_P+j] = 1; + + // for permanent flagging + for (int kk=0;kk arr[(j+1)*stride+chan]) { + + tt = arr[(j+1)*stride+chan]; + arr[(j+1)*stride+chan] = arr[(j)*stride+chan]; + arr[(j)*stride+chan] = tt; + + } + } + } + + } + + for (int i=0;iai_family,res->ai_socktype,res->ai_protocol); + bind(fd,res->ai_addr,res->ai_addrlen); + memset(buffer,'\0',sizeof(buffer)); + syslog(LOG_INFO, "control_thread: waiting for packet"); + ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len); + + syslog(LOG_INFO, "control_thread: received buffer string %s",buffer); + strcpy(tbuf,buffer); + trignum++; + + // interpret buffer string + char * rest = buffer; + int 
tmp_dumpbm = (float)(strtof(strtok(rest, "-"),&endptr)); + if (tmp_dumpbm<0 || tmp_dumpbm>63) tmp_dumpbm=32; + char * tmp_flnam = strtok(NULL, "-"); + + if (!dump_pending) { + strcpy(flnam,tmp_flnam); + dumpbm = tmp_dumpbm; + syslog(LOG_INFO, "control_thread: received command to add pulse %s to beam %d",flnam,dumpbm); + if (!(fin=fopen(flnam,"rb"))) { + syslog(LOG_INFO,"cannot open %s",flnam); + } + else { + fread(tmpblock,sizeof(double),1024*16384,fin); + + // do manipulation of data + maxval = 0.; + for (int i=0;i<16384*1024;i++) { + if (tmpblock[i]>maxval) maxval = tmpblock[i]; + } + for (int i=0;i<16384;i++) { + for (int j=0;j<1024;j++) { + //ctx->block[i*1024+j] = (float)(tmpblock[j*16384+i]*2.*SIG/maxval); + ctx->block[i*1024+j] = (float)(tmpblock[j*16384+i]); + } + } + + fclose(fin); + syslog(LOG_INFO, "control_thread: finished processing pulse - setting dump_pending"); + } + } + + if (dump_pending) { + syslog(LOG_ERR, "control_thread: BACKED UP - ignoring %s",tbuf); + } + + if (!dump_pending) dump_pending = 1; + + close(fd); + + } + + free (buffer); + free (tbuf); + free(tmpblock); + + if (ctx->verbose) + syslog(LOG_INFO, "control_thread: exiting"); + +} + + +void usage() +{ + fprintf (stdout, + "flagger [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -i in_key [default dada]\n" + " -o out_key [default caca]\n" + " -t flagging threshold [default 5.0]\n" + " -f output spectra file\n" + " -g output beam power file\n" + " -n number of blocks in baseline spec aver (must be <=16 and >=1, default 5)\n" + " -p adjust noise level according to power\n" + " -m generate random data\n" + " -s time-series flagging and threshold [no default]\n" + " -q modulation index threshold for tot pwr flagging [default 0.0005]\n" + " -h print usage\n"); +} + + +int main(int argc, char**argv) +{ + + // syslog start + openlog ("gpu_flagger", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program 
started by User %d", getuid ()); + + // set cuda device + cudaSetDevice(1); + + // read command line args + + // data block HDU keys + key_t in_key = 0x0000dada; + key_t out_key = 0x0000caca; + + // command line arguments + int core = -1; + int arg = 0; + double thresh = 5.0; + float mod_thresh = 0.0005; + int naver = 5; + char * fnam; + char * fnam2; + FILE *fout; + FILE *fout2; + FILE *f0; + + fnam = (char *)malloc(sizeof(char)*200); + fnam2 = (char *)malloc(sizeof(char)*200); + int fwrite = 0; + int fwrite2 = 0; + int pwr = 0; + int mkrand = 0; + int tsflag = 0; + float tsthresh = 10.; + + while ((arg=getopt(argc,argv,"c:t:i:o:f:g:a:k:s:mdph")) != -1) + { + switch (arg) + { + case 'k': + strcpy(iP,optarg); + break; + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'f': + if (optarg) + { + strcpy(fnam,optarg); + fwrite = 1; + break; + } + else + { + syslog(LOG_ERR,"-f flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'q': + if (optarg) + { + mod_thresh = atof(optarg); + break; + } + else + { + syslog(LOG_ERR,"-q flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'a': + if (optarg) + { + naver = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-a flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'g': + if (optarg) + { + //strcpy(fnam2,optarg); + sprintf(fnam2,"%s_%f.dat",optarg,40587.0+time(NULL)/86400.0); + fwrite2 = 1; + break; + } + else + { + syslog(LOG_ERR,"-g flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + 
syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 't': + if (optarg) + { + thresh = atof(optarg); + syslog(LOG_INFO,"modified THRESH to %g",thresh); + break; + } + else + { + syslog(LOG_ERR,"-t flag requires argument"); + usage(); + return EXIT_FAILURE; + } + + case 'd': + DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + case 's': + if (optarg) + { + tsthresh = atof(optarg); + tsflag=1; + syslog(LOG_INFO,"TSTHRESH is %g",tsthresh); + break; + } + else + { + syslog(LOG_ERR,"-s flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'p': + pwr=1; + break; + case 'm': + mkrand=1; + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + // Bind to cpu core + if (core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + dsaX_pulse_t udpdb; + udpdb.verbose = DEBUG; + float * pulsedata = (float *)malloc(sizeof(float)*256*16384*1024); + udpdb.block = pulsedata; + + // CONNECT AND READ FROM BUFFER + + dada_hdu_t* hdu_in = 0; // header and data unit + hdu_in = dada_hdu_create (); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to input buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to input buffer"); + return EXIT_FAILURE; + } + + if (DEBUG) syslog(LOG_INFO,"connected to input buffer"); + + uint64_t header_size = 0; + // read the header from the input HDU + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + + // mark the input header as cleared + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0){ + syslog (LOG_ERR,"could not mark header as cleared"); + return EXIT_FAILURE; + } + + uint64_t block_id, bytes_read = 
0; + unsigned char *in_data; + char *cin_data; + + // OUTPUT BUFFER + dada_hdu_t* hdu_out = 0; + hdu_out = dada_hdu_create (); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"flagged_data: could not connect to dada buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write (hdu_out) < 0) { + syslog (LOG_ERR,"flagged_data: could not lock to dada buffer"); + return EXIT_FAILURE; + } + + if (DEBUG) syslog(LOG_INFO,"connected to output"); + + + //// OUTPUT BUFFER + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + header_size = HDR_SIZE; + if (!header_out) + { + syslog(LOG_ERR,"couldn't read header_out"); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block filled [output]"); + return EXIT_FAILURE; + } + uint64_t written=0; + + if (DEBUG) syslog(LOG_INFO,"copied header"); + + //////////////// + + // declare stuff for host and GPU + unsigned char * d_data; + float * d_pulse; + unsigned char * h_bm0 = (unsigned char *)malloc(sizeof(unsigned char)*NTIMES_P*NCHAN_P); + cudaMalloc((void **)&d_data, NBEAMS_P*NTIMES_P*NCHAN_P*sizeof(unsigned char)); + cudaMalloc((void **)&d_pulse, NBEAMS_P*NTIMES_P*NCHAN_P*sizeof(float)); + unsigned char * h_data = (unsigned char *)malloc(sizeof(unsigned char)*NBEAMS_P*NTIMES_P*NCHAN_P); + int * h_mask = (int *)malloc(sizeof(int)*NBEAMS_P*NCHAN_P); + int * d_mask; + cudaMalloc((void **)&d_mask, NBEAMS_P*NCHAN_P*sizeof(int)); + int * h_tsmask = (int *)malloc(sizeof(int)*NBEAMS_P*NTIMES_P); + int * d_tsmask; + cudaMalloc((void **)&d_tsmask, NBEAMS_P*NTIMES_P*sizeof(int)); + float * d_spec, * d_oldspec; + cudaMalloc((void **)&d_spec, NBEAMS_P*NCHAN_P*sizeof(float)); + cudaMalloc((void **)&d_oldspec, NBEAMS_P*NCHAN_P*sizeof(float)); + float * d_ts; + cudaMalloc((void **)&d_ts, NBEAMS_P*NTIMES_P*sizeof(float)); + float * h_bpwr = (float 
*)malloc(sizeof(float)*NBEAMS_P); + float * d_bpwr; + cudaMalloc((void **)&d_bpwr, NBEAMS_P*sizeof(float)); + float * h_spec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); + float * h_ts = (float *)malloc(sizeof(float)*NBEAMS_P*NTIMES_P); + float * h_beam = (float *)malloc(sizeof(float)*NBEAMS_P); + float * h_bmask = (float *)malloc(sizeof(float)*NBEAMS_P); + float * h_subspec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); + float * h_var = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); + float * h_max = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); + float * h_pp = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); + float * h_oldspec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); + float *h_spec0 = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); + float *h_var0 = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); + float *d_spec0, *d_var0; + cudaMalloc((void **)&d_spec0, NBEAMS_P*NCHAN_P*naver*sizeof(float)); + cudaMalloc((void **)&d_var0, NBEAMS_P*NCHAN_P*naver*sizeof(float)); + for (int i=0;i>>(d_repval,time(NULL)); + for (int i=0;idata_block, &bytes_read, &block_id); + in_data = (unsigned char *)(cin_data); + gotDada=1; + blockn++; + } + else + in_data = (unsigned char *)(tmp_indata); + + // deal with bm0 + /*memcpy(h_data+NTIMES_P*NCHAN_P,in_data+NTIMES_P*NCHAN_P,(NBEAMS_P-1)*NTIMES_P*NCHAN_P); + memcpy(h_bm0,in_data,NTIMES_P*NCHAN_P); + memcpy(h_data,h_data+NTIMES_P*NCHAN_P,NTIMES_P*NCHAN_P);*/ + + + if (DEBUG) syslog(LOG_INFO,"read block"); + + /* + if not first block, correct data + 1 - measure spectrum + 2 - measure varspec + if first block, proceed. 
+ else + 3 - measure maximum value + 4 - use three spectra to derive channel flags + 5 - flag + */ + + // copy data to device + cudaMemcpy(d_data, in_data, NBEAMS_P*NTIMES_P*NCHAN_P*sizeof(unsigned char), cudaMemcpyHostToDevice); + //cudaMemset(d_data, 8, NBEAMS_P*NTIMES_P*NCHAN_P); + + // if not first block, correct data + if (started==1 || prestart==1) + scaley<<>>(d_data, d_spec0, d_var0); + + if (DEBUG) syslog(LOG_INFO,"copied data and scaled"); + + // measure spectrum and varspec + calc_spectrum<<>>(d_data, d_spec); + calc_varspec<<>>(d_data, d_spec, d_var); + cudaMemcpy(h_spec, d_spec, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(h_var, d_var, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToHost); + if (started==0) { + for (int i=0;i= mod_thresh) { + + syslog(LOG_INFO,"mod_idx %f (threshold %f), noise replacement",fabs(tpwr-prev_tpwr)/prev_tpwr,mod_thresh); + + for (int i=0;i>>(d_data, d_max); + calc_ppspec<<>>(d_data, d_pp); + + // derive channel flags + cudaMemcpy(h_max, d_max, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(h_pp, d_pp, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToHost); + for (int i=0;i>>(d_data, d_pulse); + syslog(LOG_INFO, "added %s to beam %d", flnam, dumpbm); + + dump_pending=0; + + } + + if (mkrand==0) + flag<<>>(d_data, d_idx, d_repval, d_bpwr); + + // ts flagging if needed + if (tsflag) { + + make_ts<<>>(d_data,d_ts); + syslog(LOG_INFO,"made ts"); + cudaMemcpy(h_ts, d_ts, NBEAMS_P*NTIMES_P*sizeof(float), cudaMemcpyDeviceToHost); + syslog(LOG_INFO,"copied ts"); + for (int i=0;i>>(d_data, d_tsidx, d_repval, d_bpwr); + syslog(LOG_INFO,"flagged ts"); + + } + + } + + } + + // deal with tpwr + prev_tpwr = tpwr; + + // copy data to host and write to buffer + cudaMemcpy(h_data, d_data, NBEAMS_P*NTIMES_P*NCHAN_P*sizeof(unsigned char), cudaMemcpyDeviceToHost); + + // deal with bm0 + //memcpy(h_data,h_bm0,NTIMES_P*NCHAN_P); + + // close block after reading + if (prestart==0) { + 
ipcio_close_block_read (hdu_in->data_block, bytes_read); + if (DEBUG) syslog(LOG_DEBUG,"closed read block"); + written = ipcio_write (hdu_out->data_block, (char *)(h_data), BUF_SIZE); + if (written < BUF_SIZE) + { + syslog(LOG_ERR,"write error"); + return EXIT_FAILURE; + } + } + + if (prestart==1) { + syslog(LOG_INFO,"Finishing with pre-start run-through"); + prestart=0; + + // search for spec0 and var0 file + if (f0=fopen("/home/ubuntu/data/specvar0.dat","r")) { + + //f0=fopen("/home/ubuntu/data/specvar0.dat","r"); + for (int i=0;i0) { + cudaMemcpy(d_spec0 + (blockn-1)*NBEAMS_P*NCHAN_P, d_spec, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToDevice); + cudaMemcpy(d_var0 + (blockn-1)*NBEAMS_P*NCHAN_P, d_var, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToDevice); + } + if (blockn==0) { + cudaMemcpy(d_spec0, d_spec, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToDevice); + cudaMemcpy(d_var0, d_var, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToDevice); + } + } + if (prestart==0 && gotDada==1 && blockn >= naver) { + started=1; + if (naver>1) fix_zspec<<>>(d_spec0, d_var0, naver); + cudaMemcpy(h_spec0, d_spec0, NBEAMS_P*NCHAN_P*sizeof(float),cudaMemcpyDeviceToHost); + cudaMemcpy(h_var0, d_var0, NBEAMS_P*NCHAN_P*sizeof(float),cudaMemcpyDeviceToHost); + median_calc(h_spec0); + median_calc(h_var0); + cudaMemcpy(d_spec0, h_spec0, NBEAMS_P*NCHAN_P*sizeof(float),cudaMemcpyHostToDevice); + cudaMemcpy(d_var0, h_var0, NBEAMS_P*NCHAN_P*sizeof(float),cudaMemcpyHostToDevice); + syslog(LOG_INFO,"writing out weights..."); + + // write out weights + f0=fopen("/home/ubuntu/data/specvar.dat","w"); + for (int i=0;i +#include +#include +#include + +int main() { + + cutlass::half_t x = 2.25_hf; + + std::cout << x << std::endl; + + return 0; +} +*/ + +#include +#include + +#include + +int main() { + + // Define the GEMM operation + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, // ElementA + cutlass::layout::ColumnMajor, // LayoutA + cutlass::half_t, // ElementB 
+ cutlass::layout::ColumnMajor, // LayoutB + cutlass::half_t, // ElementOutput + cutlass::layout::ColumnMajor, // LayoutOutput + float, // ElementAccumulator + cutlass::arch::OpClassTensorOp, // tag indicating Tensor Cores + cutlass::arch::Sm75 // tag indicating target GPU compute architecture + >; + + Gemm gemm_op; + cutlass::Status status; + + // + // Define the problem size + // + int M = 512; + int N = 256; + int K = 128; + + float alpha = 1.25f; + float beta = -1.25f; + + // + // Allocate device memory + // + + cutlass::HostTensor A({M, K}); + cutlass::HostTensor B({K, N}); + cutlass::HostTensor C({M, N}); + + cutlass::half_t const *ptrA = A.device_data(); + cutlass::half_t const *ptrB = B.device_data(); + cutlass::half_t const *ptrC = C.device_data(); + cutlass::half_t *ptrD = C.device_data(); + + int lda = A.device_ref().stride(0); + int ldb = B.device_ref().stride(0); + int ldc = C.device_ref().stride(0); + int ldd = C.device_ref().stride(0); + // + // Launch GEMM on the device + // + + status = gemm_op({ + {M, N, K}, + {ptrA, lda}, // TensorRef to A device tensor + {ptrB, ldb}, // TensorRef to B device tensor + {ptrC, ldc}, // TensorRef to C device tensor + {ptrD, ldd}, // TensorRef to D device tensor - may be the same as C + {alpha, beta} // epilogue operation arguments + }); + + if (status != cutlass::Status::kSuccess) { + return -1; + } else { + std::cout << "CUTLASS Success! 
" << std::endl; + } + + return 0; +} diff --git a/legacy/planar_complex.cu~ b/legacy/planar_complex.cu~ new file mode 100644 index 0000000..db94a64 --- /dev/null +++ b/legacy/planar_complex.cu~ @@ -0,0 +1,85 @@ +/* +#include +#include +#include +#include + +int main() { + + cutlass::half_t x = 2.25_hf; + + std::cout << x << std::endl; + + return 0; +} +*/ + +#include +#include + +#include + +int main() { + + // Define the GEMM operation + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, // ElementA + cutlass::layout::ColumnMajor, // LayoutA + cutlass::half_t, // ElementB + cutlass::layout::ColumnMajor, // LayoutB + cutlass::half_t, // ElementOutput + cutlass::layout::ColumnMajor, // LayoutOutput + float, // ElementAccumulator + cutlass::arch::OpClassTensorOp, // tag indicating Tensor Cores + cutlass::arch::Sm75 // tag indicating target GPU compute architecture + >; + + Gemm gemm_op; + cutlass::Status status; + + // + // Define the problem size + // + int M = 512; + int N = 256; + int K = 128; + + float alpha = 1.25f; + float beta = -1.25f; + + // + // Allocate device memory + // + + cutlass::HostTensor A({M, K}); + cutlass::HostTensor B({K, N}); + cutlass::HostTensor C({M, N}); + + cutlass::half_t const *ptrA = A.device_data(); + cutlass::half_t const *ptrB = B.device_data(); + cutlass::half_t const *ptrC = C.device_data(); + cutlass::half_t *ptrD = C.device_data(); + + int lda = A.device_ref().stride(0); + int ldb = B.device_ref().stride(0); + int ldc = C.device_ref().stride(0); + int ldd = C.device_ref().stride(0); + // + // Launch GEMM on the device + // + + status = gemm_op({ + {M, N, K}, + {ptrA, lda}, // TensorRef to A device tensor + {ptrB, ldb}, // TensorRef to B device tensor + {ptrC, ldc}, // TensorRef to C device tensor + {ptrD, ldd}, // TensorRef to D device tensor - may be the same as C + {alpha, beta} // epilogue operation arguments + }); + + if (status != cutlass::Status::kSuccess) { + return -1; + } + + return 0; +} diff --git 
a/legacy/spectrometer_header.txt b/legacy/spectrometer_header.txt new file mode 100644 index 0000000..88a535c --- /dev/null +++ b/legacy/spectrometer_header.txt @@ -0,0 +1,38 @@ +ACC_LEN 1 +BANDWIDTH -250 +BW -250 +CFREQ 1405 +CHAN_AV 0 +DEC 00:00:00.000 +DSB 0 +FILE_SIZE 0 +FREQ 1405.000000 +FSCRUNCH 1 +HDR_SIZE 4096 +HDR_VERSION 1.0 +INSTRUMENT DSAX +MODE ACCUM +NBEAM 1 +NBIT 8 +NCHAN 1024 +NDIM 1 +NPOL 1 +N_PROD 1 +OBSERVER DSA +OBS_OFFSET 0 +OBS_UNIT SECONDS +OBS_VAL 0000.0000 +PID P000 +RA 00:00:00.000 +RECEIVER SANDY +RESOLUTION 4096 +SOURCE DSATEST +STATE Coherence +TELESCOPE DSA110 +TSAMP 262.144 +TSCRUNCH 1 +TRANSFER_SIZE 256000000 +NANT 5 +UTC_START 2015-08-07-17:07:28 +FILE_NUMBER 0 + diff --git a/legacy/splice_offline_beams b/legacy/splice_offline_beams new file mode 100755 index 0000000000000000000000000000000000000000..728af8c0f771d3c851b2b05a00201bb8672e361d GIT binary patch literal 32432 zcmeHwdw5jUx%b*L37KSanGnDrSO*+1D7kUJh$aX)@kB%}iWM9tlSwi%nZ&t3LOC^D zwB|r6X{qISYz5m>slCYQ(NpTFr%_a_$F|s7<*5BiTiY|VDp;+eSaN>vTI-$JvuA>R zp7TA=_s_Rs_FC&*zxTSn8E3hg{eQ~eF9K(E@)Xwh6cG~JGp_~zLZ!CFq#I4;>jD)V(;uHy6rMdeZmwAyn>>v3?pVeBfwOoQe+PEYvZ7Y5s zZ>9RY@v@m!#uZs`$$U3}kL>(gH$A*!rPeo6T=r@Exzz58+Hk04&aB#SYjrpjN%U0r z%$rj^XI4$Dt7eAuo8%Mrn&oSxNi^PCl+TXp08@tq>6C@%!3B5q~(eB}nqR zqoGK=jU=K)`bb4I?(6i2ATSyXiffvi7GLd~Q8TkbHse(WZ^pCtg=J&_u)5t=A!@yEWb6cFTUIbNl$kicV_;ABO}y zVWKk+(UT^65fSuu+C(p9BFZx+dYOsdr|AWtQ5<-?+eD{zfVVv+IlpT8$RaU@wDTJ7@qyd3*2IO%A44iUEF{gwL6IAuFrmxO&YhZ~wdhLAEy!#2nsjZ}z~> zR`0I7DOI3(`*(WC@cN--8zg~ie|}T{)5#DT-u`Er26lS-(0$#&w>^D;U)TTN-u^s~ zci>Ke#o61x#N%k%HO@P*=n>4fACQCI{+HkVM%kB#CW@UyWq1A(J~r)m*3q=%nY!lr z(>;k7n;~gGaLv1Rk-g2kcHB)|10@MaczinSRJQh?svl||c+_(Yl%x~ZH20rs9teB3 zCck?QrVT9d^enCaxwk*;SwUu05hYGFH`ThSR_TTH`$#$(UTTm=YBUfgOf?tPI;iF# zfV2gTdDQ6B=sLo*di(P{+UUlWu=gb#d5`Z&+xx+i{?(q=`k}@fv`@)3($Lxe4q$Wt z2g#{uJ`OxAc=A7JZos3XJ3YHmqe<=U&tEsrEs|x#71%53-d!bL&=B8V2$6wFp86~L zca_wEE~QyS(Q%XfPr{^S6KWqeD8<5cnrCGuBeEg~^D$hzd#OmU9^CvEZe 
zpGs~}ya%kj@Qi{<{%R*_x+d#0McoEAc=srdUgcf66t@)vu&d;{44KLIKbDTq&7i6n z?u2MMNt(i$JWB9%4206OCZ*{iDQ>H$4w`;9#_V}=x6<^J464%92Q4O_=#WgV&(w4; z*x=I{465>Jk=9fNO}!a1lapmveg&*T=emr}^z@*~LI>(W(st@>hF?`m+uoG6pHcs= zt)8_I=u8Q85bL9|EB`ydqVw$vv(wqy_7lstOl^;Y2fkgN(&pFNRzToaQUay6WwNW} zb15p%`6ZsMbor?Vi&nwpg_&wrDK$UFWL6>k=p*Hn8*1ic2u%K3c9pm@sN@Y+2(m+~ zp|vUbuY^s{i8n_}Z$c?GU({+Uq2_1kOdnv$9@&-O3oJUnm(iJy9XUfIMXOG7X{NT% zDs4te+Z3(M3xWQWKpDFtrL96~TaxPJ5UK-Ht+#4fOTO}9hHoyV?LHiJDumyse$0yo zXnVJueUsYmmt7^lN>N!2mQ>SZCu*`gQ_VNQg9$M|rDlaz{{&!ul;o6 zj;D;qZ6DF0TXyFh8C;w=n*0Ad=^_TKV-A${L*8BY%hO@2cjiQSek{8aYJ_*&({$Fn zWqsqi#`TT2Hu{cu2WCHs*m?V3NiGMcv;pfn!29zdPM%tlPk(^Z)-If6n){C?kD}r2 ze;fXK2R^T0r@j3j(e*~lJLeo-!kaJ8soS2SgX5Vq<=X`o2ieug2FB)sS1Npb3^v*xy?f-pzJT9Fsqb4q$O4bjbXsao^ zcqf7*#QG!TMY>HBf`9|FXGk&A&Vk?i4gi==D&!xLFV+qEb;CUh=e}`Uw@LW zZ&VvsL}zg==rUae;rx$FpqY?_+4T~ncn7w6^3umYt_kvzaq14N_T&L`2pB!;zfRsF z`5*)HCQr_r#3)=^x^6j=JpGs9VXe>o%tz%*JjEGWE1~rXKp^o}t#wR>)-lO*)D5j; zfLUPH`g_GURr3|6`I67eE?fd-8svsSK6J^bq{k~|-&xoPA5zzb5CHowQhNTQL`B7F z->uZG?Yj?{R8*EKzPmJ^L)j-+oXccaPWyiJ-iY?~$8$vQ4&V_P_XySvQMb14<6@I_ z4=KJRXf)R5=H<*{wB{# zkvR?8gD$qmivVR0->u4?VQ22)dw{xifL;S8HGFp|zN4DYJbW8vS5A8bx?0{@?>+Sf zjPvLM^^VJ#*GoT|leehc(Znd&W}Fhcz!qrZjmI zLIQ^Tm$4$F`?++_I_mkb<|PZmGYBCu{Ig;>mS*@C^?X?KdIm$Hj=;3uqw6?>6Bl~1 zl6zCfQ`wzGFb@a$+9P`~{SFkV=z63mc}@Pz+46Os23&@Cc4mHub;XJni@LUc{4swI|z8 zLFb)s#ohI<@%gX0|D)#q_m}jaY8)Q(x_8GB!#n?1i9gc43ey$Hp7i?t?h&ExZ2*BP zu5xT577h5K{@Y~P8oW*Tx2St*VvAo_BDmKkqAj9xV{0htuI_eEiTS3)gxuHCzO|+e zw+c762=CIyB}-TOR;_Maxmv`!645}=7x8z}9m7CJC=&FAT17k<4#onw+enR8f86hj z_jcpsY#c2^^5yX}X1h#qY}v zvTXR7YQOf!hA+K5HR-IWzsi_kUl>Pc2|nJphlhtyIuOnz%1V^)VYXG_+GY>h^Kdal zmwyLQdQf`lLK9^YWf&!0-_y5w2fm3sigF*y<5Xh4t;KQbz}3Tkl(daeg)p|P6h@EB zm{45c*kcs9h&~aYJ-8gWTw|!njTd-!qwsxiDceP6y&@ z6qZW(=pcgL+$h)KQwJZriPl)^+F`q<)NS(@mAV>B9gR-m2DuNPt-uVTWK5lHdj{s~ z=sNQWnCF0rBYyW}$mz?F^CtRE07EfP>)MuqaUrhFi2F#GO0doX<}ffHQp~R{-C%p8 zsMN98DK?jB(;S$Gi|ZijCLK#tI=rOA 
zOFFJAZHVR)Chs@Xy1M~#^wc3~|9|^`6al_(&-dvuG;-szShwY!Sk!4NXd+WRY8kzW>d0&Y>OTb%@T>v|XSIu|X?l`hIN~+dZUlbhnE(*8eHhO!R+lg9 z@(;RvUzekFfX3-^sxIf~(yHf^+Weg8nyar~;I3NJl8D3;?zuIyYwD_JCSSj$-RB;6)szFv7yY&O8oArRM(1yr`GKcnNw|YR1@iN{+fAMmR1f5Ct<73`zjyxJ8 z`*hkJUx>ZUQSb;Ah9eARHd{e80Jgk>MepJ0Ynth!JU7W;EtldKu`Y1RCg8K1;P1$rFvyiFXEMd zj_U3rayWmO>Q&Acs4fg@)Hws#=Nqyy&sjwpY_iedY#>;kYNUVm|7vq-iHcTKgqkwAiC=XlGmW>tH77vDC-4|pQ7P-2ar*;D?bdzg3pl+*!je~i2Lxh5pz=S#iIU$@ zXZZ%~O5UJq3Hi4T6)DpY|e0N>%~|fP-UZzfKq7RaKH_S5M%pb9uvg@H_^mc`j%c+K=KOh&$-=16JswS zc1_&P#386-;@wQV+k|}mfSb6**oRCqyBRs#guI`L#fTouJbu7U++ys%kev=?(uo6Z zqKUDS2&st;%>H9iR5%U&h4vvP9y1a9nE1~mQzK9Bcauml_C6EQ%g8U1of>(FiJvyf z+{47z$Rv$CcEC*{#n^o&cE^L7_!N^sl zNE44S@?H~iKNIgUAvZ8_ADQG-LEgv2(WG4yS2K~a&dQ39!rDUn3MMWk-%FL9hYn~r zYfZ$%OuQ6~Wn~I^nAz*dBuzZX!~$Y>(R_v?W{(iD{6@^ZLi=%zl+*nwbjSnf0zB0u z6Gwj!ltTMy%~?V!|C>RE7`ux+u6RX)hRmQ67&C4m(8Fb7Y%hgi+{Y4if-QTMh!-f! zdYE_$@{G~VU@o+GXkvkbPPwCqctIRH<7$eq^$Nte|FN++8+cOyU_|A6&}m#vi{X4U z$8Uj8##8B#djxirWkdX&HY<1MoSdnDtlr`7}t@x4}!;im5b=0szN4SOrZH;tRc*^ zz)WI{hKdOd6p|aM|BvXOd`T$KA&i@-jm`5eWIa`;8vHH6Sovb15L2oy!eAnDQzoLa zj4CcvI;oO}${;FJU)QY9!s1JfHMULIuQuv0HEzgrJbJa~Qe$P_;E5i@rHYlj4CHBu z2d>cm8NOhjdQuDejS{4}+;f4Xtq~$<`%ipRpqR9g{g1+eX|rL<0*z__(dg(y+@`Dm z`5KJVl%=S|P?`Fg2ED0G{cOrqd(j>PmKNFv*c>tKtBA`VH3yx;jE!%T*oQ&B7t$^x z+U46IyQ&;?+Rj4m3#d~V{$9$Sg)kWPVrsn!f!;#@Y&iECWO@fom|?tYE(*l=ULYNHfgu^EA2ZqSAiS%E4B7bh{@MY zI#1-ZYRH5joAtz@VB9)E2=2a(gAqB3%j78ubd_U1Pq_`1xv0pXBEzFev3q*B48)FZZx7 zkFhTdPWuuiF!}Nu^2OS(e7V_V#|q%HeWCP#3}2qM`m#3@kvDichc67eT}cy0x35(D zQi63zj>}9`B|i! 
zg!JV#t1sb9MBd=OUOC6+sntbz$d{M~wX=EA57-yF0G7Vc*H<|%3~pwy=VkU~4KS0e z{mPdIOm;j9eD=8f$b^u-oUr=xY9=CYaF4|oI`$cq@+EczCo_5MZzEsm*nfz9VQ|Wq z%N`^!jms~{7i+)rWfYDP+Gt$*fY0`2stF-|`MuSb#!Q4fV`Uxt3`)(h_&vB2B7GTV zU#8$1LyilB)4oIqOupOz%p_~S@}<*cM>Fu*z6_WU(w8@_z8uU%dT)p5qX2hEOU%5f*7=f$ED|XT=B_JzeDC(`;~cRSdM6;!})sP zv(2kDA*6YE`j$Cgf>rUw9$lZ_t7XVb#6tU5v@4?OdXtzZ6e$0WzP|lir5G&8_T`H` zV)`u_Mi;ZfLszJ8!g$)~TLf)JU@9*4f{90Ug;~HF4eOdQTM(?evS6-Eisu@~K4SF_fK70!iRejp> z1?ad(b=bDzg6?h;{UJ@qh2HcHSUhQ4=EknTu^4;#MUcM8$ggjg8;bBNCfs9)>eX2K zT!Y6DK-%PkG?d2uw_#9f=-IHba2g2PVPy?HB38{LHT_$3s|A;DHCz0;^((aGiEs#d zX!EN=7!BvfA-=Rot;I!wT%@+5A`ff^FJrLh*94{m`yeorto>@XozyC5wmktn&9=cM zrlo)CC{Sr5gq&@i)`MYECc-wjf%r2{=~WuiCq*GVS3QTvBGQwW*poBt34?E7u;-fu zCQm%Y@Wk4$JXvS5;&Z@DPoyRM)uq>jke;+!J=vRyunn%w^5md~+|9;a`2|Rko~(dz za!lf=$T4B?O$_$L2uz;rCr_;X%9G!kthfRAY)^(w2t0xyC0<`H-vkk7u@}y2f z_9~<>7OlgSm7cu7o}6Y+7`&dro<#&EPbT6lHp$wrJn1%BF#-5&PXB(xVCkHbT zwn4Ailj%R!-SVKVrdy+~FE!|caSt1Ne=BYWRDKResxkYAAbMePu^C2|l zc&*oriP^{mRQ0%~jyW-4X3&4C~A`O{Xy!4>{8NRvS1nogcqKu)&jMJD$wo=-7#tzhPLGBD~}!gL;A z>3Qw1VSsC|L03X`a)_(Q(K@@}$YcrGi}IWf5PN&eendY zIcXbI1(tC-k*oMkB%Z?*Ke|tDOr&QI7vNMP<9ALorsDUh2pDDj763EJ+OOhg$E6f) z6u)}lRU|~!WKE~|tz0F9E}xgweGAOx3+q4-Gk%Y{ZM=Mr zWv6R~nb+a`4Z`=)IrAFqbmuIeFT(Yie}Guqg~m!c)z>eh#>J%Hz8(T*jh@cSar(vp{;zS z)y1tT_ptf*(+yO4rT#gXFR$JIz`kCg!6{!ayO+S^Yav2B$=a`cZPZ-k>&?KkuNyR- zeEp23XZqS}LRfs=Yw9}8%o)eYBN`!noljHx`?x(~D>kUQU>0`tDM&78$AqL$!O?{e z0f%Tj482S zaSC+$jG1X$^geXaM;PVCl+p^Py>vq9nhVMe7tVk-k8nD#1PMpWI6To`;9Tfzg7tZt z4srQqfZ)-=TtOWL)S`-Gs%#fl81_=9llTl!i%L>L=7GtnoB2%@)GTHdxIi|HrV8RH zodh|x=q#%MD>>wvrDTBVs_{f9*97QQgwd1WO~nO9TJM;&UOehD<88Pu7$B*hiRoTyUSw)Q z{ijS|G%t1zqcL7!?5!L3{WJIP0knOl6as9MU_jb#8c z8nuyN9GSjxMZ~)U{9%?}i&SIW1L+QjkQR(gU%ZPV=xYi3J7YDVqbJ@P>Y7o5$6>PG z8f-%yXGp10k>Nt9OI4ROH7`XCS1^(fRjf;~-k2{KjfU}>0EO!G$2(M4A_8Kos<#K@ zUEOhCpeq_wFcJ}oS0}KGC`yP*#5^AABn)JC;Qa={DC#)qVxji#Xject;&{bDyRWS) z8V|LFf}{f;T+xyUh2zzs$d$@B!YDyd20v6C=L{R4`t4(k?eE>}xase1cIx`t_k*H$97GMIIfTw`)DFL9@%d1a9pb>m6Gz{cDtV`7fZ^4ir}^H 
z+Avqu|7jJ-J2C7xIrdQL_!jMV(C%hQFLpdG%ZZM!%JQZ-6$<94rFteVW~;^FpF-r2 z6tmTv5wB#(uurtjHEc+sG7uPkCTxF77|U1h-5qz#`1Fle+AgoOO>^bh?5o>g_Yq;6 zZ#X8S`R{jJkw=2<_M^hqW#r*S52*j(j<}{iCv3R;L9|Eic!SD!WaH=J+nPw8*~uHb z<9BkukoMn6QSC1Yhh(%re22u6b9RSBB~mCE8oT3{7T(dCwNu9Nf6;H7ES*ql>+9>PV;#Y8m>WI8NIMooxj@qIG3X{6-f+_Hmb*j* z%S|NM>ZUwM`o#o(Cy|KbcNNet7nxARhhof%zh7nf8u!)p^(qzAUES%A z209jPjD%6`?Cill(p5dPHPW>i)mSX3_A^v5Qx>rxm4KmpmLJtXPyHO~@prVL=qP6Y^W?nb4plVHH3GE3EZx~QY2Go+facCrsm zG6%r86yV=PRP42pvuaY6dUS-hC~d$;<62=ym<)`zMkB2#+v?{8sED=4Xq046EQkoh z)+m07LXq&dl9INbSQJH1%)g0Xvc#!_`Zx8=uAf2fLyu|%3970Al9lS{*0Anquq_ye z{5aVe2)Bh3m|RkvluSm+3dKt}&e<-eJLm<2*80`YRdt*ZNMq9f>aH+c6AN6;-!}Zkx{GumVO-I3^ znTWo%gU;=$cMe9DduN#0>Sxu>tAq4-A`)EG9*hK|A^6|!uD%|Vtxdj%s2WF1JQ#>~ zMXT`!A{-E=9}M|?bZXv0zfDRXDst-6F+e1N_YmP0K$kos<$;bK95b!K7#+chIX-`D zYqWk&x;}FTc*6enSn42_9Dzg>*9zf8XGDZZDXwAUOO1rDE8I#~GBI55U}#%AMJO_d zghGE;cMIO6B-+AVeq0c>`?|ZxIJJb#oPjmOk9Q<>5q)zoieEd?br9I&!FJiCA3xDb zf}?dK2=W3Y<7ZUrhf$rkwRW|{&@JE3#J}aD*Eqr9PJuO`8_NI*qU$AJHzu5H$#*u% zg)c;!<%JVQAd1zFR=Re7*cTRZ;d5s!KO~8c1+}`3a4iS#;kk+=BHG#2DqjzEdmNE0s5)fH{kbp#(HE>eNRIVCDuKqc{~_niWz1W}o~49L2pw|Jn_2%fkJu z7Jsxi5R5=#K=?Mv*zt@bUv&sE&OU)tfSN_>Rb0|6IsM>_Je<<6_1f(3?)Jq3{)jJ< z=xo9KqIl566%<9BAhbqIs1^!0{%beM6;Q)EK=pHE`1G+fkFYe0+5+-LVRXX7;VXQR zU=PiGL|aCbESCqQJV&b2u@8%50KOX3DrTu-cKU~l zzC$#d4tnET^p78LML}BBWR*VaA34f_M^w^3w&ZKx zn{mH`Hbbo8DnH!I6Ym;p9#GxsJu|ka7*})kpK!m9Hbbo9>c46d$y)sYw{F2bE)C?W zS?TI5rC-p>?oBCO!PRf4l&)j(EX$BTMpUZw{yr_>!L%T*eiP3bX*0x)TxEfFF}S~% z8*DLfTxEq4r=+C!oB{b-%Fq>DWq$CT=t*;^8k_orHr0k3^;Qh0PoC1Ub78pe$_pn|F!v0FMigYr4m3(jl#uvrLObDw{ORLo3!0w_3#&>*PdE`h+-68qE^N z7*6yAmQy^lu$;!pDUmsX`)jS5@?B}m6O`4c>ghcT;Cw_V`ziO>drL`k-Fs)bBSSx24LLz4%W@%@M zoU?d7>~uUirmaqwhFGBp={>9Dq(F${guyJ;ERItQ&xgHFtNiN7s#zsx)Il8Q_hqSO zahwr&KJ5EiWj>x3(}oe38RM$Od)CT1aFECud0E<7BIod(54&D#4X3pu_l>o3$yzy2 z4iY(sE=xO0 zah#!+rJBWYPTTpg`?bnPbcqp1E|@YWy=SeQ@&$>UQI+vCf*7CGgY!jOW*E2MOJ*`vKvNZ>CYOctoIbhX{ zllZ|qa;gUZC#!DFsd`K<)vY;HAIhy7{8rVk>(NZ=(k2EsSTUSmc8ShS!t~-yu@U8Q 
zZrS;;H)(md=n}fuT6N!``@Wvm&ADA!x>+7)gF*L**k5URze(%PI*Yxc`*=D-9%o}^ z>1KJHzjZ$BMm;-u;NZ8{>UU6McnU!t=TBwnW_g@(bv`U7*!XoS4!%cm_4yQO6<6tZ zOth)5R@~wv1>c)gcY2>Kes#K2)i~3N)tW5eq^TT&IsNHHJcFbSTwGm{wwtR9GwkLT z8}~Pf&$65Iq*$%V0#13#A(#_LY&!h{TwL9lBCXicfma`AMwS zWC5od88JBo^OthIQsD29@9V1z{y8Rple7kuf4azDU#;H_t$0pT&)471tX~wZc%E*2 zlIFWM{EgO%|Kv8kv3?$&l3O=B9-Z^wu`u`9@yI-$6Q7rQ-%KVxKlPrOO!}zQ`)D%h z1%ltxl1X=n4S1k#-SB<@yb9GyMG}xGO1*E!+-nz3^#*h^5p}yL7JW!yvTk;~ou@%x z1DgBn$Rp+GXVS}3@A1i`yTl4g{jh9yQJ(s{ALcGQ5~B`LQj}%0i;C2Hfy`ZYF(&ms zpiKH${5=!5nI!9WF;4LNd(547dRvb>7rqkYT=X19LZDqB8#c#g7f6`PvDs5UgU;-< z3*;K+*z5wyc{w(_xHt!u>~Uw&4N&sLJHyrv8m3(IOF*|_-1q4FoXj^x(~nu`^EBP9 zmwm=BN5v57c#<8o(RT?W$2jf;o%C+7$k`+D+4Er}`yY|`(c;Y&A<$j0-=*nn z)b}->-^0Lip3w9@jc5APpy%@IB1z9)59xh}x$tj-PI0^2t^8#>-yZ=#T8|gMKZWrZ zgPu$NOpV{8<+J_sCB8H(k9V=g@3Yv`r0Mrt=*zVnz6j^}wpQc!TlD%h{h)>3rsbT_ z$NQijnA;_tM=4!?)lhLeWRbH+%Rg+Pe`y3c-_-aa3;q!3G*5)4v*sr>{y9zO_|uz_ zXdI7Oye~wSAK7!-BBzY# zNCb7TUd=F3;?w7A_Um%cX-lcW?Leq~~}l5Z31Z* zsTOi3Lh%f!&$n!4<8@1Yco(maQaCaI$j?}E^YX^)ny$|5QF$vqAO4n{Z>d*H@Ge;? 
zeAhHzzqqm4cm1+utCp_zt!`Y5gca~2S2o}a#1hg}nbFqJz<)A~{y3N=V}X(}WcH8D zf6%EI=lcU-6Z?WP9Z#j6=u{k)`j9C}rRQYQ(yuMWJCeRqCW+1?QW*;}FNG2|r1kWIWtq;9kr*PQE5}6sMYc@g zG7VCt7G`$n*D_N!1zzB+Q<2i|cs8d*a1ICpoRI*eQeDy+OBsK~O}+0K;$vMtybZdQ z(kUn@gwnKql-!Y-bfZ%yO>>2&qR#2ewA`G4la&R7ETmX0n-1>PAc zRcSE&y**JA>+M8V5z2T}l^tB8WVC3vsEHs4ucmQvQ+3?mu3PwFK@BBrwN~RL-Zj+f z@W(pv>+s$PW92Yfm*lJL%Y41WaPH97Uxp*W5 zP?tQ`YJwelQR+apQmUm`)#{`;xDE~eP9#s&D0U36sKLtEiPesN&EAq|2htyD$1))ptinetnUglKkMc8*DU-?^u*#)CmAt3>-hPm=I8O_`-fa6Q=F>8b~E{3Ed18L zGaP52F0JwZk%gb{M{?P~hG~AQe)`^0iVqz_wDEl*{`@SnX}F +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +FILE *output; + +void send_string(char *string) /* includefile */ +{ + int len; + len=strlen(string); + fwrite(&len, sizeof(int), 1, output); + fwrite(string, sizeof(char), len, output); +} + +void send_float(char *name,float floating_point) /* includefile */ +{ + send_string(name); + fwrite(&floating_point,sizeof(float),1,output); +} + +void send_double (char *name, double double_precision) /* includefile */ +{ + send_string(name); + fwrite(&double_precision,sizeof(double),1,output); +} + +void send_int(char *name, int integer) /* includefile */ +{ + send_string(name); + fwrite(&integer,sizeof(int),1,output); +} + +void send_char(char *name, char integer) /* includefile */ +{ + send_string(name); + fwrite(&integer,sizeof(char),1,output); +} + + +void send_long(char *name, long integer) /* includefile */ +{ + send_string(name); + fwrite(&integer,sizeof(long),1,output); +} + +void send_coords(double raj, double dej, double az, double za) /*includefile*/ +{ + if ((raj != 0.0) || (raj != -1.0)) send_double("src_raj",raj); + if ((dej != 0.0) || (dej != -1.0)) send_double("src_dej",dej); + if ((az != 0.0) || (az != -1.0)) send_double("az_start",az); + if ((za != 0.0) || (za != -1.0)) send_double("za_start",za); +} + +int main(int argc, char * argv[]) { + + // memory + uint64_t bsize 
= 2013265920, bls = 94371840; + unsigned char * allbeams = (unsigned char *)malloc(sizeof(unsigned char)*bsize); + memset(allbeams,0,bsize); + unsigned char * data = (unsigned char *)malloc(sizeof(unsigned char)*bls); + FILE *fin; + + // load in data if present + for (int i=0;i<16;i++) { + + if (strcmp(argv[i+1],"none")!=0) { + + fin=fopen(argv[i+1],"rb"); + fread(data,sizeof(unsigned char),bls,fin); + fclose(fin); + + for (int ibeam=0;ibeam<256;ibeam++) { + for (int itime=0;itime<15*512;itime++) { + for (int ich=0;ich<48;ich++) { + allbeams[ibeam*15*512*1024 + itime*1024 + i*48 + ich + 128] = data[itime*256*48 + ibeam*48 + ich]; + } + } + } + } + + } + + // make files + + char cmd[300], foutnam[400]; + sprintf(cmd,"mkdir -p %s_%s",argv[17],argv[18]); + system(cmd); + + for (int i=0;i<256;i++) { + + sprintf(foutnam,"%s_%s/%s_%d.fil",argv[17],argv[18],argv[18],i); + output = fopen(foutnam,"wb"); + + send_string("HEADER_START"); + send_string("source_name"); + send_string(argv[18]); + send_int("machine_id",1); + send_int("telescope_id",82); + send_int("data_type",1); // filterbank data + send_double("fch1",1530.0); // THIS IS CHANNEL 0 :) + send_double("foff",-0.244140625); + send_int("nchans",1024); + send_int("nbits",8); + send_double("tstart",55000.0); + send_double("tsamp",8.192e-6*8.*4.); + send_int("nifs",1); + send_string("HEADER_END"); + + fwrite(allbeams + i*15*512*1024,sizeof(unsigned char),15*512*1024,output); + + fclose(output); + + } + + + free(allbeams); + free(data); + +} diff --git a/legacy/test_read.c b/legacy/test_read.c new file mode 100644 index 0000000..2b5730a --- /dev/null +++ b/legacy/test_read.c @@ -0,0 +1,279 @@ +/* will reorder raw data for input to xgpu */ +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include 
"dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + +#include +#include +#include + +#define S 4096 + +/* global variables */ +int DEBUG = 0; + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); +int dada_bind_thread_to_core (int core); + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write) +{ + + if (write==0) { + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + + } + + if (write==1) { + + if (dada_hdu_unlock_write (in) < 0) + { + syslog(LOG_ERR, "could not unlock write on hdu_in"); + } + dada_hdu_destroy (in); + + } + +} + +void usage() +{ + fprintf (stdout, + "dsaX_reorder_raw [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -t number of threads [default 4]\n" + " -b connect to bf hdu\n" + " -i input key [default CAPTURED_BLOCK_KEY]\n" + " -o output key [default REORDER_BLOCK_KEY]\n" + " -q quitting after testing\n" + " -h print usage\n"); +} + +// MAIN + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("test_read", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + // TESTING and initialization + // threads + + /* DADA Header plus Data Unit */ + dada_hdu_t* hdu_in = 0; + + // data block HDU keys + key_t in_key = CAPTURED_BLOCK_KEY; + key_t out_key = REORDER_BLOCK_KEY; + key_t out_key2 = REORDER_BLOCK_KEY2; + + // command line arguments + int core = -1; + int nthreads = 1; + int bf = 0; + int arg = 0; + + while ((arg=getopt(argc,argv,"c:t:i:o:dbqh")) != -1) + { + switch (arg) + { + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + 
else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 't': + if (optarg) + { + nthreads = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-t flag requires argument"); + usage(); + return EXIT_FAILURE; + } + + case 'd': + DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + + case 'q': + syslog (LOG_INFO, "Quit here"); + return EXIT_SUCCESS; + + case 'b': + bf=1; + syslog (LOG_INFO, "Will write to bf dada hdu"); + break; + + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + + // Bind to cpu core + if (core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in,0); + + + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in,0); + 
return EXIT_FAILURE; + } + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t bytes_read = 0; + char * block, * output_buffer; + uint64_t written, block_id; + + // set up + + int observation_complete=0; + int blocks = 0; + int started = 0; + + + + syslog(LOG_INFO, "starting observation"); + + while (!observation_complete) { + + // open block + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + } + + // DO STUFF + for (int i=0;idata_block, bytes_read); + + } + + + dsaX_dbgpu_cleanup (hdu_in,0); + +} + + diff --git a/legacy/test_write.c b/legacy/test_write.c new file mode 100644 index 0000000..32dd25d --- /dev/null +++ b/legacy/test_write.c @@ -0,0 +1,452 @@ +/* will reorder raw data for input to xgpu */ +#define __USE_GNU +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sock.h" +#include "tmutil.h" +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "ipcio.h" +// Forward declaration to keep compiler happy +// Possible minor bug in PSRDada +int ipcio_check_pending_sod (ipcio_t* ); +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_capture.h" +#include "dsaX_def.h" + +#include +#include +#include + +#define S 4096 + +// data to pass to threads +struct data { + char * in; + int n_threads; + int thread_id; + ipcio_t * out; +}; + +/* global variables */ +int DEBUG = 0; +int cores[16] = {4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}; + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); +int dada_bind_thread_to_core (int core); + +void 
dsaX_dbgpu_cleanup (dada_hdu_t * in, int write) +{ + + if (write==0) { + + if (dada_hdu_unlock_read (in) < 0) + { + syslog(LOG_ERR, "could not unlock read on hdu_in"); + } + dada_hdu_destroy (in); + + } + + if (write==1) { + + if (dada_hdu_unlock_write (in) < 0) + { + syslog(LOG_ERR, "could not unlock write on hdu_in"); + } + dada_hdu_destroy (in); + + } + +} + +void usage() +{ + fprintf (stdout, + "dsaX_reorder_raw [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -t number of threads [default 4]\n" + " -b connect to bf hdu\n" + " -i input key [default CAPTURED_BLOCK_KEY]\n" + " -o output key [default REORDER_BLOCK_KEY]\n" + " -q quitting after testing\n" + " -h print usage\n"); +} + +/* thread for data massaging */ +void * massage(void *args) { + + // basic stuff + struct data *d = args; + int thread_id = d->thread_id; + + + // set affinity + const pthread_t pid = pthread_self(); + const int core_id = cores[thread_id]; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (set_result != 0) + syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); + const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); + if (get_affinity != 0) + syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); + if (CPU_ISSET(core_id, &cpuset)) + if (DEBUG) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id); + + // extract from input data structure + char *in = (char *)d->in; + //char *out = (char *)d->out; + int nthreads = d->n_threads; + + // place in out + int i = thread_id*(S/nthreads); + //syslog(LOG_INFO,"thread %d: %d",thread_id,i); + memcpy (d->out->curbuf + i, in + i, S/nthreads); + + /* return 0 */ + int thread_result = 0; + pthread_exit((void *) &thread_result); + +} + + +// MAIN + +int main (int argc, char *argv[]) { + + // startup syslog message + // using LOG_LOCAL0 + 
openlog ("test_write", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + // TESTING and initialization + // threads + struct data args[16]; + pthread_t threads[16]; + pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + void* result=0; + + /* DADA Header plus Data Unit */ + dada_hdu_t* hdu_in = 0; + dada_hdu_t* hdu_out = 0; + dada_hdu_t* hdu_out2 = 0; + + // data block HDU keys + key_t in_key = CAPTURED_BLOCK_KEY; + key_t out_key = REORDER_BLOCK_KEY; + key_t out_key2 = REORDER_BLOCK_KEY2; + + // command line arguments + int core = -1; + int nthreads = 1; + int bf = 0; + int arg = 0; + + while ((arg=getopt(argc,argv,"c:t:i:o:dbqh")) != -1) + { + switch (arg) + { + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 't': + if (optarg) + { + nthreads = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-t flag requires argument"); + usage(); + return EXIT_FAILURE; + } + + case 'd': + DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + + case 'q': + syslog (LOG_INFO, "Quit here"); + return EXIT_SUCCESS; + + case 'b': + bf=1; + syslog (LOG_INFO, "Will write to bf dada hdu"); + break; + + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + + // Bind to cpu core + if 
(core >= 0) + { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (0); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + + + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog 
(LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + + if (bf) { + header_out = ipcbuf_get_next_write (hdu_out2->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header2 block [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could not mark header block2 filled [output]"); + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + return EXIT_FAILURE; + } + } + + + // record STATE info + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); + uint64_t bytes_read = 0; + char * block, * output_buffer, * blockie; + output_buffer = (char *)malloc(sizeof(char)*block_out); + memset(output_buffer,1,block_out); + uint64_t written, block_id; + + // set up + + int observation_complete=0; + int blocks = 0; + int started = 0; + + + + syslog(LOG_INFO, "starting observation"); + + while (!observation_complete) { + + // open block + block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + + if (started==0) { + syslog(LOG_INFO,"now in RUN state"); + started=1; + } + + // DO STUFF + + // sort out write + hdu_out->data_block->curbuf = ipcbuf_get_next_write 
((ipcbuf_t*)hdu_out->data_block); + hdu_out->data_block->marked_filled = 0; + //blockie = ipcio_open_block_write (hdu_out->data_block, &block_id); + + // set up data structure + for (int i=0; idata_block; + } + + if (DEBUG) syslog(LOG_DEBUG,"creating %d threads",nthreads); + + for(int i=0; idata_block, output_buffer, block_out); + + // finish write + ipcbuf_mark_filled ((ipcbuf_t*)hdu_out->data_block, block_out); + ipcio_check_pending_sod (hdu_out->data_block); + hdu_out->data_block->marked_filled = 1; + //ipcio_close_block_write(hdu_out->data_block, block_out); + + if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks); + blocks++; + + + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + free(output_buffer); + + dsaX_dbgpu_cleanup (hdu_in,0); + dsaX_dbgpu_cleanup (hdu_out,1); + if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); + //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); + +} + + From 5e0ea2657f33084cd2910db09802af99c81bdc9d Mon Sep 17 00:00:00 2001 From: cpviolator Date: Sat, 15 Jun 2024 22:42:49 -0700 Subject: [PATCH 08/30] Move headers --- {src => include}/dsaX_capture.h | 0 {src => include}/dsaX_capture_manythread.h | 0 {src => include}/dsaX_capture_pcap.h | 0 {src => include}/dsaX_def.h | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename {src => include}/dsaX_capture.h (100%) rename {src => include}/dsaX_capture_manythread.h (100%) rename {src => include}/dsaX_capture_pcap.h (100%) rename {src => include}/dsaX_def.h (100%) diff --git a/src/dsaX_capture.h b/include/dsaX_capture.h similarity index 100% rename from src/dsaX_capture.h rename to include/dsaX_capture.h diff --git a/src/dsaX_capture_manythread.h b/include/dsaX_capture_manythread.h similarity index 100% rename from src/dsaX_capture_manythread.h rename to include/dsaX_capture_manythread.h diff --git a/src/dsaX_capture_pcap.h b/include/dsaX_capture_pcap.h similarity index 100% rename from src/dsaX_capture_pcap.h rename 
to include/dsaX_capture_pcap.h diff --git a/src/dsaX_def.h b/include/dsaX_def.h similarity index 100% rename from src/dsaX_def.h rename to include/dsaX_def.h From 7aca2bcd08885a4485b09da53d1d063bcb038c9f Mon Sep 17 00:00:00 2001 From: cpviolator Date: Sat, 15 Jun 2024 22:44:27 -0700 Subject: [PATCH 09/30] Remove executables --- legacy/cuda_correlator | Bin 34272 -> 0 bytes legacy/dsaX_beamformer_passon | Bin 178600 -> 0 bytes legacy/dsaX_wrangle | Bin 99600 -> 0 bytes legacy/splice_offline_beams | Bin 32432 -> 0 bytes 4 files changed, 0 insertions(+), 0 deletions(-) delete mode 100755 legacy/cuda_correlator delete mode 100755 legacy/dsaX_beamformer_passon delete mode 100755 legacy/dsaX_wrangle delete mode 100755 legacy/splice_offline_beams diff --git a/legacy/cuda_correlator b/legacy/cuda_correlator deleted file mode 100755 index a8b94c759c2da5b87ab4c1a740138d0ad7d75073..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 34272 zcmeHw4SZC^x%cdDLN)<6A)p39IcUJBWk~{r04f`jz(xZ^Aeu^Xv&n9f)y;0Y9|+dk zXuujNx9HmsZr|4O>+J`(Z{MqJZ7)(^h{dn^wzp_&FSoTVwY7KS$1P&(N3G`m|8r(` z&u(_9?Y;N+{(kpmAm^FqKhN{b%yVYuoHMg$&PQ6?*Vt_~p@UsqDaaXfJQ6aGLY(-J zWEIvZN`*(%i1}g)kYfCKWD2Pn)AUVcT2rTzbAT4%uhDJ-jdqWWXgbG)g{HcMq|7f9 zJSu9^&t7S=nxZBWpdQOQwobC3-7=!-eijJf7z-tXA&)E`^-@YNrSvqdQ03QD*N#bzs8E6%Repv1 z;!)+-RG)`M;ZRS*l11Ub!f+^(+`e#oW5dFRCGL3CT`%iR`;=X~zEkF;{b+z=nP=jU zE@ygK{jyj8H2&n#uXg(Gf9=_O=Z@aA^^Z)u41fB(-ThddO%%6JCAJWMb@*G|`s~Z! 
z`SrcGj(ziu%7c4ebv2&aJ?CFO*fr`w=82HN#tp)51Lona@dai){PyzgJ3*i4$pq$??970?fxp}k29VEhPZhAU2=Yd*5%3!e;Qvs-&aZ%1h}ok1 zn(i!XznAzNv3FxOeiZn0T!juOc|G2COaDw2C(AuqcD|zIwS9d(NGbao*GgV#r(Aq( zZnmAPm7Rxk^4+iSF@3|D5`5g4a?lL3D$!Lhln&Rr{fZJXEm0>N0YKNL>{ zW1H7Choh0;W`9pOD15&D!Dz%6Pe8%v6TYjq`8F!K=CD5=55`UPyvodHO)MA`jI8sA z!_i(CB%^JtNk)1Tp=iXE2oClR?NAb(5p8mfKe0L#@yB#ZM|7wewwj}nzEFQM=9l`} zX0P?fax>CKS|fo%NwN^_eiUio>R>Dq4AWQJ`-hV7lpt*$2=?BzF&IyV6T;UQii9M( z(H{v!2b-gVL*d|d;RBqI_=druu3)4;6bVXJI)YZ(B7LU0jlpOPZeQn5#6sJpOqekDmJe?x&X%Dxal(giE_}TMH^Ey&u}~z@Cz*H*wH1BQpm1z(>(6sC{2`+fdIa681(h(5M8UohZL_>tNZkDG~rP|UUMZL6ERx_pb> zOU$^h-d*ornvQ2sN+8(rk9QsXE0X_+nK1{bL#F7vPmMU_-JuXAN)O#r%7Or~sehSR z2@?>Q@*Mgs?~@GtB`&!aUAaWOMFR*-KWa%&o);?RooF$Or_zy})<3cEXGkf6mKkiw zeKu_A`@ET<&{-_|bVw01?Kz}9@`+irZx^$b{WmvsOGCV;tdTMEpvN!XgL6#Qp{k=> zMp-{y1_Fi!S5=d8H5OcDDe+nhuE$y?G+J=${lCY8Tkn6p7F_$Bb~-G$9y^J5S#UTa z9l9-eSsE2$z=BV;;6oOCng!o(!EqweVc3GZ(x?!3S@3cTzSn~5bIas?7JRxze!m5; zwBQFU_zVmFfCcA0x(){|_}Ll+9kbx)Snww;c$Eb|X2EA!@Z%QzJPUrpg3q?#DGRQj z5t)3_f}d}Zf760rV8MmDj6SfV(!8@=bqCqKZ%@e>mW^Is&-E~j zFL`#waYOQVlV=wkuaNv5x>Ueeg*k+$qz}so;#)CC{!i z-XZx5$+OFhdnA7zd3KfYTFF5m_i{PX15f;Yc{u%OY;^X@z{}lQ8AI$Cn~t!#HFY*^Zu&oLXq}rz-|>#V*7UH15A*1~{bfEG2E4nA=JRsp z9o<=71E6l~iI!?P4ud)=`9zWIZsl`#f)V1-!)WB*(O-=IE>(x@N8X$$-`NSdkyP`@ zn~TdoT8;Qb^<#|-Jm?NBFuePUQ53lDnY!0g1{6kN{-r~GZfy_gA)PLe-Zc8eOaHTc zF7>X_c6HZB(r|gA?w{e_0R+JJf?WvSnB6+fjoPuX2e=XCsO|lnRxd+Tzw2m zDL0tipRN8e;w`%$u6_>m@RQXZFdvfwExUJCp8zOL)Qye)v}N?ImeKa=s#M@@7_OL) z;#>v8UdWw<-0mgSw9VGzePXAih|2grMiAIC_LD+|cPiu$fKTnJs+|?{ZecB{oiXDeX^A@=O28>u9Ooe#%>Z>6kjN<@fKDC!N@Fx_&>M z^T>&!Q{^`lFUKJ3Ex(_o-UXXACjkBmf6pQQebDm_2$bK|fpQ_YXI_JiV~6*lNyu^? zRQMj!{6DzsSpCl=IWZ%ytgJMPL)?6diWT$5QUu7MFvkveMF9? 
z-g*<`LQ%DCcTx3n_KjWthz_v(1J%ya<0C1Y0|(Bich7^>!^qHd$<2SHOLj_xweO%; zbVJDX&LhtkQKrl5*jarSOi2HX9!Dd1{8ag+Pv&|GjxksV>9O9UaL{+HPD-JYG_yM6A|sEBegd(kk(X`Bt6R$N zPgFabM*hu~T;+w6&|!X=`Y1f@MJun^y}5cC0Ct_K)FbfgOXp$WINoT8(MJvq;Uvgm z2IIz;q{gj~dK*%@MW*gUKC{D<+S zweEY~(WjaYV(6YYTzkF;L(@_3Xz^3a43T<)6(4!jc*uzhQES~8l^_gXHpv0y@SV8d zOWpta#6;Ki57jEGkjs+UL7Cls9qs-R_{wk066`A1`3%cFNoj8J7DG z1%!qvbT@=#xhrvLk8sz6_B}=GH?9e zEcZv1koBygbca7gp=Tf@%l+xJlsfrl1rBeg)YmArWea-My-N*|dIjv3BL*7N_`MDc z0egA>f8?W-wByVi>O$-=*(1Z1XmBHsI>%pxN~XQmz0x-N^bKBI6-It@vSV}G6JtZ@ zINm275t2NyA8M&{Ve0pN^8V;xKa6#H7j}bqN8WUKM_<@B`yhIZeB7A6^#~ItPC$4k zL9z8HpE5G}9cFr`_x6921|E6W0R!IAlio*CtGu?yy-&ZJI6G}^sjFQi=<&5Gn`mb?mQWI&c%-fdv`tU4k`1`MV~3uJ4&qls$I=w{q+ z;O?Zh1=sk-2@JseYezA=Q{OqoHu}<+Q!&(_Z|^4W=(~sLdfYMZB^mu!OJU6`5XRrL zq2+yOQBN?Q@FfNh4MFnAlTzH-H`?J$?Se7y=&$5m(a}Rp(IfXw&^(;F7@oqwe*wi% zuOGU3zULpt7mp(u8!->>kWRwcwkZynS`C@c}4;Srw=Kq7+K;zr9 z@G$76w$azyMxSlJNe@y)MiHm)8wbs^0ifZ*E=dYi??9-((>01jmg;)Q~G0nPmlI2u%g^{K0;6NyXSwx0ME( zwSf(t>o?QD`s82_M5BFHX=HDT2aJ~1Yu9(KqofRMNM@0lX)+!(w$w_)ea4!$^-b-* zP48ddyrBcKn|4Hc4XiAv{QIIY1B)#Fu+fVmYx_|nEpOV-qebzaWMZo}I~| zCYl%s#%>Pb1o9*{ceXScxdvpWabr_!Ym3Ezxj<$_%yttGB0tqwwizF)y|wPv`db%& z*p$CE80+Eqw8IDow*|vSO(58l>^Gv3@QzD#yBV;UmeEZE(VLCLKq!uC#MvD<`PUqc z#e!k@0t4Lqq=D5b**T1xLx}+by#xLTmiTb4ae}PPK8!UxDPqp>}Au{M*ohOdqgpn+LGU7)*qa zW~{!>H|XCE@x^_{I`1v-gVSQsWd8umDZ#ZpLvdBE%;2rYb&NX+EbP_QVmwL)BZ-hd zYz|Gz12a7`;{E@Wj?FOZzh6=9hv6Jv<9{g~C}2k@yrV#Rt-^V7hIhJ~#F z!JWv}W>g-hTjJK}N6gsV_bd32Ybc$ToL~OYV$dNO1ejUmX0$4VVQhaI6Q;HR0Tl_Kn@{tSDP_ zA<~_uj8oAHMXMAw6s=LTR?$XD;R!`eeVil3oBB3k>f3~=Zxg1zO_=(2rEeFezFq0t zg{f~>`gWynSNe9PpIg7bl;`QqX+qM3M;U5TR1?Zu0~dDk>EEt|{+vD+X5fQK;Yo$m zDoPpVEu9NmrEs2Slz|UKS>a8^9m_&9`q~Z`x{C3q+k1|B-ic}?djTn|4Qzf$4)ensP36s}98@$CxN*CvgBQsMfyJsSUp z!u1`4#($`AJq~O9WrfR1$~u~{Qe3E5Fa291Ex$nFdhFMDi^8?1HO}`GXa__FjrRj^ zq`j#UK|UpMUHT0Epv1M;GB~fBl@&Ty+?%Q-E_T#d;^c83bt`H4wA+V=YlL&+_Td_* zjGBT>)7NRGnkM?fT*RWu;+aNtNsAUa#n~#U=pr*t<)TVUe7YsxVE(4AXjzfGUvDr;Y$~9C 
zQ0dRH=(CyR>i-THd^jop<4XUS($_@SU(;V8L0^-#4Cw)Nsi;sO&k^(1dKy0fj6TUZ zP2!-^@3uHBU-^0cp@=FjVrCdtg&I~o^T)VXGz@ADp98}(O=?5xlBOP$(khnXNR0W) zdlh}7k(M+gtRI7>%!`3`sHf{m%o@DPA(~qCWL;JHtUA`XE_X)HX7fToE=wxv0kz>t zFMX{8Ki$B=687deW=5G!O!(cHoQz|U!dT|6D|7Rg{=tM1AM`C-TITi#0x`_r`r@Hm zf=0uVva&L_`3z;a{ozo5#HcgeJ&nesr-4EflbB;(*(_#$Tt6)=5tW}$FGSUgb0+M$ z3ruwbqHN-BuUF#)zOl@jJ5zq}CD zN`CA|C<|@7D%QBFt}35;vvXLiI&bBri>oiB1?sop?*Q!TdsgDq{{a5>13yfl#WiG) zmbsj*Q=_i!Ev{kP)voPLuAz1$?ZjW|*@=nA*uYo2Dn4#s?W+14hu3B7DO&BS`B?E< zSM8lsnp}+|C0(wQc86<9nX9qMRompMS?w|)3!T+2XA^zPpB(-ee*@1=Ox(tDH@PZC z?Cq|qwUmT>ZNIDHDykUQxvG%3R>rT-j>DjdZ;{5+ajSegGo_u)f4Q#Jjx~OoeAl^T zm!`iTL|bwGVq#(e;_z$5RwU8i-v@s3mlG2upk}={xc1xZe^=(JXmUAQ=pK2levkCy zI4?m08GpB?{q3c{z4Z4wmnW9b*YdnyPXD59lIXzl@!8`|*jLv$opgqt@xU1mobkXJ z51jG984sNCz!?vm@xU1mobkXJ51jG984oBAoSfs4v7uQeJvrN?#}qCTao{69LoQvY z0F86c39B6UsIc*l$gAn+XSAno4lUpyjaac%E=3)>-=)pveMI)9o>-1%qt&I37nT; z&?}jIiI%}T%_FJCs}!%xT`N;FbKN?Byb31kvwfMbDYf6ivE*VLj+ea39r&CQICUKm;e4BkZ*Y>=={(N(n~YaD z%h5GU-eSDU`3kI;{E>0P`4!5%&3KLTFpQP_iSb(JcZt8lc%yR~@pl>blrE(mVI$)$ zeTfM+$#j%1rG{NHU8O%HQ)Igl0^OzD&aGsMjk~uElzyL#)5g*cmD*@+nq(5C95qT@ zHcpgoFI~nmR@#<;87^Hwu^F~DFn5$b!<3n}ZLl*^dMTMI+nr$UD&=rdGS4;yvbU6D zZ^iC>v$cr%IjxSXJ^_R?^8W+)6&jsBjKa_&!$G>1<~F3#@I0nBrUxY>GTzQ+dK! 
z>}elDgeXs?L_e$189=`P<}BGboW*Cu7MjchATDlI=a@^oj-NxM9J@Y&XS9MZUpwYL zkK=Sv$rDvci!N@yXup@d!-eu!)Y96$QsavN2%l6qvi?OXr zPG3OSrqj?T!JS*n#7e3;?m2N=6>Ic+)zOh6Y%|M?gqU>+;9r29bpayh!u)x&FYqv3 zyo_t$Y+I-OW*j#=UbA)1wq0X)BD1Q0g#)w{Wo_pyrztxutb>JFZy-I1h^XRrkSmov zHagp^yC}(3A`S$z9z*0i%(PPpm13x2rg3I!tCLwiIoq}orNBj;?mnO-xupq%ZQe69 z*@2{2Vd?x`gsP-D(@?2{FZY>2S~{wP^u2>^;e~LPv~)hoWm@`0UQ2upkbxzO zgI=I3c@Z*P0V}M7^GR_*k5XR*+^29^w^9dRVw7QSR0*ww&;_?zH0~*+F>2K~rI1Q20jRX21C{Qe|%%5yso7;*yj~`s7Y9=6@NDb`htj-&L+NZ2JfSo%j06Tn>G@{H`*j}Yql%?+ zFer6ApuvZ!KF^dqsKATDknF)JWh|KpJ3-;`Gta7~qeQ|H<*z6M(SLx;7^cg>jUaLsm2w^f!@oQtug zT1=gKB_J49QCz2s&cUd(V(QANZ8LDSD~AY-l90#H?{H0>%48c@=UHjlMu?RvwzzEu z*)nYjMX}l1W>90=Jkd5|5faZ@fC!53wym0F=6CU|wi(9MC9^12UMw6=U5E-W=@L!n zrKNJ2RGyKo%zAmMKD}C6n3++sqpoK!N!uWcrlIYVJ=c;7`&H-L>Sk=Tb)MTb_5HG* zvx-IC3<{n%dWQ|3tV0oJ-+c#OUah)kH5BGhf!S+?^QnMNro~yOxnP8;Yh?0UHS2Ob zW6ZMxc38YHQ@~ZT2wdbA6pgq{S442qsvD3$T1R$@5DT&Zbj6D^0G!7W(=JZ2P1xRc zorR+f2%o5Ii<0r!qEMtaoD2jPVJl=`!xFVGa(^Vbs3(LS6Qu}tFkIB%+dElgsn#~- zlqrEd7lRROJsA$h-Mz?*yC>phv&HTKW{XV|L(zIOp4c%284U-5eTbtkOA$oTl96{P zYDUfd8O^wS?FUhW%X^ZcaAIL7a;2G704k;sny6i0@f>H_itLUP;;JH>WB*6GWIQ3` zm&>@*?O4zSe&t_Er%H#p}gMFUQHvAmankSc0$p3I5u|m(aUUlB4GYUF?hSr}%h}kBnF}5FHFIGH=xv4aK52 zE(*l`b#=81;{(BPIL(Cmk=DB?9>Z3r$WgsYmB4nDePMrpTtwn{1y3g7%|N*^r@%{j zF}%Uaom4mYVnKgE7BR%VKjo>mob-&y5KQ)kqkh!D+$s}$w)PwHAgk3I3&gQ^C-(=% zUYG&Aw=eJlcFfls4JQX9!iU$uh1!jidu8%Oha$f0jW*@poxV5}hycR`?=#i$pSVt6CJc z-YoNClPodGQeif+>gKLq%Z7QElvchJ?bt`HU9Xk488rsB1 zL~SOqbs#(p9-kCFQTmWW`QxLo3?E zb?JOmL?^9L{1kEOZQMvr3b}H5Ole;RmGWssy3-T$&cZft@yeK{2X*Gsjdeifq+4vNp=R>pc5|M8n(ScvE>iNfP`BI_h zF|+Zq^n7PFK6&4BraLezTv6e%ipsbHFZb=O%!j+g{=E3){m^O1feDC%*n=lGI5104 zRg({|6x?kqH#o!$d|x9UJ`*=V`S7#T^Vn8f4snjq^ViwaoY z_gl8=I0ROgppw^ zF^BosX+g}WLtHosBj6Ah6~L<};k?5x60c6=CQJih6O;YSbq6~h?mFgpvQ#*SKzwfw z?pAp2dPxg#8~iz-{LFGNa9p(IIq$asr~Q*T_V1MPliSaifaj~%bHF+4mHT_(e+5eW z+{%lA^=IHTE6X{g%Bj~OE(A__y)Gnw+Y0$6DWB;Vcn*@`T4jg($T5JG;dX^{?>GjH ze@@}tFV2epqr$mGoE3jo;X8Bi*A&iu;H>h~@$kZWaql-PzDVKR@6C#@Q}}QWev`tv 
zshd@v)2X!2ecY@#zu!Qdd$(C}PJ|KXer;C#Hwxz#ZC3mb3g@P5R-99^vYpKKNL*ju zIR8QW+>4C?Zjs@A!1J|})d@;ddv*fk#isU`WKV)+2O1v zCP*iZt3%Sx};{3h^9QzYy71K|8_hn`^4w;4|> zTu*M{G|BKU1?;?5055^TeEC)Y&-Zm<`QB3izhBwWb9UNJmGoQY`YwN~kDL!Gc|DJ) z<&PkeFW+Amz+a{v`)t7t46Wg<0`i#F5|wtB7*_WWmnfy##H|KYd?^OfO1vzm*R!=q z8{~Pu^mHV$lVL*vJA(!AeFgAu6u^Heaa>EHNZKZ3_{-=rslK-U9et1@O;E+=ba%k2D@sS$wsC{KEzC7YpFUa0vaQ zmlm}D%YhpzFDtnWINLdQD`n8*{7Q*SDC=-t0eo8l`@0I@pD$qNn+hM!ectJfCE^Ke z8iolrpYN*XjlTA_O`GxQC2aYiKD>ktIDPnbP9(_B?gXMf^DQ}F0Fza5pFg=>a38pE z5c|5hmn~h+38_3n^7}u5YhzFhAOq`O=g5 znWxS~D9n8&+v{sh0ek?eISEBCX0`Z9Dx@soj8zEJm04etY6_c*bxToxf1*Whl(?Dm zb?@b+!S+czEoL^fY|q`eGW{aqq?GKJ3foV+rzKKPv#BLgPqV8fQctt3B~nkbuO(7X zv#}*oPqVWnQctt*2vSe8w`F_o=9ZIu-k#px(zmvK!)j~~>BD@cuMHE$ocFwbQ=+}m z=L=yQK73XxYcT{{ncjq*z>+;^ylXZ#t!wqQu5aNd%Va0gaeQ9V*XmUQ-jThagOPQ-L8H``0;$DIBi(&e6R)u$9RTJTundNu1CY#}RzvN7ejGvD~j`Szc@8s+;wVLlIKeGM;@6&rkJ z_3`xIW=t1IevFW9onuD+%%rSzmmEto8Jg!c`!kRHBBZ|E$od|n`9Z!M<;ni!cLtS^ z@+Y=i)Vnk4-5S&TH)1g`7#qS?X;^Y|H?3}4nDF;24$I|kE=2_vh62J(ZorRqNOxdI z1oljdl~9?Y_7=!ud`ROyuvAd7p>RTAM@S5cZbaSvQAD@{ns7@mx?@rK0YU72#hVD+ zcW@6kvr}&h_7CD_Nw|Xp>IP{bfDANenl&#WrZpYEu@_<}f`2g7i}FPi%o+%9h1~d1 zp};maD1I0s>{F6?vaxF6u{r*G2k&y2Hd7+h4tX3;!o?R>>hCZ-MJ*!hEw zetKRNI3Cr~`g*u#uj@EjD1;kLk}kiN)AS~!^S9@k z*XuvqmHvLE&+@Y>y8H?joYy9<_4PVYm70LARf4(x)Aiq}^j9l6{rh>Z($~MYr#@!) 
z()nxszX8TJ#qu%e_YF>#n>=-_L7k2?G>Vv2U$0M9RG1>ex?)O0!FaYd!zVzj`g$E} zubLFe^;a(cxg7odl^$t$ztYD~zp{ePSA&0th&6vbAN94lri5NL)$+OZ|9hpctEtz| z4l4cboC4>t;_Es3dY!GR#>`QE=}u=3nydeCj=o-x8(wKD>GP~j3Qj~b>#4qH}%b#A)>@YR>V@+$&dYbZcDpq|x57wd9?{zt~4XvknzPUh5*I&=C zbt`@TgxneoD}(r4m49sxuJxDZ;M$y~HA+86Ong%3pOc{VG|ZO>axtboru09ct|MAL zSC&8C&&4P^@BOAmSx)=UmA$5b{$*QC{T(?9Ijq>K^mG09#SfVJpU@}9WYcBUcKCZ- z9akcfGymN{u#cO&^ jtu78O2p8jzr&PCJJ+5$k&zJx32a$kqTq~IDq`Lk)d9f~T diff --git a/legacy/dsaX_beamformer_passon b/legacy/dsaX_beamformer_passon deleted file mode 100755 index b08ed99873c198055c7e078c5d1cf0100e9af070..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 178600 zcmd3Pd0bL^AqF6`^PVuym!vM=iGD7 zJ@?#mmlX89~k$-)@nUC}kF#iO6Lw$$f?_l34zFvS+ z1db&v3&cG;$qYaGnJPga8%)OU&>j*ov`0X^=;uI*_UWf$BV=QHv7dl?mCq!_P(OXX zbWu)v%KE@C+28}irV6}%+KTB+Euf-fZPK+Wx>iM}pXo||{nX^tjlW}5`IsI7K!gK) z`0~Wx*pqenQ~2-DG`_bUEeTtXzF6S(bCQx%KQ-Nzprf4M_a{wN9HrVzWS4PDe*M(_ zaMtWuQ_nm1tl87fm_2K5Wz`u~L(eMu*c|EK6kuloN;z<+51y=Euie|ZA_1qt|PC8&R9g8uzy0(rs- z`0q^6k9Gn+gA?@E>ID3Y6YyD*06#AQpMDAC{AmI{7bmFSkiZ`fB&eU9p#Izh?H-?? 
zU!F_g54{uUJ1hbJ)CBlF3FMrgfd9P-je0pB=F~>67WAff&5ts z@XsaS^UnnB-kQJ;rzha^%LID;0({c(+x_RO1b+1dDtg74-U*7j|Ex~HXJ7(8S0|`{W&-?Q65z)t z(D$MQ_Ph~-dyTKl67VTY;Q#j|(CeZE_0K}P>D`S6cyIzaMrSttfVTZ$g#YwpWe*xsz66~ z09F5X)ox}`)<-q*Gfa)I^pgV$BOgIkzgY3nawb7e%70-uKEF`)0}8L}Z-bod_m0eX z{_7P!>(qF>59@-G%4t)|LtG1FXP=qnE6N`;vS?aqdFhN<6`|7dF(Zf1o;SC2%#^9K zOMOK}Gv>^jTT~GOK~a&fXvF-YQL5hX*;6ViN-HF}Co$U@9V#!KGAFOJB2+$ap^unL z%1cYd%cZ5Ef+-cDV0rnxa)B5*W%lfOCE`_*4F_$ALXb|kdr9TolF+Pqb49(p()qJW zO2H{N6e^!JwGy>ix3IMIy4-HuQAyU8ZI#X`DO;#mkDaUKzhp}2;#qU2lq*rul*zk7 z$`;4SWjlOc<=l{BSU9h2IFuPaZ|?M2Gb+ocxbmX5&=>m6nma?Va~Twt&nqh}56z-Y z<21f%N_jVoD`s%+G~Hxn#ms}-jA!JMiH4;$MlYOOGP8W%+*vocy&@#Me99b&D3}5( z%)G3$d~WG%!*sb7Afz{yg=V6grqP|?OGTyS<#Xp1p&zBvvn$XQtX2`4R&v^DtaYBR zXu_DQF3-+_a8*@Prp}t5oqayu=}$$Wg=M8hv*u2p7o=WQQ;JGp$D+!)rB!96C81Ji zJK^%tp`o+pRTNE~Qc-$t{@huiyzJn7)SgjQRRjZ8%$qx9_N>stqWKV~dqZcV(TZ6f zZB3XqyO2!S>g?BJ-&AUt)gs>;sJJ{ua$D#75QFXWeu4rOPT%$!nQ6e^!G zD^!tR5*$4Re}b?Cb?(M3?`)woE2JFMEe=|4>SJ$VjwZHhW4*X>iUQ7<|r;6CnqDrj(VzPC>~; zbhGGEFofSkj#6{8^1seR%M@dB)X)6|1pjapgt)Usqpdair zzhbUZyQpZoX^hacd6gk@WcZ(6GJ78UdX6g-2EC8;Gv@fF-cVXTj{=I2G?gKZ#+y<; z1Fc}XEccfrnY$ zpG*Uf4s?IA47|?ySvb_dvu*toFz~dI{;>?a>RM4$XyBEJ1%8}?M+dk+#Rk5Qi^chn zfro>-KV=5KuZ#8hsti0F#{H=_@Q1rtpKqyw_Z#?n15ba^Kg$e!KaIq5m4P?cvkw?} zuGRI=qXwR57W${bz#pZNcs3dMqYZqEf&YntZ#D3Gt;gc62L2eMew%@>Fz~j4Ki0r^ z82IB1e5ZleXKbQ4H|Dp;8}(BS{7(&hnt>l=;0GA^bOWDm;7>5{nFjtu1D|E!PcraB z4Sa@y4;c894ZLOGPciU?2L5LTew=~NH1Nd+ez1X`Y2Z&a@MQ-6Gy`8{;7>R3)dv0y z1HaV3pK0Lh4LtWs^v^N_pQVv_t}^i12L1s9f3|^t)WDx(;2R7)XKww|WZ;KrB%Un> z{yYQUYT(Z|@LLW1Py^p);BySTZQy@y;5!Wb1qQy;zz;L0h#20q=uUu@tr4g7EepJm|l4E#_7A2jd*1AmEuw+#HH2ENe1TLyldfzLPa z#Rh(afuCvMFEj9E2Hre)tupWi-BKY;9Cv+)dqg6f!BM3EN(OK9LM4wgL z6iTsqpk{?PrEZj!x`y10qOOx`@Yg>mfH%TILZ__@KPTTnm?@;wD&TU$JqR}m_*%j} z2|p^}sf3v-I;#XcfiP1;Q!U^TgqadLWdgpKa4O+q0sovZQ$VLsz-JR? 
z>gNOmd@5n4d`^~tPbAD#&q){Xv4ol8IcWkuf-qA%$0y)EgqhMg9p3gF^F_#?th*_=lO{2pPZYR)PFzeSiSno}>}R|qpTbE*aWEMcZ( zPMLr=5N0ao6brbKFjFw6P{5B9W{Twm1pFXjrdCdtfd5RGDV38h;6D;(D&?dJ_zuEM zp&Xxpe?^$7lhg4v^}m^LI^i|}-$0nDlG7^Sa>6GPZW8ddgij*;sDP&uW=iC&67U4V zOog0!0gobl3gK!2k08v{$0-x=#e_2n7Yq33gqi9%g#tdCFjE{SAmCF8GqrKD1biZ4 zrZi5vfR822RK`gY@DYTW!Z6fUhMyobaOpo=TXhg0o7%69_X!aOwp-itr_bs|7rQFjE4j zOu!ctwg?vs_~(R~0yu>NKAZ3e!T|xFN|;l=lO^C233IA<(gl1hVNUT*nt+cW%&Fb+ z3AhhoPU%j^m!kg(b1HY*1pMVOfH{RbtpeUlm{YgYB;bz-bINue74Un6IaNEW1pF3Z zPSH-ifL|fZsoAL(@Uw(D^*Ut&-awdBu2U@FM#7wGok9UWPIv<0fPfz)%qiE&67Zi1 zbE)_>@UIASN_9H+i~c8EOt?+JHxT9&>a+^DoG_h6!6)E zuO%E1@Z(3F=$l>-Sp1A$*MJ*~_GjqYYkjNV?V0PlR{Up0nuFQTSdnLQ8=z#@+EqmX zpB33?MN;Sw_Ujl5m=)7kxAw{Pp(2S;Sw40un&RSs&;FaP*>Gs$nr%~1bBV5L^-Wwe zP>7mW>zZZXQnPv#YM!BM9)M;E<-7tld+M4;Bqy%fG#WLx!B#@|)4ru<*_EjIh_1Oz z$l0WOHk_p*81QJ)S5VM^@9m2{gsv0qtdw5&8XrE zJ3(p4=I&>myHN*n?@GYuFRZf=btrdIiZnoXx$k48F_egM|1?3Zds*vD)`Hv@B&hUf zRyqWgVDE7W>fFOR|NdOaJ>RXPdQ5iC+P|^Rdenj3R&-d`(I@)s6Jgnj>#8pg_?!i= z`;Z(~bAaVuoqJUy-*U(1j>(UFI(GD0=3bGn*ZmQ^v$tDIc2^;}n`T9mPv#_UMQSpb z?PWLFw}YYmJQ`_y5LF}V*V1LOo7OQmv_D0mRktR?CU7ePQ(p8=0R(Gbxp6K_>U*M@ zckExHVq{x(Q{!eZ5iHh$y)V2m$$nJuxJ}eVg*W3W*ms~bdRs;ZnnR%zUs#F4oE4(M zo9!<3~_9GzWHsD%;uh}Ha;hG*%s!>)Y#3OYwnq%rXr< zz#=a)JHy)NpU^*Jx06j%R+apl4 zj(f^>I#gNbYhowoWfa=|1c*8W$JspqT9FM_BsnAeo*#0dNo)N_J*+R@3cuIWU(*vg zE2W+AZy&_Zd=@Hng}+SN>mT2;*FRxDtTNPJ^AZwWRk%I;Wskl7Nr#<{`l`}4R2rA; zuXzXQy{ffq^%puI^@>v);6T2$)hLdHhIsQ2r9=GhYffC*_Bm(tz7M-9M=X`1RBMWzT&)J=g!uR$w=Q;u$<&obI2{gy##O zn)D1xGl_j3*k}Alug~=l4s_(@yi~q2sblabIdAxD8X!MR29nH=V8)YAgAi8q!VJ70 z|1-XqwC?qv+7#Xoi>4j++54jQ=P=wR{BDGSY*22-I}OwU1D*Say27N;H1f1c_M&ia zxT%Nel>M;#sT&N^4wP=gV{%L`1708Exw9l3dII{C2&17(|V9;eB zD;W5fo;QHBUG`UNN$X!)MzXK|CE8^a$d7Cy*=#}fq5VFdtkxx({LtQ{$Z`PZM{i8C z>Ym8pNU{)G_~eiM*^NNdHX%7_{1^1}*Sv?tB5T{ft`-sp4yPK=3+zp4_6cR8zQ3jx zTrEUR!7=g<1zEedt1CZxQR zwm^_roIy97wU5o+m}>tzN|A3$6&Y4d#wzM{gTO695QuDMgqUg1Bqm~8=;zMvtahPk zE3(y!>=7+k>+RmOQmV3&RrgQ^uhNrIrd78hqY!{S=o2}Kq@&ha>YE?gY=5~&A~OZD 
z3DsVc7w=gBI4WMp%Gn+1_E8O3&g2v89DaIJ6E zPki~|O{w<2kBQi`)&~pZhc~C%?+L^gaLEBcY)Q3W27>nXL0gM8ngn8}LTpU6A0Z-I z*(t_4rsO6Fh}m#}&mMvWki80R+Lz(a+E?JXaKA1ro2+PsNBtDmFJt}VjQTI3enAxC z=Z80>7UYb{NcY#Cia8kbQwKQ4d~Iapn}R5;P!Oe6qA)}Df9%IPC>{I(^JW%2PS`9^ z5Vu#WgrKdlu?Mm3~pJRAqQA!D6Jr} z*NVIoZc3uP?H=h=ZcPTAP4zr^Aw@|Qez7Kl9;Zs0D9z{g$1otgconB)6yo(hyyA_y zJfM0OAjaVJPPd?mgPJ64jWuMhO)AuTK-I(RGWBYo51ZCDRi050t;wLfTF86o$YexE z<#48OT73JIQxL+2`3^hLSKAdjx-liyG&TnIRnRE>b&|g(4CfYU0J_A|we0hS!Wi1w zO{=RZR$j)?bUgQ2e`AM~St-5&17Mn(jA~%5=nC;Hi2OU4{rQOKnse#!mq%;PkL{f4SlGX4i-pT0VT&&?|?Qw1av`-qE7)6CE_8f$shvzv0{E?J%>y& z>ZQh5bSe^iKeR#Au00J1w$0EoB6{b!S$NV_*cBt9w+X35VhR|n*fVSrue$H>s{4e> zKZ;@t<9ZdJkvlUU5YO8r3}0*$<+o+n0uPR&L3A--uwuJ-yz2gE1IrYFXbA8s z!1f5CPGvDWoif4)DAYQx{pc8En+HcBBP;SmMm3%gZm9ro%P3pvnK9OMDqQSkDBCg}k!$^#x%9?+231CDmOx+aVt5q(1R zf*#6N-BUjM!>-k!Q`5x9kiq9h;8Kw+d%Iov5d}l;Xh$Z2s^*r#Z4Lf+jz`!du-rCpF2p#RKG zH}nYo9A!wvnc=3L+_;tNk84sM6we7wECbwzvXuJ;j+ya%NLR6o$02RmJ%3$s7BSD_k3Q!Ac~ z3}*wvXX7VWJ;tU^MDwNSEv_x8m9vsn-gK+LUzkvrqKXwg?3(xs!9R$t~L0cxdnV)^nd`vvx8Xa86>| zgAb}ef3e?(GENUvun$4i)i;8Q4(M}!j+6=Ov`HDrf2L&=V+sjo?<$C_$!J9>mR1|_ zXM4(>ZxWbi`v)WLih@~qe<-5~@0Zo(4T0@uXH*5VkrREz)Bt0p9z~q4it{2NBv`;y z1CtkN3r0T8#eyQbW=AU|z@)mP3}zDj!=TDqjm)!bKLVkpiZQT#sGW|&|G{j9kq$=S znF^jXAYME1Iw&)oAWhgorGMHPxbcZ{n4Axq9IpxT%X-OB%pUB+rG!;Sfa-e;M?ruE5}&Z1 z?sUNzu+*_GVNq!2Hgsic?L13(c7Yk&mrt z74}8mvM>1+3UW63Z{3P{KbkVP>jZ2dg^q<dj;5Jz&_!Z{b+3GaBO?$*A)!&Szl}mzw7tcJj5P?afRQ* z(BZb5!cFPeZ(Z8UUsH|3+`2ItN#XrHtddu9m*yRXoxF((+atWcXAFCu!uAaBPmW>n zJtKlca(I7A47(eKCf}6s{$4R`xx)4e@9!PMj#Sv*;r*#GY^K7dhW8&5!}e6zL&E#} z#IWz6_t{#X@cu(%SdKYj4-N0{8^bat5!*Ms|F9T#p28j$-hX%uyBk(!T^H5(2{y|b zcnL@BPvu5ha#wd9iVgY~S9hSKAgLL-+!~R{VLT(B0{BEm6YAroITu?pR=62^>{D0| zd+bxN|2_*l>Qk`SzCC5x3#>N-LRHJ+QYcAAi^r=Rt`fEj5e~F%yCH|V_0^MdU zgPBr%&TRl2_p*D%ovVSkbJRTuHD>ws8Ub#|$o9bwa|Bj_vH>pRb0XMaZcN8sG0tO& z5T^xp18U3iSuEej#&9C!a~{D3pTB^j3-RFW&lN|m|4D4>H69l? 
z^%S5G3t69i(RNN}62Ku>Qo#k1sFrF^$tG$x*1}yoQXfaUAR{ zZ7*lZ=9H5=Pw=62pIy&NkFe6cXu>&EusiJ7FHSgUjXVEs+RHgs)EKr547kQVg$nSP z$N4$>5!&6P3Vy-@o((%Eu&X2F{xEi<8^`kPUa>DdY?P{eP9xn7%k>jry~sxArw|BQ zog<1r5B-!{4Uk%01OqwWVAg9qj*KRd<e=l1~n z#;V~{eQOD>;|Jw=@6$OJOk?%4|C1ki`5G(oo)zBJQ8*^QxhV}dORVPgt9tO!{OB10 zpZ&8PU0t7IOC}Hg^apSpYn@S#hZXKT+=~2r{&5XZh-X%zev{X+psUT@znQ#|w@PE} zt!4RGYstrg!TN7{f`Jw3!1>NaR?-&hECqpaFjLTT%YF(m94-{4*ZdR|c8vrDCHbSzU^loVq1>kgswAdL)gzC6K*_M#R9|ll_Vn z`M^F0&Wa;ED-uMen2h@Y!89a@xCLN4Xw49kE(A-l5eLDnIv#OgLdJe#kV;p%N>`4=?hs`W{lu-V^yrw3%yq?Az%iNN=Ak;aT|ya!IKdic zMG7(kd1&A23S|u7Zvmt%v;X!Qmu>C$+Iwkx>c=iN^phb7D%Mdm9Gl@!2%3TE05VOd zqFD@Z)gWXoXkCDop?V=z$B_{a6;XlGvcdqU7*Hzasi(!KSR|=R>8Mnw3dgADI9+L( zswCL10$~3G-2)>H!U|AC+L)aRVJGV3+9?1#S?hRHjCLxdonW3&Mj2G9qDsZI(@fZC zD1W&x&%G?W`c+}7yY1%J>D<-8VKK9tvc%boY9_E341+yY9K_KR%9OD}Shv7VS=}9g zb*i+zge!Eno3=Xe!hZI{=tNlI1X@A*sTv}cqhJL}s>VJ2B@1>Kx=yYu=^U_wMLQJ3 z4l`+oLfT;*Y>>%c+Mya8?5bCU9hTec;5%Zp8smS-?iSREvj>fzz#c!+fAT<=>}3r0 z6D>yP5f;HS&WUI-nt_#gu<3f#DuqmDjRyeOQ_#z>%QE%|3KnIQNu7Gc4kn*AS48ZzDoYyVR@(W?7xHM(bCF3v%4>4Ec*aqab zbxYfLrV-hgANeT1ZZ&V0jf}KvKdbwf?}j8CqT@nYC)9l&62O3cpb{uoGq1Guu@FVU(&>D8u>ZWG64H4AT`h`)jU588#paB2FXiFD|qcMBX2XtwCn(c0bzA zEd&PlJ?sNuz>}!-}do{qhN1UE`ruMqm3$G>v+AZU2*Ohm%Bq-X3`ydfBI7U?4j8 zuS*WBZ|h|(%dbnWvFfnJ)r>~a$IaE9T&D0f{9&gxxD#i3SjJ>Gt#^9z(g5~}N}vcv z@?ZJ>uNSH59{UkIqbcKei%l`uv++@PU4q`k{bZCQa3jPGK=3eG0vEow zHP(YFZUMOfbi&8mH`>ebZsgMi913#P3|P88hB|Zv0`mZ$3dD{q`)V>#9-3(#<}$xx4NpSHSsH`*M3W5XLo!f z@i*&ufU+-ru=Wp(*9Xu4ZocT| zLEYy6XZ8I5{6X>h|1*2sjn4dm_BeR{`YwCSd`8*hk>|YZA@eZz{x)(tj47E% zkOb=moSbse@Wa;&51?D1^p9RI1d#tB5j65YW4-Xp7NP9D_6B%?STB4_zL!Zo@ds-Y zcfEksLAUiny2p9}^MkQo_|AMU4I|_`^F0ouZ^`$rNAJOGKR(}c^X4+<%~c1j9|Fz7 zbSv#!;V^n0{?_%@|H^(){5#iMG+|t-Y+6gF%%WRXac&rn?uWJhoAuUr*(+^}u-D!8 z)$loOugI5?j*YSW$BOpGWb$fP;Q)LZDQ9$Q=X=S2UV>mw~#{8dA2u0t-k zF*TIBPpyM;j`rW#WYlk%hWXZnXLo}u;$iS{_liWG`5iFQn3;5UjCl?UEc>vMeh;Qj zV@`r+e?H?9opHFFhjd4kdu|=r|Ad9m8^>^qU|ejWG6Hk;On2RzhV|Znb>fp+SofxL 
zy(h+|JKnSCw_Nk4@|XJq#ddU)=!E<2SI`@Y_dn|7DV~gLAUt^cgW-(94!U^ip!Y_y zl%nx_Ag^KUK*98GdmvZF_CVBr50VwN4^kWu>-+Dy4|4BDH8?-o3_aaGb*_s8>z#?+ z;LGA+IOA7VbMKD+NI|1Fp@WBKSvk%4ocKwo$F#(`HjW@+{e7@I>f0A+y7zz(_UBkM z97KjmPAPH=3YLer(8pA!0Y?jn{e|iNH#pf~PAL;x3+M??V*~FQE$l1c0_cR}xFVLb zn%-R~(7Ow={s1jn^gL_JbrxeUcX_N{BBJoDic=!h)> zf;?}vo!~2`p2J*u$Z=ggb+n3u%K&5h+eI#2foF(Bwrv z#77Hq*Hj_m+4q65Rr{X;fAHT}aQSOK#V9SPo6YFej8BJ*tXquh-2D8*p-re5?!?Zm z_*6ow|JHU?1qSC5{6`na1S`(kd-;wF_XD;hXZq-}0p1kHCnJEOcj9t>hm~Z9jvk3U z;(l^__!00{cD2G^qlwcq{I{Me*z+=Vnq)8b9R0Ot6ORUlS`An<^XH~5$!Gbj)3_^2 zac()E^||rZh!=d=l6pSBmPataQ4nnXRY0wCATUKaa%62+<>6A43?WJYicyZ)8PGG= z6=WsfsAq@2_8&rw>r%1Ap{8~_8n9MNfx2$8PP;4>w=n&;t`OBWU_Y8}a0x33DQFlO zK;ENaflI@u4blp7_8_LbBv80t@2<|c+fjR+5Mz>AtbgkCFCt zRzkQ~yv6}rucc@NUFO`4Q$)hTA%F}xyMdAM0{Grbhof^loNK@UyCm(!{Bx<+GxA0I z?T`nbPI4|oZ>{?Sy<_gvth%|qp9G5lwm)Mx2yrgR&-n&@ash!zd-vYU5Ov~uub-&7 z(_V<7<@OK9=C{U-FL#`Pjj1EnwJ{FN1u5^_9E~2wa`~(^XuGxvAGGk-{1!c9)m;Y- zF^W&KLoC9u#HNT9NzPbsksOgYz4wj8mf2-iWO(Y9JRFQ36n*@)rP80&Mi`<{9`Yz2 zs<4-~(+9~Ey+j{kC+#UB zbLTCXn=UMdSoy~G6MSj-R&0D!eJ5UrvG3)hRgaX5^o! 
z^WExUt7Z9QoRT(t=}~@{EPtBi`#s7R8uVXyl#e&)J3Y#CWcl0V|G7u`F^V2{2pT@~ zDBr(L=yyEJKk+EvCdG)w29HoWaozFL+~hC?^3@hHDb zmJd4$<^T35Unt9u$F;47O&;arW%<|GHg9;{qdZ5JzfJw0@F+hPP^6|1< z*#A|J@*G({nfgEOQGSdpA4dH@^eEr|dQAT<9_8C)xzPVjkMh;BTy`=eDC%s-~XD>U&#NQNBK5cF67_g zQNCK1-_7~qDUb5IWcgx_?_D0{3uU<&KN~&D$IEgteqQq^&ynRq|9^OtA0x|!{#eF6{rFNBKfoF8pu3 zNBMYJF6{q`M|qAczq=Rod(5N!7+JoU`hVb2zW?Qz{+m6@x5;v${~I3Vt7W;+|DPV^ zcgb?0|2rP#3uU>`|7nl%@v>a#|AI$(4$DKO4G)`TxOdf0($yl*#GV7UB_?i^tNUgD z6j^=E8J}a_!Sy~;(#}@3!k_q<&P8J@{69R7T(s*mRofmX=tFgyc8#VT`d%h@Q<^fB4DLnfz0 z*S1%9&>wh7>0abPzdW9PlA_Pn9MS$7O+Uzkp7wNl>e}`|5Bf!#e&-`v|Cs#8#nbQj zht$8H=1BcD{Ynq|?U2cN|5I(h6^i~!gZ{vaT7M7v<)lAJ*S04q`j<6F>aXbsdC*TJ z{k^)jJ^+n%K8&GENJ(+~2XpGf-H_#3F`uiPJN zf03r&`H+YHq(4d5ws*iC4$@!Kuk@hb4w+2-6@8rk&uRTV=$Dh;&|lHV>96SrdC*TJ zy`jINkJDe%@BEvG{-ihbhdWUJ+xN!occG?V=|R68GC3u>w!K2pUunet1J7#xJ?NK{ z{v=)7o}}na|68N!2YJv>Bz?^P1}b{f{);sI&Idj8Cw96TmdeCo&Os4*d zK2HB96T`{-1~bq&M`3J5c{CJBEEpN_n==+dP9FjAE&>jALKzlk@SZCiat(% zO~3Q69{Q8s&>!wV{rTlTI(}cM=~sHtZ--3Iy}Gu&LeZP?cJzX$zt(#PV*Bt>uf z-x^In$b)_&>0|ykP|=(AU!>`G-tVD5>0|ca0e3h^e@(yAgMK??GWA#Var$r8`g_nX zC%vJ+qL0&G(+~2XpGbN`e?=dszoy^$7Z3eOZ|Dzqp#IsrjsDm4D?RA9Lnf!Eu5GVS z^q=gB(I42P_4lA(PWp|ywmnJFKlYy({TfX_$b)_&>BG9VJy6kKuOkBTmqnU>=Y1af zlYY3aZSR0P9HhUdU+F==9Wt5vEBZM7H){Po=$Dh;&|lHV>96SrdC*TJy`jINkJDe% z@4VMTf6^QJ!yTyqV;{xrf1##d=|R68GC5&g+g_pQuh$WQ`R@j;zX$zt(ht|Q?MaH> z^uINlevk+KMAFCnZ=j+#?Y~IV@BFie{-lrDe+S&*ApJG{N)P(&kjd0v(Z}h(UhD5c zznt`j{)#?Me@#EggMK3E4gD2;oc@}A=RF?!lituD?m+#ow~hYS^ea8+w?igpxUOxl zQ1qt!2b#409`wseACrHQqW{E+wZBHw5AvX&NcxSswmneMo8xDZrr){BLx0l8#?KD8 z!$JCM`jsB^+aZ&wzoL)Rf1TFfgMK;b4gD2;oc@}AkO%!l(i{3K`Z)bH{mzvh`jg(! 
zAMQZ?KlvbL{|hz!N)P(&kjdGoYuhUny%|3atkwE^&@U%_EPhN<^rru<(e#5n=qHjs z=6?efy=nhNnttb>JoG1h%>Fyz4hQM4=~sHtZ--2#{)#?M|EIP79`wseZ|JY+381kp+D)VzdX;a9yS=YeWBhB4cNHbSdZH^sp8gSC+-kMT&^UMcZfDaQo(^Al`yUp)~ovgU-1Hcc=yn_i#wUPxHAyq-~*C_cx58f zy$I;9eV9k(;Y~y3osf;|0)XwCR^W3dkxllcU3_llEu1tf{46i}@RC62DBdc28Rz+X z|87NEtS?$|bvFg4{U2MnuYcke9{hj6tNZ7pQ7ek;XM%ya?shH|$Oo9!SdDvNYg>=x zH!gpe-<+Jmt8uxJNf~J)acyQmKCX!XH?Z!*uQH3A6tJ74e;Jbnf>e^@v-8~_C_Fh=io!!6~aAAe0EP#uEn{I&;A|c zK&3yaN~epPMd7X_TNb8RE=0GdZ$3+gXH+s?jiD<-&eB)H=wIU z1=W(`u@pGjv*#PwVmo_gqn(TgY*DixxPrR-ga&dD+7Q+?v8j-rRol!9u1|NDSf*(ZU{kCXK5{zDND7qQ~~wO``rz zdnoF+|Ec{R-k0I+DS6*A%NYF#FBo0nn=*Z&Gp*>vbY5~|baby+x&bnaqreEb_TPkX zxX-h>pspm{KK&1nEjl_gKZ4Nc@cRQlv^!BHIywy>W53sm4$tHV_cN_VD9xXntic=b zN%o}q<72lBNmTNCL4>dK#d!J9iuUR%#4WrrybgD61}j{Q3T9CC*IW#C_I4=k4CPy{ zblmp7@-o^hs88!M47+jiVPPU0m+O=!G{V!GHGvWph06 zv;CM}vaHBrJ5ssJij3VF8QEq<$8NPEt>SBclUl75ao;i$?6ks*ZC~XX_-C=^K89;u zQTX`!#zAoSwIaUN4Z9Veg|o}y3JtV~^Aw7#XwbImtmNoeu&wNj26sWk&*q^3pGag` zq%d7Pqk(i7wDMGNPm6Is-r$~3?)^1)T;mHb-s-EwEi-T-bu@rm?!m1tTvsmymyHaS z=t4fXT1mR?0MX58L`m8lu=w7kg?(Bgm75}Co0^08w+m3YIcW3Q9Bh-1 zU3@^I7HOc7kGcl>DGIcK?!N^pTnXz-15FmUfP*bU{0P+u+EHJ0?54FnKmlrTlgYmbPAqa!!vM?S�f{gAvg(Eu zpk+9KP|V%~=Y}HeBtclsX#tKdzUR5wuBPwHX3;$~FeOZ5_YAV?4jX(sLfj&%TQ}r* zvaK67htWRm6nzCSnpY2p*t8bc5UE2Ml;Jfx*^I89 z)EdQS1{YpoKX!-Gbtvj-UBP5beKHh&|L@%tCR0c@5eKfW!rx#_D22npUHF0tSW75< zL2#G<*3rri!h_bg;j1;8?L}e~3Dd&{nMyd0d0~T-F{?Lnl1EF&u%)_TXW?OQrikEf zvdWHQm8Do^X;yFj2CRN9S-pr90<1p8!@eP&6~2MH+5X^nX3yZ;wS~zBXjZ==E9M)J zDlAz&MOJ5lmGV^Q6&6dnOb*KR@AXtY!%iL`QRu9`qO+`MnHD&pi`X&fJ&1G0Z18e> zE=#i$ql?@!qX;JJ-|cB70Yc3l-5W-90UkEJz<#b?_426OspEQdZR*3aYawObke<{> zJMfvXgX+)r0pVa<5}Vd$&)ML=`B-#mG`K@%!eSg_7^h33fym zcUY7%I@XS&BpU3*H)!D}7gxcdA;+PN1E4Pi{nkh&KBYXi1@ujzZ_SI0T!UcHfqD!I z97qTXyPxch>~Z97ux*KgfH4Q$>b_*&vyc(=^oQI4aRNwmfCz@sU0HA#86d7p7M+Ed z7Dy*A<%$|^0?h6aaCB^Iq@L|-M00g2#HTV^!1kMT8JKloGDx?YlbK1mGzy=Y0FV44 zuQk621Ai8G^!DE}A2LN#R%7HQ1*LJIUv%tiku+_> z_7kEhcg2@Ow9e?r*CGWN_TgYBd3E-lztz=ksMJkFc(%L+J&-asjz>VK$5@oKAsT!Q 
z{wRcs20N?^f-U81UF$Rl0&z+<2ZnZo3r$$f8m{nHxQrk(%-pA&bwp>f;OImJ8O?e1 z(#|ne9~4}h1DS5qx?Oreb08*Cb6}jV(;O(K2f#UmI|jE#UZf|UxZm}}KmVWWiRqxy zp7?$>Jn?=64(W;Sq8HgdCLsKq9tbm>^6@VmhyXrx7C*`@{CZ4`+gYr^YKUl0SA*L) zm)ma@H~O^<#Qt>%1u^!54syOT)8+i-1FoEB8l3qJ5cW5iBBY!-rcL z*OVZf3zM}MkpcWPlN_QW4-jmmb4r)|&F3yZe~jM=2EP{aJJRLnCxe92YbQXMP)Y0R z`&|dhFu4AaT%Xofa;89EP1}CC5SfXk^d8J)a8u4$TwkbZKE4PYlLlt6OkenYDVf*b z21|=^EJ@NiHL?>Hta6!vBm78l1dkPUqIX<-sWN5_ZbhW+&2~ksVipanDl#W2N3(rGtYVr`k?A=DE;EB9%Xo3N+r`fr^_(X_`JK7>VBv;DT#H#S|_LL{T2@X9A*6~Empqp^yvS4IQL zPz!^TJ)Rt=NZ*(ui?LR*2@7eP1L?xaK+_yZX2-{SB!#vh*%TH5%VNeqHiN4 zB9CM)85KDwapSltgq;*_o`@0|_vf<S9ju2|~%_{WX9{j3L`I;=|tF2;-;IWm`f^|dwa06UOuc_X;ReSPlGN4kNp_ryx zE6$ZdT0>#~3Pp{K64}vWlfyEX!yS?Xgl@K9XPyBr^Dhvh1G7)A$&>*6*`oB4fI zN|r!67AOR@?PrRqJv12=ZI$ptrL^cs^u#mRD8Ra`MJ4BY2kh-hZiLytLUuK>Os0@) zh0=Ayo~^<-{aSZJ`>|eqaH|;v)R4vYS@1x3@qr$pzWAFQ>V>~OD|@;wQ;!Nt zDUqDBX$*_q35ErG5;|Ts(e05LF*!^fTNXf$>$Dv0W#Qm%`ctPK6}vGi>ZPjgYMsva z5uHz1z(8ZF^4DGkySl32pW77g5mAdmbCBSxaIupm(sOsiMs%Cx7|`Ka#H3HZcKhkx zKf1Owb+~UnbogtH9^C7sEinj$E%A)AW#^V)yRW?zf0ML9nQXZR6U6~vQ+7nCm@aE)~1U9OQ%r3Px~`Dvt^!`evasHiXI zo=SWnvvy_#4n!72y@2%kK6-l4^B8K15fC~8L^Lu3eXIYLfoMn@bgF4kBQG}sH^qYB z|2Ix}C`ZcXWv<}nrokdaSwhqrX^Jtx{-P) zoDc!mEeI4E(OJjSLQ8sYhnfPYYbg327#%6a7{ee@Qu+Sn0Pw8TP9=( z%OVK4Q=jdT)|5Z8c{_h*L6L9V( zdntaTu-H&0D!if+U!>=)+vSj6m8q&+iOh<%dm)%>=ewBg&e831JHcgYMSmy!cT}?`%=KD{V*5o1@K$waSDv>c)jqG&FHlsCj>-O$ygMq0N7 zC`BYdy+i`kJ3#`Jrk72eQvL;Vs4S=Sx329o3>(~dJ!~**v9|ru7%U7!Fdo+H_*yCP z8sX_$r*J&sC^;L-IWCWgu{6`X(PVkF%W|Wz49Ud-C0Sd`)QL4yNRCqxu$1roZj>9a z9=pSp+033!WCQW7Xu`j$k zpx1bjvCku2hK1cz8@;*%tqA=44yP)iK7xUB0NI2nk#i{wB5NIiMHW)0nqNxWW8=O! 
z65JIHKA$-8#^x^D)iHkRm2POe@pji_ry1RRMH%$G`Ij&mkUr;nB?{wniof>Xs*BV@ zi>Do?yHZRsd1B2V{amICZgp`65-SmPd1#VuMfo9q3tgRmRqyI-CWg^@(D~OGOj75?Vt#-uay2P}GsYy^wVivwx!aN)|CFXW zl!#FzHcP6s;8r*}k_k9_OnDm5lKt+Mxf+YQ?s6?);U5A`!ZhK(HIL!4QYX#3WgWqzMuuS&HT%S;9QTO)PZw zArPYd6y~&h=sy#4WN?VBFCj36y`E!$0!D?V@N6Xd;%u3(__nMK6(_QJMJUS$i~Sav zWEnOBOZ153hWGe#P+G8E+~U~AQnl6Ruc?+6+lm&^1!GN{jAG+vzki$VC(E|oz8We` zcv=vdAmboBHI27=L_o~4;_#AGta5C3m6HZ6k2&46UWVvMWd|ZZeuJIn;_>%L7Qd|~ z!lfmbnX_XxIZs3?+rSH!3vdAiOYz;+0z)yG$4PDFy~S{7zu2?F;2@cR7DnvRA%x~Z|@V7&(<|f7{v8f>n26u?b$Sn}Fix5{108|p|pi@EB z%>s(`z1>(BQ&z^XUd)b!*w7P)&wARH`Ijh?ykA7DZO1vBBulNt*K?9gxvi}8AnV9& z1y>B16ed=yFVNT`NPikT0O@s`8-vo}h&shM|2mADJ~%~}%a_eE#M?qYFn3yDA`9ge@To@w;|%M`3h9RE|R6rU8$?l z`&l|f_-mh)1UT30t==e|k|?|(b}mk1Yo;=S;+mP*7%obCvtZ!sL(0JIXLVPdQM0?0n7kmR7(T9JWQlA*?e40Z zJK%+Y)@+Y~yJ9GEh;P+B*KL;RT`6`5WjZP6sa%~VfWzmz#gO3%rrZ%;8Iju54VerD z(1@V1TM&U|7xSOFqb_JqY(V}*0YiBiy*(2KPOH#zU57S#{fFY{-KDnJ!d+|-*=q6IZFfnc#qQE*PBdY;>1iXH zb(FuUWOp8vzD+}Wi{A-ay|4~0IO0q(zZxp#o>TRT>8SF-;k#{C>`?oW{W z`7ZZ6BzFv4;WFK>(8SQO3p5^XfUoLA&=s!Q90-7;^x@cLo=$yhMsHB#;|GZI!Ch)k z9V_NtN#Vu2de8sRYkbBHK(RNh&XD!K8b{1`wk!zw!3VayJJ#}Uw7eTFD`w6n*CZW! z8=x8SM61OeAhA0TGfAv_M3LTvM0`Z1j~QR@=E%`LVvg8fJ`%SUu9%L_c_=1Q2 zcbago7$fuImF%y5^ktS{o9Qq?2j{4 z$V73;r*5LEWC2ea5XqI8feX2oI$Qy?Q=nZXoUGQZ8I?`PaS8;yuwt^L{DLLgm~Uji zIQ5XrMO6XorATY+FsZd&SlA_&Qhu%V4tCSJa>nFY-!*i1bjeybo@KJaI;$uCymmAinpNBAH%x3S=GJ#9GE8wquAW zKUTr}!r{fQ`KovdyA^zqM!aTbLGnn5bUJ>np8;OWoQLVj%+REbGF*Cfie!@fT9C9a zRvY;kf1IA(du)1t{}Pu64*B$SaIgRY!A&jqI++m7PX#3$G}xiG@u`Wrxp_fw_kzE& z*v($uLGfRk#hg|ioc_JcVv;NnB>#_iB>U(&LNm49G{u{HH z+>Q|3{(mrwOg3c_vS`}(GK)!)FG&6yvzXkzDxm;B)+|Q6mZ!l#D&}zdO&oLN&bTz! 
zNJ#7a4c$r#6>&~%wjig0I5)d6=z2~89W9_fC^edJgtIi**0WKaNg0>z zy77a-C4cbyN3r4;lbGxiDHmf-ZT_22^Kg%)J3T(lvm+0m8#?$?LBYs+^^u@G&xu5u zCxA1m+~o5_D6W&wmI+8cuY=pk!g9b5QpVl$L2uRjb=TuUG>a zMgzZL1APu|;8yPjbo+0|7(cwgmE640aSa(ih92!E`I+C&*pFh}$Vsob!%aCrq+@CD z)9S^ajtcz(f6GHd@pnP!Z2Y}Cbh>{nRZlry$x}DvP>2Bk6n3&NDN>?1=U_;)iTLqT zMQe(d)mRd&`;fK2_8wgZ(+_%0*p%mskI#2iH*NaVc&Pp)oKdLmYy>fxRr1a^GM{Gq zsn=Z=^I|McMm^o{tH`3lWl^o#_dhxI)1~zqBCFZBStAr|FG3rmDshS}+t~Mx+x3UW z+BnH*V;bA=>o(d4k@DaM4%H2~o)$Oue}<24VC-`XQN)6?p&DQeOZ;MkzQc=56x;qM zs~Mhf{HwP8B1%i@b^AW@k+C2uLKl`13wldcOGInYdWJChx|G39v?-4s4wDw<$U3aj z?W8PJ(iZ;pIB4HbJ4JgM`D_sa#a+skAN7N3>hMCU+CdRd$LdO{N?oeK{K5sHbI3@6 z{C0EM zZHgAbW|cHNKXaVPEnp6v*HFGyiwd_MBS*qklB~mE<00?Kz=-Zul*MqmMjD5SnP;oVN4t%j9ap?Lq$}=r{<%h z@EmD}-$nAklg}~FrXq_FSKnf^y56V&jvs~!Jo@rV8J!uT{N-*ZWE%RH;jWQ=r1n5( znUYY2MB$)LSRu70qbdt@1KF|Q^7rfA2F%{Mhz&eGQM)+9lCqW@UhSC>t~^F-emeg& z($3ek_~32kL^ttp7lc}>tHs6p%owL*3^lvPg45}mQ@fZx-OYiw5!u}6!9nuIj~Iqe4gEZ+WOM{w@gh!r!aocY&nH`ZNC+Ik+XsOZVzL00wE%eeUYw zuQ^2x2aQ_buMyw30`n=93@YcBNwe1t83G;Mpi(>zo+{5Zosaa0Fg&?xkMLxFt-6Iu zrb}Qj-i%e@7oHb468pU&1``gTQZ~$U``R4Q+i{u7-ZDPca+z)!X8Z=pA;u!f!!h#H zj857_@>>n^Y)x*AG-S9{pb_IJZj=UQatrrb?U}zv{E-n97B8GB&{0{3BKTq=m>>DL zo+2b0BK$qBDbcDa0R!e4_yRrMQZ!KI-d%4)n{C+XXc0Rd?Bh0^!nXNB`~)D4tA(n< zC*ymlzzWycVntrE>&JkptX3uXh)W7oMS(coC;P#09)3fFzxG&!dZ2L0fZx=lKkdRf zx)Uzf0hdF{%?TGGA_r$o_-c;ZjpmTOZ8VI~b+yVToXN5mNJ$e5%JnfyGn(8$%FQn2 zNs3bYaCzuLilD(!g4O&OD>E<`k<|?@tF5vhph~lS$w%}gL7WpKz5q4!)G~_1!(8I! znz*j8OxQq9UO(BXI}U6`Z)hDWIeBY2SOg-?~r=yb*_t;9sM$?cZ`j-{IoW4G*HIfJqBayV8P%izg0|lo>VtvGiY7= zHQ=CBnWzd++&*E_!~P%$2KouM*rMiaDSY5?^c|uJwv$ta#6&b5=zfZr6BDr=|2Qvb zW1)oaB9w^!b31_}1|6r;JY%%HaIL2)=CYlawHr9=WEWtp$gGxr%%uE}sR8kM54Eg}>*=Xw5YiuMgTIUD^v2ZM2@vN&&bG$8AV)dJH8K zbwdW?vZUK5zbT9jwgXyxV?sMCCX_j+rh$ij4usMnqDj%lRN^rIWQM0oJxKi-I#5${ zohJK3%|mqqMQ_nRPNpGqHqH;q{%E%Q3(Jb{5t8FL zfyT? 
zSiSuh1`B*qgFIVO15~r-w=LikcH#9&=I4WYgo6hJUm1I%!2?*WKPK5F81?wf5F1r~ zCB^>EK^-5nv)n<2ZyR$@s+c~h`-R%u%l$Pk#k#l}tzJ0FU$Z72CBxY}({yY+_X@*F zdyjOT)VbGn4VhRn`LwaIi#d-GNqFJo$eP`iyo^z5Y7nz?m1+euySXLfT#n9PgwF`* zfj3k)rGpy3sLWrp#I>7hl=IbK_4(iqh|z(`c&kW2X$ySNhzTf0yck?FM9r)2dV08UR3-C1;rCa)>CW4WOWa{& z?sy$d)tAJq+#V3LhWF}uyB!N+SRuqKu7dd)ZDj7RnTg@no>UKXCvLrNb>n%ua1$IT zFhG+?yTz!G-vrAmkI*a!iS7$_wu|rP(Dkp5ufITjV;siyoqF4gljOql4X)CwmQIyQ z^S%Q1fkqmh^)M_h+`3%|qv8!eXXPs3`gvsR4lQ9ib%C3Q7w<@_JXT}-2&|md@Bv1y zJ=nK7!On^a;_Wo%Q7pIJb?_rt=ptbHBM(WhYJ>4)rIeSa=)v%2fg$pj`Jhm%OtnkE z6;Efy9MJ}k?6otRP6RnNREYgCB#4~z$`yNVF4YZLVK(uHSQBD53Ld5=1#lMqE|E-(w#Po}r8&;1UUgnJm9a!W;m$yeJLL={~$#?6Q>pBN}bC(@hdIaiN4I z8EzIGI^DmgX`kgkwFy@#A{(riKMLbg;UOC|dc!{dxF)pG0h5c$&#-dKL6rsiSyX;z zpZ}Wm-m*NnQ~H1qKN=WEv14cFg>d_ZZl8X`a3@4@4} zUI+8oc8S4*-V5tj@o~_L>%Z?HhYdjlCwj zuY4f`_l@A1@q_m;l-aG9xLL6 zsEY=MaVeBEC9MkUe^TTbIP#V&A;{ZfIKIIsxVt?$JV#vF^HpWrmj{*9_$>AJka{z2 z`uXsEL27gkPvPeQcMC}kae%^>BX%75{WxgE$@?Lup+UsQ`nQ_4#FsvAz$pI4g8d^e ziHK%j557=Zv=y(`0^Q;dQA?c@%td#952gC+CcamH$CN^+CX1eGkv-J{tulpHYUh4A zy2)Fsdr%@=>_=#|tmpUA>KD3+AFfqucdbTU%Dz%sZN4~>R?ng2$7;0!`lJD0qgVKH zRd1I(T+8qyTrwr0ng|_?QZ_CYBY<28Tbzbr-r4h}Z2RuO4~iv?7-U;9GMNfQ#4b@AORpFPZs8_;Pj$2XTS_Gk2*rYw z3*FA)h6W`UqP#ApUErVyqbCzWCId42&Snh(HrxNcNW6;-XFNJlr2tEI*F#Tyb$V?T zbnG_0HwSO4#`l2WOV~x}UjOM$kzhSjkG=j=gZ04Ik!r91|FQQjfK^pz+VI-@+;cmZ z1V|9c4kQ{RgmZEuL@oh|LIp`hYO8e)NzMtRh9vHDf(OJJR9aJtP`~#7>r7|rICjQf zzOPI>?I_<^v09-WzYg~6ZF)sMW}2#PQ*Ar8b(;Tq-?jGToFrgJ`~Uw;-AeXeYrX4T z?|N_RU6;MjJ5Tk8>z@FYt_Q5r_rDZ=N7nVRat z>%)h-K79C|a`feR=GgAJ5>>!PHE`l{82FYvF@DO0ZX9i(GBpB$jdy+I(3TI+SbN#m zv4=F?_4PwrK63n9V@tnD*bjS)o_Hk)6c)9IH|obf`H4?lMupG*#?Gcrpb<3vm2vt| z<}4V-zq#b;fv1~bU*lh0^7QeY{r&yplf({vd{VK4df)ZTp)L0u|I%k4{D4hEv5SWB zu2YA)P7R!x{Q1z$JlLlWbvz?th6WE9X|LE)UCrEoo zFURwE*CWtB{JWjr_+C|44_(6>-yvwZ9fHPRhK_u%K=bepX4m%+$u-Vo$pu*kQeO`Ph!0k?_X4mL4b5k}01e2Ujj zc3R;kJzM^To(&7;3*Y$;oAm@jU#_G`P^W-^g>b3#khu&@yEW6 z3DL=eXYdzT#mR%`@I=?igA?j$`Qsu+*2IFABgR0+93=P}|uRyvR5Wfg3_^B8v? 
z+tuUAgYb-eZ9M!TEN#j8?xS<4o$+tIyz!GK4`LCUuagJgZasA!qo<8`ad!lND|;QJ zcD#kE9d13mmTs#*fZN3VXbB%TpnhW|g1JRY?y+rh`ifPST2!)@|8Ax#bD<@VPy;_+ ziV)@TT_~UU8tKD0=9dx^f3Vy_N*}jm8XoHe4fK@;&=8U|tR;kV@@Rm31f)ILqJ06Mec--;$!_>c*NTb~S13^~*h-uuv@1rxv(MRgvk~PfZJ-AJ@u0UlM zIDP`)@IeyhaVbaaUq1Wt#*ZnGL*3Y;v2aJ^n%MIn=S85aT?M4Vueum#8=qA*ND$x2 zgJ4sA1PxEUkg;0>yX-2mwR7K|v9){ia4A!_t5R!&c@VDFW_smTEuCUT3LhFJYyJXw zb$q8}-}6utsOV2|oA^~Lu@5nh)K7ln#O${9b1`aI`nh;D;P*fmWhZa3tAV&-*{cBB zB@U<}c$AY9n>o!o&=l}O$@g%(`-`7O}uc8O{&{XJVo$7Yq#9QJ%mh=MASU_eK)nV z@wyUjmH7l^J`D+F!tdCT0?x zS>d5jIy4Cd#cNKaptnN;$G)98TK^n7IFRCB;x_T|rs=fSIkdWs z>6-+<#HMwtTZGl$BT}-~>r?D&Ui_|`Jj30!-+aPdG@~SJ%C9HN=g@F`xb^?EGvz6G8R+2-;h7neUmln+CDB$pj~Fo_}1tg~N#?ezQzB0IiB(z8v{b3M_MoPnP2 zUTxFk=41=e^Fx@rq{m_nbM5orIS(muS^zAF7wz2|>o}BJ6y&9S)>Bf-<5x|MAOA;u%nlVS^@#&u%Ls%Tjhn|drE!ZQp24J~A$ZH8(Pgz6)M0#S_mXiz8E&p79UaU|8 z@^koy`oD%U$gaDwI=R!aj^F+&_vV5d+^ZT0HTI?u{h%={u3{;9RT1w^*~xeENpQ4% za^h1APg0=8Xo3x8*CWM+N2V1TS?Ouy%mqLQ?7lT|a;a>TvCxNH@1iHgj{`3nQPRf8 zUg0xwpPS^{hbH)Tco)=q^Z3CLzVZ44hCCy@q-!j^WXsX;lI;(Km)vwLyd-@hykz(3 z@RHFp;U%|Ey8Ok#ji`zr<2rdTg5!%U@skHD)#J&7B~~VaYijY(K}iOy;4zM|4;p~M zOoDDAOOYvO9BzG;)DAp)_%iAqMgCksTW8$Z3s7r`?r`h>RMo7{B!f=>L|5_a|tK*mxa&6Bl3O5-vjef5tpOPe?gvgBZ~+lED0 z@gb^(DvVotibonbhYNt@#~_-%%OgQfJ^}fYc%-$Df6Mu8N&4=TC9?jLqDUSt-gxM9 zq%v42Xt5Rwt^!6|TO;Ii#vcnZ*3Zegmzs^ek=jZVHPY6T2N$R(^qpHf#FJ~~B});> zgZ=eH&0UYt%^r_9);=`n>{T=%O9;uYS1x6SSea6%Q1j$L_&%jmh;-dL)sCd2Q=OcW zv&KV*c_s=*Ql@mSf6p34bRS~sHy%2ySSo)24~s&-fF0yHMEEpp^3kVD+2OPO#eYjE zF_zE-DN9i@7I|8q$AKBr+XQt(9nXSq0^lSdCaLIO*)frSL*Pn>;P83|D)H;!G@LN@ z0|cZz?&m-bE658aQs;bpvne!qAaI=m_yWlWO-ZGG8jK(B`UQ6I#AxltL%wgWmq8GX zeurB-CBYK(w`l2wN-XYxOxmR+XDn0CDthcjJI8N-#^uTA1Eh$2z74icJ-!fCC;o{A zDOb$++iJjhH9Hu^vr&9B9L3Be-#SIq;np7|T(rLn6@L6I(Bb=Z$8obi#qu1v9v(@z zOrQ+11^Uerz2r1LxRs^u{%D=mU9{Eugd2Mab@y!>53AvmgOKkMz~Rluf9XF8b;hei=W zq2BfvH;w;#9Uob1b9eREAc1w^$7G>s1>$FQ0y45-(g;%AsnP8dtF2a^!o#L$vSwc($Dr(bsc{N+*nnC9<}-wnRtCv}g~ZTv?r0`9J*i8Ow8IK3EMdUSbjI+IOi 
zcOM@??&GHzJlKz=vWYIXjOR!$<4+!^5}tmd=GCC;3opjk0rL+h?nU#oI=YTjCW?ez zl2s*_QpvxOi1D+z)Hue43LY0m--h3@B@%CZGF%Pcvihx8)QIDgz|2ied=VtjH-0Zg zQ5%Y2yyO3T{H=bg{6ST&RwO=p*FQ~7tU;?2|BTl9!_~wDA+|q;;YQF;j6P12^NlNi zw`u%0;rINUo)3E)Hy`%WDQdCtSvZ=KJKu8Ht7FAHgnt67#P9Wv6UIc#!P7d>s4*`l{6GtigJWX*x0ogE!LBO~dd zl;|)-UCL1Z_ZJDXS6neXI+z-|G;0h4!nid(uya>7WAqOUrj5D`po>KEhHIMpMtc*9 zuH7Tqy;rBrq4eOUJzZUU_Mm9R2pY(y&F;2ja#zn_|F(D%x$Pt7@a|#C?z0OLRM6U! zN+Gwud)GEfUelJ{WsYXD?bM?r8fhNsF}qv^iKdakp1$<1;Xzc(G$j+~D{MMnVe|P4 zTh3S5dcMN8^A)Z*Ut#3Fl z>e-z}P6jz!(z}Pvy~Y)T!+kd!*Y;%7M*Y7TWvXV#K^vz1&wFgR$WbLqZOFd&QZ2OY>l*M|g`u)-QWFp8m}o{TXt zWDJjHM@F;m(oO4K`P(xvQKPSCaG=-h$qoz;Ip!yeh=;)EWJ~!P*~IHl?kN7nGuoKZoh^UG@G{kU%n$tZb|lmLs@gQFKY~qLK37wd%(Pus*vx{ zSpV{vw|N+yOj!alMusy3lIg@A<^Kqkqo1QgOe6DYc!=1S92lBjPCZH+3IKdo^i0E3 z2#2`=T?N)^+&DbKblp^Bz>ZkJhLks)YhOB(A;DbHy3|Uwl!tACp_&StgXQ(@N{}*a*|^WJ8%(q| z$EmV3SzD&a$w;y}3&~rI)G%uI4ev^ua5e_HKm+;NJZq(#T?P+d#whWpNXlV`QK^Nv zpJ-0r%wt~?utV8R*I=kiv?P;m&zorLR(|-(VRM}WmPjVEX3tQD9C2dB=unTj*U3os zZ|c5U4WHL+gLhA{Gfs|V%}v`@B)ht<+184dhK4bkr-%9m29w#IoybhK52QfIKz6SK z)|BWOytQX<2G~c_$q_T18t8*9b#-+Oc6DvLDv7^)u8SwHLOECuIRY|3L z2D6ff%oXVgmtlP2acT3WZP#_Vho;><0~jNB_3TM2@2G}Xa+FeIG8is1M&01xDF0FZ zMb>kaNe%}r;GS%c6iT*7VsI09G}FmlJ36fHx2XGWbvM<0NZm)&eNf&bs(wV(kEr?) 
zRX?KYM^ycYsvlAHyH)*eRli%+?^gA@RsC*NzgyMsR`pM-^3$sPv?@Psl`DLw6~5D| zozpi<`bSjzBPxGH!H+2Thplqe&WLJfM71-b+L=`4ld61Dl}}pb3h$)CJE_{6RCL5u zJ8_jCSMYHKA6NL}3V&STk1PDgRQWMgeoU1gv&t2|V+!9f)y^^1{!w+0sC&1%pMIC( zlS)sjbX=v6sq|5G@3!DAICYPx`{aPc=ieglwaexGN1NrnTivbvxJv(}y5FJh#|z*r z_}T8?yE?x;ZB2gq%xanLRrlHO6y~p1@S@-<7$p)dJfTj~u(=bn9h{q%*CSW-^23SP zIxCCs1@+Y^)6}Hq?8!oOkqr`$D3^F93h?D2M`9NI2|1`uLoY1+J7&ROfG^SHMugs4 z^^d9gBNn}dgsF4^K6@%}4Yw3JcZqkZ{!PFr|X{bTAb4(G$QtNx5A{38l(ht*!M zgfDE*NGUv>k8cq6HMy1^$L?;FOzQk7r$y(d)m=FUVaT+2q*GSiq3)+SUqdD>9(wT4 zTksD_AvzTv>u%$dc&+y7eh0wa|E5I#e@4MIFw#fU$0SDW4vdsMRWp!@;Xj$ZtVraS zkMt!m6b|>vWi1*q73gf$^U5>n-GHl)=jw6|m}fQ^^hE0uSW=L98vDdRYS|h94y2MP z8gOe?8*$E4VUV59?Ao2ga+T>oAZH*YlLd6yWJ@GB){Wl8jXL!6L>vn#uI@U01enzi z7aVbjVjZ>}6+KLY<3gJ4m}y#Bp@?u(_Z926UzxoA+D+GWS^Z90cxQ*dwrkx6w4(TZ zVfbxZH>|tjYMwpP)R-2M21kF7_PSnYeWDlh&nvcGbImj&T>O;5?(B@W`s;$5k?{&H zHIPy9?sRQt8O;C(y-8*3rD`l8S<^Ku99avXta20p5Z=GT*pDOL--(nA)-%6cueq44L! z6=>~DEvR}}(&!ybC)2|zNk4T3Mq%qE{FFyXs1wa(xPvyOWg$8W`(zXjdA7n&6`y(Y1>R&SG8` z-(7H6I>!nvR#<7t0&~igF$x_T!4y878r*9MU^@N+>ak)+4*s>lDaY#c(?yTO>#P&> z(;{@w9;~gfEh%bK=^T@$+Eex{dv2S(^x2q!NbAud&FO4K9mam7WxPHtAqz z#8=Qkw*o78Y|FRF^Mc|)YtVbQIHD%;BY40alPbB0amCUV9x{R8Y|f!lROLYQJ$+0 zJE;#$!FlkyW5t4gN&MI5)9V@o^67BTPeDl5D$QhvNAki+xMxWZ%kwqxO~K_UL>@cx z@!4>*vLhcJ#8{!F47@D*NWbI;2AmxQ9e`Z6rWYY%h9|P;hwB?PfmQ)3BbNQ139qqi zjUg=p0sW2UwG>5K`Y;nbQ4F0iXs@aP!&!FJ9HN1Rw{@}mB}F+)drtTPk0!SR6)u*a4ds_b0cO5 zHxcJqV6xo|c0(zfaZT%2N4}EhYu5~NuvE8DA@ca(lA9ZT2E~{rCkH-HF)U8Z2H)2+ zL?ItFY+zpub2CiLXla2qmE<}OjAA-nuT>#ryQ3pm{7EZ?a45A*!{)v%W+k^->rK;) z3$!y$GiHas)x=RM$qT6)%#keZZX37ah)HGyN2F*8q=wbQSW4Ww3G+?dEcsav^{*99hPVLCFo5M+(ia+1be+ z%nDaPvV_Z~2M6twFUX?4Q6AWz!mJtxF-A=~RDenKfE#}%okn<=rW0};n1lBN%95El zi(Uyw+PsFhfO~uTZoUN+=g>S z4Ko4Sa88_;%wUo}6D*Nn!?P>fGmyS@W=I=On$nDbuI?K&HVG(`*tp@^ocE7l6l2W`!0UFA@b|9FJnGiFm|~+ zDzpl5x65t5y9(Xq4!;>7uO4!YP?1<%BsQ$OZXG*Ia zTZYXxqi$JKJcB(FMv6bP*0ZW9N1O*os}gj)z;;zwz~m7VUIPPxdu`x?^?1JpgCRT+ 
zoe4$oF}*)CwMU|J>d3R3bfyd)9ZWgecfs}opOdAsWMpc-%AU~Pk;T*W>8IK4?z`wJj161M{Hm2@4r-en%0T?F_oq~x&lHISIbBK8+ARqAMfbe ze5I~w&G;M@UpijX8Fm2#72Z>{*@x?D-_@d1v}4b&Q~1z^Xw2-wRsxQe;K{o0-7V96 z$I+*uR8BMw?P(ks>L2C@GdYHhllcHPZ*)BvDO$b=r9lZ1 zRQHIwSE{?A?)B;(SNC?lgA?j5>qAB=E$eGS*4KoruL)US6S6+`Q5jwrvc9hB>q6Go zRefF6*HwL8)z9zU@1G*^7N`&-=~SrJsk?O}c=uI|#7y-MD4+|~F9#fWFzq^}a^mVv zFw{0P3-?NuPjUtgBPwa%UQth{?Hu`F)wk|`Wc%>X()R*z@vzFU?p8gbh3dW5%UzG! ze65#N&$9oc&&isW{eSx}@-e+p_8(Vx;~$jx-S%JqAG7}~BI(BSy#8T%iVMNWbB-5+ z+BrDf+cU^sM?S&}4P-fuNDm8CL)rLwaaiOQzEtvhkF_TL|9E`Y&HF!^4m*nUN7Lmz z)&Jw@^&iTPU)S;UKcv0AZo>X>cJaDN`-AH5>!VKT*@a#^a7q5PfZDQnEx;CcnlP|k z5T0W3--BigHw~H>#BJfd(5n@0X1o?yha0a2I*;TFR!}t0^a`qr%-M=<%4P&`@p`uM z9!nTA0$>v5?#+xassppUE$iyTc_8d=oCn5D$ZTMWmb@2qT@*~qx7(do;%+U!kY7hj z_TwGYv}hy_7iRS2=YWT=YyNq@&{g$!)>oTdeLpXZ&4=?sso2(ex9mdiBsq}wf$BHk zjh63#|K9ILlPKLYidrmxh$LP&Ptmcufu`Y+fn@({pJybuUE6h)>m7AlK>s!88J3~^ zam1c#?ThA_e>5F-T-YB?m)&F3ABUj%J>}hh2)Ffy1mSPE5MCcue?T|zx(WNknd|E& z?GI{JuZz0hV+wSEx6oaI;RR<^P$OGfuLaoR-)or;CX>T`JLShPMC}E;S1aS4WM+3$ zJeeBUlk9J6N#auwx8mC+yOMjFWIV3M`rn7t*D|{Vb+2_YcVR%sG+z^}!~YAB02U>0 zxFDrnt(no?yUk>>xv33&f9+KUs7m{^F!O?)nE4w^7n*z01|EW#8K<3Yt(*y9kKB3A zQ;C`2Xma+tC&9CREXwNPY!g;T56%N&OaDADmeG+W&jMEP;VO%?{MoHT(6oG;jJy)5 zJ?^Zo7Cx+!Y>iLT6E~R1pC34eJv$5j|5o{- z0{FjE`6}C{|5=rPrtPol@#pSSXtsC;YB)XHCp43DT)wVm50 zhOf2vOKHEAu4`1jOga0hNycMAS*1ek0mD4vVkbjAZ3JUHqGDS9H$b>YES#4AQ83IS z=1j{^f-xRZJuUx-oR44IDY!0zA}j*@WK0$uRSNJv~u_(-wS<+MO<=MhSLT4#v+b<$5j4m$SW3cKyJZ&H55#ZT4lZRk=g;_z)ub)C)= z)~mYKxkd10KRt&_4vdY}i{R9OvHBoCNJ&K=}6j)Zm4z)SeK4T;3!G}V1Q?OFJ( zb|)1-rnUF5gm}H}ov|8H_YuX$V)0L?_NDA(E6Y^PpoT^eC%aTv1_AS3Lru+uMydJsLD>*4r#fZ}HX?zSq zffY@!r@apswwI?T)O*Fp{oLH^mRsTZm|6Amfj6eToYF&y4R)stY#Kl5^Xa&}%Y%Y! 
zmOPtxv3%zInCm&}dItP*0FcHJJU!Fp1-$(Xc>5Xf%TP{wMDX-X7s1oBLC=7Ad|8dqpGacozi4B<^vN$9Z_Li1Fvk`uFAYQEFob z8PklWP>+5;+(B^yzONgqYV`pqoin}vMb53BkmfA#Gfo`mJ)qL*G>r*oR6V*; zy%ATs0qIItn*8yH6+M(q>3Ub1{O=?9M;Dcof89t4T$Hx_Cuh9*=S>d0$$>XH@CI?< z`Nj>`{kA65DeqM$fc7o9AN=hTRm&*<*YtZSZ(eV5;7ty^$$>XH@FoY|MoBs5f+tpIpE%abnPB(irSwN_Xeo_s=aE$ zL=))xGNjeQYz@-AW%!8~U8l(q!+*6}MD}@g@h3<>_i{j-3X75)mZ4BS62Tj?=fWuW zdqfiD|1u>wZ3>+*PMrb#x#gn76n>NlQ4aX0!=l#o@COY%{(FTu8y0~aRvHyaR6k(q z5}xWHZMWAg+pEHYB~9r(I{s2Q+GG6NP!4>xIVNAy|4s0BT!>DpASAydIc#AfJ#}U( z0C?~A4A(|E)_+hR_81-l@R2ax9aD6h)-rmB(hR@m+Hy4=W)mzW=@KgAsF zV|s&DI?{*ouY;Zt=pl59o=5kgwYMWlI)eTj*z41I_3U33mWT(-1ef2S%E7<)?3MiW z{!64pytGcNv-L&xZ%Y+m`I70G@q^)vf8i|nb;0=0Zq3)%$*_b!FTGemn1NnD(L1pf z{R^Ofs{Mz{xIY1;xxBEwH^4qFtp5-QT{ZYOqyGDK@yXjngCUsbsv>!Y9aQP!Wr$|Zgf zLe`JmDRk-~+avThatIrWB%dHwTNagxNG=qz?Sbs`PG6pVM&Bp(b1HZz+kf>zQ`W+zV*3k?_BsN zO7q3=z5w;lPyGWwsedeA;`aon*1$5Y&d~5E{WZ!%Qv`377y9qotbYoYApJk;lm9*gd$sg!X*LM+nOw_X z0qAXn{Rp-n57`gKyQOB3ii-Qx%388u6IyqSty_8hNU=`B_E z9yQCnr6`|@pg#1)!mm^L(@akpDTv~iH)@7`#EvOL!pCxDW%vo#F!u-bV;GM)9#Q?P z&6Sl>`(@NV*`FD~E&%GE<@(-OE*!Su(VtjG?lXx62FtfL7p9R~gjM||S&a85lASz= z<6J}iz97(z0Rk7LO)m*ZUW*or;+zOIXKwU}FJk=sD)^l%@p% zDuVv|F|gyJ{==Wt=0q7RMTB)pKk%H=M_7wa?;OzKHf56{UrYY{xN?k-U7We-gg6@`w0J2YX;aq1gRYL^;}Tw zAO(YYOaD>AzZP;IlHsUlED(_*k2bkbS;ocKeG?QGKhN z>~D+|fZ`qESEfYbt2BTQ{x=iky<|b@a~sg#GRJ_Kypn&d8R7axR(&(d>0(L)zED4=>jSuj@yXGLQprB050Q*t@|NTe{V4ReB;N@2 zN6FWKKV}&CFGF>4B@FhD{)`F`Dkx3- z4u-xH{_^9jq^GJmy{qQI&$9hPpBLwfLSJh6;@@zZ_+?6d3Y#B6hdpll*ef2k5V*&v)jyvxGp|tBeOUY7hAD%t>={%B6g)`mtO& zkvcZ@s)Vo27Ek?#{Jm9wzgK`+bOqTTygSJA36zIfKi(Z?|HbLAQGd(0f9y}#o=aHO z1Ku0VbJ91<@w-}FRRv9;{?dFG z{HO7k&II`9g35E@NL8-Pofh*^pJfXe>i>LC(~Ce)Wl5Rg!e13D4U&LZh)-=5Rk83C8C;m@I})slm2kRP(yLX)%8G4xl22u@ zgydOD^nl*l%HlAk%P0+ds>+4wNUI3afnC^JE#%MgA^k%As$7)g2W}7c1b#$qen5VX zKKz2?=~zrm(Qq@>tn@V|^{tr3bI7aqK9yGbi~jP^&in^GwTt9B(f8d zo?7&u+PC~8+J~-@KK_d4)81Gmr2xO`Ztm}b4^tB5lfTdyAI1|__yuu}HxRD{N&h$= zdIs$`N&6Hk{;1zqQUVV|D6k>EkjEo-c;w&UZ-QnJ0AdWKK8hb`ea1inS*)*y8D!D1 
zKO_FD`ALZSKZC!GZQYhft@2n-%9r^Cc^YO! ze!xolQ}A;tO!A&O@gC+c`Qy0DAG_q~A^ULBuy>}Xn5<`w)m|`%gG6F%El~PMzk<2a zl5+=xtC%I64i`WCiOaqqv@!O7i_KsZn$I$0|4|F``qV$JV0`4CYenQIfzKaj8ByAl z!wHWOt~7@CxS^#=^J%c2Z$Fdn$#0Ui%H{1MK05DKBDYdk1IKuu}*bI2F^fe9AbANa}R zfs7}j{xUTll$Ft`a_$e02c=~^KG1p}BL9W>ndD3Kp?_$f*DFV^{ROZ8Rq17+7{L?Nxvjw}J;L&eSpHr2ak|Lz z?|NyG4Nv?IhA^Q}{v7?U4HiWx&HkSKd$fq`J!JcLB|q{%?B8ST-=l|51_vgvPgu@_c_=A^hZT$tKA^ z1Tp?ge~mQi!~c?I3Z5f#c>v9mzaR6GJTRVw z2<vOLDvweY{m`&eCqOVL1sDS77 zhL}@dt~c-mjDrNk*XVBv;pcRuk?4(}f1OM(;?JNNE$7&W+6TXaW|`9-><94;>;WvN z^)HsUjF+RtU)tY-G#{}03w?mTV21m4tAD6ZrVCxL`tPa_e6d`l>~|Cal7B*bfm}Jr z<=?Ml{i!u$dHPA~g(M1|zrx` zBkb@O@R3}M=Qnac5%YmuK&B!ClekIOztOIwpNUMBs!FQ66-P@m|zucXxOFU13Dekbjj(ysX$@aNfYuVDJ2t++t{ zq%0?S^L4GjdK5{6F0xP3R}MF2JPCPN{-ccj$6DKeggG4+-Frpwfgh0opY(_qCBG&q zIm`Ze;r{!RdzJL+Kmz!s@53E*^LiZQ2YnG_P8|P29{1THj)#3xD*Gm}I z8b`vU@Av0?YstN0KR8JGbe~#3UwcBX*W91;`^Y!0^?-w%xA_o_|5d72qxggUMm}}7!4A5A$_AKZD{`+$QF2{Z# zxA#!a7kmKR!FqO>^=n+n^=9J#uVjDOK4|?1^!Vh@zrnzf%V{|}I%(w%?Q$huN`qeV zM`svB%C~B-+V7z*fZrDV%rD^A%@rK)SnHvW?gO{p4p?FumFLQ3{6|WO^arp%L;PMF zk^H|8>sb`8Grz&Vqh`E&BOaUBEEOKV2>6)=wB>X!t}}Uz1r4io%EUISHk+r>u+Dh{BXZsLi#&p>!YWH z#=}z5N7(m)%rfbZNPZ6sq#O9P8#&(*&-LF7lv# zztcYJ=Tb+Xxjf*|gZ|y0D{|}+^dHD9arC|3gLJ*4->82y2XjOZEc?fKC#KiW`dbbC z53P0E=TWna_mdoZxzy?ZBE;5aI79(I+cVR9iNg<+!@hZZmHb?ew6BC*8S@kLKe`Xj z(&7)(U+3_H>95V}KkT3BW&1~c_`4vvTc(fwODIJ69eNf!{9yjM8o-2kC$r>WQ@H2=F5Ni4E&5Z{A7A#4*$^JflRg2f29wkPs~527cZPq z2I-&D2XZKmepWgCW%?@f`j;o~*Tlbar@t)EV1a(d^7_TmP7UUO9bY{w#((=NHghRN=Ho_I8m& zFWa|YI$ZY8QOF}S-+?Fk%9y_S%D$n0Jb!c3ldpfe3>cUm(ys~^JTAoB%I6f& z<8k`K_JHY=4UaFILont%4?Wb?9o zImRb7TUO&!hq+)G#38SVt)j!6?@EV}zNo<}7WMZcz1&IfDnq*QGP`^ttaX?dFLk9+ zzN{eKu*5EhyjPhe^J%=|Nyp$tfM4L?t1ZL$R55=!$}5Oxsii2t>{7e_1kpF&!8Z}s zL2tcX&h#yJ(hn28OE0m@W%~;nk-_cvSEGChMb`3S{Pnd~lHwuI!wh4328#i|z@g_{ z81$CRCk8RT)N+(x?6g-;?O(ds#)t7FzkgSw{E|g>x$K{t9_pV%5BIOG#)en@AqG1A zsWt5K=Oz6l5EuP!dYFDZ0FG57SQ!bJ6e6!}M3%(^ZO3i2uY;hyI23behhmLBAaO zB>fIOOn;^H_rM|bqhtXw)S~R4x)S*A-(9i8hF&d~#*#j|{+wUiS 
z6;rq+FK(ZY6S>mP_LJfl)6eaVEJyz<=2QP2{>*pUllE0f43qpJeY?nMkLAJV6J&dm zKb0g<$sh7BSkJXCrVrzvb=8-FKa~aLc)8rV9%TM710{dRzG(wSUEDq&x0D%@Kl6#f z4u9nOECaAUl*CA2vj5f8e`dJT{}@eYClfbO`0nfkx z1R2yQ%B}gYGhcAcZ{>WUjrif3FSHT=ocSwCao&vafu5sgDF{>aa=IkU{X6q=Sk7nb zBztK5eh}*}`zzWgzG3{>k3szi_m}n;L0@!#WgEpqQ<1xAK9BNPgxcr$7v%>swN89? z7W0x^3x@-|-hlR_`>PoLii&Uu^(%P(WX0o*e@+|eztx|QnI$Zr5;Y&I%PnXj1?KtA zDVh(@?UyiiLiRUA==!%Qt zL(B)G`#YTV6X@rCIX@P)(B7-}^7@l6N70jl2Ypz7sj%aDwvRb{zL?wRw4c`l){?!j z{$B2^$8i0*j{ZY_z*k@|oo&2-PxE`6uZiw&SM=S@`svF_aajF{?qAnN9Ao`I7p{oz zU#Z$-dx3tyn_Bk5^p){}Qj1@8xizdGok~Bz?+S9r4!-b;Rg{<_>MhHqE;kND7u z-|DY{zpkZlLteY?MY_fvA0d9OueXY2{L$*f+gK0lFb%^lUqtas+)oA;C!HRt4#NNQ z*V!s4jrAOi4^Di6@!OY^=BW6C@eVuW)`j(1tQYx-K)D`7^99vB>BCpIqI^Mw%6XOS zUBxI@-I4l&@uk7vit+|(fAR?6YyH%rT1q2chX3fqleGVbbil`TVgS>BE*yltt+C5L zGBSM%l#|LSdiMcVE1dXEnOv&u;m{qO1+>=F2*IgY=T{-T1$Z}t}$ zKaZMo=h*%a`T_Z{0Hu5`3U-*0l@>a2PO!ZY0jz(pC-P^GzJ#GaD;@uZ^L+Q^%EC+^ z`Co2tURls;U->UP9^rJQ<4-v~cbY$?^<&mg_NN`H{k!2`x5B@!YpX;#>+4fA9$c>I zf&T=5^7Q5GaxdhA1#=~zbEH2mdw-DDSEYZH`bPHSviFB|q+R+wO}}V;!R4mu_Q~nC@ zUPoWg21!5W*yF=jq`xA4XMJS-D|hT!%71qMGa+EVF8^{?2md?Se@K5)jr2Mm53Tsl zk_V4JUtJFRVx4GS+7sLF<&OQ=hqVKl4#%G1f9i7U*&kTr9oxHQkC&0WTBUt+yhG{s z^^QG}y}0bT_8Q3NJofnFA=G!-NG>7^S z7|Uy#{gS@8{E-?j9e+gj8*}U%`hL_bDbKg>(pbKIpT|FqNc$&(+5RVwz#m0)B3Syv zF!pB}{8Y~N-%I>k>G+3w>aWW`^%MQItbcMmllny!l)poHgKqniNuEzbL6v>H>uTsr zhZE1ABL8x^JAOxehvj|ZpNj9uen{Whe*U!>{E+L%%wP5=MJ2YsfxpWeFCHv|{e%kq zPjP|&S?|oh*uSi5BYZp`Vt;gnGauvlZFL*XpLo8*{-=}mb-f+G`yKr}Oa4a2Z>#Kb zJ#YMk|LFuemwmwAtTgnA`~&fm<;VV`-LdyYVf4>mp6?$Pcu5}YfBK1E0iM7+`x#!B zz0mmVbLrmRA5+D! 
z_lPt8P0;>Vo;~M}hgc8F^Vht85p~Afvs5nQ2j)M2d6j*Pm;dzie+QDrfQ}{z0O@f#VytU$*y(2oWI1qpcLLcxiot>yK0yL0(=e zSN4zk3pl<};}y{tA$X>b@ip)Q8so=$at+2K6>-z~E{tE59RD%@V1G3jf9g=5FRX7+ z{GlVk{l|Qu#+*C%5>-s|uNu>c8`$jSEKb)X5O2HE^X6U3h4{ne3m2*xAEyn|!~4}j zbP76OY?%6-ISh|9)~jmF1qWpKb)?vruBWkIK8N8k*j^f4Df!pg}=sJ z+`QOTUSrOmj~UpQRUhvc%%gaQ;el_l=`{?bh^N%vV$)Yc^9kY!r46$jr;6z6Wcq8& z%1T7|W0WK5tEyU(m%iwtrLHvk&-?9Gd+7hgwBPQeE2`xJbtmJ;`O`({pFF?H^aK{8 zKRkp=`eO6vBTZwHOh+m!@#xSWt*S!WqJObje$hop)0~3#kLY|3FYwCqIeOJzE|K2{ zK)h6yYilE1Q{=RsqVp+psOR$h5cDVS{0`D|o(MA?`F<7XjY7ZLke;IVGm*Y0Bk*-g zT;#ORdeFZ5nmDx&|NO-azmAGRM70kKf|mz62!6uD^&420q>( z#5_8$MC)IBklq1)tM`5AeY(>$;Lv%9pE5sq{y#$J(Gc$zzW)M#RS;<->wiP%k-YRi z%Ond6^+7M*Pp-gmee1lUm*7dhBfYQ4?{DDyBoxo9^Y~QG@hiVi1p2DJ?UnBjkos_(9*@dwo*iZ%6qArLkUT;F;2hC+^-Gs9=HH<=Bthp~`nrd9;GH-!8}T#s^gS z!&F{gAx@|?8*gTuV95Dc<$e26=H*ee~{qI%hie-*29hfzM(?gt;#We5xtd2^7V9ByL)eAg?I`n zrjPm)2#`JV1$&|P)e4OAFXiRZ{{Vu^eLh1%BVEn)7uji9zt&1WwH)Q;<(F7#RlnX& zZ{_;%kMdIO+x{5yd)Q+I&6k;{XEnxW`*-w*$~i*X2$%_t;iF>;UZu-Vs5IV}EPP+L z&_2i>9%LMy zX3^hWs+{`+{e4)aXVc$v3P1M``dh2uXVc%5DxXb%A5!JqpZ^5oAsOY2l{KMG|Lo z%v}!seZWd9{XK29NBE(?6aljRP#XH%?M_2~kE--+`g_X4r}Q^y!Ox<<8&x^?2l{)D zO3$XhPbvJ|Kj<$-yhQJ8`ny4u&!)e3tMdOW`b+w9KK&Kwk6V8`^V8*|5C2j9{qhX@ zYwMHJ-xG?T^-6!wsI=vODLN&2vAm(bhDvie^tV%`S-#NU9qw}I@0gWV`g_c3PwDTZ zI}QDfD|q(D(BBc2o=tzTt|;|i>F{+?6m+4Q$o z;hRl=Q||u!C-A?dFXz)=(igY>l76_-<)ja@=KOJXc|L>nH{|gAhvO-{aGjTK3mSRpIYm|)<69y9 zckiX+fPg1{u>Elk6 zeAr4;IbOH@o;!`tZy1Q!?fUrsOG>5LzVQ7Y>*Dwv^ggW0*`DP0pBUunt-ufZUsUC6 zKc#%WM3x^xIrr4`zcf3YeAahZ}cy#%Gn>3253KpFU&_V|CthxxXa7bFYN$= z=Nst%t5!Mrhfv6_e~RMoqN0#ONAtxRv$&W<$(Kx*@N|(fWZM2MI(lEQ2Jg!_`{C)>#KtJjIQBaHZOVa;0{`+auzke2#wAA4S`POFT z`z6-Q{`h{U`hFGO-$(!B_?{RP?g8mfwjy0Z-`AAsXO4(s>wPkM9}Vpx z%^xfhUmp?dAK*XaCH+SP?{BmGD-G^1egBE}1NeJS=zlEd305P?=a10-o%p_%eIZTX zZ=f9h9txMkcC3rbL;OB{P`;lNsoewpgn~`^?`3&b;roare^pNJmleTV3Vy!|_3?cT z`Uy9?{n~eXt?y;=`HcDaJ|dzEc>%um-SYi?$cxMIeF3cms>27$UX0TB4sftar4bM7 z*7;#-1oEWyBEBe%^`<#kw#>8{-d!?7=d>;G#HSjBn?@3ULQ$g$dc3$zjskM^d 
zq~AH6@%hyE3Zk%QQqZaA0Z;rk^}P2_@%;^c|BD9=_`6GULC-su?)dNgemUSHrtanT z)c1U%E_MiI=7Yh z;UWECen9^B{5M4($D`qC^dGoOx(Dn1L*fUK$KM;n`flF$jlqw?^{%_eJyYCvCH|s6 zwEktVL1TOYJmDjM%j>;Q$ux~Gyxxm)<4(Uvj{lwf{RrP(ep;@!zc&}X)59Nhk^Tw% zxhKH>g5URqK3M$X?`e|#jLUez`y5_tM?85|l@Mcuk0xP=m$4s8-IM#F4^IES^4%0~ z(s)7rd8iEgCl>!ZC4H7XB@DKws41zp>4X2J?(_H3h#q_oI)GnJVtQWd`U~naz2Fb8 zA6n_xO22^$Zz%mcW~G12N+u3}EdHu+PuX9d{y<-uzwvzfWW>-poxiCv=)0T0%OP(! ze-R%te~(J~z^~tTd|r3-idFZ2sQ0@kzTfi8XRp8X)g^N_Tr+gnuOF;S8P5OxMqR%h z1Y7<6kKIW(D84DYI#tFK1?{jYxq!)g%~W4gZtX1;x45pt`n2yxLqp-!trqb=EUklK zE(MT^hFP7c&YhsI5GUnNs6~EkdF5S&uuEj=NFERprx(ebhg1a%jUvYe-zJ}?)z8+- z0&DMq>eF*!{eP19r`12I>JLgZ%wyW;EUdp)D(kfRud4bfSAAkhVf~fTnNO>~gW@?{ zcgq{M&vA)LTemBvK&I6{r|LiCs&C-Au>GIRQvaSyWW^ow#_iY4Q2%e`U^A`#2~{6M z4`0N85@})k%~HRo)!#tMhHIC+ar=w$Tv-1zs1H@2R{xl)e_GYIZkGO9_b0%Ad%U!O za38CeiB6SA^b`NBdJ&bs43%f9e?rxdyXq7F3hUGQMq!_*%H##=lRKyD2>L%$`^FMk zepKF=f7DK4`;Q=xOrX#Q98>joNc&(cGuh8)!I+uqN0xFG^(|rfJd^$4bG0+pKdLtE za<2AgvY!t?elxWnY?N&p^2UE>vY(&KQoml+zsJ@7O!XfCLuYFL*mBwM`(5>kmO@z% zL3lIOUzL#cY3t3pYVc54|KDb*|EVTf%gw);>@ThSccJ`iTc+2a$^NJrB2kXYvJU^H zLld~h_^%wJpweC|t+K^)c$lgFn5xf;3mCFLsASO??(Xa3V91R1RU9%#6>-t+GR*qN zDB-%IO8(R6UoC;gl%Y|5sPuv{S6zC%Sjk_$wC)s+ZI{p1@3~WbsPvU%4hTOzYu7I* zc%Mvx1&9+<=ukO@IF!C1Am{`b$-PH9NKIQ(NxKMCF<4H>mnts45t^ z7uwI&=>PfZkEyzJTa15&^@;Bk0>u9Ue`dHIR(aN)+M&@OL^|A`&}|+-QD$A z)>i-AmGG?GruYX{!KtTd8&Mu#FDop?y%UcbW|LRTEdCocTnRk>=5zV!@*++A^4i8d zX)`l0JY=kBOcXU{d(55btdZHBT+v$8*po_`=}ab>8Q7OL+FFW=iW<|oY}y<$8v93w z`i%O4{l6|AS%^;@jOGPc!{#p+Bwkr;Yxspg-;OXJwl(wl*gd@f9K#&gmsnMYWl%d1GSbO~z{D zw&oQLMj{?>Fq)ehj8gngIjqPlv~~E!=PSjQ1|xp^3wo@j(Gy9Y)t@!9$$_Tfk%46Y zjZH0gh)G{tD{8m2G#G6_(*`Vw*5;U)46bZ2R<<`7O$j_CRswTVQ=-9W4T(ul3#v55 zk>8Fg&A2zs6Gls-Q5*vX)LDUli6g>DG~>m*mJOn_t?Bm5Z}$GOY#=kxJD4^a!7Otx z7}^R+rp@mD?p@oG^`+j+N84H!h_Zw?T#MOkVmseo4mLq7w-q&->77R7h?&MTX=GK~ zn#*J2L;a9%UH_`41U?wl8xvQWtD07Tc28T2QD>%BC7RbVvq!7C z;f#1m>uO2$WP5H*G_~IZfWg!%pY}1a4^_~g8zY+c7h>5>moF35t;=9RKk=5_AgbH& z_<<&X?;hUMIQj$evIcI{^+5qrjk(K2St 
z*NEp*D^PKCu<_P`RJt+KEsWI4D)AYxvE}k2@Sri%q=_os*CwJX|A)~3+~>JfTL=6% zUMfO=F5U^|*Jb}qMB3gW#0T}@Wn%e?hMeZ{926RIn^%g+3ZK~41}d_TY1#oX8M(2} zY`zJ#`V*HI_4b&JnY}QJH~`Ywk;a|bbs`?B8)?uY=BVc-9RzpQjUY`Z`_2YMzDXV&gPxG&92iLzPsyqH&^J*cjI<){_{QP^Z z$xO8^(2ZRKYqU7zw@;il!Gjfv*7l|q?d{DYdb#;tU2D^?g?cq7<_dj_uD9u8p3r|K zbet&rIezk7%wPFwt>pcBH1TnK6QkV(p5dWMWnw;YwAzLzktoAp?*!=rZ7>p(dgILI?w5)#hYzEV2E1-AJ`8_ZVj4ETRE@QA-8J7HZ;?;;kmM4>@yM zkW7e@-+Cg6S3Ti{nkUlyjIXp+7r*h9q3ATV423%_!jOr6E#%_$y(ns$;zC=n5&*vp z=-lUEw-#;MrK8gT)O(gTwY084Q<0`NpID_YKvUYNCU*IvP1V|*)@n`sOc!$lp^JRt zs1{zMdm}B&eWivb?)2cV_Po{Ml}JRI{YVBPTXM7*O2!^7o8w-;>*Y5(qTA2Ek_4`Vdx?(1$#CU^A=_HRpE z9|~V0Jl@^GlGUMbqbMr9nQy_-&{%Jx4a%BXwI-y$H>7{fi*G^dcLdk0YF?QQMVhi9 zyf&#_RpuQJijH9TE>RMTZ}5gz2=UQCcm;l<>TTFd+UKQ5@j*|xCFIeT>9N$GdBfZA zBUCN=@>J6r;nP;?^{K?K@Gg`;{221_LkY)=gBXb4uX{F!{J-;Q_X%yesBHe2uDwr` zenW>75Lbze3DfQwN-ZrC@4#o`+e%A1SGF|A7X*ya-m3%Nmjhap@Vw2dXZ>Qp3s#9| zL~v11|9T+2IOy{{>#6JDF`1rD>M54KQWm{YPy~2OMCoUEG zh|pgs)}HWchA6rR_Q6(Fv`J`w-E%Sg)q90+9dxZ!>u!RlZR^l|FKMwRpQil+xhn$R zYfECdiC=rQ$2FWd)yj0Sx=f4|`wo;uoADAeJ}tbdL`!Spon@YNdVSN%R!zJm?8O&I zn_9I?dQ$%bZ5+j~;G8L|X^#I)ENOKBJkiuYYD-cpO@B?I?XTea z;zRz%OyWyIUtd!4C87OBh|fjzw-gh`CjeTK=F?w+I61N zTRf4rCq154p??TJrck2)qbA@hR(eHg%wu>W{g0LkY@+qQPn2iQdIA@Fv@vfqg@uAW z;lOUMSRsHztcg~y*e-PK{QxpO+Ov`PP*iFtI(d4k2Pk%%}f1zt*UUPiPuG ze0>r5QSIkqJ*=zke|bI2%Hco6y?Rv~Ltv+0gGcZ5*JK;_d%!4Bu0>b2c*VPer5M4N z?kv_m=J)3C8nN(vzRdqzQ1pmJW&*E6-RkpQ@7HhgX)QwkjJJ5dC|`<09>;`lc}PDQ z6hA15wjS}7v_PGjK8sf=f+j+o#{RaJPifKqcSn}>x6BjTUuxoDz>85}4?Traa=`eLW2Z1+Ux1oSsjTor^o=r*ya!3c>2V*T|V4J<=Um57S65_&0g>F2$j z2pDg_9kTG2hxeBTKc0y9rh9hx51YG9xD~OgyMN&ObHi~_>UouSx6?%s_UKi!&oM?|T&Ll3`DhNZkncpY5GSipa$(E0=617iN-)|)naOA)_Z<1Jq~=GCy_TP^}8 z0^X|9;4MA?>v-wmQ(g@#Za*y6E)M7$gy<466Ens=-jd3)SmIEnf6OO7>(}sk$z{Rv z!2c`}NueW<{%MK-x24*KfEXwXKQtVTqt2)OKkU7CbX3(IKl zr3)yAkYpf|gph=yq9Q0*Kv6-kqbOp%*u7S4=(SwAR&4ij?OjwOklc6Q zAMdTV-nwV4b3S{2zu(<^JxLC|HyS!rq&2TOlnIIa8lq*YtEiYLJ%NTY`XYtR!a+Ln zE?bF>Pp7H_O2S`SSdo@jyBs~HFL10RmA=L%p7d~_GVqnfgLf)aLqk?Rkhwx*h`AZx 
zEb#$M5?#G3i2uTzp-jg@^pRIJG;Ft(SJe!zx*?sXd5^H#n!(k}>IXOYiW_~!OMS)9 zsOLt|rk~lTIW(EmM_Il;&BgDd15rb+;@|PU&B&2G1H|I9##xyUWd~O-Q^uke8r8gP zRSV{+W@V#-cV`QO=a8%9C5Nj-VDuea5^N!Q-^QPpk?*xuwEv7`M%QLo|HLn8h@D`= zK<^rMK(~DTP{vGAG&wK~}Xrknd zvkEvpfwoJg0?v|{p2V}6li@dg$Gy4D;k#IV*ncBBhFh%=EO3PnGiUfx=53CBoXt~n z5R&kZlAw=jE-k|Pz5W}^bcPpmS7a5c#^YaqBwYF%!ykpR^f%G4{Dv?xeG5;f-<72L zyDWT+2Y2+_wr4^ivO(yH49iTJS9s)^a!zR>&v%t#%R)jf+%4VDoTbm9Uk!!Jc(?Q3 zrQt`|5Pwl2A1DM_PM)ae&n5PCu28Hex9=G@8ou)>-&gdgoNxx|L7wRE=WVFjSzg zCpvvz4Q}(BS(5LcirfFNTOT18xP3v3+m~-~`Z5hJR*D(g)Ni=Qmz|R2e^^d9lXMr) zqH1{SQts~N9gSf{D3$AjSibYjw}`)n5>HwB6Qy1ru~=%J0|`e4?Z0Nobi6bsccnu zCBi=qaJv!jvP75f!Gzpq-$UZ_u=xBnA=K;(e`WY-0E$BpTO}jG1>&<*d`=gijpDOS ze0JfF3NC4fYNRK;GKr9+xGkRR2jRC(E`DQQXJA!2d)cMj*q5E8y28^F)fG79CeUe8p0AVPMJpz^l-115-}%O&mDtMOa$2HA6+Q&{teiQsN))up}qZ!z7iE$A1NG)c06^>ysUx z>*2R2*{9fep9Bfbnv$>m%fYIMEl;-}aOJ7TTjg4QG%g?a!mAQO^l?QFkz;-K5%5?jjHro!>p61an_E`A}8~y2}kGu?j zj@mD~0xcSU#o`Js*POu*Eq2uJN0Ajg&fwiHJsm%DXd~ODWJ&k1t5sLzD!e=JZpV8i z-n|}I}i!(G}r2<`XmFPsQsCH|8m8GhM#LsjlXQCF~cBZ86fy%xkVWRVnp_ z!>Gp_74&^E*YjR3#+lOf#LibxJJ|+me1{_EacA)CfHFvP`WB?=g<65jmyv>Y=DTE; z9v0<)1tyVtrJzvfh>ur%a#QFQ#T~rM-rpBqn`*}u5VtA%CKa{iWrmS28Rl&NCKomP ziykq)cgQ}dFCw=;Io0LgkVI=Ro?V@+t|2{7<+iiPlCQDng;}aatvbvTdvhbcH&K&C z33S+w4oGh{6xqqDe~SniuFr za-N+($++Dv^e+Fe9=eBlfvHsdl1*Ce;NPaW{aI;F|C@;-O+HaP{+)^Z3Mt?3 zZ?GrMRv?zePPDrtxv;tOn-{_CHqT9hM}+rjDCz5XZ@ARA&$4Epg}uXB*pjeU<0})C zfW>;Vs=vh2`I%O?e<|KMm{`iSiZJ!IBTSiAm*1xboBe_35^_UD;Y9UjmA&HC-;>gL z4#dlJxBrOd^8X=2_(jd_-+{M5^ZJ9kGueAmW(X-ho@cQVn>XUiwj;{fPj>!Ivhbu1 zbVs5u{x}v(YOlpe>Pa4U9n=4A$+BK)Wp~TUF1udH`|=9i-m zQeunB;c0oDD<||1Z4m#PgMSChkG*hE*WJEs9due1kC_{IZ*~u(hb7^Xwn;%kr1Qm2 z;RXU!=XCprrkk2c1ww(SDBk6E2OdsSpOPS*#WGXe5%FrN;)OfcrSPa)=xII^QI8fZ|m#wM`Q#hf+;lrXF57W2n{nxk9mYsUW-WVtt^Xuoan}Mfh#gB zkvzlc8e}6Gl<13$Rhp39A;IBd%qhgqG7p12Yjl3TN6xk6gr+C;artK?!`*X*<8wX2 z2jbHzKI_EiTshS2FUhs=l_}~%Nzb+Pq9G}?p6Enj;>9$49u=RX>G;%1_#|b(L^#)? zZ%G)$l5}?@Nk`@{C~!wUgg?c5z#-mX#9!2aQ7(FN6SxsA?@O#s(qFY?*>WxHOHD3! 
zh?I7RKaki438^s3<@7)0q+8O+&|ZW@IisMDzZ5|UzNQV}6P@nJL?@ODPVMWCWEqac z97E0TOG-g8$VVw2$WAbv(KagxP&)tAt@AksheCB8Bs5 zT|9;VG<+#ic)3UTtq}dr$8+3)UT|lmaOc?PTL&Dw*qI!7HYY1|X{!D!3fX=*x8I3L z#9vbQR=6bIb)cTvD)OS~i&#&|bOmxzGY39%nF<-f-i2WAawP{o@#X{< zB@g8Pbh#t{bRlKaGDOObwYnpR;?T z8*RC+RH&Zd{9bYuW`MJldu7^|Ku?c?Ve)V#p9+fuXA`fGq$-k8=f65x{e#%$PUUfR zKrr+N@p7CmLC=ZXAoV_eRq16GN<5orwH9^8S-hZ^1$t+)62#1MrgjG72N^p@rgOLz z9;kL0j+CBFNJNefbG3^?WQEka1o;k&CwOg+w1BZsbtP4GhTbSu&tU8iU6?8H!^wx3 z=`mZ5@K~u+$-{q?19MZM-JxpbKPFXB(Y`cyhu z5kV+sY-1{YEZN|J?Tpw>hz+En7#2nhymtWJ*6`0T5t+VDNygHJ` zOZwE_@W&_>K?8FwuOcOsA(ivjk8()vGU+es z87-~LnrjEw{AHL;Nw7RWjn19J4yDqM8T@x9og+({EL|o^8IrV%OM^JSwtyD&XKa)_ zf3hpER+V2JtNbw271*N6y@)+GMZ?r4?l1XEo;&yeSL9KyKwT35iRtPn?hYg-vVK_j z-K8iOjdBMjr($tW{wfbM*L*!wyd-@$j{ke0EAXA7T#l^(A={giS+UArrQW_!i`5xg zSZ2R%ywp!@GncYeY1>@(6}fB9!*twPD$6TWz8y*D$-EIWdlwCopPS_kRIVFkwMv+8 zln!&t6nHwD_abZK6ha>QjxEeJ$ti}Ir*t^hdrwY0g{ZtYJB5bⅈLtVZ+07i53u_G~b3r)s!SYf}Ev4 zkz)bTv(xDyR(1WS%9cln)i@!Ey(;<$xcSah3<~X65xvPc+_(y>{=zHMrVcM27=Dwu zpRs8k)I5#?#{;DJSw`>y;>!)Upm=_&)y68i2k)}w)z9{EiU1c5!5Sq2C+L` zYy>3?Bf6DyZ0{RJ`e%&TW;Yv#W%~=6wu)FGV{mym<%FS;3)ui)U?Jy!cMDq_lv4z6 z%7fVef6-FLONHNIcaDDq5uNX150br(u`69s%U!`ICR*ySf<|Vw#vzc!TH;)6`xO(gIlf(tQKp;Drhx-}pGkbM;V5D@W2^(;^CjW4U(|#%xHHf< zpD!T$7ZNWueMW0kjL@(+juo&-AgHkCWZPJc{@~IuMcwA3X$q!3oQ3yeU#p`0)lZM0 zUs5tj6|2~xY-NbAsFy+2127GhnM7?%D_PQcFr;kZ!REx6L!NCyV)<~3jR$- zc92I)uHe{tmd7NvNXSIp!SaHeEg&pN|3d6C!|Vvs@50pV3_Fr=U_dO*eQRSz^y@Z> z(OMU>SV9DGq`?-EC?yZUhb?keaJDncU6?fw*7D3?qJ`(hY`9rrSb2iQF8}iuOCN06FgTM* z&%}|O%Q1CKNFOe*LxqQL2wgC1%mzyu@ZW@#QI_oo0#}7PHU{}5y}euEU}d z?;Iqf&q&7N9Lcg&=7UW)_0Lf_N94gtSMXMgHbN8=GBTR}tt9iG2O|l>B7vu4(E|cd zPG|guL}#GFo%mV=f1H!{$T{IZVE;{tB5^}V+!eXPhY0IMT1GVbE1245kFbQ zKTGsMqKJAZPIl{$Ak0P{ahASir4ri!Y?-prD3Zc9_K==bzk9JX+$MH{ts6E5DPFQ? 
zF&c}Jb~>xr@}ON5&RGtA2eHqjM2oCUcVv}ZuDMGSL?FzvEw9sEB`+k1%=ID5D!CWz z4;f#TH6&1)!?9Jx0dKp)p9=wcbUMyZW`(zD>~13r$e`!p&riX54!(e}4Vu<(0hIWkISeG_ zBIz>2GbJ~y7wH7XDd=(pk51$x$R3b#gT8RGnShIBI6TjhU5t#dJjIm$ia{D%RL-CQ zCBam?J6IO82Sr@Tg*OW@V|ZfBRa^*=9Apkx7+K&B#-`*5FZLrN3(WDdhmg7VSuDt5 zrzav^=><#}1_Qcok(s;XDS1fHk0=nSj8agt&&&!UuhfvfA5FJWiuH36v2p8&h>OfH z%5m^9ql!X29yH3aDCpy0S2|BPtqZ+HZ;-{x@4)}`JNxLaN2w!JH-5B+Ps z~GyqK)-rtlkV3XSa?5XkP-#}nq*EkZNS22GkP;52mgQC)Db zCQP0!=~*nr75v@cE%O+Xp>c6Rwt&KEtVujihn3E$Ie$CMITPntJd1%9OXfL$gUYzcEaRz+cD6jm7E4i2xj-gBDt%_42@X}Q6Z zo0IsY(^%_ReeY>{ivPfJ{ln#IYRS3PY`s=I+!y-%G-sqvY^`~vnyo=Ew3@J!ZBqii z?5oAL`-sgQN52hbzb)qG_L>c{^t9dBpLjBxujo#m#P?{{a$>rUT?WaNFhPAXUs{K; zrlPMSk2E9I2F(vDm>&9SLISoRY>P@ZI`vm4NIbhJT!XH$oxPu?Bo(M~KD$R|*g&&d zQTLw6E|A&hlRf?{g}3!)>t%+`%lYValaztR0UBMjSstghdsJ5T`|HT)liyM$5Sta@qs>*;voN&8LIySYi^IV{D0p{< zii>ORNTfP!du+ZfQGGLkdDEn?5>$*OzDz*R*LD^0$qASQ!d#|`K7LUepMn+UNBgVT zbFx#|MT0B~7$^2K_E;Ruu-s1UcakQN^+E^BNaG)o_8aQ+-;8yJNy3vX!6mOHn%Q4@ z-9eL)HVk`+l)}CYwZpk+DSu${seReoscbWM`U{wniV=;4>0BCN;k%hD{3EeteVolr ziPjej===WcG~#oI`F*0(qf)1={63F>#T2Vvw&Hj(s_CAi?!LlIdj z9jyI0gjXMW^+eXVy*2(_F8ksJ_5>o`%?1pn8Q58N z>$yCGPA;k$Q{)@O>}5n-u&paBTZD>>U<_`xoyz#I6e>OURfNSi+BbGwNjO+u$tJX{{}a1S*456S#4B5&TEkO-7xUw&~IL)K@-Ze}z!Xxgmk zETc1Wae2uYe;_y}gnr3^K{Dz_5a}+}yQl@T7YMh92>;H!!IM*e>T;?HmW&w}EO}hw zPb9f~cP3D8rd@zrq!e0CY)i8IcTZ8U(sxM`yT#?jfd`UStVKt6bWx08g5f~%!6D8S7fN)?H1>{_f&I23 zr4pNHVHq~bt4P>n{BMi+;e1~$A#EdH%j7GWv*<31G>{!t14C^%LQu81p~{=rxNJpZ z^RoK-ypa;_uXdKcZbiR{=FP~fs&So<;_Iqu<(L4M_Alj22cZh#ZxG!=V%lVtLwR92 zA@^*>SZK@$XU!@L9UNx$dG2urK^;3Bs@$p># z9;?G2EVW$Sm)bach*@vK_Sn_P)v^|$gV^GwNIM+-8--O4WVs~WWF?6))Knj5p}_Iq zRMxb?@~U0EWJXSr7gvM^0t5Yn(`Yi&Heu_dEMJV>Q-0-1PN!tkL`|t|ph?*xTO#gN zNt>^*fXoWWR?ldYSiYq&vA1S8>u(y&{z~j!a{3YZfMDfS>2%g87UJrQSlm}xny3#8gGwznVAt}MREkYJZ5lsV zE4MD9xpswH+0!IW>h4yt6{TomZ|lcs!?W4PnC>b3j3{8^i5Nvi&;fQ}k@`cDaK{i7 zJ5!aB#I-)I;P~}o_U$zVGg`DzRqHjrriQ;l*6Y)F`-uuJlP}_2xeVK@vaz7!@`p%! 
zme_dChT|}~q-79mV=Ncj!HHcfi4FV(5-JjBQELKasLJXLY&WKhdhvfUPw3QCWa)1+ z0*%XB^L6g4#m0HRE7(}O_*7yR4;$8G3vdL{6`Dh3Ew%Mc=ZOOZ%}uo~--(Q~7hqW&POqlI<(WGDG#0_Ep>^(^cswCWXm1N8vwWL^szL ztSp^>0Y(ba&8UDL=}%LbJ2*gt@A=y}R8bOK6sdcADw|cJw9KJ@_9y&;&Mxr{8a-kq zr;QD@(^D!AUhspGgqxub6tZlZQ&bZeUGbLHIrf`@@_16-wy|bh`%lqL3UXea&NeY;1f1y>KAG5@0nVnIC2KZEg-X57br&tck5&|3xY*U4mPn|t7okJF z!Qv{;LO&tP-4xEGi!7-_zhtz$7gaKM@s0$gHIdzEXJ4HoKdC9$s6Ig!ZE`D`8Iq78 zUb4{hI1Faxt2LU-sh6e=#dX)%ZAhMyst~5($falv^Q>o<2m_c;Ta3Q$Pj)5ChJA?3 zQ4^L1#|A@IF}YM)D)sgSzRQv#sO1tbkBlxKU0hT$ZtTTIWxtg*MFJ&bMwgEXSgmVu zECUrrGIs5;jj(W9e|93v3Y^!Uha^WX^9F;r(z$`L{_^tj(W4V+;h3UuI5e(aayD(V zT9GMVktB3CTPQu~=Tv8%4u6cRV8j!UM&v9nB>hCfd;phbi!6^#&I7Dy7gpzJF0i5MQ{| z?qOfnp}BV!4@gcxRvjo~X+ZuH8GZK%c9^8$cJ{Ch8@K5XC2_djTkId7ha-g2PK#7R ztV+WG%XYx(EVgmfwsJaQc7f(fLOnonH6C0~mvMSmLKQBhN$Ov$>=(Z}%FYMa=pAN# zn#onk(n(f+3ldzWN11*Q(>TE+Hc#L-#&09q)`#BF6qk*cEk*b#4^@dp`@lx`%BcF~ zpNONgyOApwQcmz08%|)P(4#fd^@Y+?#8K-iGJU|X73|+Oi-oa!RVtC#Osg#-vw<4D zh7G_t)AEVJw)Up)b?G#XUP;C75cL}@SoA?31v_x1Le94kzbA!nm@AGGXOq$*(Tx%m z6jIoE=UZO2vZI+w8DWVe;Uzm=?3Ta61v+-=Obc?Dz7hv6alqmVLR+Filc}g>b$LSV}&CV?C-S1rtq8nKzb}4>LmIOQdIGa5oSs#|H zLsa?J1dsm@r*sRKhmm!+q@soTMJ@CPF`#(HLRS()7X}wX{=iuALLTJw_f!j5|FFo& zEBV-7Y1gD;MS60Z?6TAIqQ3=_8*>C+oHi(=>k@dSl~tgwTM@8dj$OkXbLa9E80=Xj zeKm$b68ch20mY6!6t0Ooo7s}>IHEv{IgVMX$#&M7B%dYv3C^Z}*m)i6TRij|jc;J+ z^E!QFNo%*!DKg5lZ7 zCHV*TLm9qLvNT}D=Ep?+8ylyBOn?Pz@^DW^7qBjq_hnh(Z*6?8LOyhJ1~BDGMz5#p z-$JZLu^vT!cFSiF`c%|wj5dvUsZIaRCMQnAo=}f8hootg@X}bC>zDfz4js~VxB7#P zzwg2s8v0STZdRp$DExevoZ?@YO#hHA8yHQo>OaE%%VcqnQh(xy8LMaBK+yq3xj>~~ zRX#qOc9VL8%mzDfT*h6JBC~!T>76ubkCisyAeTjYPqKan8!)YMuI&>GJ5y3hDK}X9 zfr^Tfzkw40S@Z!`4)5(x-W1WjE93lmJKYyie^vP7^JpoSYq%X%UY^XqX|aLZMzU%hyr>q3H$CXP@f{p3m8q)m0QQr1(`BRxF=1YyIIRC4s6{dUu;GFEuF-u zw~UjE($sMp&M&^(i|u6cVhbN>rR@?r<-9?mwd2*F^ZYA(#V?|{-)Xa*y;-VcsiiAf z!rpXxJeys?Y&OR7v!yE--8J5d;}@HI$$fG34$bO#u6vk77qd|~GT~u1@hiQlUgi8%a>RY{2B@NiA-%DU>> z%BI%li6gchmB)>%88@ze+_;(LmH4ilRa7}^T0UE~F3%hOaw@_+-#@jr-WYF$P{~@;l(bJ 
zUe+vb>_pFJ&)>iv9b`SBH-EpM-oVe0au_u#%A>udgBbBVk+0uP>zo6RCMYN+E79G* zLf=ap*m9?WUxOZ&l?U@_zREsS_|ta2WR=HXR>QXE@vmiCrOMY`N?)iL^Pu|llonF3 zp{}tIgM5+$=+pnGNewpImBQz3q}{n}SB>REhkCy+K|xclNMrp{DA5_JUZ!52;0ax@ zF%YT@*;4qmm`t`w<;W;`Y>kF9rIaFi?eY_OXb@P~$_7cOBj}e@I|dzn$=ULyN{jQ@ zO@>euGu|$}Od3^S|lxH(uMnag^+% z1m2tM|HiouqHi0qK1eSn>i@=gdlUf@-Nj>ZoZ?csLX*C;K^Hn@yOh-^y}pd=F>p-!X}Q<3=lCb1;^f zu6&1+s$Si}47VICf!&!*(+Ofsnz@qx)Lgbl2v>a;tqAxhNJ-WqIdXvw>zuTXQD2zj zjGe&<*CRY*S1>N##>KANt@)NqcJk91d&njYVyM@ox3=4ca(;I*%Og7|D@hv4nVO0n z?Hnh90Ev8l?6vPvAxZ>+AXuWYSa3}R)tuI7O|3R+L^ONacv zs`^z`r?r|bw3-qzJ63c8G zcgVTJeR-E#Ru09xNXg7(u#+L;_CeYH)JUe|I;yrfEf zHSKIW-EWch&#^<1K^e{8y66S$u(Sz#{%xoE`RE?={Z^5r*ldY@@ikaS)-0Peez7K( zS@|bc**PGi_)EJ7uM@8t^w2UF-)5%)ZX7>$G~0Q&FD3_u%8pdQZ7`11JDWA`+_NM~4 zHNoc_*U;2@`s7s4xf55Sf?C)O!!63;zj7NN(Di%UW%vK=0lDGfVC zArucW3*Xkh2kNz+w)VL=Fm))Ut^JS=eQ{g+1y1mRtHIw}($?N&A=(GL0JtUG)*jYy z`BQ0IyZGta&Y^AXWx&0|AO}1=ysdpFkVe8@^!dG`;12LG@Br{Yd0RWiN3?cyTl+NN zPT&&Y-Z5?MJAnJfwzaw6*tx|C|$G9{%wHKL<_&VzV|i0lmPrz%XzN za2jwYa0zfPa4m2ja0l=Ja4&Gn#J2W>z_pWLj|dmg3)~3|1NQ=_0S^K90iBZ(KR}v- za3lQtfJ=bWrna>|3_Lsy{uAMz2|m(cZ&O=4j#-j(Il>DpYDWAaev4Kjeh|MaSHXSY zw$%tHaKRe5?;^SmSO(m9Zd?08;J9`0Kk)E+m;>*wxDfd3l`sz+b`8t} zYk~WKmjVv~6Rw5%B$xwwf!_klfR-Cz9(e0bFb`C3hI!xt;6C8hx4=9wZ#T>*WAg>j z3)~AV13GVodEk8DT3{1!C-76?KHz{oFb|vz#MwPs2lN7W0Ly@P0~Z2c0j>o;cN@$D zKL+jtzJEK+1Ahl%OFjK~2h0QAcfvf-b{EV8^MGrC<-nc5(A_W(TnIb_d! 
z0Y7>M`4bJgZ9noo@Fn0};4jak`~xSwfN}$T3-~aQ{SEmRI1YFecp)$it=#MbFb{kO zI2SnYb(jbK0Nepw@dnHTuLd3j9swQ&Rvm5TkT|- z2^MV&v*70|Vq}Q2?`vzv05<+~FE3VmqVi;co2aA6;jD0^P15bFw6!!UW7vQ|J|+vJ z?u`Y8QHKh#*dyFJ1#c1P{RMHX!?~VMw>h-Q_A&UAuW)w_-ZJn}1mcgNZ^b*Ii+zHA zCte}n(H%j57H?Pg1^s=z!hH^mJ%WA&Z@+Hn323mqUFaen@sZSsJ64xFng)S21#){K_oxF(C30)AZ#fY@x0LZSF+Fw`#uIgT-4T%CLsO# zBGCUXsG}XuO}yNZwm}-}@UEA~JBoO%&EW+FHjWm-Fw6?~Fr23}xHni(4EL7A-4pRJ z3;cfIe%-NtK6ZDjaqM%4I#$P zm^Q^IITq<~zcSoiW0W6+2|VG~EwB%xPm+h1j23ZrDIe`fyF?o8@NSkzJNj)>#yW~N zsO66EddoxvQF@En9N}_DQMscZWFQMOh|Ni+v&Ow$=fS?Y2v>g*BjX+W7=ObI*9(ZR z1BkB|5MTR@_!8l|U&x~zv#31c_jV(`x{F^S|1spl{3S?Fswq1-$Jb%*+6io?tAaHOF@6ZHpJxO4%x{Is=W%a7si zB_jWCmf-Ftk^eU+6^^3y>Qu)x=_1qyqAEZ^h*t%|_o!3xZh}2~gBYs_0!m58`uNOf zjfJXXGLGxTD&@&Wd_D_v{X&R0Va|Mv`VaIX(ErY(qYK|xAk#7 z~KyJm5+Cvp*`vU zdF@F?GL1JJo!?oF)Q!?lHtNH;8m&}Z{zTDI2oF#Hzi+mqJss6>cdF4=i}vh@C@+iJ z+W$a&$=iDtIoAJ)V*z>5`*1i>eRFRL( zHWDduTpKytu{Tlr!0vcB5$#^$3BYne`I>d5Sb4XlwcIY9&%4Tgcbw_1G0KUM9Ss+Z ze$gNB=ixqPzX`>80b)j!Ls1ScsBLQ(6BzOv(ReY-A_}hP8%FLC0O)}rU0*u zw6$Xz)vPbZB&t#dLl?s6JJf@}Z8649X8zoO-RSDL^^)|2Bt(oy*s~RKp`{pu35wBg zf!vu<5y8J7{Mq1tiFPy2mp+k&gy6pceom9wUyYgPe>E%+{3GB$4*ngT{JGokr{E_g zp#8^O!Q~=NW9FsJJZ66o{6|_*-+>+H^8<1FXMw+LMO*t#o%q+q`B>7Y@4+|v{W1He z8*Ib>TfzSf{286hPmP&;p@bA#rdt^pR%T{9ZP=4;!oNdkKe7} zzj=0B`)Qs0R~z@={ov=U>)C&AfL{cDRVVjPiMxLU{N?N0+DCWFKO^G#Ckb<1S8O!L ziLvyl?GS$Oulo=DS>WFa{#hO4kFSo$e=GR%1#RsocjC{9^S6RO3jB?o`0L{Q`@!E0 zetM_)ONz(e8{pq?5#}m7#osmY_&WlA&LwT_m7U^eems7}kF8I|oYae*+VkDG4=KjX5V^XFFZPXT{WNArsf<3{W$MDVU z5PtBl2EV8HNy41$Rp1}#Xr6x;kDnm;4Oh0c5A4W~9J7BG_^ZLM>d5EO3?SmS75u+~ zk8F4>eI^| z;_e>-e;@cyb>cr}@bf_vKL)MB-1;jW z>sQfFdB&(>g(;&R8V))2ddyV{Lad$O(TpbI=@jrsg5OguzalpV`r${DgRXjRpS!!S86lh_8u;6T*}cU-ghXw7U)etOFqyUs7A#ADh5` zW=~st-%k9zxIeHEOfO=spu2Pv@%=LRORzqjg#YdYKl%Qi z%jL`9pAP=oPVskUJpR50|D6ZX@9Gr3t?}?BU@c|FLv8KrJMqts^9Ozx>g*_K!N5|7YC%o#4w@Q@NuPe|Mb! 
zGWeH*|8pn)_i_H$;MYFhb9qcagHZ$iJ)O+o88<%&{GY&oq!a&vIDZQGmp*~Do=*IW zHb{0%Rd^I!4$1zSkW`1>0C)3H{zrjz~A3Rl#( z31UqSe2ZCstmh|+$@^Wl=uDSg7!!TcAmm|*5URUG%o2z>v2uI0fkIx? z1vQXA_)=SYtsoqm4>&e2H$lvqPY`qFQypc^-Ob7w1OJPG|HZ)nmt$bs31x=&rVTK0Y7~o1y1~RpG@0V*iYJObMF%?e+ji1lgO)@* zF>`lx(Z68o6QW|Yuy>iE!c?E}Op(eBL>It#_Ka9$U~J6@T_fWWO{{qKnS9Z&5|24O zO+U4@FBAG5yi5R@&eAkTR4=;j;yKF81Q1#?;}L5`;xRfAz?gd&l5~1R*VuTPFgkA@?~!~zqGO{_3+j)@Ty zSDLuN#4AnQZQ_F_?lAr~tCKj7W$34uT935P4;P^3PhI;$WUbLdIb%i%HI5@bduy}=`^PTN0 z9$XY0?DO}JGH~-4?m#OpDm4a8(TYqnESW}8%Z!26aSlcYSvVzZiB-D`v2$BI#(=9E z`MHpk4Z6ngUo1+%2>=qK$nG5COuG_noYK1mq8xXx37uwnKmb#B7(8V(OiQwY-|~T{ zWPm7t1aW!&sbI(-3H^CO34`+iil%B82`y7^5EjTX{wxHh>;(~jAA!U{VWm6}AGPQK zd@X(PIcWJFM>I(@2tS+z3TeL!<}c?7rFKxLj|#O@)6q6qeiiC8?Hkx``Aw+3+O0z7 zccJd5Jps!se+YGv_Ozh?Db!&tLD1WTy3869_7D@A3D&O!gBhA>){rp44b5EZt3o5Q z(;%?WDz=j04>e7ICDxaPMq|RaO*qvDO@g6mwTjls;$XFa)z%W>M>h*Xv({Q5#5~wZ zkEulKYl4%+L>s@}IzVXB*am1Wu!?5MlFOO^TdeO3PJgxznjO{)g`)!u%}(nUp&4js z_E@(I%^*Xw*LuCs3}!dL%){b#K(Y)m{I<_3cKca;Y!d{YB^%D_ljTaw^B|^Kt`&jQ z)(E552_V5Ak)8TO^cQ*yvb+55HXKB?tWP zEE2X|ZkV{)Pzn>|SSQ5l4Kb0{QL*}CV)aq63BvtK!~F$9>~wsj%4(rB+{cd@h5M>@ zyYPSz@f?PVB?=>5OBTEKSp_ray9Rv-eN$&+kDy+{q(-itz#a3cAo3Fxmmh>Z=oyn$i z4UVOK29L+2%3zkI;Kxb$l_6}H3LDaY0e=M)l=c8r*O>N+OvlnM5RwN#78ODIb5MwL zrx_2JLRM5Y!qFDtDDxRkzL3dgz!#Y07SXMll@%sljECjECajzW*7vX}dmI!Qy-cAr z!@8(NR_Sn2xoOC-skbQ<>!Y&5JVcgZQ#Sk+wP|FxHceNfHVL1#z|LN=K=y`%Lt%>u zgZ2XTdfE)v5YQ`4x)C<3^mWWPIj@>R!ywe_{SGEB>&e6y9Zk$Iy;uR`51J;d?UZwp zNuL3FwA{P_u2GuSllON7QQaeopm!ajOufxeIuM2G?LwI=OlhS^Z?Z~d;}!uy6d!H8 z$e~tgc#Nn(T$>Ul_KOj3hfP*{lsG6xd;^l!6j5barTiGNMM$PyW)OoW(J1EYkpm6y zo=CaXf}J6L>ur_x#Y}!8h?!C1LX()+Skpk(?5M-um}EABXf-MWt5g&tJ|>LkMkT{0 zv3ETr7vZ&$P~|6FZyU0xwxzYQwV{c6&xcp1;kDsPY20&z6J8`JV;gPjR&Z8;*tJnU9VYp8U^4ZM zZrJQvL6w$*{T>t(VWHG$YDB9gS2Z_7`yqI2x>83>{Me5~Nmc8>Fm!B|a+;|ugG2aB zLX`d&p=qc??P7@iDzsHNaV}h(hQepLS$NCPwn9r-IWir% z7t!KOMR4-TZXXE}{Aw!Efss>C{SLJcvrqKE9UUU@qX3~t8z(vJcEM)QYKd`Kas7bR z)G8A_LTiiJgg9qxqDPoW$fb#%Az&sJK!J1`%tob0-Rqk^(c`rT(}kF>kfg;t;e;mc 
zBbR7H-6e)$PiMm-TxDt7M;kUI9Wx{@VMz|g?GRd&ftdY5o7#a3`_r?%UE4Ca8g3E(jaS#WI52pLaJHqjrf!)(s%|~K2{JKqO>G2flvzUr zN|ZaeUAHW%#NV=)HMK_7O)KhKYN5n4e9?-!`qsj_#*tB{YG5QPgc8GJJq;|c$Xq%^ zlVm3CJZG+w#Fdgz8| zZ-G&!&4qfOAwNaCURv>pg>@70gnM;rt6QUyBYYXF4b$>xc`tm5 z_x4PjeA(Tf9K1{tr5N&mhHAoim&vA-~ zR8!aNGjuIgD{CujVYFq*vgTH=7|)0wlvlP=1Aa!htlH3tL?n^gm5o&mwb;AZDl{}? z$+CvpA<@y@5d47lv>`PuRX$%)Vat-*`ue!0ZZUY(Lt2`v4Hs86*R>kw`KdBeU$wY} z8e4EE-(W3T5vg2M)dGJuR5jz~fEI(=Tw7IRc(JarX+^7%>pJAWVX$Vx_8K1}Sk=unEla5qfx;L6`fO<>{O7#6693U#(SYb`TGm35=%3cXH}YIv zVN{={Bes&)eJjD z6x1P|R(J4PTt@g&q)#;Ha6}}CaHz7fsj3F4CDO7uh?Egh(O8T|){4dzEwwd%k@_Oo zf-PcFr0Vz+W_!o@d|d|I-ol2~6^*qca4AD=b6qvGRn65)MxcTy#6N}$7kdj&^m_}9 zT-4A6MN4Z{_0mG5Vk`3WvgSgRc0{cx^wo&%%0~Rh-q19i9`&q03V|pDqYyH)Smyv& zMjAQ>Nch=E;l^czY|@Og?ntC1Rdq-o5q>lDi{PIXP0_p}suFOGyab0EmeoY_xsh#* z2x@Fuv1k$gC`^_3!#RpUtWb!EGV*^k8Y|7jHxe3&B?2wfm658tdN|)9N3=FoH=Q1r zgG&t}&!JoxkuP!>JQ}Y8q6!lK2o)8W@t`>~9$W$)Qx1reX2q1T2@shud=X>TPR}e; zx6njf7!p(7A1BQeQwQRS*~N@*UX*5jn@v31L|C-RRPH!Nidpo^F_Ni@LxHJ^`JYid zC<9Ga_dz+{;Ov+kF@uawio0VfVyp^NuhI{{ql10i(^ak;t736kYa0F!A`9ap@#sEg zYGzl*W3l#klVPLyFz&bnechD#_LyvJpmv;W%pKiSAS1<;xL(tEkkT=wvDF&HXza&y z#z7NMAsAD}rqH@irl`H>U78A{Z%oV~8yQu_O8N(Jsp^4d2*dG^R>hRD9Kh*VHD?+$XJ63~G6_x-Jth{I7%Qo! 
zHV!fRP?Ivs#EQ5?XT@YwJKe;t#k;fg|Ah`6EDH0o}Z*U zRl=j)%B%5p!_M*++~8K$kz}YPtU|`t={hncM0;$#uQMGN?WLD>BpGVly%=8~?Z}jH zS;pGl-N`3XSvTtg5+)I1>%E=r(J8jx+L`X8g;IG(lA)F`%b4ctj_#(1Q$}|q@1X)z-N~b} zFxPiGS|(u^UelQF<{T^B=CE$d`NRJ($HxC?!-N$!qr+pupF2!{MivkB+R^8AwZu;LWzsA{6*xwg)M#st* z<|2$|xIxdRJG;csVw1kJ3;kS^etQ@C4JQ4zF7(Gu`kpTI_e}b&UFg4>^xa+PshC+1 z@pDTTx|o3%^qafTXPWd~UFgeA`b}Nvmzwk&yU_18>34LYKMA^#Pt1H6Yw&Pupy8jp zy2yV7dUyHca1V4br`etSM@Fh0Uw=xN=j={C0d)KWg!be{^P$~L_dW(ai`I@=WKf_r zoWF?)6njQS^L&-|GEg;9E}L(yW=514e9>Fy?zez*($ z7L)EY*RPXI`6o`hsU+oe>dzozTJG=kWV?T{T26(VOILM_WF0jp5xn7mx@Vn zVWi75(4af0aDSP>S#I)6dXS$D`i-bh4w~gR7N6&t_MG-r)E==3Ks?((cR@b3u!C%4 zJXe|WCHIvXdNH>sp4(0NSoz<6t$NXPv(hvO+rLQpUIT3Ux%H0;T+?iyW1G)&;PBUDwG3j!X ze(2PwJ+biq#k9ZIEm3-GRnPVT|}wWj=*lcVzg zH0AH>LH-}6eA-J<`KL^I+zH-o{QPdp+wYCa4>0AO=m3axA8F?2n@oBx=px@XnfW$W z-cK>)O`#63tOxpf)Bb0kE;FQKBNN>9Y2?G$T6Zkot_NMXJI&0usNRj|b(6lOy3EjD z5+zeII?Uab-=PNGL9-r@$`3N-$C>oKg;Bb3M+bP5da!4iDZlUPsJw9}1mw@|LH=^k zMR_^UrMx_2+VjZ`QG1NFb=dPJ=%PHv_7%k9;eAtn?Tn~=Z2nGMVkXjS@4+bDI7b(a|``4S{ zU1Yd|<4Y!e;fqmsA2#V9gD%p2OPBnXjs~T>^bHzx2hIL7YEKo5S~L}O(cXp4_Abq| zf1@d1aY7yhdH+U3Cj9B`6Q!GE z8iDd*a6T0|3&zHm~iNd5r+a+}^9-$Z{>&ml9W30Zw%Ipv5;9 z6u)+@;2S=0gyEb;R+VX52~P| zS5b+s&i$sqU)6FiW4PXgPQwZCoq$uhy~>x&`tCNy?{QvGea!URT?#*-^X2!njISo- z|G4IVyp&~3jsb|Pv$61u>;DfkyazFL1)pQ&0Dp(VC-(`Q>|%7xCgf8EoW{}7aZC%k zb5{cX6M$1cWVIiRzKSJTY&|zHyvMmh`TUUPzbOHKo8rIuaF#I*ofH!AZ&mz9UKae< zH2*(WxOr>D?1~;q$ma)&zftwT8#SMo0H^tD-XS#emwf1nuD+)A98-LHdxzlP`fsVv z=M?@Pg`aUu;J>QyL4}{M_5TZnk1PDGX9VBO(^-Xo^Tz@&ua^Y#fYZ3tRPGyn=}#5^ zu^$QkX9V4OK;c=f|FXiLWVqghPR}X+p}8zWPdiN>o{x!7?X4-j8TeIz6J4b({!mr? 
z+g_J^?vres-vONJKWWj`LyWITq0{rfEae_jJDbsCzpMFw`VqSHz@u?4A+~`>7NqdKhk`@ z_Lk&xp7#H%3HTRWK;?EHzn%b}1DxBd^kekK7tf@8y7!4b&H3P?9!iHBwWwPZKA`P2 z<8lw+G>@{XU!A4+U)6G*FJu`)vF7M_mhhoZ+Uw{SIe)9p8#SMU4+%cU%mTt6Gdzqb*Y_sCzmov}p9FZ% zMN-cbN{1I}yRv{&f9}`*oYX4p0G#OUmB+ITG4wyc{7R8Rr@Z2?YX6vVuPOZ7FGxOy zzpX0#IUV2M(7Zm!@KDBjd57X>CnX=+x1i(E1pN13EcO4{ce9Mgv1o8L$Z!Tle)a-> z3HrfNeQB>g{Xzo%y^MbjTogtab5QYLWPI!&&1M<$xWZpoc>mo3zg_9%-O7KC&j{Si z#|sqxQRV-)YCfA7t~a66NCJFO^LgY6$=}yB%L@PCmj(W33jZwNOb?dc_+`a^>VCoJ zSa(Q6@z9{?L^j@nMGdIRE-s zmLI;szj)32l6d^7URf8Mu*UZez^T1ck7PLqOmjYt^a}j@Y8T=dR(SHL!vE-K7I%2p znt$=(>R!RWb8nU*_bB{Lg_pl5aE`T!XTO+G&tAZ39Bsb+A;t%Pw(5CxLO%Hf_}3HQ zPiy{HsGf7X*8lqo-*-al&oPzo?1aMaTsOj)f#Sc+aJ>ng-v2Aw&tJ+i^n3cuRrrxl z2;A_uWx$C}j^_pbkm5h3_!qn&@Xss!Ifm;^=(O%qDfjyC2>wl{$;0<4{NslO{;;OG zoZ)&CI_**Xn_dz8ztd++fRjFRMD-R!54R}%tPRpX+ZF$*g#2GhfWMgl-|zvU=O-|J2U}|F;yMUVVt;1Lrfm$N7ZelUkom zfD=DIYRUJZgnV`bPV{!dmvaA8%Y8xN`xSnM!q0=CkE_pR4DWG*TO=Qj(Tg-L;JunQ zeA=0iPoVky_0cS&8~%0!;Box(frNaHCBWauc9f^=n(23=jJTe zl45GpaWmk(d=>pU40s$JKFaVO=R%biLz?W_gnWLIkk1)cN`G!Rk>wo5G@o-Bt~a66 zM#bOsdX}N4|F2T`l@ANt$k~5?Z+!iyfKPg9P5A@R_-JXl=_@541v`JD`ER&g;2e`0 z5ARd>(P@F7ru@Mv)SHcFYknTy2%dM%=#)1xK0V`k4m=`)g%UnASBEE{2mgdh0QU;7 zSoIc`s(F9OE8@-mhUd3doIdAv|hpUy*FYks3qt)vRA zqCX$FsY0#h%=^uJsWRp_eaD-d$l$Ay^mMjj+P9@d-`ksQmX?+oAuR}7{pM&33Ae)& zfLvkbc1<^j@MvYlb=?eH07?bC^G|L}ypm~eZ>z>Qsaf(w2z;hoAGwSU!(%O zR-rjQ3DEM4hi~~!H#2F}_@G}edg8ZGMPh#;Rch4zRA2i4r??$j-sY2(hmVI-&-9ZM zLkS)%+(A=1{3A=!FYW+F$#0JF_cdq8g#%qO2(Ok_qdC}E1*Mb+i$N2<4{o{} zF8NCneYnnrP&ZTOqk#NGU(K(FpSYRYl3xfG;f4fPo{5a-uK6^x=F`44pZ2f$bYRV= zgKIt=TJ!1VHJ=Wz@w5-0HVr>E{km;p_m+_f3|uSkjeBFcEwkIu+0)I5&7N1P(oC{E zj*Oof-1@FdVdm$ZQmp{@AD;h{Q@NR$ecsmDU86JOyLWkBsz1GX*tsrP#iT6MgE9t{ z@Zo@0aKJ6#(*omvV3$n;F?Mk)ski?rYiXwu$jAqh2P}m+F^s_r%N3^WZs#50S!k$#0gb zl@4YmMa;zJ4DO)%j(NZj7OM5t+;VOZ4sop|I2$U_u#a86y6TWeyHKUqglC{fu373Z z6)lK7${IDRbO7PqSPr77Otl_p80C8ht9AJF&CTqw7?_(8Mn3lYA-UDTN!4pqi{ukuP}2gx9uys_<+ea&bv6tCXM)t5 zsYW&Bx~VjIUvj6%4mGAFT!705ikVik(1fdEP{!y-QP~%gJSjc$B~>p~>AhlLcmx${ 
z)QCMkh_AaM-WXT5_7X77pd<8so$&K5|MZ{&OBPbjW+L{mT zZ~@iCm=zYY+0ngwN4AW6ZmQ3891}Lw8R0S!vMvVj&ZemioVeZzZ;-PZTXyAkETLxIS#Vn(7 zn+ApWi)FaPcA;ivuvkE8 z)q@^}b)mT^EDQ5g+a%Lexe>`S?%N3;U#wr;!E7tAOQN3AY{^m8 z8>^K92@>SeB@Rzg;Vuf!xo)d+U8Q=kLIVOv@)!ozo?7iH_LJCl`#eMvs02$MrbMMV zz6+G%Vr7KI3TPW(EZf|{Nxj=zt&eojVm{;E`K~*E&0RSfzS-`kMfiINLoj}_ue;M) zs=3-hi+<(329__MdC%pRa=Gbk9)6c+(%y^a`;BfI z(qxDjDY`&ZEdW<_%GY(Pgy0VuR+A3~rG>?2W3n(wvS!+IPz(ibXbbXUiRxQHGy|e> zTWdTCb<(m8XcW8*ubp5i?uWw z1;4u|sr%>aK@uB|uJB>$5F1FY0$$uLqb1-*WY&{{5ToTMsz=^jKNh4)@hav|#++jr zHi@R4K^fr`;%X{%0yU5WnuI7*6mTQ*F`R8#0YTKPc8Q=aK4!)^UW76OBWc*Gum*?$ zpSBpGt>xy7l9r>T#x7{`aSfv8j`1R4ThJV7!rYW^HDfW~Hj^h2CjDivN@Nz3-d#~; z9|DPc+)O#QpLlAaX-(5^&BJcBn}%hmwCqD#Hgi)un7O7^he05Yj)t$&oTjc0V-5V+ zsWEPpIdqr0_F%2GzDSrDaYLikVx@t5(3=}*ksc)rvbsk6=+u=KWHE$cDw`mTg-%2p z8)XIC=wAwasqg8q#LDGLu9p??jI>lSbymWF1fH%~H^a%L+4F7cd&_D8vssmFzHi(E^uC z!9hcE)B}tOP2R5|zD^(w>!j(7zDy;FXv4u!E7#XxlzaCh@G+5H1W_jcQ?d*s}%TP`HcOVxi%cq7;jA-+{@66eFr~ zG^uOLJ?a}<0Ic0CRub&)j4nFbLWb87m{qJh&0*t%vR(F36~;WwHrqT;3)S)RgCkHY13_fo{T#w+ADaYLVg+PxYX zIZ3gA`%lfv;ODPydnNx9RLKk&HG1jyhhG}9mj@Tcy4#LE&_=zbK z)tN@KeU}*-nM-0c3O56Nzu*}aeSRAHd5CinMRXDKXohu*x4m7Li_8EuuA%ZQeo+`YxtpoQaFq5KttCW=v0$zT)jIgM*f=IJ zY`o{gvEwLw+7e87u)A?j=WAF6d8Lkww9^3@(?ul8Dk4L!#M&`0^PrpD ziAh1`u*@fWOE)t*r4i|73ecr}%iJUV)j)hpveh=LmZmL_X=F_0sE03?uwb!Y6Yj%W@MY0CYL?N=!)V z7qQxfn~%7qFn^=f?q+^1pqO8%wSX4*l~!$XvuT-F|4hR0ju054Z!ZPCJXYyKMxn}3 zU9PA8oo;R}RT~^xuzwuedZV`GM3CP0ivwv`{p$g5i08`XSi}E(P_-;59L>^eECm5m zO$N4m&m#fh6&fwx03;QjCTy#*I127H3L{Kup@p!CY^d#)CEDZ=Enp>&##k%)wRtr0 zlP;!h5y&acoV8YNG9$1YC}3&n(MDBttuRU}k~k?S3Bx8}qJlPg7_*P`TeWsd-DXHl z?E&q5s4k^2s=cVCD5_kZQqLeIizHZmV|F^wAd1ZzoxgaSBgw!UW7!$dt%I1YuZT_$ zHiW7ytgOHSUB{a^{t^tA-f}u3977XHSe7gonPei?-GVpkJ#vxr~!ECBwqr0>GZPg$_g2=8wC#0?PHfu(amUTF8d)WT3R%NWxNJ0X5I&g{hWH!n{?&8Anw zW>`u0W-AA2BYX<``n>->I*MJ&w3$mR#+gUAMTNDHI!Wz#Q;mbjMytns`0cV-+E*3k zpPMQh3g~RBh8QQkuc7lxOxk_{Myxt?=g`hp<5Tsf#g5xtfaNsAc(G+|2cqH@A6(O5 zR#J=E09(OGz1EF!On{h8Q)YnEYi$F^EaB}6Aog?n^^Cj+D*#p~v}hqQvXvJQ-BJVk 
zOfgoUAqr?~P*77G#Ow}2t5ja-BpcY{iI#XD3ptk3&}mE71}jxnY{nzB$rL99V&*Mg zM+k|FbPfh90ln6PrBtoDR4S~F&v?0<*{AUq%6K4c@T7Y& zAUFnXo1^}h*Y zsCS5IpQ0em#ZCCsLyGPeeMIO^6;~_BPtJ%D&QF#h%`kA08xPO0yX8{?y{h4LrWEZd zIVoJG7OJ?wUJ`zpiKOb-;KzHYsme0FONqCJ=u!-#(~v_@U7S>~sBgnA7Eyq?L&*i~ z0r9Rn-V`O92p1$T%1Zv``al5*Q7OM%Dj;38NtFSFi-tbyr0~kF6jluKtp%j>D+`#^ zDY>QnD0A)S^L3cC^$Fe@M7e^69yXUyF3hFoWZp-9r{@cikX}(X@8fS?m&F}EJ*9w) z_^$I~US~FtOL1`~{!RLsby?gwh`-9Gu6M{!7jXO6#Z@?cb6u97H8nllGa`qbp570C z4c|I66g^QoZze)ebx-9PKJCJ;G_R>QfKZk#6yGd`} z52yFd>6Lkt-h5AT60g+LVZI+Jcan#>rt>H9Pv34d=}r9D&+7~8uPCyqzri!->+qbu z1!?Y0oY_P0|E7aJHAM&2pZ-n#2}Z~D_@_QK=}kP^Big`w2nmNN-=sI~zY)(U{T4#P zVdB>Ag`YDW=KGhFo@mWZ|NDSZ6{&7??7uXtXFeekYxJP9!pF_H38z1#=?@WM;y7wD zmRopC$YIWZh}&>_6E8Q5_xR~B^_9Es`!C@^IK7FhJE`e^`p+h%oK3j~dY6^n#NVA* zPq}siGxH68?DpRa7-5?6+rNH52RHF;`=_^kB^M9i6i#pA`M%ec^6z{pn!xUl zzqitxxW5m+M`~o~#N=Y9f7(iK;sYPo^w%1~(+uqP|G-Lb;s~cNlJb9nh%R*43GrCt8(fYDSZ9zchQOMK7Or`CQNrn8aGq&MHMJ*w&L`r7wrBR$pM zq&IPnpFG9%6o{Uth)HkaAs^TDz0!>K{50hoIzEiF;qpz~8qOF?!PlXCJFB~1EUo%>CO3iO>gRF&PP-MdEJ_@CY`|}|6ChJ z&%Uxx%D>Yf>e2v`9W;hLBeHUPH S<)8BzNqgaFxypixm*1q})q)L2l^L|NTvz!VAMgNBf7NHipAvVmZw21Af_ z-Ab)m>J#guzG_h`B1H)ZBwDMnT8&CID(bGG8WlBKbbp`EoO2%=vS{1yAHP3b*gf~m z%$YMYXU?40y*F2S3eQSRNHEM#l5v{BT-%`vlBW@wSJ*Up#w25~k!OrHjxq)TpNy|b z*FWvk#mqDfwEQID%W{(vncF)ZrND-nnzb`c(YS$<`~Ie&W@;F20k9lpP1_+^q;9z| z(<};uPnN~pEqRK^a#^}umM&+ehL+zpKb-~~KW0i=ev01rm-Bk{ddcpU zj?jc=YL@#c%2CcA`pMLUGj)CO?9#`E!pBU_b~wJeYW~S59ba8OzPhTWv1xqMq?5;= zeA0x5+6gC!-sDf(vu6~uxU{!vD#|=U!-ya9MfW=oWcI!=dG3h5509Gh@C(akf5B>} z;A^&6gah&aTO9hiap=?H(7zG~zA6rU zS{(S&R_Wy9G9>_mO#*uSg9QscIKe9h_hLExU}IN-9dL zs!2_FU8BFOvJ{1V)xI+DuUuH}Hz08}1T`A`_2rFqC|lal;H&rRg8q7cEz0}rt096_ z-6CBd1*@)IKs^34DQl=BUBi+Fz(p142GNOw0t84E8HIZ(c-w4|b{rnI{1ax`g)Hn#{3cH{%0mAOkw*m!8QQdnkTX;lqP zU0Y{B&$3!&c(!3hz0YTyU086&nI$JoIAKDrb$z0B^%Lvr#0e)^*Awl_lOW21g|#)h zvJ&`cqWt5`MqTkY8Q&xgH?N3K!K^bCDZl(T2;XE)X_xMQ#TlRzmL}yA0Mp$GDgPM; z=Vxww8xO7;%-PvF=&$@FphOu`bAGsgRn;)|uGzp7cA(@_jd?onKZxZA8JFt(4Zpq# 
zLZldrbbf@;CmYLjUe%Xq)o0GX&HmeV7^Og6=KNe`jb;r`>vjvoq&M?f{oqaORKd=- z6+JCuP3>{r<$&+6NaeTN0UzmrhaK<}9q?WU{3i~0p96lH18&AK%Ho8Z@sM!Nam`Q8 zTW+~b#V^~<{7iDdVOr~x=YZ>}yWqSIxVdH^qR0Vvt{>(&;H=yH%yYoaHXyFj0Z%uH zk=8ljLmcoX2Yjdl-t2(G@vP5E2Ry@q8pbLI+~t6;cEC+vCVGtn?i|nVbHJVB)>;Rg zYZmjf-T^^a=;IEz`GsrLmcq113tj2i;F%8i5e|5k1Ae3f?smYBa=^14@X-!Klt!@6qXs5SjYuYmwMKt&Fzq*a+m@Dw>Iy?_o%vTa0>5k$vrk;6* z#z?2+FJ+z~G14aa`OGsEM%GIHBIX$aBWol-lX-^S$STR7!#qP_q*?N3FwgD~sgwLE z%ro>w=1Kkp<}n5uks`?-%X~WXd6GYpd4|47j^qz!o*^&dmi%z$8R{aLl22!zAueJ_ zK9PBbwn*=H0F3$OIpi7ABHfbzjCqE#NT=jKVxA!^(kA(LndcA}Su6QBm}e-9tdacR zndeX!Sta@Bm}f|fG)sO9^9*H?I>~Qfo*^tUPx6m4&(IYqlKcbAGh{{bB>#Kn8LA>V zlE0aGhNy^J^4BxZ&=kp({58xU#=IfzQZhiF8W-Qsx7{RKbHAC$sfu5am?pP{$S=AA|h_d4`-gCA(AQibmkcnB8KD>nP(`7^zM=N ze-?R$fJnFGKVzOrqqBT5oXbL>SQ93MQ{6qnhBX(P`dL?ja}ykq-q?OJ9P2r8hWiFzG@glLey~ zGHiqkK(n4KdnQA^;M8|uP~o=}{0ak~^{jOr4!*-)>R|b)wC0AbAQu+lAy8prn>To^ zSgG85;!rW0tMyb!>s_{$R=s#`{w4Wy^M96K@(cvXUuNpVs57yxJ>?E?7M-4kaU-vT zV}Z;|x|-VXa07)EOw%01s~7_OCJ#8ralR%`G-1^g1g;Nej`<`o3C zlE;(C7NQYzOdefAd^d;*-ms87Tq{H58T>`27yeP{g*#M210y^eT>>M5H?R%0kA>8( zai zmUcrAS|I}rHW|h<2KA}~QzIONB%6%O6Hsg$$@9WX00FCcSAk<-oA5!Gp$Op@1T}4R z)3m@9qfY{lH1hBSzQQF#fH6Jr0hwg8OV#np&8u==dA*_Kqnkm)%%q;57DE24g5WRH z3IiV(2EGZOQ2@~=x4W+HNI>73dfIk3Y}|T3TPIi!X3$&pq!v`0P#E|iJaIa3rsd5sYg-fSJ+ic!B1RNBLmgaa1PVm1Rk83G zC!Xwp-;G*(*xQ?~G&`imya5cNNy}C@9BgX!6*1A|DNBwA=$oOD4Jh8T2^Y(Gu?`o{ zLH6l^-L~+kZ3TFu1=)3j5Kw7SsW^O#f~KzGbQe;9Q0@zWLx4|^gclLi+Ex%~cP+0# z+p^i=U%62r3j>TFqrYMe(eTDEqfu!F)k4EYL7ENa&{^b>ino1p5nMDppB8#ehJ%qE z;G+aqAus$7u#3E{khk?mt|#&VAcKXBG}TiMu?!Azj}fl1 zsB$j_l?)tCf`XQ<3E{CI+QhILzT*qHS<9aE#(N@Ns%ab1Jhr7HzmZ_yKCKnhuu@A~ zf_JKT+Ai(Qb74F>8xkXODS5z7dEukMxG?Y~+ouO^FtIK2S6BzCT?6yk{9$L>H?kDf zgzHhJV%5Zd6$D=TnK$sRw`FH<(X7Hy${S1AZo|qXnK}wB7ajTWX1B51b@()C(Xm_H zMzJ@Pas*QEGYDyBR`F|1vRTzEQc zANE3p?J2)QK7ryi8~A+w3|K-RlFyA)fI{U9Lmb~*cD7+25UuYcno}IkZVCe5=Fi0R z<_?afo{4|=2Ij734s%&=ps{2Ar{8*ONA6mR2J&oW6#^K42wDJ?7~`7>3~cd+ya^NA zOnoMA@n=grPwr^^+}qN3x+gSictT^>w!Xy|=U-$F>vIc2^ZTC351o`)Sk}1J8(i8E 
z8nq}tu%lq}2Z{NC*Yh{;99)?2RsQC8Qm2>gI4`vEHVQQ(lzbbjESwxYH{?$-8b%hP zAh18dejA|QxvqK}Xb*)k=09?UH#2#K$oz)mBF zeY4-$dk~xFB>&3WvOQC>g_E~8zPZe^wHXye?t-6helHCi)=mo*9s<^i4cZ-i25CNL z?}^9g4U^GDU}XBo)j7t{q=LYw-ev7M$-|PsXxL)(;)0O3voP?2H?+(!cJJ{9K2!8S z8{QNnEfKwW3q+jLZ0z|iy6u|_H>1)sdVy|E?)w&iInBuq#BT58&5h4^TQ+Awv$O&@ z$GtG6yI|R7wrt_#_QvZzm!5_BcHmoY0IstuE&NECj42m;F}|#sOTT+NT;>zlfm{|N zfAQSN3Sc1Ia{2~3;>aTwA#x?*S^@=P#OluwJ1p`GNN)9K(7)8!FQ!z2z9*>kaANH99`*Z{vw+4#a+C4jn^R|1! zOxr!(D%r^d5_Kp8)ogS_burLkKuANvK*vyp=RP$J^fNRv1nBTm`2dUoVS+}mcawU|w2l+mds_Ujh*%#UH`Ui#aSy!W1cA{6#7bPTz_ikss)I zlAI9S|HjcH&@PIFSAK#Sl4l1SNeVXivAQ;CaPWK*Xw^I&XdW7v5@xb_4#zXO9YWIjCA~zFFnTmG?r>#ncM}fu6%;5B$-p%hMdPAeqsU;!`br?)7 zlgw&R#Jl-jdR`Fou_fn)(@kAB_u9IG?=137N$#&OnL^BE24?QEuNhTw$hV5 zJ6%_uuk8ROvta{_eC^@KQNp%CwiYhiw86bCu)(#erkhz)XzE%d;kW*!OzLEHX$-3z z%__%abun4}O0hamv1%l%-^8*?q*RB;u)>%uEt6%k$|S2*idC{=HHECaV5L1Z@-or7 z96KdV9U7{8QJpDWo{3^B=+?^)E(Q6ANoc)O(0UN(S*#kpR?B6pGQbNv`b>@qVuwrc z1`TJTQx5;)eW;iGnGmwrQ(+>Ex(e$tJ~(IL9gpe>78nfBoAAl4q-lhxwsP9P2Z zP3%5-i|eX`(56Ap4n)mK7tfWBjDCzxwzMzMxEpZ}foD3_AfDdf(q3;+VqQ21OwiMp z|0Fv`egG4xcCL?gAVB#dl9UeGZr^f7Js>{^F;Z*cl9 znl6X}3_9R;EtY@qX@=0k^`aX?=n*JlMu=^Q{o(u2UMhrMmJF^e%m~3w6eyxEn)x>5 z*<_Lr7Iy|#v3?WK-i%RB_8d(Cd#*HbVAhKP(Cuwc?xe{5;MsBDIav30%9^!bfl2)k z64iTU_~6LD-AM4ZczW@(v^UkYVi8*>rSu6*06oe$&@Wj0N+8oTVb2l4l>AscMD7bt zezsAuiP{ofwgf$|z#m1Zpr_Y+s;8s= zX-ld0yu28zwC7FgmoIYW^{`&FQ6BzbQ@-guf<;P!GA=QQHiew!QjI8AO-$*uDvliP1JH~Mu@h9B1^gaVtr zlmq7_Hs^zF&fHBkIqy%-i!9FESTx%ql4)0yN%ye^7Nr6WY^so{DjJ8b`N*G@L0BM^ z`(6VWnsH}F*wo-9l+~>qmo-fU9UiX~>x>2K*)gm;oUA|pP_yn?vJbvw`80w8`OdZZ ze)pJD2 zhX}BNmo|`)F0={jRWv$hY**^)5zLMR)@+(O$Ps?iG+9px8Ng4)e}mKaFuQ@ysa*0` zSP5%>$J_jlaPaezU(n*`B7?Znn@)f*p=YhHKdOubDMvfF9!##eCf7(Q^fk-&%n*?| zFHqi-#E2#%mVZJV6fD#9zGyNOvx4{Q!8{91ph6Z6AfX%_(FpZxEGAf}!;ka`1|BY0Sy-&vH?28$5$_V(~!Y{DF9MW-YoCj}N^774Gn39}r8oYz@|nIx>T2{RppoS*yM znmrt(J-iBpXff(JhI;0FV9D$@3)+)w9f<~TG}GMep50Q+YO!M;v0BXO2VFl$Wi6;o z4U+j7r4uYBH>G_4Op3_kO 
z!B)oMnzpsY1)L!d;-9RGWW~fnr|{wFc9T9>2f63c7i0)hZiV(&uIsw0mZB-B2o&SK zW8T%sSohuAEBCx(5$@Y7U$zM?uMFlTLoIYpws>-kBFo>ieDm8{C~^bxB5iw~?VEYY zZ1`A@L>E}v2g<&n!Gk2u)z&;bpc`%`-#y@~-0(X1u9V{l2@xfPXn7!@ICCtW(kE? zC9u_OSs{<>4s2#9T4X&Cx<&N4oTpkZG(Ly zWk7`CozC#Cri6?KW}p4`XX4z)p_bbp5R*LE83z!nesI@>8@N&BpGU-1t*vU&eE99vXkJn z>RmlOS39?QV;f320w%Aa!)_cl>UMhO-T`$OeJOM}oH{ThL^df~q7#TMk;d4vZ@Z_* z=<(qn&}^?zwz+G8{h``ne8 zaykdn!=~YmLV)}?T3Q9I9Uxd?I+nQazzQAGPhO>}55A$>p`TSaoS2?$?^{7U@kf?? zagSxB+wZZAbSyQHcoUj>9Tv^uZ&-mC=|mm%W$bA@%pqbP_ByctSCpr`KHVH1>`_zo z0RM0lqLCTsJ6$cKP?2fS`A&m6=H-sSO|c;Ovz;4c9r;w;O%=_Xhx^ZDR{z}ZEx9K;$~Lg< z(N@{Von@U9d+lpovn^j+VlzC(!SEt7{Kso%Cn*QRcF%GUm`UVWTn9bNu@|^)EjyCd zHu7u0waRc4Y>+lHGIKjIIIA;_C4wm7oxG)?VtebAC|Gt4*19U-xz^?gzq{62Z$HVCRkeuXG#>rBx>#&I zIsDbj$_a+}PeC6~xYKG#XCF_mfZgZ5YTBJ+H`y!k14Z*3$^^wL8b{fZSgUZ+Z3d** zI8DlxNMMas?-VdMoo|Z|^}hcP)A6yLU>VFCyixpj<~-Gqqj5NvhUkw~kQb$6maCA>durh8hy2~dhmfCk9~C^gOmDAQaO zb6|J^=Fp>O>m8O2MmucqT{&#<&MT(v`Oq%I5R8ZQ`XTxhaXjTHTk09IPq0}! zr+F`v<*zK3&xmDMTuvy-+PhjGS#t`>1vtB?xmlWXz7?=uy4{l5IeYp&D;RH9(Bo55 z>Mk#=_yhIdpJ}mLVY70^zN^XVT^L-(MOowNMK75SxDl3TuJHoJ&ttj_3v=H`O*HlJ zjIt8=KhI$O{&Y2#?!jzAFlsD?K@`>S@7PGL<-bt2$Hx7-&IFU%k-cX@}EYDW(jbSTiU; zm-!OHoUF@Qw#XTXjS*TNnz+5e5uL&q$H`bXv}{gy47(N?rbDs->is^QrERGI+IFz5 z^8>e9Iy)!uL#Xp0OXsCBJ~)&_a7Lf>V&N6J6P)`k$#HL_K9u+&W=T^P?9$6Rr}8wO zSGD`z8q+xTaxKrRZfM%#T5*GJ5_Gye%tSbUP>i&z^(;~>W$x1FTBnwl%iH{1nb;C* z9pQxBV2FWZU{Wk((gcYySqe_WWC`OCH?h#zgCRuE(VWxPvY(tYS-DA!z!*_h{G5G& z0tQ7>XoTGpxpjk=c^vnSWc9kB!jF~x7B#nV*a$X2izsfmj>V&-7v2)HI1Xc}+KGFV zo0Y}7P-77nQ4X8TWaXB(szF)RFo5o36vk?+Ny<%*^I#^(IEa~JW)C`u1jHO~3|tk3 z^11uOcwV=iE!yv%gkz}_vjjFL@_J`P4>aaRZiBMTv;i-@b{>mZVK(6K1h&jJ()9PaAJdLCtE3>(DRk%$dF zarkTw+g4MXyk9`9?ZNq;DwfuW+b>y6yREAK0n(^#1xpM!f<8{fJbiF717*Szq<7m# zR!6V4Vvrjo<`m-z>`aBP`kOhtZc`yXEHca7_T%4L<&JQa`vc3JW0ec(an$`NSEO)a+b}eUl(yL|2GPVUDR2MId`S5j@HjHLxijKIaL7Xda3$~ z(&?iZJthDV%Tx_4ypc1et^>S*LJ3FcrE-+B*AN8%7GH)i`%)VUQte&mh}yF-Ack(T z4D6gk)L^$N9Qms`@E;!|j*goBrNrO`DW&^Zj*%m?0lmAcZ`}c}F1qXaa90E~cJVH= 
z<@${>sg1HjsHT%@Y|htXGdX;TyJyg4c!H^Rgg4!&v0QeJ+SzF zDEl-W5_WywYl*+`24&HHdzIUh`aW_8k@UoVD78Q{W6RDjPK*l>4hnK zrp^74-&nGbAqDHl5$~nsKFs3Isp&o&R=8`~A3fW$%=rUzKZ)G`)L}O4^@=+>uDDFU zTWD;seFMz^Zs58)A3;}KwLLEn93^Vt4unaL>mpv88`S$tocHY1d+Jy*?@VY}x-)gr zhXeJ`m<}j=!}<)_oVc@-d~eNCut(Rt%dUABYTkvKHM7W8EkCz<^fo}deKQtJqQSOF z?B)@axe1A2h?zb{d~-KPbPGSaUG|svM^nFu4Ycrl6FD~C^X)}>R+h{YQSUh`iK7SG~r$`MA)#^(&ULb5aBuG z{}W9#n+BWB8nS7z*!3}KL$*27e9fNknzh%>N7H^y32%?fev zpCLDkCtqg;tHbEgI#|7r{mJk=OGQ~HaC(&L9|AFCb~iY7t3!{yj+5v?Hf_9fA$y7% zlsg_dCKfrC2#(1p&tGO1Vfi{qX8t`6l$@)XbIGfATk}*c3wY9gm|Mx1Tg0VQhRy(y zm)E1`ni-BIP7BvBuOMy=NEUB~SD5pGH3NUgW zl63ApvBz{5yRnUkeV#rUh&Ver1kKnZ6AaFVxi;!kGiZ$U<{fI#hxhR+gUIYqW|jzt zdsjl9Egaz$FQs9B1!7xYTOoTOJLTo&e>RJARM}so`d+g* zi*#@LU(Mp=cD-=>|G_M5Thm+iz{o$qTbxCnl_LL(w>Y`|Lb(0^U=}skl+hym{!>55 zEY2d2N|FD?EKY9a!tK8`iz8mE!{Gla=5X|lA9Io&UbJo_&bS{0fF*Rq;h3pP>xknl z(24>vrjl_t_8hrqAMwV?YchCY@7@t_8veU@gBw6qG;=n@n|~R2Jk2Y@+SN}JAQaoE~msa#oKyl$eX9=hy><%z*5yCT=o`w>LMN zm*7!pUT<$+W?x=qUV7V`SMnMk#ix0d%1O4p`RU0;^c*s=Wt-e*9oE#UK3k z;4n!mXG+X$JmqZW)aHMAn1`ct|MGa4XU8-=HnjIcL7u>7{X|eNc6am*Nh_DTVJ_^K z#NPH^IsgXR?xC{ z1y>JTfm#1sHscpsEXkesb9^-&jE~)$@q~Rdb^&cxG8!i%u?`UEUFq4~yma?W|7rNG z_fNv_V*d&Fy}&=t^)yvaIZewG%Do#Rz(3d1d&6(9(?*P(h%U`4V*5`MsibDzmxOgX zS-V>AG;uJ!8^?P3-l>1Uvv6*Z zOeTw6@Mp!MS=aAcSG;@WDlB$S9jF+?B<|W<*Ua#Q-9x?W z7tmT#uU~&oKAf>LVlZ@J8DX9blqv8khA?%xQYb`qc_t}CS@>A^!-Qp4BjteJ)czQ1 ze~sxBJ(=XQT?C4`Cs#e`2i5f1g;KSbB96oAN~=m;n!$X^V*iO`q_eK4J0ZXyo0R~e z+#c+8qoi`k8GqKT+f$4p-FgMo2nSJG2UcmF94J~NV5zOd<%T6BXZ08GlA;B^GkXr7 zInL$|FuTqxz;~IV!mVeip3uc2ZRl(~;~fK-EWrfom1f|P3pURV3jR0nY&1+CC z_HAEO$oBAcV5438@hdIE9}Ozg@EI^dc)!O@*FH2+vBIjf$x*|tuUmW;+I*bbw;y8% zJ^ZX`#K=D!x?F=GqJ_w@3y*UM@j44%YZYFt3#+k{7VYu|=V*D_!)JVkmY)2KzF*27 zYrB-$7dK;7W;*e7&B`?MVpr=Es93R-1+h#PX2k$naFeWkrs#&X!`7t1It4{xedn7h zE4u`clT2skVL~xU;AqR4DmwGDJ?EEJBV;@DA1wMmW;ViVEujvH;-HZh4Ky{G*;r^+ zaDp9NUR-Kb;B1{PY`=$(Gb`v}Skl%~-K!@X!u`fBd#FRWtt|V=qgpsTcbjcLcWW&O zO{p%B`|b;DP6s(SttO{?Oin#Ad|I0W|3qZ7qLal}xhagu_i1UgzR5*bxzZDDUK*Y0 
zACBL8|4{rc_7B4E1+kkzYOZ8H^N*f`n_rK|0Jfw{&qWKU#MO6SUa1eE(RG z3MzB1P-e%wc)ai~GpKz1CwQtl){OAA@zP)pPj2G#_>-S;wd$Kt84Z@fV7xI_hhKP9 zOivt++n&&waQ>9?-Y=}ScJ^pLBnY4Wh+VUK0|=uSmv#gB6|(p(ld9K|P^ zf;)N|X1Gl#BMQaz(!5HZ8(nI8=5G;yR0KusMU_f_MMDjauthlBAwoJuSi08M;Nh66 zlo=foP_gmW1N7{JP1ddJ-Kevh&j8C#2iv$Chp^p-pAP`ixTa7|c)C4N2UfTSULN*h z_`-)FgCc7ZK5~x&Rnb`-?vwpkI1k?t;c7h?p&lUIW3bob>2SGTxEuyh4!95zb8yCl zQI}iI=O~ud)+DLZm zu=Ll92W;p|S-RL!`YF@I!Mr-$yV1*FOdY&1c*>tO6CNjpa)0Ku-%lSDuce8aPy#of zaMHu}VAcd9CEG2H94*BMTxdH4BWx$9%(g{zI?(A9@liC7d3VR5Qq~S7TyQr*?0ru2 z9KGe^MFW~5mGzusx_WLOfrI7L8}Bzc zeX7gTuRXZO!^;g7IVU-pY_)KzhrH0J>=Vk}=ww!FF{22|G_t0*4I{6K!4n3$=@APZ zCd=Drkxp-reF~FZ>A;x(E<-J<8oQh622fir*ppael0dDq_#kzFRd?(D*{?ss*$6+IhDOML0H%xLqxW=B!B*H!vE&>e3v+Ntt5*@)z4Vb5eLm%C%{`R+ zI2c-Ol6{}$$&o##_&Qp#qZZjhImgudDbzz~0y9Lk>9V#;EIr&V?NsR&macJ><~mKa zhaQJQqolRyA7g09$DNk_!kFBX(W;vHKkAiMV19yw`O#!PQ89-o(Z~$VoW|ju~zL^ zO78FN)#30IwUfj;c6+QaiwAq?%IiTK=Nl#?7d8%1qbtjjy?(7(2Td^ca%F4Ijm{*_sZhAZ5-n*z;EFTHrj+J(*>w zGQ9g9yQ)cMRc2JwDT=5W zC<`D8%FfAaccF3_kpWWV9Fp}2EuB%aV_{?)4aZX>HE(sdtUj}m6F-z`56^zh8n|z( zRR)IYW1xoFDId;<>R;Pd?#Yv~_Q1{eeaOJwgY9%!!OvS(Yqm<_T3ZdMO@*JM!`Mo)p6k&Aq<8xM)0+yc#ig%b2x~U7?yk+T*gvNtS zXu3evu!i@GaPh&m#S20jE|RO`IFTYgY%PTUgq1ELrYrD}@~UnaPZ6bjx=@K7%B^*Z zeEpp$p(meun?UMVv{35codc#bI-LkKTN-483^POyde>|A++7Ogp5(0Jc)JSOjxqx= z);=P=o+?WFk3_r2*ov+Rzi$t(glA|YNPfP`gPFqrQsvnJ!swfZW)#f5@FmDdUv=7UR5DnM&d)f`_%k|!nr%gwti=>5PpXTx0;jrEh;PLJt zIB&lvf)boO=&i7JBOeOwLxs2{F07JWOtyPI1VrvCFyGRG)@aWgK_oQ8Wu|>O)V@r! 
zFO#iHUa%B=y*>QK8f72Zy9y7b3-O(AVQ<-UFa)~PbFOv6y{@PK?oGh|PS+EL>xs58 z`~zWFSdQSEQ<|54Xf!7C-ZF@p(1T}bKQ!>7u|bU^+5D>!ysD)|)c^}|g=gUDn2T~1rRL=O+;-a7^M z54A1f`{}ShcP+Yy3~t(gy=IXzm}N)F`uxAJpEqQfSm-~ZSJwN_iOTloXp zs?Mz9Ki6t#f2~G4wYuPzcv^i6eg9jnc3~R!1GKu?tl~e{Dzm>w)@HSPH7@(saV9M}cbhMPXY9e$nN`-MFF#^aG*y1?U-j{Uc#PB_^_5k%~3^ZbDb*f?fa)i!> zR^5PUA2Wd>6V}pq=#@MRfH_BmiZEhYNCxFx)Z1_P4amB^o7hxJ($bc~>o<8VCxWs9 z<;r#q)^b*>4PmQ=Qhp;aM0`e1P5?OxP_yj<)vn0m{@~w=oHC<%2b{GG%XaaY^E20e ztxeFe-|*hOD zIM@YH2MV(iG*rNIYjFDN0ZWg6wo4Db@ z2Oc<<$d|scz*GX2pz7xWd?98W4uMz4Zm-&Y0{T_pg|XW=EU2ic2=vk%iGf~i4r=dt zJ~(~lhHa1i{@0caLmV;$JRLz#M^#(zr_pEv9>)i5VCBk{7Hvul?N+{0w{!Em$z#O~ zp!ci{c-nCA1T(@RIU8&b&$vmPcRjoj-=%!Ody2d!0S|beaHHjZc%}E}@SGj}91o9N zdUjx1I}W_m8bhYN08iiXIeZ5WC&fN@UF6w8cX{F@x(<9!m%)13gWaIEC)tTN@JH4o zJeM-U^#|hK9|%UF)VmuX)NMC7gv#8+!~F1t5EKIF%}Kse`f9l41{ZMXj7hmnrlukZ zUx(7Cs`#dIg}nrG48lpMfaK#!L3}tzChl zarTW;nV1VZr2878Y&e>nLb*Oz1kzO@34i(c!f~JUb%q!1YUU^$Y9Yfv? z{xR&b0biQxloOduJq<~C`}H(cEA_^vgIJ5(>t~m^VJOrQ`$o#)fO8FnEFpAjQNCf=pM2Ji+$v^lqv z1KutBW2b3lU)D37hpfo$6G&!I-fKCFGW(Sle2Q&{*FQ7C#pfow4 zMS#u$ly*mU-lqGg-{}<5sUHL~^`p^5ziVKf4S7nxS0{>oM^V4|zh(tCc~DvsRkFcs z{pKk-Dpkd%esv1kq@Y^A;%1C|D{ak+(Y>dywsk6AgZzEAHpOd^Ur%+QT_r}&Hsm{y zuS32G`A+1!a5GFZrAIJiA~&f_#{jBskqN9^t)RVttdUDmDOiN_BU8CdwOw}}cDu2! 
zd#(e)AbJ0Fwjr!wVHQmz#nmws5?agmOQ`v!-z89wgxg;fm>)>QC}8%vTq?VNjSh zc7xK5up2%f?xB?YIbxG42H{>-dwi*=mln_{xJ7&T;rm44flz1tN|RdlwK*PZ<3v|% zD_b)&@O4JkCE@ztzf~;yoVpz{ux`^xzj&3YTb{&S%($t*J-AN}{-K{r z;uHA7@#&eY5qQ17sqy_0MNhH@m~T_LfAB`U*O-9=V2j7AA_vvf1* zpGUJvGeng4E0HM8G5>T*%S|~ll{coET4j;$xodTe&cRP>d?==Gcv}O4n-mr+tlDfz zDEVVSV_RiFtvQ5VppAs!Yd&?O-<~NqbizUL1Qs3iar%QMfG$7#ullxHI>ujDaJBUU^3jeUrc!@bIWb6e{m$-Grd z?_bKPgZL?(a)Z39Ae4JMPEKLL-y;bht0eByoed{U;vd5fCjHq;!x}|@7d8AQAR3ZB z(;LDfQn}%xjR31tn_k{n$Y>gQl*PH1kFgsAB9C8GX!5VKI;sWx@F@plG{=y3@us7l z<))}|JaVJ>>VmG8l{O5Z1CH(oIw?((y9Zby4v(Z?f z@f~YC3W9E@Gt!iwNUnmQ=d)JX1+ zloB}kkOexWN8FS1xe}T_z@r4rB8svkRUuMZn17XZLyKy$jfgL3VHZRUdsou1nw$K= zAqQu^EJdyBe+@F3_+pmAMU48xo>;_mUF}%J(D|TABg}&1OKT)$M8X%%_86K?2?-Jv ziDRG;-HgxK*1$Lr*G$;U;yQOgt~PXUu7Ra{M4Nukf5^6o=npN#ssXVoj?FOpL50|! z!mjs}bU8M&vX{tR>d**(A83WCAotBlJJ^2)iVE`9y^wCdiZp7a1tQ!u_gZ*_u2iZ# z)e~Oa!Ygb0#JwNA@u|L#S=HmclH3@UB*dv}VW5>$*1~{nmte?qyT*E&U1O)Oa*Zus z;~I;5%g<{G=O(>1oS+ckD+uOnWxdQlXA!?nFN^Al@@l%+4*TZfuJECyI(uvLj6 z##$@uf>t+Zu<~JI_&GBbKq&WXx*cqkVwyoAc&aCr^}wwaJ8V~L+QD>Bg$7Fzi6gks z15VC7P191ehbOm^o~s~{OL&v0y)_%ZS?Ejct>bWo84J3A2X54wgRD2yTEu`5Xq_av z5UH^Q5vzt;^N@R*uT7))_b$g7hZv&eDKW>A1JIeWA#nIyXr_YO2jp)GjB{NraCh21|sFabPB`K+jX9A~OM_$e7Ss z(Qc%x8_vAJ$LTVN2`8DmEnC6S@>Y4JSKz5M;rh6&6$3Y_73BY=w{l?#o1=M0=_}|i zwFPA}K=xxoniLpS6L=if0GU|4d;=vY-GTf z{+IJ_vcy=z1(Kh$WNb?1eu$eA=(oQD3p64R(fEEB5ccA-m#7A~v;72$ zd0^h+{ua%kp+Nc&JzPZl<#Am#{mD#U%2dl(qXV^x|LSo*vplJVWBnE7tksy@&4=3Fn@MBIw z_yYDrH9M?qamZ&asU8M>cnMY3M(8i_R1ug{4?Lc9C57iKBX)TI8gR?NIf144+pdk> zyahXe%?j%@d^f6`wGRt!A8uWP_B4a zLE!6rxt;Drr>b9~q?kU!9*PMPnqS-rVjru_D;tv0#^Ufdztpun0z^Ro<3LLET-A*D z3vI#1U3{7le|-{7^ix$^Ax)h74d&if3%Al1En73C(w-4AOUBF9U>)v^X5zy3IX6ra z=m>mT5ttsX2o&$EfP3}xLDv%=pgf-;C3-i~Zlqs&L(@NlvE=1p%3`;oLHe)O`tM{{ zLc?dxzXyxIr}^E6<**9=V*P3+_>+}^PaO3xAH5P_{F%&mP|W+}@$-ER{u2Mf4RwHS z>W~s?75Mv!3Ll8T2Kb}s8;FsWIZFP4UJ}Y3Cl-1;Jcq`?I_@}n&0m%ZE>q*CXq=0< 
zJAmscP=ALvICb+mXob+okFe$v#^GMj@@>Nf-r&@`6>=3O*66Z<*A>MJn&KDoJ2pMM@)A0x8sWk4O49`SmEiXpHm!pjR_dkoD!hF{_|1VsodnbQh5P03y@^4;; zk|z}EKRD}s2{Cf$Rp*3~6S3Dn7+(YX{0V?PiZl)R1fB^ELB>#L0E1_QE=k<;dA=)i z2r(JB+Aj&{!2%)!*M!ZBhO)FPyF;+JNtAyNwxDHm!slnWGDlqFn(*A>PtOTWyJ}C* zFk9Y&KnH!cC7g^u&>97&ykxLi94?sr?Be}VaKGehKthhgdmh8z!Szo5T2&56di{RL z7KG})b6uSojYg%u(1rM0=6}{Y4WF)$1^=YKk#iq;LF0X{FA9^-p$`8g^eVx25m)tpui>mG74ML zFv@BhtIORrwSISFO?7SAW$t=kX}P<$#$8$7SW;E9H-2$_mEUI))HeEsi(!;El+G!s zD80<*zPPr|UsYSvFgM*jzRX=#Tkmtb-Sewz%H4JKwPn7B2DiV~eP&UyLS0-_>n`_I zlr~oTS)knA-~(m3Z+_zf_d*btF7Qc#h9wQvwF^K|<*uqJxy-l3eX%a)p5-}zR>>KK z=bd?O$+?~j$)(l}!D5MKdd{0U%`?-cJAs6ikg2K$y0e~is&YY9b+y~qR95eUmZi9k z6~$qXvf7#&UzwlUl$Vy5y5~1mRQT#4omj-St5|hlB2e0N4t|inuF(&x3ClSAG?AvU z`7A80zsy~UR)jF~rSh_BUunIsydR$hKEJ!h*W{0-y;!lGE5tGK?aj9Ww(^yu!9`qE zq}*3p?RPJ(@>eRWH29%|dptaBF-+qwEI5CbXNJ4MU+VWs6$?wNYEE%imM-!EO{GiG z{HAImeM8mdaDg$^)s6Batz;O-SJp1{9p5;=vBuwce0izA^!SE4Us*%Rn2HI=8HVCp zQCdX_VSVMc(x4z#jpi>^_OTQYCth4%TC19aC<= zuPdrc7oe~Fys@et(v{XPXk6&4@f%~B94%T`THoMv(^o6%YZto5G^87F-#8`Vv~e)v zlG-Fx(qCWdhUt9`M%gJ-^eASkwW4YnS?~Y8J3Lw6WRQ z!OaQ<5(6B*VTcV_rglC|un6UAYYg3_GmFjkjA;}9!n)%ZURo}#VRNpn^VPU305`@| z7;wJ&(lW|2VZsD6=PN(LC9V*el&fOvR}XVLCio`G(4cC8g!%^)>|{kX;D+$z_SHm|Q%j zajsNq%P&stg9U7J*g9%E(IgH!W*Gx5Q;^Km*Q669?20`5H@iSV(V1rypH*_f%z{}S zv9FVFeEON5{An<_w)gk*KmWXG`DdQ%uBfe-A(?@vy751#&+|W)r@R<1&pdC&jJ!2IZ@S3i?szHx~dwa7){sHLVH|73paK-S;r|0AomyW#!R3;vi`1;x< z169_a0#04#uimR5>UBecfl#cfAC4nhiHt?3$fQdv>nj)KAi1v9NvC|O+^2o8p999~ zt0r+~4KmEKM~Sn}boBRFx_(|ePz_oxUk%1gUx}}_T**&W&?zjxMEeOrgGs=gj6RhZ zRedrHvlEGA4tX?RtC*OmUXe{51M9@rFLW{}mB9W}(KFnqxfk=ggu@mA+MwTCZi*V< zOBPnuIEw9~z9oL2t`OBVT&@cZP*4AMQ$70PG39e1TU>f8*0|3qE-chqSo8>3qM_Ns z=9IdsLEK51&0*L6^m|onhChwyPv2jMGYdWWPCw$Lutd!RPqF6rdkJIvbqsw!=JX2{ zwQTzgKtHg7wEWU06QSRehNArSkq&1;GCFA%Q)ki#`n{kjC4&r_tQ#Ek9v-QM@Z92|j?Wie# zd;a;Us)qQ#*h7*}e2V`bL&pDoX}cu!uBz=nZC0~abLbFaP1kRlXYOf;-%6iTbk4A% 
zEo*}Y;LByBJx8lAUtI5ZPjw$NrtBEABP+g&PIpVS2$F~=L3#AgRgTA*-tOUe`c(_jlsR`~h~;bTJMQV~$vU7F}F(#L`0;bIPeEWTt%duM*j}KoI0)1e90;W9rx- z3*3Fvl{MCbttQl6QRzJpU-$Ua+^R=lqJMGyOwKM%e;A0K62oWQ{`yAWI1F?C#`+o# zI5myf?X2_5u0Y)K_Ot|%PHi8m?HQ3^>X_uaT&LqwQ0C-hfw7k z>#!;0(+*)vYIY6F`c^aRdgGSc%QL>Gp1nLHKKz7Oz6L+QKy*Hg0(nF0J^ZTl+Ptg`&*CRv1VE%HfwsDNq zSYMC*H!QZRoctSnK1>^Z3+wz#WWo;HSLvQC!kO|aI(qYI?uDgI?)jx)SGAVQ1Ip68YDkYpfQieS-8h+S0W zTRbqOMW?#bfP@b9-5i??IFsR>HnZgXf{Q#(9{kyax0;=nB%ui3~!eo{p=x`z15Xf%v8jC2purmAT4B+%W6bT!gH zUy8e?kS?u>Mt2~63h5rCe`ttCN8#|#=Z{9GAkAD9jb4KE_f6606-XB^iAHZldJxVU zHzGYA=?BfSgh4x}$4-GlUaJPbbyMqh$-3exM4 zUV=2`W|T+D2iR{#dMVP4Nbf|t1L+e;_aOZn(os0{+=X-s(w(=UJkq3FQ6A~Pk=}~* z;M-6h>G0c89_d7+dytO31LbkxnvZk}(s@WPK{{&<$|JoJ>8(h+k#0o#3DRz)SKNs) z83(QZLh43(&hMkqJfznkorg5zu4uFwX)V$H`1niP#%Z8e?sa;y5L@v zN1FWylt=o-{dnjC>GTJp(YujWBi)MhKBPO5j{0LXnubH(XCID6vyskN3%^8q;iJ)L zCDMnGu0$GrEE;_XY0|oA^m(Mmu8&4{A?-mr6bH-)Ziq(5A-xo7A<_?!RwLc-FL)Lh zX))5ZNPma46RB@YG}?ghyzq+9-u@<_8@MtK}=*C5S7 zdIi!Vq}i{aJknQRMR}xSyHFnKUy*hq&3hf?k>2?R%Hv@D&u^kU(n&i|9_gHaqTP_* z_ZG?_&3Zc;?L*3sPK<=hXBr7jnF)ssO-);skV-ZN&*XdOMWe@&P=5xdH;mE1lYUJx z>f`}_M&?-=S?9QhEKX}SPCxk6V^187`|?zM(~x$|L0c1`KZIR^@2b*h^kCjV;2D~3 zoSQi)aS?u9iHn*tGZGioWe!VRRGB$6anU@FD{0Z3LHU_UiHnL77p8-FIwa2@f@U=C z#<#L88od@NKPTbK8!dMNb`1gf8JQ~*^E0wqlFrF+m!@ZAP0Ps4&q$jw- z>cw|Tc{IxV3-~F-HyJl6WI_I?5o?SyGBU4AJR>9PS4n4QxI@WjWMp5PGA$!##i0C* zNiC^!Gu9?1%}z+qn3SK9lb?}&Mur=-D04;zghg@MjIWgc0&P{~h9V9!M;jUkF<8>w$j~_>Ynu_@wufNkJL9 zfNxo-|CXj%N8*zf|32W4sEJ1T)<=tf(!C-h%V(j@9;%H-uV>qt<*CjUu*A66fY z{sU+WpV(>fp9B2M!2do5f18DG0=~E*8s$ABR{5U_TImn`xxmkK;uGl;%0BCXU%C%` z7w~@qett~(xmNi;;6DZar_MG=DwGBg{Q*C6F&=Aiman$un*{ucz&{aVzeg?m%>n+o zCg>M~-_OD~0pGSH8vPyQu=G#5!Ll!3j5PU*X!KH~Hr{8+w;u48E91-81^f-bm&Vk8 ziB*3e@c(FzM$d@xpQ)DrWC8va#^Ga~`uiOE1E0_uA3q29A;2$nmQQT5?B4`@CGZ!= z;AdO-)xh5c{M9k|t1SF_;L}$`qX)&>qErhJQ4z6tnW0$&<~zr@0?27VLpf zf>!+>_`d=_&xudG*ec%z{1D84&WXW$EPNmExxmNke_5Ef9S8iPnDRfj%1`3_4)|$K z{FSzRbAZ1G_#>QnrJl&&1pK|g|HO%}x67{v{wd(&$+sT($M%8m0-i4}kbB6?_;r!Z 
zzYqBPfbVnGpZK|@UlzuK=2h{_PXhi*;Nz8_1N;S;cgCx~3HaH-$1A@Y_zBm?FTWo6 z@xaF`-vxZiZ`7kkR{S{Mp+E57?t_08;_M#acgNW06U#o6fbRrcs2n)3iEMH^X&L`jw(+OV=(9Rsh|lFVAhei(vpF5eLwI&kHM=T&GK9WYy*C7 z41SJPo@;|dygVn*iFb^#qcHbB68Pm#e4=u4@}C0y3g8FQk@!o1ZwLOunDVo%`mX@q zz#8OeC%#nKK=cQG5b)O|fWXv0snv>68-cIG+9h5e+5!C4z^`}mPkhX(e-H47uZ~7H z$KW>#-VNBObc|8J|IHa=IZt^;!mPAW9%!m=iALwd@KBCM9p(XlC-Cw5K{N1cfS=+l zzgXG84cHpsHv;ePFT=D&xUFdC4&Yw{ewg0e6phY~DX$g{Y`^D$H`?OkcL9Gg z@FcV9FOce`{f1&N8V&qIPJH4YEcwO(zZUq1W8}NvlCKc>)1K)U2a;x3eXtt%pKp!7 zCb|~*&w+1@slU#u{~_SL+v3mZp9emGy^Q^1VnBvf|1RLKXpcsJ8N+{x@E->l_gcDv zx7RGPramuYZ9?J~SV^PFu&S12u0i@KpzrL!8p5g%TLcJuBd|VJlt1IH0Y+K)HNM=l z>3udD^$}pNvpv>2+sk#fm+S258F}>st+8bd|7Y;YdM+AeGyM4Z(E>kO;71GmXn`Ls z@S_EOw7`!R_|XDCTHr?u{Aht6E%1M7fi^zOj!&~rx#Z=ie_E!%@_almn{giiKiGPa z&!2y*QaWq!3@1mieIKAoC?TWI`r zYCN_;<#V&nV>?hjSvuwO_s$QG&G^ZB%Ppz7*6!xs6+Y&D12{&I&w)DSeI)#t@Mi3} z@ng#E)&|4YvwY0@wOK8?e@B&mAeW-w>3pviJgW1iT)jGfsm6b;^IBY8+y7mXrQ4-H z$91~Bx9jv(oqnLxFLgQy_jmAffKHFr>B%}hOQ*ASTA|ZLI$fdDn{;}wPS@#lyG~!# z=?6OfQm2D-10JB$qjh?+PS4WmY@Jr-1HfexOrj_5b`gt=p{&DF<#^r?O|C zdFCnZ>|%L8i~A=Nawp`BKT*A{W#TU2mwjA zg3F*|asWwAPQj~)jFc)|Cw+mqHE1jkuoBZ!CLm}f{S!hA8h09z&mm|Hy0)2@w?U9W zA-OC=J%fJ7(u2}cA48Iswhh0j+?7WCJ*Yp0fF$e^8mV36l$Q29ep7Z3z@NdS-GgsB zu?fbIagZ+Uj$OzM`2(a)TSN9kZU?KhJDYhqfduyo!Tlt7SP0gT;7=qN`Yxea2?Qtk zAmQN4@5AM=Q7}Yi`pnCbWF+7xeHJf`jF<3}Ud-z>NLP$B=}LlT^K#g;;G3Pki1+AZ z6vH0rP2_8290A7ZOK7d+jB|iYznt*2j3uB?zk=|g8K>Yk{RYA_GkB9r`pp;MdPK(0 zA)vZGAmd%~xuqE#ve3X!_5N|plXwI?72t|cSk!mEpin@l5q8|(^0He%df4T|SjH3v6 zgoH*$HsHe^y@gDSl%FFJCJ*95su>1@5t>TKemr+fS%<5%H2$Wx5%@85P5G+;`J4I= z0=Gktlx_j?H}xX|tI)0~UkH%DsoYZ@_6GnnGyZipk_&rrWWF7*mw|*%Gh%k%Pt0a#&?R@y97jq+3+@Wqzofr zGr0}F0m8#q=8@g-!$_4Vm@FgZ0_4)t_?vnke)dbFN{VEjy?_*kdlmPjgdbC`!lWC$;m&1!zHFA*Fl4!5&XC!(jSN2h`PHnzqknKj1TaW`K9Vk z&x2#;zZnbCGTLDE%ssr$%s3Ku$^4erSs8B=@*S_;8FSH0nR5D-J$wrJ7zsq=48N3_ z1j$Sq{si4RQ8IbMr?B%ROU65VCNU|JDH?t+84r@ooZ&QbW?I5+;4^RdLY7HSxC@!e z;hC&+sOV8Q{9p=}DH;Fpo0%D&a6R}m4gW369FV|%)jWJYbvZD>gUnUK=d;qRg!{p* 
zW%z8?Gb-UpWL6Hpoeah#(34jUe}VOkPhdPqdNMI~mKYS<*o_V6k);4?wS?4Bf0>M@ziTAkem-+_k-v1h!{N`cj0+N-vhWcjw z9mO;6g!n?2cKZKAW|0304q#E)ILJuc2Eo~ltQ;YF4a%dD3BaTd5C~!K?&;*!a#AtyG z$wA#CM-x2cUPwLi1Ywl}p-28iF7G6hQ+b)hmt7u^RmjWRA^iad{fw7i!RQAZ+{nwf z!Qp^IuIA;rxXd_=p=H?Z$R}qU&X|+&CS=Sw;x$}?<_*-Hv3MOauFQ;os=EHaZ(ax!62?FfCSl1CPzaikgs=%{K-92^h(Uu3<0P3Z zl1$Rf1R|B7fOV;@@N=zDvD&({T5+q^YNcxH()y`g)z&4pwYC1bU|nikm;d+Nd(L}r z-psWA{=eV*B=gQa_uO;OJ@?#m@7vBT4!r~APB@$<#c-$@t(g#}nJN-`0VGqfEq>_(xD_22_R{rTk59m z4ag5KuY|V7cZN^=2sxo+sb!5ss6tl|yprI&5X~RqRRjk^2@-TQkr#(3(rWl*g2SQt zB+Mx`^brXyC;GKCa%P6kCiv6>;A=wDsLFL@2Ma=|SA|bI4ETmnJ<+Tufg3{$Nuy^N zjV>asZy>libU(pokqqaer?o6WYECd{7M0X|`wSvFhr$z$zrA$Kx526m-)ySVlEVS2 zP&a-{52yM&L!Sj{>3HJZ7up4_mWBx)46Q^vO3S_vnhQhE!HP;J5qweT2`W2<;2oix ziKd+BFAd#Fd?Ezz4&6xf71ZwQL$u*~=``x;n?m#^N9lBezpCsX6Qd&|_#Ht4PCS;< zGoQro?I2LNQq~8-|JMgG{=eWE3{FA!1%o7iFi5%`{d@e>l#M+E4U{1wswz8t8$cC0 zju7FU_zi}315q~dGQh>52~c_26zYm_s0AifHkDKu2_>Pbvgy>BGefk5D4Rj>(aLVY zsjTu!6fTUzTqagg*}{LJt0z_y7%0x$1P-Hb0<++x`X%5K8Vkgvd1Nap6b53FcFlQ^ znNlN(M$Zw=0Q4|vP$L=v7d?p*lc!jOFQP7=T(03@klHtTIoJlLOrm=yp)D|lDU%OE zA{BZH@=lpTaGtV11m%?S&w?fxB+5w>Kw?#rFa`GN;Rwz^?gT4qRv*9u`^kM8rzX?D zx3+7seOH;*zOTSrO}Pd(#{?)cEV^1x8}0;>8-s`?>7QxQN^S-UB@7;P(`My&Qw@=HiYsNbx=QNP)H z9vUjY3npN_M}4siDyX0(u4U^L({q&GN!jBqO%53p9L8os*dfM$IcB#qVb8x6) zITUD>>GHE6WW__2o~ZM`g!Hr!@i%FE&(lK{Z&Ue6);A!drvDWLu_jxjFxvjRf=tJX zSTsGH3et)R&<3YdNKg9(HEybP8|dvl&kt41qI88W|2*=iQ*=;knnm4f@40{I2n0M; z)^xpsIu7$p#a+~%nL7P>N}ombRqFKlsI%hlRG&%b!)lLgqIm}g-(gX2;B}uPx|Fq2 z7kL0A>d5EHG>NJ;Cv|$Up*dC4{24Se%BSKjIwl*Y4q0c-JO;R8sC(vY0E+=szQ+`= z0x`>4AGiR&O;+tJ>(l}}%65l3jyh=JJ@YZ9lDMuUisDCqF{0FMd=HdnDy!Xk6r#-h z8t^-)q8k8wlPU^utNv&zvgWznTiOmeoe3#ik(xw7? 
zi^HLcI0zA`iVmyO(i}b}4z<%*JFi_T0KNohJh271>?o}T<(n?@E`T*+RwAXOMQkBFDlApj) zZcfVj)(qT()FS>sc>axe&R{|jagHlunalD)sG*T5StH7N_DZmOgV1$OcEMlh)@5PT8{t%}`DOZ$OXPrNE(F9S2J*#v;fdwpWh7e)p4i_99D z_+13|!kH4|LLb%*6HCD9J!JhCaeA5ar)kletQx43x>>97&zu>S!oNyTVKUeUM4s1) z%Lt|rHF^y8VONdRGe9*RBT3T4GsC_HQa)?5Fze!&#`h{8VoZXQ8^qPxV1Q zyoJBU@A#=cK02MM?p{duYo@|5(}y+68U)$Q7m+st>+_k<0hmR=;{Z+uQ2F0nq}nJL z-vTOCN1FK1U*l+NzpqLnD`1bo)XFNc-k~ZA|XG?gi{bVAq}ZrTeiX37S# zv9NU-?h|W%2ZbNKHCow{n_naIn`N!b+cUf^f0ospx981mm?1R(OSt52;(wHNi|{Xo z6>52R1E{(idUkmVuh~F6iCZR(rhbAd~EHR@jZ_lMe z5X?B}K(Gz67T6a$iqHmLGSL)ql>3)ua57pKOUadzZhMILQA{YhHGYDvbtz|pSRdBU z_X?Li!i7A2<=H+i9*20xUQ-rQ2DSmz;>DQW@i{+xR~Wx8j7P&+R$ky^>~UjPW@9|W z^v)>UUxA^n+g(e}_lAg4=KwG+>0+iXrp3ZH+l2BWp}dPkxR!}BG(^r*GM$Fp{UdPE z&NCy+IM1Jer*WS5G2{Q>Jhj4pz_eH@=)-#SpCau?B5fsHsu_VlW@SOK@Go zoVPfO8fD6S!WhS5>a87swdzhLXMIBi9|QoXoWSNN3i0?v+nIv3C)ml9j&_z?zZ9Nd z7M{-#&%HjL9>2dXo##z{p3AIR7)4snKMT)NxXa3ia`RlC&ht^GbY|5yi$=QUxdg0q z+q;P8%ei^BGEW%#h&?ZEyE=~*yO>)K1J{R@Z+%Oce?gdk8-VGDeN67?Y=JeZjAeO6 z$P3^oEAM9#=?Bl)sB+p@zn1G=4d+o0Mp)k!*uQYxT$8m36JsEFp_(I0pEHQ^W-iu> zM7JqsmWA;1dSP<$UT#ml4NIgW0n*QFQY^+MNcNl}wy3Sdb|NP(M&baoO#(lS@w?s%@S7CcX*gi_TYyYe=C^I_29qE_7wR* zn=>9k#+;vW;weh}lM_Fs1lb`SuTVm!B1+T`aL$`Z++k|W3qH{>UDeBsLOD-J&Z7&F z^D&os7dfvRY2ZMqv$lg{4U|F0C&>K==UxE(8pe;BhD%=0V{SI2PU_`9=FdRAz6X_r@bbj z>Q^)EJkTBiZe$9yk_ORIrvx-{{SY+K#Ka`#5{FpTCjrwzIyj5*t`08e++`>=U~=>J z>?VHKa1J%&Dw7j1%@~3~e3x_Apd7CH=}J-G=cZ{y-(F}{b5~;NZuML9Ucg0*6}ym} zGmgv6Nm=`-&*o#Z7Ar{ajr^RdSUCm%pkPZkHc9NVU{x>rwO57J62j2j{ddjg>H zMJCMRLEgNMD>W<+F(vmuF7a*w%LTMn8)ZMuwEGtlXS*^8o1`NPTOmca?qQZ^Kp)=$ zGwLd`u`e^e1^AzN@Lywm5AgrZc({&Z9`w-XK|kPQzhgD2MZfoC2vW`42EAJ=4@II!p$YN@m5_h1(mYbD(zxDWc7ve_nDx<~LABp07VH%C^}4UWb2mViqlT+Nxxs{E(a zc_$Ou3^-9If4$^?9ji%e4dt)$=AXd%Q`?obmU4TMOUJxFQ-Rs?jzL+#psjthiZ(LN zCK;~+{Sr@|;}M{iJPk$BQ2`auaUA5;chQc$5UHtsz1_;vchV+`u{6+K{lH}V-iT65 zcR>8WWa74TDSiTzU&ik;b^N=~+KBS|kg)0UZt0~M3M0xNKzixD8axZaE~PIQN0k2r zShf6R2$0vMZ$sF0!GA&lSOY-})$y<4GT&~edThFs)s@-pe?S_Qy-Jx;ok`2^rBuR> 
zmKF$Il<7Lq<94~G>jmA3(P$FsgP=D2$_?E~G=_`W>H+G*+DErj^inV*E46@b!i;h* zm-p)x;i4*L@gG)%jZ4r_`nT6NPQZ!&1L4NIEkwz{zkSAvyc+|f)XF?)?4Pimaz)U} zAHnh;$A7sZsPV7R0sZG4SV_8n3;*Scpp{pF`ZD~ND}ovifj#|49$0k^1_b?&yEV{w z7Ettm_*tvgfsFpgUSu_{g9P+nR@HbC^g#b*HI2smtz@PrZS2kV+4E=}qHa?7Aq5tqptG!0Ki;0i2D5Iw6JW5zPv7m^atLF!FqM?XxMCa$}L}L*> zSI-}z6KjiZCC2$W(Og8&)$@ba1(5LEB6_Z#KguF+(pf~$)$@yVllqFvNU&m^NEO{c ziLq83dpIQT61$OxL2&L4xHl5TkB{!rcOTJ7T#*=yA zM&|1goWb@yLWZvfSYSWGY_%TWM`5kUQ&dR-5taR3vpbiGN=_r>M1wrY$iowDJ!jlG8HH(Wx= zVoK7Dp3-(aq!>eVDWuBsD0A%NNYW#XiT?ob3u-8iJ~YJ|mk$e_G!80$6y%c%0K5-i z@+U$B#1y@1n_@L#&9#0CT}keO!=n)(R0oLFczQ%Wi54ZryFnJ*peSs@YqH5d%(^Wn_1vi&AgGqS=xr0pUEPtmyLo62| z>v?dB(5h$hZYH8og~F)7?t92c!kn&QqREW4s!Aca=JcV%>E5iI=Ik|PNHVNa)l_NZ zZK$voTu5=jr`6O(E>5p%oz8gOwgS6bq^VfXMD#xC)HY9vOLLX@jHg7!tDJZS#V7xs z6F)_Qgedrgh8y|)kz^U#wyFQLZSPdmu7s5YhRB3$vXJStFbfRrM0z?c%mPEdK~XjB zS|V%KWYh_Utl1@NHe_byMFwnB*G~t4mDi)hc&P@7>S;&O+LZ;GPHqDZC!(kHxm9i`-c(YQL1qDwXC3F6) zI2uO(hv8Mf-$KMZwGh$<&GcT1N@;rczmWT9$Q-CZrj<{w+n;UiBdom%*)IXWRWz>Y zG&mEsX2<&R@GyUva!$powN{XbV#vwwLWWLRO(V`@@axDJfkn5*m*UE31U4AqWGf|O z$||7x9!6qhB?+aIuOg{}=1n`iANV9hE(j7Ge1vsGsHD&iO$@D{FvcpLfW}N$g@wlg z1@Xo!J9JFpqQVs=&<5l2OmoDMiZJGHBXO$udt8yeGP?p-?-M+~jek0l1>c)z)#P zhTV+P6(wV=u_GJ8*DWihhD{maR1|=&%iXe+jChh@G!;n}gK{f47fqz(G*1%5)0voZ zkvzha1o4rD=;^T|NrV}bt=f`x*7^y}g&WYgnIlxK=2v;y4huxJAenXf4s5_1-m?sh zjw%Exl{?y7ju2IZAg+c{wI^S9;%ra8M$~u_@QqY6?uB}=BZ%)D2%?N2^&%7D=T(opF3)h;644Ui`NZonP+c`~j@yb)&nYd1h? 
zaRkOY6tEi65;Mj}rH}DK&zM7>6mb&dG=N7-MQ{ejN4VF4tjAwm3XnF zka6|uBy9nQyS-<|>UgIQ3Nyy)1h*q;yoFt^lq_{wQL@aFiy9~T2g4*8W|PIF;S4;( ztlTpSv>`;y=$I8{NDp!!I!4b5J<`ZCC}{)O4Y z!a-8vZ}vd4e>V1D>mF!}&mLSbFFLQjs=F&OFj&=*7?|DCl}aiviAi*Kwao5lZOx>g zFRaoEWzu0o@N|kq99x#fQ|ZX8;>=E+-xuf72!|i*cB(YnnDY+M*b)JU*?7FVlbV`L zwRQE@IWXD)7>g|IA&Ri?QG_5*b+vcJ+YE)S#UaLS<$!HmFLd?9O)(+BjW*6@&Pc^m2JTAW3`4@L*G^LdB<$)nnJ2DTeF7kKqi%)C zN8_T1m*3r$WEKMnfFwU!g}Ch4${ia(^pm{Y{6wm10G}yQO~CYE+x9NJg@_cEJZ9q# z2i7$MEg}3(2C7StNT`4{f#xT>IufyNgB|SX8!)7cTL!wiQ&nAwW79eU^qOU?9a|%I zrOr3RSQZ83zOG$uU2AWx4nA_soi7GI{wltK2wX@%XV_zZX@4frVVxIv+Bz&S^YGwb z?Qy~O;KRtY?NPyMItO3SzuTz559sG@JJj6s$kn?JHXeI)UckO*`!Dk>`zI=xlJxJ8 z{<#YN4m=W8U^2!AWYN1mLvH(HWvywty05_=+iahgvd5;5vHRMCbsNBH@AhUCy~GNx zMB*Pql(^Pvwoh-~X;<#BH>QF&0e|^+WLx%aR^(B?)u^H9}U*-(WcOuXVclfatIxv zp|`EzVh4$KYUClS`M;k=o=vas;!&-H;|D)>@x!%on@v5k5i;@+qe?RkNHT+dU1zy>QXpzHiLQEBL@KoFhCNLYo2 zo(?p#m=`&+e_}!Xra>1`A8EpmtlvS3hqp=k8zjlGciA(a-Vgj8+n_Ydekl-qI4}a{ zO^*u#>P-Cpe@_PgYyVI4ir10z1cYZjwD%hAh>>Imp4B;fUAfn}rrM07hI`6%p77r6 zTghk9WB}hoU`{uR%nUcGH>^NG96k@1YBN&`N@;h-H;3%e=x+N*ij4ezxD0}6+~2<) zRslFzZ68T_*Kemzd{fh1zdZ%JDIoNgc7iZuJ`OJ56zo+HVeTfB$zF>#-k~<{DfuePf`hdcaT)cShPLX)76bko*tuJiFqm--GmJp zacb{LIrC5y+r&gQA0fq~%IJzyqIJ<))YOIYFkHiBu6SU?`yEej{wBuN)ETbZ=Pl5w{IgLQMM#kc_h z3Hsxk-JGsOtiMmS(~HusaF~_j{g_3ylnT$Xqx6W9+NGt^yra?HEs1!)s|x?_LlCO9 zGls|FGHFNC_VaO-+XvRzlnbka?tz|!qE3j?$cr0kAxt|sq1{nhE-Zl7oU(1*Dxr4@ zi}xp0EY;h^n~G^27)3$Gehjc!KlWe-i+EylSATB;Py!nis$`V*%R$Pl&2&9l-WJ+@ z(=3>xDeP#8l?$la&6Yk!9H@hKAcGEisF`MPN{^CZ!N8$%?wM#}AW2eE2XwhD=t}l# z#nPGzd9;q%+M9^;0w@Zmo=v*-G6!wx=_58QUC%%&K8Pov?a9_yqFoInXay8+i$MAP z+Jn)j9cW-*R{}(OPb7tX^QhToTS-`jF+(!IdC_Dml#dQ4X|^U}J!m&tO7cbfx(AXO z-ASuW7s*?Cxebg%*|4BKyb#e(yQ(LZ-l0e{)f-5qNOJ?aKI-4vdEO3&UU`Vn8x=H? 
z`mi!qT|F_h31U&@&_q9YLoe9*$h1KdffPx&R)>dauNn7==+<~YbZ@#a=?t@20s~+7 zn>2+s?4mD{wwNl&K-?V>2!ZQeC}?sMce$tEZ24 zydO;U$6C=Ey*07)v}ZFr^ct=|+Ei}~EM~J|(ArC7wW;c*Ecp~;<)%@j9y37Z)kA?E z9|X#@I$FEqbY&oy10aThz%FSArevL~RXxhLP?yo@#8Byl?O0Vo)&`oXe&0Ve9 z_`7=(9TEM5qPnZJW{h(sx2!iZ)V`6b(}Pg8clW~oLma4xdVqAN)dJDVaxShcF(xr@t8JL2r;f zYZVhQSrs(c);rMB4M#|hh37yFn{Lv(h(oRk*SzfHVfy$pK0=Rf*Me*Abu0QRdd=By;}e# zlX3G~XHIkUDH*3T$dFwh1K2uPJCAZ=o$WaF^lomUVzGDBp;F_4hU14L2NT&@ykNiS35v0GP<{# z+Cbl*;tD&l?O9b%e_MZ|4d?dSd98Fxb|lF}bq@Bnq^!RF_SR0)M68V}X&+4X<1|Qf zznsxWl)rg!ZfzYkAN80-oJbAWO1zwmJ|@HRqdg7-+>#Wj*rLVK)ov!WI_$d~MVd039sQYfJ@WgwYZ$XRk?10Jt=(H%46B|kv91(bLu)t9 z55V_r8Zf#2ZGBy6QdM6J&OOpyTh$uFM%Pt(54$-&4r7+nCc-v)D@@@jEaLsz$nUYZ5kk||n!@?uTz z$eE;-2z^i5-XDh(KR-!JNu!N)|C@rJho~?PGeOddjAjtRx;oPnvvW0|7ZYX)<4828 z4yHz0&=)o7q2Y)~a?`FfB=F)u(3wnr#NW7ZOE6N2UAP-vMQ(+B1$>VlFg=(zaygAea-gLp2BUHP0Bp&NhUmkmr}Bn(9c`B%p|7snZ1%d5Bk&Bb|J$2SNemBP~NHDG>>T z0(zxu+V7D|gYq@1)E`haPOT-Z6T%5x=ICjyfT9^oohG2vO(g~XHr_O#17rqpfB&5j z(i_A|;Ek>j0=|U@_H>|I1QeZEYNLQ<+@+xUccQ-JkZlxDzHmZG0p;5q@E35M)5fy} zbep}=Na+4v#uo8KKHI9`ll_vA4zbiZViT`9#(d827~JMG`y6Sb=oymA3#i)t0=sI6iW(kt+8h;7 z+HR{oyM`p;Dk3VN^rois_oGH;sx0d7NloW(stn7AT+^xU!0p-I@jBL@--K;CLRzO#6QGfL^V6JytE1)ZpGr!50#*~-Y+S4;z zn=#^~I+-`f1u%dQK#G32IliC)S*PUQq%#JH*I0yXVIIV}28+>@*dep0qy zNWInjdzV=I8%#K9u`6cHeu1SwK7|NED*Z&PbG6p1Ie}+#YvE8{+$J9!N_#ZY-yn_j zw@4$UNg|6z{!fGveNvYRECU~ADxio*WPa%gC&I}`%%l^67kjM2FK;$Wkr*v!yF)|3 za~Wl;J}X1#GByu6J|$!P2#G{y8HJZ*_ZfBNWGA$6Y<|VcXUi3iq&Z3ODu6yvql9G1 zs8QGnZ9vV-ppiT`42u(>C4*LIBq~e3`r9W{{cS*6zEGCMCS;U&D;Yffv;j3YgT}vV z0l!UqO}+n-Hd~t|I?D`R+JT=0Tgl+(rwyn%88i}qMZ&Crp9EORpuLOhe9*01BBuOD z+JKs$LGwKG`8A($Mdt4jSxHKe6o@B5Q8L*1X|t(Mf{@6%2|fZ}U;7RrlJFy#Gkg?I zFcaZ}Da%hH!AGX@`~=Y@lt_l?e%frJ%lfFr&Clfu{j}NmN%WKZm@oL{bX`)X7#L_z%~iLbHe1~?2xMLd zKM72dp=&>FHe(edmCz&LCqYRv`1xtGxmMvLR}J7N0ZB6W`DwH96FxF6gP#N=$>8Uw z&Bjj#vkYYLdnngf^3!JPkA1wPkhLH9Nq~|J`TVrm_+7|rgWY__)Fj}&4(fCPB|d<; zwI`|9T|xmRUVtH#)TjzGFtW@unQ}CV9QQcvPZRK44r;xCB8<5KD5+tG@H7G6a8T<7 
z6!}b9Nx9Ny+@TZp4ym#w$U{ai_-V8Cf`m}8T-gM?fKmBs;Q7D*up@)L6fJOyHVG(2 zO|y*Z-CuvfY*#eGshF8(qmjI_e?ueh(=Ah6UD zl^rfaO7UNL`q-So}Nug;rxC{Q{9Qy5jq+4Px;XmQti9oGXZd{+&3| zCs+^*C`z}~sRD{D`r1_zViBP8*XZwjQh>0R-bRfAO2_A5pRrUI1-#=ZqGrFq<*X38 zIu}@MA)xB^3w)77SF>N>-43pHzrbH{aCMGpJnP_U_6z)ygR9*yuxN!3eDsRTk%tVNVI1FQn%}sz^7VKIEt4pGc&!hpTEJx5gIXuxcO6u-fNpi`I5XoUt)!7I@FZ8t z$qX)aQGOO-M$F)IP^HvnRZ)Aye!+;rpo;~3KZDrM)~}C%`mCQs&0$Q8&z5RwXE5cW zwlOFiEfsr?QLi{ut##bO2B)Pl0mT9Wm{R@6OH&Ny*L-d$H|ewZ{5YS*i+$SiCx6QF z5;aOZ?L!e$!%KYj*C)5{&KqxXBp8<5`t#Gvt-PNx{Y!k7hmd}Hc@lXG`(e3#c2NAX zuK;~bPRH?lK9SFB_$)VdiLUZJc%~4J7zGIsbk>%j46> zIQdm4UvSyoAiF2(jGuV(n^@kOIPM)7$JpT)dn z!V*Wr+0IGu$1>)4KFg%%5HfHIpXDa4e`fR3|HrhA$&ljx`wx7CrJAzhZI#N3&vW** z^zyRt(dYA7>GRd|h$i7bY<#RK`$2m1Y!y_83cVNu<4p`idCG<#rD(?m|6!{_XMay` zo{hQormbts5jSE&%m=s$-wgrmx1S*Dgz(!VQ3jhuZ+F@i9e$_2c9GkAOy`JpGcG>e2q_} z8pg~1vq{AE8|qNU?f{97v19&Q6ZTe}Wu`eIdl z2LK#0oSp$}q2G&Ho;;na)C)LggBtzmqh55I!(yBR;uuu(0HKfw4B#($Ra&olmT#$V0&LyQ-Tzl-t3 z-!gvG_8jSC77X~AE8Ro1aaeEpkE{w~lXP)CNp+N$|v`m3ExFZ(VFd)mtoapE%k zCxw!~t(E;|ux;QxenDL1#4d>I)A9M^e{?@ZhI%ZDlFB;DY?+M@F68~$4 z400#uQQY|~Q;V9O1HX{@oN}6BAa=Eq@#|>dz&8=gM4L6 zp%LkzjWDU+O)O`NK`Yt{lkly33}S%sNygvZZxGyVdIuz;A4(ehQB1!V^d$ee!x4?) z=BckSpNr2lgy%5*eHzaR&*|A5_*aW}A zD(2I1s==Jg8SRYUn=ptO+}_VHe!=GqUhMoC7y@DH*+=QI8l9w^&b z4<|7GcE0QOUyN_kc&^NI>d1lL!F+B#$q<(^pKBREh2^PbUbiv6oawoF>LJE|wAB#q zVtNWuNp^A!Gx!DLf6n+b+2Nhe__s8k6Q0w@Oz)4KNs%py|G}ssevaiw+$7*Lk20qJ zfbIWXCYjIp-**~BA@fPh4Ae&2Y! 
zpZPy|qG8O_nEC=Gam`7D&zWBOn)7B)Hj%(q94+DS6nZn)jyg36ZZe# z;|$t6l=Q!g9i+@hlNrB!lOdG(cb>)zSNFV%>DM+GT#W61BjX1*82n*eq!0KA!`{>O z9Q>~WehlJzx%p{`;8_egPI>Sfi z{l7E*jTVC!KV}ygeNH;Y;3>8+9pg3Lz~0{)rvC!-Y39sDjK4o_5VWqP<5b3f?rf93 zobhKf{v@`mZ6;fFGG1^gv+>NCNuse_5-5-dd6SZZRio=Mjy2GDA~E<3HBO; zR?Uo;n@i=a&o+%`BF||6{EBIv^LoUCRq7ocUkCX*^eE09%)#dhuJ=#e;X;2W}Te4Chb3Q>tW>oc&xJjw6-=Lhg+a=7D8*2x{vF2*KEq`ajDP0@gP6s5 z+US+&U)*C5;-Bx)cqa0k_T|7I$bo-?`5(r9@@D4s0^?_}0yZ=LFN}ZtJcHorN6{wP zV?A;RNc+NO!ylCc-;e{}r12s36wAqCsrDT7Tbcek?yt!_06w3C{>B{mdznvOt7#Dr zV|6eG{m+^HLLQ)(vm*YGgP!)Xrv9D4^K6XiDGGTuy&b9XAvJ~-AoJQ1#(#H>Vel;T zS;6?HPBn;?jHmF-RB!MSgLs(npJn{7_(DqL`JBcxk>|9R>BFq&BRKC);K~0S9FAxV zvr`Y|;PVXA7r$=k+5M}Z0p8T8#dLnEnEs=q z4eA?~IUmRP02{ve$@PpM$@4pRr)t-Dhphu_W%`9z8B_`L-^KU}USRIx64&P7bEoiO zdBh$bW&9mU!$AD=4}hnB8TWcbXYjC4&oX^&%FzD^>wY?Z!uWH!T}K+UdQIaw;W@nr ze1#`1;P_(Ff9CUoF&ZCIm$1UVzy)X*N9wP`XBb2S<7YAcP{bgvVEjVHf1T%3nO{~h zem(PfiyOR=@xNg|+`xP`X*?%9r|nFC|4xJaE$3au_&Dod>isJ4r04MK5lx9@r#`-+ z@tp9S9@Bh02nJtdJ`FsMgwLOH(EDRWwDzZxDa1rVAU6a&i+3O_S-d}*>Y>;?2}EQ< zFoWJ`M|W=v0*bXEjs*f03=Ap>Xw+6cf8N3c*@^H~eWELhxa+a=5x6VWe?9^cAXr7T zZJ?*;d=&Ad5Ui#vl}5@TL@AcT+I1&1HEoEVxPHy?O)E}W6OC5a*UYO|6sHEU{`%uR zU~XbbpbefN7b%4G>FTC-#oA&~1T+Acm0SJ%np_#wv@G@4FQAw`Su+;q$XH0hezKKV zI6r&F+!_P~%2r}-O^!O|*5;^VZf&kQ>gtVloDiz5Dwb^R>WZ4UUC`K;P6Rec_QhIt zR5RA74ueE-7bvu%6MP^VJ#pRAlTV1Aux5EQ3fa>-6|oDJZ&E?=jjC#^nZ+0xa~Q%*ebv=f@5O-q*{h8YME8L>6mnjFyPl_o94TjQW2 z677gWUoW@kuC+g@7V`t&p>)w+|r(DPk3ChU!dEED{POVj_S<<2qIj3mK1Z;xK#Zl_8W0 z!hfXWJUmU{6pD>!V(l1iI_QaGCZ3Qfh#Z2zEjhV4(ZX~$ISe=gqvq-lh34|A=ZJhR zCe7^i(ABoe;!~g`Zhd}Zr5>%RU%-S z8Nea~nTW9v9W%`xy0BqgT!#uWa#HLUEfs~{p>QzCzU;k9Ly?%KOT*4gyFdiF>GG9z zqQ0e1C|$|sSr|ZY` zYpTqw(bGwer-^Rq?cXGt@eKiPxKtNL-0>_EuB-+;{B(UfSfa}ls_TpQw_-UN zr7@x7k|78bNNF0%5FO-HA&*}4UC%%ddb1^o_`d!@=2n+Qnf@lgSx}^49hVkCvYhaW zt^{nD)MnT?l}CIx6Vr^e+|lEbra6DtXX*%H(%)@eNoT>}coA{#7#ccs7zLVg+48zE zTcBNN8LBZ;w@2fSsioVkRw2c4#y)D8(OXl#*+R$0a_gn$Fw?Yv#GZ=GT#VrmM|dwS zi#x?>pQg~u{k{F^Ok0R39ffwoa@y08kQZx#bR1XC%Pv?T1>MYOiKAIDM>IF);{+5c 
z$lwlrqMx;op;^XXDoX%CqD8#73{GS+@S#|=@xlwMHoZ>DN=cn#)M?_1azn|Cq!)Z7 zX8GeZHCrEP{bXCvIi*NIip%N=&PGw$d?IqZH4{iTJue!?Lf5$t@x*<0$!G|1v^2d& z{&y>6j1TEEkDci;?KLwG8G5O0pwF$1&DI?i=uvY4VjM5%v)H7kGLCMJ<%SOPM_V4E zOr2Tjf|-m~AhA;QJ6gnr9YmB>gbVM+P|FaD%!{TTu1~Iv(gPlQgfTugo!pr~oR*P* z_)D+N>0^(wJe(8S@p9ZkoS>BS@@NS~>5sP1`p?apgUa9oGl)IGX+6Z~dR3h~zX!1% zamI>7pF0IYtjIXRs#hlvy0d!evK3V+JghJ&Y-CeSkvZF{5c0a3Qk^kO5!G$yC&15~ z5j|1oY{oSPg^u(lqR8uycgLszLw(&TRjt)u4Y0bS7fHkf$1mMK0TE_{R7_RJJNb@D zCt}PyDO1$kIx!^#MhysC-wK&@aZF398nkF_OhUkn@X? zwBo8EJ*nkG8Ie*}5u;@+1cm@C-5$Q2RIC&2?_%fwj zdod|DEqv%bDm>zF$``*J(WJ$nvi$3yCfgG;k?#q@TwW9-_9^vBv#v*3(w|fQ)d*79 zRPR-NQg2eG=aBMpehYct^3MB4!=_NgoSa`NCx}A;2q*Q+^S8(_Q~2tSFj`LU0U(kZ zLGW41SNib6PR<`AUtllkieZzw$%_Q+r=`y|{WvP}$6@-#wv_z5&!jSGOLLT;$=`uK zEgi=U{B_O9QT`A=0KCAbaDVwZT;8w0ZvL{fnvBzPNO?IQ%jG>J8oc2sLlKh)SWF=0 zgx-7yASw6!ua$slCYQ(NpTFr%_a_$F|s7<*5BiTiY|VDp;+eSaN>vTI-$JvuA>R zp7TA=_s_Rs_FC&*zxTSn8E3hg{eQ~eF9K(E@)Xwh6cG~JGp_~zLZ!CFq#I4;>jD)V(;uHy6rMdeZmwAyn>>v3?pVeBfwOoQe+PEYvZ7Y5s zZ>9RY@v@m!#uZs`$$U3}kL>(gH$A*!rPeo6T=r@Exzz58+Hk04&aB#SYjrpjN%U0r z%$rj^XI4$Dt7eAuo8%Mrn&oSxNi^PCl+TXp08@tq>6C@%!3B5q~(eB}nqR zqoGK=jU=K)`bb4I?(6i2ATSyXiffvi7GLd~Q8TkbHse(WZ^pCtg=J&_u)5t=A!@yEWb6cFTUIbNl$kicV_;ABO}y zVWKk+(UT^65fSuu+C(p9BFZx+dYOsdr|AWtQ5<-?+eD{zfVVv+IlpT8$RaU@wDTJ7@qyd3*2IO%A44iUEF{gwL6IAuFrmxO&YhZ~wdhLAEy!#2nsjZ}z~> zR`0I7DOI3(`*(WC@cN--8zg~ie|}T{)5#DT-u`Er26lS-(0$#&w>^D;U)TTN-u^s~ zci>Ke#o61x#N%k%HO@P*=n>4fACQCI{+HkVM%kB#CW@UyWq1A(J~r)m*3q=%nY!lr z(>;k7n;~gGaLv1Rk-g2kcHB)|10@MaczinSRJQh?svl||c+_(Yl%x~ZH20rs9teB3 zCck?QrVT9d^enCaxwk*;SwUu05hYGFH`ThSR_TTH`$#$(UTTm=YBUfgOf?tPI;iF# zfV2gTdDQ6B=sLo*di(P{+UUlWu=gb#d5`Z&+xx+i{?(q=`k}@fv`@)3($Lxe4q$Wt z2g#{uJ`OxAc=A7JZos3XJ3YHmqe<=U&tEsrEs|x#71%53-d!bL&=B8V2$6wFp86~L zca_wEE~QyS(Q%XfPr{^S6KWqeD8<5cnrCGuBeEg~^D$hzd#OmU9^CvEZe zpGs~}ya%kj@Qi{<{%R*_x+d#0McoEAc=srdUgcf66t@)vu&d;{44KLIKbDTq&7i6n z?u2MMNt(i$JWB9%4206OCZ*{iDQ>H$4w`;9#_V}=x6<^J464%92Q4O_=#WgV&(w4; z*x=I{465>Jk=9fNO}!a1lapmveg&*T=emr}^z@*~LI>(W(st@>hF?`m+uoG6pHcs= 
zt)8_I=u8Q85bL9|EB`ydqVw$vv(wqy_7lstOl^;Y2fkgN(&pFNRzToaQUay6WwNW} zb15p%`6ZsMbor?Vi&nwpg_&wrDK$UFWL6>k=p*Hn8*1ic2u%K3c9pm@sN@Y+2(m+~ zp|vUbuY^s{i8n_}Z$c?GU({+Uq2_1kOdnv$9@&-O3oJUnm(iJy9XUfIMXOG7X{NT% zDs4te+Z3(M3xWQWKpDFtrL96~TaxPJ5UK-Ht+#4fOTO}9hHoyV?LHiJDumyse$0yo zXnVJueUsYmmt7^lN>N!2mQ>SZCu*`gQ_VNQg9$M|rDlaz{{&!ul;o6 zj;D;qZ6DF0TXyFh8C;w=n*0Ad=^_TKV-A${L*8BY%hO@2cjiQSek{8aYJ_*&({$Fn zWqsqi#`TT2Hu{cu2WCHs*m?V3NiGMcv;pfn!29zdPM%tlPk(^Z)-If6n){C?kD}r2 ze;fXK2R^T0r@j3j(e*~lJLeo-!kaJ8soS2SgX5Vq<=X`o2ieug2FB)sS1Npb3^v*xy?f-pzJT9Fsqb4q$O4bjbXsao^ zcqf7*#QG!TMY>HBf`9|FXGk&A&Vk?i4gi==D&!xLFV+qEb;CUh=e}`Uw@LW zZ&VvsL}zg==rUae;rx$FpqY?_+4T~ncn7w6^3umYt_kvzaq14N_T&L`2pB!;zfRsF z`5*)HCQr_r#3)=^x^6j=JpGs9VXe>o%tz%*JjEGWE1~rXKp^o}t#wR>)-lO*)D5j; zfLUPH`g_GURr3|6`I67eE?fd-8svsSK6J^bq{k~|-&xoPA5zzb5CHowQhNTQL`B7F z->uZG?Yj?{R8*EKzPmJ^L)j-+oXccaPWyiJ-iY?~$8$vQ4&V_P_XySvQMb14<6@I_ z4=KJRXf)R5=H<*{wB{# zkvR?8gD$qmivVR0->u4?VQ22)dw{xifL;S8HGFp|zN4DYJbW8vS5A8bx?0{@?>+Sf zjPvLM^^VJ#*GoT|leehc(Znd&W}Fhcz!qrZjmI zLIQ^Tm$4$F`?++_I_mkb<|PZmGYBCu{Ig;>mS*@C^?X?KdIm$Hj=;3uqw6?>6Bl~1 zl6zCfQ`wzGFb@a$+9P`~{SFkV=z63mc}@Pz+46Os23&@Cc4mHub;XJni@LUc{4swI|z8 zLFb)s#ohI<@%gX0|D)#q_m}jaY8)Q(x_8GB!#n?1i9gc43ey$Hp7i?t?h&ExZ2*BP zu5xT577h5K{@Y~P8oW*Tx2St*VvAo_BDmKkqAj9xV{0htuI_eEiTS3)gxuHCzO|+e zw+c762=CIyB}-TOR;_Maxmv`!645}=7x8z}9m7CJC=&FAT17k<4#onw+enR8f86hj z_jcpsY#c2^^5yX}X1h#qY}v zvTXR7YQOf!hA+K5HR-IWzsi_kUl>Pc2|nJphlhtyIuOnz%1V^)VYXG_+GY>h^Kdal zmwyLQdQf`lLK9^YWf&!0-_y5w2fm3sigF*y<5Xh4t;KQbz}3Tkl(daeg)p|P6h@EB zm{45c*kcs9h&~aYJ-8gWTw|!njTd-!qwsxiDceP6y&@ z6qZW(=pcgL+$h)KQwJZriPl)^+F`q<)NS(@mAV>B9gR-m2DuNPt-uVTWK5lHdj{s~ z=sNQWnCF0rBYyW}$mz?F^CtRE07EfP>)MuqaUrhFi2F#GO0doX<}ffHQp~R{-C%p8 zsMN98DK?jB(;S$Gi|ZijCLK#tI=rOA zOFFJAZHVR)Chs@Xy1M~#^wc3~|9|^`6al_(&-dvuG;-szShwY!Sk!4NXd+WRY8kzW>d0&Y>OTb%@T>v|XSIu|X?l`hIN~+dZUlbhnE(*8eHhO!R+lg9 z@(;RvUzekFfX3-^sxIf~(yHf^+Weg8nyar~;I3NJl8D3;?zuIyYwD_JCSSj$-RB;6)szFv7yY&O8oArRM(1yr`GKcnNw|YR1@iN{+fAMmR1f5Ct<73`zjyxJ8 
z`*hkJUx>ZUQSb;Ah9eARHd{e80Jgk>MepJ0Ynth!JU7W;EtldKu`Y1RCg8K1;P1$rFvyiFXEMd zj_U3rayWmO>Q&Acs4fg@)Hws#=Nqyy&sjwpY_iedY#>;kYNUVm|7vq-iHcTKgqkwAiC=XlGmW>tH77vDC-4|pQ7P-2ar*;D?bdzg3pl+*!je~i2Lxh5pz=S#iIU$@ zXZZ%~O5UJq3Hi4T6)DpY|e0N>%~|fP-UZzfKq7RaKH_S5M%pb9uvg@H_^mc`j%c+K=KOh&$-=16JswS zc1_&P#386-;@wQV+k|}mfSb6**oRCqyBRs#guI`L#fTouJbu7U++ys%kev=?(uo6Z zqKUDS2&st;%>H9iR5%U&h4vvP9y1a9nE1~mQzK9Bcauml_C6EQ%g8U1of>(FiJvyf z+{47z$Rv$CcEC*{#n^o&cE^L7_!N^sl zNE44S@?H~iKNIgUAvZ8_ADQG-LEgv2(WG4yS2K~a&dQ39!rDUn3MMWk-%FL9hYn~r zYfZ$%OuQ6~Wn~I^nAz*dBuzZX!~$Y>(R_v?W{(iD{6@^ZLi=%zl+*nwbjSnf0zB0u z6Gwj!ltTMy%~?V!|C>RE7`ux+u6RX)hRmQ67&C4m(8Fb7Y%hgi+{Y4if-QTMh!-f! zdYE_$@{G~VU@o+GXkvkbPPwCqctIRH<7$eq^$Nte|FN++8+cOyU_|A6&}m#vi{X4U z$8Uj8##8B#djxirWkdX&HY<1MoSdnDtlr`7}t@x4}!;im5b=0szN4SOrZH;tRc*^ zz)WI{hKdOd6p|aM|BvXOd`T$KA&i@-jm`5eWIa`;8vHH6Sovb15L2oy!eAnDQzoLa zj4CcvI;oO}${;FJU)QY9!s1JfHMULIuQuv0HEzgrJbJa~Qe$P_;E5i@rHYlj4CHBu z2d>cm8NOhjdQuDejS{4}+;f4Xtq~$<`%ipRpqR9g{g1+eX|rL<0*z__(dg(y+@`Dm z`5KJVl%=S|P?`Fg2ED0G{cOrqd(j>PmKNFv*c>tKtBA`VH3yx;jE!%T*oQ&B7t$^x z+U46IyQ&;?+Rj4m3#d~V{$9$Sg)kWPVrsn!f!;#@Y&iECWO@fom|?tYE(*l=ULYNHfgu^EA2ZqSAiS%E4B7bh{@MY zI#1-ZYRH5joAtz@VB9)E2=2a(gAqB3%j78ubd_U1Pq_`1xv0pXBEzFev3q*B48)FZZx7 zkFhTdPWuuiF!}Nu^2OS(e7V_V#|q%HeWCP#3}2qM`m#3@kvDichc67eT}cy0x35(D zQi63zj>}9`B|i! 
zg!JV#t1sb9MBd=OUOC6+sntbz$d{M~wX=EA57-yF0G7Vc*H<|%3~pwy=VkU~4KS0e z{mPdIOm;j9eD=8f$b^u-oUr=xY9=CYaF4|oI`$cq@+EczCo_5MZzEsm*nfz9VQ|Wq z%N`^!jms~{7i+)rWfYDP+Gt$*fY0`2stF-|`MuSb#!Q4fV`Uxt3`)(h_&vB2B7GTV zU#8$1LyilB)4oIqOupOz%p_~S@}<*cM>Fu*z6_WU(w8@_z8uU%dT)p5qX2hEOU%5f*7=f$ED|XT=B_JzeDC(`;~cRSdM6;!})sP zv(2kDA*6YE`j$Cgf>rUw9$lZ_t7XVb#6tU5v@4?OdXtzZ6e$0WzP|lir5G&8_T`H` zV)`u_Mi;ZfLszJ8!g$)~TLf)JU@9*4f{90Ug;~HF4eOdQTM(?evS6-Eisu@~K4SF_fK70!iRejp> z1?ad(b=bDzg6?h;{UJ@qh2HcHSUhQ4=EknTu^4;#MUcM8$ggjg8;bBNCfs9)>eX2K zT!Y6DK-%PkG?d2uw_#9f=-IHba2g2PVPy?HB38{LHT_$3s|A;DHCz0;^((aGiEs#d zX!EN=7!BvfA-=Rot;I!wT%@+5A`ff^FJrLh*94{m`yeorto>@XozyC5wmktn&9=cM zrlo)CC{Sr5gq&@i)`MYECc-wjf%r2{=~WuiCq*GVS3QTvBGQwW*poBt34?E7u;-fu zCQm%Y@Wk4$JXvS5;&Z@DPoyRM)uq>jke;+!J=vRyunn%w^5md~+|9;a`2|Rko~(dz za!lf=$T4B?O$_$L2uz;rCr_;X%9G!kthfRAY)^(w2t0xyC0<`H-vkk7u@}y2f z_9~<>7OlgSm7cu7o}6Y+7`&dro<#&EPbT6lHp$wrJn1%BF#-5&PXB(xVCkHbT zwn4Ailj%R!-SVKVrdy+~FE!|caSt1Ne=BYWRDKResxkYAAbMePu^C2|l zc&*oriP^{mRQ0%~jyW-4X3&4C~A`O{Xy!4>{8NRvS1nogcqKu)&jMJD$wo=-7#tzhPLGBD~}!gL;A z>3Qw1VSsC|L03X`a)_(Q(K@@}$YcrGi}IWf5PN&eendY zIcXbI1(tC-k*oMkB%Z?*Ke|tDOr&QI7vNMP<9ALorsDUh2pDDj763EJ+OOhg$E6f) z6u)}lRU|~!WKE~|tz0F9E}xgweGAOx3+q4-Gk%Y{ZM=Mr zWv6R~nb+a`4Z`=)IrAFqbmuIeFT(Yie}Guqg~m!c)z>eh#>J%Hz8(T*jh@cSar(vp{;zS z)y1tT_ptf*(+yO4rT#gXFR$JIz`kCg!6{!ayO+S^Yav2B$=a`cZPZ-k>&?KkuNyR- zeEp23XZqS}LRfs=Yw9}8%o)eYBN`!noljHx`?x(~D>kUQU>0`tDM&78$AqL$!O?{e z0f%Tj482S zaSC+$jG1X$^geXaM;PVCl+p^Py>vq9nhVMe7tVk-k8nD#1PMpWI6To`;9Tfzg7tZt z4srQqfZ)-=TtOWL)S`-Gs%#fl81_=9llTl!i%L>L=7GtnoB2%@)GTHdxIi|HrV8RH zodh|x=q#%MD>>wvrDTBVs_{f9*97QQgwd1WO~nO9TJM;&UOehD<88Pu7$B*hiRoTyUSw)Q z{ijS|G%t1zqcL7!?5!L3{WJIP0knOl6as9MU_jb#8c z8nuyN9GSjxMZ~)U{9%?}i&SIW1L+QjkQR(gU%ZPV=xYi3J7YDVqbJ@P>Y7o5$6>PG z8f-%yXGp10k>Nt9OI4ROH7`XCS1^(fRjf;~-k2{KjfU}>0EO!G$2(M4A_8Kos<#K@ zUEOhCpeq_wFcJ}oS0}KGC`yP*#5^AABn)JC;Qa={DC#)qVxji#Xject;&{bDyRWS) z8V|LFf}{f;T+xyUh2zzs$d$@B!YDyd20v6C=L{R4`t4(k?eE>}xase1cIx`t_k*H$97GMIIfTw`)DFL9@%d1a9pb>m6Gz{cDtV`7fZ^4ir}^H 
z+Avqu|7jJ-J2C7xIrdQL_!jMV(C%hQFLpdG%ZZM!%JQZ-6$<94rFteVW~;^FpF-r2 z6tmTv5wB#(uurtjHEc+sG7uPkCTxF77|U1h-5qz#`1Fle+AgoOO>^bh?5o>g_Yq;6 zZ#X8S`R{jJkw=2<_M^hqW#r*S52*j(j<}{iCv3R;L9|Eic!SD!WaH=J+nPw8*~uHb z<9BkukoMn6QSC1Yhh(%re22u6b9RSBB~mCE8oT3{7T(dCwNu9Nf6;H7ES*ql>+9>PV;#Y8m>WI8NIMooxj@qIG3X{6-f+_Hmb*j* z%S|NM>ZUwM`o#o(Cy|KbcNNet7nxARhhof%zh7nf8u!)p^(qzAUES%A z209jPjD%6`?Cill(p5dPHPW>i)mSX3_A^v5Qx>rxm4KmpmLJtXPyHO~@prVL=qP6Y^W?nb4plVHH3GE3EZx~QY2Go+facCrsm zG6%r86yV=PRP42pvuaY6dUS-hC~d$;<62=ym<)`zMkB2#+v?{8sED=4Xq046EQkoh z)+m07LXq&dl9INbSQJH1%)g0Xvc#!_`Zx8=uAf2fLyu|%3970Al9lS{*0Anquq_ye z{5aVe2)Bh3m|RkvluSm+3dKt}&e<-eJLm<2*80`YRdt*ZNMq9f>aH+c6AN6;-!}Zkx{GumVO-I3^ znTWo%gU;=$cMe9DduN#0>Sxu>tAq4-A`)EG9*hK|A^6|!uD%|Vtxdj%s2WF1JQ#>~ zMXT`!A{-E=9}M|?bZXv0zfDRXDst-6F+e1N_YmP0K$kos<$;bK95b!K7#+chIX-`D zYqWk&x;}FTc*6enSn42_9Dzg>*9zf8XGDZZDXwAUOO1rDE8I#~GBI55U}#%AMJO_d zghGE;cMIO6B-+AVeq0c>`?|ZxIJJb#oPjmOk9Q<>5q)zoieEd?br9I&!FJiCA3xDb zf}?dK2=W3Y<7ZUrhf$rkwRW|{&@JE3#J}aD*Eqr9PJuO`8_NI*qU$AJHzu5H$#*u% zg)c;!<%JVQAd1zFR=Re7*cTRZ;d5s!KO~8c1+}`3a4iS#;kk+=BHG#2DqjzEdmNE0s5)fH{kbp#(HE>eNRIVCDuKqc{~_niWz1W}o~49L2pw|Jn_2%fkJu z7Jsxi5R5=#K=?Mv*zt@bUv&sE&OU)tfSN_>Rb0|6IsM>_Je<<6_1f(3?)Jq3{)jJ< z=xo9KqIl566%<9BAhbqIs1^!0{%beM6;Q)EK=pHE`1G+fkFYe0+5+-LVRXX7;VXQR zU=PiGL|aCbESCqQJV&b2u@8%50KOX3DrTu-cKU~l zzC$#d4tnET^p78LML}BBWR*VaA34f_M^w^3w&ZKx zn{mH`Hbbo8DnH!I6Ym;p9#GxsJu|ka7*})kpK!m9Hbbo9>c46d$y)sYw{F2bE)C?W zS?TI5rC-p>?oBCO!PRf4l&)j(EX$BTMpUZw{yr_>!L%T*eiP3bX*0x)TxEfFF}S~% z8*DLfTxEq4r=+C!oB{b-%Fq>DWq$CT=t*;^8k_orHr0k3^;Qh0PoC1Ub78pe$_pn|F!v0FMigYr4m3(jl#uvrLObDw{ORLo3!0w_3#&>*PdE`h+-68qE^N z7*6yAmQy^lu$;!pDUmsX`)jS5@?B}m6O`4c>ghcT;Cw_V`ziO>drL`k-Fs)bBSSx24LLz4%W@%@M zoU?d7>~uUirmaqwhFGBp={>9Dq(F${guyJ;ERItQ&xgHFtNiN7s#zsx)Il8Q_hqSO zahwr&KJ5EiWj>x3(}oe38RM$Od)CT1aFECud0E<7BIod(54&D#4X3pu_l>o3$yzy2 z4iY(sE=xO0 zah#!+rJBWYPTTpg`?bnPbcqp1E|@YWy=SeQ@&$>UQI+vCf*7CGgY!jOW*E2MOJ*`vKvNZ>CYOctoIbhX{ zllZ|qa;gUZC#!DFsd`K<)vY;HAIhy7{8rVk>(NZ=(k2EsSTUSmc8ShS!t~-yu@U8Q 
zZrS;;H)(md=n}fuT6N!``@Wvm&ADA!x>+7)gF*L**k5URze(%PI*Yxc`*=D-9%o}^ z>1KJHzjZ$BMm;-u;NZ8{>UU6McnU!t=TBwnW_g@(bv`U7*!XoS4!%cm_4yQO6<6tZ zOth)5R@~wv1>c)gcY2>Kes#K2)i~3N)tW5eq^TT&IsNHHJcFbSTwGm{wwtR9GwkLT z8}~Pf&$65Iq*$%V0#13#A(#_LY&!h{TwL9lBCXicfma`AMwS zWC5od88JBo^OthIQsD29@9V1z{y8Rple7kuf4azDU#;H_t$0pT&)471tX~wZc%E*2 zlIFWM{EgO%|Kv8kv3?$&l3O=B9-Z^wu`u`9@yI-$6Q7rQ-%KVxKlPrOO!}zQ`)D%h z1%ltxl1X=n4S1k#-SB<@yb9GyMG}xGO1*E!+-nz3^#*h^5p}yL7JW!yvTk;~ou@%x z1DgBn$Rp+GXVS}3@A1i`yTl4g{jh9yQJ(s{ALcGQ5~B`LQj}%0i;C2Hfy`ZYF(&ms zpiKH${5=!5nI!9WF;4LNd(547dRvb>7rqkYT=X19LZDqB8#c#g7f6`PvDs5UgU;-< z3*;K+*z5wyc{w(_xHt!u>~Uw&4N&sLJHyrv8m3(IOF*|_-1q4FoXj^x(~nu`^EBP9 zmwm=BN5v57c#<8o(RT?W$2jf;o%C+7$k`+D+4Er}`yY|`(c;Y&A<$j0-=*nn z)b}->-^0Lip3w9@jc5APpy%@IB1z9)59xh}x$tj-PI0^2t^8#>-yZ=#T8|gMKZWrZ zgPu$NOpV{8<+J_sCB8H(k9V=g@3Yv`r0Mrt=*zVnz6j^}wpQc!TlD%h{h)>3rsbT_ z$NQijnA;_tM=4!?)lhLeWRbH+%Rg+Pe`y3c-_-aa3;q!3G*5)4v*sr>{y9zO_|uz_ zXdI7Oye~wSAK7!-BBzY# zNCb7TUd=F3;?w7A_Um%cX-lcW?Leq~~}l5Z31Z* zsTOi3Lh%f!&$n!4<8@1Yco(maQaCaI$j?}E^YX^)ny$|5QF$vqAO4n{Z>d*H@Ge;? 
zeAhHzzqqm4cm1+utCp_zt!`Y5gca~2S2o}a#1hg}nbFqJz<)A~{y3N=V}X(}WcH8D zf6%EI=lcU-6Z?WP9Z#j6=u{k)`j9C}rRQYQ(yuMWJCeRqCW+1?QW*;}FNG2|r1kWIWtq;9kr*PQE5}6sMYc@g zG7VCt7G`$n*D_N!1zzB+Q<2i|cs8d*a1ICpoRI*eQeDy+OBsK~O}+0K;$vMtybZdQ z(kUn@gwnKql-!Y-bfZ%yO>>2&qR#2ewA`G4la&R7ETmX0n-1>PAc zRcSE&y**JA>+M8V5z2T}l^tB8WVC3vsEHs4ucmQvQ+3?mu3PwFK@BBrwN~RL-Zj+f z@W(pv>+s$PW92Yfm*lJL%Y41WaPH97Uxp*W5 zP?tQ`YJwelQR+apQmUm`)#{`;xDE~eP9#s&D0U36sKLtEiPesN&EAq|2htyD$1))ptinetnUglKkMc8*DU-?^u*#)CmAt3>-hPm=I8O_`-fa6Q=F>8b~E{3Ed18L zGaP52F0JwZk%gb{M{?P~hG~AQe)`^0iVqz_wDEl*{`@SnX}F Date: Sat, 15 Jun 2024 22:45:28 -0700 Subject: [PATCH 10/30] Add CUTLASS example --- src/dsaX_cutlass_interface.cu | 315 ++++++++++++++++++++++++++++++++++ 1 file changed, 315 insertions(+) create mode 100644 src/dsaX_cutlass_interface.cu diff --git a/src/dsaX_cutlass_interface.cu b/src/dsaX_cutlass_interface.cu new file mode 100644 index 0000000..fc68d55 --- /dev/null +++ b/src/dsaX_cutlass_interface.cu @@ -0,0 +1,315 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "dsaX_cutlass_interface.h" + +DSA_FTD_ComplexGEMM_CUTLASS::DSA_FTD_ComplexGEMM_CUTLASS(Options const &options): + problem_size(options.problem_size), batch_count(options.batch_count) { + + // Allocate device memory for batched planar complex GEMM + tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2); + tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2); + tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + + ptr_A_real.reset(batch_count); + ptr_A_imag.reset(batch_count); + ptr_B_real.reset(batch_count); + ptr_B_imag.reset(batch_count); + ptr_C_real.reset(batch_count); + ptr_C_imag.reset(batch_count); + ptr_D_real.reset(batch_count); + ptr_D_imag.reset(batch_count); +} + +// DMH: Replace this with data from DSA-FTD +void DSA_FTD_ComplexGEMM_CUTLASS::initialize() { + + if(testing) { + uint64_t seed = 1234; + + // Use 
small integers to simplify correctness checking + int scope_max = 6; + int scope_min = -6; + + BlockFillRandomUniform(tensor_A.get(), tensor_A.size(), seed, Element(scope_max), Element(scope_min), 0); + BlockFillRandomUniform(tensor_B.get(), tensor_B.size(), seed * 2019, Element(scope_max), Element(scope_min), 0); + BlockFillRandomUniform(tensor_C.get(), tensor_C.size(), seed * 2020, Element(scope_max), Element(scope_min), 0); + } else { + // DMH: construct DSA-FTD interface data transfer interface + } + + ptr_A = tensor_A.get(); + ptr_B = tensor_B.get(); + ptr_C = tensor_C.get(); + ptr_D = tensor_D.get(); + + batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2; + batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2; + batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2; + batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2; + + lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0); + ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0); + ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + + imag_stride_A = int64_t(problem_size.m()) * problem_size.k(); + imag_stride_B = int64_t(problem_size.k()) * problem_size.n(); + imag_stride_C = int64_t(problem_size.m()) * problem_size.n(); + imag_stride_D = int64_t(problem_size.m()) * problem_size.n(); + +} + +Result DSA_FTD_ComplexGEMM_CUTLASS::run(Options const &options) { + + Result result; + + initialize(); + + // Configure pointers in global memory + struct { + Element *base; + void **ptr_real; + void **ptr_imag; + int64_t batch_stride; + int64_t imag_stride; + } tensors[] = {{ tensor_A.get(), ptr_A_real.get(), ptr_A_imag.get(), batch_stride_A, imag_stride_A}, + { tensor_B.get(), ptr_B_real.get(), ptr_B_imag.get(), batch_stride_B, imag_stride_B}, + { tensor_C.get(), ptr_C_real.get(), ptr_C_imag.get(), batch_stride_C, imag_stride_C}, + { 
tensor_D.get(), ptr_D_real.get(), ptr_D_imag.get(), batch_stride_D, imag_stride_D}}; + + for (auto const &tensor : tensors) { + for (int idx = 0; idx < batch_count; ++idx) { + + cudaError_t error; + void *ptr_real = tensor.base + idx * tensor.batch_stride; + void *ptr_imag = tensor.base + idx * tensor.batch_stride + tensor.imag_stride; + + error = cudaMemcpy(tensor.ptr_real + idx, &ptr_real, sizeof(void *), cudaMemcpyHostToDevice); + if (error != cudaSuccess) throw std::runtime_error("Failed to copy pointer to device memory"); + + error = cudaMemcpy(tensor.ptr_imag + idx, &ptr_imag, sizeof(void *), cudaMemcpyHostToDevice); + if (error != cudaSuccess) throw std::runtime_error("Failed to copy pointer to device memory"); + + } + } + + + cudaEvent_t events[2]; + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // Record an event at the start of a series of GEMM operations + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Run profiling loop + //------------------- + // Execute the planar complex array GEMM kernel via the CUTLASS Library's + // dispatch routines. + // + // Note, for planar complex array GEMM kernels, all numeric type arguments + // specify the data type of the base real types. These are understood to + // apply to planar complex representations of matrices in memory and to complex + // structures for scalars. + // + // See tools/library/include/cutlass/library/handle.h for more details. 
+ // + for (int iter = 0; iter < options.iterations; ++iter) { + + result.status = handle.gemm_planar_complex_array( + problem_size.m(), // expected GEMM M dimension + problem_size.n(), // expected GEMM N dimension + problem_size.k(), // expected GEMM K dimension + batch_count, // Number of batched elements + + nullptr, + nullptr, + nullptr, + + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued accumulation + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued alpha/beta scalars + + &options.alpha, // Pointer to alpha scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued A matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of A matrix + cutlass::library::ComplexTransform::kConjugate, // Complex transformation on A matrix operand + + ptr_A_real.get(), // Pointer to array of pointers to real part of A matrix + ptr_A_imag.get(), // Pointer to array of pointers to imaginary part of A matrix + + lda, // Leading dimension of real part of A matrix + lda, // Leading dimension of imaginary part of A matrix + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued B matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of B matrix + cutlass::library::ComplexTransform::kNone, // Complex transformation on B matrix operand + + ptr_B_real.get(), // Pointer to array of pointers to real part of B matrix + ptr_B_imag.get(), // Pointer to array of pointers to imaginary part of B matrix + + ldb, // Leading dimension of real part of B matrix + ldb, // Leading dimension of imaginary part of B matrix + + &options.beta, // Pointer to beta scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex valued C and D matrices + + ptr_C_real.get(), // Pointer to array of pointers to real part of C matrix + ptr_C_imag.get(), // Pointer to array of pointers to imaginary part of C matrix + + ldc, // Leading dimension of 
real part of C matrix + ldc, // Leading dimension of imaginary part of C matrix + + ptr_D_real.get(), // Pointer to array of pointers to real part of D matrix + ptr_D_imag.get(), // Pointer to array of pointers to imaginary part of D matrix + + ldd, // Leading dimension of real part of D matrix + ldd // Leading dimension of imaginary part of D matrix + ); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "CUTLASS internal error - configuration not supported" << std::endl; + return result; + } + } + + // Record an event when the GEMM operations have been launched. + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. 
+ result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + if (handle.get_last_operation()) { + std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl; + } + + // Compute reference in device code + if (options.reference_check) { + + result.passed = true; + + for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) { + // Define the GEMM through templates + GemmPlanarComplex + (problem_size, options.alpha, + {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A}, + cutlass::ComplexTransform::kConjugate, + {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B}, + cutlass::ComplexTransform::kNone, + options.beta, + {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C}, + {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D} + ); + + Element epsilon = 0.1_hf; + Element nonzero_floor = 0.1_hf; + + result.passed = BlockCompareRelativelyEqual + ( + tensor_D.get() + idx * batch_stride_D, + tensor_D_ref.get() + idx * batch_stride_D, + batch_stride_D, + epsilon, + nonzero_floor + ); + } + + if (result.passed) std::cout << "Reference check passed." << std::endl; + else std::cerr << "Error - reference check failed." 
<< std::endl; + } + + std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " GFLOPs: " << result.gflops << std::endl; + + return result; +} + + int main(int argc, char const **args) { + cudaDeviceProp props; + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + Options options; + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // Compute GEMM + DSA_FTD_ComplexGEMM_CUTLASS gemm(options); + gemm.testing = true; + Result result = gemm.run(options); + + return result.passed ? 0 : -1; +} + From 6c3532322a416fba2af5cd58562b70046cae63e9 Mon Sep 17 00:00:00 2001 From: cpviolator Date: Sat, 15 Jun 2024 22:46:15 -0700 Subject: [PATCH 11/30] Remove code from src --- src/Makefile | 208 ---- src/cuda_correlator | Bin 34272 -> 0 bytes src/dsaX_beamformer.cu | 1128 --------------------- src/dsaX_beamformer.cu.wrk1 | 1003 ------------------- src/dsaX_beamformer_offline.cu | 933 ----------------- src/dsaX_beamformer_passon | Bin 178600 -> 0 bytes src/dsaX_beamformer_passon.cu | 1057 -------------------- src/dsaX_bigfake.c | 320 ------ src/dsaX_capture.c | 1080 -------------------- src/dsaX_capture_manythread.c | 1115 --------------------- src/dsaX_capture_manythread.c.bak | 1053 -------------------- src/dsaX_capture_pcap.c | 852 ---------------- src/dsaX_capture_thread.c | 1107 --------------------- src/dsaX_copydb.c | 273 ----- src/dsaX_cuda_correlator.cu | 309 ------ src/dsaX_dbnic.c | 435 -------- src/dsaX_dbnic.c.bak | 381 ------- src/dsaX_fake.c | 320 ------ src/dsaX_filTrigger.c | 559 ----------- src/dsaX_fluff.c | 415 -------- src/dsaX_makeFil.c | 276 ----- src/dsaX_merge.c | 580 ----------- src/dsaX_nicdb.c | 483 --------- src/dsaX_nicdb.c.bak | 434 -------- src/dsaX_reorder.c | 515 ---------- src/dsaX_reorder_raw.c 
| 613 ------------ src/dsaX_reorder_raw.c.bak | 672 ------------- src/dsaX_reorder_raw.c.bak2 | 608 ------------ src/dsaX_simplesplit.c | 362 ------- src/dsaX_splice.c | 201 ---- src/dsaX_split.c | 601 ----------- src/dsaX_splitup.c | 285 ------ src/dsaX_store.c | 218 ---- src/dsaX_testdada.c | 161 --- src/dsaX_trigger.c | 585 ----------- src/dsaX_wrangle | Bin 99600 -> 0 bytes src/dsaX_wrangle.c | 378 ------- src/dsaX_wrangleAndWrite.c | 365 ------- src/dsaX_writeFil.c | 486 --------- src/dsaX_writevis.c | 428 -------- src/dsaX_xgpu.cu | 375 ------- src/dumpfil.c | 294 ------ src/fil2dada.c | 521 ---------- src/flagger.c | 484 --------- src/gpu_flagger.cu | 1547 ----------------------------- src/spectrometer_header.txt | 38 - src/splice_offline_beams | Bin 32432 -> 0 bytes src/splice_offline_beams.c | 132 --- src/test_read.c | 279 ------ src/test_write.c | 452 --------- 50 files changed, 24921 deletions(-) delete mode 100644 src/Makefile delete mode 100755 src/cuda_correlator delete mode 100644 src/dsaX_beamformer.cu delete mode 100644 src/dsaX_beamformer.cu.wrk1 delete mode 100644 src/dsaX_beamformer_offline.cu delete mode 100755 src/dsaX_beamformer_passon delete mode 100644 src/dsaX_beamformer_passon.cu delete mode 100644 src/dsaX_bigfake.c delete mode 100644 src/dsaX_capture.c delete mode 100644 src/dsaX_capture_manythread.c delete mode 100644 src/dsaX_capture_manythread.c.bak delete mode 100644 src/dsaX_capture_pcap.c delete mode 100644 src/dsaX_capture_thread.c delete mode 100644 src/dsaX_copydb.c delete mode 100644 src/dsaX_cuda_correlator.cu delete mode 100644 src/dsaX_dbnic.c delete mode 100644 src/dsaX_dbnic.c.bak delete mode 100644 src/dsaX_fake.c delete mode 100644 src/dsaX_filTrigger.c delete mode 100644 src/dsaX_fluff.c delete mode 100644 src/dsaX_makeFil.c delete mode 100644 src/dsaX_merge.c delete mode 100644 src/dsaX_nicdb.c delete mode 100644 src/dsaX_nicdb.c.bak delete mode 100644 src/dsaX_reorder.c delete mode 100644 src/dsaX_reorder_raw.c 
delete mode 100644 src/dsaX_reorder_raw.c.bak delete mode 100644 src/dsaX_reorder_raw.c.bak2 delete mode 100644 src/dsaX_simplesplit.c delete mode 100644 src/dsaX_splice.c delete mode 100644 src/dsaX_split.c delete mode 100644 src/dsaX_splitup.c delete mode 100644 src/dsaX_store.c delete mode 100644 src/dsaX_testdada.c delete mode 100644 src/dsaX_trigger.c delete mode 100755 src/dsaX_wrangle delete mode 100644 src/dsaX_wrangle.c delete mode 100644 src/dsaX_wrangleAndWrite.c delete mode 100644 src/dsaX_writeFil.c delete mode 100644 src/dsaX_writevis.c delete mode 100644 src/dsaX_xgpu.cu delete mode 100644 src/dumpfil.c delete mode 100644 src/fil2dada.c delete mode 100644 src/flagger.c delete mode 100644 src/gpu_flagger.cu delete mode 100644 src/spectrometer_header.txt delete mode 100755 src/splice_offline_beams delete mode 100644 src/splice_offline_beams.c delete mode 100644 src/test_read.c delete mode 100644 src/test_write.c diff --git a/src/Makefile b/src/Makefile deleted file mode 100644 index 0de1991..0000000 --- a/src/Makefile +++ /dev/null @@ -1,208 +0,0 @@ -# This is set up for the CORR containers - -CC=gcc -CFLAGS1 = -g -O3 -Wall -pthread -march=native -I/usr/local/include -I/usr/local/include/src -I/usr/local/cfitsio-3.47/include/ -I/home/ubuntu/proj/dsa110-shell/dsa110-sigproc -CDEPS1=dsaX_def.h dsaX_capture_manythread.h -CDEPS2=dsaX_def.h dsaX_capture.h -LIBS = -L/usr/local/lib -lpsrdada -L/usr/lib/gcc/x86_64-linux-gnu/5 -lgfortran -L/usr/local/cuda/lib64 -lcudart -lcublas -lm -L/usr/local/cfitsio-3.47/lib -lcfitsio -lsigproc -lxgpu - -#LIBS2 = -L/home/ubuntu/PF_RING/userland/libpcap-1.9.1 -lpcap -#CDEPS3=dsaX_def.h dsaX_capture_pcap.h - -CCU=/usr/local/cuda/bin/nvcc -D CUDA -ccbin=g++ -CFLAGS2 = -I/home/ubuntu/proj/dsa110-shell/dsa110-xengine/src -I/home/ubuntu/proj/dsa110-shell/dsa110-xGPU/src -I/usr/local/include -I/usr/local/include/src -I/usr/local/cfitsio-3.47/include -I/home/ubuntu/proj/dsa110-shell/dsa110-sigproc -arch=sm_75 -O3 
-Xcompiler="-pthread" -DMATRIX_ORDER_TRIANGULAR -std=c++14 - - -.DEFAULT_GOAL := all - -test_write.o: test_write.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -test_write: test_write.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -test_read.o: test_read.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -test_read: test_read.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dsaX_trigger.o: dsaX_trigger.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_trigger: dsaX_trigger.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dsaX_filTrigger.o: dsaX_filTrigger.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_filTrigger: dsaX_filTrigger.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -splice_offline_beams.o: splice_offline_beams.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -splice_offline_beams: splice_offline_beams.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dsaX_store.o: dsaX_store.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_store: dsaX_store.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dsaX_fluff.o: dsaX_fluff.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_fluff: dsaX_fluff.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dsaX_reorder.o: dsaX_reorder.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_reorder: dsaX_reorder.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dsaX_dbnic.o: dsaX_dbnic.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_dbnic: dsaX_dbnic.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dsaX_nicdb.o: dsaX_nicdb.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_nicdb: dsaX_nicdb.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dsaX_capture.o: dsaX_capture.c $(CDEPS2) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_capture: dsaX_capture.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dsaX_capture_thread.o: dsaX_capture_thread.c $(CDEPS2) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_capture_thread: dsaX_capture_thread.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dsaX_capture_manythread.o: dsaX_capture_manythread.c $(CDEPS2) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_capture_manythread: dsaX_capture_manythread.o - $(CC) -o $@ $^ $(CFLAGS1) 
$(LIBS) - -dsaX_split.o: dsaX_split.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_split: dsaX_split.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dsaX_merge.o: dsaX_merge.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_merge: dsaX_merge.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dsaX_simplesplit.o: dsaX_simplesplit.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_simplesplit: dsaX_simplesplit.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - - -dsaX_fake.o: dsaX_fake.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_fake: dsaX_fake.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dsaX_splitup.o: dsaX_splitup.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_splitup: dsaX_splitup.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dsaX_copydb.o: dsaX_copydb.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_copydb: dsaX_copydb.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dsaX_writevis.o: dsaX_writevis.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_writevis: dsaX_writevis.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dsaX_wrangle.o: dsaX_wrangle.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_wrangle: dsaX_wrangle.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dsaX_testdada.o: dsaX_testdada.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_testdada: dsaX_testdada.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dsaX_writeFil.o: dsaX_writeFil.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_splice.o: dsaX_splice.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_writeFil: dsaX_writeFil.o - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dsaX_reorder_raw.o: dsaX_reorder_raw.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -dsaX_reorder_raw: dsaX_reorder_raw.o $(CDEPS1) - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -fil2dada.o: fil2dada.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -fil2dada: fil2dada.o $(CDEPS1) - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dumpfil.o: dumpfil.c $(CDEPS1) - $(CC) -c -o $@ $< $(CFLAGS1) - -dumpfil: dumpfil.o $(CDEPS1) - $(CC) -o $@ $^ $(CFLAGS1) $(LIBS) - -dsaX_xgpu: dsaX_xgpu.cu - $(CCU) -o $@ $^ 
$(CFLAGS2) $(LIBS) - -cuda_correlator: cuda_correlator.cu - $(CCU) -o $@ $^ $(CFLAGS2) $(LIBS) - -gpu_flagger: gpu_flagger.cu - $(CCU) -o $@ $^ $(CFLAGS2) $(LIBS) - -dsaX_beamformer: dsaX_beamformer.cu - $(CCU) -o $@ $^ $(CFLAGS2) $(LIBS) - -dsaX_bfCorr: dsaX_bfCorr.cu - $(CCU) -o $@ $^ $(CFLAGS2) $(LIBS) - -dsaX_beamformer_passon: dsaX_beamformer_passon.cu - $(CCU) -o $@ $^ $(CFLAGS2) $(LIBS) - -dsaX_beamformer_offline: dsaX_beamformer_offline.cu - $(CCU) -o $@ $^ $(CFLAGS2) $(LIBS) - -.PHONY: clean all - -clean: - rm -f *.o *~ dsaX_beamformer dsaX_beamformer_passon dsaX_xgpu dsaX_reorder_raw dsaX_writeFil dsaX_writevis dsaX_fake dsaX_capture dsaX_dbnic dsaX_nicdb dsaX_split dsaX_wrangle fil2dada gpu_flagger dumpfil dsaX_simplesplit dsaX_store dsaX_trigger dsaX_beamformer_offline dsaX_splice dsaX_filTrigger cuda_correlator dsaX_copydb dsaX_bfCorr dsaX_merge - -all: dsaX_beamformer dsaX_beamformer_passon dsaX_xgpu dsaX_reorder_raw dsaX_writeFil dsaX_writevis dsaX_fake dsaX_capture dsaX_capture_thread dsaX_capture_manythread dsaX_dbnic dsaX_nicdb dsaX_split dsaX_wrangle fil2dada gpu_flagger dumpfil dsaX_simplesplit dsaX_store dsaX_trigger dsaX_filTrigger dsaX_beamformer_offline dsaX_splice dsaX_splitup cuda_correlator dsaX_copydb dsaX_bfCorr dsaX_merge - - - - diff --git a/src/cuda_correlator b/src/cuda_correlator deleted file mode 100755 index a8b94c759c2da5b87ab4c1a740138d0ad7d75073..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 34272 zcmeHw4SZC^x%cdDLN)<6A)p39IcUJBWk~{r04f`jz(xZ^Aeu^Xv&n9f)y;0Y9|+dk zXuujNx9HmsZr|4O>+J`(Z{MqJZ7)(^h{dn^wzp_&FSoTVwY7KS$1P&(N3G`m|8r(` z&u(_9?Y;N+{(kpmAm^FqKhN{b%yVYuoHMg$&PQ6?*Vt_~p@UsqDaaXfJQ6aGLY(-J zWEIvZN`*(%i1}g)kYfCKWD2Pn)AUVcT2rTzbAT4%uhDJ-jdqWWXgbG)g{HcMq|7f9 zJSu9^&t7S=nxZBWpdQOQwobC3-7=!-eijJf7z-tXA&)E`^-@YNrSvqdQ03QD*N#bzs8E6%Repv1 z;!)+-RG)`M;ZRS*l11Ub!f+^(+`e#oW5dFRCGL3CT`%iR`;=X~zEkF;{b+z=nP=jU zE@ygK{jyj8H2&n#uXg(Gf9=_O=Z@aA^^Z)u41fB(-ThddO%%6JCAJWMb@*G|`s~Z! 
z`SrcGj(ziu%7c4ebv2&aJ?CFO*fr`w=82HN#tp)51Lona@dai){PyzgJ3*i4$pq$??970?fxp}k29VEhPZhAU2=Yd*5%3!e;Qvs-&aZ%1h}ok1 zn(i!XznAzNv3FxOeiZn0T!juOc|G2COaDw2C(AuqcD|zIwS9d(NGbao*GgV#r(Aq( zZnmAPm7Rxk^4+iSF@3|D5`5g4a?lL3D$!Lhln&Rr{fZJXEm0>N0YKNL>{ zW1H7Choh0;W`9pOD15&D!Dz%6Pe8%v6TYjq`8F!K=CD5=55`UPyvodHO)MA`jI8sA z!_i(CB%^JtNk)1Tp=iXE2oClR?NAb(5p8mfKe0L#@yB#ZM|7wewwj}nzEFQM=9l`} zX0P?fax>CKS|fo%NwN^_eiUio>R>Dq4AWQJ`-hV7lpt*$2=?BzF&IyV6T;UQii9M( z(H{v!2b-gVL*d|d;RBqI_=druu3)4;6bVXJI)YZ(B7LU0jlpOPZeQn5#6sJpOqekDmJe?x&X%Dxal(giE_}TMH^Ey&u}~z@Cz*H*wH1BQpm1z(>(6sC{2`+fdIa681(h(5M8UohZL_>tNZkDG~rP|UUMZL6ERx_pb> zOU$^h-d*ornvQ2sN+8(rk9QsXE0X_+nK1{bL#F7vPmMU_-JuXAN)O#r%7Or~sehSR z2@?>Q@*Mgs?~@GtB`&!aUAaWOMFR*-KWa%&o);?RooF$Or_zy})<3cEXGkf6mKkiw zeKu_A`@ET<&{-_|bVw01?Kz}9@`+irZx^$b{WmvsOGCV;tdTMEpvN!XgL6#Qp{k=> zMp-{y1_Fi!S5=d8H5OcDDe+nhuE$y?G+J=${lCY8Tkn6p7F_$Bb~-G$9y^J5S#UTa z9l9-eSsE2$z=BV;;6oOCng!o(!EqweVc3GZ(x?!3S@3cTzSn~5bIas?7JRxze!m5; zwBQFU_zVmFfCcA0x(){|_}Ll+9kbx)Snww;c$Eb|X2EA!@Z%QzJPUrpg3q?#DGRQj z5t)3_f}d}Zf760rV8MmDj6SfV(!8@=bqCqKZ%@e>mW^Is&-E~j zFL`#waYOQVlV=wkuaNv5x>Ueeg*k+$qz}so;#)CC{!i z-XZx5$+OFhdnA7zd3KfYTFF5m_i{PX15f;Yc{u%OY;^X@z{}lQ8AI$Cn~t!#HFY*^Zu&oLXq}rz-|>#V*7UH15A*1~{bfEG2E4nA=JRsp z9o<=71E6l~iI!?P4ud)=`9zWIZsl`#f)V1-!)WB*(O-=IE>(x@N8X$$-`NSdkyP`@ zn~TdoT8;Qb^<#|-Jm?NBFuePUQ53lDnY!0g1{6kN{-r~GZfy_gA)PLe-Zc8eOaHTc zF7>X_c6HZB(r|gA?w{e_0R+JJf?WvSnB6+fjoPuX2e=XCsO|lnRxd+Tzw2m zDL0tipRN8e;w`%$u6_>m@RQXZFdvfwExUJCp8zOL)Qye)v}N?ImeKa=s#M@@7_OL) z;#>v8UdWw<-0mgSw9VGzePXAih|2grMiAIC_LD+|cPiu$fKTnJs+|?{ZecB{oiXDeX^A@=O28>u9Ooe#%>Z>6kjN<@fKDC!N@Fx_&>M z^T>&!Q{^`lFUKJ3Ex(_o-UXXACjkBmf6pQQebDm_2$bK|fpQ_YXI_JiV~6*lNyu^? zRQMj!{6DzsSpCl=IWZ%ytgJMPL)?6diWT$5QUu7MFvkveMF9? 
z-g*<`LQ%DCcTx3n_KjWthz_v(1J%ya<0C1Y0|(Bich7^>!^qHd$<2SHOLj_xweO%; zbVJDX&LhtkQKrl5*jarSOi2HX9!Dd1{8ag+Pv&|GjxksV>9O9UaL{+HPD-JYG_yM6A|sEBegd(kk(X`Bt6R$N zPgFabM*hu~T;+w6&|!X=`Y1f@MJun^y}5cC0Ct_K)FbfgOXp$WINoT8(MJvq;Uvgm z2IIz;q{gj~dK*%@MW*gUKC{D<+S zweEY~(WjaYV(6YYTzkF;L(@_3Xz^3a43T<)6(4!jc*uzhQES~8l^_gXHpv0y@SV8d zOWpta#6;Ki57jEGkjs+UL7Cls9qs-R_{wk066`A1`3%cFNoj8J7DG z1%!qvbT@=#xhrvLk8sz6_B}=GH?9e zEcZv1koBygbca7gp=Tf@%l+xJlsfrl1rBeg)YmArWea-My-N*|dIjv3BL*7N_`MDc z0egA>f8?W-wByVi>O$-=*(1Z1XmBHsI>%pxN~XQmz0x-N^bKBI6-It@vSV}G6JtZ@ zINm275t2NyA8M&{Ve0pN^8V;xKa6#H7j}bqN8WUKM_<@B`yhIZeB7A6^#~ItPC$4k zL9z8HpE5G}9cFr`_x6921|E6W0R!IAlio*CtGu?yy-&ZJI6G}^sjFQi=<&5Gn`mb?mQWI&c%-fdv`tU4k`1`MV~3uJ4&qls$I=w{q+ z;O?Zh1=sk-2@JseYezA=Q{OqoHu}<+Q!&(_Z|^4W=(~sLdfYMZB^mu!OJU6`5XRrL zq2+yOQBN?Q@FfNh4MFnAlTzH-H`?J$?Se7y=&$5m(a}Rp(IfXw&^(;F7@oqwe*wi% zuOGU3zULpt7mp(u8!->>kWRwcwkZynS`C@c}4;Srw=Kq7+K;zr9 z@G$76w$azyMxSlJNe@y)MiHm)8wbs^0ifZ*E=dYi??9-((>01jmg;)Q~G0nPmlI2u%g^{K0;6NyXSwx0ME( zwSf(t>o?QD`s82_M5BFHX=HDT2aJ~1Yu9(KqofRMNM@0lX)+!(w$w_)ea4!$^-b-* zP48ddyrBcKn|4Hc4XiAv{QIIY1B)#Fu+fVmYx_|nEpOV-qebzaWMZo}I~| zCYl%s#%>Pb1o9*{ceXScxdvpWabr_!Ym3Ezxj<$_%yttGB0tqwwizF)y|wPv`db%& z*p$CE80+Eqw8IDow*|vSO(58l>^Gv3@QzD#yBV;UmeEZE(VLCLKq!uC#MvD<`PUqc z#e!k@0t4Lqq=D5b**T1xLx}+by#xLTmiTb4ae}PPK8!UxDPqp>}Au{M*ohOdqgpn+LGU7)*qa zW~{!>H|XCE@x^_{I`1v-gVSQsWd8umDZ#ZpLvdBE%;2rYb&NX+EbP_QVmwL)BZ-hd zYz|Gz12a7`;{E@Wj?FOZzh6=9hv6Jv<9{g~C}2k@yrV#Rt-^V7hIhJ~#F z!JWv}W>g-hTjJK}N6gsV_bd32Ybc$ToL~OYV$dNO1ejUmX0$4VVQhaI6Q;HR0Tl_Kn@{tSDP_ zA<~_uj8oAHMXMAw6s=LTR?$XD;R!`eeVil3oBB3k>f3~=Zxg1zO_=(2rEeFezFq0t zg{f~>`gWynSNe9PpIg7bl;`QqX+qM3M;U5TR1?Zu0~dDk>EEt|{+vD+X5fQK;Yo$m zDoPpVEu9NmrEs2Slz|UKS>a8^9m_&9`q~Z`x{C3q+k1|B-ic}?djTn|4Qzf$4)ensP36s}98@$CxN*CvgBQsMfyJsSUp z!u1`4#($`AJq~O9WrfR1$~u~{Qe3E5Fa291Ex$nFdhFMDi^8?1HO}`GXa__FjrRj^ zq`j#UK|UpMUHT0Epv1M;GB~fBl@&Ty+?%Q-E_T#d;^c83bt`H4wA+V=YlL&+_Td_* zjGBT>)7NRGnkM?fT*RWu;+aNtNsAUa#n~#U=pr*t<)TVUe7YsxVE(4AXjzfGUvDr;Y$~9C 
zQ0dRH=(CyR>i-THd^jop<4XUS($_@SU(;V8L0^-#4Cw)Nsi;sO&k^(1dKy0fj6TUZ zP2!-^@3uHBU-^0cp@=FjVrCdtg&I~o^T)VXGz@ADp98}(O=?5xlBOP$(khnXNR0W) zdlh}7k(M+gtRI7>%!`3`sHf{m%o@DPA(~qCWL;JHtUA`XE_X)HX7fToE=wxv0kz>t zFMX{8Ki$B=687deW=5G!O!(cHoQz|U!dT|6D|7Rg{=tM1AM`C-TITi#0x`_r`r@Hm zf=0uVva&L_`3z;a{ozo5#HcgeJ&nesr-4EflbB;(*(_#$Tt6)=5tW}$FGSUgb0+M$ z3ruwbqHN-BuUF#)zOl@jJ5zq}CD zN`CA|C<|@7D%QBFt}35;vvXLiI&bBri>oiB1?sop?*Q!TdsgDq{{a5>13yfl#WiG) zmbsj*Q=_i!Ev{kP)voPLuAz1$?ZjW|*@=nA*uYo2Dn4#s?W+14hu3B7DO&BS`B?E< zSM8lsnp}+|C0(wQc86<9nX9qMRompMS?w|)3!T+2XA^zPpB(-ee*@1=Ox(tDH@PZC z?Cq|qwUmT>ZNIDHDykUQxvG%3R>rT-j>DjdZ;{5+ajSegGo_u)f4Q#Jjx~OoeAl^T zm!`iTL|bwGVq#(e;_z$5RwU8i-v@s3mlG2upk}={xc1xZe^=(JXmUAQ=pK2levkCy zI4?m08GpB?{q3c{z4Z4wmnW9b*YdnyPXD59lIXzl@!8`|*jLv$opgqt@xU1mobkXJ z51jG984sNCz!?vm@xU1mobkXJ51jG984oBAoSfs4v7uQeJvrN?#}qCTao{69LoQvY z0F86c39B6UsIc*l$gAn+XSAno4lUpyjaac%E=3)>-=)pveMI)9o>-1%qt&I37nT; z&?}jIiI%}T%_FJCs}!%xT`N;FbKN?Byb31kvwfMbDYf6ivE*VLj+ea39r&CQICUKm;e4BkZ*Y>=={(N(n~YaD z%h5GU-eSDU`3kI;{E>0P`4!5%&3KLTFpQP_iSb(JcZt8lc%yR~@pl>blrE(mVI$)$ zeTfM+$#j%1rG{NHU8O%HQ)Igl0^OzD&aGsMjk~uElzyL#)5g*cmD*@+nq(5C95qT@ zHcpgoFI~nmR@#<;87^Hwu^F~DFn5$b!<3n}ZLl*^dMTMI+nr$UD&=rdGS4;yvbU6D zZ^iC>v$cr%IjxSXJ^_R?^8W+)6&jsBjKa_&!$G>1<~F3#@I0nBrUxY>GTzQ+dK! 
z>}elDgeXs?L_e$189=`P<}BGboW*Cu7MjchATDlI=a@^oj-NxM9J@Y&XS9MZUpwYL zkK=Sv$rDvci!N@yXup@d!-eu!)Y96$QsavN2%l6qvi?OXr zPG3OSrqj?T!JS*n#7e3;?m2N=6>Ic+)zOh6Y%|M?gqU>+;9r29bpayh!u)x&FYqv3 zyo_t$Y+I-OW*j#=UbA)1wq0X)BD1Q0g#)w{Wo_pyrztxutb>JFZy-I1h^XRrkSmov zHagp^yC}(3A`S$z9z*0i%(PPpm13x2rg3I!tCLwiIoq}orNBj;?mnO-xupq%ZQe69 z*@2{2Vd?x`gsP-D(@?2{FZY>2S~{wP^u2>^;e~LPv~)hoWm@`0UQ2upkbxzO zgI=I3c@Z*P0V}M7^GR_*k5XR*+^29^w^9dRVw7QSR0*ww&;_?zH0~*+F>2K~rI1Q20jRX21C{Qe|%%5yso7;*yj~`s7Y9=6@NDb`htj-&L+NZ2JfSo%j06Tn>G@{H`*j}Yql%?+ zFer6ApuvZ!KF^dqsKATDknF)JWh|KpJ3-;`Gta7~qeQ|H<*z6M(SLx;7^cg>jUaLsm2w^f!@oQtug zT1=gKB_J49QCz2s&cUd(V(QANZ8LDSD~AY-l90#H?{H0>%48c@=UHjlMu?RvwzzEu z*)nYjMX}l1W>90=Jkd5|5faZ@fC!53wym0F=6CU|wi(9MC9^12UMw6=U5E-W=@L!n zrKNJ2RGyKo%zAmMKD}C6n3++sqpoK!N!uWcrlIYVJ=c;7`&H-L>Sk=Tb)MTb_5HG* zvx-IC3<{n%dWQ|3tV0oJ-+c#OUah)kH5BGhf!S+?^QnMNro~yOxnP8;Yh?0UHS2Ob zW6ZMxc38YHQ@~ZT2wdbA6pgq{S442qsvD3$T1R$@5DT&Zbj6D^0G!7W(=JZ2P1xRc zorR+f2%o5Ii<0r!qEMtaoD2jPVJl=`!xFVGa(^Vbs3(LS6Qu}tFkIB%+dElgsn#~- zlqrEd7lRROJsA$h-Mz?*yC>phv&HTKW{XV|L(zIOp4c%284U-5eTbtkOA$oTl96{P zYDUfd8O^wS?FUhW%X^ZcaAIL7a;2G704k;sny6i0@f>H_itLUP;;JH>WB*6GWIQ3` zm&>@*?O4zSe&t_Er%H#p}gMFUQHvAmankSc0$p3I5u|m(aUUlB4GYUF?hSr}%h}kBnF}5FHFIGH=xv4aK52 zE(*l`b#=81;{(BPIL(Cmk=DB?9>Z3r$WgsYmB4nDePMrpTtwn{1y3g7%|N*^r@%{j zF}%Uaom4mYVnKgE7BR%VKjo>mob-&y5KQ)kqkh!D+$s}$w)PwHAgk3I3&gQ^C-(=% zUYG&Aw=eJlcFfls4JQX9!iU$uh1!jidu8%Oha$f0jW*@poxV5}hycR`?=#i$pSVt6CJc z-YoNClPodGQeif+>gKLq%Z7QElvchJ?bt`HU9Xk488rsB1 zL~SOqbs#(p9-kCFQTmWW`QxLo3?E zb?JOmL?^9L{1kEOZQMvr3b}H5Ole;RmGWssy3-T$&cZft@yeK{2X*Gsjdeifq+4vNp=R>pc5|M8n(ScvE>iNfP`BI_h zF|+Zq^n7PFK6&4BraLezTv6e%ipsbHFZb=O%!j+g{=E3){m^O1feDC%*n=lGI5104 zRg({|6x?kqH#o!$d|x9UJ`*=V`S7#T^Vn8f4snjq^ViwaoY z_gl8=I0ROgppw^ zF^BosX+g}WLtHosBj6Ah6~L<};k?5x60c6=CQJih6O;YSbq6~h?mFgpvQ#*SKzwfw z?pAp2dPxg#8~iz-{LFGNa9p(IIq$asr~Q*T_V1MPliSaifaj~%bHF+4mHT_(e+5eW z+{%lA^=IHTE6X{g%Bj~OE(A__y)Gnw+Y0$6DWB;Vcn*@`T4jg($T5JG;dX^{?>GjH ze@@}tFV2epqr$mGoE3jo;X8Bi*A&iu;H>h~@$kZWaql-PzDVKR@6C#@Q}}QWev`tv 
zshd@v)2X!2ecY@#zu!Qdd$(C}PJ|KXer;C#Hwxz#ZC3mb3g@P5R-99^vYpKKNL*ju zIR8QW+>4C?Zjs@A!1J|})d@;ddv*fk#isU`WKV)+2O1v zCP*iZt3%Sx};{3h^9QzYy71K|8_hn`^4w;4|> zTu*M{G|BKU1?;?5055^TeEC)Y&-Zm<`QB3izhBwWb9UNJmGoQY`YwN~kDL!Gc|DJ) z<&PkeFW+Amz+a{v`)t7t46Wg<0`i#F5|wtB7*_WWmnfy##H|KYd?^OfO1vzm*R!=q z8{~Pu^mHV$lVL*vJA(!AeFgAu6u^Heaa>EHNZKZ3_{-=rslK-U9et1@O;E+=ba%k2D@sS$wsC{KEzC7YpFUa0vaQ zmlm}D%YhpzFDtnWINLdQD`n8*{7Q*SDC=-t0eo8l`@0I@pD$qNn+hM!ectJfCE^Ke z8iolrpYN*XjlTA_O`GxQC2aYiKD>ktIDPnbP9(_B?gXMf^DQ}F0Fza5pFg=>a38pE z5c|5hmn~h+38_3n^7}u5YhzFhAOq`O=g5 znWxS~D9n8&+v{sh0ek?eISEBCX0`Z9Dx@soj8zEJm04etY6_c*bxToxf1*Whl(?Dm zb?@b+!S+czEoL^fY|q`eGW{aqq?GKJ3foV+rzKKPv#BLgPqV8fQctt3B~nkbuO(7X zv#}*oPqVWnQctt*2vSe8w`F_o=9ZIu-k#px(zmvK!)j~~>BD@cuMHE$ocFwbQ=+}m z=L=yQK73XxYcT{{ncjq*z>+;^ylXZ#t!wqQu5aNd%Va0gaeQ9V*XmUQ-jThagOPQ-L8H``0;$DIBi(&e6R)u$9RTJTundNu1CY#}RzvN7ejGvD~j`Szc@8s+;wVLlIKeGM;@6&rkJ z_3`xIW=t1IevFW9onuD+%%rSzmmEto8Jg!c`!kRHBBZ|E$od|n`9Z!M<;ni!cLtS^ z@+Y=i)Vnk4-5S&TH)1g`7#qS?X;^Y|H?3}4nDF;24$I|kE=2_vh62J(ZorRqNOxdI z1oljdl~9?Y_7=!ud`ROyuvAd7p>RTAM@S5cZbaSvQAD@{ns7@mx?@rK0YU72#hVD+ zcW@6kvr}&h_7CD_Nw|Xp>IP{bfDANenl&#WrZpYEu@_<}f`2g7i}FPi%o+%9h1~d1 zp};maD1I0s>{F6?vaxF6u{r*G2k&y2Hd7+h4tX3;!o?R>>hCZ-MJ*!hEw zetKRNI3Cr~`g*u#uj@EjD1;kLk}kiN)AS~!^S9@k z*XuvqmHvLE&+@Y>y8H?joYy9<_4PVYm70LARf4(x)Aiq}^j9l6{rh>Z($~MYr#@!) 
z()nxszX8TJ#qu%e_YF>#n>=-_L7k2?G>Vv2U$0M9RG1>ex?)O0!FaYd!zVzj`g$E} zubLFe^;a(cxg7odl^$t$ztYD~zp{ePSA&0th&6vbAN94lri5NL)$+OZ|9hpctEtz| z4l4cboC4>t;_Es3dY!GR#>`QE=}u=3nydeCj=o-x8(wKD>GP~j3Qj~b>#4qH}%b#A)>@YR>V@+$&dYbZcDpq|x57wd9?{zt~4XvknzPUh5*I&=C zbt`@TgxneoD}(r4m49sxuJxDZ;M$y~HA+86Ong%3pOc{VG|ZO>axtboru09ct|MAL zSC&8C&&4P^@BOAmSx)=UmA$5b{$*QC{T(?9Ijq>K^mG09#SfVJpU@}9WYcBUcKCZ- z9akcfGymN{u#cO&^ jtu78O2p8jzr&PCJJ+5$k&zJx32a$kqTq~IDq`Lk)d9f~T diff --git a/src/dsaX_beamformer.cu b/src/dsaX_beamformer.cu deleted file mode 100644 index afdda70..0000000 --- a/src/dsaX_beamformer.cu +++ /dev/null @@ -1,1128 +0,0 @@ -// -*- c++ -*- -/* will implement the 64-input beamformer - -does N beams of 256 - -order is (taking time as 8x 8.192e-6) -[2048 time, 63 antennas, 768 channels, 2 pol, r/i] -Load in 16 times at a time, so that we have (in units of what needs to be added) -[16 time, 63 antennas, 96 channels, 8 chunnels, 2 pol, r/i] - -This should be reordered on the cpu to -[16 time, 96 channels, 63 antennas, 8 chunnels, 2 pol, r/i] - -The first kernel, launched with 1536 blocks of 64 threads, needs to - - promote each measurement and store in shared mem, parallelizing over ants. need only 8 kB. - - each thread processes 4 beams, adding everything. for each beam, - + for each chunnel and pol, calculate weights using cal weights and ant positions, - + add everything into output array -Output array has order [beam, 96 frequency, 16 time] - -Shared mem requirement: 8 kB for promoted data, 512b for positions, nch*1024b for weights - -Initialy we start with 4-bit numbers. these are first rotated using 17-bit weights, yielding 22-bit numbers. -these are then added: (64 ant)^2 * (2 complex) * (32 chan) * (2 pol) * (16 time). -after adding by 64 ants, we have 28-bit numbers. Need to bit shift right by 19 after adding 64 ants. This will yield 29-bit numbers. Need to bit shift right by 21 to pick off lowest 8 bits. - -Do everything in floating point until second kernel. 
- -Second kernel will simply add times and adjacent channels and pick leading 8 bits -Then copy back to specific locations in host to form final [beam, time, frequency] array, to be sent to corner turn. - - */ - -#define THRUST_IGNORE_CUB_VERSION_CHECK - -#include -#include -using std::cout; -using std::cerr; -using std::endl; -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include "cuda_fp16.h" -//#include "dada_cuda.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "multilog.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_def.h" -#include -#include - -#include -using namespace nvcuda; - -// global variables -int DEBUG = 0; -const float sep = 1.0; - -// kernel for summing for online bp -// input array has order [beam, 48 frequency, 2 pol, 16 time] -// need to output to [beam, 48 frequency] -// run with 256*48=12288 blocks and 32 threads -__global__ -void badder(float *input, float *output) { - - // get block and thread ids - int bidx = blockIdx.x; // assume 256*48=12288 - int tidx = threadIdx.x; // assume 32 - //int fidx = 2*(bidx % 24); - int beamidx = (int)(bidx / 48); - - // declare shared mem - volatile __shared__ float data[32]; // data block to be summed - - // transfer from input to shared mem - data[tidx] = input[bidx*32+tidx]; - - // sync - __syncthreads(); - - // complete sum - if (tidx<16) { - data[tidx] += data[tidx+16]; // over pols - data[tidx] += data[tidx+8]; - data[tidx] += data[tidx+4]; - data[tidx] += data[tidx+2]; - data[tidx] += data[tidx+1]; - } - // now tidx = 0, 4, 8, 12 are what we want! 
- - __syncthreads(); - - // store - if (tidx == 0) - output[bidx] += data[0]; - -} - - -// kernel for summing and requantizing -// input array has order [beam, 48 frequency, 2 pol, 16 time] -// need to output to [4 time, beam, 48 frequency] -// bp is scale factor for each beam -// run with 256*48=12288 blocks and 32 threads -__global__ -void adder(float *input, unsigned char *output, float *bp) { - - // get block and thread ids - int bidx = blockIdx.x; // assume 256*48=12288 - int tidx = threadIdx.x; // assume 32 - //int fidx = 2*(bidx % 24); - int beamidx = (int)(bidx / 48); - - // declare shared mem - volatile __shared__ float data[32]; // data block to be summed - - // transfer from input to shared mem - data[tidx] = input[bidx*32+tidx]; - - // sync - __syncthreads(); - - // complete sum - if (tidx<16) { - data[tidx] += data[tidx+16]; // over pols - data[tidx] += data[tidx+2]; - data[tidx] += data[tidx+1]; - } - // now tidx = 0, 4, 8, 12 are what we want! - - __syncthreads(); - - // store - if (tidx == 0) - output[bidx] = (unsigned char)(__float2int_rn(data[0]*bp[beamidx])/2); - if (tidx == 4) - output[bidx + 12288] = (unsigned char)(__float2int_rn(data[4]*bp[beamidx])/2); - if (tidx == 8) - output[bidx + 2*12288] = (unsigned char)(__float2int_rn(data[8]*bp[beamidx])/2); - if (tidx == 12) - output[bidx + 3*12288] = (unsigned char)(__float2int_rn(data[12]*bp[beamidx])/2); - -} - -// kernel for promotion -/* -orig input is [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i] -input is [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i] -output needs to be [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] -promoted to half precision - -launch with 16*48*NANT blocks of 32 threads - - */ -__global__ void promoter(char *input, half *inr, half *ini) { - - int bidx = blockIdx.x; // assume 16*48*NANT - int tidx = threadIdx.x; // assume 32 - int iidx = bidx*32+tidx; - int pol = (int)(tidx % 2); - int chunnel = (int)(tidx / 2); - - /*int ant 
= (int)(bidx % NANT); - int time_chan = (int)(bidx / NANT); - int oidx = time_chan*2048+pol*1024+ant*16+chunnel;*/ - - int chan = (int)(bidx % 48); - int time_ant = (int)(bidx / 48); - int tim = (int)(time_ant / NANT); - int ant = (int)(time_ant % NANT); - int oidx = tim*98304 + chan*2048 + pol*1024 + ant*16 + chunnel; - - //inr[oidx] = __float2half((float)(((char)((input[iidx] & 15) << 4)) >> 4)); - //ini[oidx] = __float2half((float)(((char)((input[iidx] & 240))) >> 4)); - inr[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(15)) << 4) >> 4)); - ini[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4)); - -} - -// 16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels -// for first time, launch with 3072, 32 -__global__ void printer(half *inr, half *ini) { - - int idx = blockIdx.x*32+threadIdx.x; - float ir = __half2float(inr[idx]); - float ii = __half2float(ini[idx]); - - int chunnel = (int)(threadIdx.x % 16); - int channel = (int)(blockIdx.x/64); - int tt = (int)(blockIdx.x % 64); - int pol = (int)(tt/32); - int ant = ((int)(tt % 32))*((int)(threadIdx.x / 16)); - - if (ir!=0. || ii!=0.) { - printf("%d %d %d %d %f %f\n",channel,pol,ant,chunnel,ir,ii); - } - -} - -// 16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels -// launch with 4,32 -__global__ void rms_printer(half *inr, half *ini) { - - int idx = blockIdx.x*32+threadIdx.x; - int pol = (int)(idx / 64); - int ant = (int)(idx % 64); - - float rms = 0., val; - for (int i=0;i<16;i++) { - - idx = 786432 + 49152 + pol*64*16 + ant*16 + i; - - val = __half2float(inr[idx]); - rms += val*val; - val = __half2float(ini[idx]); - rms += val*val; - - } - rms = sqrt(rms/32.); - - printf("ANTPOL_RMS %d %d %f\n",ant,pol,rms); - -} - - - -// kernel for beamforming -/* - -Assumes that up to NANT antennas (nominally 63) are populated. - -Input is [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] (promoted) - -Arithmetic... 
for rotation, d2r = wr*dr-wi*di; d2i = wi*dr+wr*di - -Conventions for beamforming. beam 0 is furthest East, beam 127 is at meridian. antpos (D) is easting. -for bf weight calculation, where theta = s(127-n), ang = 2*pi*nu*theta*D/c; wr = cos(ang), wi = sin(ang) -use __float2int_rn, cosf, sinf intrinsics. - -Each warp (==block) has to deal with 256 beams for 64 ants, summing over 16 chunnels and pols. -Do it in tiles of 16 beams and 16 ants for - -Output array has order [beam, 48 frequency, 2 pol, 16 time] - -inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag -wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant] - -launch with 16time * 48freq * 2pol * 16beam_tile blocks of 32 threads for massive utilization - = 24576 blocks - -*/ -__global__ void beamformer(half *inr, half *ini, half *wr, half *wi, float *output, int stuffants) { - - // get block and thread ids - int bidx = blockIdx.x; // assume 24576 - int tidx = threadIdx.x; // assume 32 - int orig_bidx = (int)(bidx / 16); - int beam_tile = (int)(bidx % 16); - int stuff_tile = (int)(beam_tile % 4); - int data_offset = orig_bidx*1024; // offset for first part of data - int weight_offset = (int)(orig_bidx % 96); // offset for first part of weight - weight_offset *= 16384; - int idx1, idx2; - int f_idx = (int)(orig_bidx % 96); - int tim_idx = (int)(orig_bidx / 96); - int oidx = f_idx*16 + tim_idx; - - // shared memory for convenience - __shared__ half summr[16][16]; // beam, chunnel - __shared__ float summi[16][16]; // beam, chunnel - - // accumulate real and imag parts into [16 beam x 16 f] fragments - // Declare the fragments. 
- wmma::fragment a_frag; - wmma::fragment b_frag; - wmma::fragment wr_inr_frag; - wmma::fragment wr_ini_frag; - wmma::fragment wi_inr_frag; - wmma::fragment wi_ini_frag; - wmma::fragment ib_frag; - wmma::fragment final_frag; - - - // zero out accumulators - wmma::fill_fragment(wr_inr_frag, 0.0f); - wmma::fill_fragment(wr_ini_frag, 0.0f); - wmma::fill_fragment(wi_inr_frag, 0.0f); - wmma::fill_fragment(wi_ini_frag, 0.0f); - wmma::fill_fragment(ib_frag, 0.0f); - - // IB - if (stuffants==2) { - - wmma::fragment c_frag; - wmma::fragment d_frag; - - for (int ant_tile=0; ant_tile<4; ant_tile++) { - - wmma::load_matrix_sync(c_frag, inr + data_offset + ant_tile*256, 16); - wmma::load_matrix_sync(d_frag, inr + data_offset + ant_tile*256, 16); - wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag); - wmma::load_matrix_sync(c_frag, ini + data_offset + ant_tile*256, 16); - wmma::load_matrix_sync(d_frag, ini + data_offset + ant_tile*256, 16); - wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag); - - } - - } - - // one ant per beam - if (stuffants==1) { - - wmma::fragment c_frag; - wmma::fragment d_frag; - wmma::load_matrix_sync(c_frag, inr + data_offset + stuff_tile*256, 16); - wmma::load_matrix_sync(d_frag, inr + data_offset + stuff_tile*256, 16); - wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag); - wmma::load_matrix_sync(c_frag, ini + data_offset + stuff_tile*256, 16); - wmma::load_matrix_sync(d_frag, ini + data_offset + stuff_tile*256, 16); - wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag); - - } - if (stuffants!=1) { - - // loop over ant tiles - for (int ant_tile=0; ant_tile<4; ant_tile++) { - - // copy weight and data to fragments, and multiply to accumulators - - wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16); - wmma::load_matrix_sync(b_frag, inr + data_offset + ant_tile*256, 16); - wmma::mma_sync(wr_inr_frag, a_frag, b_frag, wr_inr_frag); - - wmma::load_matrix_sync(a_frag, wi + weight_offset + beam_tile*1024 + 
ant_tile*256, 16); - wmma::mma_sync(wi_inr_frag, a_frag, b_frag, wi_inr_frag); - - wmma::load_matrix_sync(b_frag, ini + data_offset + ant_tile*256, 16); - wmma::mma_sync(wi_ini_frag, a_frag, b_frag, wi_ini_frag); - - wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16); - wmma::mma_sync(wr_ini_frag, a_frag, b_frag, wr_ini_frag); - - } - - // form real and imaginary matrices - for(int i=0; i < wr_inr_frag.num_elements; i++) { - wr_inr_frag.x[i] = wr_inr_frag.x[i] - wi_ini_frag.x[i]; // output real - wi_inr_frag.x[i] = wi_inr_frag.x[i] + wr_ini_frag.x[i]; // output imag - wr_inr_frag.x[i] = wr_inr_frag.x[i]*wr_inr_frag.x[i] + wi_inr_frag.x[i]*wi_inr_frag.x[i]; // squared - } - } - - // at this stage the matrices are [beam, chunnel], and need to be summed over columns - - __syncthreads(); - - // copy back to shared mem - half *p1; - float *p2, tmp; - p1 = &summr[0][0]; - wmma::store_matrix_sync(p1, wr_inr_frag, 16, wmma::mem_row_major); - - __syncthreads(); - - if (stuffants!=1) { - - // now do thread reduction using multiplication by unity - wmma::fill_fragment(final_frag, 0.0f); - wmma::fill_fragment(b_frag, 1.0f); - wmma::load_matrix_sync(a_frag, p1, 16); - wmma::mma_sync(final_frag, a_frag, b_frag, final_frag); - p2 = &summi[0][0]; - wmma::store_matrix_sync(p2, final_frag, 16, wmma::mem_row_major); - - __syncthreads(); - - // store - if (tidx<16) { - output[(beam_tile*16+tidx)*1536 + oidx] = summi[tidx][tidx]; - } - - - } - - if (stuffants==1) { - if (tidx<16) { - output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][tidx]; - } - } - if (stuffants==2) { - - p2 = &summi[0][0]; - wmma::store_matrix_sync(p2, ib_frag, 16, wmma::mem_row_major); - tmp = 0.; - for (int i=0;i<16;i++) tmp += summi[i][i]; - if (tidx==0 && beam_tile==0) - output[(beam_tile*16+tidx)*1536 + oidx] = tmp; - - } - -} - -// kernel to calculate weights - needed because weights are halfs -// launch with 256 threads in 6144 blocks -__global__ -void calc_weights(float 
*antpos, float *weights, float *freqs, half *wr, half *wi) { - - // assume 256 threads in 6144 blocks - int bidx = blockIdx.x; // over 48f, 2pol, 16 beam_tile, 4 ant_tile - int tidx = threadIdx.x; - int f = (int)(bidx / 128); - int cc = (int)(bidx % 128); - int pol = (int)(cc / 64); - cc = (int)(cc % 64); - int beam_tile = (int)(cc / 4); - int ant_tile = (int)(cc % 4); - int beam_i = (int)(tidx / 16); - int ant_i = (int)(tidx % 16); - - int beam = beam_tile*16+beam_i; - int ant = ant_tile*16+ant_i; - int i = bidx*256+tidx; - int widx = ant*NW*2*2 + f*2*2 + pol*2; - - float theta = sep*(127.-beam*1.)*PI/10800.; // radians - float afac = -2.*PI*freqs[f*8+4]*theta/CVAC; // factor for rotate - float twr = cos(afac*antpos[ant]); - float twi = sin(afac*antpos[ant]); - - wr[i] = __float2half((twr*weights[widx] - twi*weights[widx+1])); - wi[i] = __float2half((twi*weights[widx] + twr*weights[widx+1])); - - -} - - -// function prototypes -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); -int dada_bind_thread_to_core (int core); -int init_weights(char *fnam, float *antpos, float *weights, char *flagants); -void reorder_block(char *block); -void calc_bp(float *data, float *bp, int pr); -void calc_allbp(float *data, float *bp); -void ret_med_bp(float *bp); -void ret_many_bp(float *many_bp, float *bp); - -// performs massive summation to calculate bp -// input array has order [beam, 96 frequency, 16 time] -// bp has size 48 - no way to avoid strided memory access -// returns factor to correct data -void calc_bp(float *data, float *bp, int pr) { - - int i=0; - - for (int b=0;b<256;b++) { - for (int f=0;f<48;f++) { - for (int a=0;a<32;a++) { - bp[b] += data[i]; - if (pr && data[i]!=0.) 
printf("%d %d %d %f\n",b,f,a,data[i]); - i++; - } - } - } - -} - -void calc_allbp(float *data, float *bp) { - - int i=0; - - for (int st=0;st *(const float*)elem2; -} - -void ret_med_bp(float *bp) { - - qsort(bp, 256, sizeof(float), cmpfunc); - float medval = 0.5*(bp[127]+bp[128]); - for (int i=0;i<256;i++) - bp[i] = medval; - -} - -void ret_many_bp(float *many_bp, float *bp, float medbp) { - - for (int i=0;i<256;i++) { - bp[i] = 0.; - for (int j=0;j0.1) - bp[i] = medbp; - } - -} - -// performs cpu reorder of block to be loaded to GPU -void reorder_block(char * block) { - - // from [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i] - // to [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i] - // 24576*NANT in total. 1536*NANT per time - - char * output = (char *)malloc(sizeof(char)*24576*NANT); - - for (int i=0;i<16;i++) { // over time - for (int j=0;j= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (0); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - 
return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); - uint64_t bytes_read = 0; - int nints = NPACKETS / 16; - uint64_t nbytes_per_int = block_size / nints; - uint64_t nbytes_per_out = block_out / nints; - char * block; - unsigned char * output_buffer; - output_buffer = (unsigned char *)malloc(sizeof(unsigned char)*block_out); - memset(output_buffer,0,block_out); - uint64_t written, block_id; - - // allocate host and device memory for calculations - //inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag - //wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant] - char *d_indata[NSTREAMS]; - unsigned char *d_outdata[NSTREAMS]; - float *d_transfer[NSTREAMS], *d_bp, *d_antpos, *d_weights, *d_freqs; - half *d_wr, *d_wi, *d_inr[NSTREAMS], *d_ini[NSTREAMS]; - float *d_added[NSTREAMS], *h_added; - h_added = (float *)malloc(sizeof(float)*256*48*NSTREAMS); - cudaMalloc((void **)&d_antpos, 64*sizeof(float)); // ant 
positions - cudaMalloc((void **)&d_weights, 64*NW*2*2*sizeof(float)); // weights - cudaMalloc((void **)&d_freqs, 384*sizeof(float)); // freqs - cudaMalloc((void **)&d_bp, 256*sizeof(float)); // bandpass - cudaMalloc((void **)&d_wr, 48*2*16*4*16*16*sizeof(half)); // real weight - cudaMalloc((void **)&d_wi, 48*2*16*4*16*16*sizeof(half)); // imag weight - cudaMemcpy(d_freqs, freqs, 384*sizeof(float), cudaMemcpyHostToDevice); - - float *h_transfer = (float *)malloc(sizeof(float)*256*96*16*NSTREAMS); - char *h_indata = (char *)malloc(sizeof(char)*16*NANT*96*8*2); - float *bp = (float *)malloc(sizeof(float)*256); - float *frozen_bp = (float *)malloc(sizeof(float)*256); - float *many_bp = (float *)malloc(sizeof(float)*256*NBP); - int bpctr = 0; - float medbp; - unsigned char *tmp_buf = (unsigned char *)malloc(sizeof(unsigned char)*256*48*4*NSTREAMS); - - // streams and device - cudaStream_t stream[NSTREAMS]; - for (int st=0;st d1(d_inr[st]); - thrust::fill(d1, d1+16*48*2*64*16, 0.0); - thrust::device_ptr d2(d_ini[st]); - thrust::fill(d2, d2+16*48*2*64*16, 0.0); - } - - - - // set up - - int observation_complete=0; - int blocks = 0, started = 0; - int blockct = 0; - int slow_down = 0; - int prestart = 0; - - syslog(LOG_INFO, "starting observation"); - - while (!observation_complete) { - - // open block - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - blockct ++; - - // DO STUFF - - // calc weights - init_weights(fnam,antpos,weights,flagants); - cudaMemcpy(d_antpos, antpos, 64*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(d_weights, weights, 64*NW*2*2*sizeof(float), cudaMemcpyHostToDevice); - calc_weights<<<6144, 256>>>(d_antpos, d_weights, d_freqs, d_wr, d_wi); - if (DEBUG) syslog(LOG_INFO,"Finished with weights"); - - // zero out d_added - for (int st=0;st>>(d_indata[st], d_inr[st], d_ini[st]); - - // do printing if needed - if (bst==0 && slow_down==0) - rms_printer<<<4, 32, 0, stream[st]>>>(d_inr[st], d_ini[st]); - - // run 
beamformer kernel - beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants); - - // run badder kernel - badder<<<12288, 32, 0, stream[st]>>>(d_transfer[st], d_added[st]); - - // if sufficient bandpasses... - if (started>0) { - - // run adder kernel - adder<<<12288, 32, 0, stream[st]>>>(d_transfer[st], d_outdata[st], d_bp); - - // copy to host - cudaMemcpyAsync(tmp_buf + 256*48*4*st, d_outdata[st], 256*48*4*sizeof(unsigned char), cudaMemcpyDeviceToHost, stream[st]); - - // copy to output - for (int j=0;j<12288*4;j++) { - if (test_pattern) - output_buffer[(bst*NSTREAMS+st)*12288*4+j] = (unsigned char)((blockct % 128) + 32); - else - output_buffer[(bst*NSTREAMS+st)*12288*4+j] = tmp_buf[j+256*48*4*st]; - } - if (DEBUG && bst*NSTREAMS+st==10) { - for (int j=0;j<48;j++) syslog(LOG_DEBUG,"%hu",output_buffer[(bst*NSTREAMS+st)*12288+BEAM_OUT*48+j]); - } - - } - - } - } - - // now deal with bandpass - - // copy to host - for (int st=0;st0 && bpctr0 && bpctr>=NBP) { - - //syslog(LOG_INFO,"now using many BPs for requant"); - - // do average bp - ret_many_bp(many_bp,bp,medbp); - - started=2; - - } - - - - // finally deal with bp - for (int i=0;i<256;i++) { - - if (AGC==0) - for (int i=0;i<256;i++) bp[i] = frozen_bp[i]; - - if (bpctr<15) syslog(LOG_INFO,"coeff %d %d %g",bpctr,i,bp[i]); - if (bp[i]!=0.) 
{ - bp[i] /= 48.*nints; - bp[i] = 2.5*128./bp[i]; - } - } - cudaMemcpy(d_bp, bp, sizeof(float)*256, cudaMemcpyHostToDevice); - - bpctr++; - slow_down++; - if (slow_down>=20) slow_down=0; - - // write to output - written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out); - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - if (DEBUG) { - syslog(LOG_DEBUG, "written block %d",blocks); - } - blocks++; - - - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - for (int st=0;st -#include -using std::cout; -using std::cerr; -using std::endl; -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include "cuda_fp16.h" -//#include "dada_cuda.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "multilog.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_def.h" -#include -#include - -#include -using namespace nvcuda; - -// global variables -int DEBUG = 0; - - -// kernel for summing and requantizing -// input array has order [beam, 48 frequency, 2 pol, 16 time] -// need to output to [4 time, beam, 48 frequency] -// bp is scale factor for each beam -// run with 256*48=12288 blocks and 32 threads -__global__ -void adder(float *input, unsigned char *output, float *bp) { - - // get block and thread ids - int bidx = blockIdx.x; // assume 256*48=12288 - int tidx = threadIdx.x; // assume 32 - //int fidx = 2*(bidx % 24); - int beamidx = (int)(bidx / 48); - - // declare shared mem - volatile __shared__ float data[32]; // data block to be summed - - // transfer from input to shared mem - data[tidx] = input[bidx*32+tidx]; - - // sync - __syncthreads(); - - // complete sum - if (tidx<16) { 
- data[tidx] += data[tidx+16]; // over pols - data[tidx] += data[tidx+2]; - data[tidx] += data[tidx+1]; - } - // now tidx = 0, 4, 8, 12 are what we want! - - __syncthreads(); - - // store - if (tidx == 0) - output[bidx] = (unsigned char)(__float2int_rn(data[0]*bp[beamidx])/2); - if (tidx == 4) - output[bidx + 12288] = (unsigned char)(__float2int_rn(data[4]*bp[beamidx])/2); - if (tidx == 8) - output[bidx + 2*12288] = (unsigned char)(__float2int_rn(data[8]*bp[beamidx])/2); - if (tidx == 12) - output[bidx + 3*12288] = (unsigned char)(__float2int_rn(data[12]*bp[beamidx])/2); - -} - -// kernel for promotion -/* -orig input is [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i] -input is [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i] -output needs to be [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] -promoted to half precision - -launch with 16*48*NANT blocks of 32 threads - - */ -__global__ void promoter(char *input, half *inr, half *ini) { - - int bidx = blockIdx.x; // assume 16*48*NANT - int tidx = threadIdx.x; // assume 32 - int iidx = bidx*32+tidx; - int pol = (int)(tidx % 2); - int chunnel = (int)(tidx / 2); - - /*int ant = (int)(bidx % NANT); - int time_chan = (int)(bidx / NANT); - int oidx = time_chan*2048+pol*1024+ant*16+chunnel;*/ - - int chan = (int)(bidx % 48); - int time_ant = (int)(bidx / 48); - int tim = (int)(time_ant / NANT); - int ant = (int)(time_ant % NANT); - int oidx = tim*98304 + chan*2048 + pol*1024 + ant*16 + chunnel; - - //inr[oidx] = __float2half((float)(((char)((input[iidx] & 15) << 4)) >> 4)); - //ini[oidx] = __float2half((float)(((char)((input[iidx] & 240))) >> 4)); - inr[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(15)) << 4) >> 4)); - ini[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4)); - -} - -// 16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels -// for first time, launch with 3072, 32 -__global__ void 
printer(half *inr, half *ini) { - - int idx = blockIdx.x*32+threadIdx.x; - float ir = __half2float(inr[idx]); - float ii = __half2float(ini[idx]); - - int chunnel = (int)(threadIdx.x % 16); - int channel = (int)(blockIdx.x/64); - int tt = (int)(blockIdx.x % 64); - int pol = (int)(tt/32); - int ant = ((int)(tt % 32))*((int)(threadIdx.x / 16)); - - if (ir!=0. || ii!=0.) { - printf("%d %d %d %d %f %f\n",channel,pol,ant,chunnel,ir,ii); - } - -} - - -// kernel for beamforming -/* - -Assumes that up to NANT antennas (nominally 63) are populated. - -Input is [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] (promoted) - -Arithmetic... for rotation, d2r = wr*dr-wi*di; d2i = wi*dr+wr*di - -Conventions for beamforming. beam 0 is furthest East, beam 127 is at meridian. antpos (D) is easting. -for bf weight calculation, where theta = s(127-n), ang = 2*pi*nu*theta*D/c; wr = cos(ang), wi = sin(ang) -use __float2int_rn, cosf, sinf intrinsics. - -Each warp (==block) has to deal with 256 beams for 64 ants, summing over 16 chunnels and pols. 
-Do it in tiles of 16 beams and 16 ants for - -Output array has order [beam, 48 frequency, 2 pol, 16 time] - -inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag -wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant] - -launch with 16time * 48freq * 2pol * 16beam_tile blocks of 32 threads for massive utilization - = 24576 blocks - -*/ -__global__ void beamformer(half *inr, half *ini, half *wr, half *wi, float *output, int stuffants) { - - // get block and thread ids - int bidx = blockIdx.x; // assume 24576 - int tidx = threadIdx.x; // assume 32 - int orig_bidx = (int)(bidx / 16); - int beam_tile = (int)(bidx % 16); - int stuff_tile = (int)(beam_tile % 4); - int data_offset = orig_bidx*1024; // offset for first part of data - int weight_offset = (int)(orig_bidx % 96); // offset for first part of weight - weight_offset *= 16384; - int idx1, idx2; - int f_idx = (int)(orig_bidx % 96); - int tim_idx = (int)(orig_bidx / 96); - int oidx = f_idx*16 + tim_idx; - - // shared memory for convenience - __shared__ half summr[16][16]; // beam, chunnel - __shared__ float summi[16][16]; // beam, chunnel - - // accumulate real and imag parts into [16 beam x 16 f] fragments - // Declare the fragments. 
- wmma::fragment a_frag; - wmma::fragment b_frag; - wmma::fragment wr_inr_frag; - wmma::fragment wr_ini_frag; - wmma::fragment wi_inr_frag; - wmma::fragment wi_ini_frag; - wmma::fragment ib_frag; - wmma::fragment final_frag; - - - // zero out accumulators - wmma::fill_fragment(wr_inr_frag, 0.0f); - wmma::fill_fragment(wr_ini_frag, 0.0f); - wmma::fill_fragment(wi_inr_frag, 0.0f); - wmma::fill_fragment(wi_ini_frag, 0.0f); - wmma::fill_fragment(ib_frag, 0.0f); - - // IB - if (stuffants==2) { - - wmma::fragment c_frag; - wmma::fragment d_frag; - - for (int ant_tile=0; ant_tile<4; ant_tile++) { - - wmma::load_matrix_sync(c_frag, inr + data_offset + ant_tile*256, 16); - wmma::load_matrix_sync(d_frag, inr + data_offset + ant_tile*256, 16); - wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag); - wmma::load_matrix_sync(c_frag, ini + data_offset + ant_tile*256, 16); - wmma::load_matrix_sync(d_frag, ini + data_offset + ant_tile*256, 16); - wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag); - - } - - } - - // one ant per beam - if (stuffants==1) { - - wmma::fragment c_frag; - wmma::fragment d_frag; - wmma::load_matrix_sync(c_frag, inr + data_offset + stuff_tile*256, 16); - wmma::load_matrix_sync(d_frag, inr + data_offset + stuff_tile*256, 16); - wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag); - wmma::load_matrix_sync(c_frag, ini + data_offset + stuff_tile*256, 16); - wmma::load_matrix_sync(d_frag, ini + data_offset + stuff_tile*256, 16); - wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag); - - } - if (stuffants!=1) { - - // loop over ant tiles - for (int ant_tile=0; ant_tile<4; ant_tile++) { - - // copy weight and data to fragments, and multiply to accumulators - - wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16); - wmma::load_matrix_sync(b_frag, inr + data_offset + ant_tile*256, 16); - wmma::mma_sync(wr_inr_frag, a_frag, b_frag, wr_inr_frag); - - wmma::load_matrix_sync(a_frag, wi + weight_offset + beam_tile*1024 + 
ant_tile*256, 16); - wmma::mma_sync(wi_inr_frag, a_frag, b_frag, wi_inr_frag); - - wmma::load_matrix_sync(b_frag, ini + data_offset + ant_tile*256, 16); - wmma::mma_sync(wi_ini_frag, a_frag, b_frag, wi_ini_frag); - - wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16); - wmma::mma_sync(wr_ini_frag, a_frag, b_frag, wr_ini_frag); - - } - - // form real and imaginary matrices - for(int i=0; i < wr_inr_frag.num_elements; i++) { - wr_inr_frag.x[i] = wr_inr_frag.x[i] - wi_ini_frag.x[i]; // output real - wi_inr_frag.x[i] = wi_inr_frag.x[i] + wr_ini_frag.x[i]; // output imag - wr_inr_frag.x[i] = wr_inr_frag.x[i]*wr_inr_frag.x[i] + wi_inr_frag.x[i]*wi_inr_frag.x[i]; // squared - } - } - - // at this stage the matrices are [beam, chunnel], and need to be summed over columns - - __syncthreads(); - - // copy back to shared mem - half *p1; - float *p2, tmp; - p1 = &summr[0][0]; - wmma::store_matrix_sync(p1, wr_inr_frag, 16, wmma::mem_row_major); - - __syncthreads(); - - if (stuffants!=1) { - - // now do thread reduction using multiplication by unity - wmma::fill_fragment(final_frag, 0.0f); - wmma::fill_fragment(b_frag, 1.0f); - wmma::load_matrix_sync(a_frag, p1, 16); - wmma::mma_sync(final_frag, a_frag, b_frag, final_frag); - p2 = &summi[0][0]; - wmma::store_matrix_sync(p2, final_frag, 16, wmma::mem_row_major); - - __syncthreads(); - - // store - if (tidx<16) { - output[(beam_tile*16+tidx)*1536 + oidx] = summi[tidx][tidx]; - } - - - } - - if (stuffants==1) { - if (tidx<16) { - output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][tidx]; - } - } - if (stuffants==2) { - - p2 = &summi[0][0]; - wmma::store_matrix_sync(p2, ib_frag, 16, wmma::mem_row_major); - tmp = 0.; - for (int i=0;i<16;i++) tmp += summi[i][i]; - if (tidx==0 && beam_tile==0) - output[(beam_tile*16+tidx)*1536 + oidx] = tmp; - - } - -} - -// kernel to calculate weights - needed because weights are halfs -// launch with 256 threads in 6144 blocks -__global__ -void calc_weights(float 
*antpos, float *weights, float *freqs, half *wr, half *wi) { - - // assume 256 threads in 6144 blocks - int bidx = blockIdx.x; // over 48f, 2pol, 16 beam_tile, 4 ant_tile - int tidx = threadIdx.x; - int f = (int)(bidx / 128); - int cc = (int)(bidx % 128); - int pol = (int)(cc / 64); - cc = (int)(cc % 64); - int beam_tile = (int)(cc / 4); - int ant_tile = (int)(cc % 4); - int beam_i = (int)(tidx / 16); - int ant_i = (int)(tidx % 16); - - int beam = beam_tile*16+beam_i; - int ant = ant_tile*16+ant_i; - int i = bidx*256+tidx; - int widx = ant*NW*2*2 + f*2*2 + pol*2; - - float theta = sep*(127.-beam*1.)*PI/10800.; // radians - float afac = -2.*PI*freqs[f*8+4]*theta/CVAC; // factor for rotate - float twr = cos(afac*antpos[ant]); - float twi = sin(afac*antpos[ant]); - - wr[i] = __float2half((twr*weights[widx] - twi*weights[widx+1])); - wi[i] = __float2half((twi*weights[widx] + twr*weights[widx+1])); - - -} - - -// function prototypes -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); -int dada_bind_thread_to_core (int core); -int init_weights(char *fnam, float *antpos, float *weights, char *flagants); -void reorder_block(char *block); -void calc_bp(float *data, float *bp, int pr); - - -// performs massive summation to calculate bp -// input array has order [beam, 96 frequency, 16 time] -// bp has size 48 - no way to avoid strided memory access -// returns factor to correct data -void calc_bp(float *data, float *bp, int pr) { - - int i=0; - - for (int b=0;b<256;b++) { - for (int f=0;f<48;f++) { - for (int a=0;a<32;a++) { - bp[b] += data[i]; - if (pr && data[i]!=0.) 
printf("%d %d %d %f\n",b,f,a,data[i]); - i++; - } - } - } - -} - -// for finding median of bandpass - -int cmpfunc(const void* elem1, const void* elem2) -{ - if(*(const float*)elem1 < *(const float*)elem2) - return -1; - return *(const float*)elem1 > *(const float*)elem2; -} - -void ret_med_bp(float *bp) { - - qsort(bp, 256, sizeof(float), cmpfunc); - float medval = 0.5*(bp[127]+bp[128]); - for (int i=0;i<256;i++) - bp[i] = medval; - -} - -// performs cpu reorder of block to be loaded to GPU -void reorder_block(char * block) { - - // from [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i] - // to [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i] - // 24576*NANT in total. 1536*NANT per time - - char * output = (char *)malloc(sizeof(char)*24576*NANT); - - for (int i=0;i<16;i++) { // over time - for (int j=0;j= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - if 
(ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); - uint64_t bytes_read = 0; - int nints = NPACKETS / 16; - uint64_t nbytes_per_int = block_size / nints; - uint64_t nbytes_per_out = block_out / nints; - char * block; - unsigned char * output_buffer; - output_buffer = (unsigned char *)malloc(sizeof(unsigned char)*block_out); - memset(output_buffer,0,block_out); - uint64_t written, block_id; - - // allocate host and device memory for calculations - //inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag - //wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant] - char *d_indata[NSTREAMS]; - unsigned char *d_outdata[NSTREAMS]; - float *d_transfer[NSTREAMS], *d_bp, *d_antpos, *d_weights, *d_freqs; - half *d_wr, *d_wi, *d_inr[NSTREAMS], *d_ini[NSTREAMS]; - cudaMalloc((void **)&d_antpos, 64*sizeof(float)); // ant positions - cudaMalloc((void **)&d_weights, 64*NW*2*2*sizeof(float)); // weights - cudaMalloc((void **)&d_freqs, 
384*sizeof(float)); // freqs - cudaMalloc((void **)&d_bp, 256*sizeof(float)); // bandpass - cudaMalloc((void **)&d_wr, 48*2*16*4*16*16*sizeof(half)); // real weight - cudaMalloc((void **)&d_wi, 48*2*16*4*16*16*sizeof(half)); // imag weight - cudaMemcpy(d_freqs, freqs, 384*sizeof(float), cudaMemcpyHostToDevice); - - float *h_transfer = (float *)malloc(sizeof(float)*256*96*16*NSTREAMS); - char *h_indata = (char *)malloc(sizeof(char)*16*NANT*96*8*2); - float *bp = (float *)malloc(sizeof(float)*256); - unsigned char *tmp_buf = (unsigned char *)malloc(sizeof(unsigned char)*256*48*4*NSTREAMS); - - // streams and device - cudaStream_t stream[NSTREAMS]; - for (int st=0;st d1(d_inr[st]); - thrust::fill(d1, d1+16*48*2*64*16, 0.0); - thrust::device_ptr d2(d_ini[st]); - thrust::fill(d2, d2+16*48*2*64*16, 0.0); - } - - - - // set up - - int observation_complete=0; - int blocks = 0, started = 0; - int blockct = 0; - - syslog(LOG_INFO, "starting observation"); - - while (!observation_complete) { - - // open block - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - blockct ++; - - // DO STUFF - - // calc weights - init_weights(fnam,antpos,weights,flagants); - cudaMemcpy(d_antpos, antpos, 64*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(d_weights, weights, 64*NW*2*2*sizeof(float), cudaMemcpyHostToDevice); - calc_weights<<<6144, 256>>>(d_antpos, d_weights, d_freqs, d_wr, d_wi); - if (DEBUG) syslog(LOG_INFO,"Finished with weights"); - - if (started==1) { - - // loop over ints - for (int bst=0;bst>>(d_indata[st], d_inr[st], d_ini[st]); - - // run beamformer kernel - beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants); - - // run adder kernel - adder<<<12288, 32, 0, stream[st]>>>(d_transfer[st], d_outdata[st], d_bp); - - // copy to host - cudaMemcpyAsync(tmp_buf + 256*48*4*st, d_outdata[st], 256*48*4*sizeof(unsigned char), cudaMemcpyDeviceToHost, stream[st]); - - // copy to output - for (int 
j=0;j<12288*4;j++) { - if (test_pattern) - output_buffer[(bst*NSTREAMS+st)*12288*4+j] = (unsigned char)((blockct % 128) + 32); - else - output_buffer[(bst*NSTREAMS+st)*12288*4+j] = tmp_buf[j+256*48*4*st]; - } - if (DEBUG && bst*NSTREAMS+st==10) { - for (int j=0;j<48;j++) syslog(LOG_DEBUG,"%hu",output_buffer[(bst*NSTREAMS+st)*12288+BEAM_OUT*48+j]); - } - - } - } - - - } - - if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - - // calculate bandpass - - for (int i=0;i<256;i++) bp[i] = 0.; - - // do standard bf but calculate bandpass - - // loop over ints - for (int bst=0;bst>>(d_indata[st], d_inr[st], d_ini[st]); - - //if (bst==0 && st==0) - // printer<<<3072, 32>>>(d_inr,d_ini); - - // run beamformer kernel - beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants); - - // copy back to host - cudaMemcpyAsync(h_transfer + st*256*96*16, d_transfer[st], sizeof(float)*393216, cudaMemcpyDeviceToHost, stream[st]); - - // calculate bandpass - //if (st==0 && bst==0) - //calc_bp(h_transfer,bp,1); - calc_bp(h_transfer + st*256*96*16,bp,0); - ret_med_bp(bp); - - } - } - - // adjust bandpass - syslog(LOG_INFO,"Final BP..."); - for (int i=0;i<256;i++) { - syslog(LOG_INFO,"coeff %d %g",i,bp[i]); - if (bp[i]!=0.) 
{ - bp[i] /= 48.*nints; - bp[i] = 2.5*128./bp[i]; - } - } - cudaMemcpy(d_bp, bp, sizeof(float)*256, cudaMemcpyHostToDevice); - - // junk into output - memset(output_buffer,0,block_out); - - } - - // write output for debug - - // write to output - written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out); - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - if (DEBUG) { - syslog(LOG_DEBUG, "written block %d",blocks); - } - blocks++; - - - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - for (int st=0;st -#include -using std::cout; -using std::cerr; -using std::endl; -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include "cuda_fp16.h" -//#include "dada_cuda.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "multilog.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_def.h" -#include -#include - -#include -using namespace nvcuda; - -#define sep 1.0 - -// global variables -int DEBUG = 0; - - -// kernel for summing and requantizing -// input array has order [beam, 48 frequency, 2 pol, 16 time] -// need to output to [4 time, beam, 48 frequency] -// bp is scale factor for each beam -// run with 256*48=12288 blocks and 32 threads -__global__ -void adder(float *input, unsigned char *output, float *bp) { - - // get block and thread ids - int bidx = blockIdx.x; // assume 256*48=12288 - int tidx = threadIdx.x; // assume 32 - //int fidx = 2*(bidx % 24); - int beamidx = (int)(bidx / 48); - - // declare shared mem - volatile __shared__ float data[32]; // data block to be summed - - // transfer from input to shared mem - data[tidx] = input[bidx*32+tidx]; - - // sync - 
__syncthreads(); - - // complete sum - if (tidx<16) { - data[tidx] += data[tidx+16]; // over pols - data[tidx] += data[tidx+2]; - data[tidx] += data[tidx+1]; - } - // now tidx = 0, 4, 8, 12 are what we want! - - __syncthreads(); - - // store - if (tidx == 0) - output[bidx] = (unsigned char)(__float2int_rn(data[0]*bp[beamidx])/2); - if (tidx == 4) - output[bidx + 12288] = (unsigned char)(__float2int_rn(data[4]*bp[beamidx])/2); - if (tidx == 8) - output[bidx + 2*12288] = (unsigned char)(__float2int_rn(data[8]*bp[beamidx])/2); - if (tidx == 12) - output[bidx + 3*12288] = (unsigned char)(__float2int_rn(data[12]*bp[beamidx])/2); - - /*if (tidx == 0) - output[bidx] = (unsigned char)(__float2int_rn(data[0])); - if (tidx == 4) - output[bidx + 12288] = (unsigned char)(__float2int_rn(data[4])); - if (tidx == 8) - output[bidx + 2*12288] = (unsigned char)(__float2int_rn(data[8])); - if (tidx == 12) - output[bidx + 3*12288] = (unsigned char)(__float2int_rn(data[12]));*/ - -} - -// kernel for promotion -/* -orig input is [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i] -input is [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i] -output needs to be [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] -promoted to half precision - -launch with 16*48*NANT blocks of 32 threads - - */ -__global__ void promoter(char *input, half *inr, half *ini) { - - int bidx = blockIdx.x; // assume 16*48*NANT - int tidx = threadIdx.x; // assume 32 - int iidx = bidx*32+tidx; - int pol = (int)(tidx % 2); - int chunnel = (int)(tidx / 2); - - /*int ant = (int)(bidx % NANT); - int time_chan = (int)(bidx / NANT); - int oidx = time_chan*2048+pol*1024+ant*16+chunnel;*/ - - int chan = (int)(bidx % 48); - int time_ant = (int)(bidx / 48); - int tim = (int)(time_ant / NANT); - int ant = (int)(time_ant % NANT); - int oidx = tim*98304 + chan*2048 + pol*1024 + ant*16 + chunnel; - - //inr[oidx] = __float2half((float)(((char)((input[iidx] & 15) << 4)) >> 4)); - //ini[oidx] = 
__float2half((float)(((char)((input[iidx] & 240))) >> 4)); - inr[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(15)) << 4) >> 4)); - ini[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4)); - -} - -// 16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels -// for first time, launch with 3072, 32 -__global__ void printer(half *inr, half *ini) { - - int idx = blockIdx.x*32+threadIdx.x; - float ir = __half2float(inr[idx]); - float ii = __half2float(ini[idx]); - - int chunnel = (int)(threadIdx.x % 16); - int channel = (int)(blockIdx.x/64); - int tt = (int)(blockIdx.x % 64); - int pol = (int)(tt/32); - int ant = ((int)(tt % 32))*((int)(threadIdx.x / 16)); - - if (ir!=0. || ii!=0.) { - printf("%d %d %d %d %f %f\n",channel,pol,ant,chunnel,ir,ii); - } - -} - - -// kernel for beamforming -/* - -Assumes that up to NANT antennas (nominally 63) are populated. - -Input is [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] (promoted) - -Arithmetic... for rotation, d2r = wr*dr-wi*di; d2i = wi*dr+wr*di - -Conventions for beamforming. beam 0 is furthest East, beam 127 is at meridian. antpos (D) is easting. -for bf weight calculation, where theta = s(127-n), ang = 2*pi*nu*theta*D/c; wr = cos(ang), wi = sin(ang) -use __float2int_rn, cosf, sinf intrinsics. - -Each warp (==block) has to deal with 256 beams for 64 ants, summing over 16 chunnels and pols. 
-Do it in tiles of 16 beams and 16 ants for - -Output array has order [beam, 48 frequency, 2 pol, 16 time] - -inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag -wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant] - -launch with 16time * 48freq * 2pol * 16beam_tile blocks of 32 threads for massive utilization - = 24576 blocks - -*/ -__global__ void beamformer(half *inr, half *ini, half *wr, half *wi, float *output, int stuffants) { - - // get block and thread ids - int bidx = blockIdx.x; // assume 24576 - int tidx = threadIdx.x; // assume 32 - int orig_bidx = (int)(bidx / 16); - int beam_tile = (int)(bidx % 16); - int stuff_tile = (int)(beam_tile % 4); - int data_offset = orig_bidx*1024; // offset for first part of data - int weight_offset = (int)(orig_bidx % 96); // offset for first part of weight - weight_offset *= 16384; - int idx1, idx2; - int f_idx = (int)(orig_bidx % 96); - int tim_idx = (int)(orig_bidx / 96); - int oidx = f_idx*16 + tim_idx; - - // shared memory for convenience - __shared__ half summr[16][16]; // beam, chunnel - __shared__ float summi[16][16]; // beam, chunnel - - // accumulate real and imag parts into [16 beam x 16 f] fragments - // Declare the fragments. 
- wmma::fragment a_frag; - wmma::fragment b_frag; - wmma::fragment wr_inr_frag; - wmma::fragment wr_ini_frag; - wmma::fragment wi_inr_frag; - wmma::fragment wi_ini_frag; - wmma::fragment ib_frag; - wmma::fragment final_frag; - - - // zero out accumulators - wmma::fill_fragment(wr_inr_frag, 0.0f); - wmma::fill_fragment(wr_ini_frag, 0.0f); - wmma::fill_fragment(wi_inr_frag, 0.0f); - wmma::fill_fragment(wi_ini_frag, 0.0f); - wmma::fill_fragment(ib_frag, 0.0f); - - // IB - if (stuffants==2) { - - wmma::fragment c_frag; - wmma::fragment d_frag; - - for (int ant_tile=0; ant_tile<4; ant_tile++) { - - wmma::load_matrix_sync(c_frag, inr + data_offset + ant_tile*256, 16); - wmma::load_matrix_sync(d_frag, inr + data_offset + ant_tile*256, 16); - wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag); - wmma::load_matrix_sync(c_frag, ini + data_offset + ant_tile*256, 16); - wmma::load_matrix_sync(d_frag, ini + data_offset + ant_tile*256, 16); - wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag); - - } - - } - - // one ant per beam - if (stuffants==1) { - - wmma::fragment c_frag; - wmma::fragment d_frag; - wmma::load_matrix_sync(c_frag, inr + data_offset + stuff_tile*256, 16); - wmma::load_matrix_sync(d_frag, inr + data_offset + stuff_tile*256, 16); - wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag); - wmma::load_matrix_sync(c_frag, ini + data_offset + stuff_tile*256, 16); - wmma::load_matrix_sync(d_frag, ini + data_offset + stuff_tile*256, 16); - wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag); - - } - if (stuffants!=1) { - - // loop over ant tiles - for (int ant_tile=0; ant_tile<4; ant_tile++) { - - // copy weight and data to fragments, and multiply to accumulators - - wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16); - wmma::load_matrix_sync(b_frag, inr + data_offset + ant_tile*256, 16); - wmma::mma_sync(wr_inr_frag, a_frag, b_frag, wr_inr_frag); - - wmma::load_matrix_sync(a_frag, wi + weight_offset + beam_tile*1024 + 
ant_tile*256, 16); - wmma::mma_sync(wi_inr_frag, a_frag, b_frag, wi_inr_frag); - - wmma::load_matrix_sync(b_frag, ini + data_offset + ant_tile*256, 16); - wmma::mma_sync(wi_ini_frag, a_frag, b_frag, wi_ini_frag); - - wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16); - wmma::mma_sync(wr_ini_frag, a_frag, b_frag, wr_ini_frag); - - } - - // form real and imaginary matrices - for(int i=0; i < wr_inr_frag.num_elements; i++) { - wr_inr_frag.x[i] = wr_inr_frag.x[i] - wi_ini_frag.x[i]; // output real - wi_inr_frag.x[i] = wi_inr_frag.x[i] + wr_ini_frag.x[i]; // output imag - wr_inr_frag.x[i] = wr_inr_frag.x[i]*wr_inr_frag.x[i] + wi_inr_frag.x[i]*wi_inr_frag.x[i]; // squared - } - } - - // at this stage the matrices are [beam, chunnel], and need to be summed over columns - - __syncthreads(); - - // copy back to shared mem - half *p1; - float *p2, tmp; - p1 = &summr[0][0]; - wmma::store_matrix_sync(p1, wr_inr_frag, 16, wmma::mem_row_major); - - __syncthreads(); - - if (stuffants!=1) { - - // now do thread reduction using multiplication by unity - wmma::fill_fragment(final_frag, 0.0f); - wmma::fill_fragment(b_frag, 1.0f); - wmma::load_matrix_sync(a_frag, p1, 16); - wmma::mma_sync(final_frag, a_frag, b_frag, final_frag); - p2 = &summi[0][0]; - wmma::store_matrix_sync(p2, final_frag, 16, wmma::mem_row_major); - - __syncthreads(); - - // store - if (tidx<16) { - output[(beam_tile*16+tidx)*1536 + oidx] = summi[tidx][tidx]; - } - - - // do thread reduction for each beam - /* if (tidx<8) { - for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+8]; - for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+4]; - for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+2]; - for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+1]; - } - if (tidx>=8 && tidx<16) { - for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+8-8]; - for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+4-8]; - for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+2-8]; 
- for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+1-8]; - } - if (tidx>=16 && tidx<24) { - for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+8-16]; - for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+4-16]; - for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+2-16]; - for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+1-16]; - } - if (tidx>=24) { - for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+8-24]; - for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+4-24]; - for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+2-24]; - for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+1-24]; - }*/ - - /*if (tidx<16) - for (int j=1;j<16;j++) summr[tidx][0] += summr[tidx][j]; - - __syncthreads();*/ - - // now summr[beam][0] can go into output - /*if (tidx<16) { - output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][0]; - }*/ - - } - - if (stuffants==1) { - if (tidx<16) { - output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][tidx]; - } - } - if (stuffants==2) { - - p2 = &summi[0][0]; - wmma::store_matrix_sync(p2, ib_frag, 16, wmma::mem_row_major); - tmp = 0.; - for (int i=0;i<16;i++) tmp += summi[i][i]; - if (tidx==0 && beam_tile==0) - output[(beam_tile*16+tidx)*1536 + oidx] = tmp; - - } - -} - -// kernel to calculate weights - needed because weights are halfs -// launch with 256 threads in 6144 blocks -__global__ -void calc_weights(float *antpos, float *weights, float *freqs, half *wr, half *wi) { - - // assume 256 threads in 6144 blocks - int bidx = blockIdx.x; // over 48f, 2pol, 16 beam_tile, 4 ant_tile - int tidx = threadIdx.x; - int f = (int)(bidx / 128); - int cc = (int)(bidx % 128); - int pol = (int)(cc / 64); - cc = (int)(cc % 64); - int beam_tile = (int)(cc / 4); - int ant_tile = (int)(cc % 4); - int beam_i = (int)(tidx / 16); - int ant_i = (int)(tidx % 16); - - int beam = beam_tile*16+beam_i; - int ant = ant_tile*16+ant_i; - int i = bidx*256+tidx; - int widx = ant*NW*2*2 + f*2*2 + pol*2; - - 
//float theta = sep*(127.-beam*1.)*PI/10800.; // radians - float theta = sep*(127.-beam*1.)*PI/10800.; // radians - float afac = -2.*PI*freqs[f*8+4]*sinf(theta)/CVAC; // factor for rotate - float twr = cos(afac*antpos[ant]); - float twi = sin(afac*antpos[ant]); - - wr[i] = __float2half((twr*weights[widx] - twi*weights[widx+1])); - wi[i] = __float2half((twi*weights[widx] + twr*weights[widx+1])); - - -} - - -// function prototypes -int dada_bind_thread_to_core (int core); -int init_weights(char *fnam, float *antpos, float *weights, char *flagants); -void reorder_block(char *block); -void calc_bp(float *data, float *bp, int pr); - - -// performs massive summation to calculate bp -// input array has order [beam, 96 frequency, 16 time] -// bp has size 48 - no way to avoid strided memory access -// returns factor to correct data -void calc_bp(float *data, float *bp, int pr) { - - int i=0; - - for (int b=0;b<256;b++) { - for (int f=0;f<48;f++) { - for (int a=0;a<32;a++) { - bp[b] += data[i]; - if (pr && data[i]!=0.) printf("%d %d %d %f\n",b,f,a,data[i]); - i++; - } - } - } - -} - -// for finding median of bandpass - -int cmpfunc(const void* elem1, const void* elem2) -{ - if(*(const float*)elem1 < *(const float*)elem2) - return -1; - return *(const float*)elem1 > *(const float*)elem2; -} - -void ret_med_bp(float *bp) { - - qsort(bp, 256, sizeof(float), cmpfunc); - float medval = 0.5*(bp[127]+bp[128]); - for (int i=0;i<256;i++) - bp[i] = medval; - -} - -// performs cpu reorder of block to be loaded to GPU -void reorder_block(char * block) { - - // from [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i] - // to [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i] - // 24576*NANT in total. 
1536*NANT per time - - char * output = (char *)malloc(sizeof(char)*24576*NANT); - - for (int i=0;i<16;i++) { // over time - for (int j=0;j= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // get block sizes and allocate memory - uint64_t block_size = 198180864; - uint64_t block_out = 15*48*512*256; - char * block; - block = (char *)malloc(sizeof(char)*block_size); - syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); - int nints = NPACKETS / 16; - uint64_t nbytes_per_int = block_size / nints; - uint64_t nbytes_per_out = block_out / nints; - unsigned char * output_buffer; - output_buffer = (unsigned char *)malloc(sizeof(unsigned char)*block_out); - memset(output_buffer,0,block_out); - - // allocate host and device memory for calculations - //inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag - //wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant] - char *d_indata[NSTREAMS]; - unsigned char *d_outdata[NSTREAMS]; - float *d_transfer[NSTREAMS], *d_bp, *d_antpos, *d_weights, *d_freqs; - half *d_wr, *d_wi, *d_inr[NSTREAMS], *d_ini[NSTREAMS]; - cudaMalloc((void **)&d_antpos, 64*sizeof(float)); // ant positions - cudaMalloc((void **)&d_weights, 64*NW*2*2*sizeof(float)); // weights - cudaMalloc((void **)&d_freqs, 384*sizeof(float)); // freqs - cudaMalloc((void **)&d_bp, 256*sizeof(float)); // bandpass - cudaMalloc((void **)&d_wr, 48*2*16*4*16*16*sizeof(half)); // real weight - cudaMalloc((void **)&d_wi, 48*2*16*4*16*16*sizeof(half)); // imag weight - cudaMemcpy(d_freqs, freqs, 384*sizeof(float), cudaMemcpyHostToDevice); - - float *h_transfer = (float *)malloc(sizeof(float)*256*96*16*NSTREAMS); - char *h_indata = (char *)malloc(sizeof(char)*16*NANT*96*8*2); - float *bp = (float *)malloc(sizeof(float)*256); - unsigned char *tmp_buf = (unsigned char 
*)malloc(sizeof(unsigned char)*256*48*4*NSTREAMS); - - // streams and device - cudaStream_t stream[NSTREAMS]; - for (int st=0;st d1(d_inr[st]); - thrust::fill(d1, d1+16*48*2*64*16, 0.0); - thrust::device_ptr d2(d_ini[st]); - thrust::fill(d2, d2+16*48*2*64*16, 0.0); - } - - - // set up - - int observation_complete=0; - int blocks = 0, started = 0; - int blockct = 0; - - syslog(LOG_INFO, "starting observation"); - - // init weights - init_weights(fnam,antpos,weights,flagants); - cudaMemcpy(d_antpos, antpos, 64*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(d_weights, weights, 64*NW*2*2*sizeof(float), cudaMemcpyHostToDevice); - calc_weights<<<6144, 256>>>(d_antpos, d_weights, d_freqs, d_wr, d_wi); - if (DEBUG) syslog(LOG_INFO,"Finished with weights"); - - // open data file and read first block - FILE *fin; - fin=fopen(finnam,"rb"); - fread(block,sizeof(char),block_size,fin); - fclose(fin); - - // calculate bp - for (int i=0;i<256;i++) bp[i] = 0.; - - // loop over ints - for (int bst=0;bst>>(d_indata[st], d_inr[st], d_ini[st]); - - // run beamformer kernel - beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants); - - // copy back to host - cudaMemcpyAsync(h_transfer + st*256*96*16, d_transfer[st], sizeof(float)*393216, cudaMemcpyDeviceToHost, stream[st]); - - calc_bp(h_transfer + st*256*96*16,bp,0); - ret_med_bp(bp); - - } - } - - - // adjust bandpass - syslog(LOG_INFO,"Final BP..."); - for (int i=0;i<256;i++) { - //syslog(LOG_INFO,"coeff %d %g",i,bp[i]); - if (bp[i]!=0.) 
{ - bp[i] /= 48.*nints; - bp[i] = 2.5*128./bp[i]; - } - } - cudaMemcpy(d_bp, bp, sizeof(float)*256, cudaMemcpyHostToDevice); - - // open data file and read first block - fin=fopen(finnam,"rb"); - - // re-open file and loop over blocks - while (blocks<15) { - - syslog(LOG_INFO,"read blocks %d",blocks); - fread(block,sizeof(char),block_size,fin); - - // loop over ints - for (int bst=0;bst>>(d_indata[st], d_inr[st], d_ini[st]); - - // run beamformer kernel - beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants); - - // run adder kernel - adder<<<12288, 32, 0, stream[st]>>>(d_transfer[st], d_outdata[st], d_bp); - - // copy to host - cudaMemcpyAsync(tmp_buf + 256*48*4*st, d_outdata[st], 256*48*4*sizeof(unsigned char), cudaMemcpyDeviceToHost, stream[st]); - - // copy to output - for (int jj=0;jj<4;jj++) { - for (int bmn=0;bmn<256;bmn++) { - for (int j=0;j<48;j++) { - output_buffer[blocks*512*48*256 + (bst*NSTREAMS+st)*48*4*256+ jj*48*256 + bmn*48 + j] = tmp_buf[256*48*4*st + jj*256*48 + bmn*48 + j]; - } - } - } - - } - } - - blocks++; - - } - - syslog(LOG_INFO,"blocks %d",blocks); - - fclose(fin); - - float pwrs = 0; - if (!outpwr) { - fin=fopen("/home/ubuntu/data/tmp/output.dat","wb"); - for (int i=0;i<8192;i++) - fwrite(output_buffer + i*48*256 + outbm*48,sizeof(unsigned char),48,fin); - fclose(fin); - } - else { - fin=fopen("/home/ubuntu/data/tmp/output.dat","w"); - for (int i=0;i<15*512;i++) { - for (int j=0;j<256;j++) { - pwrs = 0.; - for (int k=0;k<48;k++) pwrs += (float)(output_buffer[i*256*48 + j*48 + k]); - fprintf(fin,"%f\n",pwrs); - } - } - fclose(fin); - } - - - - for (int st=0;stbL^AqF6`^PVuym!vM=iGD7 zJ@?#mmlX89~k$-)@nUC}kF#iO6Lw$$f?_l34zFvS+ z1db&v3&cG;$qYaGnJPga8%)OU&>j*ov`0X^=;uI*_UWf$BV=QHv7dl?mCq!_P(OXX zbWu)v%KE@C+28}irV6}%+KTB+Euf-fZPK+Wx>iM}pXo||{nX^tjlW}5`IsI7K!gK) z`0~Wx*pqenQ~2-DG`_bUEeTtXzF6S(bCQx%KQ-Nzprf4M_a{wN9HrVzWS4PDe*M(_ 
zaMtWuQ_nm1tl87fm_2K5Wz`u~L(eMu*c|EK6kuloN;z<+51y=Euie|ZA_1qt|PC8&R9g8uzy0(rs- z`0q^6k9Gn+gA?@E>ID3Y6YyD*06#AQpMDAC{AmI{7bmFSkiZ`fB&eU9p#Izh?H-?? zU!F_g54{uUJ1hbJ)CBlF3FMrgfd9P-je0pB=F~>67WAff&5ts z@XsaS^UnnB-kQJ;rzha^%LID;0({c(+x_RO1b+1dDtg74-U*7j|Ex~HXJ7(8S0|`{W&-?Q65z)t z(D$MQ_Ph~-dyTKl67VTY;Q#j|(CeZE_0K}P>D`S6cyIzaMrSttfVTZ$g#YwpWe*xsz66~ z09F5X)ox}`)<-q*Gfa)I^pgV$BOgIkzgY3nawb7e%70-uKEF`)0}8L}Z-bod_m0eX z{_7P!>(qF>59@-G%4t)|LtG1FXP=qnE6N`;vS?aqdFhN<6`|7dF(Zf1o;SC2%#^9K zOMOK}Gv>^jTT~GOK~a&fXvF-YQL5hX*;6ViN-HF}Co$U@9V#!KGAFOJB2+$ap^unL z%1cYd%cZ5Ef+-cDV0rnxa)B5*W%lfOCE`_*4F_$ALXb|kdr9TolF+Pqb49(p()qJW zO2H{N6e^!JwGy>ix3IMIy4-HuQAyU8ZI#X`DO;#mkDaUKzhp}2;#qU2lq*rul*zk7 z$`;4SWjlOc<=l{BSU9h2IFuPaZ|?M2Gb+ocxbmX5&=>m6nma?Va~Twt&nqh}56z-Y z<21f%N_jVoD`s%+G~Hxn#ms}-jA!JMiH4;$MlYOOGP8W%+*vocy&@#Me99b&D3}5( z%)G3$d~WG%!*sb7Afz{yg=V6grqP|?OGTyS<#Xp1p&zBvvn$XQtX2`4R&v^DtaYBR zXu_DQF3-+_a8*@Prp}t5oqayu=}$$Wg=M8hv*u2p7o=WQQ;JGp$D+!)rB!96C81Ji zJK^%tp`o+pRTNE~Qc-$t{@huiyzJn7)SgjQRRjZ8%$qx9_N>stqWKV~dqZcV(TZ6f zZB3XqyO2!S>g?BJ-&AUt)gs>;sJJ{ua$D#75QFXWeu4rOPT%$!nQ6e^!G zD^!tR5*$4Re}b?Cb?(M3?`)woE2JFMEe=|4>SJ$VjwZHhW4*X>iUQ7<|r;6CnqDrj(VzPC>~; zbhGGEFofSkj#6{8^1seR%M@dB)X)6|1pjapgt)Usqpdair zzhbUZyQpZoX^hacd6gk@WcZ(6GJ78UdX6g-2EC8;Gv@fF-cVXTj{=I2G?gKZ#+y<; z1Fc}XEccfrnY$ zpG*Uf4s?IA47|?ySvb_dvu*toFz~dI{;>?a>RM4$XyBEJ1%8}?M+dk+#Rk5Qi^chn zfro>-KV=5KuZ#8hsti0F#{H=_@Q1rtpKqyw_Z#?n15ba^Kg$e!KaIq5m4P?cvkw?} zuGRI=qXwR57W${bz#pZNcs3dMqYZqEf&YntZ#D3Gt;gc62L2eMew%@>Fz~j4Ki0r^ z82IB1e5ZleXKbQ4H|Dp;8}(BS{7(&hnt>l=;0GA^bOWDm;7>5{nFjtu1D|E!PcraB z4Sa@y4;c894ZLOGPciU?2L5LTew=~NH1Nd+ez1X`Y2Z&a@MQ-6Gy`8{;7>R3)dv0y z1HaV3pK0Lh4LtWs^v^N_pQVv_t}^i12L1s9f3|^t)WDx(;2R7)XKww|WZ;KrB%Un> z{yYQUYT(Z|@LLW1Py^p);BySTZQy@y;5!Wb1qQy;zz;L0h#20q=uUu@tr4g7EepJm|l4E#_7A2jd*1AmEuw+#HH2ENe1TLyldfzLPa z#Rh(afuCvMFEj9E2Hre)tupWi-BKY;9Cv+)dqg6f!BM3EN(OK9LM4wgL z6iTsqpk{?PrEZj!x`y10qOOx`@Yg>mfH%TILZ__@KPTTnm?@;wD&TU$JqR}m_*%j} z2|p^}sf3v-I;#XcfiP1;Q!U^TgqadLWdgpKa4O+q0sovZQ$VLsz-JR? 
z>gNOmd@5n4d`^~tPbAD#&q){Xv4ol8IcWkuf-qA%$0y)EgqhMg9p3gF^F_#?th*_=lO{2pPZYR)PFzeSiSno}>}R|qpTbE*aWEMcZ( zPMLr=5N0ao6brbKFjFw6P{5B9W{Twm1pFXjrdCdtfd5RGDV38h;6D;(D&?dJ_zuEM zp&Xxpe?^$7lhg4v^}m^LI^i|}-$0nDlG7^Sa>6GPZW8ddgij*;sDP&uW=iC&67U4V zOog0!0gobl3gK!2k08v{$0-x=#e_2n7Yq33gqi9%g#tdCFjE{SAmCF8GqrKD1biZ4 zrZi5vfR822RK`gY@DYTW!Z6fUhMyobaOpo=TXhg0o7%69_X!aOwp-itr_bs|7rQFjE4j zOu!ctwg?vs_~(R~0yu>NKAZ3e!T|xFN|;l=lO^C233IA<(gl1hVNUT*nt+cW%&Fb+ z3AhhoPU%j^m!kg(b1HY*1pMVOfH{RbtpeUlm{YgYB;bz-bINue74Un6IaNEW1pF3Z zPSH-ifL|fZsoAL(@Uw(D^*Ut&-awdBu2U@FM#7wGok9UWPIv<0fPfz)%qiE&67Zi1 zbE)_>@UIASN_9H+i~c8EOt?+JHxT9&>a+^DoG_h6!6)E zuO%E1@Z(3F=$l>-Sp1A$*MJ*~_GjqYYkjNV?V0PlR{Up0nuFQTSdnLQ8=z#@+EqmX zpB33?MN;Sw_Ujl5m=)7kxAw{Pp(2S;Sw40un&RSs&;FaP*>Gs$nr%~1bBV5L^-Wwe zP>7mW>zZZXQnPv#YM!BM9)M;E<-7tld+M4;Bqy%fG#WLx!B#@|)4ru<*_EjIh_1Oz z$l0WOHk_p*81QJ)S5VM^@9m2{gsv0qtdw5&8XrE zJ3(p4=I&>myHN*n?@GYuFRZf=btrdIiZnoXx$k48F_egM|1?3Zds*vD)`Hv@B&hUf zRyqWgVDE7W>fFOR|NdOaJ>RXPdQ5iC+P|^Rdenj3R&-d`(I@)s6Jgnj>#8pg_?!i= z`;Z(~bAaVuoqJUy-*U(1j>(UFI(GD0=3bGn*ZmQ^v$tDIc2^;}n`T9mPv#_UMQSpb z?PWLFw}YYmJQ`_y5LF}V*V1LOo7OQmv_D0mRktR?CU7ePQ(p8=0R(Gbxp6K_>U*M@ zckExHVq{x(Q{!eZ5iHh$y)V2m$$nJuxJ}eVg*W3W*ms~bdRs;ZnnR%zUs#F4oE4(M zo9!<3~_9GzWHsD%;uh}Ha;hG*%s!>)Y#3OYwnq%rXr< zz#=a)JHy)NpU^*Jx06j%R+apl4 zj(f^>I#gNbYhowoWfa=|1c*8W$JspqT9FM_BsnAeo*#0dNo)N_J*+R@3cuIWU(*vg zE2W+AZy&_Zd=@Hng}+SN>mT2;*FRxDtTNPJ^AZwWRk%I;Wskl7Nr#<{`l`}4R2rA; zuXzXQy{ffq^%puI^@>v);6T2$)hLdHhIsQ2r9=GhYffC*_Bm(tz7M-9M=X`1RBMWzT&)J=g!uR$w=Q;u$<&obI2{gy##O zn)D1xGl_j3*k}Alug~=l4s_(@yi~q2sblabIdAxD8X!MR29nH=V8)YAgAi8q!VJ70 z|1-XqwC?qv+7#Xoi>4j++54jQ=P=wR{BDGSY*22-I}OwU1D*Say27N;H1f1c_M&ia zxT%Nel>M;#sT&N^4wP=gV{%L`1708Exw9l3dII{C2&17(|V9;eB zD;W5fo;QHBUG`UNN$X!)MzXK|CE8^a$d7Cy*=#}fq5VFdtkxx({LtQ{$Z`PZM{i8C z>Ym8pNU{)G_~eiM*^NNdHX%7_{1^1}*Sv?tB5T{ft`-sp4yPK=3+zp4_6cR8zQ3jx zTrEUR!7=g<1zEedt1CZxQR zwm^_roIy97wU5o+m}>tzN|A3$6&Y4d#wzM{gTO695QuDMgqUg1Bqm~8=;zMvtahPk zE3(y!>=7+k>+RmOQmV3&RrgQ^uhNrIrd78hqY!{S=o2}Kq@&ha>YE?gY=5~&A~OZD 
z3DsVc7w=gBI4WMp%Gn+1_E8O3&g2v89DaIJ6E zPki~|O{w<2kBQi`)&~pZhc~C%?+L^gaLEBcY)Q3W27>nXL0gM8ngn8}LTpU6A0Z-I z*(t_4rsO6Fh}m#}&mMvWki80R+Lz(a+E?JXaKA1ro2+PsNBtDmFJt}VjQTI3enAxC z=Z80>7UYb{NcY#Cia8kbQwKQ4d~Iapn}R5;P!Oe6qA)}Df9%IPC>{I(^JW%2PS`9^ z5Vu#WgrKdlu?Mm3~pJRAqQA!D6Jr} z*NVIoZc3uP?H=h=ZcPTAP4zr^Aw@|Qez7Kl9;Zs0D9z{g$1otgconB)6yo(hyyA_y zJfM0OAjaVJPPd?mgPJ64jWuMhO)AuTK-I(RGWBYo51ZCDRi050t;wLfTF86o$YexE z<#48OT73JIQxL+2`3^hLSKAdjx-liyG&TnIRnRE>b&|g(4CfYU0J_A|we0hS!Wi1w zO{=RZR$j)?bUgQ2e`AM~St-5&17Mn(jA~%5=nC;Hi2OU4{rQOKnse#!mq%;PkL{f4SlGX4i-pT0VT&&?|?Qw1av`-qE7)6CE_8f$shvzv0{E?J%>y& z>ZQh5bSe^iKeR#Au00J1w$0EoB6{b!S$NV_*cBt9w+X35VhR|n*fVSrue$H>s{4e> zKZ;@t<9ZdJkvlUU5YO8r3}0*$<+o+n0uPR&L3A--uwuJ-yz2gE1IrYFXbA8s z!1f5CPGvDWoif4)DAYQx{pc8En+HcBBP;SmMm3%gZm9ro%P3pvnK9OMDqQSkDBCg}k!$^#x%9?+231CDmOx+aVt5q(1R zf*#6N-BUjM!>-k!Q`5x9kiq9h;8Kw+d%Iov5d}l;Xh$Z2s^*r#Z4Lf+jz`!du-rCpF2p#RKG zH}nYo9A!wvnc=3L+_;tNk84sM6we7wECbwzvXuJ;j+ya%NLR6o$02RmJ%3$s7BSD_k3Q!Ac~ z3}*wvXX7VWJ;tU^MDwNSEv_x8m9vsn-gK+LUzkvrqKXwg?3(xs!9R$t~L0cxdnV)^nd`vvx8Xa86>| zgAb}ef3e?(GENUvun$4i)i;8Q4(M}!j+6=Ov`HDrf2L&=V+sjo?<$C_$!J9>mR1|_ zXM4(>ZxWbi`v)WLih@~qe<-5~@0Zo(4T0@uXH*5VkrREz)Bt0p9z~q4it{2NBv`;y z1CtkN3r0T8#eyQbW=AU|z@)mP3}zDj!=TDqjm)!bKLVkpiZQT#sGW|&|G{j9kq$=S znF^jXAYME1Iw&)oAWhgorGMHPxbcZ{n4Axq9IpxT%X-OB%pUB+rG!;Sfa-e;M?ruE5}&Z1 z?sUNzu+*_GVNq!2Hgsic?L13(c7Yk&mrt z74}8mvM>1+3UW63Z{3P{KbkVP>jZ2dg^q<dj;5Jz&_!Z{b+3GaBO?$*A)!&Szl}mzw7tcJj5P?afRQ* z(BZb5!cFPeZ(Z8UUsH|3+`2ItN#XrHtddu9m*yRXoxF((+atWcXAFCu!uAaBPmW>n zJtKlca(I7A47(eKCf}6s{$4R`xx)4e@9!PMj#Sv*;r*#GY^K7dhW8&5!}e6zL&E#} z#IWz6_t{#X@cu(%SdKYj4-N0{8^bat5!*Ms|F9T#p28j$-hX%uyBk(!T^H5(2{y|b zcnL@BPvu5ha#wd9iVgY~S9hSKAgLL-+!~R{VLT(B0{BEm6YAroITu?pR=62^>{D0| zd+bxN|2_*l>Qk`SzCC5x3#>N-LRHJ+QYcAAi^r=Rt`fEj5e~F%yCH|V_0^MdU zgPBr%&TRl2_p*D%ovVSkbJRTuHD>ws8Ub#|$o9bwa|Bj_vH>pRb0XMaZcN8sG0tO& z5T^xp18U3iSuEej#&9C!a~{D3pTB^j3-RFW&lN|m|4D4>H69l? 
z^%S5G3t69i(RNN}62Ku>Qo#k1sFrF^$tG$x*1}yoQXfaUAR{ zZ7*lZ=9H5=Pw=62pIy&NkFe6cXu>&EusiJ7FHSgUjXVEs+RHgs)EKr547kQVg$nSP z$N4$>5!&6P3Vy-@o((%Eu&X2F{xEi<8^`kPUa>DdY?P{eP9xn7%k>jry~sxArw|BQ zog<1r5B-!{4Uk%01OqwWVAg9qj*KRd<e=l1~n z#;V~{eQOD>;|Jw=@6$OJOk?%4|C1ki`5G(oo)zBJQ8*^QxhV}dORVPgt9tO!{OB10 zpZ&8PU0t7IOC}Hg^apSpYn@S#hZXKT+=~2r{&5XZh-X%zev{X+psUT@znQ#|w@PE} zt!4RGYstrg!TN7{f`Jw3!1>NaR?-&hECqpaFjLTT%YF(m94-{4*ZdR|c8vrDCHbSzU^loVq1>kgswAdL)gzC6K*_M#R9|ll_Vn z`M^F0&Wa;ED-uMen2h@Y!89a@xCLN4Xw49kE(A-l5eLDnIv#OgLdJe#kV;p%N>`4=?hs`W{lu-V^yrw3%yq?Az%iNN=Ak;aT|ya!IKdic zMG7(kd1&A23S|u7Zvmt%v;X!Qmu>C$+Iwkx>c=iN^phb7D%Mdm9Gl@!2%3TE05VOd zqFD@Z)gWXoXkCDop?V=z$B_{a6;XlGvcdqU7*Hzasi(!KSR|=R>8Mnw3dgADI9+L( zswCL10$~3G-2)>H!U|AC+L)aRVJGV3+9?1#S?hRHjCLxdonW3&Mj2G9qDsZI(@fZC zD1W&x&%G?W`c+}7yY1%J>D<-8VKK9tvc%boY9_E341+yY9K_KR%9OD}Shv7VS=}9g zb*i+zge!Eno3=Xe!hZI{=tNlI1X@A*sTv}cqhJL}s>VJ2B@1>Kx=yYu=^U_wMLQJ3 z4l`+oLfT;*Y>>%c+Mya8?5bCU9hTec;5%Zp8smS-?iSREvj>fzz#c!+fAT<=>}3r0 z6D>yP5f;HS&WUI-nt_#gu<3f#DuqmDjRyeOQ_#z>%QE%|3KnIQNu7Gc4kn*AS48ZzDoYyVR@(W?7xHM(bCF3v%4>4Ec*aqab zbxYfLrV-hgANeT1ZZ&V0jf}KvKdbwf?}j8CqT@nYC)9l&62O3cpb{uoGq1Guu@FVU(&>D8u>ZWG64H4AT`h`)jU588#paB2FXiFD|qcMBX2XtwCn(c0bzA zEd&PlJ?sNuz>}!-}do{qhN1UE`ruMqm3$G>v+AZU2*Ohm%Bq-X3`ydfBI7U?4j8 zuS*WBZ|h|(%dbnWvFfnJ)r>~a$IaE9T&D0f{9&gxxD#i3SjJ>Gt#^9z(g5~}N}vcv z@?ZJ>uNSH59{UkIqbcKei%l`uv++@PU4q`k{bZCQa3jPGK=3eG0vEow zHP(YFZUMOfbi&8mH`>ebZsgMi913#P3|P88hB|Zv0`mZ$3dD{q`)V>#9-3(#<}$xx4NpSHSsH`*M3W5XLo!f z@i*&ufU+-ru=Wp(*9Xu4ZocT| zLEYy6XZ8I5{6X>h|1*2sjn4dm_BeR{`YwCSd`8*hk>|YZA@eZz{x)(tj47E% zkOb=moSbse@Wa;&51?D1^p9RI1d#tB5j65YW4-Xp7NP9D_6B%?STB4_zL!Zo@ds-Y zcfEksLAUiny2p9}^MkQo_|AMU4I|_`^F0ouZ^`$rNAJOGKR(}c^X4+<%~c1j9|Fz7 zbSv#!;V^n0{?_%@|H^(){5#iMG+|t-Y+6gF%%WRXac&rn?uWJhoAuUr*(+^}u-D!8 z)$loOugI5?j*YSW$BOpGWb$fP;Q)LZDQ9$Q=X=S2UV>mw~#{8dA2u0t-k zF*TIBPpyM;j`rW#WYlk%hWXZnXLo}u;$iS{_liWG`5iFQn3;5UjCl?UEc>vMeh;Qj zV@`r+e?H?9opHFFhjd4kdu|=r|Ad9m8^>^qU|ejWG6Hk;On2RzhV|Znb>fp+SofxL 
zy(h+|JKnSCw_Nk4@|XJq#ddU)=!E<2SI`@Y_dn|7DV~gLAUt^cgW-(94!U^ip!Y_y zl%nx_Ag^KUK*98GdmvZF_CVBr50VwN4^kWu>-+Dy4|4BDH8?-o3_aaGb*_s8>z#?+ z;LGA+IOA7VbMKD+NI|1Fp@WBKSvk%4ocKwo$F#(`HjW@+{e7@I>f0A+y7zz(_UBkM z97KjmPAPH=3YLer(8pA!0Y?jn{e|iNH#pf~PAL;x3+M??V*~FQE$l1c0_cR}xFVLb zn%-R~(7Ow={s1jn^gL_JbrxeUcX_N{BBJoDic=!h)> zf;?}vo!~2`p2J*u$Z=ggb+n3u%K&5h+eI#2foF(Bwrv z#77Hq*Hj_m+4q65Rr{X;fAHT}aQSOK#V9SPo6YFej8BJ*tXquh-2D8*p-re5?!?Zm z_*6ow|JHU?1qSC5{6`na1S`(kd-;wF_XD;hXZq-}0p1kHCnJEOcj9t>hm~Z9jvk3U z;(l^__!00{cD2G^qlwcq{I{Me*z+=Vnq)8b9R0Ot6ORUlS`An<^XH~5$!Gbj)3_^2 zac()E^||rZh!=d=l6pSBmPataQ4nnXRY0wCATUKaa%62+<>6A43?WJYicyZ)8PGG= z6=WsfsAq@2_8&rw>r%1Ap{8~_8n9MNfx2$8PP;4>w=n&;t`OBWU_Y8}a0x33DQFlO zK;ENaflI@u4blp7_8_LbBv80t@2<|c+fjR+5Mz>AtbgkCFCt zRzkQ~yv6}rucc@NUFO`4Q$)hTA%F}xyMdAM0{Grbhof^loNK@UyCm(!{Bx<+GxA0I z?T`nbPI4|oZ>{?Sy<_gvth%|qp9G5lwm)Mx2yrgR&-n&@ash!zd-vYU5Ov~uub-&7 z(_V<7<@OK9=C{U-FL#`Pjj1EnwJ{FN1u5^_9E~2wa`~(^XuGxvAGGk-{1!c9)m;Y- zF^W&KLoC9u#HNT9NzPbsksOgYz4wj8mf2-iWO(Y9JRFQ36n*@)rP80&Mi`<{9`Yz2 zs<4-~(+9~Ey+j{kC+#UB zbLTCXn=UMdSoy~G6MSj-R&0D!eJ5UrvG3)hRgaX5^o! 
z^WExUt7Z9QoRT(t=}~@{EPtBi`#s7R8uVXyl#e&)J3Y#CWcl0V|G7u`F^V2{2pT@~ zDBr(L=yyEJKk+EvCdG)w29HoWaozFL+~hC?^3@hHDb zmJd4$<^T35Unt9u$F;47O&;arW%<|GHg9;{qdZ5JzfJw0@F+hPP^6|1< z*#A|J@*G({nfgEOQGSdpA4dH@^eEr|dQAT<9_8C)xzPVjkMh;BTy`=eDC%s-~XD>U&#NQNBK5cF67_g zQNCK1-_7~qDUb5IWcgx_?_D0{3uU<&KN~&D$IEgteqQq^&ynRq|9^OtA0x|!{#eF6{rFNBKfoF8pu3 zNBMYJF6{q`M|qAczq=Rod(5N!7+JoU`hVb2zW?Qz{+m6@x5;v${~I3Vt7W;+|DPV^ zcgb?0|2rP#3uU>`|7nl%@v>a#|AI$(4$DKO4G)`TxOdf0($yl*#GV7UB_?i^tNUgD z6j^=E8J}a_!Sy~;(#}@3!k_q<&P8J@{69R7T(s*mRofmX=tFgyc8#VT`d%h@Q<^fB4DLnfz0 z*S1%9&>wh7>0abPzdW9PlA_Pn9MS$7O+Uzkp7wNl>e}`|5Bf!#e&-`v|Cs#8#nbQj zht$8H=1BcD{Ynq|?U2cN|5I(h6^i~!gZ{vaT7M7v<)lAJ*S04q`j<6F>aXbsdC*TJ z{k^)jJ^+n%K8&GENJ(+~2XpGf-H_#3F`uiPJN zf03r&`H+YHq(4d5ws*iC4$@!Kuk@hb4w+2-6@8rk&uRTV=$Dh;&|lHV>96SrdC*TJ zy`jINkJDe%@BEvG{-ihbhdWUJ+xN!occG?V=|R68GC3u>w!K2pUunet1J7#xJ?NK{ z{v=)7o}}na|68N!2YJv>Bz?^P1}b{f{);sI&Idj8Cw96TmdeCo&Os4*d zK2HB96T`{-1~bq&M`3J5c{CJBEEpN_n==+dP9FjAE&>jALKzlk@SZCiat(% zO~3Q69{Q8s&>!wV{rTlTI(}cM=~sHtZ--3Iy}Gu&LeZP?cJzX$zt(#PV*Bt>uf z-x^In$b)_&>0|ykP|=(AU!>`G-tVD5>0|ca0e3h^e@(yAgMK??GWA#Var$r8`g_nX zC%vJ+qL0&G(+~2XpGbN`e?=dszoy^$7Z3eOZ|Dzqp#IsrjsDm4D?RA9Lnf!Eu5GVS z^q=gB(I42P_4lA(PWp|ywmnJFKlYy({TfX_$b)_&>BG9VJy6kKuOkBTmqnU>=Y1af zlYY3aZSR0P9HhUdU+F==9Wt5vEBZM7H){Po=$Dh;&|lHV>96SrdC*TJy`jINkJDe% z@4VMTf6^QJ!yTyqV;{xrf1##d=|R68GC5&g+g_pQuh$WQ`R@j;zX$zt(ht|Q?MaH> z^uINlevk+KMAFCnZ=j+#?Y~IV@BFie{-lrDe+S&*ApJG{N)P(&kjd0v(Z}h(UhD5c zznt`j{)#?Me@#EggMK3E4gD2;oc@}A=RF?!lituD?m+#ow~hYS^ea8+w?igpxUOxl zQ1qt!2b#409`wseACrHQqW{E+wZBHw5AvX&NcxSswmneMo8xDZrr){BLx0l8#?KD8 z!$JCM`jsB^+aZ&wzoL)Rf1TFfgMK;b4gD2;oc@}AkO%!l(i{3K`Z)bH{mzvh`jg(! 
zAMQZ?KlvbL{|hz!N)P(&kjdGoYuhUny%|3atkwE^&@U%_EPhN<^rru<(e#5n=qHjs z=6?efy=nhNnttb>JoG1h%>Fyz4hQM4=~sHtZ--2#{)#?M|EIP79`wseZ|JY+381kp+D)VzdX;a9yS=YeWBhB4cNHbSdZH^sp8gSC+-kMT&^UMcZfDaQo(^Al`yUp)~ovgU-1Hcc=yn_i#wUPxHAyq-~*C_cx58f zy$I;9eV9k(;Y~y3osf;|0)XwCR^W3dkxllcU3_llEu1tf{46i}@RC62DBdc28Rz+X z|87NEtS?$|bvFg4{U2MnuYcke9{hj6tNZ7pQ7ek;XM%ya?shH|$Oo9!SdDvNYg>=x zH!gpe-<+Jmt8uxJNf~J)acyQmKCX!XH?Z!*uQH3A6tJ74e;Jbnf>e^@v-8~_C_Fh=io!!6~aAAe0EP#uEn{I&;A|c zK&3yaN~epPMd7X_TNb8RE=0GdZ$3+gXH+s?jiD<-&eB)H=wIU z1=W(`u@pGjv*#PwVmo_gqn(TgY*DixxPrR-ga&dD+7Q+?v8j-rRol!9u1|NDSf*(ZU{kCXK5{zDND7qQ~~wO``rz zdnoF+|Ec{R-k0I+DS6*A%NYF#FBo0nn=*Z&Gp*>vbY5~|baby+x&bnaqreEb_TPkX zxX-h>pspm{KK&1nEjl_gKZ4Nc@cRQlv^!BHIywy>W53sm4$tHV_cN_VD9xXntic=b zN%o}q<72lBNmTNCL4>dK#d!J9iuUR%#4WrrybgD61}j{Q3T9CC*IW#C_I4=k4CPy{ zblmp7@-o^hs88!M47+jiVPPU0m+O=!G{V!GHGvWph06 zv;CM}vaHBrJ5ssJij3VF8QEq<$8NPEt>SBclUl75ao;i$?6ks*ZC~XX_-C=^K89;u zQTX`!#zAoSwIaUN4Z9Veg|o}y3JtV~^Aw7#XwbImtmNoeu&wNj26sWk&*q^3pGag` zq%d7Pqk(i7wDMGNPm6Is-r$~3?)^1)T;mHb-s-EwEi-T-bu@rm?!m1tTvsmymyHaS z=t4fXT1mR?0MX58L`m8lu=w7kg?(Bgm75}Co0^08w+m3YIcW3Q9Bh-1 zU3@^I7HOc7kGcl>DGIcK?!N^pTnXz-15FmUfP*bU{0P+u+EHJ0?54FnKmlrTlgYmbPAqa!!vM?S�f{gAvg(Eu zpk+9KP|V%~=Y}HeBtclsX#tKdzUR5wuBPwHX3;$~FeOZ5_YAV?4jX(sLfj&%TQ}r* zvaK67htWRm6nzCSnpY2p*t8bc5UE2Ml;Jfx*^I89 z)EdQS1{YpoKX!-Gbtvj-UBP5beKHh&|L@%tCR0c@5eKfW!rx#_D22npUHF0tSW75< zL2#G<*3rri!h_bg;j1;8?L}e~3Dd&{nMyd0d0~T-F{?Lnl1EF&u%)_TXW?OQrikEf zvdWHQm8Do^X;yFj2CRN9S-pr90<1p8!@eP&6~2MH+5X^nX3yZ;wS~zBXjZ==E9M)J zDlAz&MOJ5lmGV^Q6&6dnOb*KR@AXtY!%iL`QRu9`qO+`MnHD&pi`X&fJ&1G0Z18e> zE=#i$ql?@!qX;JJ-|cB70Yc3l-5W-90UkEJz<#b?_426OspEQdZR*3aYawObke<{> zJMfvXgX+)r0pVa<5}Vd$&)ML=`B-#mG`K@%!eSg_7^h33fym zcUY7%I@XS&BpU3*H)!D}7gxcdA;+PN1E4Pi{nkh&KBYXi1@ujzZ_SI0T!UcHfqD!I z97qTXyPxch>~Z97ux*KgfH4Q$>b_*&vyc(=^oQI4aRNwmfCz@sU0HA#86d7p7M+Ed z7Dy*A<%$|^0?h6aaCB^Iq@L|-M00g2#HTV^!1kMT8JKloGDx?YlbK1mGzy=Y0FV44 zuQk621Ai8G^!DE}A2LN#R%7HQ1*LJIUv%tiku+_> z_7kEhcg2@Ow9e?r*CGWN_TgYBd3E-lztz=ksMJkFc(%L+J&-asjz>VK$5@oKAsT!Q 
z{wRcs20N?^f-U81UF$Rl0&z+<2ZnZo3r$$f8m{nHxQrk(%-pA&bwp>f;OImJ8O?e1 z(#|ne9~4}h1DS5qx?Oreb08*Cb6}jV(;O(K2f#UmI|jE#UZf|UxZm}}KmVWWiRqxy zp7?$>Jn?=64(W;Sq8HgdCLsKq9tbm>^6@VmhyXrx7C*`@{CZ4`+gYr^YKUl0SA*L) zm)ma@H~O^<#Qt>%1u^!54syOT)8+i-1FoEB8l3qJ5cW5iBBY!-rcL z*OVZf3zM}MkpcWPlN_QW4-jmmb4r)|&F3yZe~jM=2EP{aJJRLnCxe92YbQXMP)Y0R z`&|dhFu4AaT%Xofa;89EP1}CC5SfXk^d8J)a8u4$TwkbZKE4PYlLlt6OkenYDVf*b z21|=^EJ@NiHL?>Hta6!vBm78l1dkPUqIX<-sWN5_ZbhW+&2~ksVipanDl#W2N3(rGtYVr`k?A=DE;EB9%Xo3N+r`fr^_(X_`JK7>VBv;DT#H#S|_LL{T2@X9A*6~Empqp^yvS4IQL zPz!^TJ)Rt=NZ*(ui?LR*2@7eP1L?xaK+_yZX2-{SB!#vh*%TH5%VNeqHiN4 zB9CM)85KDwapSltgq;*_o`@0|_vf<S9ju2|~%_{WX9{j3L`I;=|tF2;-;IWm`f^|dwa06UOuc_X;ReSPlGN4kNp_ryx zE6$ZdT0>#~3Pp{K64}vWlfyEX!yS?Xgl@K9XPyBr^Dhvh1G7)A$&>*6*`oB4fI zN|r!67AOR@?PrRqJv12=ZI$ptrL^cs^u#mRD8Ra`MJ4BY2kh-hZiLytLUuK>Os0@) zh0=Ayo~^<-{aSZJ`>|eqaH|;v)R4vYS@1x3@qr$pzWAFQ>V>~OD|@;wQ;!Nt zDUqDBX$*_q35ErG5;|Ts(e05LF*!^fTNXf$>$Dv0W#Qm%`ctPK6}vGi>ZPjgYMsva z5uHz1z(8ZF^4DGkySl32pW77g5mAdmbCBSxaIupm(sOsiMs%Cx7|`Ka#H3HZcKhkx zKf1Owb+~UnbogtH9^C7sEinj$E%A)AW#^V)yRW?zf0ML9nQXZR6U6~vQ+7nCm@aE)~1U9OQ%r3Px~`Dvt^!`evasHiXI zo=SWnvvy_#4n!72y@2%kK6-l4^B8K15fC~8L^Lu3eXIYLfoMn@bgF4kBQG}sH^qYB z|2Ix}C`ZcXWv<}nrokdaSwhqrX^Jtx{-P) zoDc!mEeI4E(OJjSLQ8sYhnfPYYbg327#%6a7{ee@Qu+Sn0Pw8TP9=( z%OVK4Q=jdT)|5Z8c{_h*L6L9V( zdntaTu-H&0D!if+U!>=)+vSj6m8q&+iOh<%dm)%>=ewBg&e831JHcgYMSmy!cT}?`%=KD{V*5o1@K$waSDv>c)jqG&FHlsCj>-O$ygMq0N7 zC`BYdy+i`kJ3#`Jrk72eQvL;Vs4S=Sx329o3>(~dJ!~**v9|ru7%U7!Fdo+H_*yCP z8sX_$r*J&sC^;L-IWCWgu{6`X(PVkF%W|Wz49Ud-C0Sd`)QL4yNRCqxu$1roZj>9a z9=pSp+033!WCQW7Xu`j$k zpx1bjvCku2hK1cz8@;*%tqA=44yP)iK7xUB0NI2nk#i{wB5NIiMHW)0nqNxWW8=O! 
z65JIHKA$-8#^x^D)iHkRm2POe@pji_ry1RRMH%$G`Ij&mkUr;nB?{wniof>Xs*BV@ zi>Do?yHZRsd1B2V{amICZgp`65-SmPd1#VuMfo9q3tgRmRqyI-CWg^@(D~OGOj75?Vt#-uay2P}GsYy^wVivwx!aN)|CFXW zl!#FzHcP6s;8r*}k_k9_OnDm5lKt+Mxf+YQ?s6?);U5A`!ZhK(HIL!4QYX#3WgWqzMuuS&HT%S;9QTO)PZw zArPYd6y~&h=sy#4WN?VBFCj36y`E!$0!D?V@N6Xd;%u3(__nMK6(_QJMJUS$i~Sav zWEnOBOZ153hWGe#P+G8E+~U~AQnl6Ruc?+6+lm&^1!GN{jAG+vzki$VC(E|oz8We` zcv=vdAmboBHI27=L_o~4;_#AGta5C3m6HZ6k2&46UWVvMWd|ZZeuJIn;_>%L7Qd|~ z!lfmbnX_XxIZs3?+rSH!3vdAiOYz;+0z)yG$4PDFy~S{7zu2?F;2@cR7DnvRA%x~Z|@V7&(<|f7{v8f>n26u?b$Sn}Fix5{108|p|pi@EB z%>s(`z1>(BQ&z^XUd)b!*w7P)&wARH`Ijh?ykA7DZO1vBBulNt*K?9gxvi}8AnV9& z1y>B16ed=yFVNT`NPikT0O@s`8-vo}h&shM|2mADJ~%~}%a_eE#M?qYFn3yDA`9ge@To@w;|%M`3h9RE|R6rU8$?l z`&l|f_-mh)1UT30t==e|k|?|(b}mk1Yo;=S;+mP*7%obCvtZ!sL(0JIXLVPdQM0?0n7kmR7(T9JWQlA*?e40Z zJK%+Y)@+Y~yJ9GEh;P+B*KL;RT`6`5WjZP6sa%~VfWzmz#gO3%rrZ%;8Iju54VerD z(1@V1TM&U|7xSOFqb_JqY(V}*0YiBiy*(2KPOH#zU57S#{fFY{-KDnJ!d+|-*=q6IZFfnc#qQE*PBdY;>1iXH zb(FuUWOp8vzD+}Wi{A-ay|4~0IO0q(zZxp#o>TRT>8SF-;k#{C>`?oW{W z`7ZZ6BzFv4;WFK>(8SQO3p5^XfUoLA&=s!Q90-7;^x@cLo=$yhMsHB#;|GZI!Ch)k z9V_NtN#Vu2de8sRYkbBHK(RNh&XD!K8b{1`wk!zw!3VayJJ#}Uw7eTFD`w6n*CZW! z8=x8SM61OeAhA0TGfAv_M3LTvM0`Z1j~QR@=E%`LVvg8fJ`%SUu9%L_c_=1Q2 zcbago7$fuImF%y5^ktS{o9Qq?2j{4 z$V73;r*5LEWC2ea5XqI8feX2oI$Qy?Q=nZXoUGQZ8I?`PaS8;yuwt^L{DLLgm~Uji zIQ5XrMO6XorATY+FsZd&SlA_&Qhu%V4tCSJa>nFY-!*i1bjeybo@KJaI;$uCymmAinpNBAH%x3S=GJ#9GE8wquAW zKUTr}!r{fQ`KovdyA^zqM!aTbLGnn5bUJ>np8;OWoQLVj%+REbGF*Cfie!@fT9C9a zRvY;kf1IA(du)1t{}Pu64*B$SaIgRY!A&jqI++m7PX#3$G}xiG@u`Wrxp_fw_kzE& z*v($uLGfRk#hg|ioc_JcVv;NnB>#_iB>U(&LNm49G{u{HH z+>Q|3{(mrwOg3c_vS`}(GK)!)FG&6yvzXkzDxm;B)+|Q6mZ!l#D&}zdO&oLN&bTz! 
zNJ#7a4c$r#6>&~%wjig0I5)d6=z2~89W9_fC^edJgtIi**0WKaNg0>z zy77a-C4cbyN3r4;lbGxiDHmf-ZT_22^Kg%)J3T(lvm+0m8#?$?LBYs+^^u@G&xu5u zCxA1m+~o5_D6W&wmI+8cuY=pk!g9b5QpVl$L2uRjb=TuUG>a zMgzZL1APu|;8yPjbo+0|7(cwgmE640aSa(ih92!E`I+C&*pFh}$Vsob!%aCrq+@CD z)9S^ajtcz(f6GHd@pnP!Z2Y}Cbh>{nRZlry$x}DvP>2Bk6n3&NDN>?1=U_;)iTLqT zMQe(d)mRd&`;fK2_8wgZ(+_%0*p%mskI#2iH*NaVc&Pp)oKdLmYy>fxRr1a^GM{Gq zsn=Z=^I|McMm^o{tH`3lWl^o#_dhxI)1~zqBCFZBStAr|FG3rmDshS}+t~Mx+x3UW z+BnH*V;bA=>o(d4k@DaM4%H2~o)$Oue}<24VC-`XQN)6?p&DQeOZ;MkzQc=56x;qM zs~Mhf{HwP8B1%i@b^AW@k+C2uLKl`13wldcOGInYdWJChx|G39v?-4s4wDw<$U3aj z?W8PJ(iZ;pIB4HbJ4JgM`D_sa#a+skAN7N3>hMCU+CdRd$LdO{N?oeK{K5sHbI3@6 z{C0EM zZHgAbW|cHNKXaVPEnp6v*HFGyiwd_MBS*qklB~mE<00?Kz=-Zul*MqmMjD5SnP;oVN4t%j9ap?Lq$}=r{<%h z@EmD}-$nAklg}~FrXq_FSKnf^y56V&jvs~!Jo@rV8J!uT{N-*ZWE%RH;jWQ=r1n5( znUYY2MB$)LSRu70qbdt@1KF|Q^7rfA2F%{Mhz&eGQM)+9lCqW@UhSC>t~^F-emeg& z($3ek_~32kL^ttp7lc}>tHs6p%owL*3^lvPg45}mQ@fZx-OYiw5!u}6!9nuIj~Iqe4gEZ+WOM{w@gh!r!aocY&nH`ZNC+Ik+XsOZVzL00wE%eeUYw zuQ^2x2aQ_buMyw30`n=93@YcBNwe1t83G;Mpi(>zo+{5Zosaa0Fg&?xkMLxFt-6Iu zrb}Qj-i%e@7oHb468pU&1``gTQZ~$U``R4Q+i{u7-ZDPca+z)!X8Z=pA;u!f!!h#H zj857_@>>n^Y)x*AG-S9{pb_IJZj=UQatrrb?U}zv{E-n97B8GB&{0{3BKTq=m>>DL zo+2b0BK$qBDbcDa0R!e4_yRrMQZ!KI-d%4)n{C+XXc0Rd?Bh0^!nXNB`~)D4tA(n< zC*ymlzzWycVntrE>&JkptX3uXh)W7oMS(coC;P#09)3fFzxG&!dZ2L0fZx=lKkdRf zx)Uzf0hdF{%?TGGA_r$o_-c;ZjpmTOZ8VI~b+yVToXN5mNJ$e5%JnfyGn(8$%FQn2 zNs3bYaCzuLilD(!g4O&OD>E<`k<|?@tF5vhph~lS$w%}gL7WpKz5q4!)G~_1!(8I! znz*j8OxQq9UO(BXI}U6`Z)hDWIeBY2SOg-?~r=yb*_t;9sM$?cZ`j-{IoW4G*HIfJqBayV8P%izg0|lo>VtvGiY7= zHQ=CBnWzd++&*E_!~P%$2KouM*rMiaDSY5?^c|uJwv$ta#6&b5=zfZr6BDr=|2Qvb zW1)oaB9w^!b31_}1|6r;JY%%HaIL2)=CYlawHr9=WEWtp$gGxr%%uE}sR8kM54Eg}>*=Xw5YiuMgTIUD^v2ZM2@vN&&bG$8AV)dJH8K zbwdW?vZUK5zbT9jwgXyxV?sMCCX_j+rh$ij4usMnqDj%lRN^rIWQM0oJxKi-I#5${ zohJK3%|mqqMQ_nRPNpGqHqH;q{%E%Q3(Jb{5t8FL zfyT? 
zSiSuh1`B*qgFIVO15~r-w=LikcH#9&=I4WYgo6hJUm1I%!2?*WKPK5F81?wf5F1r~ zCB^>EK^-5nv)n<2ZyR$@s+c~h`-R%u%l$Pk#k#l}tzJ0FU$Z72CBxY}({yY+_X@*F zdyjOT)VbGn4VhRn`LwaIi#d-GNqFJo$eP`iyo^z5Y7nz?m1+euySXLfT#n9PgwF`* zfj3k)rGpy3sLWrp#I>7hl=IbK_4(iqh|z(`c&kW2X$ySNhzTf0yck?FM9r)2dV08UR3-C1;rCa)>CW4WOWa{& z?sy$d)tAJq+#V3LhWF}uyB!N+SRuqKu7dd)ZDj7RnTg@no>UKXCvLrNb>n%ua1$IT zFhG+?yTz!G-vrAmkI*a!iS7$_wu|rP(Dkp5ufITjV;siyoqF4gljOql4X)CwmQIyQ z^S%Q1fkqmh^)M_h+`3%|qv8!eXXPs3`gvsR4lQ9ib%C3Q7w<@_JXT}-2&|md@Bv1y zJ=nK7!On^a;_Wo%Q7pIJb?_rt=ptbHBM(WhYJ>4)rIeSa=)v%2fg$pj`Jhm%OtnkE z6;Efy9MJ}k?6otRP6RnNREYgCB#4~z$`yNVF4YZLVK(uHSQBD53Ld5=1#lMqE|E-(w#Po}r8&;1UUgnJm9a!W;m$yeJLL={~$#?6Q>pBN}bC(@hdIaiN4I z8EzIGI^DmgX`kgkwFy@#A{(riKMLbg;UOC|dc!{dxF)pG0h5c$&#-dKL6rsiSyX;z zpZ}Wm-m*NnQ~H1qKN=WEv14cFg>d_ZZl8X`a3@4@4} zUI+8oc8S4*-V5tj@o~_L>%Z?HhYdjlCwj zuY4f`_l@A1@q_m;l-aG9xLL6 zsEY=MaVeBEC9MkUe^TTbIP#V&A;{ZfIKIIsxVt?$JV#vF^HpWrmj{*9_$>AJka{z2 z`uXsEL27gkPvPeQcMC}kae%^>BX%75{WxgE$@?Lup+UsQ`nQ_4#FsvAz$pI4g8d^e ziHK%j557=Zv=y(`0^Q;dQA?c@%td#952gC+CcamH$CN^+CX1eGkv-J{tulpHYUh4A zy2)Fsdr%@=>_=#|tmpUA>KD3+AFfqucdbTU%Dz%sZN4~>R?ng2$7;0!`lJD0qgVKH zRd1I(T+8qyTrwr0ng|_?QZ_CYBY<28Tbzbr-r4h}Z2RuO4~iv?7-U;9GMNfQ#4b@AORpFPZs8_;Pj$2XTS_Gk2*rYw z3*FA)h6W`UqP#ApUErVyqbCzWCId42&Snh(HrxNcNW6;-XFNJlr2tEI*F#Tyb$V?T zbnG_0HwSO4#`l2WOV~x}UjOM$kzhSjkG=j=gZ04Ik!r91|FQQjfK^pz+VI-@+;cmZ z1V|9c4kQ{RgmZEuL@oh|LIp`hYO8e)NzMtRh9vHDf(OJJR9aJtP`~#7>r7|rICjQf zzOPI>?I_<^v09-WzYg~6ZF)sMW}2#PQ*Ar8b(;Tq-?jGToFrgJ`~Uw;-AeXeYrX4T z?|N_RU6;MjJ5Tk8>z@FYt_Q5r_rDZ=N7nVRat z>%)h-K79C|a`feR=GgAJ5>>!PHE`l{82FYvF@DO0ZX9i(GBpB$jdy+I(3TI+SbN#m zv4=F?_4PwrK63n9V@tnD*bjS)o_Hk)6c)9IH|obf`H4?lMupG*#?Gcrpb<3vm2vt| z<}4V-zq#b;fv1~bU*lh0^7QeY{r&yplf({vd{VK4df)ZTp)L0u|I%k4{D4hEv5SWB zu2YA)P7R!x{Q1z$JlLlWbvz?th6WE9X|LE)UCrEoo zFURwE*CWtB{JWjr_+C|44_(6>-yvwZ9fHPRhK_u%K=bepX4m%+$u-Vo$pu*kQeO`Ph!0k?_X4mL4b5k}01e2Ujj zc3R;kJzM^To(&7;3*Y$;oAm@jU#_G`P^W-^g>b3#khu&@yEW6 z3DL=eXYdzT#mR%`@I=?igA?j$`Qsu+*2IFABgR0+93=P}|uRyvR5Wfg3_^B8v? 
z+tuUAgYb-eZ9M!TEN#j8?xS<4o$+tIyz!GK4`LCUuagJgZasA!qo<8`ad!lND|;QJ zcD#kE9d13mmTs#*fZN3VXbB%TpnhW|g1JRY?y+rh`ifPST2!)@|8Ax#bD<@VPy;_+ ziV)@TT_~UU8tKD0=9dx^f3Vy_N*}jm8XoHe4fK@;&=8U|tR;kV@@Rm31f)ILqJ06Mec--;$!_>c*NTb~S13^~*h-uuv@1rxv(MRgvk~PfZJ-AJ@u0UlM zIDP`)@IeyhaVbaaUq1Wt#*ZnGL*3Y;v2aJ^n%MIn=S85aT?M4Vueum#8=qA*ND$x2 zgJ4sA1PxEUkg;0>yX-2mwR7K|v9){ia4A!_t5R!&c@VDFW_smTEuCUT3LhFJYyJXw zb$q8}-}6utsOV2|oA^~Lu@5nh)K7ln#O${9b1`aI`nh;D;P*fmWhZa3tAV&-*{cBB zB@U<}c$AY9n>o!o&=l}O$@g%(`-`7O}uc8O{&{XJVo$7Yq#9QJ%mh=MASU_eK)nV z@wyUjmH7l^J`D+F!tdCT0?x zS>d5jIy4Cd#cNKaptnN;$G)98TK^n7IFRCB;x_T|rs=fSIkdWs z>6-+<#HMwtTZGl$BT}-~>r?D&Ui_|`Jj30!-+aPdG@~SJ%C9HN=g@F`xb^?EGvz6G8R+2-;h7neUmln+CDB$pj~Fo_}1tg~N#?ezQzB0IiB(z8v{b3M_MoPnP2 zUTxFk=41=e^Fx@rq{m_nbM5orIS(muS^zAF7wz2|>o}BJ6y&9S)>Bf-<5x|MAOA;u%nlVS^@#&u%Ls%Tjhn|drE!ZQp24J~A$ZH8(Pgz6)M0#S_mXiz8E&p79UaU|8 z@^koy`oD%U$gaDwI=R!aj^F+&_vV5d+^ZT0HTI?u{h%={u3{;9RT1w^*~xeENpQ4% za^h1APg0=8Xo3x8*CWM+N2V1TS?Ouy%mqLQ?7lT|a;a>TvCxNH@1iHgj{`3nQPRf8 zUg0xwpPS^{hbH)Tco)=q^Z3CLzVZ44hCCy@q-!j^WXsX;lI;(Km)vwLyd-@hykz(3 z@RHFp;U%|Ey8Ok#ji`zr<2rdTg5!%U@skHD)#J&7B~~VaYijY(K}iOy;4zM|4;p~M zOoDDAOOYvO9BzG;)DAp)_%iAqMgCksTW8$Z3s7r`?r`h>RMo7{B!f=>L|5_a|tK*mxa&6Bl3O5-vjef5tpOPe?gvgBZ~+lED0 z@gb^(DvVotibonbhYNt@#~_-%%OgQfJ^}fYc%-$Df6Mu8N&4=TC9?jLqDUSt-gxM9 zq%v42Xt5Rwt^!6|TO;Ii#vcnZ*3Zegmzs^ek=jZVHPY6T2N$R(^qpHf#FJ~~B});> zgZ=eH&0UYt%^r_9);=`n>{T=%O9;uYS1x6SSea6%Q1j$L_&%jmh;-dL)sCd2Q=OcW zv&KV*c_s=*Ql@mSf6p34bRS~sHy%2ySSo)24~s&-fF0yHMEEpp^3kVD+2OPO#eYjE zF_zE-DN9i@7I|8q$AKBr+XQt(9nXSq0^lSdCaLIO*)frSL*Pn>;P83|D)H;!G@LN@ z0|cZz?&m-bE658aQs;bpvne!qAaI=m_yWlWO-ZGG8jK(B`UQ6I#AxltL%wgWmq8GX zeurB-CBYK(w`l2wN-XYxOxmR+XDn0CDthcjJI8N-#^uTA1Eh$2z74icJ-!fCC;o{A zDOb$++iJjhH9Hu^vr&9B9L3Be-#SIq;np7|T(rLn6@L6I(Bb=Z$8obi#qu1v9v(@z zOrQ+11^Uerz2r1LxRs^u{%D=mU9{Eugd2Mab@y!>53AvmgOKkMz~Rluf9XF8b;hei=W zq2BfvH;w;#9Uob1b9eREAc1w^$7G>s1>$FQ0y45-(g;%AsnP8dtF2a^!o#L$vSwc($Dr(bsc{N+*nnC9<}-wnRtCv}g~ZTv?r0`9J*i8Ow8IK3EMdUSbjI+IOi 
zcOM@??&GHzJlKz=vWYIXjOR!$<4+!^5}tmd=GCC;3opjk0rL+h?nU#oI=YTjCW?ez zl2s*_QpvxOi1D+z)Hue43LY0m--h3@B@%CZGF%Pcvihx8)QIDgz|2ied=VtjH-0Zg zQ5%Y2yyO3T{H=bg{6ST&RwO=p*FQ~7tU;?2|BTl9!_~wDA+|q;;YQF;j6P12^NlNi zw`u%0;rINUo)3E)Hy`%WDQdCtSvZ=KJKu8Ht7FAHgnt67#P9Wv6UIc#!P7d>s4*`l{6GtigJWX*x0ogE!LBO~dd zl;|)-UCL1Z_ZJDXS6neXI+z-|G;0h4!nid(uya>7WAqOUrj5D`po>KEhHIMpMtc*9 zuH7Tqy;rBrq4eOUJzZUU_Mm9R2pY(y&F;2ja#zn_|F(D%x$Pt7@a|#C?z0OLRM6U! zN+Gwud)GEfUelJ{WsYXD?bM?r8fhNsF}qv^iKdakp1$<1;Xzc(G$j+~D{MMnVe|P4 zTh3S5dcMN8^A)Z*Ut#3Fl z>e-z}P6jz!(z}Pvy~Y)T!+kd!*Y;%7M*Y7TWvXV#K^vz1&wFgR$WbLqZOFd&QZ2OY>l*M|g`u)-QWFp8m}o{TXt zWDJjHM@F;m(oO4K`P(xvQKPSCaG=-h$qoz;Ip!yeh=;)EWJ~!P*~IHl?kN7nGuoKZoh^UG@G{kU%n$tZb|lmLs@gQFKY~qLK37wd%(Pus*vx{ zSpV{vw|N+yOj!alMusy3lIg@A<^Kqkqo1QgOe6DYc!=1S92lBjPCZH+3IKdo^i0E3 z2#2`=T?N)^+&DbKblp^Bz>ZkJhLks)YhOB(A;DbHy3|Uwl!tACp_&StgXQ(@N{}*a*|^WJ8%(q| z$EmV3SzD&a$w;y}3&~rI)G%uI4ev^ua5e_HKm+;NJZq(#T?P+d#whWpNXlV`QK^Nv zpJ-0r%wt~?utV8R*I=kiv?P;m&zorLR(|-(VRM}WmPjVEX3tQD9C2dB=unTj*U3os zZ|c5U4WHL+gLhA{Gfs|V%}v`@B)ht<+184dhK4bkr-%9m29w#IoybhK52QfIKz6SK z)|BWOytQX<2G~c_$q_T18t8*9b#-+Oc6DvLDv7^)u8SwHLOECuIRY|3L z2D6ff%oXVgmtlP2acT3WZP#_Vho;><0~jNB_3TM2@2G}Xa+FeIG8is1M&01xDF0FZ zMb>kaNe%}r;GS%c6iT*7VsI09G}FmlJ36fHx2XGWbvM<0NZm)&eNf&bs(wV(kEr?) 
zRX?KYM^ycYsvlAHyH)*eRli%+?^gA@RsC*NzgyMsR`pM-^3$sPv?@Psl`DLw6~5D| zozpi<`bSjzBPxGH!H+2Thplqe&WLJfM71-b+L=`4ld61Dl}}pb3h$)CJE_{6RCL5u zJ8_jCSMYHKA6NL}3V&STk1PDgRQWMgeoU1gv&t2|V+!9f)y^^1{!w+0sC&1%pMIC( zlS)sjbX=v6sq|5G@3!DAICYPx`{aPc=ieglwaexGN1NrnTivbvxJv(}y5FJh#|z*r z_}T8?yE?x;ZB2gq%xanLRrlHO6y~p1@S@-<7$p)dJfTj~u(=bn9h{q%*CSW-^23SP zIxCCs1@+Y^)6}Hq?8!oOkqr`$D3^F93h?D2M`9NI2|1`uLoY1+J7&ROfG^SHMugs4 z^^d9gBNn}dgsF4^K6@%}4Yw3JcZqkZ{!PFr|X{bTAb4(G$QtNx5A{38l(ht*!M zgfDE*NGUv>k8cq6HMy1^$L?;FOzQk7r$y(d)m=FUVaT+2q*GSiq3)+SUqdD>9(wT4 zTksD_AvzTv>u%$dc&+y7eh0wa|E5I#e@4MIFw#fU$0SDW4vdsMRWp!@;Xj$ZtVraS zkMt!m6b|>vWi1*q73gf$^U5>n-GHl)=jw6|m}fQ^^hE0uSW=L98vDdRYS|h94y2MP z8gOe?8*$E4VUV59?Ao2ga+T>oAZH*YlLd6yWJ@GB){Wl8jXL!6L>vn#uI@U01enzi z7aVbjVjZ>}6+KLY<3gJ4m}y#Bp@?u(_Z926UzxoA+D+GWS^Z90cxQ*dwrkx6w4(TZ zVfbxZH>|tjYMwpP)R-2M21kF7_PSnYeWDlh&nvcGbImj&T>O;5?(B@W`s;$5k?{&H zHIPy9?sRQt8O;C(y-8*3rD`l8S<^Ku99avXta20p5Z=GT*pDOL--(nA)-%6cueq44L! z6=>~DEvR}}(&!ybC)2|zNk4T3Mq%qE{FFyXs1wa(xPvyOWg$8W`(zXjdA7n&6`y(Y1>R&SG8` z-(7H6I>!nvR#<7t0&~igF$x_T!4y878r*9MU^@N+>ak)+4*s>lDaY#c(?yTO>#P&> z(;{@w9;~gfEh%bK=^T@$+Eex{dv2S(^x2q!NbAud&FO4K9mam7WxPHtAqz z#8=Qkw*o78Y|FRF^Mc|)YtVbQIHD%;BY40alPbB0amCUV9x{R8Y|f!lROLYQJ$+0 zJE;#$!FlkyW5t4gN&MI5)9V@o^67BTPeDl5D$QhvNAki+xMxWZ%kwqxO~K_UL>@cx z@!4>*vLhcJ#8{!F47@D*NWbI;2AmxQ9e`Z6rWYY%h9|P;hwB?PfmQ)3BbNQ139qqi zjUg=p0sW2UwG>5K`Y;nbQ4F0iXs@aP!&!FJ9HN1Rw{@}mB}F+)drtTPk0!SR6)u*a4ds_b0cO5 zHxcJqV6xo|c0(zfaZT%2N4}EhYu5~NuvE8DA@ca(lA9ZT2E~{rCkH-HF)U8Z2H)2+ zL?ItFY+zpub2CiLXla2qmE<}OjAA-nuT>#ryQ3pm{7EZ?a45A*!{)v%W+k^->rK;) z3$!y$GiHas)x=RM$qT6)%#keZZX37ah)HGyN2F*8q=wbQSW4Ww3G+?dEcsav^{*99hPVLCFo5M+(ia+1be+ z%nDaPvV_Z~2M6twFUX?4Q6AWz!mJtxF-A=~RDenKfE#}%okn<=rW0};n1lBN%95El zi(Uyw+PsFhfO~uTZoUN+=g>S z4Ko4Sa88_;%wUo}6D*Nn!?P>fGmyS@W=I=On$nDbuI?K&HVG(`*tp@^ocE7l6l2W`!0UFA@b|9FJnGiFm|~+ zDzpl5x65t5y9(Xq4!;>7uO4!YP?1<%BsQ$OZXG*Ia zTZYXxqi$JKJcB(FMv6bP*0ZW9N1O*os}gj)z;;zwz~m7VUIPPxdu`x?^?1JpgCRT+ 
zoe4$oF}*)CwMU|J>d3R3bfyd)9ZWgecfs}opOdAsWMpc-%AU~Pk;T*W>8IK4?z`wJj161M{Hm2@4r-en%0T?F_oq~x&lHISIbBK8+ARqAMfbe ze5I~w&G;M@UpijX8Fm2#72Z>{*@x?D-_@d1v}4b&Q~1z^Xw2-wRsxQe;K{o0-7V96 z$I+*uR8BMw?P(ks>L2C@GdYHhllcHPZ*)BvDO$b=r9lZ1 zRQHIwSE{?A?)B;(SNC?lgA?j5>qAB=E$eGS*4KoruL)US6S6+`Q5jwrvc9hB>q6Go zRefF6*HwL8)z9zU@1G*^7N`&-=~SrJsk?O}c=uI|#7y-MD4+|~F9#fWFzq^}a^mVv zFw{0P3-?NuPjUtgBPwa%UQth{?Hu`F)wk|`Wc%>X()R*z@vzFU?p8gbh3dW5%UzG! ze65#N&$9oc&&isW{eSx}@-e+p_8(Vx;~$jx-S%JqAG7}~BI(BSy#8T%iVMNWbB-5+ z+BrDf+cU^sM?S&}4P-fuNDm8CL)rLwaaiOQzEtvhkF_TL|9E`Y&HF!^4m*nUN7Lmz z)&Jw@^&iTPU)S;UKcv0AZo>X>cJaDN`-AH5>!VKT*@a#^a7q5PfZDQnEx;CcnlP|k z5T0W3--BigHw~H>#BJfd(5n@0X1o?yha0a2I*;TFR!}t0^a`qr%-M=<%4P&`@p`uM z9!nTA0$>v5?#+xassppUE$iyTc_8d=oCn5D$ZTMWmb@2qT@*~qx7(do;%+U!kY7hj z_TwGYv}hy_7iRS2=YWT=YyNq@&{g$!)>oTdeLpXZ&4=?sso2(ex9mdiBsq}wf$BHk zjh63#|K9ILlPKLYidrmxh$LP&Ptmcufu`Y+fn@({pJybuUE6h)>m7AlK>s!88J3~^ zam1c#?ThA_e>5F-T-YB?m)&F3ABUj%J>}hh2)Ffy1mSPE5MCcue?T|zx(WNknd|E& z?GI{JuZz0hV+wSEx6oaI;RR<^P$OGfuLaoR-)or;CX>T`JLShPMC}E;S1aS4WM+3$ zJeeBUlk9J6N#auwx8mC+yOMjFWIV3M`rn7t*D|{Vb+2_YcVR%sG+z^}!~YAB02U>0 zxFDrnt(no?yUk>>xv33&f9+KUs7m{^F!O?)nE4w^7n*z01|EW#8K<3Yt(*y9kKB3A zQ;C`2Xma+tC&9CREXwNPY!g;T56%N&OaDADmeG+W&jMEP;VO%?{MoHT(6oG;jJy)5 zJ?^Zo7Cx+!Y>iLT6E~R1pC34eJv$5j|5o{- z0{FjE`6}C{|5=rPrtPol@#pSSXtsC;YB)XHCp43DT)wVm50 zhOf2vOKHEAu4`1jOga0hNycMAS*1ek0mD4vVkbjAZ3JUHqGDS9H$b>YES#4AQ83IS z=1j{^f-xRZJuUx-oR44IDY!0zA}j*@WK0$uRSNJv~u_(-wS<+MO<=MhSLT4#v+b<$5j4m$SW3cKyJZ&H55#ZT4lZRk=g;_z)ub)C)= z)~mYKxkd10KRt&_4vdY}i{R9OvHBoCNJ&K=}6j)Zm4z)SeK4T;3!G}V1Q?OFJ( zb|)1-rnUF5gm}H}ov|8H_YuX$V)0L?_NDA(E6Y^PpoT^eC%aTv1_AS3Lru+uMydJsLD>*4r#fZ}HX?zSq zffY@!r@apswwI?T)O*Fp{oLH^mRsTZm|6Amfj6eToYF&y4R)stY#Kl5^Xa&}%Y%Y! 
zmOPtxv3%zInCm&}dItP*0FcHJJU!Fp1-$(Xc>5Xf%TP{wMDX-X7s1oBLC=7Ad|8dqpGacozi4B<^vN$9Z_Li1Fvk`uFAYQEFob z8PklWP>+5;+(B^yzONgqYV`pqoin}vMb53BkmfA#Gfo`mJ)qL*G>r*oR6V*; zy%ATs0qIItn*8yH6+M(q>3Ub1{O=?9M;Dcof89t4T$Hx_Cuh9*=S>d0$$>XH@CI?< z`Nj>`{kA65DeqM$fc7o9AN=hTRm&*<*YtZSZ(eV5;7ty^$$>XH@FoY|MoBs5f+tpIpE%abnPB(irSwN_Xeo_s=aE$ zL=))xGNjeQYz@-AW%!8~U8l(q!+*6}MD}@g@h3<>_i{j-3X75)mZ4BS62Tj?=fWuW zdqfiD|1u>wZ3>+*PMrb#x#gn76n>NlQ4aX0!=l#o@COY%{(FTu8y0~aRvHyaR6k(q z5}xWHZMWAg+pEHYB~9r(I{s2Q+GG6NP!4>xIVNAy|4s0BT!>DpASAydIc#AfJ#}U( z0C?~A4A(|E)_+hR_81-l@R2ax9aD6h)-rmB(hR@m+Hy4=W)mzW=@KgAsF zV|s&DI?{*ouY;Zt=pl59o=5kgwYMWlI)eTj*z41I_3U33mWT(-1ef2S%E7<)?3MiW z{!64pytGcNv-L&xZ%Y+m`I70G@q^)vf8i|nb;0=0Zq3)%$*_b!FTGemn1NnD(L1pf z{R^Ofs{Mz{xIY1;xxBEwH^4qFtp5-QT{ZYOqyGDK@yXjngCUsbsv>!Y9aQP!Wr$|Zgf zLe`JmDRk-~+avThatIrWB%dHwTNagxNG=qz?Sbs`PG6pVM&Bp(b1HZz+kf>zQ`W+zV*3k?_BsN zO7q3=z5w;lPyGWwsedeA;`aon*1$5Y&d~5E{WZ!%Qv`377y9qotbYoYApJk;lm9*gd$sg!X*LM+nOw_X z0qAXn{Rp-n57`gKyQOB3ii-Qx%388u6IyqSty_8hNU=`B_E z9yQCnr6`|@pg#1)!mm^L(@akpDTv~iH)@7`#EvOL!pCxDW%vo#F!u-bV;GM)9#Q?P z&6Sl>`(@NV*`FD~E&%GE<@(-OE*!Su(VtjG?lXx62FtfL7p9R~gjM||S&a85lASz= z<6J}iz97(z0Rk7LO)m*ZUW*or;+zOIXKwU}FJk=sD)^l%@p% zDuVv|F|gyJ{==Wt=0q7RMTB)pKk%H=M_7wa?;OzKHf56{UrYY{xN?k-U7We-gg6@`w0J2YX;aq1gRYL^;}Tw zAO(YYOaD>AzZP;IlHsUlED(_*k2bkbS;ocKeG?QGKhN z>~D+|fZ`qESEfYbt2BTQ{x=iky<|b@a~sg#GRJ_Kypn&d8R7axR(&(d>0(L)zED4=>jSuj@yXGLQprB050Q*t@|NTe{V4ReB;N@2 zN6FWKKV}&CFGF>4B@FhD{)`F`Dkx3- z4u-xH{_^9jq^GJmy{qQI&$9hPpBLwfLSJh6;@@zZ_+?6d3Y#B6hdpll*ef2k5V*&v)jyvxGp|tBeOUY7hAD%t>={%B6g)`mtO& zkvcZ@s)Vo27Ek?#{Jm9wzgK`+bOqTTygSJA36zIfKi(Z?|HbLAQGd(0f9y}#o=aHO z1Ku0VbJ91<@w-}FRRv9;{?dFG z{HO7k&II`9g35E@NL8-Pofh*^pJfXe>i>LC(~Ce)Wl5Rg!e13D4U&LZh)-=5Rk83C8C;m@I})slm2kRP(yLX)%8G4xl22u@ zgydOD^nl*l%HlAk%P0+ds>+4wNUI3afnC^JE#%MgA^k%As$7)g2W}7c1b#$qen5VX zKKz2?=~zrm(Qq@>tn@V|^{tr3bI7aqK9yGbi~jP^&in^GwTt9B(f8d zo?7&u+PC~8+J~-@KK_d4)81Gmr2xO`Ztm}b4^tB5lfTdyAI1|__yuu}HxRD{N&h$= zdIs$`N&6Hk{;1zqQUVV|D6k>EkjEo-c;w&UZ-QnJ0AdWKK8hb`ea1inS*)*y8D!D1 
zKO_FD`ALZSKZC!GZQYhft@2n-%9r^Cc^YO! ze!xolQ}A;tO!A&O@gC+c`Qy0DAG_q~A^ULBuy>}Xn5<`w)m|`%gG6F%El~PMzk<2a zl5+=xtC%I64i`WCiOaqqv@!O7i_KsZn$I$0|4|F``qV$JV0`4CYenQIfzKaj8ByAl z!wHWOt~7@CxS^#=^J%c2Z$Fdn$#0Ui%H{1MK05DKBDYdk1IKuu}*bI2F^fe9AbANa}R zfs7}j{xUTll$Ft`a_$e02c=~^KG1p}BL9W>ndD3Kp?_$f*DFV^{ROZ8Rq17+7{L?Nxvjw}J;L&eSpHr2ak|Lz z?|NyG4Nv?IhA^Q}{v7?U4HiWx&HkSKd$fq`J!JcLB|q{%?B8ST-=l|51_vgvPgu@_c_=A^hZT$tKA^ z1Tp?ge~mQi!~c?I3Z5f#c>v9mzaR6GJTRVw z2<vOLDvweY{m`&eCqOVL1sDS77 zhL}@dt~c-mjDrNk*XVBv;pcRuk?4(}f1OM(;?JNNE$7&W+6TXaW|`9-><94;>;WvN z^)HsUjF+RtU)tY-G#{}03w?mTV21m4tAD6ZrVCxL`tPa_e6d`l>~|Cal7B*bfm}Jr z<=?Ml{i!u$dHPA~g(M1|zrx` zBkb@O@R3}M=Qnac5%YmuK&B!ClekIOztOIwpNUMBs!FQ66-P@m|zucXxOFU13Dekbjj(ysX$@aNfYuVDJ2t++t{ zq%0?S^L4GjdK5{6F0xP3R}MF2JPCPN{-ccj$6DKeggG4+-Frpwfgh0opY(_qCBG&q zIm`Ze;r{!RdzJL+Kmz!s@53E*^LiZQ2YnG_P8|P29{1THj)#3xD*Gm}I z8b`vU@Av0?YstN0KR8JGbe~#3UwcBX*W91;`^Y!0^?-w%xA_o_|5d72qxggUMm}}7!4A5A$_AKZD{`+$QF2{Z# zxA#!a7kmKR!FqO>^=n+n^=9J#uVjDOK4|?1^!Vh@zrnzf%V{|}I%(w%?Q$huN`qeV zM`svB%C~B-+V7z*fZrDV%rD^A%@rK)SnHvW?gO{p4p?FumFLQ3{6|WO^arp%L;PMF zk^H|8>sb`8Grz&Vqh`E&BOaUBEEOKV2>6)=wB>X!t}}Uz1r4io%EUISHk+r>u+Dh{BXZsLi#&p>!YWH z#=}z5N7(m)%rfbZNPZ6sq#O9P8#&(*&-LF7lv# zztcYJ=Tb+Xxjf*|gZ|y0D{|}+^dHD9arC|3gLJ*4->82y2XjOZEc?fKC#KiW`dbbC z53P0E=TWna_mdoZxzy?ZBE;5aI79(I+cVR9iNg<+!@hZZmHb?ew6BC*8S@kLKe`Xj z(&7)(U+3_H>95V}KkT3BW&1~c_`4vvTc(fwODIJ69eNf!{9yjM8o-2kC$r>WQ@H2=F5Ni4E&5Z{A7A#4*$^JflRg2f29wkPs~527cZPq z2I-&D2XZKmepWgCW%?@f`j;o~*Tlbar@t)EV1a(d^7_TmP7UUO9bY{w#((=NHghRN=Ho_I8m& zFWa|YI$ZY8QOF}S-+?Fk%9y_S%D$n0Jb!c3ldpfe3>cUm(ys~^JTAoB%I6f& z<8k`K_JHY=4UaFILont%4?Wb?9o zImRb7TUO&!hq+)G#38SVt)j!6?@EV}zNo<}7WMZcz1&IfDnq*QGP`^ttaX?dFLk9+ zzN{eKu*5EhyjPhe^J%=|Nyp$tfM4L?t1ZL$R55=!$}5Oxsii2t>{7e_1kpF&!8Z}s zL2tcX&h#yJ(hn28OE0m@W%~;nk-_cvSEGChMb`3S{Pnd~lHwuI!wh4328#i|z@g_{ z81$CRCk8RT)N+(x?6g-;?O(ds#)t7FzkgSw{E|g>x$K{t9_pV%5BIOG#)en@AqG1A zsWt5K=Oz6l5EuP!dYFDZ0FG57SQ!bJ6e6!}M3%(^ZO3i2uY;hyI23behhmLBAaO zB>fIOOn;^H_rM|bqhtXw)S~R4x)S*A-(9i8hF&d~#*#j|{+wUiS 
z6;rq+FK(ZY6S>mP_LJfl)6eaVEJyz<=2QP2{>*pUllE0f43qpJeY?nMkLAJV6J&dm zKb0g<$sh7BSkJXCrVrzvb=8-FKa~aLc)8rV9%TM710{dRzG(wSUEDq&x0D%@Kl6#f z4u9nOECaAUl*CA2vj5f8e`dJT{}@eYClfbO`0nfkx z1R2yQ%B}gYGhcAcZ{>WUjrif3FSHT=ocSwCao&vafu5sgDF{>aa=IkU{X6q=Sk7nb zBztK5eh}*}`zzWgzG3{>k3szi_m}n;L0@!#WgEpqQ<1xAK9BNPgxcr$7v%>swN89? z7W0x^3x@-|-hlR_`>PoLii&Uu^(%P(WX0o*e@+|eztx|QnI$Zr5;Y&I%PnXj1?KtA zDVh(@?UyiiLiRUA==!%Qt zL(B)G`#YTV6X@rCIX@P)(B7-}^7@l6N70jl2Ypz7sj%aDwvRb{zL?wRw4c`l){?!j z{$B2^$8i0*j{ZY_z*k@|oo&2-PxE`6uZiw&SM=S@`svF_aajF{?qAnN9Ao`I7p{oz zU#Z$-dx3tyn_Bk5^p){}Qj1@8xizdGok~Bz?+S9r4!-b;Rg{<_>MhHqE;kND7u z-|DY{zpkZlLteY?MY_fvA0d9OueXY2{L$*f+gK0lFb%^lUqtas+)oA;C!HRt4#NNQ z*V!s4jrAOi4^Di6@!OY^=BW6C@eVuW)`j(1tQYx-K)D`7^99vB>BCpIqI^Mw%6XOS zUBxI@-I4l&@uk7vit+|(fAR?6YyH%rT1q2chX3fqleGVbbil`TVgS>BE*yltt+C5L zGBSM%l#|LSdiMcVE1dXEnOv&u;m{qO1+>=F2*IgY=T{-T1$Z}t}$ zKaZMo=h*%a`T_Z{0Hu5`3U-*0l@>a2PO!ZY0jz(pC-P^GzJ#GaD;@uZ^L+Q^%EC+^ z`Co2tURls;U->UP9^rJQ<4-v~cbY$?^<&mg_NN`H{k!2`x5B@!YpX;#>+4fA9$c>I zf&T=5^7Q5GaxdhA1#=~zbEH2mdw-DDSEYZH`bPHSviFB|q+R+wO}}V;!R4mu_Q~nC@ zUPoWg21!5W*yF=jq`xA4XMJS-D|hT!%71qMGa+EVF8^{?2md?Se@K5)jr2Mm53Tsl zk_V4JUtJFRVx4GS+7sLF<&OQ=hqVKl4#%G1f9i7U*&kTr9oxHQkC&0WTBUt+yhG{s z^^QG}y}0bT_8Q3NJofnFA=G!-NG>7^S z7|Uy#{gS@8{E-?j9e+gj8*}U%`hL_bDbKg>(pbKIpT|FqNc$&(+5RVwz#m0)B3Syv zF!pB}{8Y~N-%I>k>G+3w>aWW`^%MQItbcMmllny!l)poHgKqniNuEzbL6v>H>uTsr zhZE1ABL8x^JAOxehvj|ZpNj9uen{Whe*U!>{E+L%%wP5=MJ2YsfxpWeFCHv|{e%kq zPjP|&S?|oh*uSi5BYZp`Vt;gnGauvlZFL*XpLo8*{-=}mb-f+G`yKr}Oa4a2Z>#Kb zJ#YMk|LFuemwmwAtTgnA`~&fm<;VV`-LdyYVf4>mp6?$Pcu5}YfBK1E0iM7+`x#!B zz0mmVbLrmRA5+D! 
z_lPt8P0;>Vo;~M}hgc8F^Vht85p~Afvs5nQ2j)M2d6j*Pm;dzie+QDrfQ}{z0O@f#VytU$*y(2oWI1qpcLLcxiot>yK0yL0(=e zSN4zk3pl<};}y{tA$X>b@ip)Q8so=$at+2K6>-z~E{tE59RD%@V1G3jf9g=5FRX7+ z{GlVk{l|Qu#+*C%5>-s|uNu>c8`$jSEKb)X5O2HE^X6U3h4{ne3m2*xAEyn|!~4}j zbP76OY?%6-ISh|9)~jmF1qWpKb)?vruBWkIK8N8k*j^f4Df!pg}=sJ z+`QOTUSrOmj~UpQRUhvc%%gaQ;el_l=`{?bh^N%vV$)Yc^9kY!r46$jr;6z6Wcq8& z%1T7|W0WK5tEyU(m%iwtrLHvk&-?9Gd+7hgwBPQeE2`xJbtmJ;`O`({pFF?H^aK{8 zKRkp=`eO6vBTZwHOh+m!@#xSWt*S!WqJObje$hop)0~3#kLY|3FYwCqIeOJzE|K2{ zK)h6yYilE1Q{=RsqVp+psOR$h5cDVS{0`D|o(MA?`F<7XjY7ZLke;IVGm*Y0Bk*-g zT;#ORdeFZ5nmDx&|NO-azmAGRM70kKf|mz62!6uD^&420q>( z#5_8$MC)IBklq1)tM`5AeY(>$;Lv%9pE5sq{y#$J(Gc$zzW)M#RS;<->wiP%k-YRi z%Ond6^+7M*Pp-gmee1lUm*7dhBfYQ4?{DDyBoxo9^Y~QG@hiVi1p2DJ?UnBjkos_(9*@dwo*iZ%6qArLkUT;F;2hC+^-Gs9=HH<=Bthp~`nrd9;GH-!8}T#s^gS z!&F{gAx@|?8*gTuV95Dc<$e26=H*ee~{qI%hie-*29hfzM(?gt;#We5xtd2^7V9ByL)eAg?I`n zrjPm)2#`JV1$&|P)e4OAFXiRZ{{Vu^eLh1%BVEn)7uji9zt&1WwH)Q;<(F7#RlnX& zZ{_;%kMdIO+x{5yd)Q+I&6k;{XEnxW`*-w*$~i*X2$%_t;iF>;UZu-Vs5IV}EPP+L z&_2i>9%LMy zX3^hWs+{`+{e4)aXVc$v3P1M``dh2uXVc%5DxXb%A5!JqpZ^5oAsOY2l{KMG|Lo z%v}!seZWd9{XK29NBE(?6aljRP#XH%?M_2~kE--+`g_X4r}Q^y!Ox<<8&x^?2l{)D zO3$XhPbvJ|Kj<$-yhQJ8`ny4u&!)e3tMdOW`b+w9KK&Kwk6V8`^V8*|5C2j9{qhX@ zYwMHJ-xG?T^-6!wsI=vODLN&2vAm(bhDvie^tV%`S-#NU9qw}I@0gWV`g_c3PwDTZ zI}QDfD|q(D(BBc2o=tzTt|;|i>F{+?6m+4Q$o z;hRl=Q||u!C-A?dFXz)=(igY>l76_-<)ja@=KOJXc|L>nH{|gAhvO-{aGjTK3mSRpIYm|)<69y9 zckiX+fPg1{u>Elk6 zeAr4;IbOH@o;!`tZy1Q!?fUrsOG>5LzVQ7Y>*Dwv^ggW0*`DP0pBUunt-ufZUsUC6 zKc#%WM3x^xIrr4`zcf3YeAahZ}cy#%Gn>3253KpFU&_V|CthxxXa7bFYN$= z=Nst%t5!Mrhfv6_e~RMoqN0#ONAtxRv$&W<$(Kx*@N|(fWZM2MI(lEQ2Jg!_`{C)>#KtJjIQBaHZOVa;0{`+auzke2#wAA4S`POFT z`z6-Q{`h{U`hFGO-$(!B_?{RP?g8mfwjy0Z-`AAsXO4(s>wPkM9}Vpx z%^xfhUmp?dAK*XaCH+SP?{BmGD-G^1egBE}1NeJS=zlEd305P?=a10-o%p_%eIZTX zZ=f9h9txMkcC3rbL;OB{P`;lNsoewpgn~`^?`3&b;roare^pNJmleTV3Vy!|_3?cT z`Uy9?{n~eXt?y;=`HcDaJ|dzEc>%um-SYi?$cxMIeF3cms>27$UX0TB4sftar4bM7 z*7;#-1oEWyBEBe%^`<#kw#>8{-d!?7=d>;G#HSjBn?@3ULQ$g$dc3$zjskM^d 
zq~AH6@%hyE3Zk%QQqZaA0Z;rk^}P2_@%;^c|BD9=_`6GULC-su?)dNgemUSHrtanT z)c1U%E_MiI=7Yh z;UWECen9^B{5M4($D`qC^dGoOx(Dn1L*fUK$KM;n`flF$jlqw?^{%_eJyYCvCH|s6 zwEktVL1TOYJmDjM%j>;Q$ux~Gyxxm)<4(Uvj{lwf{RrP(ep;@!zc&}X)59Nhk^Tw% zxhKH>g5URqK3M$X?`e|#jLUez`y5_tM?85|l@Mcuk0xP=m$4s8-IM#F4^IES^4%0~ z(s)7rd8iEgCl>!ZC4H7XB@DKws41zp>4X2J?(_H3h#q_oI)GnJVtQWd`U~naz2Fb8 zA6n_xO22^$Zz%mcW~G12N+u3}EdHu+PuX9d{y<-uzwvzfWW>-poxiCv=)0T0%OP(! ze-R%te~(J~z^~tTd|r3-idFZ2sQ0@kzTfi8XRp8X)g^N_Tr+gnuOF;S8P5OxMqR%h z1Y7<6kKIW(D84DYI#tFK1?{jYxq!)g%~W4gZtX1;x45pt`n2yxLqp-!trqb=EUklK zE(MT^hFP7c&YhsI5GUnNs6~EkdF5S&uuEj=NFERprx(ebhg1a%jUvYe-zJ}?)z8+- z0&DMq>eF*!{eP19r`12I>JLgZ%wyW;EUdp)D(kfRud4bfSAAkhVf~fTnNO>~gW@?{ zcgq{M&vA)LTemBvK&I6{r|LiCs&C-Au>GIRQvaSyWW^ow#_iY4Q2%e`U^A`#2~{6M z4`0N85@})k%~HRo)!#tMhHIC+ar=w$Tv-1zs1H@2R{xl)e_GYIZkGO9_b0%Ad%U!O za38CeiB6SA^b`NBdJ&bs43%f9e?rxdyXq7F3hUGQMq!_*%H##=lRKyD2>L%$`^FMk zepKF=f7DK4`;Q=xOrX#Q98>joNc&(cGuh8)!I+uqN0xFG^(|rfJd^$4bG0+pKdLtE za<2AgvY!t?elxWnY?N&p^2UE>vY(&KQoml+zsJ@7O!XfCLuYFL*mBwM`(5>kmO@z% zL3lIOUzL#cY3t3pYVc54|KDb*|EVTf%gw);>@ThSccJ`iTc+2a$^NJrB2kXYvJU^H zLld~h_^%wJpweC|t+K^)c$lgFn5xf;3mCFLsASO??(Xa3V91R1RU9%#6>-t+GR*qN zDB-%IO8(R6UoC;gl%Y|5sPuv{S6zC%Sjk_$wC)s+ZI{p1@3~WbsPvU%4hTOzYu7I* zc%Mvx1&9+<=ukO@IF!C1Am{`b$-PH9NKIQ(NxKMCF<4H>mnts45t^ z7uwI&=>PfZkEyzJTa15&^@;Bk0>u9Ue`dHIR(aN)+M&@OL^|A`&}|+-QD$A z)>i-AmGG?GruYX{!KtTd8&Mu#FDop?y%UcbW|LRTEdCocTnRk>=5zV!@*++A^4i8d zX)`l0JY=kBOcXU{d(55btdZHBT+v$8*po_`=}ab>8Q7OL+FFW=iW<|oY}y<$8v93w z`i%O4{l6|AS%^;@jOGPc!{#p+Bwkr;Yxspg-;OXJwl(wl*gd@f9K#&gmsnMYWl%d1GSbO~z{D zw&oQLMj{?>Fq)ehj8gngIjqPlv~~E!=PSjQ1|xp^3wo@j(Gy9Y)t@!9$$_Tfk%46Y zjZH0gh)G{tD{8m2G#G6_(*`Vw*5;U)46bZ2R<<`7O$j_CRswTVQ=-9W4T(ul3#v55 zk>8Fg&A2zs6Gls-Q5*vX)LDUli6g>DG~>m*mJOn_t?Bm5Z}$GOY#=kxJD4^a!7Otx z7}^R+rp@mD?p@oG^`+j+N84H!h_Zw?T#MOkVmseo4mLq7w-q&->77R7h?&MTX=GK~ zn#*J2L;a9%UH_`41U?wl8xvQWtD07Tc28T2QD>%BC7RbVvq!7C z;f#1m>uO2$WP5H*G_~IZfWg!%pY}1a4^_~g8zY+c7h>5>moF35t;=9RKk=5_AgbH& z_<<&X?;hUMIQj$evIcI{^+5qrjk(K2St 
z*NEp*D^PKCu<_P`RJt+KEsWI4D)AYxvE}k2@Sri%q=_os*CwJX|A)~3+~>JfTL=6% zUMfO=F5U^|*Jb}qMB3gW#0T}@Wn%e?hMeZ{926RIn^%g+3ZK~41}d_TY1#oX8M(2} zY`zJ#`V*HI_4b&JnY}QJH~`Ywk;a|bbs`?B8)?uY=BVc-9RzpQjUY`Z`_2YMzDXV&gPxG&92iLzPsyqH&^J*cjI<){_{QP^Z z$xO8^(2ZRKYqU7zw@;il!Gjfv*7l|q?d{DYdb#;tU2D^?g?cq7<_dj_uD9u8p3r|K zbet&rIezk7%wPFwt>pcBH1TnK6QkV(p5dWMWnw;YwAzLzktoAp?*!=rZ7>p(dgILI?w5)#hYzEV2E1-AJ`8_ZVj4ETRE@QA-8J7HZ;?;;kmM4>@yM zkW7e@-+Cg6S3Ti{nkUlyjIXp+7r*h9q3ATV423%_!jOr6E#%_$y(ns$;zC=n5&*vp z=-lUEw-#;MrK8gT)O(gTwY084Q<0`NpID_YKvUYNCU*IvP1V|*)@n`sOc!$lp^JRt zs1{zMdm}B&eWivb?)2cV_Po{Ml}JRI{YVBPTXM7*O2!^7o8w-;>*Y5(qTA2Ek_4`Vdx?(1$#CU^A=_HRpE z9|~V0Jl@^GlGUMbqbMr9nQy_-&{%Jx4a%BXwI-y$H>7{fi*G^dcLdk0YF?QQMVhi9 zyf&#_RpuQJijH9TE>RMTZ}5gz2=UQCcm;l<>TTFd+UKQ5@j*|xCFIeT>9N$GdBfZA zBUCN=@>J6r;nP;?^{K?K@Gg`;{221_LkY)=gBXb4uX{F!{J-;Q_X%yesBHe2uDwr` zenW>75Lbze3DfQwN-ZrC@4#o`+e%A1SGF|A7X*ya-m3%Nmjhap@Vw2dXZ>Qp3s#9| zL~v11|9T+2IOy{{>#6JDF`1rD>M54KQWm{YPy~2OMCoUEG zh|pgs)}HWchA6rR_Q6(Fv`J`w-E%Sg)q90+9dxZ!>u!RlZR^l|FKMwRpQil+xhn$R zYfECdiC=rQ$2FWd)yj0Sx=f4|`wo;uoADAeJ}tbdL`!Spon@YNdVSN%R!zJm?8O&I zn_9I?dQ$%bZ5+j~;G8L|X^#I)ENOKBJkiuYYD-cpO@B?I?XTea z;zRz%OyWyIUtd!4C87OBh|fjzw-gh`CjeTK=F?w+I61N zTRf4rCq154p??TJrck2)qbA@hR(eHg%wu>W{g0LkY@+qQPn2iQdIA@Fv@vfqg@uAW z;lOUMSRsHztcg~y*e-PK{QxpO+Ov`PP*iFtI(d4k2Pk%%}f1zt*UUPiPuG ze0>r5QSIkqJ*=zke|bI2%Hco6y?Rv~Ltv+0gGcZ5*JK;_d%!4Bu0>b2c*VPer5M4N z?kv_m=J)3C8nN(vzRdqzQ1pmJW&*E6-RkpQ@7HhgX)QwkjJJ5dC|`<09>;`lc}PDQ z6hA15wjS}7v_PGjK8sf=f+j+o#{RaJPifKqcSn}>x6BjTUuxoDz>85}4?Traa=`eLW2Z1+Ux1oSsjTor^o=r*ya!3c>2V*T|V4J<=Um57S65_&0g>F2$j z2pDg_9kTG2hxeBTKc0y9rh9hx51YG9xD~OgyMN&ObHi~_>UouSx6?%s_UKi!&oM?|T&Ll3`DhNZkncpY5GSipa$(E0=617iN-)|)naOA)_Z<1Jq~=GCy_TP^}8 z0^X|9;4MA?>v-wmQ(g@#Za*y6E)M7$gy<466Ens=-jd3)SmIEnf6OO7>(}sk$z{Rv z!2c`}NueW<{%MK-x24*KfEXwXKQtVTqt2)OKkU7CbX3(IKl zr3)yAkYpf|gph=yq9Q0*Kv6-kqbOp%*u7S4=(SwAR&4ij?OjwOklc6Q zAMdTV-nwV4b3S{2zu(<^JxLC|HyS!rq&2TOlnIIa8lq*YtEiYLJ%NTY`XYtR!a+Ln zE?bF>Pp7H_O2S`SSdo@jyBs~HFL10RmA=L%p7d~_GVqnfgLf)aLqk?Rkhwx*h`AZx 
zEb#$M5?#G3i2uTzp-jg@^pRIJG;Ft(SJe!zx*?sXd5^H#n!(k}>IXOYiW_~!OMS)9 zsOLt|rk~lTIW(EmM_Il;&BgDd15rb+;@|PU&B&2G1H|I9##xyUWd~O-Q^uke8r8gP zRSV{+W@V#-cV`QO=a8%9C5Nj-VDuea5^N!Q-^QPpk?*xuwEv7`M%QLo|HLn8h@D`= zK<^rMK(~DTP{vGAG&wK~}Xrknd zvkEvpfwoJg0?v|{p2V}6li@dg$Gy4D;k#IV*ncBBhFh%=EO3PnGiUfx=53CBoXt~n z5R&kZlAw=jE-k|Pz5W}^bcPpmS7a5c#^YaqBwYF%!ykpR^f%G4{Dv?xeG5;f-<72L zyDWT+2Y2+_wr4^ivO(yH49iTJS9s)^a!zR>&v%t#%R)jf+%4VDoTbm9Uk!!Jc(?Q3 zrQt`|5Pwl2A1DM_PM)ae&n5PCu28Hex9=G@8ou)>-&gdgoNxx|L7wRE=WVFjSzg zCpvvz4Q}(BS(5LcirfFNTOT18xP3v3+m~-~`Z5hJR*D(g)Ni=Qmz|R2e^^d9lXMr) zqH1{SQts~N9gSf{D3$AjSibYjw}`)n5>HwB6Qy1ru~=%J0|`e4?Z0Nobi6bsccnu zCBi=qaJv!jvP75f!Gzpq-$UZ_u=xBnA=K;(e`WY-0E$BpTO}jG1>&<*d`=gijpDOS ze0JfF3NC4fYNRK;GKr9+xGkRR2jRC(E`DQQXJA!2d)cMj*q5E8y28^F)fG79CeUe8p0AVPMJpz^l-115-}%O&mDtMOa$2HA6+Q&{teiQsN))up}qZ!z7iE$A1NG)c06^>ysUx z>*2R2*{9fep9Bfbnv$>m%fYIMEl;-}aOJ7TTjg4QG%g?a!mAQO^l?QFkz;-K5%5?jjHro!>p61an_E`A}8~y2}kGu?j zj@mD~0xcSU#o`Js*POu*Eq2uJN0Ajg&fwiHJsm%DXd~ODWJ&k1t5sLzD!e=JZpV8i z-n|}I}i!(G}r2<`XmFPsQsCH|8m8GhM#LsjlXQCF~cBZ86fy%xkVWRVnp_ z!>Gp_74&^E*YjR3#+lOf#LibxJJ|+me1{_EacA)CfHFvP`WB?=g<65jmyv>Y=DTE; z9v0<)1tyVtrJzvfh>ur%a#QFQ#T~rM-rpBqn`*}u5VtA%CKa{iWrmS28Rl&NCKomP ziykq)cgQ}dFCw=;Io0LgkVI=Ro?V@+t|2{7<+iiPlCQDng;}aatvbvTdvhbcH&K&C z33S+w4oGh{6xqqDe~SniuFr za-N+($++Dv^e+Fe9=eBlfvHsdl1*Ce;NPaW{aI;F|C@;-O+HaP{+)^Z3Mt?3 zZ?GrMRv?zePPDrtxv;tOn-{_CHqT9hM}+rjDCz5XZ@ARA&$4Epg}uXB*pjeU<0})C zfW>;Vs=vh2`I%O?e<|KMm{`iSiZJ!IBTSiAm*1xboBe_35^_UD;Y9UjmA&HC-;>gL z4#dlJxBrOd^8X=2_(jd_-+{M5^ZJ9kGueAmW(X-ho@cQVn>XUiwj;{fPj>!Ivhbu1 zbVs5u{x}v(YOlpe>Pa4U9n=4A$+BK)Wp~TUF1udH`|=9i-m zQeunB;c0oDD<||1Z4m#PgMSChkG*hE*WJEs9due1kC_{IZ*~u(hb7^Xwn;%kr1Qm2 z;RXU!=XCprrkk2c1ww(SDBk6E2OdsSpOPS*#WGXe5%FrN;)OfcrSPa)=xII^QI8fZ|m#wM`Q#hf+;lrXF57W2n{nxk9mYsUW-WVtt^Xuoan}Mfh#gB zkvzlc8e}6Gl<13$Rhp39A;IBd%qhgqG7p12Yjl3TN6xk6gr+C;artK?!`*X*<8wX2 z2jbHzKI_EiTshS2FUhs=l_}~%Nzb+Pq9G}?p6Enj;>9$49u=RX>G;%1_#|b(L^#)? zZ%G)$l5}?@Nk`@{C~!wUgg?c5z#-mX#9!2aQ7(FN6SxsA?@O#s(qFY?*>WxHOHD3! 
zh?I7RKaki438^s3<@7)0q+8O+&|ZW@IisMDzZ5|UzNQV}6P@nJL?@ODPVMWCWEqac z97E0TOG-g8$VVw2$WAbv(KagxP&)tAt@AksheCB8Bs5 zT|9;VG<+#ic)3UTtq}dr$8+3)UT|lmaOc?PTL&Dw*qI!7HYY1|X{!D!3fX=*x8I3L z#9vbQR=6bIb)cTvD)OS~i&#&|bOmxzGY39%nF<-f-i2WAawP{o@#X{< zB@g8Pbh#t{bRlKaGDOObwYnpR;?T z8*RC+RH&Zd{9bYuW`MJldu7^|Ku?c?Ve)V#p9+fuXA`fGq$-k8=f65x{e#%$PUUfR zKrr+N@p7CmLC=ZXAoV_eRq16GN<5orwH9^8S-hZ^1$t+)62#1MrgjG72N^p@rgOLz z9;kL0j+CBFNJNefbG3^?WQEka1o;k&CwOg+w1BZsbtP4GhTbSu&tU8iU6?8H!^wx3 z=`mZ5@K~u+$-{q?19MZM-JxpbKPFXB(Y`cyhu z5kV+sY-1{YEZN|J?Tpw>hz+En7#2nhymtWJ*6`0T5t+VDNygHJ` zOZwE_@W&_>K?8FwuOcOsA(ivjk8()vGU+es z87-~LnrjEw{AHL;Nw7RWjn19J4yDqM8T@x9og+({EL|o^8IrV%OM^JSwtyD&XKa)_ zf3hpER+V2JtNbw271*N6y@)+GMZ?r4?l1XEo;&yeSL9KyKwT35iRtPn?hYg-vVK_j z-K8iOjdBMjr($tW{wfbM*L*!wyd-@$j{ke0EAXA7T#l^(A={giS+UArrQW_!i`5xg zSZ2R%ywp!@GncYeY1>@(6}fB9!*twPD$6TWz8y*D$-EIWdlwCopPS_kRIVFkwMv+8 zln!&t6nHwD_abZK6ha>QjxEeJ$ti}Ir*t^hdrwY0g{ZtYJB5bⅈLtVZ+07i53u_G~b3r)s!SYf}Ev4 zkz)bTv(xDyR(1WS%9cln)i@!Ey(;<$xcSah3<~X65xvPc+_(y>{=zHMrVcM27=Dwu zpRs8k)I5#?#{;DJSw`>y;>!)Upm=_&)y68i2k)}w)z9{EiU1c5!5Sq2C+L` zYy>3?Bf6DyZ0{RJ`e%&TW;Yv#W%~=6wu)FGV{mym<%FS;3)ui)U?Jy!cMDq_lv4z6 z%7fVef6-FLONHNIcaDDq5uNX150br(u`69s%U!`ICR*ySf<|Vw#vzc!TH;)6`xO(gIlf(tQKp;Drhx-}pGkbM;V5D@W2^(;^CjW4U(|#%xHHf< zpD!T$7ZNWueMW0kjL@(+juo&-AgHkCWZPJc{@~IuMcwA3X$q!3oQ3yeU#p`0)lZM0 zUs5tj6|2~xY-NbAsFy+2127GhnM7?%D_PQcFr;kZ!REx6L!NCyV)<~3jR$- zc92I)uHe{tmd7NvNXSIp!SaHeEg&pN|3d6C!|Vvs@50pV3_Fr=U_dO*eQRSz^y@Z> z(OMU>SV9DGq`?-EC?yZUhb?keaJDncU6?fw*7D3?qJ`(hY`9rrSb2iQF8}iuOCN06FgTM* z&%}|O%Q1CKNFOe*LxqQL2wgC1%mzyu@ZW@#QI_oo0#}7PHU{}5y}euEU}d z?;Iqf&q&7N9Lcg&=7UW)_0Lf_N94gtSMXMgHbN8=GBTR}tt9iG2O|l>B7vu4(E|cd zPG|guL}#GFo%mV=f1H!{$T{IZVE;{tB5^}V+!eXPhY0IMT1GVbE1245kFbQ zKTGsMqKJAZPIl{$Ak0P{ahASir4ri!Y?-prD3Zc9_K==bzk9JX+$MH{ts6E5DPFQ? 
zF&c}Jb~>xr@}ON5&RGtA2eHqjM2oCUcVv}ZuDMGSL?FzvEw9sEB`+k1%=ID5D!CWz z4;f#TH6&1)!?9Jx0dKp)p9=wcbUMyZW`(zD>~13r$e`!p&riX54!(e}4Vu<(0hIWkISeG_ zBIz>2GbJ~y7wH7XDd=(pk51$x$R3b#gT8RGnShIBI6TjhU5t#dJjIm$ia{D%RL-CQ zCBam?J6IO82Sr@Tg*OW@V|ZfBRa^*=9Apkx7+K&B#-`*5FZLrN3(WDdhmg7VSuDt5 zrzav^=><#}1_Qcok(s;XDS1fHk0=nSj8agt&&&!UuhfvfA5FJWiuH36v2p8&h>OfH z%5m^9ql!X29yH3aDCpy0S2|BPtqZ+HZ;-{x@4)}`JNxLaN2w!JH-5B+Ps z~GyqK)-rtlkV3XSa?5XkP-#}nq*EkZNS22GkP;52mgQC)Db zCQP0!=~*nr75v@cE%O+Xp>c6Rwt&KEtVujihn3E$Ie$CMITPntJd1%9OXfL$gUYzcEaRz+cD6jm7E4i2xj-gBDt%_42@X}Q6Z zo0IsY(^%_ReeY>{ivPfJ{ln#IYRS3PY`s=I+!y-%G-sqvY^`~vnyo=Ew3@J!ZBqii z?5oAL`-sgQN52hbzb)qG_L>c{^t9dBpLjBxujo#m#P?{{a$>rUT?WaNFhPAXUs{K; zrlPMSk2E9I2F(vDm>&9SLISoRY>P@ZI`vm4NIbhJT!XH$oxPu?Bo(M~KD$R|*g&&d zQTLw6E|A&hlRf?{g}3!)>t%+`%lYValaztR0UBMjSstghdsJ5T`|HT)liyM$5Sta@qs>*;voN&8LIySYi^IV{D0p{< zii>ORNTfP!du+ZfQGGLkdDEn?5>$*OzDz*R*LD^0$qASQ!d#|`K7LUepMn+UNBgVT zbFx#|MT0B~7$^2K_E;Ruu-s1UcakQN^+E^BNaG)o_8aQ+-;8yJNy3vX!6mOHn%Q4@ z-9eL)HVk`+l)}CYwZpk+DSu${seReoscbWM`U{wniV=;4>0BCN;k%hD{3EeteVolr ziPjej===WcG~#oI`F*0(qf)1={63F>#T2Vvw&Hj(s_CAi?!LlIdj z9jyI0gjXMW^+eXVy*2(_F8ksJ_5>o`%?1pn8Q58N z>$yCGPA;k$Q{)@O>}5n-u&paBTZD>>U<_`xoyz#I6e>OURfNSi+BbGwNjO+u$tJX{{}a1S*456S#4B5&TEkO-7xUw&~IL)K@-Ze}z!Xxgmk zETc1Wae2uYe;_y}gnr3^K{Dz_5a}+}yQl@T7YMh92>;H!!IM*e>T;?HmW&w}EO}hw zPb9f~cP3D8rd@zrq!e0CY)i8IcTZ8U(sxM`yT#?jfd`UStVKt6bWx08g5f~%!6D8S7fN)?H1>{_f&I23 zr4pNHVHq~bt4P>n{BMi+;e1~$A#EdH%j7GWv*<31G>{!t14C^%LQu81p~{=rxNJpZ z^RoK-ypa;_uXdKcZbiR{=FP~fs&So<;_Iqu<(L4M_Alj22cZh#ZxG!=V%lVtLwR92 zA@^*>SZK@$XU!@L9UNx$dG2urK^;3Bs@$p># z9;?G2EVW$Sm)bach*@vK_Sn_P)v^|$gV^GwNIM+-8--O4WVs~WWF?6))Knj5p}_Iq zRMxb?@~U0EWJXSr7gvM^0t5Yn(`Yi&Heu_dEMJV>Q-0-1PN!tkL`|t|ph?*xTO#gN zNt>^*fXoWWR?ldYSiYq&vA1S8>u(y&{z~j!a{3YZfMDfS>2%g87UJrQSlm}xny3#8gGwznVAt}MREkYJZ5lsV zE4MD9xpswH+0!IW>h4yt6{TomZ|lcs!?W4PnC>b3j3{8^i5Nvi&;fQ}k@`cDaK{i7 zJ5!aB#I-)I;P~}o_U$zVGg`DzRqHjrriQ;l*6Y)F`-uuJlP}_2xeVK@vaz7!@`p%! 
zme_dChT|}~q-79mV=Ncj!HHcfi4FV(5-JjBQELKasLJXLY&WKhdhvfUPw3QCWa)1+ z0*%XB^L6g4#m0HRE7(}O_*7yR4;$8G3vdL{6`Dh3Ew%Mc=ZOOZ%}uo~--(Q~7hqW&POqlI<(WGDG#0_Ep>^(^cswCWXm1N8vwWL^szL ztSp^>0Y(ba&8UDL=}%LbJ2*gt@A=y}R8bOK6sdcADw|cJw9KJ@_9y&;&Mxr{8a-kq zr;QD@(^D!AUhspGgqxub6tZlZQ&bZeUGbLHIrf`@@_16-wy|bh`%lqL3UXea&NeY;1f1y>KAG5@0nVnIC2KZEg-X57br&tck5&|3xY*U4mPn|t7okJF z!Qv{;LO&tP-4xEGi!7-_zhtz$7gaKM@s0$gHIdzEXJ4HoKdC9$s6Ig!ZE`D`8Iq78 zUb4{hI1Faxt2LU-sh6e=#dX)%ZAhMyst~5($falv^Q>o<2m_c;Ta3Q$Pj)5ChJA?3 zQ4^L1#|A@IF}YM)D)sgSzRQv#sO1tbkBlxKU0hT$ZtTTIWxtg*MFJ&bMwgEXSgmVu zECUrrGIs5;jj(W9e|93v3Y^!Uha^WX^9F;r(z$`L{_^tj(W4V+;h3UuI5e(aayD(V zT9GMVktB3CTPQu~=Tv8%4u6cRV8j!UM&v9nB>hCfd;phbi!6^#&I7Dy7gpzJF0i5MQ{| z?qOfnp}BV!4@gcxRvjo~X+ZuH8GZK%c9^8$cJ{Ch8@K5XC2_djTkId7ha-g2PK#7R ztV+WG%XYx(EVgmfwsJaQc7f(fLOnonH6C0~mvMSmLKQBhN$Ov$>=(Z}%FYMa=pAN# zn#onk(n(f+3ldzWN11*Q(>TE+Hc#L-#&09q)`#BF6qk*cEk*b#4^@dp`@lx`%BcF~ zpNONgyOApwQcmz08%|)P(4#fd^@Y+?#8K-iGJU|X73|+Oi-oa!RVtC#Osg#-vw<4D zh7G_t)AEVJw)Up)b?G#XUP;C75cL}@SoA?31v_x1Le94kzbA!nm@AGGXOq$*(Tx%m z6jIoE=UZO2vZI+w8DWVe;Uzm=?3Ta61v+-=Obc?Dz7hv6alqmVLR+Filc}g>b$LSV}&CV?C-S1rtq8nKzb}4>LmIOQdIGa5oSs#|H zLsa?J1dsm@r*sRKhmm!+q@soTMJ@CPF`#(HLRS()7X}wX{=iuALLTJw_f!j5|FFo& zEBV-7Y1gD;MS60Z?6TAIqQ3=_8*>C+oHi(=>k@dSl~tgwTM@8dj$OkXbLa9E80=Xj zeKm$b68ch20mY6!6t0Ooo7s}>IHEv{IgVMX$#&M7B%dYv3C^Z}*m)i6TRij|jc;J+ z^E!QFNo%*!DKg5lZ7 zCHV*TLm9qLvNT}D=Ep?+8ylyBOn?Pz@^DW^7qBjq_hnh(Z*6?8LOyhJ1~BDGMz5#p z-$JZLu^vT!cFSiF`c%|wj5dvUsZIaRCMQnAo=}f8hootg@X}bC>zDfz4js~VxB7#P zzwg2s8v0STZdRp$DExevoZ?@YO#hHA8yHQo>OaE%%VcqnQh(xy8LMaBK+yq3xj>~~ zRX#qOc9VL8%mzDfT*h6JBC~!T>76ubkCisyAeTjYPqKan8!)YMuI&>GJ5y3hDK}X9 zfr^Tfzkw40S@Z!`4)5(x-W1WjE93lmJKYyie^vP7^JpoSYq%X%UY^XqX|aLZMzU%hyr>q3H$CXP@f{p3m8q)m0QQr1(`BRxF=1YyIIRC4s6{dUu;GFEuF-u zw~UjE($sMp&M&^(i|u6cVhbN>rR@?r<-9?mwd2*F^ZYA(#V?|{-)Xa*y;-VcsiiAf z!rpXxJeys?Y&OR7v!yE--8J5d;}@HI$$fG34$bO#u6vk77qd|~GT~u1@hiQlUgi8%a>RY{2B@NiA-%DU>> z%BI%li6gchmB)>%88@ze+_;(LmH4ilRa7}^T0UE~F3%hOaw@_+-#@jr-WYF$P{~@;l(bJ 
zUe+vb>_pFJ&)>iv9b`SBH-EpM-oVe0au_u#%A>udgBbBVk+0uP>zo6RCMYN+E79G* zLf=ap*m9?WUxOZ&l?U@_zREsS_|ta2WR=HXR>QXE@vmiCrOMY`N?)iL^Pu|llonF3 zp{}tIgM5+$=+pnGNewpImBQz3q}{n}SB>REhkCy+K|xclNMrp{DA5_JUZ!52;0ax@ zF%YT@*;4qmm`t`w<;W;`Y>kF9rIaFi?eY_OXb@P~$_7cOBj}e@I|dzn$=ULyN{jQ@ zO@>euGu|$}Od3^S|lxH(uMnag^+% z1m2tM|HiouqHi0qK1eSn>i@=gdlUf@-Nj>ZoZ?csLX*C;K^Hn@yOh-^y}pd=F>p-!X}Q<3=lCb1;^f zu6&1+s$Si}47VICf!&!*(+Ofsnz@qx)Lgbl2v>a;tqAxhNJ-WqIdXvw>zuTXQD2zj zjGe&<*CRY*S1>N##>KANt@)NqcJk91d&njYVyM@ox3=4ca(;I*%Og7|D@hv4nVO0n z?Hnh90Ev8l?6vPvAxZ>+AXuWYSa3}R)tuI7O|3R+L^ONacv zs`^z`r?r|bw3-qzJ63c8G zcgVTJeR-E#Ru09xNXg7(u#+L;_CeYH)JUe|I;yrfEf zHSKIW-EWch&#^<1K^e{8y66S$u(Sz#{%xoE`RE?={Z^5r*ldY@@ikaS)-0Peez7K( zS@|bc**PGi_)EJ7uM@8t^w2UF-)5%)ZX7>$G~0Q&FD3_u%8pdQZ7`11JDWA`+_NM~4 zHNoc_*U;2@`s7s4xf55Sf?C)O!!63;zj7NN(Di%UW%vK=0lDGfVC zArucW3*Xkh2kNz+w)VL=Fm))Ut^JS=eQ{g+1y1mRtHIw}($?N&A=(GL0JtUG)*jYy z`BQ0IyZGta&Y^AXWx&0|AO}1=ysdpFkVe8@^!dG`;12LG@Br{Yd0RWiN3?cyTl+NN zPT&&Y-Z5?MJAnJfwzaw6*tx|C|$G9{%wHKL<_&VzV|i0lmPrz%XzN za2jwYa0zfPa4m2ja0l=Ja4&Gn#J2W>z_pWLj|dmg3)~3|1NQ=_0S^K90iBZ(KR}v- za3lQtfJ=bWrna>|3_Lsy{uAMz2|m(cZ&O=4j#-j(Il>DpYDWAaev4Kjeh|MaSHXSY zw$%tHaKRe5?;^SmSO(m9Zd?08;J9`0Kk)E+m;>*wxDfd3l`sz+b`8t} zYk~WKmjVv~6Rw5%B$xwwf!_klfR-Cz9(e0bFb`C3hI!xt;6C8hx4=9wZ#T>*WAg>j z3)~AV13GVodEk8DT3{1!C-76?KHz{oFb|vz#MwPs2lN7W0Ly@P0~Z2c0j>o;cN@$D zKL+jtzJEK+1Ahl%OFjK~2h0QAcfvf-b{EV8^MGrC<-nc5(A_W(TnIb_d! 
z0Y7>M`4bJgZ9noo@Fn0};4jak`~xSwfN}$T3-~aQ{SEmRI1YFecp)$it=#MbFb{kO zI2SnYb(jbK0Nepw@dnHTuLd3j9swQ&Rvm5TkT|- z2^MV&v*70|Vq}Q2?`vzv05<+~FE3VmqVi;co2aA6;jD0^P15bFw6!!UW7vQ|J|+vJ z?u`Y8QHKh#*dyFJ1#c1P{RMHX!?~VMw>h-Q_A&UAuW)w_-ZJn}1mcgNZ^b*Ii+zHA zCte}n(H%j57H?Pg1^s=z!hH^mJ%WA&Z@+Hn323mqUFaen@sZSsJ64xFng)S21#){K_oxF(C30)AZ#fY@x0LZSF+Fw`#uIgT-4T%CLsO# zBGCUXsG}XuO}yNZwm}-}@UEA~JBoO%&EW+FHjWm-Fw6?~Fr23}xHni(4EL7A-4pRJ z3;cfIe%-NtK6ZDjaqM%4I#$P zm^Q^IITq<~zcSoiW0W6+2|VG~EwB%xPm+h1j23ZrDIe`fyF?o8@NSkzJNj)>#yW~N zsO66EddoxvQF@En9N}_DQMscZWFQMOh|Ni+v&Ow$=fS?Y2v>g*BjX+W7=ObI*9(ZR z1BkB|5MTR@_!8l|U&x~zv#31c_jV(`x{F^S|1spl{3S?Fswq1-$Jb%*+6io?tAaHOF@6ZHpJxO4%x{Is=W%a7si zB_jWCmf-Ftk^eU+6^^3y>Qu)x=_1qyqAEZ^h*t%|_o!3xZh}2~gBYs_0!m58`uNOf zjfJXXGLGxTD&@&Wd_D_v{X&R0Va|Mv`VaIX(ErY(qYK|xAk#7 z~KyJm5+Cvp*`vU zdF@F?GL1JJo!?oF)Q!?lHtNH;8m&}Z{zTDI2oF#Hzi+mqJss6>cdF4=i}vh@C@+iJ z+W$a&$=iDtIoAJ)V*z>5`*1i>eRFRL( zHWDduTpKytu{Tlr!0vcB5$#^$3BYne`I>d5Sb4XlwcIY9&%4Tgcbw_1G0KUM9Ss+Z ze$gNB=ixqPzX`>80b)j!Ls1ScsBLQ(6BzOv(ReY-A_}hP8%FLC0O)}rU0*u zw6$Xz)vPbZB&t#dLl?s6JJf@}Z8649X8zoO-RSDL^^)|2Bt(oy*s~RKp`{pu35wBg zf!vu<5y8J7{Mq1tiFPy2mp+k&gy6pceom9wUyYgPe>E%+{3GB$4*ngT{JGokr{E_g zp#8^O!Q~=NW9FsJJZ66o{6|_*-+>+H^8<1FXMw+LMO*t#o%q+q`B>7Y@4+|v{W1He z8*Ib>TfzSf{286hPmP&;p@bA#rdt^pR%T{9ZP=4;!oNdkKe7} zzj=0B`)Qs0R~z@={ov=U>)C&AfL{cDRVVjPiMxLU{N?N0+DCWFKO^G#Ckb<1S8O!L ziLvyl?GS$Oulo=DS>WFa{#hO4kFSo$e=GR%1#RsocjC{9^S6RO3jB?o`0L{Q`@!E0 zetM_)ONz(e8{pq?5#}m7#osmY_&WlA&LwT_m7U^eems7}kF8I|oYae*+VkDG4=KjX5V^XFFZPXT{WNArsf<3{W$MDVU z5PtBl2EV8HNy41$Rp1}#Xr6x;kDnm;4Oh0c5A4W~9J7BG_^ZLM>d5EO3?SmS75u+~ zk8F4>eI^| z;_e>-e;@cyb>cr}@bf_vKL)MB-1;jW z>sQfFdB&(>g(;&R8V))2ddyV{Lad$O(TpbI=@jrsg5OguzalpV`r${DgRXjRpS!!S86lh_8u;6T*}cU-ghXw7U)etOFqyUs7A#ADh5` zW=~st-%k9zxIeHEOfO=spu2Pv@%=LRORzqjg#YdYKl%Qi z%jL`9pAP=oPVskUJpR50|D6ZX@9Gr3t?}?BU@c|FLv8KrJMqts^9Ozx>g*_K!N5|7YC%o#4w@Q@NuPe|Mb! 
zGWeH*|8pn)_i_H$;MYFhb9qcagHZ$iJ)O+o88<%&{GY&oq!a&vIDZQGmp*~Do=*IW zHb{0%Rd^I!4$1zSkW`1>0C)3H{zrjz~A3Rl#( z31UqSe2ZCstmh|+$@^Wl=uDSg7!!TcAmm|*5URUG%o2z>v2uI0fkIx? z1vQXA_)=SYtsoqm4>&e2H$lvqPY`qFQypc^-Ob7w1OJPG|HZ)nmt$bs31x=&rVTK0Y7~o1y1~RpG@0V*iYJObMF%?e+ji1lgO)@* zF>`lx(Z68o6QW|Yuy>iE!c?E}Op(eBL>It#_Ka9$U~J6@T_fWWO{{qKnS9Z&5|24O zO+U4@FBAG5yi5R@&eAkTR4=;j;yKF81Q1#?;}L5`;xRfAz?gd&l5~1R*VuTPFgkA@?~!~zqGO{_3+j)@Ty zSDLuN#4AnQZQ_F_?lAr~tCKj7W$34uT935P4;P^3PhI;$WUbLdIb%i%HI5@bduy}=`^PTN0 z9$XY0?DO}JGH~-4?m#OpDm4a8(TYqnESW}8%Z!26aSlcYSvVzZiB-D`v2$BI#(=9E z`MHpk4Z6ngUo1+%2>=qK$nG5COuG_noYK1mq8xXx37uwnKmb#B7(8V(OiQwY-|~T{ zWPm7t1aW!&sbI(-3H^CO34`+iil%B82`y7^5EjTX{wxHh>;(~jAA!U{VWm6}AGPQK zd@X(PIcWJFM>I(@2tS+z3TeL!<}c?7rFKxLj|#O@)6q6qeiiC8?Hkx``Aw+3+O0z7 zccJd5Jps!se+YGv_Ozh?Db!&tLD1WTy3869_7D@A3D&O!gBhA>){rp44b5EZt3o5Q z(;%?WDz=j04>e7ICDxaPMq|RaO*qvDO@g6mwTjls;$XFa)z%W>M>h*Xv({Q5#5~wZ zkEulKYl4%+L>s@}IzVXB*am1Wu!?5MlFOO^TdeO3PJgxznjO{)g`)!u%}(nUp&4js z_E@(I%^*Xw*LuCs3}!dL%){b#K(Y)m{I<_3cKca;Y!d{YB^%D_ljTaw^B|^Kt`&jQ z)(E552_V5Ak)8TO^cQ*yvb+55HXKB?tWP zEE2X|ZkV{)Pzn>|SSQ5l4Kb0{QL*}CV)aq63BvtK!~F$9>~wsj%4(rB+{cd@h5M>@ zyYPSz@f?PVB?=>5OBTEKSp_ray9Rv-eN$&+kDy+{q(-itz#a3cAo3Fxmmh>Z=oyn$i z4UVOK29L+2%3zkI;Kxb$l_6}H3LDaY0e=M)l=c8r*O>N+OvlnM5RwN#78ODIb5MwL zrx_2JLRM5Y!qFDtDDxRkzL3dgz!#Y07SXMll@%sljECjECajzW*7vX}dmI!Qy-cAr z!@8(NR_Sn2xoOC-skbQ<>!Y&5JVcgZQ#Sk+wP|FxHceNfHVL1#z|LN=K=y`%Lt%>u zgZ2XTdfE)v5YQ`4x)C<3^mWWPIj@>R!ywe_{SGEB>&e6y9Zk$Iy;uR`51J;d?UZwp zNuL3FwA{P_u2GuSllON7QQaeopm!ajOufxeIuM2G?LwI=OlhS^Z?Z~d;}!uy6d!H8 z$e~tgc#Nn(T$>Ul_KOj3hfP*{lsG6xd;^l!6j5barTiGNMM$PyW)OoW(J1EYkpm6y zo=CaXf}J6L>ur_x#Y}!8h?!C1LX()+Skpk(?5M-um}EABXf-MWt5g&tJ|>LkMkT{0 zv3ETr7vZ&$P~|6FZyU0xwxzYQwV{c6&xcp1;kDsPY20&z6J8`JV;gPjR&Z8;*tJnU9VYp8U^4ZM zZrJQvL6w$*{T>t(VWHG$YDB9gS2Z_7`yqI2x>83>{Me5~Nmc8>Fm!B|a+;|ugG2aB zLX`d&p=qc??P7@iDzsHNaV}h(hQepLS$NCPwn9r-IWir% z7t!KOMR4-TZXXE}{Aw!Efss>C{SLJcvrqKE9UUU@qX3~t8z(vJcEM)QYKd`Kas7bR z)G8A_LTiiJgg9qxqDPoW$fb#%Az&sJK!J1`%tob0-Rqk^(c`rT(}kF>kfg;t;e;mc 
zBbR7H-6e)$PiMm-TxDt7M;kUI9Wx{@VMz|g?GRd&ftdY5o7#a3`_r?%UE4Ca8g3E(jaS#WI52pLaJHqjrf!)(s%|~K2{JKqO>G2flvzUr zN|ZaeUAHW%#NV=)HMK_7O)KhKYN5n4e9?-!`qsj_#*tB{YG5QPgc8GJJq;|c$Xq%^ zlVm3CJZG+w#Fdgz8| zZ-G&!&4qfOAwNaCURv>pg>@70gnM;rt6QUyBYYXF4b$>xc`tm5 z_x4PjeA(Tf9K1{tr5N&mhHAoim&vA-~ zR8!aNGjuIgD{CujVYFq*vgTH=7|)0wlvlP=1Aa!htlH3tL?n^gm5o&mwb;AZDl{}? z$+CvpA<@y@5d47lv>`PuRX$%)Vat-*`ue!0ZZUY(Lt2`v4Hs86*R>kw`KdBeU$wY} z8e4EE-(W3T5vg2M)dGJuR5jz~fEI(=Tw7IRc(JarX+^7%>pJAWVX$Vx_8K1}Sk=unEla5qfx;L6`fO<>{O7#6693U#(SYb`TGm35=%3cXH}YIv zVN{={Bes&)eJjD z6x1P|R(J4PTt@g&q)#;Ha6}}CaHz7fsj3F4CDO7uh?Egh(O8T|){4dzEwwd%k@_Oo zf-PcFr0Vz+W_!o@d|d|I-ol2~6^*qca4AD=b6qvGRn65)MxcTy#6N}$7kdj&^m_}9 zT-4A6MN4Z{_0mG5Vk`3WvgSgRc0{cx^wo&%%0~Rh-q19i9`&q03V|pDqYyH)Smyv& zMjAQ>Nch=E;l^czY|@Og?ntC1Rdq-o5q>lDi{PIXP0_p}suFOGyab0EmeoY_xsh#* z2x@Fuv1k$gC`^_3!#RpUtWb!EGV*^k8Y|7jHxe3&B?2wfm658tdN|)9N3=FoH=Q1r zgG&t}&!JoxkuP!>JQ}Y8q6!lK2o)8W@t`>~9$W$)Qx1reX2q1T2@shud=X>TPR}e; zx6njf7!p(7A1BQeQwQRS*~N@*UX*5jn@v31L|C-RRPH!Nidpo^F_Ni@LxHJ^`JYid zC<9Ga_dz+{;Ov+kF@uawio0VfVyp^NuhI{{ql10i(^ak;t736kYa0F!A`9ap@#sEg zYGzl*W3l#klVPLyFz&bnechD#_LyvJpmv;W%pKiSAS1<;xL(tEkkT=wvDF&HXza&y z#z7NMAsAD}rqH@irl`H>U78A{Z%oV~8yQu_O8N(Jsp^4d2*dG^R>hRD9Kh*VHD?+$XJ63~G6_x-Jth{I7%Qo! 
zHV!fRP?Ivs#EQ5?XT@YwJKe;t#k;fg|Ah`6EDH0o}Z*U zRl=j)%B%5p!_M*++~8K$kz}YPtU|`t={hncM0;$#uQMGN?WLD>BpGVly%=8~?Z}jH zS;pGl-N`3XSvTtg5+)I1>%E=r(J8jx+L`X8g;IG(lA)F`%b4ctj_#(1Q$}|q@1X)z-N~b} zFxPiGS|(u^UelQF<{T^B=CE$d`NRJ($HxC?!-N$!qr+pupF2!{MivkB+R^8AwZu;LWzsA{6*xwg)M#st* z<|2$|xIxdRJG;csVw1kJ3;kS^etQ@C4JQ4zF7(Gu`kpTI_e}b&UFg4>^xa+PshC+1 z@pDTTx|o3%^qafTXPWd~UFgeA`b}Nvmzwk&yU_18>34LYKMA^#Pt1H6Yw&Pupy8jp zy2yV7dUyHca1V4br`etSM@Fh0Uw=xN=j={C0d)KWg!be{^P$~L_dW(ai`I@=WKf_r zoWF?)6njQS^L&-|GEg;9E}L(yW=514e9>Fy?zez*($ z7L)EY*RPXI`6o`hsU+oe>dzozTJG=kWV?T{T26(VOILM_WF0jp5xn7mx@Vn zVWi75(4af0aDSP>S#I)6dXS$D`i-bh4w~gR7N6&t_MG-r)E==3Ks?((cR@b3u!C%4 zJXe|WCHIvXdNH>sp4(0NSoz<6t$NXPv(hvO+rLQpUIT3Ux%H0;T+?iyW1G)&;PBUDwG3j!X ze(2PwJ+biq#k9ZIEm3-GRnPVT|}wWj=*lcVzg zH0AH>LH-}6eA-J<`KL^I+zH-o{QPdp+wYCa4>0AO=m3axA8F?2n@oBx=px@XnfW$W z-cK>)O`#63tOxpf)Bb0kE;FQKBNN>9Y2?G$T6Zkot_NMXJI&0usNRj|b(6lOy3EjD z5+zeII?Uab-=PNGL9-r@$`3N-$C>oKg;Bb3M+bP5da!4iDZlUPsJw9}1mw@|LH=^k zMR_^UrMx_2+VjZ`QG1NFb=dPJ=%PHv_7%k9;eAtn?Tn~=Z2nGMVkXjS@4+bDI7b(a|``4S{ zU1Yd|<4Y!e;fqmsA2#V9gD%p2OPBnXjs~T>^bHzx2hIL7YEKo5S~L}O(cXp4_Abq| zf1@d1aY7yhdH+U3Cj9B`6Q!GE z8iDd*a6T0|3&zHm~iNd5r+a+}^9-$Z{>&ml9W30Zw%Ipv5;9 z6u)+@;2S=0gyEb;R+VX52~P| zS5b+s&i$sqU)6FiW4PXgPQwZCoq$uhy~>x&`tCNy?{QvGea!URT?#*-^X2!njISo- z|G4IVyp&~3jsb|Pv$61u>;DfkyazFL1)pQ&0Dp(VC-(`Q>|%7xCgf8EoW{}7aZC%k zb5{cX6M$1cWVIiRzKSJTY&|zHyvMmh`TUUPzbOHKo8rIuaF#I*ofH!AZ&mz9UKae< zH2*(WxOr>D?1~;q$ma)&zftwT8#SMo0H^tD-XS#emwf1nuD+)A98-LHdxzlP`fsVv z=M?@Pg`aUu;J>QyL4}{M_5TZnk1PDGX9VBO(^-Xo^Tz@&ua^Y#fYZ3tRPGyn=}#5^ zu^$QkX9V4OK;c=f|FXiLWVqghPR}X+p}8zWPdiN>o{x!7?X4-j8TeIz6J4b({!mr? 
z+g_J^?vres-vONJKWWj`LyWITq0{rfEae_jJDbsCzpMFw`VqSHz@u?4A+~`>7NqdKhk`@ z_Lk&xp7#H%3HTRWK;?EHzn%b}1DxBd^kekK7tf@8y7!4b&H3P?9!iHBwWwPZKA`P2 z<8lw+G>@{XU!A4+U)6G*FJu`)vF7M_mhhoZ+Uw{SIe)9p8#SMU4+%cU%mTt6Gdzqb*Y_sCzmov}p9FZ% zMN-cbN{1I}yRv{&f9}`*oYX4p0G#OUmB+ITG4wyc{7R8Rr@Z2?YX6vVuPOZ7FGxOy zzpX0#IUV2M(7Zm!@KDBjd57X>CnX=+x1i(E1pN13EcO4{ce9Mgv1o8L$Z!Tle)a-> z3HrfNeQB>g{Xzo%y^MbjTogtab5QYLWPI!&&1M<$xWZpoc>mo3zg_9%-O7KC&j{Si z#|sqxQRV-)YCfA7t~a66NCJFO^LgY6$=}yB%L@PCmj(W33jZwNOb?dc_+`a^>VCoJ zSa(Q6@z9{?L^j@nMGdIRE-s zmLI;szj)32l6d^7URf8Mu*UZez^T1ck7PLqOmjYt^a}j@Y8T=dR(SHL!vE-K7I%2p znt$=(>R!RWb8nU*_bB{Lg_pl5aE`T!XTO+G&tAZ39Bsb+A;t%Pw(5CxLO%Hf_}3HQ zPiy{HsGf7X*8lqo-*-al&oPzo?1aMaTsOj)f#Sc+aJ>ng-v2Aw&tJ+i^n3cuRrrxl z2;A_uWx$C}j^_pbkm5h3_!qn&@Xss!Ifm;^=(O%qDfjyC2>wl{$;0<4{NslO{;;OG zoZ)&CI_**Xn_dz8ztd++fRjFRMD-R!54R}%tPRpX+ZF$*g#2GhfWMgl-|zvU=O-|J2U}|F;yMUVVt;1Lrfm$N7ZelUkom zfD=DIYRUJZgnV`bPV{!dmvaA8%Y8xN`xSnM!q0=CkE_pR4DWG*TO=Qj(Tg-L;JunQ zeA=0iPoVky_0cS&8~%0!;Box(frNaHCBWauc9f^=n(23=jJTe zl45GpaWmk(d=>pU40s$JKFaVO=R%biLz?W_gnWLIkk1)cN`G!Rk>wo5G@o-Bt~a66 zM#bOsdX}N4|F2T`l@ANt$k~5?Z+!iyfKPg9P5A@R_-JXl=_@541v`JD`ER&g;2e`0 z5ARd>(P@F7ru@Mv)SHcFYknTy2%dM%=#)1xK0V`k4m=`)g%UnASBEE{2mgdh0QU;7 zSoIc`s(F9OE8@-mhUd3doIdAv|hpUy*FYks3qt)vRA zqCX$FsY0#h%=^uJsWRp_eaD-d$l$Ay^mMjj+P9@d-`ksQmX?+oAuR}7{pM&33Ae)& zfLvkbc1<^j@MvYlb=?eH07?bC^G|L}ypm~eZ>z>Qsaf(w2z;hoAGwSU!(%O zR-rjQ3DEM4hi~~!H#2F}_@G}edg8ZGMPh#;Rch4zRA2i4r??$j-sY2(hmVI-&-9ZM zLkS)%+(A=1{3A=!FYW+F$#0JF_cdq8g#%qO2(Ok_qdC}E1*Mb+i$N2<4{o{} zF8NCneYnnrP&ZTOqk#NGU(K(FpSYRYl3xfG;f4fPo{5a-uK6^x=F`44pZ2f$bYRV= zgKIt=TJ!1VHJ=Wz@w5-0HVr>E{km;p_m+_f3|uSkjeBFcEwkIu+0)I5&7N1P(oC{E zj*Oof-1@FdVdm$ZQmp{@AD;h{Q@NR$ecsmDU86JOyLWkBsz1GX*tsrP#iT6MgE9t{ z@Zo@0aKJ6#(*omvV3$n;F?Mk)ski?rYiXwu$jAqh2P}m+F^s_r%N3^WZs#50S!k$#0gb zl@4YmMa;zJ4DO)%j(NZj7OM5t+;VOZ4sop|I2$U_u#a86y6TWeyHKUqglC{fu373Z z6)lK7${IDRbO7PqSPr77Otl_p80C8ht9AJF&CTqw7?_(8Mn3lYA-UDTN!4pqi{ukuP}2gx9uys_<+ea&bv6tCXM)t5 zsYW&Bx~VjIUvj6%4mGAFT!705ikVik(1fdEP{!y-QP~%gJSjc$B~>p~>AhlLcmx${ 
z)QCMkh_AaM-WXT5_7X77pd<8so$&K5|MZ{&OBPbjW+L{mT zZ~@iCm=zYY+0ngwN4AW6ZmQ3891}Lw8R0S!vMvVj&ZemioVeZzZ;-PZTXyAkETLxIS#Vn(7 zn+ApWi)FaPcA;ivuvkE8 z)q@^}b)mT^EDQ5g+a%Lexe>`S?%N3;U#wr;!E7tAOQN3AY{^m8 z8>^K92@>SeB@Rzg;Vuf!xo)d+U8Q=kLIVOv@)!ozo?7iH_LJCl`#eMvs02$MrbMMV zz6+G%Vr7KI3TPW(EZf|{Nxj=zt&eojVm{;E`K~*E&0RSfzS-`kMfiINLoj}_ue;M) zs=3-hi+<(329__MdC%pRa=Gbk9)6c+(%y^a`;BfI z(qxDjDY`&ZEdW<_%GY(Pgy0VuR+A3~rG>?2W3n(wvS!+IPz(ibXbbXUiRxQHGy|e> zTWdTCb<(m8XcW8*ubp5i?uWw z1;4u|sr%>aK@uB|uJB>$5F1FY0$$uLqb1-*WY&{{5ToTMsz=^jKNh4)@hav|#++jr zHi@R4K^fr`;%X{%0yU5WnuI7*6mTQ*F`R8#0YTKPc8Q=aK4!)^UW76OBWc*Gum*?$ zpSBpGt>xy7l9r>T#x7{`aSfv8j`1R4ThJV7!rYW^HDfW~Hj^h2CjDivN@Nz3-d#~; z9|DPc+)O#QpLlAaX-(5^&BJcBn}%hmwCqD#Hgi)un7O7^he05Yj)t$&oTjc0V-5V+ zsWEPpIdqr0_F%2GzDSrDaYLikVx@t5(3=}*ksc)rvbsk6=+u=KWHE$cDw`mTg-%2p z8)XIC=wAwasqg8q#LDGLu9p??jI>lSbymWF1fH%~H^a%L+4F7cd&_D8vssmFzHi(E^uC z!9hcE)B}tOP2R5|zD^(w>!j(7zDy;FXv4u!E7#XxlzaCh@G+5H1W_jcQ?d*s}%TP`HcOVxi%cq7;jA-+{@66eFr~ zG^uOLJ?a}<0Ic0CRub&)j4nFbLWb87m{qJh&0*t%vR(F36~;WwHrqT;3)S)RgCkHY13_fo{T#w+ADaYLVg+PxYX zIZ3gA`%lfv;ODPydnNx9RLKk&HG1jyhhG}9mj@Tcy4#LE&_=zbK z)tN@KeU}*-nM-0c3O56Nzu*}aeSRAHd5CinMRXDKXohu*x4m7Li_8EuuA%ZQeo+`YxtpoQaFq5KttCW=v0$zT)jIgM*f=IJ zY`o{gvEwLw+7e87u)A?j=WAF6d8Lkww9^3@(?ul8Dk4L!#M&`0^PrpD ziAh1`u*@fWOE)t*r4i|73ecr}%iJUV)j)hpveh=LmZmL_X=F_0sE03?uwb!Y6Yj%W@MY0CYL?N=!)V z7qQxfn~%7qFn^=f?q+^1pqO8%wSX4*l~!$XvuT-F|4hR0ju054Z!ZPCJXYyKMxn}3 zU9PA8oo;R}RT~^xuzwuedZV`GM3CP0ivwv`{p$g5i08`XSi}E(P_-;59L>^eECm5m zO$N4m&m#fh6&fwx03;QjCTy#*I127H3L{Kup@p!CY^d#)CEDZ=Enp>&##k%)wRtr0 zlP;!h5y&acoV8YNG9$1YC}3&n(MDBttuRU}k~k?S3Bx8}qJlPg7_*P`TeWsd-DXHl z?E&q5s4k^2s=cVCD5_kZQqLeIizHZmV|F^wAd1ZzoxgaSBgw!UW7!$dt%I1YuZT_$ zHiW7ytgOHSUB{a^{t^tA-f}u3977XHSe7gonPei?-GVpkJ#vxr~!ECBwqr0>GZPg$_g2=8wC#0?PHfu(amUTF8d)WT3R%NWxNJ0X5I&g{hWH!n{?&8Anw zW>`u0W-AA2BYX<``n>->I*MJ&w3$mR#+gUAMTNDHI!Wz#Q;mbjMytns`0cV-+E*3k zpPMQh3g~RBh8QQkuc7lxOxk_{Myxt?=g`hp<5Tsf#g5xtfaNsAc(G+|2cqH@A6(O5 zR#J=E09(OGz1EF!On{h8Q)YnEYi$F^EaB}6Aog?n^^Cj+D*#p~v}hqQvXvJQ-BJVk 
zOfgoUAqr?~P*77G#Ow}2t5ja-BpcY{iI#XD3ptk3&}mE71}jxnY{nzB$rL99V&*Mg zM+k|FbPfh90ln6PrBtoDR4S~F&v?0<*{AUq%6K4c@T7Y& zAUFnXo1^}h*Y zsCS5IpQ0em#ZCCsLyGPeeMIO^6;~_BPtJ%D&QF#h%`kA08xPO0yX8{?y{h4LrWEZd zIVoJG7OJ?wUJ`zpiKOb-;KzHYsme0FONqCJ=u!-#(~v_@U7S>~sBgnA7Eyq?L&*i~ z0r9Rn-V`O92p1$T%1Zv``al5*Q7OM%Dj;38NtFSFi-tbyr0~kF6jluKtp%j>D+`#^ zDY>QnD0A)S^L3cC^$Fe@M7e^69yXUyF3hFoWZp-9r{@cikX}(X@8fS?m&F}EJ*9w) z_^$I~US~FtOL1`~{!RLsby?gwh`-9Gu6M{!7jXO6#Z@?cb6u97H8nllGa`qbp570C z4c|I66g^QoZze)ebx-9PKJCJ;G_R>QfKZk#6yGd`} z52yFd>6Lkt-h5AT60g+LVZI+Jcan#>rt>H9Pv34d=}r9D&+7~8uPCyqzri!->+qbu z1!?Y0oY_P0|E7aJHAM&2pZ-n#2}Z~D_@_QK=}kP^Big`w2nmNN-=sI~zY)(U{T4#P zVdB>Ag`YDW=KGhFo@mWZ|NDSZ6{&7??7uXtXFeekYxJP9!pF_H38z1#=?@WM;y7wD zmRopC$YIWZh}&>_6E8Q5_xR~B^_9Es`!C@^IK7FhJE`e^`p+h%oK3j~dY6^n#NVA* zPq}siGxH68?DpRa7-5?6+rNH52RHF;`=_^kB^M9i6i#pA`M%ec^6z{pn!xUl zzqitxxW5m+M`~o~#N=Y9f7(iK;sYPo^w%1~(+uqP|G-Lb;s~cNlJb9nh%R*43GrCt8(fYDSZ9zchQOMK7Or`CQNrn8aGq&MHMJ*w&L`r7wrBR$pM zq&IPnpFG9%6o{Uth)HkaAs^TDz0!>K{50hoIzEiF;qpz~8qOF?!PlXCJFB~1EUo%>CO3iO>gRF&PP-MdEJ_@CY`|}|6ChJ z&%Uxx%D>Yf>e2v`9W;hLBeHUPH S<)8BzNq -#include -using std::cout; -using std::cerr; -using std::endl; -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include "cuda_fp16.h" -//#include "dada_cuda.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "multilog.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_def.h" -#include -#include - -#include -using namespace nvcuda; - -#define sep 1.0 - -// global variables -int DEBUG = 0; - - -// kernel for summing and requantizing -// input array has order [beam, 48 frequency, 2 pol, 16 time] -// need to output to [4 time, beam, 48 frequency] -// bp is scale factor for each beam -// run with 256*48=12288 blocks and 32 threads -__global__ -void adder(float *input, unsigned char *output, float *bp) { - - // get 
block and thread ids - int bidx = blockIdx.x; // assume 256*48=12288 - int tidx = threadIdx.x; // assume 32 - //int fidx = 2*(bidx % 24); - int beamidx = (int)(bidx / 48); - - // declare shared mem - __shared__ float data[32]; // data block to be summed - - // transfer from input to shared mem - data[tidx] = input[bidx*32]; - - // sync - __syncthreads(); - - // complete sum - if (tidx<16) { - data[tidx] += data[tidx+16]; // over pols - - data[tidx] += data[tidx+2]; - data[tidx] += data[tidx+1]; - } - // now tidx = 0, 4, 8, 12 are what we want! - - __syncthreads(); - - // store - if (tidx == 0) - output[bidx] = (unsigned char)(__float2int_rn(data[0]*bp[beamidx])/2); - if (tidx == 4) - output[bidx + 12288] = (unsigned char)(__float2int_rn(data[4]*bp[beamidx])/2); - if (tidx == 8) - output[bidx + 2*12288] = (unsigned char)(__float2int_rn(data[8]*bp[beamidx])/2); - if (tidx == 12) - output[bidx + 3*12288] = (unsigned char)(__float2int_rn(data[12]*bp[beamidx])/2); - -} - -// kernel for promotion -/* -orig input is [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i] -input is [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i] -output needs to be [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] -promoted to half precision - -launch with 16*48*NANT blocks of 32 threads - - */ -__global__ void promoter(char *input, half *inr, half *ini) { - - int bidx = blockIdx.x; // assume 16*48*NANT - int tidx = threadIdx.x; // assume 32 - int iidx = bidx*32+tidx; - int pol = (int)(tidx % 2); - int chunnel = (int)(tidx / 2); - - /*int ant = (int)(bidx % NANT); - int time_chan = (int)(bidx / NANT); - int oidx = time_chan*2048+pol*1024+ant*16+chunnel;*/ - - int chan = (int)(bidx % 48); - int time_ant = (int)(bidx / 48); - int tim = (int)(time_ant / NANT); - int ant = (int)(time_ant % NANT); - int oidx = tim*98304 + chan*2048 + pol*1024 + ant*16 + chunnel; - - inr[oidx] = __float2half((float)(((char)((input[iidx] & 15) << 4)) >> 4)); - ini[oidx] = 
__float2half((float)(((char)((input[iidx] & 240))) >> 4)); - -} - -// 16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels -// for first time, launch with 3072, 32 -__global__ void printer(half *inr, half *ini) { - - int idx = blockIdx.x*32+threadIdx.x; - float ir = __half2float(inr[idx]); - float ii = __half2float(ini[idx]); - - int chunnel = (int)(threadIdx.x % 16); - int channel = (int)(blockIdx.x/64); - int tt = (int)(blockIdx.x % 64); - int pol = (int)(tt/32); - int ant = ((int)(tt % 32))*((int)(threadIdx.x / 16)); - - if (ir!=0. || ii!=0.) { - printf("%d %d %d %d %f %f\n",channel,pol,ant,chunnel,ir,ii); - } - -} - - -// kernel for beamforming -/* - -Assumes that up to NANT antennas (nominally 63) are populated. - -Input is [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] (promoted) - -Arithmetic... for rotation, d2r = wr*dr-wi*di; d2i = wi*dr+wr*di - -Conventions for beamforming. beam 0 is furthest East, beam 127 is at meridian. antpos (D) is easting. -for bf weight calculation, where theta = s(127-n), ang = 2*pi*nu*theta*D/c; wr = cos(ang), wi = sin(ang) -use __float2int_rn, cosf, sinf intrinsics. - -Each warp (==block) has to deal with 256 beams for 64 ants, summing over 16 chunnels and pols. 
-Do it in tiles of 16 beams and 16 ants for - -Output array has order [beam, 48 frequency, 2 pol, 16 time] - -inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag -wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant] - -launch with 16time * 48freq * 2pol * 16beam_tile blocks of 32 threads for massive utilization - = 24576 blocks - -*/ -__global__ void beamformer(half *inr, half *ini, half *wr, half *wi, float *output, int stuffants) { - - // get block and thread ids - int bidx = blockIdx.x; // assume 24576 - int tidx = threadIdx.x; // assume 32 - int orig_bidx = (int)(bidx / 16); - int beam_tile = (int)(bidx % 16); - int stuff_tile = (int)(beam_tile % 4); - int data_offset = orig_bidx*1024; // offset for first part of data - int weight_offset = (int)(orig_bidx % 96); // offset for first part of weight - weight_offset *= 16384; - int idx1, idx2; - int f_idx = (int)(orig_bidx % 96); - int tim_idx = (int)(orig_bidx / 96); - int oidx = f_idx*16 + tim_idx; - - // shared memory for convenience - __shared__ float summr[16][16]; // beam, chunnel - __shared__ float summi[16][16]; // beam, chunnel - - // accumulate real and imag parts into [16 beam x 16 f] fragments - // Declare the fragments. 
- wmma::fragment a_frag; - wmma::fragment b_frag; - wmma::fragment wr_inr_frag; - wmma::fragment wr_ini_frag; - wmma::fragment wi_inr_frag; - wmma::fragment wi_ini_frag; - wmma::fragment ib_frag; - - // zero out accumulators - wmma::fill_fragment(wr_inr_frag, 0.0f); - wmma::fill_fragment(wr_ini_frag, 0.0f); - wmma::fill_fragment(wi_inr_frag, 0.0f); - wmma::fill_fragment(wi_ini_frag, 0.0f); - wmma::fill_fragment(ib_frag, 0.0f); - - // IB - if (stuffants==2) { - - wmma::fragment c_frag; - wmma::fragment d_frag; - - for (int ant_tile=0; ant_tile<4; ant_tile++) { - - wmma::load_matrix_sync(c_frag, inr + data_offset + ant_tile*256, 16); - wmma::load_matrix_sync(d_frag, inr + data_offset + ant_tile*256, 16); - wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag); - wmma::load_matrix_sync(c_frag, ini + data_offset + ant_tile*256, 16); - wmma::load_matrix_sync(d_frag, ini + data_offset + ant_tile*256, 16); - wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag); - - } - - } - - // one ant per beam - if (stuffants==1) { - - wmma::fragment c_frag; - wmma::fragment d_frag; - wmma::load_matrix_sync(c_frag, inr + data_offset + stuff_tile*256, 16); - wmma::load_matrix_sync(d_frag, inr + data_offset + stuff_tile*256, 16); - wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag); - wmma::load_matrix_sync(c_frag, ini + data_offset + stuff_tile*256, 16); - wmma::load_matrix_sync(d_frag, ini + data_offset + stuff_tile*256, 16); - wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag); - - } - if (stuffants!=1) { - - // loop over ant tiles - for (int ant_tile=0; ant_tile<4; ant_tile++) { - - // copy weight and data to fragments, and multiply to accumulators - - wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16); - wmma::load_matrix_sync(b_frag, inr + data_offset + ant_tile*256, 16); - wmma::mma_sync(wr_inr_frag, a_frag, b_frag, wr_inr_frag); - - wmma::load_matrix_sync(a_frag, wi + weight_offset + beam_tile*1024 + ant_tile*256, 16); - 
wmma::mma_sync(wi_inr_frag, a_frag, b_frag, wi_inr_frag); - - wmma::load_matrix_sync(b_frag, ini + data_offset + ant_tile*256, 16); - wmma::mma_sync(wi_ini_frag, a_frag, b_frag, wi_ini_frag); - - wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16); - wmma::mma_sync(wr_ini_frag, a_frag, b_frag, wr_ini_frag); - - } - - // form real and imaginary matrices - for(int i=0; i < wr_inr_frag.num_elements; i++) { - wr_inr_frag.x[i] = wr_inr_frag.x[i] - wi_ini_frag.x[i]; // output real - wi_inr_frag.x[i] = wi_inr_frag.x[i] + wr_ini_frag.x[i]; // output imag - wr_inr_frag.x[i] = wr_inr_frag.x[i]*wr_inr_frag.x[i] + wi_inr_frag.x[i]*wi_inr_frag.x[i]; // squared - } - } - - // at this stage the matrices are [beam, chunnel], and need to be summed over columns - - // copy back to shared mem - float *p1, *p2, tmp; - p1 = &summr[0][0]; - wmma::store_matrix_sync(p1, wr_inr_frag, 16, wmma::mem_row_major); - - if (stuffants!=1) { - - // do thread reduction for each beam - if (tidx<8) { - for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+8]; - for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+4]; - for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+2]; - for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+1]; - } - if (tidx>=8 && tidx<16) { - for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+8-8]; - for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+4-8]; - for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+2-8]; - for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+1-8]; - } - if (tidx>=16 && tidx<24) { - for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+8-16]; - for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+4-16]; - for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+2-16]; - for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+1-16]; - } - if (tidx>=24) { - for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+8-24]; - for (int i=12;i<16;i++) summr[i][tidx-24] += 
summr[i][tidx+4-24]; - for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+2-24]; - for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+1-24]; - } - - __syncthreads(); - - // now summr[beam][0] can go into output - if (tidx<16) { - output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][0]; - } - - } - - if (stuffants==1) { - if (tidx<16) { - output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][tidx]; - } - } - if (stuffants==2) { - - p2 = &summi[0][0]; - wmma::store_matrix_sync(p2, ib_frag, 16, wmma::mem_row_major); - tmp = 0.; - for (int i=0;i<16;i++) tmp += summi[i][i]; - if (tidx==0 && beam_tile==0) - output[(beam_tile*16+tidx)*1536 + oidx] = tmp; - - } - -} - -// kernel to calculate weights - needed because weights are halfs -// launch with 256 threads in 6144 blocks -__global__ -void calc_weights(float *antpos, float *weights, float *freqs, half *wr, half *wi) { - - // assume 256 threads in 6144 blocks - int bidx = blockIdx.x; // over 48f, 2pol, 16 beam_tile, 4 ant_tile - int tidx = threadIdx.x; - int f = (int)(bidx / 128); - int cc = (int)(bidx % 128); - int pol = (int)(cc / 64); - cc = (int)(cc % 64); - int beam_tile = (int)(cc / 4); - int ant_tile = (int)(cc % 4); - int beam_i = (int)(tidx / 16); - int ant_i = (int)(tidx % 16); - - int beam = beam_tile*16+beam_i; - int ant = ant_tile*16+ant_i; - int i = bidx*256+tidx; - int widx = ant*NW*2*2 + f*2*2 + pol*2; - - float theta = sep*(127.-beam*1.)*PI/10800.; // radians - float afac = -2.*PI*freqs[f*8+4]*theta/CVAC; // factor for rotate - float twr = cos(afac*antpos[ant]); - float twi = sin(afac*antpos[ant]); - - wr[i] = __float2half((twr*weights[widx] - twi*weights[widx+1])); - wi[i] = __float2half((twi*weights[widx] + twr*weights[widx+1])); - - -} - - -// function prototypes -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out, dada_hdu_t * out2); -int dada_bind_thread_to_core (int core); -int init_weights(char *fnam, float *antpos, float *weights, char *flagants); -void 
reorder_block(char *block); -void calc_bp(float *data, float *bp, int pr); - - -// performs massive summation to calculate bp -// input array has order [beam, 96 frequency, 16 time] -// bp has size 48 - no way to avoid strided memory access -// returns factor to correct data -void calc_bp(float *data, float *bp, int pr) { - - int i=0; - - for (int b=0;b<256;b++) { - for (int f=0;f<48;f++) { - for (int a=0;a<32;a++) { - bp[b] += data[i]; - if (pr && data[i]!=0.) printf("%d %d %d %f\n",b,f,a,data[i]); - i++; - } - } - } - -} - -// performs cpu reorder of block to be loaded to GPU -void reorder_block(char * block) { - - // from [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i] - // to [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i] - // 24576*NANT in total. 1536*NANT per time - - char * output = (char *)malloc(sizeof(char)*24576*NANT); - - for (int i=0;i<16;i++) { // over time - for (int j=0;j= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (0); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - hdu_out2 = dada_hdu_create (0); - dada_hdu_set_key (hdu_out2, out_key2); - if (dada_hdu_connect (hdu_out2) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - 
if (dada_hdu_lock_write(hdu_out2) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - - header_out = ipcbuf_get_next_write (hdu_out2->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - uint64_t block_out2 = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out2->data_block); - syslog(LOG_INFO, "main: have input and 
output block sizes %lu %lu\n",block_size,block_out); - uint64_t bytes_read = 0; - int nints = NPACKETS / 16; - uint64_t nbytes_per_int = block_size / nints; - uint64_t nbytes_per_out = block_out / nints; - char * block; - unsigned char * output_buffer; - output_buffer = (unsigned char *)malloc(sizeof(unsigned char)*block_out); - memset(output_buffer,0,block_out); - uint64_t written, block_id; - - // allocate host and device memory for calculations - //inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag - //wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant] - char *d_indata[NSTREAMS]; - unsigned char *d_outdata[NSTREAMS]; - float *d_transfer[NSTREAMS], *d_bp, *d_antpos, *d_weights, *d_freqs; - half *d_wr, *d_wi, *d_inr[NSTREAMS], *d_ini[NSTREAMS]; - cudaMalloc((void **)&d_antpos, 64*sizeof(float)); // ant positions - cudaMalloc((void **)&d_weights, 64*NW*2*2*sizeof(float)); // weights - cudaMalloc((void **)&d_freqs, 384*sizeof(float)); // freqs - cudaMalloc((void **)&d_bp, 256*sizeof(float)); // bandpass - cudaMalloc((void **)&d_wr, 48*2*16*4*16*16*sizeof(half)); // real weight - cudaMalloc((void **)&d_wi, 48*2*16*4*16*16*sizeof(half)); // imag weight - cudaMemcpy(d_freqs, freqs, 384*sizeof(float), cudaMemcpyHostToDevice); - - float *h_transfer = (float *)malloc(sizeof(float)*256*96*16*NSTREAMS); - char *h_indata = (char *)malloc(sizeof(char)*16*NANT*96*8*2); - float *bp = (float *)malloc(sizeof(float)*256); - unsigned char *tmp_buf = (unsigned char *)malloc(sizeof(unsigned char)*256*48*4*NSTREAMS); - - // streams and device - cudaStream_t stream[NSTREAMS]; - for (int st=0;st d1(d_inr[st]); - thrust::fill(d1, d1+16*48*2*64*16, 0.0); - thrust::device_ptr d2(d_ini[st]); - thrust::fill(d2, d2+16*48*2*64*16, 0.0); - } - - - - // set up - - int observation_complete=0; - int blocks = 0, started = 0; - int blockct = 0; - - syslog(LOG_INFO, "starting observation"); - - while (!observation_complete) { - 
- // open block - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - blockct ++; - - // write to output - /* written = ipcio_write (hdu_out2->data_block, block, block_out2); - if (written < block_out2) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - }*/ - - // DO STUFF - - // calc weights - init_weights(fnam,antpos,weights,flagants); - cudaMemcpy(d_antpos, antpos, 64*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(d_weights, weights, 64*NW*2*2*sizeof(float), cudaMemcpyHostToDevice); - calc_weights<<<6144, 256>>>(d_antpos, d_weights, d_freqs, d_wr, d_wi); - if (DEBUG) syslog(LOG_INFO,"Finished with weights"); - - if (started==1) { - - // loop over ints - for (int bst=0;bst>>(d_indata[st], d_inr[st], d_ini[st]); - - // run beamformer kernel - beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants); - - // run adder kernel - adder<<<12288, 32, 0, stream[st]>>>(d_transfer[st], d_outdata[st], d_bp); - - // copy to host - cudaMemcpyAsync(tmp_buf + 256*48*4*st, d_outdata[st], 256*48*4*sizeof(unsigned char), cudaMemcpyDeviceToHost, stream[st]); - - // copy to output - for (int j=0;j<12288*4;j++) { - if (test_pattern) - output_buffer[(bst*NSTREAMS+st)*12288*4+j] = (unsigned char)((blockct % 128) + 32); - else - output_buffer[(bst*NSTREAMS+st)*12288*4+j] = tmp_buf[j+256*48*4*st]; - } - if (DEBUG && bst*NSTREAMS+st==10) { - for (int j=0;j<48;j++) syslog(LOG_DEBUG,"%hu",output_buffer[(bst*NSTREAMS+st)*12288+BEAM_OUT*48+j]); - } - - } - } - - - } - - if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - - // calculate bandpass - - for (int i=0;i<256;i++) bp[i] = 0.; - - // do standard bf but calculate bandpass - - // loop over ints - for (int bst=0;bst>>(d_indata[st], d_inr[st], d_ini[st]); - - //if (bst==0 && st==0) - // printer<<<3072, 32>>>(d_inr,d_ini); - - // run beamformer 
kernel - beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants); - - // copy back to host - cudaMemcpyAsync(h_transfer + st*256*96*16, d_transfer[st], sizeof(float)*393216, cudaMemcpyDeviceToHost, stream[st]); - - // calculate bandpass - //if (st==0 && bst==0) - //calc_bp(h_transfer,bp,1); - calc_bp(h_transfer + st*256*96*16,bp,0); - - } - } - - // adjust bandpass - syslog(LOG_INFO,"Final BP..."); - for (int i=0;i<256;i++) { - syslog(LOG_INFO,"coeff %d %g",i,bp[i]); - if (bp[i]!=0.) { - bp[i] /= 48.*nints; - bp[i] = 128./bp[i]/4.; - } - } - cudaMemcpy(d_bp, bp, sizeof(float)*256, cudaMemcpyHostToDevice); - - // junk into output - memset(output_buffer,0,block_out); - - } - - // write output for debug - - // write to output - written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out); - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - - if (DEBUG) { - syslog(LOG_DEBUG, "written block %d",blocks); - } - blocks++; - - - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - for (int st=0;st -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - -// global variables -int DEBUG = 0; - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) -{ - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could 
not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - - if (dada_hdu_unlock_write (out) < 0) - { - syslog(LOG_ERR, "could not unlock write on hdu_out"); - } - dada_hdu_destroy (out); - -} - -void usage() -{ - fprintf (stdout, - "dsaX_fake [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -f file to read packet from [default none]\n" - " -i in_key [default TEST_BLOCK_KEY]\n" - " -o out_key [default REORDER_BLOCK_KEY2]\n" - " -h print usage\n"); -} - -// MAIN - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_fake", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - dada_hdu_t* hdu_out = 0; - - // data block HDU keys - key_t in_key = TEST_BLOCK_KEY; - key_t out_key = REORDER_BLOCK_KEY2; - - // command line arguments - int core = -1; - int useZ = 1; - char fnam[100]; - int arg = 0; - - while ((arg=getopt(argc,argv,"c:f:i:o:dh")) != -1) - { - switch (arg) - { - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'i': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'f': - if (optarg) - { - useZ = 0; - strcpy(fnam,optarg); - break; - } - else - { - syslog(LOG_ERR,"-f flag requires argument"); - usage(); - return EXIT_FAILURE; - } - - 
case 'd': - DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - // Bind to cpu core - if (core >= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - // record STATE info 
- syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); - uint64_t bytes_read = 0; - uint64_t npackets = block_out / 4194304; - char * block, * output_buffer; - char * packet; - packet = (char *)malloc(sizeof(char)*4194304); - output_buffer = (char *)malloc(sizeof(char)*block_out); - memset(output_buffer,0,block_out); - uint64_t written, block_id; - - // fill output buffer if file exists - FILE *fin; - if (!useZ) { - - if (!(fin=fopen(fnam,"rb"))) { - syslog(LOG_ERR, "cannot open file - will write zeros"); - } - else { - - fread(packet,4194304,1,fin); - fclose(fin); - - syslog(LOG_INFO,"Read packet, npackets %llu",npackets); - - for (int i=0;idata_block, &bytes_read, &block_id); - - if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - } - - // DO STUFF - // no need to do anything here - output_buffer is ready to go - - // write to output - written = ipcio_write (hdu_out->data_block, output_buffer, block_out); - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - if (DEBUG) { - syslog(LOG_DEBUG, "written block %d",blocks); - } - blocks++; - - - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - free(packet); - free(output_buffer); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - -} - - diff --git a/src/dsaX_capture.c b/src/dsaX_capture.c deleted file mode 100644 index 054e45d..0000000 --- a/src/dsaX_capture.c +++ /dev/null @@ -1,1080 +0,0 @@ -/* dsaX_capture.c: Code to capture packets over a socket and write to a dada buffer. 
- -1;95;0cmain: runs capture loop, and interfaces dada buffer -control_thread: deals with control commands - -*/ - -#define __USE_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" -//#include "multilog.h" - -#define unhappies 3000 -#define skips 6 -#define sleeps 1.5 - -/* global variables */ -int quit_threads = 0; -char STATE[20]; -uint64_t UTC_START = 10000; -uint64_t UTC_STOP = 40000000000; -int MONITOR = 0; -char iP[100]; -int DEBUG = 0; -int HISTOGRAM[16]; -int cPort = CAPTURE_CONTROL_PORT; -int dPort = CAPTURE_PORT; - -void dsaX_dbgpu_cleanup (dada_hdu_t * out); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * out) -{ - - if (dada_hdu_unlock_write (out) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_out"); - } - dada_hdu_destroy (out); - - - -} - -void usage() -{ - fprintf (stdout, - "dsaX_capture [options]\n" - " -c core bind process to CPU core [no default]\n" - " -j IP to listen on for data packets [no default]\n" - " -p PORT to listen to for data packets [default 4011]\n" - " -q PORT to listen to for control commands [default CAPTURE_CONTROL_PORT]\n" - " -i IP to listen on for control commands [no default]\n" - " -f filename of template dada header [no default]\n" - " -o out_key [default CAPTURE_BLOCK_KEY]\n" - " -d send debug messages to syslog\n" - " -h print usage\n"); -} - -/* - * create a socket with the specified number of buffers - */ -dsaX_sock_t * dsaX_init_sock () -{ - dsaX_sock_t * b = (dsaX_sock_t *) malloc(sizeof(dsaX_sock_t)); - assert(b != NULL); - - b->bufsz = sizeof(char) * 
UDP_PAYLOAD; - - b->buf = (char *) malloc (b->bufsz); - assert(b->buf != NULL); - - b->have_packet = 0; - b->fd = 0; - - return b; -} - -void dsaX_free_sock(dsaX_sock_t* b) -{ - b->fd = 0; - b->bufsz = 0; - b->have_packet =0; - if (b->buf) - free (b->buf); - b->buf = 0; -} - -/* - * intialize UDP receiver resources - */ -int dsaX_udpdb_init_receiver (udpdb_t * ctx) -{ - syslog(LOG_INFO,"dsax_udpdb_init_receiver()"); - - // create a dsaX socket which can hold variable num of UDP packet - ctx->sock = dsaX_init_sock(); - - ctx->ooo_packets = 0; - ctx->recv_core = -1; - ctx->n_sleeps = 0; - ctx->mb_rcv_ps = 0; - ctx->mb_drp_ps = 0; - ctx->block_open = 0; - ctx->block_count = 0; - ctx->capture_started = 0; - ctx->last_seq = 0; - ctx->last_byte = 0; - ctx->block_start_byte = 0; - - // allocate required memory strucutres - ctx->packets = init_stats_t(); - ctx->bytes = init_stats_t(); - - syslog(LOG_INFO,"receiver inited"); - - return 0; -} - -/* -prepare socket and writer -*/ - -int dsaX_udpdb_prepare (udpdb_t * ctx) -{ - syslog(LOG_INFO, "dsaX_udpdb_prepare()"); - - // open socket - syslog(LOG_INFO, "prepare: creating udp socket on %s:%d", ctx->interface, ctx->port); - ctx->sock->fd = dada_udp_sock_in(ctx->log, ctx->interface, ctx->port, ctx->verbose); - if (ctx->sock->fd < 0) { - syslog (LOG_ERR, "Error, Failed to create udp socket"); - return -1; - } - - - // set the socket size to 256 MB - int sock_buf_size = 4*1024*1024; - syslog(LOG_INFO, "prepare: setting buffer size to %d", sock_buf_size); - dada_udp_sock_set_buffer_size (ctx->log, ctx->sock->fd, ctx->verbose, sock_buf_size); - - // set the socket to non-blocking - syslog(LOG_INFO, "prepare: setting non_block"); - sock_nonblock(ctx->sock->fd); - - // clear any packets buffered by the kernel - syslog(LOG_INFO, "prepare: clearing packets at socket"); - size_t cleared = dada_sock_clear_buffered_packets(ctx->sock->fd, UDP_PAYLOAD); - - // setup the next_seq to the initial value - //ctx->last_seq = 0; - 
//ctx->last_byte = 0; - //ctx->n_sleeps = 0; - - return 0; -} - -/* - * reset receiver before an observation commences - */ -void dsaX_udpdb_reset_receiver (udpdb_t * ctx) -{ - syslog (LOG_INFO, "dsaX_udpdb_reset_receiver()"); - - ctx->capture_started = 0; - ctx->last_seq = 0; - ctx->last_byte = 0; - ctx->n_sleeps = 0; - - reset_stats_t(ctx->packets); - reset_stats_t(ctx->bytes); -} - -/* - * open a data block buffer ready for direct access - */ -int dsaX_udpdb_open_buffer (udpdb_t * ctx) -{ - - if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_open_buffer()"); - - if (ctx->block_open) - { - syslog (LOG_ERR, "open_buffer: buffer already opened"); - return -1; - } - - if (DEBUG) syslog (LOG_DEBUG, "open_buffer: ipcio_open_block_write"); - - uint64_t block_id = 0; - - ctx->block = ipcio_open_block_write (ctx->hdu->data_block, &block_id); - if (!ctx->block) - { - syslog (LOG_ERR, "open_buffer: ipcio_open_block_write failed"); - return -1; - } - - ctx->block_open = 1; - ctx->block_count = 0; - - return 0; -} - -/* - * close a data buffer, assuming a full block has been written - */ -int dsaX_udpdb_close_buffer (udpdb_t * ctx, uint64_t bytes_written, unsigned eod) -{ - - if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_close_buffer(%"PRIu64", %d)", bytes_written, eod); - - if (!ctx->block_open) - { - syslog (LOG_ERR, "close_buffer: buffer already closed"); - return -1; - } - - // log any buffers that are not full, except for the 1 byte "EOD" buffer - if ((bytes_written != 1) && (bytes_written != ctx->hdu_bufsz)) - syslog ((eod ? 
LOG_INFO : LOG_WARNING), "close_buffer: " - "bytes_written[%"PRIu64"] != hdu_bufsz[%"PRIu64"]", - bytes_written, ctx->hdu_bufsz); - - if (eod) - { - if (ipcio_update_block_write (ctx->hdu->data_block, bytes_written) < 0) - { - syslog (LOG_ERR, "close_buffer: ipcio_update_block_write failed"); - return -1; - } - } - else - { - if (ipcio_close_block_write (ctx->hdu->data_block, bytes_written) < 0) - { - syslog (LOG_ERR, "close_buffer: ipcio_close_block_write failed"); - return -1; - } - } - - ctx->block = 0; - ctx->block_open = 0; - - return 0; -} - -/* - * move to the next ring buffer element. return pointer to base address of new buffer - */ -int dsaX_udpdb_new_buffer (udpdb_t * ctx) -{ - - if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_new_buffer()"); - - if (dsaX_udpdb_close_buffer (ctx, ctx->hdu_bufsz, 0) < 0) - { - syslog (LOG_ERR, "new_buffer: dsaX_udpdb_close_buffer failed"); - return -1; - } - - if (dsaX_udpdb_open_buffer (ctx) < 0) - { - syslog (LOG_ERR, "new_buffer: dsaX_udpdb_open_buffer failed"); - return -1; - } - - // increment buffer byte markers - ctx->block_start_byte = ctx->block_end_byte + UDP_DATA; - ctx->block_end_byte = ctx->block_start_byte + ( ctx->packets_per_buffer - 1) * UDP_DATA; - - // set block to 0 - //memset(ctx->block,0,ctx->block_end_byte-ctx->block_start_byte); - - if (DEBUG) syslog(LOG_DEBUG, "new_buffer: buffer_bytes [%"PRIu64" - %"PRIu64"]", - ctx->block_start_byte, ctx->block_end_byte); - - return 0; - -} - -/* - * destroy UDP receiver resources - */ -int dsaX_udpdb_destroy_receiver (udpdb_t * ctx) -{ - if (ctx->sock) - dsaX_free_sock(ctx->sock); - ctx->sock = 0; -} - -/* - * Close the udp socket and file - */ - -int udpdb_stop_function (udpdb_t* ctx) -{ - - syslog(LOG_INFO, "stop: dada_hdu_unlock_write()"); - if (dada_hdu_unlock_write (ctx->hdu) < 0) - { - syslog (LOG_ERR, "stop: could not unlock write on"); - return -1; - } - - // close the UDP socket - close(ctx->sock->fd); - - if (ctx->packets->dropped) - { - double percent = 
(double) ctx->bytes->dropped / (double) ctx->last_byte; - percent *= 100; - - syslog(LOG_INFO, "bytes dropped %"PRIu64" / %"PRIu64 " = %8.6f %", - ctx->bytes->dropped, ctx->last_byte, percent); - } - - return 0; -} - - - - -/* --------- THREADS -------- */ - -// STATS THREAD - -/* - * Thread to print simple capture statistics - */ -void stats_thread(void * arg) { - - /* // set affinity - const pthread_t pid = pthread_self(); - const int core_id = 4; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id); - const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id); - if (CPU_ISSET(core_id, &cpuset)) - syslog(LOG_INFO,"thread %d: successfully set thread",core_id); - */ - - udpdb_t * ctx = (udpdb_t *) arg; - uint64_t b_rcv_total = 0; - uint64_t b_rcv_1sec = 0; - uint64_t b_rcv_curr = 0; - - uint64_t b_drp_total = 0; - uint64_t b_drp_1sec = 0; - uint64_t b_drp_curr = 0; - - uint64_t s_rcv_total = 0; - uint64_t s_rcv_1sec = 0; - uint64_t s_rcv_curr = 0; - - uint64_t ooo_pkts = 0; - float gb_rcv_ps = 0; - float mb_rcv_ps = 0; - float mb_drp_ps = 0; - - syslog(LOG_INFO,"stats_thread: starting loop"); - - while (!quit_threads) - { - - /* get a snapshot of the data as quickly as possible */ - b_rcv_curr = ctx->bytes->received; - b_drp_curr = ctx->bytes->dropped; - s_rcv_curr = ctx->n_sleeps; - - /* calc the values for the last second */ - b_rcv_1sec = b_rcv_curr - b_rcv_total; - b_drp_1sec = b_drp_curr - b_drp_total; - s_rcv_1sec = s_rcv_curr - s_rcv_total; - - /* update the totals */ - b_rcv_total = b_rcv_curr; - b_drp_total = b_drp_curr; - s_rcv_total = s_rcv_curr; - - mb_rcv_ps = (double) b_rcv_1sec / 1000000; - mb_drp_ps = (double) b_drp_1sec / 1000000; - gb_rcv_ps = b_rcv_1sec * 8; - 
gb_rcv_ps /= 1000000000; - - /* determine how much memory is free in the receivers */ - syslog (LOG_NOTICE,"CAPSTATS %6.3f [Gb/s], D %4.1f [MB/s], D %"PRIu64" pkts, %"PRIu64" skipped 0", gb_rcv_ps, mb_drp_ps, ctx->packets->dropped, ctx->last_seq); - - sleep(1); - } - -} - - - - - - - -// CONTROL THREAD - -void control_thread (void * arg) { - - udpdb_t * ctx = (udpdb_t *) arg; - syslog(LOG_INFO, "control_thread: starting"); - - // port on which to listen for control commands - int port = cPort; - char sport[10]; - sprintf(sport,"%d",port); - - // buffer for incoming command strings, and setup of socket - int bufsize = 1024; - char* buffer = (char *) malloc (sizeof(char) * bufsize); - memset(buffer, '\0', bufsize); - const char* whitespace = " "; - char * command = 0; - char * args = 0; - - struct addrinfo hints; - struct addrinfo* res=0; - memset(&hints,0,sizeof(hints)); - struct sockaddr_storage src_addr; - socklen_t src_addr_len=sizeof(src_addr); - hints.ai_family=AF_INET; - hints.ai_socktype=SOCK_DGRAM; - getaddrinfo(iP,sport,&hints,&res); - int fd; - ssize_t ct; - char tmpstr; - char cmpstr = 'p'; - char *endptr; - uint64_t tmps; - char * token; - - syslog(LOG_INFO, "control_thread: created socket on port %d", port); - - while (!quit_threads) { - - fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol); - bind(fd,res->ai_addr,res->ai_addrlen); - memset(buffer,'\0',sizeof(buffer)); - syslog(LOG_INFO, "control_thread: waiting for packet"); - ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len); - - syslog(LOG_INFO, "control_thread: received buffer string %s",buffer); - - // INTERPRET BUFFER STRING - // receive either UTC_START, UTC_STOP, MONITOR - - // interpret buffer string - char * rest = buffer; - char *cmd, *val; - cmd = strtok_r(rest, "-", &rest); - val = strtok_r(rest, "-", &rest); - syslog(LOG_INFO, "control_thread: split into COMMAND %s, VALUE %s",cmd,val); - - if (strcmp(cmd,"UTC_START")==0) - UTC_START = 
strtoull(val,&endptr,0); - - if (strcmp(cmd,"UTC_STOP")==0) - UTC_STOP = strtoull(val,&endptr,0); - - close(fd); - - } - - free (buffer); - - syslog(LOG_INFO, "control_thread: exiting"); - - /* return 0 */ - int thread_result = 0; - pthread_exit((void *) &thread_result); - -} - -// MAIN of program - -int main (int argc, char *argv[]) { - - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_capture", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - /* DADA Header plus Data Unit for writing */ - dada_hdu_t* hdu_out = 0; - - /* actual struct with info */ - udpdb_t udpdb; - - // input data block HDU key - key_t out_key = CAPTURE_BLOCK_KEY; - - // command line arguments - int core = -1; - int arg=0; - char dada_fnam[200]; // filename for dada header - char iface[100]; // IP for data packets - - while ((arg=getopt(argc,argv,"c:j:i:f:o:g:p:q:dh")) != -1) - { - switch (arg) - { - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'i': - if (optarg) - { - strcpy(iP,optarg); - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'p': - if (optarg) - { - dPort = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-p flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'q': - if (optarg) - { - cPort = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-q flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'j': - if (optarg) - { - strcpy(iface,optarg); - break; - } - else - { - syslog(LOG_ERR,"-j flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires 
argument"); - usage(); - return EXIT_FAILURE; - } - case 'f': - if (optarg) - { - strcpy(dada_fnam,optarg); - break; - } - else - { - syslog(LOG_ERR,"-f flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'd': - DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - // record STATE info - sprintf(STATE,"NOBUFFER"); - - // START THREADS - - // start control thread - int rval = 0; - pthread_t control_thread_id, stats_thread_id; - if (DEBUG) - syslog (LOG_DEBUG, "Creating threads"); - rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb); - if (rval != 0) { - syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval)); - return -1; - } - syslog(LOG_NOTICE, "Created control thread, listening on %s:%d",iP,cPort); - - // start the stats thread - rval = pthread_create (&stats_thread_id, 0, (void *) stats_thread, (void *) &udpdb); - if (rval != 0) { - syslog(LOG_INFO, "Error creating stats_thread: %s", strerror(rval)); - return -1; - } - syslog(LOG_NOTICE, "started stats_thread()"); - - - // Bind to cpu core - if (core >= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - // initialize the data structure - syslog (LOG_INFO, "main: dsaX_udpdb_init_receiver()"); - if (dsaX_udpdb_init_receiver (&udpdb) < 0) - { - syslog (LOG_ERR, "could not initialize receiver"); - return EXIT_FAILURE; - } - - - // OPEN CONNECTION TO DADA DB FOR WRITING - - if (DEBUG) syslog(LOG_INFO,"Creating HDU"); - - hdu_out = dada_hdu_create (0); - if (DEBUG) syslog(LOG_INFO,"Created hdu"); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog(LOG_ERR,"could not connect to output dada buffer"); - return EXIT_FAILURE; - } - if (DEBUG) syslog(LOG_INFO,"Connected HDU"); - if (dada_hdu_lock_write(hdu_out) < 0) { - dsaX_dbgpu_cleanup 
(hdu_out); - syslog(LOG_ERR,"could not lock to output dada buffer"); - return EXIT_FAILURE; - } - - syslog(LOG_INFO,"opened connection to output DB"); - - // DEAL WITH DADA HEADER - char *hout; - hout = (char *)malloc(sizeof(char)*4096); - if (DEBUG) syslog(LOG_INFO,"read header2"); - - if (fileread (dada_fnam, hout, 4096) < 0) - { - free (hout); - syslog (LOG_ERR, "could not read ASCII header from %s", dada_fnam); - return (EXIT_FAILURE); - } - - - if (DEBUG) syslog(LOG_INFO,"read header3"); - - - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_out); - return EXIT_FAILURE; - } - - - - // copy the in header to the out header - memcpy (header_out, hout, 4096); - - // mark the output header buffer as filled - if (ipcbuf_mark_filled (hdu_out->header_block, 4096) < 0) - { - syslog(LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_out); - return EXIT_FAILURE; - } - - // record STATE info - sprintf(STATE,"LISTEN"); - syslog(LOG_INFO,"marked output header block as filled - now in LISTEN state"); - - - /* time to start up receiver. 
- data are captured on iface:CAPTURE_PORT - */ - - - // put information in udpdb struct - udpdb.hdu = hdu_out; - udpdb.port = dPort; - udpdb.interface = strdup(iface); - udpdb.hdu_bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - // determine number of packets per block, must - if (udpdb.hdu_bufsz % UDP_DATA != 0) - { - syslog(LOG_ERR, "data block size for [%"PRIu64"] was not a multiple of the UDP_DATA size [%d]\n", udpdb.hdu_bufsz, UDP_DATA); - return EXIT_FAILURE; - } - udpdb.packets_per_buffer = udpdb.hdu_bufsz / UDP_DATA; - udpdb.bytes_to_acquire = 0; - udpdb.num_inputs = NSNAPS; - - // prepare the socket - syslog(LOG_INFO, "main: dsaX_udpdb_prepare()"); - if (dsaX_udpdb_prepare (&udpdb) < 0) - { - syslog(LOG_ERR, "could allocate required resources (prepare)"); - return EXIT_FAILURE; - } - - // reset the receiver - syslog(LOG_INFO, "main: dsaX_udpdb_reset_receiver()"); - dsaX_udpdb_reset_receiver (&udpdb); - - // open a block of the data block, ready for writing - if (dsaX_udpdb_open_buffer (&udpdb) < 0) - { - syslog (LOG_ERR, "start: dsaX_udpdb_open_buffer failed"); - return -1; - } - - /* START WHAT WAS in RECV THREAD */ - - // DEFINITIONS - - // lookup table for ant order - uint64_t ant_lookup[100], vv; - for (int i=0;i<100;i++) ant_lookup[i] = 0; - for (int i=0;ibuf; - size_t got = 0; // data received from a recv_from call - int errsv; // determine the sequence number boundaries for curr and next buffers - int64_t byte_offset = 0; // offset of current packet in bytes from start of block - uint64_t seq_byte = 0; // offset of current packet in bytes from start of obs - // for "saving" out of order packets near edges of blocks - unsigned int temp_idx = 0; - unsigned int temp_max = 1000; - char ** temp_buffers; //[temp_max][UDP_DATA]; - uint64_t * temp_seq_byte; - temp_buffers = (char **)malloc(sizeof(char *)*temp_max); - for (int i=0;ihave_packet = 0; - - // incredibly tight loop to try and get a packet - while (!udpdb.sock->have_packet) - { - - 
// receive 1 packet into the socket buffer - got = recvfrom ( udpdb.sock->fd, udpdb.sock->buf, UDP_PAYLOAD, 0, NULL, NULL ); - - if (got == UDP_PAYLOAD) - { - udpdb.sock->have_packet = 1; - } - else if (got == -1) - { - errsv = errno; - if (errsv == EAGAIN) - { - udpdb.n_sleeps++; - if (udpdb.capture_started) - timeouts++; - if (timeouts > timeout_max) - syslog(LOG_INFO, "timeouts[%"PRIu64"] > timeout_max[%"PRIu64"]\n",timeouts, timeout_max); - } - else - { - syslog (LOG_ERR, "receive_obs: recvfrom failed %s", strerror(errsv)); - return EXIT_FAILURE; - } - } - else // we received a packet of the WRONG size, ignore it - { - syslog (LOG_NOTICE, "receive_obs: received %lu bytes, expected %d", got, UDP_PAYLOAD); - } - } - timeouts = 0; - - // we have a valid packet within the timeout - if (udpdb.sock->have_packet) - { - - // decode packet header (64 bits) - // 35 bits seq_no (for first spectrum in packet); 13 bits ch_id (for first channel in packet); 16 bits ant ID (for first antenna in packet) - seq_no = 0; - seq_no |= (((uint64_t)(udpdb.sock->buf[4]) & 224) >> 5) & 7; - //seq_no &= 7; - seq_no |= (((uint64_t)(udpdb.sock->buf[3])) << 3) & 2040; - //seq_no &= 2047; - seq_no |= (((uint64_t)(udpdb.sock->buf[2])) << 11) & 522240; - //seq_no &= 524287; - seq_no |= (((uint64_t)(udpdb.sock->buf[1])) << 19) & 133693440; - //seq_no &= 134217727; - seq_no |= (((uint64_t)(udpdb.sock->buf[0])) << 27) & 34225520640; - //seq_no &= 34359738367; - /*seq_no = 0; - seq_no |= 224 >> 5; - seq_no |= 255 << 3; - seq_no |= 255 << 11; - seq_no |= 255 << 19;*/ - - /*ch_id = 0; - ch_id |= ((unsigned char) (udpdb.sock->buf[4]) & 31) << 8; - ch_id |= (unsigned char) (udpdb.sock->buf[5]);*/ - - ant_id = 0; - ant_id |= (unsigned char) (udpdb.sock->buf[6]) << 8; - ant_id |= (unsigned char) (udpdb.sock->buf[7]); - aid = ant_lookup[(int)(ant_id)]; - - if (UTC_START==0) UTC_START = seq_no + 10000; - - //act_seq_no = seq_no*NCHANG*NSNAPS/2 + ant_id*NCHANG/3 + (ch_id-CHOFF)/384; // actual seq no - 
act_seq_no = seq_no*NSNAPS/4 + aid; // actual seq no - block_seq_no = UTC_START*NSNAPS/4; // seq no corresponding to ant 0 and start of block - - // check for starting or stopping condition, using continue - //if (DEBUG) printf("%"PRIu64" %"PRIu64" %d\n",seq_no,act_seq_no,ch_id);//syslog(LOG_DEBUG, "seq_byte=%"PRIu64", num_inputs=%d, seq_no=%"PRIu64", ant_id =%"PRIu64", ch_id =%"PRIu64"",seq_byte,udpdb.num_inputs,seq_no,ant_id, ch_id); - //if (seq_no == UTC_START && UTC_START != 10000 && ant_id == 0) canWrite=1; - if (canWrite==0) { - if (seq_no >= UTC_START-5 && UTC_START != 10000) ct_snaps++; - if (ct_snaps >= 32) canWrite=1; - } - //if (seq_no > UTC_START && UTC_START != 10000) canWrite=1; - udpdb.last_seq = seq_no; - //syslog(LOG_INFO,"SEQ_NO_DBG %"PRIu64"",seq_no); - if (canWrite == 0) continue; - //if (seq_no == UTC_STOP) canWrite=0; - //if (udpdb.packets->received<100) syslog(LOG_INFO, "seq_byte=%"PRIu64", num_inputs=%d, seq_no=%"PRIu64", ant_id =%"PRIu64", ch_id =%"PRIu64"",seq_byte,udpdb.num_inputs,seq_no,ant_id, ch_id); - - // if first packet - if (!udpdb.capture_started) - { - //udpdb.block_start_byte = act_seq_no * UDP_DATA; - udpdb.block_start_byte = block_seq_no * UDP_DATA; - udpdb.block_end_byte = (udpdb.block_start_byte + udpdb.hdu_bufsz) - UDP_DATA; - udpdb.capture_started = 1; - - syslog (LOG_INFO, "receive_obs: START [%"PRIu64" - %"PRIu64"]", udpdb.block_start_byte, udpdb.block_end_byte); - } - - // if capture running - if (udpdb.capture_started) - { - seq_byte = (act_seq_no * UDP_DATA); - - udpdb.last_byte = seq_byte; - - // if packet arrived too late, ignore - if (seq_byte < udpdb.block_start_byte) - { - //syslog (LOG_INFO, "receive_obs: seq_byte < block_start_byte: %"PRIu64", %"PRIu64"", seq_no, ant_id); - udpdb.packets->dropped++; - udpdb.bytes->dropped += UDP_DATA; - } - else - { - // packet belongs in this block - if (seq_byte <= udpdb.block_end_byte) - { - byte_offset = seq_byte - udpdb.block_start_byte; - memcpy (udpdb.block + 
byte_offset, udpdb.sock->buf + UDP_HEADER, UDP_DATA); - udpdb.packets->received++; - udpdb.bytes->received += UDP_DATA; - udpdb.block_count++; - } - // packet belongs in subsequent block - else - { - //syslog (LOG_INFO, "receive_obs: received packet for subsequent buffer: temp_idx=%d, ant_id=%d, seq_no=%"PRIu64"",temp_idx,ant_id,seq_no); - - if (temp_idx < temp_max) - { - // save packet to temp buffer - memcpy (temp_buffers[temp_idx], udpdb.sock->buf + UDP_HEADER, UDP_DATA); - temp_seq_byte[temp_idx] = seq_byte; - temp_idx++; - } - else - { - udpdb.packets->dropped++; - udpdb.bytes->dropped += UDP_DATA; - } - } - } - } - - // now check for a full buffer or full temp queue - if ((udpdb.block_count >= udpdb.packets_per_buffer) || (temp_idx >= temp_max)) - { - syslog (LOG_INFO, "BLOCK COMPLETE seq_no=%"PRIu64", " - "ant_id=%"PRIu16", block_count=%"PRIu64", " - "temp_idx=%d\n", seq_no, ant_id, udpdb.block_count, - temp_idx); - - uint64_t dropped = udpdb.packets_per_buffer - udpdb.block_count; - if (dropped) - { - udpdb.packets->dropped += dropped; - udpdb.bytes->dropped += (dropped * UDP_DATA); - } - - if (dropped>1000) unhappies_ct++; - - // get a new buffer and write any temp packets saved - if (dsaX_udpdb_new_buffer (&udpdb) < 0) - { - syslog(LOG_ERR, "receive_obs: dsaX_udpdb_new_buffer failed"); - return EXIT_FAILURE; - } - - if (DEBUG) syslog(LOG_INFO, "block bytes: %"PRIu64" - %"PRIu64"\n", udpdb.block_start_byte, udpdb.block_end_byte); - - // include any futuristic packets we saved - for (i=0; i < temp_idx; i++) - { - seq_byte = temp_seq_byte[i]; - byte_offset = seq_byte - udpdb.block_start_byte; - if (byte_offset < udpdb.hdu_bufsz) - { - memcpy (udpdb.block + byte_offset, temp_buffers[i], UDP_DATA); - udpdb.block_count++; - udpdb.packets->received++; - udpdb.bytes->received += UDP_DATA; - } - else - { - udpdb.packets->dropped++; - udpdb.bytes->dropped += UDP_DATA; - } - } - temp_idx = 0; - } - } - - // packet has been inserted or saved by this point - 
udpdb.sock->have_packet = 0; - - // deal with unhappy receiver - if (unhappies_ct > unhappies) { - - syslog(LOG_INFO, "Skipping some blocks..."); - - close(udpdb.sock->fd); - - for (int i=0;idropped += udpdb.packets_per_buffer; - udpdb.bytes->dropped += (udpdb.packets_per_buffer * UDP_DATA); - - if (dsaX_udpdb_new_buffer (&udpdb) < 0) - { - syslog(LOG_ERR, "receive_obs: dsaX_udpdb_new_buffer failed"); - return EXIT_FAILURE; - } - - } - - sleep(sleeps); - - // prepare the socket - syslog(LOG_INFO, "re-preparing the socket dsaX_udpdb_prepare()"); - if (dsaX_udpdb_prepare (&udpdb) < 0) - { - syslog(LOG_ERR, "could allocate required resources (prepare)"); - return EXIT_FAILURE; - } - - unhappies_ct = 0; - - } - - } - - /* END WHAT WAS IN RECV THREAD */ - - - // close threads - syslog(LOG_INFO, "joining control_thread and stats_thread"); - quit_threads = 1; - void* result=0; - pthread_join (control_thread_id, &result); - pthread_join (stats_thread_id, &result); - - free(temp_seq_byte); - free(temp_buffers); - - dsaX_dbgpu_cleanup (hdu_out); - -} diff --git a/src/dsaX_capture_manythread.c b/src/dsaX_capture_manythread.c deleted file mode 100644 index b9f14bd..0000000 --- a/src/dsaX_capture_manythread.c +++ /dev/null @@ -1,1115 +0,0 @@ -/* dsaX_capture.c: Code to capture packets over a socket and write to a dada buffer. 
- -main: runs capture loop, and interfaces dada buffer -control_thread: deals with control commands - -*/ - -#define __USE_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture_manythread.h" -#include "dsaX_def.h" - -/* global variables */ -int dPort, cPort; -int quit_threads = 0; -char STATE[20]; -uint64_t UTC_START = 10000; -uint64_t UTC_STOP = 40000000000; -int MONITOR = 0; -char iP[100]; -int DEBUG = 0; -int HISTOGRAM[16]; -int writeBlock = 0; -const int nth = 4; -const int nwth = 2; -int cores[16] = {10,12,11,13,30,31,32,33}; -int write_cores[8] = {14,15,34,35}; -pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; -volatile int blockStatus[64]; -volatile int skipBlock = 0; -volatile int skipping = 0; -volatile int lWriteBlock = 0; -volatile int write_ct = 0; -volatile uint64_t last_seq = 0; -volatile int skipct = 0; -volatile uint64_t block_count = 0; -volatile uint64_t block_start_byte=0, block_end_byte=0; -volatile unsigned capture_started = 0; -volatile char * wblock; - -void dsaX_dbgpu_cleanup (dada_hdu_t * out); -int dada_bind_thread_to_core (int core); -void usage(); - -void dsaX_dbgpu_cleanup (dada_hdu_t * out) -{ - - if (dada_hdu_unlock_write (out) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_out"); - } - dada_hdu_destroy (out); - - - -} - -void usage() -{ - fprintf (stdout, - "dsaX_capture [options]\n" - " -c core bind process to CPU core [no default]\n" - " -j IP to listen on for data packets [no default]\n" - " -i IP to listen on for control commands [no default]\n" - " -p PORT for data\n" - " -q PORT for control\n" - " -f filename of template dada 
header [no default]\n" - " -o out_key [default CAPTURE_BLOCK_KEY]\n" - " -d send debug messages to syslog\n" - " -g chgroup [default 0]\n" - " -h print usage\n"); -} - -// open a socket -dsaX_sock_t * dsaX_make_sock (udpdb_t * ctx); -dsaX_sock_t * dsaX_make_sock (udpdb_t * ctx) -{ - - // prepare structure - syslog(LOG_INFO, "dsaX_make_sock(): preparing sock structure"); - dsaX_sock_t * b = (dsaX_sock_t *) malloc(sizeof(dsaX_sock_t)); - assert(b != NULL); - b->bufsz = sizeof(char) * UDP_PAYLOAD; - b->buf = (char *) malloc (b->bufsz); - assert(b->buf != NULL); - b->have_packet = 0; - b->fd = 0; - - // connect to socket - syslog(LOG_INFO, "dsaX_make_sock(): connecting to socket %s:%d", ctx->interface, dPort); - - // open socket - syslog(LOG_INFO, "prepare: creating udp socket on %s:%d", ctx->interface, dPort); - b->fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); - assert(b->fd>=0); - - // for multiple connections - int one = 1; - setsockopt(b->fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &one, sizeof(one)); - - struct sockaddr_in udp_sock; - bzero(&(udp_sock.sin_zero), 8); // clear the struct - udp_sock.sin_family = AF_INET; // internet/IP - udp_sock.sin_port = htons(dPort); // set the port number - udp_sock.sin_addr.s_addr = inet_addr(ctx->interface); // from a specific IP address - - if (bind(b->fd, (struct sockaddr *)&udp_sock, sizeof(udp_sock)) == -1) { - syslog(LOG_ERR, "prepare: failed to bind to socket"); - return -1; - } - - // set the socket size to 64 MB - int sock_buf_size = 64*1024*1024; - syslog(LOG_INFO, "prepare: setting buffer size to %d", sock_buf_size); - dada_udp_sock_set_buffer_size (ctx->log, b->fd, ctx->verbose, sock_buf_size); - - // set the socket to non-blocking - syslog(LOG_INFO, "prepare: setting non_block"); - sock_nonblock(b->fd); - - // clear any packets buffered by the kernel - syslog(LOG_INFO, "prepare: clearing packets at socket"); - size_t cleared = dada_sock_clear_buffered_packets(b->fd, UDP_PAYLOAD); - - // clear blockStatus - for 
(int i=0;i<64;i++) blockStatus[i] = 0; - - return b; -} - - - -// close a socket -void dsaX_free_sock(dsaX_sock_t* b); -void dsaX_free_sock(dsaX_sock_t* b) -{ - b->fd = 0; - b->bufsz = 0; - b->have_packet =0; - if (b->buf) - free (b->buf); - b->buf = 0; -} - -/* - * open a data block buffer ready for direct access - */ -int dsaX_udpdb_open_buffer (dsaX_write_t * ctx); -int dsaX_udpdb_open_buffer (dsaX_write_t * ctx) -{ - - if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_open_buffer()"); - - if (ctx->block_open) - { - syslog (LOG_ERR, "open_buffer: buffer already opened"); - return -1; - } - - if (DEBUG) syslog (LOG_DEBUG, "open_buffer: ipcio_open_block_write"); - - uint64_t block_id = 0; - - wblock = ipcio_open_block_write (ctx->hdu->data_block, &block_id); - if (!wblock) - { - syslog (LOG_ERR, "open_buffer: ipcio_open_block_write failed"); - return -1; - } - - ctx->block_open = 1; - - return 0; -} - -/* - * close a data buffer, assuming a full block has been written - */ -int dsaX_udpdb_close_buffer (dsaX_write_t * ctx, uint64_t bytes_written, unsigned eod); -int dsaX_udpdb_close_buffer (dsaX_write_t * ctx, uint64_t bytes_written, unsigned eod) -{ - - if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_close_buffer(%"PRIu64", %d)", bytes_written, eod); - - if (!ctx->block_open) - { - syslog (LOG_ERR, "close_buffer: buffer already closed"); - return -1; - } - - // log any buffers that are not full, except for the 1 byte "EOD" buffer - if ((bytes_written != 1) && (bytes_written != ctx->hdu_bufsz)) - syslog ((eod ? 
LOG_INFO : LOG_WARNING), "close_buffer: " - "bytes_written[%"PRIu64"] != hdu_bufsz[%"PRIu64"]", - bytes_written, ctx->hdu_bufsz); - - if (eod) - { - if (ipcio_update_block_write (ctx->hdu->data_block, bytes_written) < 0) - { - syslog (LOG_ERR, "close_buffer: ipcio_update_block_write failed"); - return -1; - } - } - else - { - if (ipcio_close_block_write (ctx->hdu->data_block, bytes_written) < 0) - { - syslog (LOG_ERR, "close_buffer: ipcio_close_block_write failed"); - return -1; - } - } - - wblock = 0; - ctx->block_open = 0; - - return 0; -} - -/* - * move to the next ring buffer element. return pointer to base address of new buffer - */ -int dsaX_udpdb_new_buffer (dsaX_write_t * ctx); -int dsaX_udpdb_new_buffer (dsaX_write_t * ctx) -{ - - if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_new_buffer()"); - - if (dsaX_udpdb_close_buffer (ctx, ctx->hdu_bufsz, 0) < 0) - { - syslog (LOG_ERR, "new_buffer: dsaX_udpdb_close_buffer failed"); - return -1; - } - - if (dsaX_udpdb_open_buffer (ctx) < 0) - { - syslog (LOG_ERR, "new_buffer: dsaX_udpdb_open_buffer failed"); - return -1; - } - - return 0; - -} - -// increment counters when block is full -void dsaX_udpdb_increment (udpdb_t * ctx); -void dsaX_udpdb_increment (udpdb_t * ctx) -{ - - // increment buffer byte markers - writeBlock++; - block_start_byte = block_end_byte + UDP_DATA; - block_end_byte = block_start_byte + ( ctx->packets_per_buffer - 1) * UDP_DATA; - block_count = 0; - -} - - - -/* --------- THREADS -------- */ - -// STATS THREAD - -/* - * Thread to print simple capture statistics - */ -void stats_thread(void * arg) { - - dsaX_stats_t * ctx = (dsaX_stats_t *) arg; - uint64_t b_rcv_total = 0; - uint64_t b_rcv_1sec = 0; - uint64_t b_rcv_curr = 0; - - uint64_t b_drp_total = 0; - uint64_t b_drp_1sec = 0; - uint64_t b_drp_curr = 0; - - uint64_t s_rcv_total = 0; - uint64_t s_rcv_1sec = 0; - uint64_t s_rcv_curr = 0; - - uint64_t ooo_pkts = 0; - float gb_rcv_ps = 0; - float mb_rcv_ps = 0; - float mb_drp_ps = 0; - - 
syslog(LOG_INFO,"starting stats thread..."); - sleep(2); - syslog(LOG_INFO,"started stats thread..."); - - while (!quit_threads) - { - - /* get a snapshot of the data as quickly as possible */ - b_rcv_curr = ctx->bytes->received; - b_drp_curr = ctx->bytes->dropped; - - /* calc the values for the last second */ - b_rcv_1sec = b_rcv_curr - b_rcv_total; - b_drp_1sec = b_drp_curr - b_drp_total; - - /* update the totals */ - b_rcv_total = b_rcv_curr; - b_drp_total = b_drp_curr; - - mb_rcv_ps = (double) b_rcv_1sec / 1000000; - mb_drp_ps = (double) b_drp_1sec / 1000000; - gb_rcv_ps = b_rcv_1sec * 8; - gb_rcv_ps /= 1000000000; - - /* determine how much memory is free in the receivers */ - syslog (LOG_NOTICE,"CAPSTATS %6.3f [Gb/s], D %4.1f [MB/s], D %"PRIu64" pkts, %"PRIu64" skipped %d", gb_rcv_ps, mb_drp_ps, ctx->packets->dropped, last_seq, skipct); - - sleep(1); - } - -} - -// CONTROL THREAD - -void control_thread (void * arg) { - - syslog(LOG_INFO, "control_thread: starting"); - - // port on which to listen for control commands - int port = cPort; - char sport[10]; - sprintf(sport,"%d",port); - - // buffer for incoming command strings, and setup of socket - int bufsize = 1024; - char* buffer = (char *) malloc (sizeof(char) * bufsize); - memset(buffer, '\0', bufsize); - const char* whitespace = " "; - char * command = 0; - char * args = 0; - - struct addrinfo hints; - struct addrinfo* res=0; - memset(&hints,0,sizeof(hints)); - struct sockaddr_storage src_addr; - socklen_t src_addr_len=sizeof(src_addr); - hints.ai_family=AF_INET; - hints.ai_socktype=SOCK_DGRAM; - getaddrinfo(iP,sport,&hints,&res); - int fd; - ssize_t ct; - char tmpstr; - char cmpstr = 'p'; - char *endptr; - uint64_t tmps; - char * token; - - syslog(LOG_INFO, "control_thread: created socket on port %d", port); - - while (!quit_threads) { - - fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol); - bind(fd,res->ai_addr,res->ai_addrlen); - memset(buffer,'\0',sizeof(buffer)); - syslog(LOG_INFO, 
"control_thread: waiting for packet"); - ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len); - - syslog(LOG_INFO, "control_thread: received buffer string %s",buffer); - - // INTERPRET BUFFER STRING - // receive either UTC_START, UTC_STOP, MONITOR - - // interpret buffer string - char * rest = buffer; - char *cmd, *val; - cmd = strtok_r(rest, "-", &rest); - val = strtok_r(rest, "-", &rest); - syslog(LOG_INFO, "control_thread: split into COMMAND %s, VALUE %s",cmd,val); - - if (strcmp(cmd,"UTC_START")==0) - UTC_START = strtoull(val,&endptr,0); - - if (strcmp(cmd,"UTC_STOP")==0) - UTC_STOP = strtoull(val,&endptr,0); - - close(fd); - - } - - free (buffer); - - syslog(LOG_INFO, "control_thread: exiting"); - - /* return 0 */ - int thread_result = 0; - pthread_exit((void *) &thread_result); - -} - - -/* - * Thread to capture data - */ -int recv_thread(void * arg) { - - udpdb_t * udpdb = (udpdb_t *) arg; - int thread_id = udpdb->thread_id; - - // set affinity - const pthread_t pid = pthread_self(); - int core_id; - if (dPort==4011) - core_id = cores[thread_id]; - else - core_id = cores[thread_id+nth]; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id); - const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id); - if (CPU_ISSET(core_id, &cpuset)) - syslog(LOG_INFO,"thread %d: successfully set thread",core_id); - - // set up socket - dsaX_sock_t * sock = dsaX_make_sock(udpdb); - - // lookup table for ant order - uint64_t ant_lookup[100], vv; - for (int i=0;i<100;i++) ant_lookup[i] = 0; - for (int i=0;ibuf; - size_t got = 0; // data received from a recv_from call - int errsv; // determine the sequence number boundaries for curr and next buffers - int64_t byte_offset = 
0; // offset of current packet in bytes from start of block - uint64_t seq_byte = 0; // offset of current packet in bytes from start of obs - // for "saving" out of order packets near edges of blocks - unsigned int temp_idx = 0; - unsigned int temp_max = 500; - char ** temp_buffers; - uint64_t * temp_seq_byte; - temp_buffers = (char **)malloc(sizeof(char *)*temp_max); - for (int i=0;ihave_packet = 0; - - // incredibly tight loop to try and get a packet - while (!sock->have_packet) - { - - // receive 1 packet into the socket buffer - got = recvfrom ( sock->fd, sock->buf, UDP_PAYLOAD, 0, NULL, NULL ); - - if (got == UDP_PAYLOAD) - { - sock->have_packet = 1; - } - else if (got == -1) - { - errsv = errno; - if (errsv == EAGAIN) - { - if (capture_started) - timeouts++; - //if (timeouts > timeout_max) - //syslog(LOG_INFO, "timeouts[%"PRIu64"] > timeout_max[%"PRIu64"]\n",timeouts, timeout_max); - } - else - { - //syslog (LOG_ERR, "receive_obs: recvfrom failed %s", strerror(errsv)); - return EXIT_FAILURE; - } - } - else // we received a packet of the WRONG size, ignore it - { - syslog (LOG_NOTICE, "receive_obs: received %lu bytes, expected %d", got, UDP_PAYLOAD); - } - } - timeouts = 0; - - // we have a valid packet within the timeout - if (sock->have_packet) - { - - // decode packet header (64 bits) - // 35 bits seq_no (for first spectrum in packet); 13 bits ch_id (for first channel in packet); 16 bits ant ID (for first antenna in packet) - seq_no = 0; - seq_no |= (((uint64_t)(sock->buf[4]) & 224) >> 5) & 7; - seq_no |= (((uint64_t)(sock->buf[3])) << 3) & 2040; - seq_no |= (((uint64_t)(sock->buf[2])) << 11) & 522240; - seq_no |= (((uint64_t)(sock->buf[1])) << 19) & 133693440; - seq_no |= (((uint64_t)(sock->buf[0])) << 27) & 34225520640; - ant_id = 0; - ant_id |= (unsigned char) (sock->buf[6]) << 8; - ant_id |= (unsigned char) (sock->buf[7]); - aid = ant_lookup[(int)(ant_id)]; - //aid = ant_id/3; - - if (UTC_START==0) UTC_START = seq_no+30000; - - act_seq_no = 
seq_no*NSNAPS/4 + aid; // actual seq no - block_seq_no = UTC_START*NSNAPS/4; // seq no corresponding to ant 0 and start of block - - // set shared last_seq - pthread_mutex_lock(&mutex); - last_seq = seq_no; - //syslog(LOG_INFO,"last_seq %"PRIu64"",last_seq); - pthread_mutex_unlock(&mutex); - - // check for starting or stopping condition, using continue - if (canWrite==0) { - if (seq_no >= UTC_START-50 && UTC_START != 10000) { - canWrite=1; - } - } - if (canWrite == 0) continue; - - // threadsafe start of capture - pthread_mutex_lock(&mutex); - if (!(capture_started)) - { - block_start_byte = block_seq_no * UDP_DATA; - block_end_byte = (block_start_byte + udpdb->hdu_bufsz) - UDP_DATA; - capture_started = 1; - - syslog (LOG_INFO, "receive_obs: START [%"PRIu64" - %"PRIu64"]", block_start_byte, block_end_byte); - } - pthread_mutex_unlock(&mutex); - - // if capture running - if (capture_started) - { - seq_byte = (act_seq_no * UDP_DATA); - tpack++; - - // packet belongs in this block - if ((seq_byte <= block_end_byte) && (seq_byte >= block_start_byte)) - { - byte_offset = seq_byte - (block_start_byte); - mod_WB = writeBlock % 64; - memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, sock->buf + UDP_HEADER, UDP_DATA); - pthread_mutex_lock(&mutex); - block_count++; - //syslog(LOG_INFO,"block count %"PRIu64"",block_count); - pthread_mutex_unlock(&mutex); - - } - // packet belongs in subsequent block - else if (seq_byte > block_end_byte) - { - - if (temp_idx < temp_max) - { - // save packet to temp buffer - memcpy (temp_buffers[temp_idx], sock->buf + UDP_HEADER, UDP_DATA); - temp_seq_byte[temp_idx] = seq_byte; - temp_idx++; - } - } - // packet is too late - /*else - { - if (ctAnts<100) { - syslog (LOG_INFO, "receive_obs: TOO LATE %"PRIu64" %"PRIu64"", seq_no, ant_id); - ctAnts++; - } - }*/ - } - - // threadsafe end of block - pthread_mutex_lock(&mutex); - if ((block_count >= udpdb->packets_per_buffer) || (temp_idx >= temp_max)) - { - syslog (LOG_INFO, "BLOCK 
COMPLETE thread_id=%d, seq_no=%"PRIu64", " - "ant_id=%"PRIu16", block_count=%"PRIu64", " - "temp_idx=%d, writeBlock=%d", thread_id, seq_no, ant_id, block_count, - temp_idx,writeBlock); - - // write block - // check whether doWrite has been released. If not, skip this block - if (blockStatus[writeBlock % 64] > 0) - blockStatus[writeBlock % 64] += 1; - else - blockStatus[writeBlock % 64] = 1; - - uint64_t dropped = udpdb->packets_per_buffer - (block_count); - udpdb->packets->received += (block_count); - udpdb->bytes->received += (block_count) * UDP_DATA; - if (dropped) - { - udpdb->packets->dropped += dropped; - udpdb->bytes->dropped += (dropped * UDP_DATA); - } - - // increment counters - dsaX_udpdb_increment(udpdb); - ctAnts = 0; - - // write temp queue for this thread - //syslog(LOG_INFO,"thread %d: packets in this block %"PRIu64", temp_idx %d",thread_id,tpack,temp_idx); - tpack = 0; - - for (i=0; i < temp_idx; i++) - { - seq_byte = temp_seq_byte[i]; - byte_offset = seq_byte - (block_start_byte); - if (byte_offset < udpdb->hdu_bufsz && byte_offset >= 0) - { - mod_WB = writeBlock % 64; - memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, temp_buffers[i], UDP_DATA); - //pthread_mutex_lock(&mutex); - block_count++; - //pthread_mutex_unlock(&mutex); - } - } - temp_idx = 0; - - } - pthread_mutex_unlock(&mutex); - - // at this stage, can try and write temp queue safely for other threads - if (temp_seq_byte[0] >= block_start_byte && temp_seq_byte[0] <= block_end_byte && temp_idx > 0) - { - //syslog(LOG_INFO,"thread %d: packets in this block %"PRIu64", temp_idx %d",thread_id,tpack,temp_idx); - tpack = 0; - - for (i=0; i < temp_idx; i++) - { - seq_byte = temp_seq_byte[i]; - byte_offset = seq_byte - (block_start_byte); - if (byte_offset < udpdb->hdu_bufsz && byte_offset >= 0) - { - mod_WB = writeBlock % 64; - memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, temp_buffers[i], UDP_DATA); - pthread_mutex_lock(&mutex); - block_count++; - 
pthread_mutex_unlock(&mutex); - } - } - temp_idx = 0; - - } - - } - - // packet has been inserted or saved by this point - sock->have_packet = 0; - - } - - dsaX_free_sock(sock); - free(temp_buffers); - free(temp_seq_byte); - -} - -/* - * Thread to write data - */ -void write_thread(void * arg) { - - dsaX_write_t * udpdb = (dsaX_write_t *) arg; - int thread_id = udpdb->thread_id; - - // set affinity - const pthread_t pid = pthread_self(); - int core_id; - if (dPort==4011) - core_id = write_cores[thread_id]; - else - core_id = write_cores[thread_id+nwth]; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id); - const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id); - if (CPU_ISSET(core_id, &cpuset)) - syslog(LOG_INFO,"thread %d: successfully set thread",core_id); - - int mod_WB = 0; - int a; - - while (!quit_threads) - { - - mod_WB = lWriteBlock % 64; - - while (blockStatus[mod_WB]==0) { - a=1; - } - - // assume everything is set up - // wblock is assigned, write_ct=0 - - memcpy(wblock + thread_id*udpdb->hdu_bufsz/nwth, udpdb->tblock + mod_WB*udpdb->hdu_bufsz + thread_id*udpdb->hdu_bufsz/nwth, udpdb->hdu_bufsz/nwth); - - pthread_mutex_lock(&mutex); - write_ct++; - pthread_mutex_unlock(&mutex); - - //syslog(LOG_INFO,"write thread %d: successfully memcpied",thread_id); - - // now wait until thread 0 has finished getting a new block before moving on - if (thread_id>0) { - while (write_ct!=0) a=1; - } - else { - - // wait for all sub-blocks to be written - while (write_ct= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // OPEN CONNECTION TO DADA DB FOR WRITING - - if (DEBUG) 
syslog(LOG_DEBUG,"Creating HDU"); - - hdu_out = dada_hdu_create (0); - if (DEBUG) syslog(DEBUG,"Created hdu"); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog(LOG_ERR,"could not connect to output dada buffer"); - return EXIT_FAILURE; - } - if (DEBUG) syslog(LOG_DEBUG,"Connected HDU"); - if (dada_hdu_lock_write(hdu_out) < 0) { - dsaX_dbgpu_cleanup (hdu_out); - syslog(LOG_ERR,"could not lock to output dada buffer"); - return EXIT_FAILURE; - } - - syslog(LOG_INFO,"opened connection to output DB"); - - // DEAL WITH DADA HEADER - char *hout; - hout = (char *)malloc(sizeof(char)*4096); - if (DEBUG) syslog(DEBUG,"read header2"); - - if (fileread (dada_fnam, hout, 4096) < 0) - { - free (hout); - syslog (LOG_ERR, "could not read ASCII header from %s", dada_fnam); - return (EXIT_FAILURE); - } - - - if (DEBUG) syslog(DEBUG,"read header3"); - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_out); - return EXIT_FAILURE; - } - - // copy the in header to the out header - memcpy (header_out, hout, 4096); - - // mark the output header buffer as filled - if (ipcbuf_mark_filled (hdu_out->header_block, 4096) < 0) - { - syslog(LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_out); - return EXIT_FAILURE; - } - - // record STATE info - sprintf(STATE,"LISTEN"); - syslog(LOG_INFO,"marked output header block as filled - now in LISTEN state"); - - - /* time to start up receiver. 
- data are captured on iface:CAPTURE_PORT - */ - - // make recv, write, and stats structs - udpdb_t udpdb[nth]; - dsaX_stats_t stats; - dsaX_write_t writey[nwth]; - - // shared variables and memory - uint64_t bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - char * tblock = (char *)malloc(sizeof(char)*bufsz*64); - stats_t * packets = init_stats_t(); - stats_t * bytes = init_stats_t(); - reset_stats_t(packets); - reset_stats_t(bytes); - - // initialise stats struct - stats.packets = packets; - stats.bytes = bytes; - - // initialise writey struct and open buffer - for (int i=0;idata_block); - writey[i].block_open = 0; - writey[i].tblock = tblock; - writey[i].thread_id = i; - } - dsaX_udpdb_open_buffer (&writey[0]); - - // initialise all udpdb structs - for (int i=0;i -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture_manythread.h" -#include "dsaX_def.h" - -/* global variables */ -int quit_threads = 0; -char STATE[20]; -uint64_t UTC_START = 10000; -uint64_t UTC_STOP = 40000000000; -int MONITOR = 0; -char iP[100]; -int DEBUG = 0; -int HISTOGRAM[16]; -int writeBlock = 0; -const int nth = 8; -const int nwth = 4; -int cores[8] = {30,31,32,33,34,35,36,37}; -int write_cores[4] = {17,18,19,39}; -pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; -volatile int doWrite = 0; -volatile int skipBlock = 0; -volatile int skipping = 0; -volatile int lWriteBlock = 0; -volatile int write_ct = 0; -volatile uint64_t last_seq = 0; -volatile int skipct = 0; -volatile uint64_t block_count = 0; -volatile uint64_t block_start_byte=0, block_end_byte=0; -volatile unsigned capture_started = 0; -volatile char * wblock; - 
-void dsaX_dbgpu_cleanup (dada_hdu_t * out); -int dada_bind_thread_to_core (int core); -void usage(); - -void dsaX_dbgpu_cleanup (dada_hdu_t * out) -{ - - if (dada_hdu_unlock_write (out) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_out"); - } - dada_hdu_destroy (out); - - - -} - -void usage() -{ - fprintf (stdout, - "dsaX_capture [options]\n" - " -c core bind process to CPU core [no default]\n" - " -j IP to listen on for data packets [no default]\n" - " -i IP to listen on for control commands [no default]\n" - " -f filename of template dada header [no default]\n" - " -o out_key [default CAPTURE_BLOCK_KEY]\n" - " -d send debug messages to syslog\n" - " -g chgroup [default 0]\n" - " -h print usage\n"); -} - -// open a socket -dsaX_sock_t * dsaX_make_sock (udpdb_t * ctx); -dsaX_sock_t * dsaX_make_sock (udpdb_t * ctx) -{ - - // prepare structure - syslog(LOG_INFO, "dsaX_make_sock(): preparing sock structure"); - dsaX_sock_t * b = (dsaX_sock_t *) malloc(sizeof(dsaX_sock_t)); - assert(b != NULL); - b->bufsz = sizeof(char) * UDP_PAYLOAD; - b->buf = (char *) malloc (b->bufsz); - assert(b->buf != NULL); - b->have_packet = 0; - b->fd = 0; - - // connect to socket - syslog(LOG_INFO, "dsaX_make_sock(): connecting to socket %s:%d", ctx->interface, ctx->port); - - // open socket - syslog(LOG_INFO, "prepare: creating udp socket on %s:%d", ctx->interface, ctx->port); - b->fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); - assert(b->fd>=0); - - // for multiple connections - int one = 1; - setsockopt(b->fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &one, sizeof(one)); - - struct sockaddr_in udp_sock; - bzero(&(udp_sock.sin_zero), 8); // clear the struct - udp_sock.sin_family = AF_INET; // internet/IP - udp_sock.sin_port = htons(ctx->port); // set the port number - udp_sock.sin_addr.s_addr = inet_addr(ctx->interface); // from a specific IP address - - if (bind(b->fd, (struct sockaddr *)&udp_sock, sizeof(udp_sock)) == -1) { - syslog(LOG_ERR, "prepare: failed to bind to 
socket"); - return -1; - } - - // set the socket size to 256 MB - int sock_buf_size = 256*1024*1024; - syslog(LOG_INFO, "prepare: setting buffer size to %d", sock_buf_size); - dada_udp_sock_set_buffer_size (ctx->log, b->fd, ctx->verbose, sock_buf_size); - - // set the socket to non-blocking - syslog(LOG_INFO, "prepare: setting non_block"); - sock_nonblock(b->fd); - - // clear any packets buffered by the kernel - syslog(LOG_INFO, "prepare: clearing packets at socket"); - size_t cleared = dada_sock_clear_buffered_packets(b->fd, UDP_PAYLOAD); - - return b; -} - - - -// close a socket -void dsaX_free_sock(dsaX_sock_t* b); -void dsaX_free_sock(dsaX_sock_t* b) -{ - b->fd = 0; - b->bufsz = 0; - b->have_packet =0; - if (b->buf) - free (b->buf); - b->buf = 0; -} - -/* - * open a data block buffer ready for direct access - */ -int dsaX_udpdb_open_buffer (dsaX_write_t * ctx); -int dsaX_udpdb_open_buffer (dsaX_write_t * ctx) -{ - - if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_open_buffer()"); - - if (ctx->block_open) - { - syslog (LOG_ERR, "open_buffer: buffer already opened"); - return -1; - } - - if (DEBUG) syslog (LOG_DEBUG, "open_buffer: ipcio_open_block_write"); - - uint64_t block_id = 0; - - wblock = ipcio_open_block_write (ctx->hdu->data_block, &block_id); - if (!wblock) - { - syslog (LOG_ERR, "open_buffer: ipcio_open_block_write failed"); - return -1; - } - - ctx->block_open = 1; - - return 0; -} - -/* - * close a data buffer, assuming a full block has been written - */ -int dsaX_udpdb_close_buffer (dsaX_write_t * ctx, uint64_t bytes_written, unsigned eod); -int dsaX_udpdb_close_buffer (dsaX_write_t * ctx, uint64_t bytes_written, unsigned eod) -{ - - if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_close_buffer(%"PRIu64", %d)", bytes_written, eod); - - if (!ctx->block_open) - { - syslog (LOG_ERR, "close_buffer: buffer already closed"); - return -1; - } - - // log any buffers that are not full, except for the 1 byte "EOD" buffer - if ((bytes_written != 1) && (bytes_written != 
ctx->hdu_bufsz)) - syslog ((eod ? LOG_INFO : LOG_WARNING), "close_buffer: " - "bytes_written[%"PRIu64"] != hdu_bufsz[%"PRIu64"]", - bytes_written, ctx->hdu_bufsz); - - if (eod) - { - if (ipcio_update_block_write (ctx->hdu->data_block, bytes_written) < 0) - { - syslog (LOG_ERR, "close_buffer: ipcio_update_block_write failed"); - return -1; - } - } - else - { - if (ipcio_close_block_write (ctx->hdu->data_block, bytes_written) < 0) - { - syslog (LOG_ERR, "close_buffer: ipcio_close_block_write failed"); - return -1; - } - } - - wblock = 0; - ctx->block_open = 0; - - return 0; -} - -/* - * move to the next ring buffer element. return pointer to base address of new buffer - */ -int dsaX_udpdb_new_buffer (dsaX_write_t * ctx); -int dsaX_udpdb_new_buffer (dsaX_write_t * ctx) -{ - - if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_new_buffer()"); - - if (dsaX_udpdb_close_buffer (ctx, ctx->hdu_bufsz, 0) < 0) - { - syslog (LOG_ERR, "new_buffer: dsaX_udpdb_close_buffer failed"); - return -1; - } - - if (dsaX_udpdb_open_buffer (ctx) < 0) - { - syslog (LOG_ERR, "new_buffer: dsaX_udpdb_open_buffer failed"); - return -1; - } - - return 0; - -} - -// increment counters when block is full -void dsaX_udpdb_increment (udpdb_t * ctx); -void dsaX_udpdb_increment (udpdb_t * ctx) -{ - - // increment buffer byte markers - writeBlock++; - block_start_byte = block_end_byte + UDP_DATA; - block_end_byte = block_start_byte + ( ctx->packets_per_buffer - 1) * UDP_DATA; - block_count = 0; - -} - - - -/* --------- THREADS -------- */ - -// STATS THREAD - -/* - * Thread to print simple capture statistics - */ -void stats_thread(void * arg) { - - dsaX_stats_t * ctx = (dsaX_stats_t *) arg; - uint64_t b_rcv_total = 0; - uint64_t b_rcv_1sec = 0; - uint64_t b_rcv_curr = 0; - - uint64_t b_drp_total = 0; - uint64_t b_drp_1sec = 0; - uint64_t b_drp_curr = 0; - - uint64_t s_rcv_total = 0; - uint64_t s_rcv_1sec = 0; - uint64_t s_rcv_curr = 0; - - uint64_t ooo_pkts = 0; - float gb_rcv_ps = 0; - float mb_rcv_ps = 0; - 
float mb_drp_ps = 0; - - syslog(LOG_INFO,"starting stats thread..."); - sleep(2); - syslog(LOG_INFO,"started stats thread..."); - - while (!quit_threads) - { - - /* get a snapshot of the data as quickly as possible */ - b_rcv_curr = ctx->bytes->received; - b_drp_curr = ctx->bytes->dropped; - - /* calc the values for the last second */ - b_rcv_1sec = b_rcv_curr - b_rcv_total; - b_drp_1sec = b_drp_curr - b_drp_total; - - /* update the totals */ - b_rcv_total = b_rcv_curr; - b_drp_total = b_drp_curr; - - mb_rcv_ps = (double) b_rcv_1sec / 1000000; - mb_drp_ps = (double) b_drp_1sec / 1000000; - gb_rcv_ps = b_rcv_1sec * 8; - gb_rcv_ps /= 1000000000; - - /* determine how much memory is free in the receivers */ - syslog (LOG_NOTICE,"CAPSTATS %6.3f [Gb/s], D %4.1f [MB/s], D %"PRIu64" pkts, %"PRIu64" skipped %d", gb_rcv_ps, mb_drp_ps, ctx->packets->dropped, last_seq, skipct); - - sleep(1); - } - -} - -// CONTROL THREAD - -void control_thread (void * arg) { - - syslog(LOG_INFO, "control_thread: starting"); - - // port on which to listen for control commands - int port = CAPTURE_CONTROL_PORT; - char sport[10]; - sprintf(sport,"%d",port); - - // buffer for incoming command strings, and setup of socket - int bufsize = 1024; - char* buffer = (char *) malloc (sizeof(char) * bufsize); - memset(buffer, '\0', bufsize); - const char* whitespace = " "; - char * command = 0; - char * args = 0; - - struct addrinfo hints; - struct addrinfo* res=0; - memset(&hints,0,sizeof(hints)); - struct sockaddr_storage src_addr; - socklen_t src_addr_len=sizeof(src_addr); - hints.ai_family=AF_INET; - hints.ai_socktype=SOCK_DGRAM; - getaddrinfo(iP,sport,&hints,&res); - int fd; - ssize_t ct; - char tmpstr; - char cmpstr = 'p'; - char *endptr; - uint64_t tmps; - char * token; - - syslog(LOG_INFO, "control_thread: created socket on port %d", port); - - while (!quit_threads) { - - fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol); - bind(fd,res->ai_addr,res->ai_addrlen); - 
memset(buffer,'\0',sizeof(buffer)); - syslog(LOG_INFO, "control_thread: waiting for packet"); - ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len); - - syslog(LOG_INFO, "control_thread: received buffer string %s",buffer); - - // INTERPRET BUFFER STRING - // receive either UTC_START, UTC_STOP, MONITOR - - // interpret buffer string - char * rest = buffer; - char *cmd, *val; - cmd = strtok_r(rest, "-", &rest); - val = strtok_r(rest, "-", &rest); - syslog(LOG_INFO, "control_thread: split into COMMAND %s, VALUE %s",cmd,val); - - if (strcmp(cmd,"UTC_START")==0) - UTC_START = strtoull(val,&endptr,0); - - if (strcmp(cmd,"UTC_STOP")==0) - UTC_STOP = strtoull(val,&endptr,0); - - close(fd); - - } - - free (buffer); - - syslog(LOG_INFO, "control_thread: exiting"); - - /* return 0 */ - int thread_result = 0; - pthread_exit((void *) &thread_result); - -} - - -/* - * Thread to capture data - */ -void recv_thread(void * arg) { - - udpdb_t * udpdb = (udpdb_t *) arg; - int thread_id = udpdb->thread_id; - - // set affinity - const pthread_t pid = pthread_self(); - const int core_id = cores[thread_id]; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id); - const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id); - if (CPU_ISSET(core_id, &cpuset)) - syslog(LOG_INFO,"thread %d: successfully set thread",core_id); - - // set up socket - dsaX_sock_t * sock = dsaX_make_sock(udpdb); - - // DEFINITIONS - uint64_t tpack = 0; - uint64_t act_seq_no = 0; - uint64_t block_seq_no = 0; - uint64_t seq_no = 0; - uint64_t ant_id = 0; - unsigned char * b = (unsigned char *) sock->buf; - size_t got = 0; // data received from a recv_from call - int errsv; // determine the sequence number 
boundaries for curr and next buffers - int64_t byte_offset = 0; // offset of current packet in bytes from start of block - uint64_t seq_byte = 0; // offset of current packet in bytes from start of obs - // for "saving" out of order packets near edges of blocks - unsigned int temp_idx = 0; - unsigned int temp_max = 500; - char ** temp_buffers; - uint64_t * temp_seq_byte; - temp_buffers = (char **)malloc(sizeof(char *)*temp_max); - for (int i=0;ihave_packet = 0; - - // incredibly tight loop to try and get a packet - while (!sock->have_packet) - { - - // receive 1 packet into the socket buffer - got = recvfrom ( sock->fd, sock->buf, UDP_PAYLOAD, 0, NULL, NULL ); - - if (got == UDP_PAYLOAD) - { - sock->have_packet = 1; - } - else if (got == -1) - { - errsv = errno; - if (errsv == EAGAIN) - { - if (capture_started) - timeouts++; - //if (timeouts > timeout_max) - //syslog(LOG_INFO, "timeouts[%"PRIu64"] > timeout_max[%"PRIu64"]\n",timeouts, timeout_max); - } - else - { - //syslog (LOG_ERR, "receive_obs: recvfrom failed %s", strerror(errsv)); - return EXIT_FAILURE; - } - } - else // we received a packet of the WRONG size, ignore it - { - syslog (LOG_NOTICE, "receive_obs: received %d bytes, expected %d", got, UDP_PAYLOAD); - } - } - timeouts = 0; - - // we have a valid packet within the timeout - if (sock->have_packet) - { - - // decode packet header (64 bits) - // 35 bits seq_no (for first spectrum in packet); 13 bits ch_id (for first channel in packet); 16 bits ant ID (for first antenna in packet) - seq_no = 0; - seq_no |= (((uint64_t)(sock->buf[4]) & 224) >> 5) & 7; - seq_no |= (((uint64_t)(sock->buf[3])) << 3) & 2040; - seq_no |= (((uint64_t)(sock->buf[2])) << 11) & 522240; - seq_no |= (((uint64_t)(sock->buf[1])) << 19) & 133693440; - seq_no |= (((uint64_t)(sock->buf[0])) << 27) & 34225520640; - ant_id = 0; - ant_id |= (unsigned char) (sock->buf[6]) << 8; - ant_id |= (unsigned char) (sock->buf[7]); - - act_seq_no = seq_no*NCHANG*NSNAPS/2 + ant_id*NCHANG/3; // actual seq 
no - block_seq_no = UTC_START*NCHANG*NSNAPS/2; // seq no corresponding to ant 0 and start of block - - // set shared last_seq - pthread_mutex_lock(&mutex); - last_seq = seq_no; - //syslog(LOG_INFO,"last_seq %"PRIu64"",last_seq); - pthread_mutex_unlock(&mutex); - - // check for starting or stopping condition, using continue - if (canWrite==0) { - if (seq_no >= UTC_START-50 && UTC_START != 10000) { - canWrite=1; - } - } - if (canWrite == 0) continue; - - // threadsafe start of capture - pthread_mutex_lock(&mutex); - if (!(capture_started)) - { - block_start_byte = block_seq_no * UDP_DATA; - block_end_byte = (block_start_byte + udpdb->hdu_bufsz) - UDP_DATA; - capture_started = 1; - - syslog (LOG_INFO, "receive_obs: START [%"PRIu64" - %"PRIu64"]", block_start_byte, block_end_byte); - } - pthread_mutex_unlock(&mutex); - - // if capture running - if (capture_started) - { - seq_byte = (act_seq_no * UDP_DATA); - tpack++; - - // packet belongs in this block - if ((seq_byte <= block_end_byte) && (seq_byte >= block_start_byte)) - { - byte_offset = seq_byte - (block_start_byte); - mod_WB = writeBlock % 64; - memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, sock->buf + UDP_HEADER, UDP_DATA); - pthread_mutex_lock(&mutex); - block_count++; - //syslog(LOG_INFO,"block count %"PRIu64"",block_count); - pthread_mutex_unlock(&mutex); - - } - // packet belongs in subsequent block - else if (seq_byte > block_end_byte) - { - - if (temp_idx < temp_max) - { - // save packet to temp buffer - memcpy (temp_buffers[temp_idx], sock->buf + UDP_HEADER, UDP_DATA); - temp_seq_byte[temp_idx] = seq_byte; - temp_idx++; - } - } - } - - // threadsafe end of block - pthread_mutex_lock(&mutex); - if ((block_count >= udpdb->packets_per_buffer) || (temp_idx >= temp_max)) - { - syslog (LOG_INFO, "BLOCK COMPLETE thread_id=%d, seq_no=%"PRIu64", " - "ant_id=%"PRIu16", block_count=%"PRIu64", " - "temp_idx=%d, writeBlock=%d", thread_id, seq_no, ant_id, block_count, - temp_idx,writeBlock); - - // 
write block - // check whether doWrite has been released. If not, skip this block - if (doWrite==1) skipBlock=1; - else doWrite=1; - - uint64_t dropped = udpdb->packets_per_buffer - (block_count); - udpdb->packets->received += (block_count); - udpdb->bytes->received += (block_count) * UDP_DATA; - if (dropped) - { - udpdb->packets->dropped += dropped; - udpdb->bytes->dropped += (dropped * UDP_DATA); - } - - // increment counters - dsaX_udpdb_increment(udpdb); - - // write temp queue for this thread - //syslog(LOG_INFO,"thread %d: packets in this block %"PRIu64", temp_idx %d",thread_id,tpack,temp_idx); - tpack = 0; - - for (i=0; i < temp_idx; i++) - { - seq_byte = temp_seq_byte[i]; - byte_offset = seq_byte - (block_start_byte); - if (byte_offset < udpdb->hdu_bufsz && byte_offset >= 0) - { - mod_WB = writeBlock % 64; - memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, temp_buffers[i], UDP_DATA); - //pthread_mutex_lock(&mutex); - block_count++; - //pthread_mutex_unlock(&mutex); - } - } - temp_idx = 0; - - } - pthread_mutex_unlock(&mutex); - - // at this stage, can try and write temp queue safely for other threads - if (temp_seq_byte[0] >= block_start_byte && temp_seq_byte[0] <= block_end_byte && temp_idx > 0) - { - //syslog(LOG_INFO,"thread %d: packets in this block %"PRIu64", temp_idx %d",thread_id,tpack,temp_idx); - tpack = 0; - - for (i=0; i < temp_idx; i++) - { - seq_byte = temp_seq_byte[i]; - byte_offset = seq_byte - (block_start_byte); - if (byte_offset < udpdb->hdu_bufsz && byte_offset >= 0) - { - mod_WB = writeBlock % 64; - memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, temp_buffers[i], UDP_DATA); - pthread_mutex_lock(&mutex); - block_count++; - pthread_mutex_unlock(&mutex); - } - } - temp_idx = 0; - - } - - } - - // packet has been inserted or saved by this point - sock->have_packet = 0; - - } - - dsaX_free_sock(sock); - free(temp_buffers); - free(temp_seq_byte); - -} - -/* - * Thread to write data - */ -void write_thread(void * 
arg) { - - dsaX_write_t * udpdb = (dsaX_write_t *) arg; - int thread_id = udpdb->thread_id; - - // set affinity - const pthread_t pid = pthread_self(); - const int core_id = write_cores[thread_id]; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id); - const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id); - if (CPU_ISSET(core_id, &cpuset)) - syslog(LOG_INFO,"thread %d: successfully set thread",core_id); - - int mod_WB = 0; - int a; - - while (!quit_threads) - { - - while (!doWrite) { - a=1; - } - - // assume everything is set up - // wblock is assigned, write_ct=0 - - mod_WB = lWriteBlock % 64; - memcpy(wblock + thread_id*udpdb->hdu_bufsz/nwth, udpdb->tblock + mod_WB*udpdb->hdu_bufsz + thread_id*udpdb->hdu_bufsz/nwth, udpdb->hdu_bufsz/nwth); - - pthread_mutex_lock(&mutex); - write_ct++; - pthread_mutex_unlock(&mutex); - - //syslog(LOG_INFO,"write thread %d: successfully memcpied",thread_id); - - // now wait until thread 0 has finished getting a new block before moving on - if (thread_id>0) { - while (write_ct!=0) a=1; - } - else { - - // wait for all sub-blocks to be written - while (write_ct= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // OPEN CONNECTION TO DADA DB FOR WRITING - - if (DEBUG) syslog(LOG_DEBUG,"Creating HDU"); - - hdu_out = dada_hdu_create (); - if (DEBUG) syslog(DEBUG,"Created hdu"); - dada_hdu_set_key (hdu_out, CAPTURE_BLOCK_KEY); - if (dada_hdu_connect (hdu_out) < 0) { - syslog(LOG_ERR,"could not connect to output dada buffer"); - return EXIT_FAILURE; - } - if (DEBUG) syslog(LOG_DEBUG,"Connected HDU"); - if (dada_hdu_lock_write(hdu_out) < 0) { 
- dsaX_dbgpu_cleanup (hdu_out); - syslog(LOG_ERR,"could not lock to output dada buffer"); - return EXIT_FAILURE; - } - - syslog(LOG_INFO,"opened connection to output DB"); - - // DEAL WITH DADA HEADER - char *hout; - hout = (char *)malloc(sizeof(char)*4096); - if (DEBUG) syslog(DEBUG,"read header2"); - - if (fileread (dada_fnam, hout, 4096) < 0) - { - free (hout); - syslog (LOG_ERR, "could not read ASCII header from %s", dada_fnam); - return (EXIT_FAILURE); - } - - - if (DEBUG) syslog(DEBUG,"read header3"); - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_out); - return EXIT_FAILURE; - } - - // copy the in header to the out header - memcpy (header_out, hout, 4096); - - // mark the output header buffer as filled - if (ipcbuf_mark_filled (hdu_out->header_block, 4096) < 0) - { - syslog(LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_out); - return EXIT_FAILURE; - } - - // record STATE info - sprintf(STATE,"LISTEN"); - syslog(LOG_INFO,"marked output header block as filled - now in LISTEN state"); - - - /* time to start up receiver. 
- data are captured on iface:CAPTURE_PORT - */ - - // make recv, write, and stats structs - udpdb_t udpdb[nth]; - dsaX_stats_t stats; - dsaX_write_t writey[nwth]; - - // shared variables and memory - uint64_t bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - char * tblock = (char *)malloc(sizeof(char)*bufsz*64); - stats_t * packets = init_stats_t(); - stats_t * bytes = init_stats_t(); - reset_stats_t(packets); - reset_stats_t(bytes); - - // initialise stats struct - stats.packets = packets; - stats.bytes = bytes; - - // initialise writey struct and open buffer - for (int i=0;idata_block); - writey[i].block_open = 0; - writey[i].tblock = tblock; - writey[i].thread_id = i; - } - dsaX_udpdb_open_buffer (&writey[0]); - - // initialise all udpdb structs - for (int i=0;i -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture_pcap.h" -#include "dsaX_def.h" -#include "pcap.h" - -/* global variables */ -int quit_threads = 0; -char STATE[20]; -uint64_t UTC_START = 10000; -uint64_t UTC_STOP = 40000000000; -int MONITOR = 0; -char iP[100]; -int DEBUG = 0; -int HISTOGRAM[16]; -int cores[2] = {17,19}; -pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; -volatile int canWrite = 0; -volatile unsigned capture_started = 0; -volatile char * wblock; -volatile uint64_t last_seq; -const int nth = 1; -const int nwth = 1; -const int TEMP_MAXY = 1000; -volatile int skipped = 0; -const int NBLOCKS = 8; -volatile uint64_t writeBlock[8] = {0, 0, 0, 0, 0, 0, 0, 0}; -volatile int delayBlock = 0; -volatile int behindBlock = 0; - -void dsaX_dbgpu_cleanup (dada_hdu_t * out); -int dada_bind_thread_to_core (int core); -void usage(); 
- -void dsaX_dbgpu_cleanup (dada_hdu_t * out) -{ - - if (dada_hdu_unlock_write (out) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_out"); - } - dada_hdu_destroy (out); - -} - -void usage() -{ - fprintf (stdout, - "dsaX_capture [options]\n" - " -c core bind process to CPU core [no default]\n" - " -i IP to listen on for control commands [no default]\n" - " -f filename of template dada header [no default]\n" - " -o out_key [default CAPTURE_BLOCK_KEY]\n" - " -d send debug messages to syslog\n" - " -h print usage\n"); -} - -/* - * open a data block buffer ready for direct access - */ -int dsaX_udpdb_open_buffer (dsaX_t * ctx); -int dsaX_udpdb_open_buffer (dsaX_t * ctx) -{ - - if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_open_buffer()"); - - if (ctx->block_open) - { - syslog (LOG_ERR, "open_buffer: buffer already opened"); - return -1; - } - - if (DEBUG) syslog (LOG_DEBUG, "open_buffer: ipcio_open_block_write"); - - uint64_t block_id = 0; - - wblock = ipcio_open_block_write (ctx->hdu->data_block, &block_id); - if (!wblock) - { - syslog (LOG_ERR, "open_buffer: ipcio_open_block_write failed"); - return -1; - } - - ctx->block_open = 1; - - return 0; -} - -/* - * close a data buffer, assuming a full block has been written - */ -int dsaX_udpdb_close_buffer (dsaX_t * ctx, uint64_t bytes_written, unsigned eod); -int dsaX_udpdb_close_buffer (dsaX_t * ctx, uint64_t bytes_written, unsigned eod) -{ - - if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_close_buffer(%"PRIu64", %d)", bytes_written, eod); - - if (!ctx->block_open) - { - syslog (LOG_ERR, "close_buffer: buffer already closed"); - return -1; - } - - // log any buffers that are not full, except for the 1 byte "EOD" buffer - if ((bytes_written != 1) && (bytes_written != ctx->hdu_bufsz)) - syslog ((eod ? 
LOG_INFO : LOG_WARNING), "close_buffer: " - "bytes_written[%"PRIu64"] != hdu_bufsz[%"PRIu64"]", - bytes_written, ctx->hdu_bufsz); - - if (eod) - { - if (ipcio_update_block_write (ctx->hdu->data_block, bytes_written) < 0) - { - syslog (LOG_ERR, "close_buffer: ipcio_update_block_write failed"); - return -1; - } - } - else - { - if (ipcio_close_block_write (ctx->hdu->data_block, bytes_written) < 0) - { - syslog (LOG_ERR, "close_buffer: ipcio_close_block_write failed"); - return -1; - } - } - - wblock = 0; - ctx->block_open = 0; - - return 0; -} - -/* - * move to the next ring buffer element. return pointer to base address of new buffer - */ -int dsaX_udpdb_new_buffer (dsaX_t * ctx); -int dsaX_udpdb_new_buffer (dsaX_t * ctx) -{ - - if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_new_buffer()"); - - if (dsaX_udpdb_close_buffer (ctx, ctx->hdu_bufsz, 0) < 0) - { - syslog (LOG_ERR, "new_buffer: dsaX_udpdb_close_buffer failed"); - return -1; - } - - if (dsaX_udpdb_open_buffer (ctx) < 0) - { - syslog (LOG_ERR, "new_buffer: dsaX_udpdb_open_buffer failed"); - return -1; - } - - return 0; - -} - -// increment counters when block is full -void dsaX_udpdb_increment (dsaX_t * ctx); -void dsaX_udpdb_increment (dsaX_t * ctx) -{ - - // increment buffer byte markers - ctx->block_start_byte = ctx->block_end_byte + UDP_DATA; - ctx->block_end_byte = ctx->block_start_byte + ( ctx->packets_per_buffer - 1) * UDP_DATA; - ctx->block_count = 0; - -} - - - -/* --------- THREADS -------- */ - -// STATS THREAD - -/* - * Thread to print simple capture statistics - */ -void stats_thread(void * arg) { - - dsaX_stats_t * ctx = (dsaX_stats_t *) arg; - uint64_t b_rcv_total = 0; - uint64_t b_rcv_1sec = 0; - uint64_t b_rcv_curr = 0; - - uint64_t b_drp_total = 0; - uint64_t b_drp_1sec = 0; - uint64_t b_drp_curr = 0; - - uint64_t s_rcv_total = 0; - uint64_t s_rcv_1sec = 0; - uint64_t s_rcv_curr = 0; - - uint64_t ooo_pkts = 0; - float gb_rcv_ps = 0; - float mb_rcv_ps = 0; - float mb_drp_ps = 0; - - 
syslog(LOG_INFO,"starting stats thread..."); - sleep(2); - syslog(LOG_INFO,"started stats thread..."); - - while (!quit_threads) - { - - /* get a snapshot of the data as quickly as possible */ - b_rcv_curr = ctx->bytes->received; - b_drp_curr = ctx->bytes->dropped; - - /* calc the values for the last second */ - b_rcv_1sec = b_rcv_curr - b_rcv_total; - b_drp_1sec = b_drp_curr - b_drp_total; - - /* update the totals */ - b_rcv_total = b_rcv_curr; - b_drp_total = b_drp_curr; - - mb_rcv_ps = (double) b_rcv_1sec / 1000000; - mb_drp_ps = (double) b_drp_1sec / 1000000; - gb_rcv_ps = b_rcv_1sec * 8; - gb_rcv_ps /= 1000000000; - - /* determine how much memory is free in the receivers */ - syslog (LOG_NOTICE,"CAPSTATS %6.3f [Gb/s], D %4.1f [MB/s], D %"PRIu64" pkts, %"PRIu64" skipped %d %d", gb_rcv_ps, mb_drp_ps, ctx->packets->dropped, last_seq, behindBlock, skipped); - - sleep(1); - } - -} - -// CONTROL THREAD - -void control_thread (void * arg) { - - syslog(LOG_INFO, "control_thread: starting"); - - // port on which to listen for control commands - int port = CAPTURE_CONTROL_PORT; - char sport[10]; - sprintf(sport,"%d",port); - - // buffer for incoming command strings, and setup of socket - int bufsize = 1024; - char* buffer = (char *) malloc (sizeof(char) * bufsize); - memset(buffer, '\0', bufsize); - const char* whitespace = " "; - char * command = 0; - char * args = 0; - - struct addrinfo hints; - struct addrinfo* res=0; - memset(&hints,0,sizeof(hints)); - struct sockaddr_storage src_addr; - socklen_t src_addr_len=sizeof(src_addr); - hints.ai_family=AF_INET; - hints.ai_socktype=SOCK_DGRAM; - getaddrinfo(iP,sport,&hints,&res); - int fd; - ssize_t ct; - char tmpstr; - char cmpstr = 'p'; - char *endptr; - uint64_t tmps; - char * token; - - syslog(LOG_INFO, "control_thread: created socket on port %d", port); - - while (!quit_threads) { - - fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol); - bind(fd,res->ai_addr,res->ai_addrlen); - 
memset(buffer,'\0',sizeof(buffer)); - syslog(LOG_INFO, "control_thread: waiting for packet"); - ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len); - - syslog(LOG_INFO, "control_thread: received buffer string %s",buffer); - - // INTERPRET BUFFER STRING - // receive either UTC_START, UTC_STOP, MONITOR - - // interpret buffer string - char * rest = buffer; - char *cmd, *val; - cmd = strtok_r(rest, "-", &rest); - val = strtok_r(rest, "-", &rest); - syslog(LOG_INFO, "control_thread: split into COMMAND %s, VALUE %s",cmd,val); - - if (strcmp(cmd,"UTC_START")==0) - UTC_START = strtoull(val,&endptr,0); - - if (strcmp(cmd,"UTC_STOP")==0) - UTC_STOP = strtoull(val,&endptr,0); - - close(fd); - - } - - free (buffer); - - syslog(LOG_INFO, "control_thread: exiting"); - - /* return 0 */ - int thread_result = 0; - pthread_exit((void *) &thread_result); - -} - -/* -This is important - packet callback function to place packets in buffer -called upon single packet being received -*/ -void packet_callback(u_char *args, const struct pcap_pkthdr* header, const u_char* packet) { - - dsaX_t * udpdb = (dsaX_t *) args; - - // make sure packet has right length and get payload - if (header->len != UDP_PAYLOAD + 42) { - syslog(LOG_INFO,"received packet with length %d, total available %d",header->len,header->caplen); - return; - } - char *buf = (char *)(packet + 42); - - // process packet header - uint64_t seq_no=0, ant_id=0; - seq_no |= (((uint64_t)(buf[4]) & 224) >> 5) & 7; - seq_no |= (((uint64_t)(buf[3])) << 3) & 2040; - seq_no |= (((uint64_t)(buf[2])) << 11) & 522240; - seq_no |= (((uint64_t)(buf[1])) << 19) & 133693440; - seq_no |= (((uint64_t)(buf[0])) << 27) & 34225520640; - ant_id |= (unsigned char) (buf[6]) << 8; - ant_id |= (unsigned char) (buf[7]); - uint64_t act_seq_no = seq_no*NCHANG*NSNAPS/2 + ant_id*NCHANG/3; // actual seq no - uint64_t block_seq_no = UTC_START*NCHANG*NSNAPS/2; // seq no corresponding to ant 0 and start of block - last_seq = seq_no; - - // 
check for starting condition - if (canWrite==0) { - if (seq_no >= UTC_START-500 && UTC_START != 10000) { - canWrite=1; - } - } - if (canWrite == 0) return; - - // deal with start of capture - if (!(capture_started)) - { - udpdb->block_start_byte = block_seq_no * UDP_DATA; - udpdb->block_end_byte = (udpdb->block_start_byte + udpdb->hdu_bufsz) - UDP_DATA; - capture_started = 1; - syslog (LOG_INFO, "receive_obs: START [%"PRIu64" - %"PRIu64"]", udpdb->block_start_byte, udpdb->block_end_byte); - } - - // if capture has started, do good stuff - uint64_t byte_offset, seq_byte; - if (capture_started) { - - seq_byte = (act_seq_no * UDP_DATA); - - // packet belongs in this block - if ((seq_byte <= udpdb->block_end_byte) && (seq_byte >= udpdb->block_start_byte)) - { - byte_offset = seq_byte - (udpdb->block_start_byte); - memcpy(udpdb->tblock + udpdb->tblock_idx*NPACKETS_PER_BLOCK*NSNAPS*UDP_DATA + byte_offset, buf + UDP_HEADER, UDP_DATA); - //memcpy(wblock + byte_offset, buf + UDP_HEADER, UDP_DATA); - udpdb->block_count++; - } - // packet belongs in subsequent block - else if (seq_byte > udpdb->block_end_byte) - { - if (udpdb->temp_idx < TEMP_MAXY) - { - // save packet to temp buffer - memcpy (udpdb->temp_buffers + udpdb->temp_idx*UDP_DATA, buf + UDP_HEADER, UDP_DATA); - udpdb->temp_seq_byte[udpdb->temp_idx] = seq_byte; - udpdb->temp_idx++; - } - } - } - - // end of block - if ((udpdb->block_count >= udpdb->packets_per_buffer) || (udpdb->temp_idx >= TEMP_MAXY)) - { - syslog (LOG_INFO, "BLOCK COMPLETE seq_no=%"PRIu64", " - "ant_id=%"PRIu16", block_count=%"PRIu64", " - "temp_idx=%d", seq_no, ant_id, - udpdb->block_count, udpdb->temp_idx); - - // set write block on this block - if (writeBlock[udpdb->tblock_idx]==1) - skipped++; - writeBlock[udpdb->tblock_idx] = 1; - - // increment tblock_idx - udpdb->tblock_idx+=1; - if (udpdb->tblock_idx==NBLOCKS) - udpdb->tblock_idx = 0; - - // get delay_block - udpdb->nblocks_written++; - behindBlock = udpdb->nblocks_written - delayBlock; - - 
// deal with counters - uint64_t dropped = udpdb->packets_per_buffer - (udpdb->block_count); - udpdb->packets->received += (udpdb->block_count); - udpdb->bytes->received += (udpdb->block_count) * UDP_DATA; - if (dropped) - { - udpdb->packets->dropped += dropped; - udpdb->bytes->dropped += (dropped * UDP_DATA); - } - dsaX_udpdb_increment(udpdb); - - // write temp queue - for (int i=0; i < udpdb->temp_idx; i++) { - seq_byte = udpdb->temp_seq_byte[i]; - byte_offset = seq_byte - udpdb->block_start_byte; - if (byte_offset < udpdb->hdu_bufsz && byte_offset >= 0) { - memcpy(udpdb->tblock + udpdb->tblock_idx*NPACKETS_PER_BLOCK*NSNAPS*UDP_DATA + byte_offset, udpdb->temp_buffers + i*UDP_DATA, UDP_DATA); - udpdb->block_count++; - } - } - udpdb->temp_idx = 0; - - } - -} - -// Thread to do writing - -void write_thread(void * arg) { - - dsaX_t * udpdb = (dsaX_t *) arg; - int thread_id = 2; - - // set affinity - const pthread_t pid = pthread_self(); - const int core_id = cores[1]; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id); - const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id); - if (CPU_ISSET(core_id, &cpuset)) - syslog(LOG_INFO,"thread %d: successfully set thread",core_id); - - int a, lWriteBlock=0; - while (!quit_threads) { - - // busywait - while (writeBlock[lWriteBlock]==0) - a=1; - - // write block - memcpy(wblock, udpdb->tblock + lWriteBlock*UDP_DATA*NSNAPS*NPACKETS_PER_BLOCK, UDP_DATA*NSNAPS*NPACKETS_PER_BLOCK); - - // get new block - if (dsaX_udpdb_new_buffer (udpdb) < 0) - { - syslog(LOG_ERR, "receive_obs: dsaX_udpdb_new_buffer failed"); - return EXIT_FAILURE; - } - - // increment counters - writeBlock[lWriteBlock] = 0; - lWriteBlock++; - if (lWriteBlock==NBLOCKS) - 
lWriteBlock = 0; - delayBlock++; - - } -} - -/* -Thread to run pcap, passing to callback function -*/ - -void pcap_thread(void * arg) { - - dsaX_t * udpdb = (dsaX_t *) arg; - int thread_id = 1;//udpdb->thread_id; - - // set affinity - const pthread_t pid = pthread_self(); - const int core_id = cores[0]; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id); - const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id); - if (CPU_ISSET(core_id, &cpuset)) - syslog(LOG_INFO,"thread %d: successfully set thread",core_id); - - // set up pcap from port CAPTURE_PORT - char dev[] = "eth0"; - pcap_t *handle; - char error_buffer[PCAP_ERRBUF_SIZE]; - struct bpf_program filter; - char filter_exp[] = "port 4011"; - bpf_u_int32 subnet_mask, ip; - - if (pcap_lookupnet(dev, &ip, &subnet_mask, error_buffer) == -1) { - syslog(LOG_ERR,"Could not get information for device: %s", dev); - ip = 0; - subnet_mask = 0; - } - handle = pcap_open_live(dev, 4659, 0, 1, error_buffer); - if (handle == NULL) { - syslog(LOG_ERR,"Could not open %s - %s", dev, error_buffer); - return 2; - } - - if (pcap_compile(handle, &filter, filter_exp, 1, ip) == -1) { - syslog(LOG_ERR,"Bad filter - %s", pcap_geterr(handle)); - return 2; - } - if (pcap_setfilter(handle, &filter) == -1) { - syslog(LOG_ERR,"Error setting filter - %s\n", pcap_geterr(handle)); - return 2; - } - - /* if((pcap_set_buffer_size(handle, 2*1024*1024))!=0) - { - syslog(LOG_ERR, "Could not set buffer size"); - return 2; - }*/ - - - syslog(LOG_INFO,"thread %d: successfully set up pcap",thread_id); - - // start up RX! 
- while (!quit_threads) - pcap_loop(handle, 0, packet_callback, (u_char*)udpdb); - - // finish - pcap_close(handle); - -} - - - -// MAIN of program - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_capture_pcap", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - /* DADA Header plus Data Unit for writing */ - dada_hdu_t* hdu_out = 0; - - // input data block HDU key - key_t out_key = CAPTURE_BLOCK_KEY; - - // command line arguments - int core = -1; - int arg=0; - char dada_fnam[200]; // filename for dada header - - while ((arg=getopt(argc,argv,"c:i:f:o:dh")) != -1) - { - switch (arg) - { - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'i': - if (optarg) - { - strcpy(iP,optarg); - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'f': - if (optarg) - { - strcpy(dada_fnam,optarg); - break; - } - else - { - syslog(LOG_ERR,"-f flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'd': - DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - // START THREADS - - // start control thread - int rval = 0; - pthread_t control_thread_id; - dsaX_t temp_str; - rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &temp_str); - if (rval != 0) { - syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval)); - return -1; - } - syslog(LOG_NOTICE, "Created control thread, listening on 
%s:%d",iP,CAPTURE_CONTROL_PORT); - - // Bind to cpu core - if (core >= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // OPEN CONNECTION TO DADA DB FOR WRITING - - if (DEBUG) syslog(LOG_DEBUG,"Creating HDU"); - - hdu_out = dada_hdu_create (); - if (DEBUG) syslog(DEBUG,"Created hdu"); - dada_hdu_set_key (hdu_out, CAPTURE_BLOCK_KEY); - if (dada_hdu_connect (hdu_out) < 0) { - syslog(LOG_ERR,"could not connect to output dada buffer"); - return EXIT_FAILURE; - } - if (DEBUG) syslog(LOG_DEBUG,"Connected HDU"); - if (dada_hdu_lock_write(hdu_out) < 0) { - dsaX_dbgpu_cleanup (hdu_out); - syslog(LOG_ERR,"could not lock to output dada buffer"); - return EXIT_FAILURE; - } - - syslog(LOG_INFO,"opened connection to output DB"); - - // DEAL WITH DADA HEADER - char *hout; - hout = (char *)malloc(sizeof(char)*4096); - if (DEBUG) syslog(DEBUG,"read header2"); - - if (fileread (dada_fnam, hout, 4096) < 0) - { - free (hout); - syslog (LOG_ERR, "could not read ASCII header from %s", dada_fnam); - return (EXIT_FAILURE); - } - - - if (DEBUG) syslog(DEBUG,"read header3"); - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_out); - return EXIT_FAILURE; - } - - // copy the in header to the out header - memcpy (header_out, hout, 4096); - - // mark the output header buffer as filled - if (ipcbuf_mark_filled (hdu_out->header_block, 4096) < 0) - { - syslog(LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_out); - return EXIT_FAILURE; - } - - // record STATE info - sprintf(STATE,"LISTEN"); - syslog(LOG_INFO,"marked output header block as filled - now in LISTEN state"); - - - /* time to start up receiver. 
- */ - - // make recv, write, and stats structs - dsaX_t udpdb[nth]; - dsaX_stats_t stats; - - // shared variables and memory - uint64_t bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - stats_t * packets = init_stats_t(); - stats_t * bytes = init_stats_t(); - reset_stats_t(packets); - reset_stats_t(bytes); - char * tblock = (char *)malloc(sizeof(char)*NBLOCKS*(ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block))); - char * temp_buffers = (char *)malloc(sizeof(char)*TEMP_MAXY*UDP_DATA); - char * temp_seq_byte = (uint64_t *)malloc(sizeof(uint64_t)*TEMP_MAXY); - - // initialise stats struct - stats.packets = packets; - stats.bytes = bytes; - - for (int i=0;idata_block); - udpdb[i].block_open = 0; - udpdb[i].block_count = 0; - udpdb[i].tblock = tblock; - udpdb[i].tblock_idx = 0; - udpdb[i].temp_buffers = temp_buffers; - udpdb[i].temp_seq_byte = temp_seq_byte; - udpdb[i].temp_idx = 0; - udpdb[i].thread_id = 1; - udpdb[i].verbose = 0; - udpdb[i].packets_per_buffer = udpdb[i].hdu_bufsz / UDP_DATA; - udpdb[i].packets = packets; - udpdb[i].bytes = bytes; - udpdb[i].nblocks_written = 0; - - } - dsaX_udpdb_open_buffer (&udpdb[0]); - - /* start threads */ - - // start the stats thread - pthread_t stats_thread_id; - rval = pthread_create (&stats_thread_id, 0, (void *) stats_thread, (void *) &stats); - if (rval != 0) { - syslog(LOG_INFO, "Error creating stats_thread: %s", strerror(rval)); - return -1; - } - syslog(LOG_NOTICE, "started stats_thread()"); - - // start the receive threads - pthread_t recv_thread_id[nth]; - rval = 0; - for (int i=0;i -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" 
-#include "dsaX_def.h" - -/* global variables */ -int quit_threads = 0; -char STATE[20]; -uint64_t UTC_START = 10000; -uint64_t UTC_STOP = 40000000000; -int MONITOR = 0; -char iP[100]; -int DEBUG = 0; -int HISTOGRAM[16]; -int writeBlock = 0; -volatile int doWrite = 0; - -void dsaX_dbgpu_cleanup (dada_hdu_t * out); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * out) -{ - - if (dada_hdu_unlock_write (out) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_out"); - } - dada_hdu_destroy (out); - - - -} - -void usage() -{ - fprintf (stdout, - "dsaX_capture [options]\n" - " -c core bind process to CPU core [no default]\n" - " -j IP to listen on for data packets [no default]\n" - " -i IP to listen on for control commands [no default]\n" - " -f filename of template dada header [no default]\n" - " -o out_key [default CAPTURE_BLOCK_KEY]\n" - " -d send debug messages to syslog\n" - " -g chgroup [default 0]\n" - " -h print usage\n"); -} - -/* - * create a socket with the specified number of buffers - */ -dsaX_sock_t * dsaX_init_sock () -{ - dsaX_sock_t * b = (dsaX_sock_t *) malloc(sizeof(dsaX_sock_t)); - assert(b != NULL); - - b->bufsz = sizeof(char) * UDP_PAYLOAD; - - b->buf = (char *) malloc (b->bufsz); - assert(b->buf != NULL); - - b->have_packet = 0; - b->fd = 0; - - return b; -} - -void dsaX_free_sock(dsaX_sock_t* b) -{ - b->fd = 0; - b->bufsz = 0; - b->have_packet =0; - if (b->buf) - free (b->buf); - b->buf = 0; -} - -/* - * intialize UDP receiver resources - */ -int dsaX_udpdb_init_receiver (udpdb_t * ctx) -{ - syslog(LOG_INFO,"dsax_udpdb_init_receiver()"); - - // create a dsaX socket which can hold variable num of UDP packet - ctx->sock = dsaX_init_sock(); - - ctx->ooo_packets = 0; - ctx->recv_core = -1; - ctx->n_sleeps = 0; - ctx->mb_rcv_ps = 0; - ctx->mb_drp_ps = 0; - ctx->block_open = 0; - ctx->block_count = 0; - ctx->capture_started = 0; - ctx->last_seq = 0; - ctx->last_byte = 0; - ctx->block_start_byte = 0; - - // allocate 
required memory strucutres - ctx->packets = init_stats_t(); - ctx->bytes = init_stats_t(); - return 0; -} - -/* -prepare socket and writer -*/ - -int dsaX_udpdb_prepare (udpdb_t * ctx) -{ - syslog(LOG_INFO, "dsaX_udpdb_prepare()"); - - // open socket - syslog(LOG_INFO, "prepare: creating udp socket on %s:%d", ctx->interface, ctx->port); - ctx->sock->fd = dada_udp_sock_in(ctx->log, ctx->interface, ctx->port, ctx->verbose); - if (ctx->sock->fd < 0) { - syslog (LOG_ERR, "Error, Failed to create udp socket"); - return -1; - } - - - // set the socket size to 256 MB - int sock_buf_size = 256*1024*1024; - syslog(LOG_INFO, "prepare: setting buffer size to %d", sock_buf_size); - dada_udp_sock_set_buffer_size (ctx->log, ctx->sock->fd, ctx->verbose, sock_buf_size); - - // set the socket to non-blocking - syslog(LOG_INFO, "prepare: setting non_block"); - sock_nonblock(ctx->sock->fd); - - // clear any packets buffered by the kernel - syslog(LOG_INFO, "prepare: clearing packets at socket"); - size_t cleared = dada_sock_clear_buffered_packets(ctx->sock->fd, UDP_PAYLOAD); - - // setup the next_seq to the initial value - //ctx->last_seq = 0; - //ctx->last_byte = 0; - //ctx->n_sleeps = 0; - - return 0; -} - -/* - * reset receiver before an observation commences - */ -void dsaX_udpdb_reset_receiver (udpdb_t * ctx) -{ - syslog (LOG_INFO, "dsaX_udpdb_reset_receiver()"); - - ctx->capture_started = 0; - ctx->last_seq = 0; - ctx->last_byte = 0; - ctx->n_sleeps = 0; - - reset_stats_t(ctx->packets); - reset_stats_t(ctx->bytes); -} - -/* - * open a data block buffer ready for direct access - */ -int dsaX_udpdb_open_buffer (udpdb_t * ctx) -{ - - if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_open_buffer()"); - - if (ctx->block_open) - { - syslog (LOG_ERR, "open_buffer: buffer already opened"); - return -1; - } - - if (DEBUG) syslog (LOG_DEBUG, "open_buffer: ipcio_open_block_write"); - - uint64_t block_id = 0; - - ctx->block = ipcio_open_block_write (ctx->hdu->data_block, &block_id); - if 
(!ctx->block) - { - syslog (LOG_ERR, "open_buffer: ipcio_open_block_write failed"); - return -1; - } - - ctx->block_open = 1; - - return 0; -} - -/* - * close a data buffer, assuming a full block has been written - */ -int dsaX_udpdb_close_buffer (udpdb_t * ctx, uint64_t bytes_written, unsigned eod) -{ - - if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_close_buffer(%"PRIu64", %d)", bytes_written, eod); - - if (!ctx->block_open) - { - syslog (LOG_ERR, "close_buffer: buffer already closed"); - return -1; - } - - // log any buffers that are not full, except for the 1 byte "EOD" buffer - if ((bytes_written != 1) && (bytes_written != ctx->hdu_bufsz)) - syslog ((eod ? LOG_INFO : LOG_WARNING), "close_buffer: " - "bytes_written[%"PRIu64"] != hdu_bufsz[%"PRIu64"]", - bytes_written, ctx->hdu_bufsz); - - if (eod) - { - if (ipcio_update_block_write (ctx->hdu->data_block, bytes_written) < 0) - { - syslog (LOG_ERR, "close_buffer: ipcio_update_block_write failed"); - return -1; - } - } - else - { - if (ipcio_close_block_write (ctx->hdu->data_block, bytes_written) < 0) - { - syslog (LOG_ERR, "close_buffer: ipcio_close_block_write failed"); - return -1; - } - } - - ctx->block = 0; - ctx->block_open = 0; - - return 0; -} - -// increment counters when block is full -int dsaX_udpdb_increment (udpdb_t * ctx) -{ - - // increment buffer byte markers - ctx->block_start_byte = ctx->block_end_byte + UDP_DATA; - ctx->block_end_byte = ctx->block_start_byte + ( ctx->packets_per_buffer - 1) * UDP_DATA; - ctx->block_count = 0; - if (writeBlock==0) writeBlock=1; - else writeBlock=0; - -} - -/* - * move to the next ring buffer element. 
return pointer to base address of new buffer - */ -int dsaX_udpdb_new_buffer (udpdb_t * ctx) -{ - - if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_new_buffer()"); - - if (dsaX_udpdb_close_buffer (ctx, ctx->hdu_bufsz, 0) < 0) - { - syslog (LOG_ERR, "new_buffer: dsaX_udpdb_close_buffer failed"); - return -1; - } - - if (dsaX_udpdb_open_buffer (ctx) < 0) - { - syslog (LOG_ERR, "new_buffer: dsaX_udpdb_open_buffer failed"); - return -1; - } - - - // set block to 0 - //memset(ctx->block,0,ctx->block_end_byte-ctx->block_start_byte); - - if (DEBUG) syslog(LOG_DEBUG, "new_buffer: buffer_bytes [%"PRIu64" - %"PRIu64"]", - ctx->block_start_byte, ctx->block_end_byte); - - return 0; - -} - -/* - * destroy UDP receiver resources - */ -int dsaX_udpdb_destroy_receiver (udpdb_t * ctx) -{ - if (ctx->sock) - dsaX_free_sock(ctx->sock); - ctx->sock = 0; -} - -/* - * Close the udp socket and file - */ - -int udpdb_stop_function (udpdb_t* ctx) -{ - - syslog(LOG_INFO, "stop: dada_hdu_unlock_write()"); - if (dada_hdu_unlock_write (ctx->hdu) < 0) - { - syslog (LOG_ERR, "stop: could not unlock write on"); - return -1; - } - - // close the UDP socket - close(ctx->sock->fd); - - if (ctx->packets->dropped) - { - double percent = (double) ctx->bytes->dropped / (double) ctx->last_byte; - percent *= 100; - - syslog(LOG_INFO, "bytes dropped %"PRIu64" / %"PRIu64 " = %8.6f %", - ctx->bytes->dropped, ctx->last_byte, percent); - } - - return 0; -} - - - - -/* --------- THREADS -------- */ - -// STATS THREAD - -/* - * Thread to print simple capture statistics - */ -void stats_thread(void * arg) { - - /* // set affinity - const pthread_t pid = pthread_self(); - const int core_id = 4; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id); - const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if 
(get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id); - if (CPU_ISSET(core_id, &cpuset)) - syslog(LOG_INFO,"thread %d: successfully set thread",core_id); - */ - - udpdb_t * ctx = (udpdb_t *) arg; - uint64_t b_rcv_total = 0; - uint64_t b_rcv_1sec = 0; - uint64_t b_rcv_curr = 0; - - uint64_t b_drp_total = 0; - uint64_t b_drp_1sec = 0; - uint64_t b_drp_curr = 0; - - uint64_t s_rcv_total = 0; - uint64_t s_rcv_1sec = 0; - uint64_t s_rcv_curr = 0; - - uint64_t ooo_pkts = 0; - float gb_rcv_ps = 0; - float mb_rcv_ps = 0; - float mb_drp_ps = 0; - - syslog(LOG_INFO,"starting stats thread..."); - sleep(2); - syslog(LOG_INFO,"started stats thread..."); - - while (!quit_threads) - { - - /* get a snapshot of the data as quickly as possible */ - b_rcv_curr = ctx->bytes->received; - b_drp_curr = ctx->bytes->dropped; - s_rcv_curr = ctx->n_sleeps; - - /* calc the values for the last second */ - b_rcv_1sec = b_rcv_curr - b_rcv_total; - b_drp_1sec = b_drp_curr - b_drp_total; - s_rcv_1sec = s_rcv_curr - s_rcv_total; - - /* update the totals */ - b_rcv_total = b_rcv_curr; - b_drp_total = b_drp_curr; - s_rcv_total = s_rcv_curr; - - mb_rcv_ps = (double) b_rcv_1sec / 1000000; - mb_drp_ps = (double) b_drp_1sec / 1000000; - gb_rcv_ps = b_rcv_1sec * 8; - gb_rcv_ps /= 1000000000; - - /* determine how much memory is free in the receivers */ - syslog (LOG_NOTICE,"CAPSTATS %6.3f [Gb/s], D %4.1f [MB/s], D %"PRIu64" pkts, %"PRIu64"", gb_rcv_ps, mb_drp_ps, ctx->packets->dropped, ctx->last_seq); - - sleep(1); - } - -} - - - - - - - -// CONTROL THREAD - -void control_thread (void * arg) { - - udpdb_t * ctx = (udpdb_t *) arg; - syslog(LOG_INFO, "control_thread: starting"); - - // port on which to listen for control commands - int port = CAPTURE_CONTROL_PORT; - char sport[10]; - sprintf(sport,"%d",port); - - // buffer for incoming command strings, and setup of socket - int bufsize = 1024; - char* buffer = (char *) malloc (sizeof(char) * bufsize); - memset(buffer, '\0', 
bufsize); - const char* whitespace = " "; - char * command = 0; - char * args = 0; - - struct addrinfo hints; - struct addrinfo* res=0; - memset(&hints,0,sizeof(hints)); - struct sockaddr_storage src_addr; - socklen_t src_addr_len=sizeof(src_addr); - hints.ai_family=AF_INET; - hints.ai_socktype=SOCK_DGRAM; - getaddrinfo(iP,sport,&hints,&res); - int fd; - ssize_t ct; - char tmpstr; - char cmpstr = 'p'; - char *endptr; - uint64_t tmps; - char * token; - - syslog(LOG_INFO, "control_thread: created socket on port %d", port); - - while (!quit_threads) { - - fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol); - bind(fd,res->ai_addr,res->ai_addrlen); - memset(buffer,'\0',sizeof(buffer)); - syslog(LOG_INFO, "control_thread: waiting for packet"); - ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len); - - syslog(LOG_INFO, "control_thread: received buffer string %s",buffer); - - // INTERPRET BUFFER STRING - // receive either UTC_START, UTC_STOP, MONITOR - - // interpret buffer string - char * rest = buffer; - char *cmd, *val; - cmd = strtok_r(rest, "-", &rest); - val = strtok_r(rest, "-", &rest); - syslog(LOG_INFO, "control_thread: split into COMMAND %s, VALUE %s",cmd,val); - - if (strcmp(cmd,"UTC_START")==0) - UTC_START = strtoull(val,&endptr,0); - - if (strcmp(cmd,"UTC_STOP")==0) - UTC_STOP = strtoull(val,&endptr,0); - - close(fd); - - } - - free (buffer); - - syslog(LOG_INFO, "control_thread: exiting"); - - /* return 0 */ - int thread_result = 0; - pthread_exit((void *) &thread_result); - -} - - -/* - * Thread to capture data - */ -int recv_thread(void * arg) { - - // set affinity - const pthread_t pid = pthread_self(); - const int core_id = 34; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id); - const int get_affinity = pthread_getaffinity_np(pid, 
sizeof(cpu_set_t), &cpuset); - if (get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id); - if (CPU_ISSET(core_id, &cpuset)) - syslog(LOG_INFO,"thread %d: successfully set thread",core_id); - - - udpdb_t * udpdb = (udpdb_t *) arg; - - /* START WHAT WAS in RECV THREAD */ - - // DEFINITIONS - - uint64_t act_seq_no = 0; - uint64_t block_seq_no = 0; - uint64_t seq_no = 0; - uint64_t ch_id = 0; - uint64_t ant_id = 0; - unsigned char * b = (unsigned char *) udpdb->sock->buf; - size_t got = 0; // data received from a recv_from call - int errsv; // determine the sequence number boundaries for curr and next buffers - int64_t byte_offset = 0; // offset of current packet in bytes from start of block - uint64_t seq_byte = 0; // offset of current packet in bytes from start of obs - // for "saving" out of order packets near edges of blocks - unsigned int temp_idx = 0; - unsigned int temp_max = 1000; - char ** temp_buffers; //[temp_max][UDP_DATA]; - uint64_t * temp_seq_byte; - temp_buffers = (char **)malloc(sizeof(char *)*temp_max); - for (int i=0;isock->have_packet = 0; - - // incredibly tight loop to try and get a packet - while (!udpdb->sock->have_packet) - { - - // receive 1 packet into the socket buffer - got = recvfrom ( udpdb->sock->fd, udpdb->sock->buf, UDP_PAYLOAD, 0, NULL, NULL ); - - if (got == UDP_PAYLOAD) - { - udpdb->sock->have_packet = 1; - } - else if (got == -1) - { - errsv = errno; - if (errsv == EAGAIN) - { - udpdb->n_sleeps++; - if (udpdb->capture_started) - timeouts++; - if (timeouts > timeout_max) - syslog(LOG_INFO, "timeouts[%"PRIu64"] > timeout_max[%"PRIu64"]\n",timeouts, timeout_max); - } - else - { - syslog (LOG_ERR, "receive_obs: recvfrom failed %s", strerror(errsv)); - return EXIT_FAILURE; - } - } - else // we received a packet of the WRONG size, ignore it - { - syslog (LOG_NOTICE, "receive_obs: received %lu bytes, expected %d", got, UDP_PAYLOAD); - } - } - timeouts = 0; - - // we have a valid packet within the timeout - if 
(udpdb->sock->have_packet) - { - - // decode packet header (64 bits) - // 35 bits seq_no (for first spectrum in packet); 13 bits ch_id (for first channel in packet); 16 bits ant ID (for first antenna in packet) - seq_no = 0; - seq_no |= (((uint64_t)(udpdb->sock->buf[4]) & 224) >> 5) & 7; - seq_no |= (((uint64_t)(udpdb->sock->buf[3])) << 3) & 2040; - seq_no |= (((uint64_t)(udpdb->sock->buf[2])) << 11) & 522240; - seq_no |= (((uint64_t)(udpdb->sock->buf[1])) << 19) & 133693440; - seq_no |= (((uint64_t)(udpdb->sock->buf[0])) << 27) & 34225520640; - ant_id = 0; - ant_id |= (unsigned char) (udpdb->sock->buf[6]) << 8; - ant_id |= (unsigned char) (udpdb->sock->buf[7]); - - act_seq_no = seq_no*NCHANG*NSNAPS/2 + ant_id*NCHANG/3; // actual seq no - block_seq_no = UTC_START*NCHANG*NSNAPS/2; // seq no corresponding to ant 0 and start of block - - // check for starting or stopping condition, using continue - if (canWrite==0) { - if (seq_no >= UTC_START-50 && UTC_START != 10000) ct_snaps++; - if (ct_snaps >= 10) canWrite=1; - } - udpdb->last_seq = seq_no; - if (canWrite == 0) continue; - - // if first packet - if (!udpdb->capture_started) - { - udpdb->block_start_byte = block_seq_no * UDP_DATA; - udpdb->block_end_byte = (udpdb->block_start_byte + udpdb->hdu_bufsz) - UDP_DATA; - udpdb->capture_started = 1; - - syslog (LOG_INFO, "receive_obs: START [%"PRIu64" - %"PRIu64"]", udpdb->block_start_byte, udpdb->block_end_byte); - } - - // if capture running - if (udpdb->capture_started) - { - seq_byte = (act_seq_no * UDP_DATA); - - udpdb->last_byte = seq_byte; - - // if packet arrived too late, ignore - if (seq_byte < udpdb->block_start_byte) - { - udpdb->packets->dropped++; - udpdb->bytes->dropped += UDP_DATA; - } - else - { - // packet belongs in this block - if (seq_byte <= udpdb->block_end_byte) - { - byte_offset = seq_byte - udpdb->block_start_byte; - memcpy (udpdb->tblock + byte_offset + writeBlock*udpdb->hdu_bufsz, udpdb->sock->buf + UDP_HEADER, UDP_DATA); - 
udpdb->packets->received++; - udpdb->bytes->received += UDP_DATA; - udpdb->block_count++; - } - // packet belongs in subsequent block - else - { - - if (temp_idx < temp_max) - { - // save packet to temp buffer - memcpy (temp_buffers[temp_idx], udpdb->sock->buf + UDP_HEADER, UDP_DATA); - temp_seq_byte[temp_idx] = seq_byte; - temp_idx++; - } - else - { - udpdb->packets->dropped++; - udpdb->bytes->dropped += UDP_DATA; - } - } - } - } - - // now check for a full buffer or full temp queue - if ((udpdb->block_count >= udpdb->packets_per_buffer) || (temp_idx >= temp_max)) - { - syslog (LOG_INFO, "BLOCK COMPLETE seq_no=%"PRIu64", " - "ant_id=%"PRIu16", block_count=%"PRIu64", " - "temp_idx=%d\n", seq_no, ant_id, udpdb->block_count, - temp_idx); - - // write block - doWrite=1; - - uint64_t dropped = udpdb->packets_per_buffer - udpdb->block_count; - if (dropped) - { - udpdb->packets->dropped += dropped; - udpdb->bytes->dropped += (dropped * UDP_DATA); - } - - // increment counters - dsaX_udpdb_increment(udpdb); - - // write any temp packets saved - - if (DEBUG) syslog(LOG_INFO, "block bytes: %"PRIu64" - %"PRIu64"\n", udpdb->block_start_byte, udpdb->block_end_byte); - - // include any futuristic packets we saved - for (i=0; i < temp_idx; i++) - { - seq_byte = temp_seq_byte[i]; - byte_offset = seq_byte - udpdb->block_start_byte; - if (byte_offset < udpdb->hdu_bufsz) - { - memcpy (udpdb->tblock + byte_offset + writeBlock*udpdb->hdu_bufsz, temp_buffers[i], UDP_DATA); - udpdb->block_count++; - udpdb->packets->received++; - udpdb->bytes->received += UDP_DATA; - } - else - { - udpdb->packets->dropped++; - udpdb->bytes->dropped += UDP_DATA; - } - } - temp_idx = 0; - } - - } - - // packet has been inserted or saved by this point - udpdb->sock->have_packet = 0; - - - } - - - free(temp_buffers); - free(temp_seq_byte); - -} - -/* - * Thread to write data - */ -int write_thread(void * arg) { - - // set affinity - const pthread_t pid = pthread_self(); - const int core_id = 36; - cpu_set_t 
cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id); - const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id); - if (CPU_ISSET(core_id, &cpuset)) - syslog(LOG_INFO,"thread %d: successfully set thread",core_id); - - - udpdb_t * udpdb = (udpdb_t *) arg; - int lWriteBlock = 0; - int a; - - while (!quit_threads) - { - - while (!doWrite) { - a=1; - } - - syslog(LOG_INFO,"writing block..."); - - memcpy(udpdb->block, udpdb->tblock + lWriteBlock*udpdb->hdu_bufsz, udpdb->hdu_bufsz); - - if (dsaX_udpdb_new_buffer (udpdb) < 0) - { - syslog(LOG_ERR, "receive_obs: dsaX_udpdb_new_buffer failed"); - return EXIT_FAILURE; - } - - doWrite=0; - if (lWriteBlock==0) lWriteBlock=1; - else lWriteBlock=0; - - } - -} - - - -// MAIN of program - -int main (int argc, char *argv[]) { - - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_capture_thread", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - /* DADA Header plus Data Unit for writing */ - dada_hdu_t* hdu_out = 0; - - /* actual struct with info */ - udpdb_t udpdb; - - // input data block HDU key - key_t out_key = CAPTURE_BLOCK_KEY; - - // command line arguments - int core = -1; - int chgroup = 0; - int arg=0; - char dada_fnam[200]; // filename for dada header - char iface[100]; // IP for data packets - - while ((arg=getopt(argc,argv,"c:j:i:f:o:g:dh")) != -1) - { - switch (arg) - { - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'i': - if (optarg) - { - 
strcpy(iP,optarg); - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'g': - if (optarg) - { - chgroup = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-g flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'j': - if (optarg) - { - strcpy(iface,optarg); - break; - } - else - { - syslog(LOG_ERR,"-j flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'f': - if (optarg) - { - strcpy(dada_fnam,optarg); - break; - } - else - { - syslog(LOG_ERR,"-f flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'd': - DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - // record STATE info - sprintf(STATE,"NOBUFFER"); - - // START THREADS - - // start control thread - int rval = 0; - pthread_t control_thread_id, stats_thread_id; - if (DEBUG) - syslog (LOG_DEBUG, "Creating threads"); - rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb); - if (rval != 0) { - syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval)); - return -1; - } - syslog(LOG_NOTICE, "Created control thread, listening on %s:%d",iP,CAPTURE_CONTROL_PORT); - - // start the stats thread - rval = pthread_create (&stats_thread_id, 0, (void *) stats_thread, (void *) &udpdb); - if (rval != 0) { - syslog(LOG_INFO, "Error creating stats_thread: %s", strerror(rval)); - return -1; - } - syslog(LOG_NOTICE, "started stats_thread()"); - - - // Bind to cpu core - if (core >= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - // initialize the data structure - syslog (LOG_INFO, "main: dsaX_udpdb_init_receiver()"); 
- if (dsaX_udpdb_init_receiver (&udpdb) < 0) - { - syslog (LOG_ERR, "could not initialize receiver"); - return EXIT_FAILURE; - } - - - // OPEN CONNECTION TO DADA DB FOR WRITING - - if (DEBUG) syslog(LOG_DEBUG,"Creating HDU"); - - hdu_out = dada_hdu_create (0); - if (DEBUG) syslog(DEBUG,"Created hdu"); - dada_hdu_set_key (hdu_out, CAPTURE_BLOCK_KEY); - if (dada_hdu_connect (hdu_out) < 0) { - syslog(LOG_ERR,"could not connect to output dada buffer"); - return EXIT_FAILURE; - } - if (DEBUG) syslog(LOG_DEBUG,"Connected HDU"); - if (dada_hdu_lock_write(hdu_out) < 0) { - dsaX_dbgpu_cleanup (hdu_out); - syslog(LOG_ERR,"could not lock to output dada buffer"); - return EXIT_FAILURE; - } - - syslog(LOG_INFO,"opened connection to output DB"); - - // DEAL WITH DADA HEADER - char *hout; - hout = (char *)malloc(sizeof(char)*4096); - if (DEBUG) syslog(DEBUG,"read header2"); - - if (fileread (dada_fnam, hout, 4096) < 0) - { - free (hout); - syslog (LOG_ERR, "could not read ASCII header from %s", dada_fnam); - return (EXIT_FAILURE); - } - - - if (DEBUG) syslog(DEBUG,"read header3"); - - - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_out); - return EXIT_FAILURE; - } - - - - // copy the in header to the out header - memcpy (header_out, hout, 4096); - - // mark the output header buffer as filled - if (ipcbuf_mark_filled (hdu_out->header_block, 4096) < 0) - { - syslog(LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_out); - return EXIT_FAILURE; - } - - // record STATE info - sprintf(STATE,"LISTEN"); - syslog(LOG_INFO,"marked output header block as filled - now in LISTEN state"); - - - /* time to start up receiver. 
- data are captured on iface:CAPTURE_PORT - */ - - printf("here\n"); - - - // put information in udpdb struct - udpdb.hdu = hdu_out; - udpdb.port = CAPTURE_PORT; - udpdb.interface = strdup(iface); - udpdb.hdu_bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - char * tblock = (char *)malloc(sizeof(char)*udpdb.hdu_bufsz); - udpdb.tblock = tblock; - // determine number of packets per block, must - if (udpdb.hdu_bufsz % UDP_DATA != 0) - { - syslog(LOG_ERR, "data block size for [%"PRIu64"] was not a multiple of the UDP_DATA size [%d]\n", udpdb.hdu_bufsz, UDP_DATA); - return EXIT_FAILURE; - } - udpdb.packets_per_buffer = udpdb.hdu_bufsz / UDP_DATA; - udpdb.bytes_to_acquire = 0; - udpdb.num_inputs = NSNAPS; - - // prepare the socket - syslog(LOG_INFO, "main: dsaX_udpdb_prepare()"); - if (dsaX_udpdb_prepare (&udpdb) < 0) - { - syslog(LOG_ERR, "could allocate required resources (prepare)"); - return EXIT_FAILURE; - } - - // reset the receiver - syslog(LOG_INFO, "main: dsaX_udpdb_reset_receiver()"); - dsaX_udpdb_reset_receiver (&udpdb); - - // open a block of the data block, ready for writing - if (dsaX_udpdb_open_buffer (&udpdb) < 0) - { - syslog (LOG_ERR, "start: dsaX_udpdb_open_buffer failed"); - return -1; - } - - - // start threads - - // start recv thread - rval = 0; - pthread_t recv_thread_id, write_thread_id; - rval = pthread_create (&recv_thread_id, 0, (void *) recv_thread, (void *) &udpdb); - if (rval != 0) { - syslog(LOG_ERR, "Error creating recv_thread: %s", strerror(rval)); - return -1; - } - syslog(LOG_NOTICE, "Created recv thread"); - - // start the write thread - rval = pthread_create (&write_thread_id, 0, (void *) write_thread, (void *) &udpdb); - if (rval != 0) { - syslog(LOG_INFO, "Error creating write_thread: %s", strerror(rval)); - return -1; - } - syslog(LOG_NOTICE, "started write_thread()"); - - while (!quit_threads) { - sleep(1); - } - - // close threads - syslog(LOG_INFO, "joining all threads"); - quit_threads = 1; - void* result=0; - 
pthread_join (control_thread_id, &result); - pthread_join (stats_thread_id, &result); - pthread_join (recv_thread_id, &result); - pthread_join (write_thread_id, &result); - - free(tblock); - - dsaX_dbgpu_cleanup (hdu_out); - -} diff --git a/src/dsaX_copydb.c b/src/dsaX_copydb.c deleted file mode 100644 index 7714038..0000000 --- a/src/dsaX_copydb.c +++ /dev/null @@ -1,273 +0,0 @@ -/* will reorder raw data for input to xgpu */ -#define __USE_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - -// global variables -int DEBUG = 0; - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) -{ - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - - if (dada_hdu_unlock_write (out) < 0) - { - syslog(LOG_ERR, "could not unlock write on hdu_out"); - } - dada_hdu_destroy (out); - -} - -void usage() -{ - fprintf (stdout, - "dsaX_fake [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -i in_key [default TEST_BLOCK_KEY]\n" - " -o out_key [default REORDER_BLOCK_KEY2]\n" - " -h print usage\n"); -} - -// MAIN - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_copydb", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - dada_hdu_t* hdu_out = 0; - - 
// data block HDU keys - key_t in_key = TEST_BLOCK_KEY; - key_t out_key = REORDER_BLOCK_KEY2; - - // command line arguments - int core = -1; - int useZ = 1; - char fnam[100]; - int arg = 0; - - while ((arg=getopt(argc,argv,"c:f:i:o:dh")) != -1) - { - switch (arg) - { - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'i': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'd': - DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - // Bind to cpu core - if (core >= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (0); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return 
EXIT_FAILURE; - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); - uint64_t bytes_read = 0; - char * block; - uint64_t written, block_id; - - - // set up - int observation_complete=0; - int blocks = 0, started = 0; - - syslog(LOG_INFO, "starting observation"); - - while (!observation_complete) { - - // open block - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - - if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - } - - // DO STUFF - - written = ipcio_write (hdu_out->data_block, block, block_out); - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - if 
(DEBUG) { - syslog(LOG_DEBUG, "written block %d",blocks); - } - blocks++; - - - - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - -} - - diff --git a/src/dsaX_cuda_correlator.cu b/src/dsaX_cuda_correlator.cu deleted file mode 100644 index 3bebd09..0000000 --- a/src/dsaX_cuda_correlator.cu +++ /dev/null @@ -1,309 +0,0 @@ -// -*- c++ -*- -/* will run xgpu */ -/* assumes input block size is appropriate */ -#define THRUST_IGNORE_CUB_VERSION_CHECK - -#include -#include -using std::cout; -using std::cerr; -using std::endl; -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -//#include "dada_cuda.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "multilog.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_def.h" -//#include "cube/cube.h" -#include "xgpu.h" - - -#ifdef __MACH__ -#include -#define CLOCK_REALTIME 0 -#define CLOCK_MONOTONIC 0 -int clock_gettime(int clk_id, struct timespec *t){ - mach_timebase_info_data_t timebase; - mach_timebase_info(&timebase); - uint64_t time; - time = mach_absolute_time(); - double nseconds = ((double)time * (double)timebase.numer)/((double)timebase.denom); - double seconds = ((double)time * (double)timebase.numer)/((double)timebase.denom * 1e9); - t->tv_sec = seconds; - t->tv_nsec = nseconds; - return 0; -} -#else -#include -#endif - -/* - Data ordering for input vectors is (running from slowest to fastest) - [time][channel][station][polarization][complexity] - - Output matrix has ordering - [channel][station][station][polarization][polarization][complexity] -*/ - -int main(int argc, char** argv) { - - int opt; - int i, j; - int device = 0; - unsigned int seed = 1; - int outer_count = 1; - 
int count = 1; - int syncOp = SYNCOP_SYNC_TRANSFER; - int finalSyncOp = SYNCOP_DUMP; - int verbose = 0; - int hostAlloc = 0; - XGPUInfo xgpu_info; - unsigned int npol, nstation, nfrequency; - int xgpu_error = 0; - Complex *omp_matrix_h = NULL; - struct timespec outer_start, start, stop, outer_stop; - double total, per_call, max_bw, gbps; -#ifdef RUNTIME_STATS - struct timespec tic, toc; -#endif - - while ((opt = getopt(argc, argv, "C:c:d:f:ho:rs:v:")) != -1) { - switch (opt) { - case 'c': - // Set number of time to call xgpuCudaXengine - count = strtoul(optarg, NULL, 0); - if(count < 1) { - fprintf(stderr, "count must be positive\n"); - return 1; - } - break; - case 'C': - // Set number of time to call xgpuCudaXengine - outer_count = strtoul(optarg, NULL, 0); - if(outer_count < 1) { - fprintf(stderr, "outer count must be positive\n"); - return 1; - } - break; - case 'd': - // Set CUDA device number - device = strtoul(optarg, NULL, 0); - break; - case 'f': - // Set syncOp for final call - finalSyncOp = strtoul(optarg, NULL, 0); - break; - case 'o': - // Set syncOp - syncOp = strtoul(optarg, NULL, 0); - break; - case 'r': - // Register host allocated memory - hostAlloc = 1; - break; - case 's': - // Set seed for random data - seed = strtoul(optarg, NULL, 0); - break; - case 'v': - // Set verbosity level - verbose = strtoul(optarg, NULL, 0); - break; - default: /* '?' 
*/ - fprintf(stderr, - "Usage: %s [options]\n" - "Options:\n" - " -c INTEG_CALLS Calls to xgpuCudaXengine per integration [1]\n" - " -C INTEG_COUNT Number of integrations [1]\n" - " -d DEVNUM GPU device to use [0]\n" - " -f FINAL_SYNCOP Sync operation for final call [1]\n" - " -o SYNCOP Sync operation for all but final call [1]\n" - " Sync operation values are:\n" - " 0 (no sync)\n" - " 1 (sync and dump)\n" - " 2 (sync host to device transfer)\n" - " 3 (sync kernel computations)\n" - " -r Register host allocated memory [false]\n" - " (otherwise use CUDA allocated memory)\n" - " -s SEED Random number seed [1]\n" - " -v {0|1|2|3} Verbosity level (debug only) [0]\n" - " -h Show this message\n", - argv[0]); - exit(EXIT_FAILURE); - } - } - - srand(seed); - - // Get sizing info from library - xgpuInfo(&xgpu_info); - npol = xgpu_info.npol; - nstation = xgpu_info.nstation; - nfrequency = xgpu_info.nfrequency; - - printf("Correlating %u stations with %u channels and integration length %u\n", - xgpu_info.nstation, xgpu_info.nfrequency, xgpu_info.ntime); -#ifndef FIXED_POINT - printf("Sending floating point data to GPU.\n"); -#else - printf("Sending fixed point data to GPU.\n"); -#endif - - // perform host memory allocation - - // allocate the GPU X-engine memory - XGPUContext context; - context.array_len = xgpu_info.vecLength; - context.matrix_len = xgpu_info.matLength; - context.array_h = NULL; - context.matrix_h = NULL; - - xgpu_error = xgpuInit(&context, device); - - ComplexInput *array_h = context.array_h; // this is pinned memory - Complex *cuda_matrix_h = context.matrix_h; - - // create an array of complex noise - xgpuRandomComplex(array_h, xgpu_info.vecLength); - - xgpuSwizzleInput(context.array_h, array_h); - - // try copying to GPU - ComplexInput *array_hd; - cudaMalloc((void **)&array_hd, context.array_len*sizeof(ComplexInput)); - cudaMemcpy(array_hd,context.array_h,context.array_len*sizeof(ComplexInput),cudaMemcpyHostToDevice); - - // ompXengine always uses 
TRIANGULAR_ORDER - unsigned int ompMatLength = nfrequency * ((nstation+1)*(nstation/2)*npol*npol); - omp_matrix_h = (Complex *) malloc(ompMatLength*sizeof(Complex)); - if(!omp_matrix_h) { - fprintf(stderr, "error allocating output buffer for xgpuOmpXengine\n"); - goto cleanup; - } - -#if (CUBE_MODE == CUBE_DEFAULT && !defined(POWER_LOOP) ) - // Only call CPU X engine if dumping GPU X engine exactly once - if(finalSyncOp == SYNCOP_DUMP && count*outer_count == 1) { - printf("Calling CPU X-Engine\n"); - xgpuOmpXengine(omp_matrix_h, array_h); - } -#endif - -#define ELAPSED_MS(start,stop) \ - ((((int64_t)stop.tv_sec-start.tv_sec)*1000*1000*1000+(stop.tv_nsec-start.tv_nsec))/1e6) - - printf("Calling GPU X-Engine\n"); - clock_gettime(CLOCK_MONOTONIC, &outer_start); - for(j=0; j 1) { - clock_gettime(CLOCK_MONOTONIC, &outer_stop); - total = ELAPSED_MS(outer_start,outer_stop); - per_call = total/(count*outer_count); - // per_spectrum = per_call / NTIME - // per_channel = per_spectrum / NFREQUENCY - // = per_call / (NTIME * NFREQUENCY) - // max_bw (kHz) = 1 / per_channel = (NTIME * NFREQUENCY) / per_call - max_bw = xgpu_info.ntime*xgpu_info.nfrequency/per_call/1000; // MHz - gbps = ((float)(8 * context.array_len * sizeof(ComplexInput) * count * outer_count)) / total / 1e6; // Gbps - printf("Elapsed time %.6f ms total, %.6f ms/call average\n", - total, per_call); - printf("Theoretical BW_max %.3f MHz, throughput %.3f Gbps\n", - max_bw, gbps); - } - -#if (CUBE_MODE == CUBE_DEFAULT) - - // Only compare CPU and GPU X engines if dumping GPU X engine exactly once - if(finalSyncOp == SYNCOP_DUMP && count*outer_count == 1) { - xgpuReorderMatrix(cuda_matrix_h); - xgpuCheckResult(cuda_matrix_h, omp_matrix_h, verbose, array_h); - } - -#if 0 - int fullMatLength = nfrequency * nstation*nstation*npol*npol; - Complex *full_matrix_h = (Complex *) malloc(fullMatLength*sizeof(Complex)); - - // convert from packed triangular to full matrix - xgpuExtractMatrix(full_matrix_h, cuda_matrix_h); - - 
free(full_matrix_h); -#endif -#endif - -cleanup: - //free host memory - free(omp_matrix_h); - - // free gpu memory - xgpuFree(&context); - cudaFree(array_hd); - -#ifdef DP4A - free(array_h); -#endif - - /* if(hostAlloc) { - free(context.array_h); - free(context.matrix_h); - }*/ - - return xgpu_error; -} diff --git a/src/dsaX_dbnic.c b/src/dsaX_dbnic.c deleted file mode 100644 index 83e3e4a..0000000 --- a/src/dsaX_dbnic.c +++ /dev/null @@ -1,435 +0,0 @@ -/* simple nicdb - -will work on NBMS/NBEAMS_PER_BLOCK writers, ip addresses set in code for now - -*/ -#define __USE_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - - -// data to pass to threads -struct data { - char * out; - int sockfd; - struct sockaddr_in si_other; - int thread_id; - int chgroup; - int tseq; -}; - -/* global variables */ -int DEBUG = 0; -int TEST = 0; - -void dsaX_dbgpu_cleanup (dada_hdu_t * in); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * in) -{ - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - -} - -void usage() -{ - fprintf (stdout, - "dsaX_dbnic [options]\n" - " -c core bind process to CPU core [no default]\n" - " -g chgroup [default 0]\n" - " -d send debug messages to syslog\n" - " -t TEST\n" - " -i in_key [default BF_BLOCK_KEY]\n" - " -w -x -y -z four ip addresses for corner turn\n" - " -h print usage\n"); -} - -/* thread for data transmission */ -void * transmit(void *args) { - - // basic stuff - struct data *d = args; - int thread_id = d->thread_id; - 
int sockfd = d->sockfd; - struct sockaddr_in si_other = d->si_other; - char * output = (char *)(d->out); - int chgroup = d->chgroup; - int tseq = d->tseq; - char * packet = (char *)malloc(sizeof(char)*P_SIZE); - int * ipacket = (int *)(packet); - - - // for test packet - if (tseq==-1) { - - ipacket[0] = chgroup; - sendto(sockfd,packet,P_SIZE,0,(struct sockaddr *)&si_other,sizeof(si_other)); - - } - else { - - // fill op, doing transpose - char * op = (char *)malloc(sizeof(char)*(NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW)); - //iop[0] = chgroup; - //iop[1] = tseq; - for (int i=0;i= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in); - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in); - return EXIT_FAILURE; - } - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %lu",block_size); - uint64_t bytes_read = 0; - char *block; - uint64_t written, block_id; - - - // set up - int observation_complete=0; - int blocks = 0; - int 
started = 0; - int nthreads = NBMS / NBEAMS_PER_BLOCK; - - - // create socket connections - int sockfd[nthreads]; - struct sockaddr_in servaddr[nthreads]; - - for (int i=0;idata_block, &bytes_read, &block_id); - - if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - } - - // DO STUFF - - //if (DEBUG) syslog(LOG_DEBUG,"creating %d threads",nthreads); - - // put together args - for (int i=0; idata_block, bytes_read); - - } - - for (int i=0;i -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - - -// data to pass to threads -struct data { - char * out; - int sockfd; - int thread_id; - int chgroup; - int tseq; -}; - -/* global variables */ -int DEBUG = 0; -int TEST = 0; - -void dsaX_dbgpu_cleanup (dada_hdu_t * in); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * in) -{ - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - -} - -void usage() -{ - fprintf (stdout, - "dsaX_dbnic [options]\n" - " -c core bind process to CPU core [no default]\n" - " -g chgroup [default 0]\n" - " -d send debug messages to syslog\n" - " -t TEST\n" - " -i in_key [default BF_BLOCK_KEY]\n" - " -w -x -y -z four ip addresses for corner turn\n" - " -h print usage\n"); -} - -/* thread for data transmission */ -void * transmit(void *args) { - - // basic stuff - struct data *d = args; - int thread_id = d->thread_id; - int sockfd = d->sockfd; - char * output = (char *)(d->out); - char * op = (char *)malloc(sizeof(char)*(8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW)); - int * iop = (int *)(op); - 
int chgroup = d->chgroup; - int tseq = d->tseq; - - // fill op, doing transpose - iop[0] = chgroup; - iop[1] = tseq; - for (int i=0;i0) && (remain_data > 0)) { - remain_data -= sbytes; - sent_bytes += sbytes; - }*/ - sbytes = send(sockfd, op, remain_data, 0); - if (sbytes= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in); - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in); - return EXIT_FAILURE; - } - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu",block_size); - uint64_t bytes_read = 0; - char *block; - uint64_t written, block_id; - - - // set up - int observation_complete=0; - int blocks = 0; - int started = 0; - int nthreads = NBMS / NBEAMS_PER_BLOCK; - - - // create socket connections - int sockfd[nthreads]; - struct sockaddr_in servaddr; - for (int i=0;idata_block, &bytes_read, &block_id); - - if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - } - - // DO STUFF - - //if (DEBUG) 
syslog(LOG_DEBUG,"creating %d threads",nthreads); - - // put together args - for (int i=0; idata_block, bytes_read); - - } - - for (int i=0;i -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - -// global variables -int DEBUG = 0; - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) -{ - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - - if (dada_hdu_unlock_write (out) < 0) - { - syslog(LOG_ERR, "could not unlock write on hdu_out"); - } - dada_hdu_destroy (out); - -} - -void usage() -{ - fprintf (stdout, - "dsaX_fake [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -f file to read packet from [default none]\n" - " -i in_key [default TEST_BLOCK_KEY]\n" - " -o out_key [default REORDER_BLOCK_KEY2]\n" - " -h print usage\n"); -} - -// MAIN - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_fake", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - dada_hdu_t* hdu_out = 0; - - // data block HDU keys - key_t in_key = TEST_BLOCK_KEY; - key_t out_key = REORDER_BLOCK_KEY2; - - // command line arguments - int core = -1; - int useZ = 1; - char fnam[100]; - int arg = 0; - - while ((arg=getopt(argc,argv,"c:f:i:o:dh")) != -1) - { - switch (arg) - { - case 'c': - if 
(optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'i': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'f': - if (optarg) - { - useZ = 0; - strcpy(fnam,optarg); - break; - } - else - { - syslog(LOG_ERR,"-f flag requires argument"); - usage(); - return EXIT_FAILURE; - } - - case 'd': - DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - // Bind to cpu core - if (core >= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (0); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = 
ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); - uint64_t bytes_read = 0; - uint64_t npackets = block_out / 4608; - char * block, * output_buffer; - char * packet; - packet = (char *)malloc(sizeof(char)*4608); - output_buffer = (char *)malloc(sizeof(char)*block_out); - memset(output_buffer,0,block_out); - uint64_t written, block_id; - - // fill output buffer if file exists - FILE *fin; - if (!useZ) { - - if (!(fin=fopen(fnam,"rb"))) { - syslog(LOG_ERR, "cannot open file - will write zeros"); - } - else { - - fread(packet,4608,1,fin); - fclose(fin); - - syslog(LOG_INFO,"Read packet, npackets %lu",npackets); - - for (int i=0;idata_block, &bytes_read, &block_id); - - if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - } - - // DO STUFF - // no need to do anything here - 
output_buffer is ready to go - - // write to output - written = ipcio_write (hdu_out->data_block, output_buffer, block_out); - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - if (DEBUG) { - syslog(LOG_DEBUG, "written block %d",blocks); - } - blocks++; - - - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - free(packet); - free(output_buffer); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - -} - - diff --git a/src/dsaX_filTrigger.c b/src/dsaX_filTrigger.c deleted file mode 100644 index 55f95fd..0000000 --- a/src/dsaX_filTrigger.c +++ /dev/null @@ -1,559 +0,0 @@ -/* Code to read from a single dada buffer, and write to disk upon receiving -a trigger. Uses pthread threads and shared memory to listen. -Sequence of events: - - starts null-reading dump buffer, while listening for socket command - + for N second dump, assume N-second dada blocks - - receives time-since-start, which is converted into a block_start, byte_start, and block_end and byte_end. Sets dump pending, during which time no commands can be accepted. - - Upon seeing dump_pending, read code copies data to output dada buffer, which is plugged into dbdisk. Unsets dump_pending. 
-*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "dsaX_capture.h" -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_def.h" - -/* global variables */ -int quit_threads = 0; -int dump_pending = 0; -uint64_t specnum = 0; -uint64_t next_specnum = 0; -uint64_t procnum = 0; -int trignum = 0; -int dumpnum = 0; -char iP[100]; -char footer_buf[1024]; -char next_footer_buf[1024]; -int DEBUG = 0; - -void dsaX_dbgpu_cleanup (dada_hdu_t * in); -int dada_bind_thread_to_core (int core); - -FILE *output; - -void send_string(char *string) /* includefile */ -{ - int len; - len=strlen(string); - fwrite(&len, sizeof(int), 1, output); - fwrite(string, sizeof(char), len, output); -} - -void send_float(char *name,float floating_point) /* includefile */ -{ - send_string(name); - fwrite(&floating_point,sizeof(float),1,output); -} - -void send_double (char *name, double double_precision) /* includefile */ -{ - send_string(name); - fwrite(&double_precision,sizeof(double),1,output); -} - -void send_int(char *name, int integer) /* includefile */ -{ - send_string(name); - fwrite(&integer,sizeof(int),1,output); -} - -void send_char(char *name, char integer) /* includefile */ -{ - send_string(name); - fwrite(&integer,sizeof(char),1,output); -} - - -void send_long(char *name, long integer) /* includefile */ -{ - send_string(name); - fwrite(&integer,sizeof(long),1,output); -} - -void send_coords(double raj, double dej, double az, double za) /*includefile*/ -{ - if ((raj != 0.0) || (raj != -1.0)) send_double("src_raj",raj); - if ((dej != 0.0) || (dej != -1.0)) send_double("src_dej",dej); - if ((az != 0.0) || (az != -1.0)) send_double("az_start",az); - if ((za != 0.0) || (za != 
-1.0)) send_double("za_start",za); -} - -void dsaX_dbgpu_cleanup (dada_hdu_t * in) -{ - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - -} - -void usage() -{ - fprintf (stdout, - "dsaX_filTrigger [options]\n" - " -c core bind process to CPU core\n" - " -i IP to listen to [no default]\n" - " -j in_key [default eaea]\n" - " -d debug\n" - " -n output file name base [no default]\n" - " -b beam number of first beam [default 0]\n" - " -z respond to zero specnum\n" - " -h print usage\n"); -} - - -// Thread to control the dumping of data - -void control_thread (void * arg) { - - udpdb_t * ctx = (udpdb_t *) arg; - syslog(LOG_INFO, "control_thread: starting"); - - // port on which to listen for control commands - int port = ctx->control_port; - - // buffer for incoming command strings, and setup of socket - int bufsize = 1024; - char* buffer = (char *) malloc (sizeof(char) * bufsize); - char* tbuf = (char *) malloc (sizeof(char) * bufsize); - memset(buffer, '\0', bufsize); - const char* whitespace = " "; - char * command = 0; - char * args = 0; - - struct addrinfo hints; - struct addrinfo* res=0; - memset(&hints,0,sizeof(hints)); - struct sockaddr_storage src_addr; - socklen_t src_addr_len=sizeof(src_addr); - hints.ai_family=AF_INET; - hints.ai_socktype=SOCK_DGRAM; - getaddrinfo(iP,"11227",&hints,&res); - int fd; - ssize_t ct; - char tmpstr; - char cmpstr = 'p'; - char *endptr; - uint64_t tmps; - char * token; - - syslog(LOG_INFO, "control_thread: created socket on port %d", port); - - while (!quit_threads) { - - fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol); - bind(fd,res->ai_addr,res->ai_addrlen); - memset(buffer,'\0',sizeof(buffer)); - syslog(LOG_INFO, "control_thread: waiting for packet"); - ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len); - - syslog(LOG_INFO, "control_thread: received buffer string %s",buffer); - strcpy(tbuf,buffer); - trignum++; - - 
// interpret buffer string - char * rest = buffer; - char tnam[100]; - tmps = (uint64_t)(strtoull(strtok_r(rest, "-", &rest),&endptr,0)); - strcpy(tnam,strtok_r(rest, "-", &rest)); - - if (!dump_pending) { - //specnum = (uint64_t)(strtoull(buffer,&endptr,0)*16); - specnum = tmps/4; - strcpy(footer_buf,tnam); - syslog(LOG_INFO, "control_thread: received command to dump at %lu src %s",specnum,footer_buf); - } - - if (dump_pending) { - syslog(LOG_ERR, "control_thread: BACKED UP - using %lu src %s as next specnum",tmps,tnam); - next_specnum = tmps/4; - strcpy(next_footer_buf,tnam); - } - - if (!dump_pending) dump_pending = 1; - - close(fd); - - } - - free (buffer); - free (tbuf); - - if (ctx->verbose) - syslog(LOG_INFO, "control_thread: exiting"); - - /* return 0 */ - int thread_result = 0; - pthread_exit((void *) &thread_result); - -} - - - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_filTrigger", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - - /* port for control commands */ - int control_port = TRIGGER_CONTROL_PORT; - - /* actual struct with info */ - udpdb_t udpdb; - - // input data block HDU key - key_t in_key = 0x0000eaea; - - // command line arguments - int core = -1; - int beamn = 0; - char of[200]; - char foutnam[300]; - char dirnam[300]; - int rz=0; - int arg=0; - - while ((arg=getopt(argc,argv,"i:c:j:db:n:hz")) != -1) - { - switch (arg) - { - case 'i': - strcpy(iP,optarg); - break; - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog (LOG_ERR,"ERROR: -c flag requires argument\n"); - return EXIT_FAILURE; - } - case 'b': - if (optarg) - { - beamn = atoi(optarg); - break; - } - else - { - syslog (LOG_ERR,"ERROR: -b flag requires argument\n"); - return EXIT_FAILURE; - } - case 'n': - if (optarg) - { - strcpy(of,optarg); - break; - } - else - 
{ - syslog (LOG_ERR,"ERROR: -n flag requires argument\n"); - return EXIT_FAILURE; - } - case 'd': - DEBUG=1; - syslog (LOG_INFO, "Will excrete all debug messages"); - break; - case 'z': - rz=1; - syslog (LOG_INFO, "Will respond to zero trigger"); - break; - case 'j': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-j flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - // DADA stuff - - udpdb.verbose = DEBUG; - udpdb.control_port = control_port; - - // start control thread - int rval = 0; - pthread_t control_thread_id; - syslog(LOG_INFO, "starting control_thread()"); - rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb); - if (rval != 0) { - syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval)); - return -1; - } - - - syslog (LOG_INFO, "creating hdus"); - - // open connection to the in/read DBs - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer"); - return EXIT_FAILURE; - } - - // Bind to cpu core - if (core >= 0) - { - syslog(LOG_INFO,"binding to core %d", core); - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - } - - int observation_complete=0; - - // more DADA stuff - deal with headers - - uint64_t header_size = 0; - - // read the header from the input HDU - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "main: could not read next header"); - dsaX_dbgpu_cleanup (hdu_in); - return EXIT_FAILURE; - } - - // mark the input header as cleared - if 
(ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared [input]"); - dsaX_dbgpu_cleanup (hdu_in); - return EXIT_FAILURE; - } - - - // stuff for writing data - /* - Data will have [64 beam, time, freq] for each block. - Need to extract - */ - - - - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - unsigned char * extData = (unsigned char *)malloc(sizeof(unsigned char)*NSAMPS_PER_BLOCK*NCHAN_FIL*NBEAMS_PER_BLOCK); - uint64_t specs_per_block = NSAMPS_PER_BLOCK; - uint64_t current_specnum = 0; // updates with each dada block read - uint64_t start_byte, bytes_to_copy, bytes_copied=0; - char * in_data; - uint64_t written=0; - uint64_t block_id, bytes_read=0; - int dumping = 0; - FILE *ofile; - ofile = fopen("/home/ubuntu/data/dumps.dat","a"); - fprintf(ofile,"starting...\n"); - fclose(ofile); - - - // main reading loop - float pc_full = 0.; - - syslog(LOG_INFO, "main: starting observation"); - - while (!observation_complete) { - - // read a DADA block - in_data = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - - // add delay - // only proceed if input data block is 80% full - while (pc_full < 0.8) { - pc_full = ipcio_percent_full(hdu_in->data_block); - usleep(100); - } - pc_full = 0.; - - - // check for dump_pending - if (dump_pending) { - - // look after hand trigger - if (specnum==0 && rz==1) { - - specnum = current_specnum + 40000; - - } - - // if this is the first block to dump - if (specnum > current_specnum && specnum < current_specnum+specs_per_block) { - - dumping = 1; - syslog(LOG_INFO,"dumping is 1 -- first block"); - - // loop over beams - bytes_to_copy = (NSAMPS_PER_BLOCK-(specnum-current_specnum))*NCHAN_FIL; - bytes_copied = bytes_to_copy; - for (int i=0;i current_specnum && specnum + NSAMPS_PER_BLOCK <= current_specnum + specs_per_block && dumping==1) { - - syslog(LOG_INFO,"in second block"); - - // loop over beams - bytes_to_copy = 
NSAMPS_PER_BLOCK*NCHAN_FIL-bytes_copied; - for (int i=0;idata_block, bytes_read); - - - } - - - // close control thread - syslog(LOG_INFO, "joining control_thread"); - quit_threads = 1; - void* result=0; - pthread_join (control_thread_id, &result); - - free(extData); - dsaX_dbgpu_cleanup (hdu_in); - -} diff --git a/src/dsaX_fluff.c b/src/dsaX_fluff.c deleted file mode 100644 index 3e3f2d1..0000000 --- a/src/dsaX_fluff.c +++ /dev/null @@ -1,415 +0,0 @@ -/* will reorder raw data for input to xgpu */ -#define __USE_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - -#include -#include -#include - -// data to pass to threads -struct data { - char * in; - char * out; - int n_threads; - int thread_id; - int debug; -}; - -/* global variables */ -int DEBUG = 0; -int cores[8] = {22, 23, 24, 25, 26, 27, 28, 29}; - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write) -{ - - if (write==0) { - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - - } - - if (write==1) { - - if (dada_hdu_unlock_write (in) < 0) - { - syslog(LOG_ERR, "could not unlock write on hdu_in"); - } - dada_hdu_destroy (in); - - } - -} - -void usage() -{ - fprintf (stdout, - "dsaX_reorder_raw [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -t number of threads [default 4]\n" - " -i input key [default CAPTURED_BLOCK_KEY]\n" - " -o output key [default 
REORDER_BLOCK_KEY]\n" - " -q quitting after testing\n" - " -h print usage\n"); -} - -/* thread for data massaging */ -void * massage(void *args) { - - // basic stuff - struct data *d = args; - int thread_id = d->thread_id; - int dbg = d->debug; - int na = 64; - - // set affinity - const pthread_t pid = pthread_self(); - const int core_id = cores[thread_id]; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); - const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); - if (CPU_ISSET(core_id, &cpuset)) - if (DEBUG || dbg) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id); - - // extract from input data structure - char *in = (char *)d->in; - char *out = (char *)d->out; - int nthreads = d->n_threads; - - // local array - int * fluffed_int = (int *)(in); - int * out_int = (int *)(out); - - // do block transpose - https://codereview.stackexchange.com/questions/229876/fast-matrix-transpose - int tile_size = 4; // set by benchmarking - for (int i_packet=NPACKETS*thread_id/nthreads;i_packet= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (0); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not 
connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - return EXIT_FAILURE; - } - - - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); - uint64_t bytes_read = 0; - char * block, * output_buffer, * blockie; - output_buffer = (char *)malloc(sizeof(char)*block_out); - memset(output_buffer,0,block_out); - uint64_t written, block_id; - - // set up - - int observation_complete=0; - int blocks = 0; - int started = 0; - - - - syslog(LOG_INFO, "starting observation"); - - while (!observation_complete) { - - // open block - block = 
ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - - if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - } - - // DO STUFF - - // set up data structure - for (int i=0; idata_block, &block_id); - memcpy(blockie, output_buffer, block_out); - ipcio_close_block_write(hdu_out->data_block, block_out); - - //written = ipcio_write (hdu_out->data_block, output_buffer, block_out); - - - if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks); - blocks++; - - - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - free(output_buffer); - - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - -} - - diff --git a/src/dsaX_makeFil.c b/src/dsaX_makeFil.c deleted file mode 100644 index e9d6e3c..0000000 --- a/src/dsaX_makeFil.c +++ /dev/null @@ -1,276 +0,0 @@ -/* will reorder raw data for input to xgpu */ -#define __USE_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - -// global variables -int DEBUG = 0; - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) -{ - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - - if (dada_hdu_unlock_write (out) < 0) - { - syslog(LOG_ERR, "could not unlock write on hdu_out"); - } - dada_hdu_destroy (out); - -} - -void usage() -{ - fprintf (stdout, - "dsaX_fake [options]\n" - " -c core bind process to CPU core [no 
default]\n" - " -d send debug messages to syslog\n" - " -i in_key [default TEST_BLOCK_KEY]\n" - " -o out_key [default REORDER_BLOCK_KEY2]\n" - " -h print usage\n"); -} - -// MAIN - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_copydb", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - dada_hdu_t* hdu_out = 0; - - // data block HDU keys - key_t in_key = TEST_BLOCK_KEY; - key_t out_key = REORDER_BLOCK_KEY2; - - // command line arguments - int core = -1; - int useZ = 1; - char fnam[100]; - int arg = 0; - - while ((arg=getopt(argc,argv,"c:f:i:o:dh")) != -1) - { - switch (arg) - { - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'i': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'd': - DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - // Bind to cpu core - if (core >= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog 
(LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); - uint64_t bytes_read = 0; - char * block; - uint64_t written, block_id; - - - // set up - int observation_complete=0; - int blocks = 0, started = 0; - - 
syslog(LOG_INFO, "starting observation"); - - while (!observation_complete) { - - // open block - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - - if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - } - - // DO STUFF - // here is where we convert input voltage data to output filterbank data - - - // write to output dada block - written = ipcio_write (hdu_out->data_block, block, block_out); - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - if (DEBUG) { - syslog(LOG_DEBUG, "written block %d",blocks); - } - blocks++; - - - - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - -} - - diff --git a/src/dsaX_merge.c b/src/dsaX_merge.c deleted file mode 100644 index 7866d5f..0000000 --- a/src/dsaX_merge.c +++ /dev/null @@ -1,580 +0,0 @@ -/* will reorder raw data for input to xgpu */ -#define __USE_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - -/* global variables */ -int DEBUG = 0; -int STATS = 0; -const int nth = 4; - -// data to pass to threads -struct data { - char * in; - char * in2; - char * out; - int * ant_order1; - int * ant_order2; - int n_threads; - int thread_id; -}; -int cores[4] = {17, 18, 37, 38}; - - -void * massage (void *args) { - - struct data *d = args; - int thread_id = d->thread_id; - - // set affinity - const pthread_t pid = 
pthread_self(); - const int core_id = cores[thread_id]; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); - const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); - if (CPU_ISSET(core_id, &cpuset)) - if (DEBUG) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id); - - // extract from input - char *in = (char *)d->in; - char *in2 = (char *)d->in2; - char *out = (char *)d->out; - int n_threads = d->n_threads; - int * ao1 = d->ant_order1; - int * ao2 = d->ant_order2; - - uint64_t oidx, iidx, ncpy = 1536; - - for (int i=thread_id*(2048/n_threads);i<(thread_id+1)*(2048/n_threads);i++) { - for (int j=0;j<3*NSNAPS/2;j++) { - iidx = i*(NSNAPS/2)*4608 + j*1536; - oidx = i*NSNAPS*4608 + ao1[j]*1536; - memcpy(out + oidx, in + iidx, ncpy); - oidx = i*NSNAPS*4608 + ao2[j]*1536; - memcpy(out + oidx, in2 + iidx, ncpy); - } - } - - /* return 0 */ - int thread_result = 0; - pthread_exit((void *) &thread_result); - -} - - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); -int dada_bind_thread_to_core (int core); - - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write) -{ - - if (write==0) { - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - - } - - if (write==1) { - - if (dada_hdu_unlock_write (in) < 0) - { - syslog(LOG_ERR, "could not unlock write on hdu_in"); - } - dada_hdu_destroy (in); - - } - -} - -void usage() -{ - fprintf (stdout, - "dsaX_split [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -m multithread write\n" - " -i in_key\n" - " -o out_key\n" - " -j in_key2\n" - " -h print usage\n"); -} - - -// MAIN - -int main (int 
argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_merge", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - dada_hdu_t* hdu_out = 0; - dada_hdu_t* hdu_in2 = 0; - - // data block HDU keys - key_t in_key = CAPTURE_BLOCK_KEY; - key_t out_key = CAPTURED_BLOCK_KEY; - key_t in_key2 = REORDER_BLOCK_KEY2; - - // command line arguments - int core = -1; - int arg = 0; - int mwrite = 0; - - while ((arg=getopt(argc,argv,"c:i:o:j:dmh")) != -1) - { - switch (arg) - { - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'i': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'j': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key2) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-j flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'd': - DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - case 'm': - mwrite=1; - syslog (LOG_INFO, "Will do multithread write"); - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - - // Bind to cpu core - if (core >= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to 
core %d", core); - } - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (0); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - hdu_in2 = dada_hdu_create (0); - dada_hdu_set_key (hdu_in2, in_key2); - if (dada_hdu_connect (hdu_in2) < 0) { - syslog (LOG_ERR,"could not connect to input buffer2"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read(hdu_in2) < 0) { - syslog (LOG_ERR, "could not lock to input buffer2"); - return EXIT_FAILURE; - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_in2,0); - dsaX_dbgpu_cleanup (hdu_out,1); - - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_in2,0); - dsaX_dbgpu_cleanup (hdu_out,1); - return EXIT_FAILURE; - } - header_in = ipcbuf_get_next_read (hdu_in2->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_in2,0); - dsaX_dbgpu_cleanup (hdu_out,1); - - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in2->header_block) < 0) - { - syslog (LOG_ERR, "could not 
mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_in2,0); - dsaX_dbgpu_cleanup (hdu_out,1); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_in2,0); - dsaX_dbgpu_cleanup (hdu_out,1); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_in2,0); - dsaX_dbgpu_cleanup (hdu_out,1); - return EXIT_FAILURE; - } - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // sort out ant order - int * ao1, * ao2; - ao1 = (int *)malloc(sizeof(int)*48); - ao2 = (int *)malloc(sizeof(int)*48); - ao1[0] = 19; - ao1[1] = 20; - ao1[2] = 21; - ao1[3] = 25; - ao1[4] = 26; - ao1[5] = 27; - ao1[6] = 18; - ao1[7] = 17; - ao1[8] = 16; - ao1[9] = 12; - ao1[10] = 11; - ao1[11] = 45; - ao1[12] = 83; - ao1[13] = 10; - ao1[14] = 9; - ao1[15] = 6; - ao1[16] = 5; - ao1[17] = 4; - ao1[18] = 0; - ao1[19] = 84; - ao1[20] = 85; - ao1[21] = 89; - ao1[22] = 90; - ao1[23] = 91; - ao1[24] = 39; - ao1[25] = 40; - ao1[26] = 41; - ao1[27] = 33; - ao1[28] = 34; - ao1[29] = 35; - ao1[30] = 42; - ao1[31] = 43; - ao1[32] = 44; - ao1[33] = 51; - ao1[34] = 52; - ao1[35] = 53; - ao1[36] = 57; - ao1[37] = 58; - ao1[38] = 59; - ao1[39] = 63; - ao1[40] = 64; - ao1[41] = 65; - ao1[42] = 69; - ao1[43] = 70; - ao1[44] = 71; - ao1[45] = 75; - ao1[46] = 76; - ao1[47] = 77; - ao2[0] = 22; - ao2[1] = 23; - ao2[2] = 24; - ao2[3] = 28; - ao2[4] = 29; - ao2[5] = 30; - ao2[6] = 15; - ao2[7] = 14; - ao2[8] = 13; - ao2[9] = 46; - ao2[10] = 47; - ao2[11] = 48; - ao2[12] = 82; - ao2[13] = 8; - ao2[14] = 7; - ao2[15] = 3; - ao2[16] = 2; - ao2[17] = 1; - 
ao2[18] = 86; - ao2[19] = 87; - ao2[20] = 88; - ao2[21] = 92; - ao2[22] = 93; - ao2[23] = 94; - ao2[24] = 95; - ao2[25] = 31; - ao2[26] = 32; - ao2[27] = 36; - ao2[28] = 37; - ao2[29] = 38; - ao2[30] = 81; - ao2[31] = 49; - ao2[32] = 50; - ao2[33] = 54; - ao2[34] = 55; - ao2[35] = 56; - ao2[36] = 60; - ao2[37] = 61; - ao2[38] = 62; - ao2[39] = 66; - ao2[40] = 67; - ao2[41] = 68; - ao2[42] = 72; - ao2[43] = 73; - ao2[44] = 74; - ao2[45] = 78; - ao2[46] = 79; - ao2[47] = 80; - - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); - uint64_t bytes_read = 0; - char * block1, * block2, * o1, * o2; - char * output = (char *)malloc(sizeof(char)*block_out); - uint64_t written, block_id; - - // set up threads - struct data args[8]; - pthread_t threads[8]; - pthread_attr_t attr; - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - void* result=0; - - // send through fake blocks - - /* if (fake>0) { - syslog(LOG_INFO,"sending %d fake blocks",fake); - for (int i=0;idata_block, &block_id); - memcpy(o1, output, block_out); - ipcio_close_block_write (hdu_out->data_block, block_out); - usleep(10000); - } - syslog(LOG_INFO,"Finished with fake blocks"); - }*/ - - - - // set up - - int observation_complete=0; - int blocks = 0; - int started = 0; - - - - syslog(LOG_INFO, "starting observation"); - - while (!observation_complete) { - - // open block - - block1 = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - block2 = ipcio_open_block_read (hdu_in2->data_block, &bytes_read, &block_id); - - if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - } - - - // DO STUFF - - // copy to output buffer - - if (mwrite) { - o1 = ipcio_open_block_write (hdu_out->data_block, &block_id); - } - 
- // set up data structure - for (int i=0; idata_block, output, block_out); - } - else { - ipcio_close_block_write (hdu_out->data_block, block_out); - } - - if (blocks % 10 == 0) - syslog(LOG_INFO, "written block %d",blocks); - blocks++; - - - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - ipcio_close_block_read (hdu_in2->data_block, bytes_read); - - } - - free(output); - free(ao1); - free(ao2); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_in2,0); - dsaX_dbgpu_cleanup (hdu_out,1); - -} - - diff --git a/src/dsaX_nicdb.c b/src/dsaX_nicdb.c deleted file mode 100644 index df47ebe..0000000 --- a/src/dsaX_nicdb.c +++ /dev/null @@ -1,483 +0,0 @@ -/* -https://dzone.com/articles/parallel-tcpip-socket-server-with-multi-threading - -gcc -o test_ipcbuf test_ipcbuf.c -I/usr/local/psrdada/src -I/usr/local/include -L/usr/local/lib -lpsrdada -lm -pthread -g -O2 -L/usr/lib/gcc/x86_64-linux-gnu/5 -lgfortran - -the plan is to have NCLIENTS threads listening on different threads. -each time data comes over the first 8 bytes consist of the channel group and time sequence as two ints -the rest is a NSAMPS_PER_BLOCK*NBEAMS_PER_TRANSMIT*NW char array that needs to be arranged correctly -The output must be [NBEAMS_PER_BLOCK, NSAMPS_PER_BLOCK, NCHAN_FIL]. - -After a block is full, the data need to be written out (data rate 525 Mb/s) -The number of receives before switching blocks is NCLIENTS*NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT. 
-switch block when one block is being written out - -*/ -#define __USE_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - -#define bdepth 16 -#define MAX_FULLBLOCK 4 - -// global variables -int DEBUG = 0; -volatile int blockct[bdepth]; // to count how many writes to block. max is NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NW -volatile int flush_flag = 0; // set to flush output2 -volatile int writing = 0; -volatile int global_tseq = 0; // global count of full buffers -int cores[16] = {3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28}; // to bind threads to -char iP[100]; -pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; - -// structure to pass to threads -struct data -{ - char * output1; - char * output2; - uint16_t tport; - int thread_id; -}; - -// function prototypes -void dsaX_dbgpu_cleanup (dada_hdu_t * out); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * out) -{ - - if (dada_hdu_unlock_write (out) < 0) - { - syslog(LOG_ERR, "could not unlock write on hdu_out"); - } - dada_hdu_destroy (out); - -} - - -// receive process - runs infinite loop -void * process(void * ptr) -{ - - // arguments from structure - struct data *d = ptr; - int thread_id = d->thread_id; - char *output1 = (char *)d->output1; - char *output2 = (char *)d->output2; - uint16_t tport = d->tport; - - // set affinity - const pthread_t pid = pthread_self(); - const int core_id = cores[thread_id]; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); 
- if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); - const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); - if (CPU_ISSET(core_id, &cpuset)) - if (DEBUG) syslog(LOG_INFO,"thread %d: successfully set thread",thread_id); - - // set up socket - struct sockaddr_in si_other, si_me; - int clientSocket, slen=sizeof(si_other); - clientSocket=socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); - if (DEBUG) syslog(LOG_INFO,"thread %d: Made socket",thread_id); - memset((char *) &si_me, 0, sizeof(si_me)); - si_me.sin_family = AF_INET; - si_me.sin_port = htons(tport); - si_me.sin_addr.s_addr = inet_addr(iP); - if (bind(clientSocket, (struct sockaddr *)&si_me, sizeof(si_me)) < 0) { - syslog(LOG_ERR,"thread %d: cannot bind to port",thread_id); - exit(1); - } - if (DEBUG) syslog(LOG_INFO,"thread %d: socket bound - waiting for header packet",thread_id); - - char * packet = (char *)malloc(sizeof(char)*P_SIZE); - int * ibuf; - recvfrom(clientSocket, packet, P_SIZE, 0,(struct sockaddr *)&si_other,&slen); - ibuf = (int *)(packet); - int chgroup = ibuf[0]; - syslog(LOG_INFO,"thread %d: accepted connection from chgroup %d",thread_id,chgroup); - - // data buffer and other variables - char * buffer = (char *)malloc((NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW)*sizeof(char)); - int tseq, pseq; - int pct = 0; - int full_blocks = 0; - int fullBlock; - int i0, aa; - int lastPacket, nextBuf, current_tseq = 0, act_tseq; - uint64_t shifty = (bdepth-1)*NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NCHAN_FIL; - uint64_t oidx_offset, oidx; - - // infinite loop - while (1) { - - /* read message */ - // fill up local buffer - lastPacket = 0; - nextBuf = 0; - while ((lastPacket==0) && (nextBuf==0)) { - - recvfrom(clientSocket, packet, P_SIZE, 0,(struct sockaddr *)&si_other,&slen); - ibuf = (int *)(packet); - pseq = ibuf[2]; - if (chgroup != ibuf[0]) - syslog(LOG_ERR,"thread %d: 
received chgroup %d is not recorded %d",thread_id,ibuf[0],chgroup); - tseq = ibuf[1]; - - if (tseq>current_tseq) { - nextBuf=1; - } - else if (tseq==current_tseq) { - memcpy(buffer+pseq*(P_SIZE-12),packet+12,P_SIZE-12); - pct++; - } - - if (pseq==NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW/(P_SIZE-12)-1) - lastPacket=1; - - } - - if (pct != NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW/(P_SIZE-12)) - syslog(LOG_ERR,"thread %d: only received %d of %d",thread_id,pct,NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW/(P_SIZE-12)); - - act_tseq = (current_tseq * NSAMPS_PER_TRANSMIT) % NSAMPS_PER_BLOCK; // place within output buffer - - // at this stage we have a full local buffer - // this needs to be placed in the global buffer - - // output order is [beam, time, freq]. input order is [beam, time, freq], but only a subset of freqs - i0 = 0; - aa = ((current_tseq / (NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT)) % bdepth); - oidx_offset = ((uint64_t)(aa))*NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NCHAN_FIL; - //syslog(LOG_INFO,"thread %d: read message with chgroup %d tseq %d current_tseq %d global_tseq %d position %d %"PRIu64"",thread_id,chgroup,tseq,current_tseq,global_tseq,aa,oidx_offset); - for (int i=0;i=MAX_FULLBLOCK && blockct[i] >= (NCLIENTS-1)*NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT)) { - - // need to write this block and reset blockct - while (flush_flag==1) - aa==1; - flush_flag = 1; - blockct[i] = 0; - // log - hardcoded bdepth - full_blocks -= 1; - syslog(LOG_INFO,"thread %d: Writing global_tseq %d. 
Blockcts_full %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d",thread_id,global_tseq,full_blocks,blockct[0],blockct[1],blockct[2],blockct[3],blockct[4],blockct[5],blockct[6],blockct[7],blockct[8],blockct[9],blockct[10],blockct[11],blockct[12],blockct[13],blockct[14],blockct[15]); - - - } - - } - - pthread_mutex_unlock(&mutex); - - // advance local tseq and deal with packet capture - if (lastPacket==1) { - current_tseq++; - lastPacket=0; - nextBuf=0; - pct=0; - } - if (nextBuf==1) { - current_tseq++; - memcpy(buffer+pseq*(P_SIZE-12),packet+12,P_SIZE-12); - pct=1; - lastPacket=0; - } - - - - } - - /* close socket and clean up */ - close(clientSocket); - free(packet); - free(buffer); - pthread_exit(0); - -} - -void usage() -{ - fprintf (stdout, - "dsaX_nicdb [options]\n" - " -c core bind process to CPU core [no default]\n" - " -f header file [no default]\n" - " -d send debug messages to syslog\n" - " -o out_key [default BEAMCAPTURE_BLOCK_KEY]\n" - " -i IP address\n" - " -h print usage\n"); -} - - -// main part of program -int main(int argc, char ** argv) -{ - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_nicdb", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - // threads - struct data args[16]; - pthread_t threads[16]; - pthread_attr_t attr; - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - void* result=0; - for (int i=0;i= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - // DADA stuff - - hdu_out = dada_hdu_create (0); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - // deal with headers - 
uint64_t header_size = 4096; - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - FILE *fin; - if (!(fin=fopen(fnam,"rb"))) { - syslog(LOG_ERR,"cannot open dada header file %s",fnam); - return EXIT_FAILURE; - } - fread(header_out, 4096, 1, fin); - fclose(fin); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_out); - return EXIT_FAILURE; - } - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_out); - return EXIT_FAILURE; - } - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have output block sizes %lu\n",block_out); - uint64_t bytes_read = 0; - char *output1, *output2; - output1 = (char *)malloc(sizeof(char)*block_out*bdepth); - output2 = (char *)malloc(sizeof(char)*block_out); - memset(output1,0,block_out*bdepth); - memset(output2,0,block_out); - uint64_t written, block_id; - - // set up threads - - // set up data structure - for (int i=0; idata_block, output1 + (global_tseq % bdepth)*block_out, block_out); - global_tseq += 1; - writing=0; - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_out); - return EXIT_FAILURE; - } - - syslog(LOG_INFO, "written block %d",blocks); - blocks++; - - flush_flag = 0; - - } - - - // free stuff - for(int i=0; i -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include 
"ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - -// global variables -int DEBUG = 0; -int blockct = 0; // to count how many writes to block. max is NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NW -int block_switch = 0; // 0 means write to output1, write out output2. -int cores[16] = {3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28}; // to bind threads to -char iP[100]; - -// structure to pass to threads -struct data -{ - char * output1; - char * output2; - uint16_t tport; - int thread_id; -}; - -// function prototypes -void dsaX_dbgpu_cleanup (dada_hdu_t * out); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * out) -{ - - if (dada_hdu_unlock_write (out) < 0) - { - syslog(LOG_ERR, "could not unlock write on hdu_out"); - } - dada_hdu_destroy (out); - -} - - -// receive process - runs infinite loop -void * process(void * ptr) -{ - - // arguments from structure - struct data *d = ptr; - int thread_id = d->thread_id; - char *output1 = (char *)d->output1; - char *output2 = (char *)d->output2; - uint16_t tport = d->tport; - - // set affinity - const pthread_t pid = pthread_self(); - const int core_id = cores[thread_id]; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); - const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); - if (CPU_ISSET(core_id, &cpuset)) - if (DEBUG) syslog(LOG_INFO,"thread %d: successfully set thread",thread_id); - - // set up socket - int sock = -1, conn = -1; - struct sockaddr_in address, cli; - - /* create socket */ - sock = socket(AF_INET, SOCK_STREAM, 0); - if (DEBUG) syslog(LOG_INFO,"thread %d: opened socket",thread_id); - memset(&address, 0, sizeof(struct sockaddr_in)); - address.sin_family = 
AF_INET; - inet_pton(AF_INET, iP, &(address.sin_addr)); - //address.sin_addr.s_addr = inet_addr("127.0.0.1"); - address.sin_port = htons(tport); - if (DEBUG) syslog(LOG_INFO,"thread %d: socket ready",thread_id); - if (bind(sock, (struct sockaddr *)&address, sizeof(struct sockaddr_in)) < 0) { - syslog(LOG_ERR,"thread %d: cannot bind to port",thread_id); - exit(1); - } - if (DEBUG) syslog(LOG_INFO,"thread %d: socket bound",thread_id); - listen(sock, 5); - if (DEBUG) syslog(LOG_INFO,"thread %d: socket listening on port %d",thread_id,tport); - - // accept connection - socklen_t cli_len=sizeof(struct sockaddr); - conn = accept(sock, (struct sockaddr *) &cli, &cli_len); - if (conn<0) { - syslog(LOG_ERR,"thread %d: error accepting connection",thread_id); - exit(1); - } - syslog(LOG_INFO,"thread %d: accepted connection",thread_id); - - // data buffer and other variables - char * buffer = (char *)malloc((8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW)*sizeof(char)); - char * dblock = (char *)malloc((8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW)*sizeof(char)); - int *ibuf, chgroup, tseq, oidx, iidx; - int remain_data, outptr, len; - int i0; - - // infinite loop - while (1) { - - /* read message */ - // read to buffer until all is read - remain_data =(int)(8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW); - outptr=0; - - /* - while (((len = recv(conn, dblock, remain_data, 0)) > 0) && (remain_data > 0)) { - memcpy(buffer+outptr, dblock, len); - remain_data -= len; - outptr += len; - //syslog(LOG_INFO,"Received %d of %d bytes",outptr,8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW); - }*/ - //recvlen = read(sock, buffer, sizeof(buffer)); - ibuf = (int *)(buffer); - len = recv(conn, dblock, remain_data, MSG_WAITALL); - memcpy(buffer, dblock, len); - remain_data -= len; - if (remain_data != 0) - syslog(LOG_ERR,"thread %d: only received %d of %d",thread_id,len,(int)(8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW)); - - if (remain_data==0) { - - // get channel group and time sequence - chgroup = ibuf[0]; 
// from 0-15 - tseq = ibuf[1]; // continuous iterate over transmits - if (DEBUG) syslog(LOG_INFO,"thread %d: read message with chgroup %d tseq %d blockct %d",thread_id,chgroup,tseq,blockct); - tseq = (tseq * 128) % 4096; // place within output - - // output order is [beam, time, freq]. input order is [beam, time, freq], but only a subset of freqs - i0 = 8; - for (int i=0;i= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - // DADA stuff - - hdu_out = dada_hdu_create (); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - // deal with headers - uint64_t header_size = 4096; - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - FILE *fin; - if (!(fin=fopen(fnam,"rb"))) { - syslog(LOG_ERR,"cannot open dada header file %s",fnam); - return EXIT_FAILURE; - } - fread(header_out, 4096, 1, fin); - fclose(fin); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_out); - return EXIT_FAILURE; - } - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_out); - return EXIT_FAILURE; - } - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have output block sizes %llu\n",block_out); - uint64_t bytes_read = 0; - char *output1, *output2; - output1 = (char *)malloc(sizeof(char)*block_out); - output2 = (char *)malloc(sizeof(char)*block_out); - memset(output1,0,block_out); - 
memset(output2,0,block_out); - uint64_t written, block_id; - - // set up threads - - // set up data structure - for (int i=0; i=NCLIENTS*NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT) { - - // change output - bswitch= block_switch; - blockct=0; - if (bswitch==0) block_switch=1; - if (bswitch==1) block_switch=0; - - // write to output - if (bswitch==0) written = ipcio_write (hdu_out->data_block, output1, block_out); - if (bswitch==1) written = ipcio_write (hdu_out->data_block, output2, block_out); - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_out); - return EXIT_FAILURE; - } - - if (DEBUG) syslog(LOG_INFO, "written block %d",blocks); - blocks++; - ctt=0; - } - - } - - // free stuff - for(int i=0; i -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - -#include -#include -#include - -// data to pass to threads -struct data { - char * in; - char * out; - int n_threads; - int thread_id; - int debug; -}; - -/* global variables */ -int DEBUG = 0; -int cores[16] = {4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}; - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write) -{ - - if (write==0) { - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - - } - - if (write==1) { - - if (dada_hdu_unlock_write (in) < 0) - { - syslog(LOG_ERR, "could not unlock write on hdu_in"); - } - dada_hdu_destroy (in); - - } - -} - -void usage() -{ - fprintf 
(stdout, - "dsaX_reorder_raw [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -t number of threads [default 4]\n" - " -i input key [default CAPTURED_BLOCK_KEY]\n" - " -o output key [default REORDER_BLOCK_KEY]\n" - " -q quitting after testing\n" - " -h print usage\n"); -} - -/* thread for data massaging */ -void * massage(void *args) { - - // basic stuff - struct data *d = args; - int thread_id = d->thread_id; - int dbg = d->debug; - - // masks for fluffing - __m512i masks[4]; - masks[0] = _mm512_set_epi64(0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL); - masks[1] = _mm512_set_epi64(0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL); - masks[2] = _mm512_set_epi64(0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL); - masks[3] = _mm512_set_epi64(0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL); - - - // set affinity - const pthread_t pid = pthread_self(); - const int core_id = cores[thread_id]; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); - const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); - if (CPU_ISSET(core_id, &cpuset)) - if (DEBUG || dbg) syslog(LOG_DEBUG,"thread %d: 
successfully set thread",thread_id); - - // extract from input data structure - char *in = (char *)d->in; - char *out = (char *)d->out; - int nthreads = d->n_threads; - - /* DO ALL PROCESSING - - "in" is input block: NPACKETS * NANTS * (384*2) * 2 pol * r/i. (384*2 is for the two times) - "out" needs to be in order NPACKETS * (384*2) * 64 * 2 pol * r/i - parallelize by splitting on NPACKETS axis. - - */ - - // input and output index and extracted data - int idx = thread_id; // PACKET idx for input and output - char * proc_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2); // for 4-bit data - char * fluffed_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2*2); // for 8-bit data - - // extract data - memcpy(proc_data,in+idx*(NPACKETS/nthreads)*NANTS*(384*2)*2,(NPACKETS/nthreads)*NANTS*(384*2)*2); - if (DEBUG || dbg) syslog(LOG_DEBUG,"thread %d: extracted data",thread_id); - - // do fluffing - - /* - technique is to use nybble masks to - (a) unmask every fourth nybble - (b) bit shift to left using mm512_slli_epi16 - (c) sign extend by 4 bits using mm512_srai_epi16 - (d) bit shift to right - - Will produce m512 for lower and upper bytes. Then just need to copy into fluffed_data - - */ - - // variables - char * low = (char *)malloc(sizeof(char)*64); // m512 - char * hi = (char *)malloc(sizeof(char)*64); // m512 - __m512i low_m, hi_m; - unsigned short * low_u = (unsigned short *)(low); - unsigned short * hi_u = (unsigned short *)(hi); - __m512i v[4]; // for 4 packed 4-bit numbers - - // input and output - __m512i proc_m; - unsigned short * fluffed_u = (unsigned short *)(fluffed_data); - - // numbers to iterate over - int n_512 = (NPACKETS/nthreads)*NANTS*(384*2)*2/64; - - if (dbg || DEBUG) syslog(LOG_DEBUG,"thread %d: ready to fluff",thread_id); - - // let's do it! 
- for (int i=0;i= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (0); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - return EXIT_FAILURE; - } - - - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now 
in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); - uint64_t bytes_read = 0; - char * block, * output_buffer; - output_buffer = (char *)malloc(sizeof(char)*block_out); - memset(output_buffer,0,block_out); - uint64_t written, block_id; - - // set up - - int observation_complete=0; - int blocks = 0; - int started = 0; - - - - syslog(LOG_INFO, "starting observation"); - - while (!observation_complete) { - - // open block - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - - if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - } - - // DO STUFF - - // set up data structure - for (int i=0; idata_block, output_buffer, block_out); - - - if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks); - blocks++; - - - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - free(output_buffer); - - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - -} - - diff --git a/src/dsaX_reorder_raw.c b/src/dsaX_reorder_raw.c deleted file mode 100644 index c0f6b0c..0000000 --- a/src/dsaX_reorder_raw.c +++ /dev/null @@ -1,613 +0,0 @@ -/* will reorder raw data for input to xgpu */ -#define __USE_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -// Forward declaration to keep compiler happy -// Possible minor bug in PSRDada -int ipcio_check_pending_sod (ipcio_t* ); -#include "ipcbuf.h" -#include "dada_affinity.h" 
-#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - -#include -#include -#include - -// data to pass to threads -struct data { - char * in; - char * out; - int n_threads; - int thread_id; - int debug; - int write; - ipcio_t * ipc; -}; - -/* global variables */ -int DEBUG = 0; -int cores[16] = {4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}; - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write) -{ - - if (write==0) { - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - - } - - if (write==1) { - - if (dada_hdu_unlock_write (in) < 0) - { - syslog(LOG_ERR, "could not unlock write on hdu_in"); - } - dada_hdu_destroy (in); - - } - -} - -void usage() -{ - fprintf (stdout, - "dsaX_reorder_raw [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -t number of threads [default 4]\n" - " -b connect to bf hdu\n" - " -i input key [default CAPTURED_BLOCK_KEY]\n" - " -o output key [default REORDER_BLOCK_KEY]\n" - " -q quitting after testing\n" - " -h print usage\n"); -} - -/* thread for data massaging */ -void * massage(void *args) { - - // basic stuff - struct data *d = args; - int thread_id = d->thread_id; - int na = 64; // output ants - int dbg = d->debug; - - // set affinity - const pthread_t pid = pthread_self(); - const int core_id = cores[thread_id]; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); - const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); - if (CPU_ISSET(core_id, &cpuset)) - if (DEBUG || dbg) 
syslog(LOG_INFO,"thread %d: successfully set thread",thread_id); - - // extract from input data structure - char *in = (char *)d->in; - char *out = (char *)d->out; - int nthreads = d->n_threads; - - /* DO ALL PROCESSING - - "in" is input block: NPACKETS * NANTS * (384*2) * 2 pol * r/i. (384*2 is for the two times) - "out" needs to be in order NPACKETS * (384*2) * 64 * 2 pol * r/i - parallelize by splitting on NPACKETS axis. - - */ - - // input and output index and extracted data - int idx = thread_id; // PACKET idx for input and output - //char * proc_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2); // for 4-bit data - //char * fluffed_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2*2); // for 8-bit data - //char * out_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*(384*2)*na*2*2); // for output 8-bit data - - // extract data - //memcpy(proc_data,in+idx*(NPACKETS/nthreads)*NANTS*(384*2)*2,(NPACKETS/nthreads)*NANTS*(384*2)*2); - if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: extracted data",thread_id); - - // do fluffing in dumbest possible way - - if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to fluff",thread_id); - - // let's do it! 
- int in_idx, out_idx, a1, a2, a3, a4, a5, a6; - int in_offset = idx*(NPACKETS/nthreads)*NANTS*(384*2)*2; - int out_offset = idx*(NPACKETS/nthreads)*(384*2)*na*2; - for (int i=0;i<(NPACKETS/nthreads);i++) { - a1 = i*NANTS*1536; - a2 = i*na*1536; - for (int j=0;jipc->curbuf[out_offset+out_idx] = in[in_offset+in_idx]; - //d->ipc->curbuf[out_offset+2*out_idx+1] = in[in_offset+in_idx] >> 4; - - } - } - } - } - - /*for (int i=0;i<(NPACKETS/nthreads)*NANTS*(384*2)*2;i++) { // loop over chars in proc_data - - fluffed_data[2*i] = ((proc_data[i]<<4) & 240) >> 4; - fluffed_data[2*i+1] = proc_data[i] >> 4; - - }*/ - - if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: fluffed",thread_id); - - // transpose antennas and frequencies by ints - // from fluffed_data to out_data - /* int * fluffed_int = (int *)(fluffed_data); - memset(out_data,0,(NPACKETS/nthreads)*(384*2)*na*2*2); - int * out_int = (int *)out_data;*/ - - if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to transpose",thread_id); - - // do block transpose - https://codereview.stackexchange.com/questions/229876/fast-matrix-transpose - /* int tile_size = 3; // set by benchmarking - for (int i_packet=0;i_packetwrite) - memcpy (d->ipc->curbuf + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2); - else - memcpy(out + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2); - */ - if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: done - freeing",thread_id); - - // free stuff - //free(proc_data); - //free(fluffed_data); - //free(out_data); - - /* return 0 */ - int thread_result = 0; - pthread_exit((void *) &thread_result); - -} - - -// MAIN - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_reorder_raw", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - // TESTING and initialization - // threads - struct data args[16]; - pthread_t threads[16]; - 
pthread_attr_t attr; - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - void* result=0; - - // run test with single thread - - /*syslog(LOG_INFO,"Running TEST...\n"); - - // set up data structure - char * test_block = (char *)malloc(sizeof(char)*NPACKETS*NANTS*(384*2)*2); - char * test_output = (char *)malloc(sizeof(char)*NPACKETS*64*(384*2)*2*2); - memset(test_block,0,sizeof(test_block)); - - TEST CODE - FILE *fin; - fin=fopen("../utils/packet.out","rb"); - fread(test_block, 96768, 1, fin); - fclose(fin); - END TEST CODE - - args[0].in = test_block; - args[0].out = test_output; - args[0].n_threads = 1; - args[0].thread_id = 0; - args[0].debug = 0; - args[0].write = 0; - - // run test thread - if (pthread_create(&threads[0], &attr, &massage, (void *)(&args[0]))) { - syslog(LOG_ERR,"Failed to create TEST massage thread 0\n"); - } - else - syslog(LOG_INFO,"Created TEST thread\n"); - pthread_attr_destroy(&attr); - pthread_join(threads[0], &result); - syslog(LOG_INFO,"joined TEST thread"); - - TEST CODE - fin=fopen("../utils/test.out","wb"); - fwrite(test_output, 1, 196608, fin); - fclose(fin); - END TEST CODE - - // clean up - free(test_block); - free(test_output); - - syslog(LOG_INFO,"TEST COMPLETE");*/ - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - dada_hdu_t* hdu_out = 0; - dada_hdu_t* hdu_out2 = 0; - - // data block HDU keys - key_t in_key = CAPTURED_BLOCK_KEY; - key_t out_key = REORDER_BLOCK_KEY; - key_t out_key2 = REORDER_BLOCK_KEY2; - - // command line arguments - int core = -1; - int nthreads = 1; - int bf = 0; - int arg = 0; - - while ((arg=getopt(argc,argv,"c:t:i:o:dbqh")) != -1) - { - switch (arg) - { - case 'i': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) - { - 
if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 't': - if (optarg) - { - nthreads = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-t flag requires argument"); - usage(); - return EXIT_FAILURE; - } - - case 'd': - DEBUG=1; - syslog (LOG_INFO, "Will excrete all debug messages"); - break; - - case 'q': - syslog (LOG_INFO, "Quit here"); - return EXIT_SUCCESS; - - case 'b': - bf=1; - syslog (LOG_INFO, "Will write to bf dada hdu"); - break; - - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - - // Bind to cpu core - if (core >= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (0); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - if (bf) { - hdu_out2 = dada_hdu_create (0); - dada_hdu_set_key (hdu_out2, out_key2); - if (dada_hdu_connect (hdu_out2) < 0) { - syslog (LOG_ERR,"could not connect to output buffer2"); - return EXIT_FAILURE; - } - if 
(dada_hdu_lock_write(hdu_out2) < 0) { - syslog (LOG_ERR, "could not lock to output buffer2"); - return EXIT_FAILURE; - } - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - - - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - - if (bf) { - header_out = ipcbuf_get_next_write (hdu_out2->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header2 block [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not 
mark header block2 filled [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - } - - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); - uint64_t bytes_read = 0; - char * block, * output_buffer, * blockie; - output_buffer = (char *)malloc(sizeof(char)*block_out); - memset(output_buffer,0,block_out); - uint64_t written, block_id; - - // set up - - int observation_complete=0; - int blocks = 0; - int started = 0; - - - - syslog(LOG_INFO, "starting observation"); - - while (!observation_complete) { - - // open block - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - - if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - } - - // DO STUFF - - // sort out write - hdu_out->data_block->curbuf = ipcbuf_get_next_write ((ipcbuf_t*)hdu_out->data_block); - hdu_out->data_block->marked_filled = 0; - //blockie = ipcio_open_block_write(hdu_out->data_block, &block_id); - - // set up data structure - for (int i=0; idata_block; - args[i].write = 1; - } - - if (DEBUG) syslog(LOG_INFO,"creating %d threads",nthreads); - - for(int i=0; idata_block, output_buffer, block_out); - - if (bf) { - - written = ipcio_write (hdu_out2->data_block, output_buffer, block_out); - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - - } - - // finish 
write - ipcbuf_mark_filled ((ipcbuf_t*)hdu_out->data_block, block_out); - ipcio_check_pending_sod (hdu_out->data_block); - hdu_out->data_block->marked_filled = 1; - //ipcio_close_block_write(hdu_out->data_block, block_out); - - if (DEBUG) syslog(LOG_INFO, "written block %d",blocks); - blocks++; - - - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - free(output_buffer); - - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - -} - - diff --git a/src/dsaX_reorder_raw.c.bak b/src/dsaX_reorder_raw.c.bak deleted file mode 100644 index 0914823..0000000 --- a/src/dsaX_reorder_raw.c.bak +++ /dev/null @@ -1,672 +0,0 @@ -/* will reorder raw data for input to xgpu */ -#define __USE_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - -#include -#include -#include - -// data to pass to threads -struct data { - char * in; - char * out; - int n_threads; - int thread_id; - int debug; - int write; - ipcio_t * ipc; -}; - -/* global variables */ -int DEBUG = 0; -int cores[16] = {4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}; - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write) -{ - - if (write==0) { - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - - } - - if (write==1) { - - if 
(dada_hdu_unlock_write (in) < 0) - { - syslog(LOG_ERR, "could not unlock write on hdu_in"); - } - dada_hdu_destroy (in); - - } - -} - -void usage() -{ - fprintf (stdout, - "dsaX_reorder_raw [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -t number of threads [default 4]\n" - " -b connect to bf hdu\n" - " -i input key [default CAPTURED_BLOCK_KEY]\n" - " -o output key [default REORDER_BLOCK_KEY]\n" - " -q quitting after testing\n" - " -h print usage\n"); -} - -/* thread for data massaging */ -void * massage(void *args) { - - // basic stuff - struct data *d = args; - int thread_id = d->thread_id; - int na = 64; // output ants - int dbg = d->debug; - - // masks for fluffing - __m512i masks[4]; - masks[0] = _mm512_set_epi64(0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL); - masks[1] = _mm512_set_epi64(0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL); - masks[2] = _mm512_set_epi64(0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL); - masks[3] = _mm512_set_epi64(0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL); - - - // set affinity - const pthread_t pid = pthread_self(); - const int core_id = cores[thread_id]; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); - const int get_affinity = 
pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); - if (CPU_ISSET(core_id, &cpuset)) - if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: successfully set thread",thread_id); - - // extract from input data structure - char *in = (char *)d->in; - char *out = (char *)d->out; - int nthreads = d->n_threads; - - /* DO ALL PROCESSING - - "in" is input block: NPACKETS * NANTS * (384*2) * 2 pol * r/i. (384*2 is for the two times) - "out" needs to be in order NPACKETS * (384*2) * 64 * 2 pol * r/i - parallelize by splitting on NPACKETS axis. - - */ - - // input and output index and extracted data - int idx = thread_id; // PACKET idx for input and output - char * proc_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2); // for 4-bit data - char * fluffed_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2*2); // for 8-bit data - char * out_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*(384*2)*na*2*2); // for output 8-bit data - - // extract data - memcpy(proc_data,in+idx*(NPACKETS/nthreads)*NANTS*(384*2)*2,(NPACKETS/nthreads)*NANTS*(384*2)*2); - if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: extracted data",thread_id); - - // do fluffing - - /* - technique is to use nybble masks to - (a) unmask every fourth nybble - (b) bit shift to left using mm512_slli_epi16 - (c) sign extend by 4 bits using mm512_srai_epi16 - (d) bit shift to right - - Will produce m512 for lower and upper bytes. 
Then just need to copy into fluffed_data - - */ - - // variables - char * low = (char *)malloc(sizeof(char)*64); // m512 - char * hi = (char *)malloc(sizeof(char)*64); // m512 - __m512i low_m, hi_m; - unsigned short * low_u = (unsigned short *)(low); - unsigned short * hi_u = (unsigned short *)(hi); - __m512i v[4]; // for 4 packed 4-bit numbers - - // input and output - __m512i proc_m; - unsigned short * fluffed_u = (unsigned short *)(fluffed_data); - - // numbers to iterate over - int n_512 = (NPACKETS/nthreads)*NANTS*(384*2)*2/64; - - if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to fluff",thread_id); - - // let's do it! - for (int i=0;iwrite) - memcpy (d->ipc->curbuf + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2); - else - memcpy(out + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2); - - if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: done - freeing",thread_id); - - // free stuff - free(proc_data); - free(fluffed_data); - free(out_data); - free(low); - free(hi); - - /* return 0 */ - int thread_result = 0; - pthread_exit((void *) &thread_result); - -} - - -// MAIN - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_reorder_raw", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - // TESTING and initialization - // threads - struct data args[16]; - pthread_t threads[16]; - pthread_attr_t attr; - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - void* result=0; - - // run test with single thread - - syslog(LOG_INFO,"Running TEST...\n"); - - // set up data structure - char * test_block = (char *)malloc(sizeof(char)*NPACKETS*NANTS*(384*2)*2); - char * test_output = (char *)malloc(sizeof(char)*NPACKETS*64*(384*2)*2*2); - memset(test_block,0,sizeof(test_block)); - - /* TEST CODE - FILE *fin; - fin=fopen("../utils/packet.out","rb"); - 
fread(test_block, 96768, 1, fin); - fclose(fin); - END TEST CODE */ - - args[0].in = test_block; - args[0].out = test_output; - args[0].n_threads = 1; - args[0].thread_id = 0; - args[0].debug = 0; - args[0].write = 0; - - // run test thread - if (pthread_create(&threads[0], &attr, &massage, (void *)(&args[0]))) { - syslog(LOG_ERR,"Failed to create TEST massage thread 0\n"); - } - else - syslog(LOG_INFO,"Created TEST thread\n"); - pthread_attr_destroy(&attr); - pthread_join(threads[0], &result); - syslog(LOG_INFO,"joined TEST thread"); - - /* TEST CODE - fin=fopen("../utils/test.out","wb"); - fwrite(test_output, 1, 196608, fin); - fclose(fin); - END TEST CODE */ - - // clean up - free(test_block); - free(test_output); - - syslog(LOG_INFO,"TEST COMPLETE"); - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - dada_hdu_t* hdu_out = 0; - dada_hdu_t* hdu_out2 = 0; - - // data block HDU keys - key_t in_key = CAPTURED_BLOCK_KEY; - key_t out_key = REORDER_BLOCK_KEY; - key_t out_key2 = REORDER_BLOCK_KEY2; - - // command line arguments - int core = -1; - int nthreads = 1; - int bf = 0; - int arg = 0; - - while ((arg=getopt(argc,argv,"c:t:i:o:dbqh")) != -1) - { - switch (arg) - { - case 'i': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 't': - if (optarg) - { - nthreads = atoi(optarg); - break; - } - else - { - 
syslog(LOG_ERR,"-t flag requires argument"); - usage(); - return EXIT_FAILURE; - } - - case 'd': - DEBUG=1; - syslog (LOG_INFO, "Will excrete all debug messages"); - break; - - case 'q': - syslog (LOG_INFO, "Quit here"); - return EXIT_SUCCESS; - - case 'b': - bf=1; - syslog (LOG_INFO, "Will write to bf dada hdu"); - break; - - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - - // Bind to cpu core - if (core >= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - if (bf) { - hdu_out2 = dada_hdu_create (); - dada_hdu_set_key (hdu_out2, out_key2); - if (dada_hdu_connect (hdu_out2) < 0) { - syslog (LOG_ERR,"could not connect to output buffer2"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out2) < 0) { - syslog (LOG_ERR, "could not lock to output buffer2"); - return EXIT_FAILURE; - } - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - - - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared 
(hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - - if (bf) { - header_out = ipcbuf_get_next_write (hdu_out2->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header2 block [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block2 filled [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - } - - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) 
hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); - uint64_t bytes_read = 0; - char * block, * output_buffer, * blockie; - output_buffer = (char *)malloc(sizeof(char)*block_out); - memset(output_buffer,0,block_out); - uint64_t written, block_id; - - // set up - - int observation_complete=0; - int blocks = 0; - int started = 0; - - - - syslog(LOG_INFO, "starting observation"); - - while (!observation_complete) { - - // open block - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - - if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - } - - // DO STUFF - - // sort out write - hdu_out->data_block->curbuf = ipcbuf_get_next_write ((ipcbuf_t*)hdu_out->data_block); - hdu_out->data_block->marked_filled = 0; - //blockie = ipcio_open_block_write(hdu_out->data_block, &block_id); - - // set up data structure - for (int i=0; idata_block; - args[i].write = 1; - } - - if (DEBUG) syslog(LOG_INFO,"creating %d threads",nthreads); - - for(int i=0; idata_block, output_buffer, block_out); - - if (bf) { - - written = ipcio_write (hdu_out2->data_block, output_buffer, block_out); - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - - } - - // finish write - ipcbuf_mark_filled ((ipcbuf_t*)hdu_out->data_block, block_out); - ipcio_check_pending_sod (hdu_out->data_block); - hdu_out->data_block->marked_filled = 1; - //ipcio_close_block_write(hdu_out->data_block, block_out); - - if (DEBUG) syslog(LOG_INFO, "written block %d",blocks); - blocks++; - - - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - free(output_buffer); - - dsaX_dbgpu_cleanup (hdu_in,0); - 
dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - -} - - diff --git a/src/dsaX_reorder_raw.c.bak2 b/src/dsaX_reorder_raw.c.bak2 deleted file mode 100644 index 54ad886..0000000 --- a/src/dsaX_reorder_raw.c.bak2 +++ /dev/null @@ -1,608 +0,0 @@ -/* will reorder raw data for input to xgpu */ -#define __USE_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - -#include -#include -#include - -// data to pass to threads -struct data { - char * in; - char * out; - int n_threads; - int thread_id; - int debug; - int write; - ipcio_t * ipc; -}; - -/* global variables */ -int DEBUG = 0; -int cores[16] = {4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}; - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write) -{ - - if (write==0) { - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - - } - - if (write==1) { - - if (dada_hdu_unlock_write (in) < 0) - { - syslog(LOG_ERR, "could not unlock write on hdu_in"); - } - dada_hdu_destroy (in); - - } - -} - -void usage() -{ - fprintf (stdout, - "dsaX_reorder_raw [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -t number of threads [default 4]\n" - " -b connect to bf hdu\n" - " -i input key [default CAPTURED_BLOCK_KEY]\n" - " -o output key [default REORDER_BLOCK_KEY]\n" - " -q quitting after testing\n" - " 
-h print usage\n"); -} - -/* thread for data massaging */ -void * massage(void *args) { - - // basic stuff - struct data *d = args; - int thread_id = d->thread_id; - int na = 64; // output ants - int dbg = d->debug; - - // set affinity - const pthread_t pid = pthread_self(); - const int core_id = cores[thread_id]; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); - const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); - if (CPU_ISSET(core_id, &cpuset)) - if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: successfully set thread",thread_id); - - // extract from input data structure - char *in = (char *)d->in; - char *out = (char *)d->out; - int nthreads = d->n_threads; - - /* DO ALL PROCESSING - - "in" is input block: NPACKETS * NANTS * (384*2) * 2 pol * r/i. (384*2 is for the two times) - "out" needs to be in order NPACKETS * (384*2) * 64 * 2 pol * r/i - parallelize by splitting on NPACKETS axis. - - */ - - // input and output index and extracted data - int idx = thread_id; // PACKET idx for input and output - char * proc_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2); // for 4-bit data - //char * fluffed_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2*2); // for 8-bit data - char * out_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*(384*2)*na*2*2); // for output 8-bit data - - // extract data - memcpy(proc_data,in+idx*(NPACKETS/nthreads)*NANTS*(384*2)*2,(NPACKETS/nthreads)*NANTS*(384*2)*2); - if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: extracted data",thread_id); - - // do fluffing in dumbest possible way - - if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to fluff",thread_id); - - // let's do it! 
- int in_idx, out_idx, a1, a2, a3, a4, a5, a6; - for (int i=0;i<(NPACKETS/nthreads);i++) { - a1 = i*NANTS*1536; - a2 = i*na*1536; - for (int j=0;j> 4; - out_data[2*out_idx+1] = proc_data[in_idx] >> 4; - - } - } - } - } - - /*for (int i=0;i<(NPACKETS/nthreads)*NANTS*(384*2)*2;i++) { // loop over chars in proc_data - - fluffed_data[2*i] = ((proc_data[i]<<4) & 240) >> 4; - fluffed_data[2*i+1] = proc_data[i] >> 4; - - }*/ - - if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: fluffed",thread_id); - - // transpose antennas and frequencies by ints - // from fluffed_data to out_data - /* int * fluffed_int = (int *)(fluffed_data); - memset(out_data,0,(NPACKETS/nthreads)*(384*2)*na*2*2); - int * out_int = (int *)out_data;*/ - - if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to transpose",thread_id); - - // do block transpose - https://codereview.stackexchange.com/questions/229876/fast-matrix-transpose - /* int tile_size = 3; // set by benchmarking - for (int i_packet=0;i_packetwrite) - memcpy (d->ipc->curbuf + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2); - else - memcpy(out + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2); - - if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: done - freeing",thread_id); - - // free stuff - free(proc_data); - //free(fluffed_data); - free(out_data); - - /* return 0 */ - int thread_result = 0; - pthread_exit((void *) &thread_result); - -} - - -// MAIN - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_reorder_raw", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - // TESTING and initialization - // threads - struct data args[16]; - pthread_t threads[16]; - pthread_attr_t attr; - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - void* result=0; - - // run test with single thread - - syslog(LOG_INFO,"Running 
TEST...\n"); - - // set up data structure - char * test_block = (char *)malloc(sizeof(char)*NPACKETS*NANTS*(384*2)*2); - char * test_output = (char *)malloc(sizeof(char)*NPACKETS*64*(384*2)*2*2); - memset(test_block,0,sizeof(test_block)); - - /* TEST CODE - FILE *fin; - fin=fopen("../utils/packet.out","rb"); - fread(test_block, 96768, 1, fin); - fclose(fin); - END TEST CODE */ - - args[0].in = test_block; - args[0].out = test_output; - args[0].n_threads = 1; - args[0].thread_id = 0; - args[0].debug = 0; - args[0].write = 0; - - // run test thread - if (pthread_create(&threads[0], &attr, &massage, (void *)(&args[0]))) { - syslog(LOG_ERR,"Failed to create TEST massage thread 0\n"); - } - else - syslog(LOG_INFO,"Created TEST thread\n"); - pthread_attr_destroy(&attr); - pthread_join(threads[0], &result); - syslog(LOG_INFO,"joined TEST thread"); - - /* TEST CODE - fin=fopen("../utils/test.out","wb"); - fwrite(test_output, 1, 196608, fin); - fclose(fin); - END TEST CODE */ - - // clean up - free(test_block); - free(test_output); - - syslog(LOG_INFO,"TEST COMPLETE"); - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - dada_hdu_t* hdu_out = 0; - dada_hdu_t* hdu_out2 = 0; - - // data block HDU keys - key_t in_key = CAPTURED_BLOCK_KEY; - key_t out_key = REORDER_BLOCK_KEY; - key_t out_key2 = REORDER_BLOCK_KEY2; - - // command line arguments - int core = -1; - int nthreads = 1; - int bf = 0; - int arg = 0; - - while ((arg=getopt(argc,argv,"c:t:i:o:dbqh")) != -1) - { - switch (arg) - { - case 'i': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag 
requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 't': - if (optarg) - { - nthreads = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-t flag requires argument"); - usage(); - return EXIT_FAILURE; - } - - case 'd': - DEBUG=1; - syslog (LOG_INFO, "Will excrete all debug messages"); - break; - - case 'q': - syslog (LOG_INFO, "Quit here"); - return EXIT_SUCCESS; - - case 'b': - bf=1; - syslog (LOG_INFO, "Will write to bf dada hdu"); - break; - - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - - // Bind to cpu core - if (core >= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - if (bf) { - hdu_out2 = dada_hdu_create (); - dada_hdu_set_key (hdu_out2, out_key2); - if (dada_hdu_connect (hdu_out2) < 0) { - syslog (LOG_ERR,"could not connect to output buffer2"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out2) < 0) { - syslog (LOG_ERR, "could not lock to output buffer2"); - return EXIT_FAILURE; - } - } - - uint64_t header_size = 0; - - // deal with headers - char * 
header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - - - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - - if (bf) { - header_out = ipcbuf_get_next_write (hdu_out2->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header2 block [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block2 filled [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, 
hdu_out2); - return EXIT_FAILURE; - } - } - - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); - uint64_t bytes_read = 0; - char * block, * output_buffer, * blockie; - output_buffer = (char *)malloc(sizeof(char)*block_out); - memset(output_buffer,0,block_out); - uint64_t written, block_id; - - // set up - - int observation_complete=0; - int blocks = 0; - int started = 0; - - - - syslog(LOG_INFO, "starting observation"); - - while (!observation_complete) { - - // open block - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - - if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - } - - // DO STUFF - - // sort out write - hdu_out->data_block->curbuf = ipcbuf_get_next_write ((ipcbuf_t*)hdu_out->data_block); - hdu_out->data_block->marked_filled = 0; - //blockie = ipcio_open_block_write(hdu_out->data_block, &block_id); - - // set up data structure - for (int i=0; idata_block; - args[i].write = 1; - } - - if (DEBUG) syslog(LOG_INFO,"creating %d threads",nthreads); - - for(int i=0; idata_block, output_buffer, block_out); - - if (bf) { - - written = ipcio_write (hdu_out2->data_block, output_buffer, block_out); - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - - } - - // finish write - ipcbuf_mark_filled ((ipcbuf_t*)hdu_out->data_block, block_out); - ipcio_check_pending_sod (hdu_out->data_block); - hdu_out->data_block->marked_filled = 1; - 
//ipcio_close_block_write(hdu_out->data_block, block_out); - - if (DEBUG) syslog(LOG_INFO, "written block %d",blocks); - blocks++; - - - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - free(output_buffer); - - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - -} - - diff --git a/src/dsaX_simplesplit.c b/src/dsaX_simplesplit.c deleted file mode 100644 index 7a80c7e..0000000 --- a/src/dsaX_simplesplit.c +++ /dev/null @@ -1,362 +0,0 @@ -/* will reorder raw data for input to xgpu */ -#define __USE_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - -/* global variables */ -int DEBUG = 0; - - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write) -{ - - if (write==0) { - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - - } - - if (write==1) { - - if (dada_hdu_unlock_write (in) < 0) - { - syslog(LOG_ERR, "could not unlock write on hdu_in"); - } - dada_hdu_destroy (in); - - } - -} - -void usage() -{ - fprintf (stdout, - "dsaX_split [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -b connect to bf hdu\n" - " -i in_key [default CAPTURE_BLOCK_KEY]\n" - " -o out_key [default CAPTURED_BLOCK_KEY]\n" - " -j out_key2 [default 
REORDER_BLOCK_KEY2]\n" - " -h print usage\n"); -} - - -// MAIN - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_simplesplit", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - dada_hdu_t* hdu_out = 0; - dada_hdu_t* hdu_out2 = 0; - - // data block HDU keys - key_t in_key = CAPTURE_BLOCK_KEY; - key_t out_key = CAPTURED_BLOCK_KEY; - key_t out_key2 = REORDER_BLOCK_KEY2; - - // command line arguments - int core = -1; - int bf = 0; - int arg = 0; - - while ((arg=getopt(argc,argv,"c:i:o:j:dbh")) != -1) - { - switch (arg) - { - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'i': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'j': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key2) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-j flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'd': - DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - case 'b': - bf=1; - syslog (LOG_INFO, "Will write to bf dada hdu"); - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - - // Bind to cpu core - if (core >= 0) - { - if (dada_bind_thread_to_core(core) < 0) - 
syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (0); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - if (bf) { - hdu_out2 = dada_hdu_create (0); - dada_hdu_set_key (hdu_out2, out_key2); - if (dada_hdu_connect (hdu_out2) < 0) { - syslog (LOG_ERR,"could not connect to output buffer2"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out2) < 0) { - syslog (LOG_ERR, "could not lock to output buffer2"); - return EXIT_FAILURE; - } - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - - - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) 
dsaX_dbgpu_cleanup (hdu_out2,1); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - return EXIT_FAILURE; - } - - if (bf) { - header_out = ipcbuf_get_next_write (hdu_out2->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header2 block [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block2 filled [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - return EXIT_FAILURE; - } - } - - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); - uint64_t bytes_read = 0; - char * block, * output_buffer, * o1, * o2; - output_buffer = (char *)malloc(sizeof(char)*block_out); - char * output = (char *)malloc(sizeof(char)*block_out); - memset(output_buffer,0,block_out); - uint64_t written, block_id; - - - - // set up - - int observation_complete=0; - int blocks = 0; - int started = 0; - - - - syslog(LOG_INFO, "starting observation"); - - while (!observation_complete) { - - // open block - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - - if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - } - - 
- // DO STUFF - - - // copy to output buffer - memcpy(output_buffer, block, block_size); - - // do write - written = ipcio_write (hdu_out->data_block, output_buffer, block_out); - if (bf) - written = ipcio_write (hdu_out2->data_block, output_buffer, block_out); - - if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks); - blocks++; - - - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - free(output_buffer); - free(output); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - -} - - diff --git a/src/dsaX_splice.c b/src/dsaX_splice.c deleted file mode 100644 index b91e665..0000000 --- a/src/dsaX_splice.c +++ /dev/null @@ -1,201 +0,0 @@ -/* This works pretty much like the trigger code. receives a control UDP message -to store some data for a fixed amount of time. -Message format: length(s)-NAME -Will ignore messages until data recording is over -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "multilog.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - -#include -#include - - -FILE *output; - -void send_string(char *string) /* includefile */ -{ - int len; - len=strlen(string); - fwrite(&len, sizeof(int), 1, output); - fwrite(string, sizeof(char), len, output); -} - -void send_float(char *name,float floating_point) /* includefile */ -{ - send_string(name); - fwrite(&floating_point,sizeof(float),1,output); -} - -void send_double (char *name, double double_precision) /* includefile */ -{ - send_string(name); - fwrite(&double_precision,sizeof(double),1,output); -} - -void send_int(char *name, int integer) /* includefile */ -{ - 
send_string(name); - fwrite(&integer,sizeof(int),1,output); -} - -void send_char(char *name, char integer) /* includefile */ -{ - send_string(name); - fwrite(&integer,sizeof(char),1,output); -} - - -void send_long(char *name, long integer) /* includefile */ -{ - send_string(name); - fwrite(&integer,sizeof(long),1,output); -} - -void send_coords(double raj, double dej, double az, double za) /*includefile*/ -{ - if ((raj != 0.0) || (raj != -1.0)) send_double("src_raj",raj); - if ((dej != 0.0) || (dej != -1.0)) send_double("src_dej",dej); - if ((az != 0.0) || (az != -1.0)) send_double("az_start",az); - if ((za != 0.0) || (za != -1.0)) send_double("za_start",za); -} - - -/* global variables */ -int quit_threads = 0; -int dump_pending = 0; -int trignum = 0; -int dumpnum = 0; -char iP[100]; -char srcnam[1024]; -float reclen; -int DEBUG = 0; - -void usage() -{ - fprintf (stdout, "dsaX_splice [16 files]\n"); -} - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_splice", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - // set up input array - // 16 corrs, 3840 times, 256 beams, 48 chans - char * bigarr = (char *)malloc(sizeof(char)*16*3840*256*48); - char foutnam[200]; - - // read into input array - FILE *fin; - for (int i=1;i<17;i++) { - fin=fopen(argv[i],"rb"); - fread(bigarr+(i-1)*3840*256*48,3840*256*48,1,fin); - fclose(fin); - } - - // reorder bigarr - char * tarr = (char *)malloc(sizeof(char)*16*3840*256*48); - int oidx, iidx; - // order is beam, time, freq - for (int i=0;i<16;i++) { - for (int j=0;j<3840;j++) { - for (int k=0;k<256;k++) { - - iidx = i*3840*256*48 + j*256*48 + k*48; - oidx = k*3840*768 + j*768 + i*48; - memcpy(tarr + oidx, bigarr + iidx, 48); - - } - } - } - free(bigarr); - - // loop over beams and write out all filterbanks - for (int i=0;i<256;i++) { - - sprintf(foutnam,"/home/ubuntu/data/fb_%d.fil",i); - - if (!(output = 
fopen(foutnam,"wb"))) { - printf("Couldn't open output file\n"); - return 0; - } - - send_string("HEADER_START"); - send_string("source_name"); - sprintf(srcnam,"fb_%d",i); - send_string(srcnam); - send_int("machine_id",1); - send_int("telescope_id",82); - send_int("data_type",1); // filterbank data - send_double("fch1",1498.75); // THIS IS CHANNEL 0 :) - send_double("foff",-0.244140625); - send_int("nchans",768); - send_int("nbits",8); - send_double("tstart",55000.0); - send_double("tsamp",8.192e-6*8.*16.); - send_int("nifs",1); - send_string("HEADER_END"); - - fwrite(tarr + i*2949120,2949120,1,output); - fclose(output); - - } - - // write out full filterbank - sprintf(foutnam,"/home/ubuntu/data/fb_all.fil"); - - if (!(output = fopen(foutnam,"wb"))) { - printf("Couldn't open output file\n"); - return 0; - } - - send_string("HEADER_START"); - send_string("source_name"); - sprintf(srcnam,"fb_all"); - send_string(srcnam); - send_int("machine_id",1); - send_int("telescope_id",82); - send_int("data_type",1); // filterbank data - send_double("fch1",1498.75); // THIS IS CHANNEL 0 :) - send_double("foff",-0.244140625); - send_int("nchans",768); - send_int("nbits",8); - send_double("tstart",55000.0); - send_double("tsamp",8.192e-6*8.*16.); - send_int("nifs",1); - send_string("HEADER_END"); - - fwrite(tarr,16*3840*256*48,1,output); - fclose(output); - - - free(tarr); - -} diff --git a/src/dsaX_split.c b/src/dsaX_split.c deleted file mode 100644 index 1361e86..0000000 --- a/src/dsaX_split.c +++ /dev/null @@ -1,601 +0,0 @@ -/* will reorder raw data for input to xgpu */ -#define __USE_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" 
-#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - -/* global variables */ -int DEBUG = 0; -int STATS = 0; -const int nth = 4; - -// data to pass to threads -struct data { - char * in; - char * out; - char * out2; - int bf; - int reorder; - int n_threads; - int thread_id; -}; -int cores[8] = {10, 11, 12, 13, 14, 15, 16, 17}; - - -void * massage (void *args) { - - struct data *d = args; - int thread_id = d->thread_id; - - // set affinity - const pthread_t pid = pthread_self(); - const int core_id = cores[thread_id]; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); - const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); - if (CPU_ISSET(core_id, &cpuset)) - if (DEBUG) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id); - - // extract from input - char *in = (char *)d->in; - int bf = d->bf; - int reorder = d->reorder; - int n_threads = d->n_threads; - - if (!reorder) { - memcpy(d->out + thread_id*(2048/n_threads)*1536*NANT, in + thread_id*(2048/n_threads)*1536*NANT, (2048/n_threads)*1536*NANT); - if (bf) - memcpy(d->out2 + thread_id*(2048/n_threads)*1536*NANT, in + thread_id*(2048/n_threads)*1536*NANT, (2048/n_threads)*1536*NANT); - } - else { - - // block for transpose - int block = 16; - - for (int i=(int)(thread_id*(2048/n_threads));i<(int)((thread_id + 1)*2048/n_threads);i++) { // over time - for (int i1 = 0; i1 < 48; i1 += block) { - for(int j = 0; j < NANT; j++) { - for(int b = 0; b < block && i1 + b < 48; b++) { - memcpy(d->out + i*1536*NANT + (i1+b)*NANT*32 + j*32, in + i*1536*NANT + j*1536 + (i1+b)*32, 32); - if (bf) memcpy(d->out2 + i*1536*NANT + (i1+b)*NANT*32 + j*32, in + i*1536*NANT + j*1536 + (i1+b)*32, 32); - } - } 
- } - } - - } - - /* return 0 */ - int thread_result = 0; - pthread_exit((void *) &thread_result); - -} - - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); -int dada_bind_thread_to_core (int core); -void reorder_block(char *block, char *output); -void calc_stats(char *block); - -// calculates rms for each pol from the first packet in each block. -// block has shape [2048 time, NANT antennas, 768 channels, 2 pol, r/i] -void calc_stats(char *input) { - - float rmss[NANT*2]; - int iidx; - for (int i=0;i> 4),2.); - rmss[ant*2+pol] += pow((float)(((char)((input[iidx] & 240))) >> 4),2.); - - } - } - } - - for (int i=0;i= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (0); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - if (bf) { - hdu_out2 = dada_hdu_create (0); - dada_hdu_set_key (hdu_out2, out_key2); - if (dada_hdu_connect (hdu_out2) < 0) { - syslog (LOG_ERR,"could not connect to output buffer2"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out2) < 0) { - syslog (LOG_ERR, "could not lock to output buffer2"); - return EXIT_FAILURE; - } - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - 
syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - - - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - return EXIT_FAILURE; - } - - if (bf) { - header_out = ipcbuf_get_next_write (hdu_out2->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header2 block [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block2 filled [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - return EXIT_FAILURE; - } - } - - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - uint64_t nints = 
block_size / block_out; - syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); - uint64_t bytes_read = 0; - char * block, * output_buffer, * o1, * o2; - output_buffer = (char *)malloc(sizeof(char)*block_out); - char * output = (char *)malloc(sizeof(char)*block_out); - memset(output_buffer,0,block_out); - uint64_t written, block_id; - - // set up threads - struct data args[8]; - pthread_t threads[8]; - pthread_attr_t attr; - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - void* result=0; - - // send through fake blocks - - if (fake>0) { - syslog(LOG_INFO,"sending %d fake blocks",fake); - for (int i=0;idata_block, &block_id); - memcpy(o1, output, block_out); - ipcio_close_block_write (hdu_out->data_block, block_out); - usleep(10000); - } - syslog(LOG_INFO,"Finished with fake blocks"); - } - - - - // set up - - int observation_complete=0; - int blocks = 0; - int started = 0; - - - - syslog(LOG_INFO, "starting observation"); - - while (!observation_complete) { - - // open block - - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - - if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - } - - - // DO STUFF - - for (int myint=0;myintdata_block, &block_id); - if (bf) o2 = ipcio_open_block_write (hdu_out2->data_block, &block_id); - } - - // stats - if (STATS) calc_stats(output_buffer); - - //if (reorder) { - - // set up data structure - for (int i=0; idata_block, output, block_out); - else - written = ipcio_write (hdu_out->data_block, output_buffer, block_out); - - if (bf) { - written = ipcio_write (hdu_out->data_block, output_buffer, block_out); - if (reorder) - written = ipcio_write (hdu_out2->data_block, output, block_out); - else - written = ipcio_write (hdu_out2->data_block, output_buffer, block_out); - } - } - else { - ipcio_close_block_write (hdu_out->data_block, block_out); - if (bf) ipcio_close_block_write (hdu_out2->data_block, 
block_out); - } - - if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks); - blocks++; - - - if (bytes_read < block_size) - observation_complete = 1; - - } - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - free(output_buffer); - free(output); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - -} - - diff --git a/src/dsaX_splitup.c b/src/dsaX_splitup.c deleted file mode 100644 index 32f055d..0000000 --- a/src/dsaX_splitup.c +++ /dev/null @@ -1,285 +0,0 @@ -/* will reorder raw data for input to xgpu */ -#define __USE_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - -// global variables -int DEBUG = 0; - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) -{ - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - - if (dada_hdu_unlock_write (out) < 0) - { - syslog(LOG_ERR, "could not unlock write on hdu_out"); - } - dada_hdu_destroy (out); - -} - -void usage() -{ - fprintf (stdout, - "dsaX_fake [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -i in_key [default TEST_BLOCK_KEY]\n" - " -o out_key [default REORDER_BLOCK_KEY2]\n" - " -h print usage\n"); -} - -// MAIN - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_splitup", LOG_CONS | LOG_PID | LOG_NDELAY, 
LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - dada_hdu_t* hdu_out = 0; - - // data block HDU keys - key_t in_key = TEST_BLOCK_KEY; - key_t out_key = REORDER_BLOCK_KEY2; - - // command line arguments - int core = -1; - int useZ = 1; - char fnam[100]; - int arg = 0; - - while ((arg=getopt(argc,argv,"c:f:i:o:dh")) != -1) - { - switch (arg) - { - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'i': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'd': - DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - // Bind to cpu core - if (core >= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (0); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog 
(LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); - uint64_t bytes_read = 0; - uint64_t nsplits = block_size/block_out; - char * block, * output_buffer; - output_buffer = (char *)malloc(sizeof(char)*block_out); - memset(output_buffer,0,block_out); - uint64_t written, block_id; - - - // set up - - int observation_complete=0; - int blocks = 0, started = 0; - - syslog(LOG_INFO, "starting observation"); - - while (!observation_complete) { - - // open block - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - 
- if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - } - - // DO STUFF - - // do multiple writes - - for (uint64_t i=0;idata_block, output_buffer, block_out); - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - if (DEBUG) { - syslog(LOG_DEBUG, "written block %d",blocks); - } - blocks++; - - } - - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - free(output_buffer); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - -} - - diff --git a/src/dsaX_store.c b/src/dsaX_store.c deleted file mode 100644 index 849c27c..0000000 --- a/src/dsaX_store.c +++ /dev/null @@ -1,218 +0,0 @@ -/* Code to read from a raw data buffer and write to disk */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - -void dsaX_dbgpu_cleanup (dada_hdu_t * in); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * in) -{ - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - -} - -void usage() -{ - fprintf (stdout, - "dsaX_dbdisk [options]\n" - " -c core bind process to CPU core\n" - " -k in_key [default fafa]\n" - " -h print usage\n"); -} - - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_store", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - - // 
input data block HDU key - key_t in_key = 0x0000fafa; - - // command line arguments - uint64_t blocksize; - uint64_t bout = 32*NSNAPS*4608; // output block size - assume input is a multiple. - int core = -1; - int arg=0; - - while ((arg=getopt(argc,argv,"c:k:h")) != -1) - { - switch (arg) - { - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - printf ("ERROR: -c flag requires argument\n"); - return EXIT_FAILURE; - } - case 'k': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-k flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - // DADA stuff - - // open connection to the in/read DB - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to input buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"dsaX_correlator_copy: could not lock to input buffer"); - return EXIT_FAILURE; - } - - // Bind to cpu core - if (core >= 0) - { - syslog(LOG_INFO,"binding to core %d", core); - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"dsaX_correlator_copy: failed to bind to core %d",core); - } - - // more DADA stuff - deal with headers - - uint64_t header_size = 0; - - // read the header from the input HDU - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "main: could not read next header"); - dsaX_dbgpu_cleanup (hdu_in); - return EXIT_FAILURE; - } - - // mark the input header as cleared - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared [input]"); - dsaX_dbgpu_cleanup (hdu_in); - return EXIT_FAILURE; - } - - int observation_complete=0; - - // stuff for writing data - 
blocksize = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - char * cpbuf = (char *)malloc(sizeof(char)*blocksize); - char * outbuf = (char *)malloc(sizeof(char)*bout); - int ngulps = (int)(blocksize/bout); - int gulp = 0, wseq = 0;; - char *in_data; - uint64_t written=0, written2=0; - uint64_t block_id, bytes_read=0; - FILE *fout; - char fnam[100]; - - - syslog(LOG_INFO, "have ngulps %d, blocksize %lu, bout %lu",ngulps,blocksize,bout); - - - // main reading loop - - syslog(LOG_INFO, "main: starting read"); - - while (!observation_complete) { - - // read a DADA block - in_data = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - // copy - memcpy(cpbuf, in_data, blocksize); - syslog(LOG_INFO, "starting new write (seq %d)",wseq); - - // open file for writing - sprintf(fnam,"/home/ubuntu/data/fl_%d.out",wseq); - fout = fopen(fnam,"wb"); - for (gulp=0;gulpdata_block, bytes_read); - - } - - free(cpbuf); - free(outbuf); - dsaX_dbgpu_cleanup (hdu_in); - -} - diff --git a/src/dsaX_testdada.c b/src/dsaX_testdada.c deleted file mode 100644 index bbe7640..0000000 --- a/src/dsaX_testdada.c +++ /dev/null @@ -1,161 +0,0 @@ -/* will reorder raw data for input to xgpu */ -#define __USE_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" -#include "xgpu.h" - -// print fn -void print_arr(char *ptr, int len) { - printf("\n["); - for (int i = 0; i < len; i++) { - printf(" %08x,", ptr[i]); - } - printf(" ]\n"); -} - -// read and write functions - -int write_block(dada_hdu_t* hdu_in) { - - dada_hdu_lock_write(hdu_in); - uint64_t block_size = 
ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - char * data = (char *)malloc(sizeof(char)*block_size); - memset(data, 0, block_size); - ipcio_write (hdu_in->data_block, data, block_size); - free(data); - dada_hdu_unlock_write (hdu_in); - -} - -int read_block(dada_hdu_t* hdu_in) { - - dada_hdu_lock_read(hdu_in); - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - char * data = (char *)malloc(sizeof(char)*block_size); - char * block; - uint64_t bytes_read, block_id; - - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - memcpy(data, block, bytes_read); - print_arr(data, (int)(bytes_read)); - - free(data); - ipcio_close_block_read (hdu_in->data_block, bytes_read); - dada_hdu_unlock_read (hdu_in); - -} - - - -// MAIN - -int main (int argc, char *argv[]) { - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - - // data block HDU keys - key_t in_key = TEST_BLOCK_KEY; - - // command line arguments - int arg = 0; - char *hout; - hout = (char *)malloc(sizeof(char)*4096); - - - while ((arg=getopt(argc,argv,"i:h:")) != -1) - { - switch (arg) - { - case 'i': - if (optarg) - { - sscanf (optarg, "%x", &in_key); - break; - } - case 'h': - if (optarg) - { - fileread (optarg, hout, 4096); - break; - } - } - } - - // DADA stuff - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - dada_hdu_connect (hdu_in); - - /* - // deal with header - dada_hdu_lock_write(hdu_in); - char * header_out = ipcbuf_get_next_write (hdu_in->header_block); - memcpy (header_out, hout, 4096); - ipcbuf_mark_filled (hdu_in->header_block, 4096); - dada_hdu_unlock_write(hdu_in); - free(hout); - - dada_hdu_lock_read(hdu_in); - uint64_t header_size = 0; - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - ipcbuf_mark_cleared (hdu_in->header_block); - dada_hdu_unlock_read(hdu_in); - */ - - // do four reads and four writes - - while (1) { - - printf("writing four blocks... 
"); - for (int i=0;i<4;i++) { - write_block(hdu_in); - sleep(0.5); - } - printf("written\n"); - - sleep(2); - - printf("reading four blocks... "); - for (int i=0;i<4;i++) { - read_block(hdu_in); - sleep(0.5); - } - printf("read\n"); - - } - -} - - diff --git a/src/dsaX_trigger.c b/src/dsaX_trigger.c deleted file mode 100644 index 9592389..0000000 --- a/src/dsaX_trigger.c +++ /dev/null @@ -1,585 +0,0 @@ -/* Code to read from a single dada buffer, and write to disk upon receiving -a trigger. Uses pthread threads and shared memory to listen. -Sequence of events: - - starts null-reading dump buffer, while listening for socket command - + for N second dump, assume N-second dada blocks - - receives time-since-start, which is converted into a block_start, byte_start, and block_end and byte_end. Sets dump pending, during which time no commands can be accepted. - - Upon seeing dump_pending, read code copies data to output dada buffer, which is plugged into dbdisk. Unsets dump_pending. -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "dsaX_capture.h" -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_def.h" - -// data to pass to threads -struct cdata { - char * in; - dada_hdu_t * hdu_out; -}; - - -/* global variables */ -int quit_threads = 0; -int dump_pending = 0; -uint64_t specnum = 0; -uint64_t procnum = 0; -int trignum = 0; -int dumpnum = 0; -char iP[100]; -char footer_buf[1024]; -int DEBUG = 0; -volatile int docopy = 0; -volatile int dumping = 0; - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) -{ - - if (dada_hdu_unlock_read (in) < 0) 
- { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - - if (dada_hdu_unlock_write (out) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_out"); - } - dada_hdu_destroy (out); - - - -} - -void usage() -{ - fprintf (stdout, - "dsaX_correlator_trigger [options]\n" - " -c core bind process to CPU core\n" - " -i IP to listen to [no default]\n" - " -j in_key [default eaea]\n" - " -o out_key [default fafa]\n" - " -d debug\n" - " -f full_pct [default 0.8]\n" - " -n output file name [no default]\n" - " -s skip N blocks [default 0]\n" - " -h print usage\n"); -} - -// thread to control writing of data to buffer - -void copy_thread (void * arg) { - - struct cdata *d = arg; - char *in = (char *)d->in; - dada_hdu_t * hdu_out = (dada_hdu_t *)d->hdu_out; - - uint64_t written = 0; - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO,"in thread... blocksize %"PRIu64"",block_size); - - while (1) { - - while (docopy==0) usleep(100); - - written = ipcio_write (hdu_out->data_block, in, block_size); - - dumping = 0; - dump_pending = 0; - docopy=0; - - syslog(LOG_INFO,"Finished writing trigger"); - - } - - /* return 0 */ - int thread_result = 0; - pthread_exit((void *) &thread_result); - - -} - -// Thread to control the dumping of data - -void control_thread (void * arg) { - - udpdb_t * ctx = (udpdb_t *) arg; - syslog(LOG_INFO, "control_thread: starting"); - - // port on which to listen for control commands - int port = ctx->control_port; - - // buffer for incoming command strings, and setup of socket - int bufsize = 1024; - char* buffer = (char *) malloc (sizeof(char) * bufsize); - char* tbuf = (char *) malloc (sizeof(char) * bufsize); - memset(buffer, '\0', bufsize); - const char* whitespace = " "; - char * command = 0; - char * args = 0; - - struct addrinfo hints; - struct addrinfo* res=0; - memset(&hints,0,sizeof(hints)); - struct sockaddr_storage src_addr; - socklen_t src_addr_len=sizeof(src_addr); - 
hints.ai_family=AF_INET; - hints.ai_socktype=SOCK_DGRAM; - getaddrinfo(iP,"11227",&hints,&res); - int fd; - ssize_t ct; - char tmpstr; - char cmpstr = 'p'; - char *endptr; - uint64_t tmps; - char * token; - - syslog(LOG_INFO, "control_thread: created socket on port %d", port); - - while (!quit_threads) { - - fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol); - bind(fd,res->ai_addr,res->ai_addrlen); - memset(buffer,'\0',sizeof(buffer)); - syslog(LOG_INFO, "control_thread: waiting for packet"); - ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len); - - syslog(LOG_INFO, "control_thread: received buffer string %s",buffer); - memset(tbuf,0,bufsize); - strcpy(tbuf,buffer); - trignum++; - - // interpret buffer string - char * rest = buffer; - tmps = (uint64_t)(strtoull(strtok_r(rest, "-", &rest),&endptr,0)); - - if (!dump_pending) { - //specnum = (uint64_t)(strtoull(buffer,&endptr,0)*16); - specnum = tmps; - strcpy(footer_buf,tbuf); - syslog(LOG_INFO, "control_thread: received command to dump at %lu",specnum); - } - - if (dump_pending) - syslog(LOG_ERR, "control_thread: BACKED UP - CANNOT dump at %lu",tmps); - - if (!dump_pending) dump_pending = 1; - - close(fd); - - } - - free (buffer); - free (tbuf); - - if (ctx->verbose) - syslog(LOG_INFO, "control_thread: exiting"); - - /* return 0 */ - int thread_result = 0; - pthread_exit((void *) &thread_result); - -} - - - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_trigger", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - dada_hdu_t* hdu_out = 0; - - /* port for control commands */ - int control_port = TRIGGER_CONTROL_PORT; - - /* actual struct with info */ - udpdb_t udpdb; - - // input data block HDU key - key_t in_key = 0x0000eaea; - key_t out_key = 0x0000fafa; - - // command line arguments - int core = -1; 
- float full_pct = 0.8; - int arg=0; - int skips = 0; - - while ((arg=getopt(argc,argv,"i:c:j:o:f:d:s:h")) != -1) - { - switch (arg) - { - case 'i': - strcpy(iP,optarg); - break; - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog (LOG_ERR,"ERROR: -c flag requires argument\n"); - return EXIT_FAILURE; - } - case 'f': - if (optarg) - { - full_pct = atof(optarg); - syslog(LOG_INFO,"Using full_pct %f",full_pct); - break; - } - else - { - syslog (LOG_ERR,"ERROR: -f flag requires argument\n"); - return EXIT_FAILURE; - } - case 's': - if (optarg) - { - skips = atoi(optarg); - break; - } - else - { - syslog (LOG_ERR,"ERROR: -s flag requires argument\n"); - return EXIT_FAILURE; - } - case 'd': - DEBUG=1; - syslog (LOG_INFO, "Will excrete all debug messages"); - break; - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'j': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-j flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - // DADA stuff - - udpdb.verbose = DEBUG; - udpdb.control_port = control_port; - - // start control thread - int rval = 0; - pthread_t control_thread_id; - syslog(LOG_INFO, "starting control_thread()"); - rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb); - if (rval != 0) { - syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval)); - return -1; - } - - - syslog (LOG_INFO, "creating hdus"); - - // open connection to the in/read DBs - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { 
- syslog (LOG_ERR,"could not connect to dada buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (0); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output dada buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - syslog (LOG_ERR,"could not lock4 to eada buffer"); - return EXIT_FAILURE; - } - - // Bind to cpu core - if (core >= 0) - { - syslog(LOG_INFO,"binding to core %d", core); - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - } - - int observation_complete=0; - - // more DADA stuff - deal with headers - - uint64_t header_size = 0; - - // read the header from the input HDU - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "main: could not read next header"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - // now write the output DADA header - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - // copy the in header to the out header - memcpy (header_out, header_in, header_size); - - // mark the input header as cleared - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared [input]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - // mark the output header buffer as filled - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - // stuff for writing data - 
uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - uint64_t specs_per_block = 2048; - uint64_t specs_per_out = 2048*NOUTBLOCKS; - uint64_t current_specnum = 0; // updates with each dada block read - uint64_t start_byte, bytes_to_copy, bytes_copied=0; - char * out_data = (char *)malloc(sizeof(char)*block_out); - char * in_data; - uint64_t written=0; - uint64_t block_id, bytes_read=0; - FILE *ofile; - ofile = fopen("/home/ubuntu/data/dumps.dat","w"); - fprintf(ofile,"starting...\n"); - fclose(ofile); - - - // thread for copying data - struct cdata cstruct; - cstruct.in = out_data; - cstruct.hdu_out = hdu_out; - rval = 0; - pthread_t copy_thread_id; - syslog(LOG_INFO, "starting copy_thread()"); - rval = pthread_create (©_thread_id, 0, (void *) copy_thread, (void *) &cstruct); - if (rval != 0) { - syslog(LOG_ERR, "Error creating copy_thread: %s", strerror(rval)); - return -1; - } - - - // main reading loop - float pc_full = 0.; - int block_count = 0; - syslog(LOG_INFO, "main: starting observation"); - - while (!observation_complete) { - - // read a DADA block - in_data = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - - // add delay - // only proceed if input data block is 80% full - while (pc_full < full_pct) { - pc_full = ipcio_percent_full(hdu_in->data_block); - usleep(100); - } - pc_full = 0.; - - - // check for dump_pending - if (dump_pending) { - - // look after hand trigger - if (specnum==0) { - - specnum = current_specnum + 100; - - } - - // if this is the first block to dump - if (specnum >= current_specnum && specnum < current_specnum+specs_per_block) { - - dumping = 1; - - // find start byte and bytes to copy - start_byte = 4608*NSNAPS*(specnum-current_specnum); - bytes_to_copy = block_size-start_byte; - - // do copy - memcpy(out_data, in_data+start_byte, bytes_to_copy); - //written = ipcio_write (hdu_out->data_block, in_data+start_byte, 
bytes_to_copy); - bytes_copied = bytes_to_copy; - - } - - // if this is one of the middle blocks to dump from - if (specnum < current_specnum && specnum + specs_per_out > current_specnum + specs_per_block && dumping==1) { - - // do copy - memcpy(out_data + bytes_copied, in_data, block_size); - //written = ipcio_write (hdu_out->data_block, in_data, block_size); - bytes_copied += block_size; - - } - - // if this is the last block to dump from - if (specnum + specs_per_out > current_specnum && specnum + specs_per_out <= current_specnum + specs_per_block && dumping==1) { - - // find start byte and bytes to copy - bytes_to_copy = block_out-bytes_copied; - - // do copy - memcpy(out_data+bytes_copied, in_data, bytes_to_copy); - //written = ipcio_write (hdu_out->data_block, in_data, bytes_to_copy); - - // DO THE WRITING - /*written = ipcio_write (hdu_out->data_block, out_data, block_out); - - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - */ - - // DO writing using thread - docopy = 1; - - syslog(LOG_INFO, "written trigger from specnum %lu TRIGNUM%d DUMPNUM%d %s", specnum, trignum-1, dumpnum, footer_buf); - ofile = fopen("/home/ubuntu/data/dumps.dat","a"); - fprintf(ofile,"written trigger from specnum %lu TRIGNUM%d DUMPNUM%d %s\n", specnum, trignum-1, dumpnum, footer_buf); - fclose(ofile); - - dumpnum++; - - // reset - bytes_copied = 0; - - } - - // if trigger arrived too late - if (specnum < current_specnum-specs_per_block && dumping==0 && dump_pending==1) { - syslog(LOG_INFO, "trigger arrived too late: specnum %lu, current_specnum %lu",specnum,current_specnum); - - bytes_copied=0; - dump_pending=0; - - } - - - } - - // update current spec - syslog(LOG_INFO,"current_specnum %lu",current_specnum); - if (block_count < skips) { - block_count++; - } - else - current_specnum += specs_per_block; - - - // for exiting - if (bytes_read < block_size) { - 
observation_complete = 1; - syslog(LOG_INFO, "main: finished, with bytes_read %lu < expected %lu\n", bytes_read, block_size); - } - - // close block for reading - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - - } - - - // close threads - syslog(LOG_INFO, "joining control_thread"); - quit_threads = 1; - void* result=0; - pthread_join (control_thread_id, &result); - result=0; - pthread_join (copy_thread_id, &result); - - free(out_data); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - -} diff --git a/src/dsaX_wrangle b/src/dsaX_wrangle deleted file mode 100755 index f839b14c334758201c3b8885fb58a899eb6e804d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 99600 zcmeEvdtg-6@&C<>gaFxypixm*1q})q)L2l^L|NTvz!VAMgNBf7NHipAvVmZw21Af_ z-Ab)m>J#guzG_h`B1H)ZBwDMnT8&CID(bGG8WlBKbbp`EoO2%=vS{1yAHP3b*gf~m z%$YMYXU?40y*F2S3eQSRNHEM#l5v{BT-%`vlBW@wSJ*Up#w25~k!OrHjxq)TpNy|b z*FWvk#mqDfwEQID%W{(vncF)ZrND-nnzb`c(YS$<`~Ie&W@;F20k9lpP1_+^q;9z| z(<};uPnN~pEqRK^a#^}umM&+ehL+zpKb-~~KW0i=ev01rm-Bk{ddcpU zj?jc=YL@#c%2CcA`pMLUGj)CO?9#`E!pBU_b~wJeYW~S59ba8OzPhTWv1xqMq?5;= zeA0x5+6gC!-sDf(vu6~uxU{!vD#|=U!-ya9MfW=oWcI!=dG3h5509Gh@C(akf5B>} z;A^&6gah&aTO9hiap=?H(7zG~zA6rU zS{(S&R_Wy9G9>_mO#*uSg9QscIKe9h_hLExU}IN-9dL zs!2_FU8BFOvJ{1V)xI+DuUuH}Hz08}1T`A`_2rFqC|lal;H&rRg8q7cEz0}rt096_ z-6CBd1*@)IKs^34DQl=BUBi+Fz(p142GNOw0t84E8HIZ(c-w4|b{rnI{1ax`g)Hn#{3cH{%0mAOkw*m!8QQdnkTX;lqP zU0Y{B&$3!&c(!3hz0YTyU086&nI$JoIAKDrb$z0B^%Lvr#0e)^*Awl_lOW21g|#)h zvJ&`cqWt5`MqTkY8Q&xgH?N3K!K^bCDZl(T2;XE)X_xMQ#TlRzmL}yA0Mp$GDgPM; z=Vxww8xO7;%-PvF=&$@FphOu`bAGsgRn;)|uGzp7cA(@_jd?onKZxZA8JFt(4Zpq# zLZldrbbf@;CmYLjUe%Xq)o0GX&HmeV7^Og6=KNe`jb;r`>vjvoq&M?f{oqaORKd=- z6+JCuP3>{r<$&+6NaeTN0UzmrhaK<}9q?WU{3i~0p96lH18&AK%Ho8Z@sM!Nam`Q8 zTW+~b#V^~<{7iDdVOr~x=YZ>}yWqSIxVdH^qR0Vvt{>(&;H=yH%yYoaHXyFj0Z%uH zk=8ljLmcoX2Yjdl-t2(G@vP5E2Ry@q8pbLI+~t6;cEC+vCVGtn?i|nVbHJVB)>;Rg zYZmjf-T^^a=;IEz`GsrLmcq113tj2i;F%8i5e|5k1Ae3f?smYBa=^14@X-!Klt!@6qXs5SjYuYmwMKt&Fzq*a+m@Dw>Iy?_o%vTa0>5k$vrk;6* 
z#z?2+FJ+z~G14aa`OGsEM%GIHBIX$aBWol-lX-^S$STR7!#qP_q*?N3FwgD~sgwLE z%ro>w=1Kkp<}n5uks`?-%X~WXd6GYpd4|47j^qz!o*^&dmi%z$8R{aLl22!zAueJ_ zK9PBbwn*=H0F3$OIpi7ABHfbzjCqE#NT=jKVxA!^(kA(LndcA}Su6QBm}e-9tdacR zndeX!Sta@Bm}f|fG)sO9^9*H?I>~Qfo*^tUPx6m4&(IYqlKcbAGh{{bB>#Kn8LA>V zlE0aGhNy^J^4BxZ&=kp({58xU#=IfzQZhiF8W-Qsx7{RKbHAC$sfu5am?pP{$S=AA|h_d4`-gCA(AQibmkcnB8KD>nP(`7^zM=N ze-?R$fJnFGKVzOrqqBT5oXbL>SQ93MQ{6qnhBX(P`dL?ja}ykq-q?OJ9P2r8hWiFzG@glLey~ zGHiqkK(n4KdnQA^;M8|uP~o=}{0ak~^{jOr4!*-)>R|b)wC0AbAQu+lAy8prn>To^ zSgG85;!rW0tMyb!>s_{$R=s#`{w4Wy^M96K@(cvXUuNpVs57yxJ>?E?7M-4kaU-vT zV}Z;|x|-VXa07)EOw%01s~7_OCJ#8ralR%`G-1^g1g;Nej`<`o3C zlE;(C7NQYzOdefAd^d;*-ms87Tq{H58T>`27yeP{g*#M210y^eT>>M5H?R%0kA>8( zai zmUcrAS|I}rHW|h<2KA}~QzIONB%6%O6Hsg$$@9WX00FCcSAk<-oA5!Gp$Op@1T}4R z)3m@9qfY{lH1hBSzQQF#fH6Jr0hwg8OV#np&8u==dA*_Kqnkm)%%q;57DE24g5WRH z3IiV(2EGZOQ2@~=x4W+HNI>73dfIk3Y}|T3TPIi!X3$&pq!v`0P#E|iJaIa3rsd5sYg-fSJ+ic!B1RNBLmgaa1PVm1Rk83G zC!Xwp-;G*(*xQ?~G&`imya5cNNy}C@9BgX!6*1A|DNBwA=$oOD4Jh8T2^Y(Gu?`o{ zLH6l^-L~+kZ3TFu1=)3j5Kw7SsW^O#f~KzGbQe;9Q0@zWLx4|^gclLi+Ex%~cP+0# z+p^i=U%62r3j>TFqrYMe(eTDEqfu!F)k4EYL7ENa&{^b>ino1p5nMDppB8#ehJ%qE z;G+aqAus$7u#3E{khk?mt|#&VAcKXBG}TiMu?!Azj}fl1 zsB$j_l?)tCf`XQ<3E{CI+QhILzT*qHS<9aE#(N@Ns%ab1Jhr7HzmZ_yKCKnhuu@A~ zf_JKT+Ai(Qb74F>8xkXODS5z7dEukMxG?Y~+ouO^FtIK2S6BzCT?6yk{9$L>H?kDf zgzHhJV%5Zd6$D=TnK$sRw`FH<(X7Hy${S1AZo|qXnK}wB7ajTWX1B51b@()C(Xm_H zMzJ@Pas*QEGYDyBR`F|1vRTzEQc zANE3p?J2)QK7ryi8~A+w3|K-RlFyA)fI{U9Lmb~*cD7+25UuYcno}IkZVCe5=Fi0R z<_?afo{4|=2Ij734s%&=ps{2Ar{8*ONA6mR2J&oW6#^K42wDJ?7~`7>3~cd+ya^NA zOnoMA@n=grPwr^^+}qN3x+gSictT^>w!Xy|=U-$F>vIc2^ZTC351o`)Sk}1J8(i8E z8nq}tu%lq}2Z{NC*Yh{;99)?2RsQC8Qm2>gI4`vEHVQQ(lzbbjESwxYH{?$-8b%hP zAh18dejA|QxvqK}Xb*)k=09?UH#2#K$oz)mBF zeY4-$dk~xFB>&3WvOQC>g_E~8zPZe^wHXye?t-6helHCi)=mo*9s<^i4cZ-i25CNL z?}^9g4U^GDU}XBo)j7t{q=LYw-ev7M$-|PsXxL)(;)0O3voP?2H?+(!cJJ{9K2!8S z8{QNnEfKwW3q+jLZ0z|iy6u|_H>1)sdVy|E?)w&iInBuq#BT58&5h4^TQ+Awv$O&@ z$GtG6yI|R7wrt_#_QvZzm!5_BcHmoY0IstuE&NECj42m;F}|#sOTT+NT;>zlfm{|N 
zfAQSN3Sc1Ia{2~3;>aTwA#x?*S^@=P#OluwJ1p`GNN)9K(7)8!FQ!z2z9*>kaANH99`*Z{vw+4#a+C4jn^R|1! zOxr!(D%r^d5_Kp8)ogS_burLkKuANvK*vyp=RP$J^fNRv1nBTm`2dUoVS+}mcawU|w2l+mds_Ujh*%#UH`Ui#aSy!W1cA{6#7bPTz_ikss)I zlAI9S|HjcH&@PIFSAK#Sl4l1SNeVXivAQ;CaPWK*Xw^I&XdW7v5@xb_4#zXO9YWIjCA~zFFnTmG?r>#ncM}fu6%;5B$-p%hMdPAeqsU;!`br?)7 zlgw&R#Jl-jdR`Fou_fn)(@kAB_u9IG?=137N$#&OnL^BE24?QEuNhTw$hV5 zJ6%_uuk8ROvta{_eC^@KQNp%CwiYhiw86bCu)(#erkhz)XzE%d;kW*!OzLEHX$-3z z%__%abun4}O0hamv1%l%-^8*?q*RB;u)>%uEt6%k$|S2*idC{=HHECaV5L1Z@-or7 z96KdV9U7{8QJpDWo{3^B=+?^)E(Q6ANoc)O(0UN(S*#kpR?B6pGQbNv`b>@qVuwrc z1`TJTQx5;)eW;iGnGmwrQ(+>Ex(e$tJ~(IL9gpe>78nfBoAAl4q-lhxwsP9P2Z zP3%5-i|eX`(56Ap4n)mK7tfWBjDCzxwzMzMxEpZ}foD3_AfDdf(q3;+VqQ21OwiMp z|0Fv`egG4xcCL?gAVB#dl9UeGZr^f7Js>{^F;Z*cl9 znl6X}3_9R;EtY@qX@=0k^`aX?=n*JlMu=^Q{o(u2UMhrMmJF^e%m~3w6eyxEn)x>5 z*<_Lr7Iy|#v3?WK-i%RB_8d(Cd#*HbVAhKP(Cuwc?xe{5;MsBDIav30%9^!bfl2)k z64iTU_~6LD-AM4ZczW@(v^UkYVi8*>rSu6*06oe$&@Wj0N+8oTVb2l4l>AscMD7bt zezsAuiP{ofwgf$|z#m1Zpr_Y+s;8s= zX-ld0yu28zwC7FgmoIYW^{`&FQ6BzbQ@-guf<;P!GA=QQHiew!QjI8AO-$*uDvliP1JH~Mu@h9B1^gaVtr zlmq7_Hs^zF&fHBkIqy%-i!9FESTx%ql4)0yN%ye^7Nr6WY^so{DjJ8b`N*G@L0BM^ z`(6VWnsH}F*wo-9l+~>qmo-fU9UiX~>x>2K*)gm;oUA|pP_yn?vJbvw`80w8`OdZZ ze)pJD2 zhX}BNmo|`)F0={jRWv$hY**^)5zLMR)@+(O$Ps?iG+9px8Ng4)e}mKaFuQ@ysa*0` zSP5%>$J_jlaPaezU(n*`B7?Znn@)f*p=YhHKdOubDMvfF9!##eCf7(Q^fk-&%n*?| zFHqi-#E2#%mVZJV6fD#9zGyNOvx4{Q!8{91ph6Z6AfX%_(FpZxEGAf}!;ka`1|BY0Sy-&vH?28$5$_V(~!Y{DF9MW-YoCj}N^774Gn39}r8oYz@|nIx>T2{RppoS*yM znmrt(J-iBpXff(JhI;0FV9D$@3)+)w9f<~TG}GMep50Q+YO!M;v0BXO2VFl$Wi6;o z4U+j7r4uYBH>G_4Op3_kO z!B)oMnzpsY1)L!d;-9RGWW~fnr|{wFc9T9>2f63c7i0)hZiV(&uIsw0mZB-B2o&SK zW8T%sSohuAEBCx(5$@Y7U$zM?uMFlTLoIYpws>-kBFo>ieDm8{C~^bxB5iw~?VEYY zZ1`A@L>E}v2g<&n!Gk2u)z&;bpc`%`-#y@~-0(X1u9V{l2@xfPXn7!@ICCtW(kE? 
zC9u_OSs{<>4s2#9T4X&Cx<&N4oTpkZG(Ly zWk7`CozC#Cri6?KW}p4`XX4z)p_bbp5R*LE83z!nesI@>8@N&BpGU-1t*vU&eE99vXkJn z>RmlOS39?QV;f320w%Aa!)_cl>UMhO-T`$OeJOM}oH{ThL^df~q7#TMk;d4vZ@Z_* z=<(qn&}^?zwz+G8{h``ne8 zaykdn!=~YmLV)}?T3Q9I9Uxd?I+nQazzQAGPhO>}55A$>p`TSaoS2?$?^{7U@kf?? zagSxB+wZZAbSyQHcoUj>9Tv^uZ&-mC=|mm%W$bA@%pqbP_ByctSCpr`KHVH1>`_zo z0RM0lqLCTsJ6$cKP?2fS`A&m6=H-sSO|c;Ovz;4c9r;w;O%=_Xhx^ZDR{z}ZEx9K;$~Lg< z(N@{Von@U9d+lpovn^j+VlzC(!SEt7{Kso%Cn*QRcF%GUm`UVWTn9bNu@|^)EjyCd zHu7u0waRc4Y>+lHGIKjIIIA;_C4wm7oxG)?VtebAC|Gt4*19U-xz^?gzq{62Z$HVCRkeuXG#>rBx>#&I zIsDbj$_a+}PeC6~xYKG#XCF_mfZgZ5YTBJ+H`y!k14Z*3$^^wL8b{fZSgUZ+Z3d** zI8DlxNMMas?-VdMoo|Z|^}hcP)A6yLU>VFCyixpj<~-Gqqj5NvhUkw~kQb$6maCA>durh8hy2~dhmfCk9~C^gOmDAQaO zb6|J^=Fp>O>m8O2MmucqT{&#<&MT(v`Oq%I5R8ZQ`XTxhaXjTHTk09IPq0}! zr+F`v<*zK3&xmDMTuvy-+PhjGS#t`>1vtB?xmlWXz7?=uy4{l5IeYp&D;RH9(Bo55 z>Mk#=_yhIdpJ}mLVY70^zN^XVT^L-(MOowNMK75SxDl3TuJHoJ&ttj_3v=H`O*HlJ zjIt8=KhI$O{&Y2#?!jzAFlsD?K@`>S@7PGL<-bt2$Hx7-&IFU%k-cX@}EYDW(jbSTiU; zm-!OHoUF@Qw#XTXjS*TNnz+5e5uL&q$H`bXv}{gy47(N?rbDs->is^QrERGI+IFz5 z^8>e9Iy)!uL#Xp0OXsCBJ~)&_a7Lf>V&N6J6P)`k$#HL_K9u+&W=T^P?9$6Rr}8wO zSGD`z8q+xTaxKrRZfM%#T5*GJ5_Gye%tSbUP>i&z^(;~>W$x1FTBnwl%iH{1nb;C* z9pQxBV2FWZU{Wk((gcYySqe_WWC`OCH?h#zgCRuE(VWxPvY(tYS-DA!z!*_h{G5G& z0tQ7>XoTGpxpjk=c^vnSWc9kB!jF~x7B#nV*a$X2izsfmj>V&-7v2)HI1Xc}+KGFV zo0Y}7P-77nQ4X8TWaXB(szF)RFo5o36vk?+Ny<%*^I#^(IEa~JW)C`u1jHO~3|tk3 z^11uOcwV=iE!yv%gkz}_vjjFL@_J`P4>aaRZiBMTv;i-@b{>mZVK(6K1h&jJ()9PaAJdLCtE3>(DRk%$dF zarkTw+g4MXyk9`9?ZNq;DwfuW+b>y6yREAK0n(^#1xpM!f<8{fJbiF717*Szq<7m# zR!6V4Vvrjo<`m-z>`aBP`kOhtZc`yXEHca7_T%4L<&JQa`vc3JW0ec(an$`NSEO)a+b}eUl(yL|2GPVUDR2MId`S5j@HjHLxijKIaL7Xda3$~ z(&?iZJthDV%Tx_4ypc1et^>S*LJ3FcrE-+B*AN8%7GH)i`%)VUQte&mh}yF-Ack(T z4D6gk)L^$N9Qms`@E;!|j*goBrNrO`DW&^Zj*%m?0lmAcZ`}c}F1qXaa90E~cJVH= z<@${>sg1HjsHT%@Y|htXGdX;TyJyg4c!H^Rgg4!&v0QeJ+SzF zDEl-W5_WywYl*+`24&HHdzIUh`aW_8k@UoVD78Q{W6RDjPK*l>4hnK zrp^74-&nGbAqDHl5$~nsKFs3Isp&o&R=8`~A3fW$%=rUzKZ)G`)L}O4^@=+>uDDFU 
zTWD;seFMz^Zs58)A3;}KwLLEn93^Vt4unaL>mpv88`S$tocHY1d+Jy*?@VY}x-)gr zhXeJ`m<}j=!}<)_oVc@-d~eNCut(Rt%dUABYTkvKHM7W8EkCz<^fo}deKQtJqQSOF z?B)@axe1A2h?zb{d~-KPbPGSaUG|svM^nFu4Ycrl6FD~C^X)}>R+h{YQSUh`iK7SG~r$`MA)#^(&ULb5aBuG z{}W9#n+BWB8nS7z*!3}KL$*27e9fNknzh%>N7H^y32%?fev zpCLDkCtqg;tHbEgI#|7r{mJk=OGQ~HaC(&L9|AFCb~iY7t3!{yj+5v?Hf_9fA$y7% zlsg_dCKfrC2#(1p&tGO1Vfi{qX8t`6l$@)XbIGfATk}*c3wY9gm|Mx1Tg0VQhRy(y zm)E1`ni-BIP7BvBuOMy=NEUB~SD5pGH3NUgW zl63ApvBz{5yRnUkeV#rUh&Ver1kKnZ6AaFVxi;!kGiZ$U<{fI#hxhR+gUIYqW|jzt zdsjl9Egaz$FQs9B1!7xYTOoTOJLTo&e>RJARM}so`d+g* zi*#@LU(Mp=cD-=>|G_M5Thm+iz{o$qTbxCnl_LL(w>Y`|Lb(0^U=}skl+hym{!>55 zEY2d2N|FD?EKY9a!tK8`iz8mE!{Gla=5X|lA9Io&UbJo_&bS{0fF*Rq;h3pP>xknl z(24>vrjl_t_8hrqAMwV?YchCY@7@t_8veU@gBw6qG;=n@n|~R2Jk2Y@+SN}JAQaoE~msa#oKyl$eX9=hy><%z*5yCT=o`w>LMN zm*7!pUT<$+W?x=qUV7V`SMnMk#ix0d%1O4p`RU0;^c*s=Wt-e*9oE#UK3k z;4n!mXG+X$JmqZW)aHMAn1`ct|MGa4XU8-=HnjIcL7u>7{X|eNc6am*Nh_DTVJ_^K z#NPH^IsgXR?xC{ z1y>JTfm#1sHscpsEXkesb9^-&jE~)$@q~Rdb^&cxG8!i%u?`UEUFq4~yma?W|7rNG z_fNv_V*d&Fy}&=t^)yvaIZewG%Do#Rz(3d1d&6(9(?*P(h%U`4V*5`MsibDzmxOgX zS-V>AG;uJ!8^?P3-l>1Uvv6*Z zOeTw6@Mp!MS=aAcSG;@WDlB$S9jF+?B<|W<*Ua#Q-9x?W z7tmT#uU~&oKAf>LVlZ@J8DX9blqv8khA?%xQYb`qc_t}CS@>A^!-Qp4BjteJ)czQ1 ze~sxBJ(=XQT?C4`Cs#e`2i5f1g;KSbB96oAN~=m;n!$X^V*iO`q_eK4J0ZXyo0R~e z+#c+8qoi`k8GqKT+f$4p-FgMo2nSJG2UcmF94J~NV5zOd<%T6BXZ08GlA;B^GkXr7 zInL$|FuTqxz;~IV!mVeip3uc2ZRl(~;~fK-EWrfom1f|P3pURV3jR0nY&1+CC z_HAEO$oBAcV5438@hdIE9}Ozg@EI^dc)!O@*FH2+vBIjf$x*|tuUmW;+I*bbw;y8% zJ^ZX`#K=D!x?F=GqJ_w@3y*UM@j44%YZYFt3#+k{7VYu|=V*D_!)JVkmY)2KzF*27 zYrB-$7dK;7W;*e7&B`?MVpr=Es93R-1+h#PX2k$naFeWkrs#&X!`7t1It4{xedn7h zE4u`clT2skVL~xU;AqR4DmwGDJ?EEJBV;@DA1wMmW;ViVEujvH;-HZh4Ky{G*;r^+ zaDp9NUR-Kb;B1{PY`=$(Gb`v}Skl%~-K!@X!u`fBd#FRWtt|V=qgpsTcbjcLcWW&O zO{p%B`|b;DP6s(SttO{?Oin#Ad|I0W|3qZ7qLal}xhagu_i1UgzR5*bxzZDDUK*Y0 zACBL8|4{rc_7B4E1+kkzYOZ8H^N*f`n_rK|0Jfw{&qWKU#MO6SUa1eE(RG z3MzB1P-e%wc)ai~GpKz1CwQtl){OAA@zP)pPj2G#_>-S;wd$Kt84Z@fV7xI_hhKP9 zOivt++n&&waQ>9?-Y=}ScJ^pLBnY4Wh+VUK0|=uSmv#gB6|(p(ld9K|P^ 
zf;)N|X1Gl#BMQaz(!5HZ8(nI8=5G;yR0KusMU_f_MMDjauthlBAwoJuSi08M;Nh66 zlo=foP_gmW1N7{JP1ddJ-Kevh&j8C#2iv$Chp^p-pAP`ixTa7|c)C4N2UfTSULN*h z_`-)FgCc7ZK5~x&Rnb`-?vwpkI1k?t;c7h?p&lUIW3bob>2SGTxEuyh4!95zb8yCl zQI}iI=O~ud)+DLZm zu=Ll92W;p|S-RL!`YF@I!Mr-$yV1*FOdY&1c*>tO6CNjpa)0Ku-%lSDuce8aPy#of zaMHu}VAcd9CEG2H94*BMTxdH4BWx$9%(g{zI?(A9@liC7d3VR5Qq~S7TyQr*?0ru2 z9KGe^MFW~5mGzusx_WLOfrI7L8}Bzc zeX7gTuRXZO!^;g7IVU-pY_)KzhrH0J>=Vk}=ww!FF{22|G_t0*4I{6K!4n3$=@APZ zCd=Drkxp-reF~FZ>A;x(E<-J<8oQh622fir*ppael0dDq_#kzFRd?(D*{?ss*$6+IhDOML0H%xLqxW=B!B*H!vE&>e3v+Ntt5*@)z4Vb5eLm%C%{`R+ zI2c-Ol6{}$$&o##_&Qp#qZZjhImgudDbzz~0y9Lk>9V#;EIr&V?NsR&macJ><~mKa zhaQJQqolRyA7g09$DNk_!kFBX(W;vHKkAiMV19yw`O#!PQ89-o(Z~$VoW|ju~zL^ zO78FN)#30IwUfj;c6+QaiwAq?%IiTK=Nl#?7d8%1qbtjjy?(7(2Td^ca%F4Ijm{*_sZhAZ5-n*z;EFTHrj+J(*>w zGQ9g9yQ)cMRc2JwDT=5W zC<`D8%FfAaccF3_kpWWV9Fp}2EuB%aV_{?)4aZX>HE(sdtUj}m6F-z`56^zh8n|z( zRR)IYW1xoFDId;<>R;Pd?#Yv~_Q1{eeaOJwgY9%!!OvS(Yqm<_T3ZdMO@*JM!`Mo)p6k&Aq<8xM)0+yc#ig%b2x~U7?yk+T*gvNtS zXu3evu!i@GaPh&m#S20jE|RO`IFTYgY%PTUgq1ELrYrD}@~UnaPZ6bjx=@K7%B^*Z zeEpp$p(meun?UMVv{35codc#bI-LkKTN-483^POyde>|A++7Ogp5(0Jc)JSOjxqx= z);=P=o+?WFk3_r2*ov+Rzi$t(glA|YNPfP`gPFqrQsvnJ!swfZW)#f5@FmDdUv=7UR5DnM&d)f`_%k|!nr%gwti=>5PpXTx0;jrEh;PLJt zIB&lvf)boO=&i7JBOeOwLxs2{F07JWOtyPI1VrvCFyGRG)@aWgK_oQ8Wu|>O)V@r! 
zFO#iHUa%B=y*>QK8f72Zy9y7b3-O(AVQ<-UFa)~PbFOv6y{@PK?oGh|PS+EL>xs58 z`~zWFSdQSEQ<|54Xf!7C-ZF@p(1T}bKQ!>7u|bU^+5D>!ysD)|)c^}|g=gUDn2T~1rRL=O+;-a7^M z54A1f`{}ShcP+Yy3~t(gy=IXzm}N)F`uxAJpEqQfSm-~ZSJwN_iOTloXp zs?Mz9Ki6t#f2~G4wYuPzcv^i6eg9jnc3~R!1GKu?tl~e{Dzm>w)@HSPH7@(saV9M}cbhMPXY9e$nN`-MFF#^aG*y1?U-j{Uc#PB_^_5k%~3^ZbDb*f?fa)i!> zR^5PUA2Wd>6V}pq=#@MRfH_BmiZEhYNCxFx)Z1_P4amB^o7hxJ($bc~>o<8VCxWs9 z<;r#q)^b*>4PmQ=Qhp;aM0`e1P5?OxP_yj<)vn0m{@~w=oHC<%2b{GG%XaaY^E20e ztxeFe-|*hOD zIM@YH2MV(iG*rNIYjFDN0ZWg6wo4Db@ z2Oc<<$d|scz*GX2pz7xWd?98W4uMz4Zm-&Y0{T_pg|XW=EU2ic2=vk%iGf~i4r=dt zJ~(~lhHa1i{@0caLmV;$JRLz#M^#(zr_pEv9>)i5VCBk{7Hvul?N+{0w{!Em$z#O~ zp!ci{c-nCA1T(@RIU8&b&$vmPcRjoj-=%!Ody2d!0S|beaHHjZc%}E}@SGj}91o9N zdUjx1I}W_m8bhYN08iiXIeZ5WC&fN@UF6w8cX{F@x(<9!m%)13gWaIEC)tTN@JH4o zJeM-U^#|hK9|%UF)VmuX)NMC7gv#8+!~F1t5EKIF%}Kse`f9l41{ZMXj7hmnrlukZ zUx(7Cs`#dIg}nrG48lpMfaK#!L3}tzChl zarTW;nV1VZr2878Y&e>nLb*Oz1kzO@34i(c!f~JUb%q!1YUU^$Y9Yfv? z{xR&b0biQxloOduJq<~C`}H(cEA_^vgIJ5(>t~m^VJOrQ`$o#)fO8FnEFpAjQNCf=pM2Ji+$v^lqv z1KutBW2b3lU)D37hpfo$6G&!I-fKCFGW(Sle2Q&{*FQ7C#pfow4 zMS#u$ly*mU-lqGg-{}<5sUHL~^`p^5ziVKf4S7nxS0{>oM^V4|zh(tCc~DvsRkFcs z{pKk-Dpkd%esv1kq@Y^A;%1C|D{ak+(Y>dywsk6AgZzEAHpOd^Ur%+QT_r}&Hsm{y zuS32G`A+1!a5GFZrAIJiA~&f_#{jBskqN9^t)RVttdUDmDOiN_BU8CdwOw}}cDu2! 
zd#(e)AbJ0Fwjr!wVHQmz#nmws5?agmOQ`v!-z89wgxg;fm>)>QC}8%vTq?VNjSh zc7xK5up2%f?xB?YIbxG42H{>-dwi*=mln_{xJ7&T;rm44flz1tN|RdlwK*PZ<3v|% zD_b)&@O4JkCE@ztzf~;yoVpz{ux`^xzj&3YTb{&S%($t*J-AN}{-K{r z;uHA7@#&eY5qQ17sqy_0MNhH@m~T_LfAB`U*O-9=V2j7AA_vvf1* zpGUJvGeng4E0HM8G5>T*%S|~ll{coET4j;$xodTe&cRP>d?==Gcv}O4n-mr+tlDfz zDEVVSV_RiFtvQ5VppAs!Yd&?O-<~NqbizUL1Qs3iar%QMfG$7#ullxHI>ujDaJBUU^3jeUrc!@bIWb6e{m$-Grd z?_bKPgZL?(a)Z39Ae4JMPEKLL-y;bht0eByoed{U;vd5fCjHq;!x}|@7d8AQAR3ZB z(;LDfQn}%xjR31tn_k{n$Y>gQl*PH1kFgsAB9C8GX!5VKI;sWx@F@plG{=y3@us7l z<))}|JaVJ>>VmG8l{O5Z1CH(oIw?((y9Zby4v(Z?f z@f~YC3W9E@Gt!iwNUnmQ=d)JX1+ zloB}kkOexWN8FS1xe}T_z@r4rB8svkRUuMZn17XZLyKy$jfgL3VHZRUdsou1nw$K= zAqQu^EJdyBe+@F3_+pmAMU48xo>;_mUF}%J(D|TABg}&1OKT)$M8X%%_86K?2?-Jv ziDRG;-HgxK*1$Lr*G$;U;yQOgt~PXUu7Ra{M4Nukf5^6o=npN#ssXVoj?FOpL50|! z!mjs}bU8M&vX{tR>d**(A83WCAotBlJJ^2)iVE`9y^wCdiZp7a1tQ!u_gZ*_u2iZ# z)e~Oa!Ygb0#JwNA@u|L#S=HmclH3@UB*dv}VW5>$*1~{nmte?qyT*E&U1O)Oa*Zus z;~I;5%g<{G=O(>1oS+ckD+uOnWxdQlXA!?nFN^Al@@l%+4*TZfuJECyI(uvLj6 z##$@uf>t+Zu<~JI_&GBbKq&WXx*cqkVwyoAc&aCr^}wwaJ8V~L+QD>Bg$7Fzi6gks z15VC7P191ehbOm^o~s~{OL&v0y)_%ZS?Ejct>bWo84J3A2X54wgRD2yTEu`5Xq_av z5UH^Q5vzt;^N@R*uT7))_b$g7hZv&eDKW>A1JIeWA#nIyXr_YO2jp)GjB{NraCh21|sFabPB`K+jX9A~OM_$e7Ss z(Qc%x8_vAJ$LTVN2`8DmEnC6S@>Y4JSKz5M;rh6&6$3Y_73BY=w{l?#o1=M0=_}|i zwFPA}K=xxoniLpS6L=if0GU|4d;=vY-GTf z{+IJ_vcy=z1(Kh$WNb?1eu$eA=(oQD3p64R(fEEB5ccA-m#7A~v;72$ zd0^h+{ua%kp+Nc&JzPZl<#Am#{mD#U%2dl(qXV^x|LSo*vplJVWBnE7tksy@&4=3Fn@MBIw z_yYDrH9M?qamZ&asU8M>cnMY3M(8i_R1ug{4?Lc9C57iKBX)TI8gR?NIf144+pdk> zyahXe%?j%@d^f6`wGRt!A8uWP_B4a zLE!6rxt;Drr>b9~q?kU!9*PMPnqS-rVjru_D;tv0#^Ufdztpun0z^Ro<3LLET-A*D z3vI#1U3{7le|-{7^ix$^Ax)h74d&if3%Al1En73C(w-4AOUBF9U>)v^X5zy3IX6ra z=m>mT5ttsX2o&$EfP3}xLDv%=pgf-;C3-i~Zlqs&L(@NlvE=1p%3`;oLHe)O`tM{{ zLc?dxzXyxIr}^E6<**9=V*P3+_>+}^PaO3xAH5P_{F%&mP|W+}@$-ER{u2Mf4RwHS z>W~s?75Mv!3Ll8T2Kb}s8;FsWIZFP4UJ}Y3Cl-1;Jcq`?I_@}n&0m%ZE>q*CXq=0< 
zJAmscP=ALvICb+mXob+okFe$v#^GMj@@>Nf-r&@`6>=3O*66Z<*A>MJn&KDoJ2pMM@)A0x8sWk4O49`SmEiXpHm!pjR_dkoD!hF{_|1VsodnbQh5P03y@^4;; zk|z}EKRD}s2{Cf$Rp*3~6S3Dn7+(YX{0V?PiZl)R1fB^ELB>#L0E1_QE=k<;dA=)i z2r(JB+Aj&{!2%)!*M!ZBhO)FPyF;+JNtAyNwxDHm!slnWGDlqFn(*A>PtOTWyJ}C* zFk9Y&KnH!cC7g^u&>97&ykxLi94?sr?Be}VaKGehKthhgdmh8z!Szo5T2&56di{RL z7KG})b6uSojYg%u(1rM0=6}{Y4WF)$1^=YKk#iq;LF0X{FA9^-p$`8g^eVx25m)tpui>mG74ML zFv@BhtIORrwSISFO?7SAW$t=kX}P<$#$8$7SW;E9H-2$_mEUI))HeEsi(!;El+G!s zD80<*zPPr|UsYSvFgM*jzRX=#Tkmtb-Sewz%H4JKwPn7B2DiV~eP&UyLS0-_>n`_I zlr~oTS)knA-~(m3Z+_zf_d*btF7Qc#h9wQvwF^K|<*uqJxy-l3eX%a)p5-}zR>>KK z=bd?O$+?~j$)(l}!D5MKdd{0U%`?-cJAs6ikg2K$y0e~is&YY9b+y~qR95eUmZi9k z6~$qXvf7#&UzwlUl$Vy5y5~1mRQT#4omj-St5|hlB2e0N4t|inuF(&x3ClSAG?AvU z`7A80zsy~UR)jF~rSh_BUunIsydR$hKEJ!h*W{0-y;!lGE5tGK?aj9Ww(^yu!9`qE zq}*3p?RPJ(@>eRWH29%|dptaBF-+qwEI5CbXNJ4MU+VWs6$?wNYEE%imM-!EO{GiG z{HAImeM8mdaDg$^)s6Batz;O-SJp1{9p5;=vBuwce0izA^!SE4Us*%Rn2HI=8HVCp zQCdX_VSVMc(x4z#jpi>^_OTQYCth4%TC19aC<= zuPdrc7oe~Fys@et(v{XPXk6&4@f%~B94%T`THoMv(^o6%YZto5G^87F-#8`Vv~e)v zlG-Fx(qCWdhUt9`M%gJ-^eASkwW4YnS?~Y8J3Lw6WRQ z!OaQ<5(6B*VTcV_rglC|un6UAYYg3_GmFjkjA;}9!n)%ZURo}#VRNpn^VPU305`@| z7;wJ&(lW|2VZsD6=PN(LC9V*el&fOvR}XVLCio`G(4cC8g!%^)>|{kX;D+$z_SHm|Q%j zajsNq%P&stg9U7J*g9%E(IgH!W*Gx5Q;^Km*Q669?20`5H@iSV(V1rypH*_f%z{}S zv9FVFeEON5{An<_w)gk*KmWXG`DdQ%uBfe-A(?@vy751#&+|W)r@R<1&pdC&jJ!2IZ@S3i?szHx~dwa7){sHLVH|73paK-S;r|0AomyW#!R3;vi`1;x< z169_a0#04#uimR5>UBecfl#cfAC4nhiHt?3$fQdv>nj)KAi1v9NvC|O+^2o8p999~ zt0r+~4KmEKM~Sn}boBRFx_(|ePz_oxUk%1gUx}}_T**&W&?zjxMEeOrgGs=gj6RhZ zRedrHvlEGA4tX?RtC*OmUXe{51M9@rFLW{}mB9W}(KFnqxfk=ggu@mA+MwTCZi*V< zOBPnuIEw9~z9oL2t`OBVT&@cZP*4AMQ$70PG39e1TU>f8*0|3qE-chqSo8>3qM_Ns z=9IdsLEK51&0*L6^m|onhChwyPv2jMGYdWWPCw$Lutd!RPqF6rdkJIvbqsw!=JX2{ zwQTzgKtHg7wEWU06QSRehNArSkq&1;GCFA%Q)ki#`n{kjC4&r_tQ#Ek9v-QM@Z92|j?Wie# zd;a;Us)qQ#*h7*}e2V`bL&pDoX}cu!uBz=nZC0~abLbFaP1kRlXYOf;-%6iTbk4A% 
zEo*}Y;LByBJx8lAUtI5ZPjw$NrtBEABP+g&PIpVS2$F~=L3#AgRgTA*-tOUe`c(_jlsR`~h~;bTJMQV~$vU7F}F(#L`0;bIPeEWTt%duM*j}KoI0)1e90;W9rx- z3*3Fvl{MCbttQl6QRzJpU-$Ua+^R=lqJMGyOwKM%e;A0K62oWQ{`yAWI1F?C#`+o# zI5myf?X2_5u0Y)K_Ot|%PHi8m?HQ3^>X_uaT&LqwQ0C-hfw7k z>#!;0(+*)vYIY6F`c^aRdgGSc%QL>Gp1nLHKKz7Oz6L+QKy*Hg0(nF0J^ZTl+Ptg`&*CRv1VE%HfwsDNq zSYMC*H!QZRoctSnK1>^Z3+wz#WWo;HSLvQC!kO|aI(qYI?uDgI?)jx)SGAVQ1Ip68YDkYpfQieS-8h+S0W zTRbqOMW?#bfP@b9-5i??IFsR>HnZgXf{Q#(9{kyax0;=nB%ui3~!eo{p=x`z15Xf%v8jC2purmAT4B+%W6bT!gH zUy8e?kS?u>Mt2~63h5rCe`ttCN8#|#=Z{9GAkAD9jb4KE_f6606-XB^iAHZldJxVU zHzGYA=?BfSgh4x}$4-GlUaJPbbyMqh$-3exM4 zUV=2`W|T+D2iR{#dMVP4Nbf|t1L+e;_aOZn(os0{+=X-s(w(=UJkq3FQ6A~Pk=}~* z;M-6h>G0c89_d7+dytO31LbkxnvZk}(s@WPK{{&<$|JoJ>8(h+k#0o#3DRz)SKNs) z83(QZLh43(&hMkqJfznkorg5zu4uFwX)V$H`1niP#%Z8e?sa;y5L@v zN1FWylt=o-{dnjC>GTJp(YujWBi)MhKBPO5j{0LXnubH(XCID6vyskN3%^8q;iJ)L zCDMnGu0$GrEE;_XY0|oA^m(Mmu8&4{A?-mr6bH-)Ziq(5A-xo7A<_?!RwLc-FL)Lh zX))5ZNPma46RB@YG}?ghyzq+9-u@<_8@MtK}=*C5S7 zdIi!Vq}i{aJknQRMR}xSyHFnKUy*hq&3hf?k>2?R%Hv@D&u^kU(n&i|9_gHaqTP_* z_ZG?_&3Zc;?L*3sPK<=hXBr7jnF)ssO-);skV-ZN&*XdOMWe@&P=5xdH;mE1lYUJx z>f`}_M&?-=S?9QhEKX}SPCxk6V^187`|?zM(~x$|L0c1`KZIR^@2b*h^kCjV;2D~3 zoSQi)aS?u9iHn*tGZGioWe!VRRGB$6anU@FD{0Z3LHU_UiHnL77p8-FIwa2@f@U=C z#<#L88od@NKPTbK8!dMNb`1gf8JQ~*^E0wqlFrF+m!@ZAP0Ps4&q$jw- z>cw|Tc{IxV3-~F-HyJl6WI_I?5o?SyGBU4AJR>9PS4n4QxI@WjWMp5PGA$!##i0C* zNiC^!Gu9?1%}z+qn3SK9lb?}&Mur=-D04;zghg@MjIWgc0&P{~h9V9!M;jUkF<8>w$j~_>Ynu_@wufNkJL9 zfNxo-|CXj%N8*zf|32W4sEJ1T)<=tf(!C-h%V(j@9;%H-uV>qt<*CjUu*A66fY z{sU+WpV(>fp9B2M!2do5f18DG0=~E*8s$ABR{5U_TImn`xxmkK;uGl;%0BCXU%C%` z7w~@qett~(xmNi;;6DZar_MG=DwGBg{Q*C6F&=Aiman$un*{ucz&{aVzeg?m%>n+o zCg>M~-_OD~0pGSH8vPyQu=G#5!Ll!3j5PU*X!KH~Hr{8+w;u48E91-81^f-bm&Vk8 ziB*3e@c(FzM$d@xpQ)DrWC8va#^Ga~`uiOE1E0_uA3q29A;2$nmQQT5?B4`@CGZ!= z;AdO-)xh5c{M9k|t1SF_;L}$`qX)&>qErhJQ4z6tnW0$&<~zr@0?27VLpf zf>!+>_`d=_&xudG*ec%z{1D84&WXW$EPNmExxmNke_5Ef9S8iPnDRfj%1`3_4)|$K z{FSzRbAZ1G_#>QnrJl&&1pK|g|HO%}x67{v{wd(&$+sT($M%8m0-i4}kbB6?_;r!Z 
zzYqBPfbVnGpZK|@UlzuK=2h{_PXhi*;Nz8_1N;S;cgCx~3HaH-$1A@Y_zBm?FTWo6 z@xaF`-vxZiZ`7kkR{S{Mp+E57?t_08;_M#acgNW06U#o6fbRrcs2n)3iEMH^X&L`jw(+OV=(9Rsh|lFVAhei(vpF5eLwI&kHM=T&GK9WYy*C7 z41SJPo@;|dygVn*iFb^#qcHbB68Pm#e4=u4@}C0y3g8FQk@!o1ZwLOunDVo%`mX@q zz#8OeC%#nKK=cQG5b)O|fWXv0snv>68-cIG+9h5e+5!C4z^`}mPkhX(e-H47uZ~7H z$KW>#-VNBObc|8J|IHa=IZt^;!mPAW9%!m=iALwd@KBCM9p(XlC-Cw5K{N1cfS=+l zzgXG84cHpsHv;ePFT=D&xUFdC4&Yw{ewg0e6phY~DX$g{Y`^D$H`?OkcL9Gg z@FcV9FOce`{f1&N8V&qIPJH4YEcwO(zZUq1W8}NvlCKc>)1K)U2a;x3eXtt%pKp!7 zCb|~*&w+1@slU#u{~_SL+v3mZp9emGy^Q^1VnBvf|1RLKXpcsJ8N+{x@E->l_gcDv zx7RGPramuYZ9?J~SV^PFu&S12u0i@KpzrL!8p5g%TLcJuBd|VJlt1IH0Y+K)HNM=l z>3udD^$}pNvpv>2+sk#fm+S258F}>st+8bd|7Y;YdM+AeGyM4Z(E>kO;71GmXn`Ls z@S_EOw7`!R_|XDCTHr?u{Aht6E%1M7fi^zOj!&~rx#Z=ie_E!%@_almn{giiKiGPa z&!2y*QaWq!3@1mieIKAoC?TWI`r zYCN_;<#V&nV>?hjSvuwO_s$QG&G^ZB%Ppz7*6!xs6+Y&D12{&I&w)DSeI)#t@Mi3} z@ng#E)&|4YvwY0@wOK8?e@B&mAeW-w>3pviJgW1iT)jGfsm6b;^IBY8+y7mXrQ4-H z$91~Bx9jv(oqnLxFLgQy_jmAffKHFr>B%}hOQ*ASTA|ZLI$fdDn{;}wPS@#lyG~!# z=?6OfQm2D-10JB$qjh?+PS4WmY@Jr-1HfexOrj_5b`gt=p{&DF<#^r?O|C zdFCnZ>|%L8i~A=Nawp`BKT*A{W#TU2mwjA zg3F*|asWwAPQj~)jFc)|Cw+mqHE1jkuoBZ!CLm}f{S!hA8h09z&mm|Hy0)2@w?U9W zA-OC=J%fJ7(u2}cA48Iswhh0j+?7WCJ*Yp0fF$e^8mV36l$Q29ep7Z3z@NdS-GgsB zu?fbIagZ+Uj$OzM`2(a)TSN9kZU?KhJDYhqfduyo!Tlt7SP0gT;7=qN`Yxea2?Qtk zAmQN4@5AM=Q7}Yi`pnCbWF+7xeHJf`jF<3}Ud-z>NLP$B=}LlT^K#g;;G3Pki1+AZ z6vH0rP2_8290A7ZOK7d+jB|iYznt*2j3uB?zk=|g8K>Yk{RYA_GkB9r`pp;MdPK(0 zA)vZGAmd%~xuqE#ve3X!_5N|plXwI?72t|cSk!mEpin@l5q8|(^0He%df4T|SjH3v6 zgoH*$HsHe^y@gDSl%FFJCJ*95su>1@5t>TKemr+fS%<5%H2$Wx5%@85P5G+;`J4I= z0=Gktlx_j?H}xX|tI)0~UkH%DsoYZ@_6GnnGyZipk_&rrWWF7*mw|*%Gh%k%Pt0a#&?R@y97jq+3+@Wqzofr zGr0}F0m8#q=8@g-!$_4Vm@FgZ0_4)t_?vnke)dbFN{VEjy?_*kdlmPjgdbC`!lWC$;m&1!zHFA*Fl4!5&XC!(jSN2h`PHnzqknKj1TaW`K9Vk z&x2#;zZnbCGTLDE%ssr$%s3Ku$^4erSs8B=@*S_;8FSH0nR5D-J$wrJ7zsq=48N3_ z1j$Sq{si4RQ8IbMr?B%ROU65VCNU|JDH?t+84r@ooZ&QbW?I5+;4^RdLY7HSxC@!e z;hC&+sOV8Q{9p=}DH;Fpo0%D&a6R}m4gW369FV|%)jWJYbvZD>gUnUK=d;qRg!{p* 
zW%z8?Gb-UpWL6Hpoeah#(34jUe}VOkPhdPqdNMI~mKYS<*o_V6k);4?wS?4Bf0>M@ziTAkem-+_k-v1h!{N`cj0+N-vhWcjw z9mO;6g!n?2cKZKAW|0304q#E)ILJuc2Eo~ltQ;YF4a%dD3BaTd5C~!K?&;*!a#AtyG z$wA#CM-x2cUPwLi1Ywl}p-28iF7G6hQ+b)hmt7u^RmjWRA^iad{fw7i!RQAZ+{nwf z!Qp^IuIA;rxXd_=p=H?Z$R}qU&X|+&CS=Sw;x$}?<_*-Hv3MOauFQ;os=EHaZ(ax!62?FfCSl1CPzaikgs=%{K-92^h(Uu3<0P3Z zl1$Rf1R|B7fOV;@@N=zDvD&({T5+q^YNcxH()y`g)z&4pwYC1bU|nikm;d+Nd(L}r z-psWA{=eV*B=gQa_uO;OJ@?#m@7vBT4!r~APB@$<#c-$@t(g#}nJN-`0VGqfEq>_(xD_22_R{rTk59m z4ag5KuY|V7cZN^=2sxo+sb!5ss6tl|yprI&5X~RqRRjk^2@-TQkr#(3(rWl*g2SQt zB+Mx`^brXyC;GKCa%P6kCiv6>;A=wDsLFL@2Ma=|SA|bI4ETmnJ<+Tufg3{$Nuy^N zjV>asZy>libU(pokqqaer?o6WYECd{7M0X|`wSvFhr$z$zrA$Kx526m-)ySVlEVS2 zP&a-{52yM&L!Sj{>3HJZ7up4_mWBx)46Q^vO3S_vnhQhE!HP;J5qweT2`W2<;2oix ziKd+BFAd#Fd?Ezz4&6xf71ZwQL$u*~=``x;n?m#^N9lBezpCsX6Qd&|_#Ht4PCS;< zGoQro?I2LNQq~8-|JMgG{=eWE3{FA!1%o7iFi5%`{d@e>l#M+E4U{1wswz8t8$cC0 zju7FU_zi}315q~dGQh>52~c_26zYm_s0AifHkDKu2_>Pbvgy>BGefk5D4Rj>(aLVY zsjTu!6fTUzTqagg*}{LJt0z_y7%0x$1P-Hb0<++x`X%5K8Vkgvd1Nap6b53FcFlQ^ znNlN(M$Zw=0Q4|vP$L=v7d?p*lc!jOFQP7=T(03@klHtTIoJlLOrm=yp)D|lDU%OE zA{BZH@=lpTaGtV11m%?S&w?fxB+5w>Kw?#rFa`GN;Rwz^?gT4qRv*9u`^kM8rzX?D zx3+7seOH;*zOTSrO}Pd(#{?)cEV^1x8}0;>8-s`?>7QxQN^S-UB@7;P(`My&Qw@=HiYsNbx=QNP)H z9vUjY3npN_M}4siDyX0(u4U^L({q&GN!jBqO%53p9L8os*dfM$IcB#qVb8x6) zITUD>>GHE6WW__2o~ZM`g!Hr!@i%FE&(lK{Z&Ue6);A!drvDWLu_jxjFxvjRf=tJX zSTsGH3et)R&<3YdNKg9(HEybP8|dvl&kt41qI88W|2*=iQ*=;knnm4f@40{I2n0M; z)^xpsIu7$p#a+~%nL7P>N}ombRqFKlsI%hlRG&%b!)lLgqIm}g-(gX2;B}uPx|Fq2 z7kL0A>d5EHG>NJ;Cv|$Up*dC4{24Se%BSKjIwl*Y4q0c-JO;R8sC(vY0E+=szQ+`= z0x`>4AGiR&O;+tJ>(l}}%65l3jyh=JJ@YZ9lDMuUisDCqF{0FMd=HdnDy!Xk6r#-h z8t^-)q8k8wlPU^utNv&zvgWznTiOmeoe3#ik(xw7? 
zi^HLcI0zA`iVmyO(i}b}4z<%*JFi_T0KNohJh271>?o}T<(n?@E`T*+RwAXOMQkBFDlApj) zZcfVj)(qT()FS>sc>axe&R{|jagHlunalD)sG*T5StH7N_DZmOgV1$OcEMlh)@5PT8{t%}`DOZ$OXPrNE(F9S2J*#v;fdwpWh7e)p4i_99D z_+13|!kH4|LLb%*6HCD9J!JhCaeA5ar)kletQx43x>>97&zu>S!oNyTVKUeUM4s1) z%Lt|rHF^y8VONdRGe9*RBT3T4GsC_HQa)?5Fze!&#`h{8VoZXQ8^qPxV1Q zyoJBU@A#=cK02MM?p{duYo@|5(}y+68U)$Q7m+st>+_k<0hmR=;{Z+uQ2F0nq}nJL z-vTOCN1FK1U*l+NzpqLnD`1bo)XFNc-k~ZA|XG?gi{bVAq}ZrTeiX37S# zv9NU-?h|W%2ZbNKHCow{n_naIn`N!b+cUf^f0ospx981mm?1R(OSt52;(wHNi|{Xo z6>52R1E{(idUkmVuh~F6iCZR(rhbAd~EHR@jZ_lMe z5X?B}K(Gz67T6a$iqHmLGSL)ql>3)ua57pKOUadzZhMILQA{YhHGYDvbtz|pSRdBU z_X?Li!i7A2<=H+i9*20xUQ-rQ2DSmz;>DQW@i{+xR~Wx8j7P&+R$ky^>~UjPW@9|W z^v)>UUxA^n+g(e}_lAg4=KwG+>0+iXrp3ZH+l2BWp}dPkxR!}BG(^r*GM$Fp{UdPE z&NCy+IM1Jer*WS5G2{Q>Jhj4pz_eH@=)-#SpCau?B5fsHsu_VlW@SOK@Go zoVPfO8fD6S!WhS5>a87swdzhLXMIBi9|QoXoWSNN3i0?v+nIv3C)ml9j&_z?zZ9Nd z7M{-#&%HjL9>2dXo##z{p3AIR7)4snKMT)NxXa3ia`RlC&ht^GbY|5yi$=QUxdg0q z+q;P8%ei^BGEW%#h&?ZEyE=~*yO>)K1J{R@Z+%Oce?gdk8-VGDeN67?Y=JeZjAeO6 z$P3^oEAM9#=?Bl)sB+p@zn1G=4d+o0Mp)k!*uQYxT$8m36JsEFp_(I0pEHQ^W-iu> zM7JqsmWA;1dSP<$UT#ml4NIgW0n*QFQY^+MNcNl}wy3Sdb|NP(M&baoO#(lS@w?s%@S7CcX*gi_TYyYe=C^I_29qE_7wR* zn=>9k#+;vW;weh}lM_Fs1lb`SuTVm!B1+T`aL$`Z++k|W3qH{>UDeBsLOD-J&Z7&F z^D&os7dfvRY2ZMqv$lg{4U|F0C&>K==UxE(8pe;BhD%=0V{SI2PU_`9=FdRAz6X_r@bbj z>Q^)EJkTBiZe$9yk_ORIrvx-{{SY+K#Ka`#5{FpTCjrwzIyj5*t`08e++`>=U~=>J z>?VHKa1J%&Dw7j1%@~3~e3x_Apd7CH=}J-G=cZ{y-(F}{b5~;NZuML9Ucg0*6}ym} zGmgv6Nm=`-&*o#Z7Ar{ajr^RdSUCm%pkPZkHc9NVU{x>rwO57J62j2j{ddjg>H zMJCMRLEgNMD>W<+F(vmuF7a*w%LTMn8)ZMuwEGtlXS*^8o1`NPTOmca?qQZ^Kp)=$ zGwLd`u`e^e1^AzN@Lywm5AgrZc({&Z9`w-XK|kPQzhgD2MZfoC2vW`42EAJ=4@II!p$YN@m5_h1(mYbD(zxDWc7ve_nDx<~LABp07VH%C^}4UWb2mViqlT+Nxxs{E(a zc_$Ou3^-9If4$^?9ji%e4dt)$=AXd%Q`?obmU4TMOUJxFQ-Rs?jzL+#psjthiZ(LN zCK;~+{Sr@|;}M{iJPk$BQ2`auaUA5;chQc$5UHtsz1_;vchV+`u{6+K{lH}V-iT65 zcR>8WWa74TDSiTzU&ik;b^N=~+KBS|kg)0UZt0~M3M0xNKzixD8axZaE~PIQN0k2r zShf6R2$0vMZ$sF0!GA&lSOY-})$y<4GT&~edThFs)s@-pe?S_Qy-Jx;ok`2^rBuR> 
zmKF$Il<7Lq<94~G>jmA3(P$FsgP=D2$_?E~G=_`W>H+G*+DErj^inV*E46@b!i;h* zm-p)x;i4*L@gG)%jZ4r_`nT6NPQZ!&1L4NIEkwz{zkSAvyc+|f)XF?)?4Pimaz)U} zAHnh;$A7sZsPV7R0sZG4SV_8n3;*Scpp{pF`ZD~ND}ovifj#|49$0k^1_b?&yEV{w z7Ettm_*tvgfsFpgUSu_{g9P+nR@HbC^g#b*HI2smtz@PrZS2kV+4E=}qHa?7Aq5tqptG!0Ki;0i2D5Iw6JW5zPv7m^atLF!FqM?XxMCa$}L}L*> zSI-}z6KjiZCC2$W(Og8&)$@ba1(5LEB6_Z#KguF+(pf~$)$@yVllqFvNU&m^NEO{c ziLq83dpIQT61$OxL2&L4xHl5TkB{!rcOTJ7T#*=yA zM&|1goWb@yLWZvfSYSWGY_%TWM`5kUQ&dR-5taR3vpbiGN=_r>M1wrY$iowDJ!jlG8HH(Wx= zVoK7Dp3-(aq!>eVDWuBsD0A%NNYW#XiT?ob3u-8iJ~YJ|mk$e_G!80$6y%c%0K5-i z@+U$B#1y@1n_@L#&9#0CT}keO!=n)(R0oLFczQ%Wi54ZryFnJ*peSs@YqH5d%(^Wn_1vi&AgGqS=xr0pUEPtmyLo62| z>v?dB(5h$hZYH8og~F)7?t92c!kn&QqREW4s!Aca=JcV%>E5iI=Ik|PNHVNa)l_NZ zZK$voTu5=jr`6O(E>5p%oz8gOwgS6bq^VfXMD#xC)HY9vOLLX@jHg7!tDJZS#V7xs z6F)_Qgedrgh8y|)kz^U#wyFQLZSPdmu7s5YhRB3$vXJStFbfRrM0z?c%mPEdK~XjB zS|V%KWYh_Utl1@NHe_byMFwnB*G~t4mDi)hc&P@7>S;&O+LZ;GPHqDZC!(kHxm9i`-c(YQL1qDwXC3F6) zI2uO(hv8Mf-$KMZwGh$<&GcT1N@;rczmWT9$Q-CZrj<{w+n;UiBdom%*)IXWRWz>Y zG&mEsX2<&R@GyUva!$powN{XbV#vwwLWWLRO(V`@@axDJfkn5*m*UE31U4AqWGf|O z$||7x9!6qhB?+aIuOg{}=1n`iANV9hE(j7Ge1vsGsHD&iO$@D{FvcpLfW}N$g@wlg z1@Xo!J9JFpqQVs=&<5l2OmoDMiZJGHBXO$udt8yeGP?p-?-M+~jek0l1>c)z)#P zhTV+P6(wV=u_GJ8*DWihhD{maR1|=&%iXe+jChh@G!;n}gK{f47fqz(G*1%5)0voZ zkvzha1o4rD=;^T|NrV}bt=f`x*7^y}g&WYgnIlxK=2v;y4huxJAenXf4s5_1-m?sh zjw%Exl{?y7ju2IZAg+c{wI^S9;%ra8M$~u_@QqY6?uB}=BZ%)D2%?N2^&%7D=T(opF3)h;644Ui`NZonP+c`~j@yb)&nYd1h? 
zaRkOY6tEi65;Mj}rH}DK&zM7>6mb&dG=N7-MQ{ejN4VF4tjAwm3XnF zka6|uBy9nQyS-<|>UgIQ3Nyy)1h*q;yoFt^lq_{wQL@aFiy9~T2g4*8W|PIF;S4;( ztlTpSv>`;y=$I8{NDp!!I!4b5J<`ZCC}{)O4Y z!a-8vZ}vd4e>V1D>mF!}&mLSbFFLQjs=F&OFj&=*7?|DCl}aiviAi*Kwao5lZOx>g zFRaoEWzu0o@N|kq99x#fQ|ZX8;>=E+-xuf72!|i*cB(YnnDY+M*b)JU*?7FVlbV`L zwRQE@IWXD)7>g|IA&Ri?QG_5*b+vcJ+YE)S#UaLS<$!HmFLd?9O)(+BjW*6@&Pc^m2JTAW3`4@L*G^LdB<$)nnJ2DTeF7kKqi%)C zN8_T1m*3r$WEKMnfFwU!g}Ch4${ia(^pm{Y{6wm10G}yQO~CYE+x9NJg@_cEJZ9q# z2i7$MEg}3(2C7StNT`4{f#xT>IufyNgB|SX8!)7cTL!wiQ&nAwW79eU^qOU?9a|%I zrOr3RSQZ83zOG$uU2AWx4nA_soi7GI{wltK2wX@%XV_zZX@4frVVxIv+Bz&S^YGwb z?Qy~O;KRtY?NPyMItO3SzuTz559sG@JJj6s$kn?JHXeI)UckO*`!Dk>`zI=xlJxJ8 z{<#YN4m=W8U^2!AWYN1mLvH(HWvywty05_=+iahgvd5;5vHRMCbsNBH@AhUCy~GNx zMB*Pql(^Pvwoh-~X;<#BH>QF&0e|^+WLx%aR^(B?)u^H9}U*-(WcOuXVclfatIxv zp|`EzVh4$KYUClS`M;k=o=vas;!&-H;|D)>@x!%on@v5k5i;@+qe?RkNHT+dU1zy>QXpzHiLQEBL@KoFhCNLYo2 zo(?p#m=`&+e_}!Xra>1`A8EpmtlvS3hqp=k8zjlGciA(a-Vgj8+n_Ydekl-qI4}a{ zO^*u#>P-Cpe@_PgYyVI4ir10z1cYZjwD%hAh>>Imp4B;fUAfn}rrM07hI`6%p77r6 zTghk9WB}hoU`{uR%nUcGH>^NG96k@1YBN&`N@;h-H;3%e=x+N*ij4ezxD0}6+~2<) zRslFzZ68T_*Kemzd{fh1zdZ%JDIoNgc7iZuJ`OJ56zo+HVeTfB$zF>#-k~<{DfuePf`hdcaT)cShPLX)76bko*tuJiFqm--GmJp zacb{LIrC5y+r&gQA0fq~%IJzyqIJ<))YOIYFkHiBu6SU?`yEej{wBuN)ETbZ=Pl5w{IgLQMM#kc_h z3Hsxk-JGsOtiMmS(~HusaF~_j{g_3ylnT$Xqx6W9+NGt^yra?HEs1!)s|x?_LlCO9 zGls|FGHFNC_VaO-+XvRzlnbka?tz|!qE3j?$cr0kAxt|sq1{nhE-Zl7oU(1*Dxr4@ zi}xp0EY;h^n~G^27)3$Gehjc!KlWe-i+EylSATB;Py!nis$`V*%R$Pl&2&9l-WJ+@ z(=3>xDeP#8l?$la&6Yk!9H@hKAcGEisF`MPN{^CZ!N8$%?wM#}AW2eE2XwhD=t}l# z#nPGzd9;q%+M9^;0w@Zmo=v*-G6!wx=_58QUC%%&K8Pov?a9_yqFoInXay8+i$MAP z+Jn)j9cW-*R{}(OPb7tX^QhToTS-`jF+(!IdC_Dml#dQ4X|^U}J!m&tO7cbfx(AXO z-ASuW7s*?Cxebg%*|4BKyb#e(yQ(LZ-l0e{)f-5qNOJ?aKI-4vdEO3&UU`Vn8x=H? 
z`mi!qT|F_h31U&@&_q9YLoe9*$h1KdffPx&R)>dauNn7==+<~YbZ@#a=?t@20s~+7 zn>2+s?4mD{wwNl&K-?V>2!ZQeC}?sMce$tEZ24 zydO;U$6C=Ey*07)v}ZFr^ct=|+Ei}~EM~J|(ArC7wW;c*Ecp~;<)%@j9y37Z)kA?E z9|X#@I$FEqbY&oy10aThz%FSArevL~RXxhLP?yo@#8Byl?O0Vo)&`oXe&0Ve9 z_`7=(9TEM5qPnZJW{h(sx2!iZ)V`6b(}Pg8clW~oLma4xdVqAN)dJDVaxShcF(xr@t8JL2r;f zYZVhQSrs(c);rMB4M#|hh37yFn{Lv(h(oRk*SzfHVfy$pK0=Rf*Me*Abu0QRdd=By;}e# zlX3G~XHIkUDH*3T$dFwh1K2uPJCAZ=o$WaF^lomUVzGDBp;F_4hU14L2NT&@ykNiS35v0GP<{# z+Cbl*;tD&l?O9b%e_MZ|4d?dSd98Fxb|lF}bq@Bnq^!RF_SR0)M68V}X&+4X<1|Qf zznsxWl)rg!ZfzYkAN80-oJbAWO1zwmJ|@HRqdg7-+>#Wj*rLVK)ov!WI_$d~MVd039sQYfJ@WgwYZ$XRk?10Jt=(H%46B|kv91(bLu)t9 z55V_r8Zf#2ZGBy6QdM6J&OOpyTh$uFM%Pt(54$-&4r7+nCc-v)D@@@jEaLsz$nUYZ5kk||n!@?uTz z$eE;-2z^i5-XDh(KR-!JNu!N)|C@rJho~?PGeOddjAjtRx;oPnvvW0|7ZYX)<4828 z4yHz0&=)o7q2Y)~a?`FfB=F)u(3wnr#NW7ZOE6N2UAP-vMQ(+B1$>VlFg=(zaygAea-gLp2BUHP0Bp&NhUmkmr}Bn(9c`B%p|7snZ1%d5Bk&Bb|J$2SNemBP~NHDG>>T z0(zxu+V7D|gYq@1)E`haPOT-Z6T%5x=ICjyfT9^oohG2vO(g~XHr_O#17rqpfB&5j z(i_A|;Ek>j0=|U@_H>|I1QeZEYNLQ<+@+xUccQ-JkZlxDzHmZG0p;5q@E35M)5fy} zbep}=Na+4v#uo8KKHI9`ll_vA4zbiZViT`9#(d827~JMG`y6Sb=oymA3#i)t0=sI6iW(kt+8h;7 z+HR{oyM`p;Dk3VN^rois_oGH;sx0d7NloW(stn7AT+^xU!0p-I@jBL@--K;CLRzO#6QGfL^V6JytE1)ZpGr!50#*~-Y+S4;z zn=#^~I+-`f1u%dQK#G32IliC)S*PUQq%#JH*I0yXVIIV}28+>@*dep0qy zNWInjdzV=I8%#K9u`6cHeu1SwK7|NED*Z&PbG6p1Ie}+#YvE8{+$J9!N_#ZY-yn_j zw@4$UNg|6z{!fGveNvYRECU~ADxio*WPa%gC&I}`%%l^67kjM2FK;$Wkr*v!yF)|3 za~Wl;J}X1#GByu6J|$!P2#G{y8HJZ*_ZfBNWGA$6Y<|VcXUi3iq&Z3ODu6yvql9G1 zs8QGnZ9vV-ppiT`42u(>C4*LIBq~e3`r9W{{cS*6zEGCMCS;U&D;Yffv;j3YgT}vV z0l!UqO}+n-Hd~t|I?D`R+JT=0Tgl+(rwyn%88i}qMZ&Crp9EORpuLOhe9*01BBuOD z+JKs$LGwKG`8A($Mdt4jSxHKe6o@B5Q8L*1X|t(Mf{@6%2|fZ}U;7RrlJFy#Gkg?I zFcaZ}Da%hH!AGX@`~=Y@lt_l?e%frJ%lfFr&Clfu{j}NmN%WKZm@oL{bX`)X7#L_z%~iLbHe1~?2xMLd zKM72dp=&>FHe(edmCz&LCqYRv`1xtGxmMvLR}J7N0ZB6W`DwH96FxF6gP#N=$>8Uw z&Bjj#vkYYLdnngf^3!JPkA1wPkhLH9Nq~|J`TVrm_+7|rgWY__)Fj}&4(fCPB|d<; zwI`|9T|xmRUVtH#)TjzGFtW@unQ}CV9QQcvPZRK44r;xCB8<5KD5+tG@H7G6a8T<7 
z6!}b9Nx9Ny+@TZp4ym#w$U{ai_-V8Cf`m}8T-gM?fKmBs;Q7D*up@)L6fJOyHVG(2 zO|y*Z-CuvfY*#eGshF8(qmjI_e?ueh(=Ah6UD zl^rfaO7UNL`q-So}Nug;rxC{Q{9Qy5jq+4Px;XmQti9oGXZd{+&3| zCs+^*C`z}~sRD{D`r1_zViBP8*XZwjQh>0R-bRfAO2_A5pRrUI1-#=ZqGrFq<*X38 zIu}@MA)xB^3w)77SF>N>-43pHzrbH{aCMGpJnP_U_6z)ygR9*yuxN!3eDsRTk%tVNVI1FQn%}sz^7VKIEt4pGc&!hpTEJx5gIXuxcO6u-fNpi`I5XoUt)!7I@FZ8t z$qX)aQGOO-M$F)IP^HvnRZ)Aye!+;rpo;~3KZDrM)~}C%`mCQs&0$Q8&z5RwXE5cW zwlOFiEfsr?QLi{ut##bO2B)Pl0mT9Wm{R@6OH&Ny*L-d$H|ewZ{5YS*i+$SiCx6QF z5;aOZ?L!e$!%KYj*C)5{&KqxXBp8<5`t#Gvt-PNx{Y!k7hmd}Hc@lXG`(e3#c2NAX zuK;~bPRH?lK9SFB_$)VdiLUZJc%~4J7zGIsbk>%j46> zIQdm4UvSyoAiF2(jGuV(n^@kOIPM)7$JpT)dn z!V*Wr+0IGu$1>)4KFg%%5HfHIpXDa4e`fR3|HrhA$&ljx`wx7CrJAzhZI#N3&vW** z^zyRt(dYA7>GRd|h$i7bY<#RK`$2m1Y!y_83cVNu<4p`idCG<#rD(?m|6!{_XMay` zo{hQormbts5jSE&%m=s$-wgrmx1S*Dgz(!VQ3jhuZ+F@i9e$_2c9GkAOy`JpGcG>e2q_} z8pg~1vq{AE8|qNU?f{97v19&Q6ZTe}Wu`eIdl z2LK#0oSp$}q2G&Ho;;na)C)LggBtzmqh55I!(yBR;uuu(0HKfw4B#($Ra&olmT#$V0&LyQ-Tzl-t3 z-!gvG_8jSC77X~AE8Ro1aaeEpkE{w~lXP)CNp+N$|v`m3ExFZ(VFd)mtoapE%k zCxw!~t(E;|ux;QxenDL1#4d>I)A9M^e{?@ZhI%ZDlFB;DY?+M@F68~$4 z400#uQQY|~Q;V9O1HX{@oN}6BAa=Eq@#|>dz&8=gM4L6 zp%LkzjWDU+O)O`NK`Yt{lkly33}S%sNygvZZxGyVdIuz;A4(ehQB1!V^d$ee!x4?) z=BckSpNr2lgy%5*eHzaR&*|A5_*aW}A zD(2I1s==Jg8SRYUn=ptO+}_VHe!=GqUhMoC7y@DH*+=QI8l9w^&b z4<|7GcE0QOUyN_kc&^NI>d1lL!F+B#$q<(^pKBREh2^PbUbiv6oawoF>LJE|wAB#q zVtNWuNp^A!Gx!DLf6n+b+2Nhe__s8k6Q0w@Oz)4KNs%py|G}ssevaiw+$7*Lk20qJ zfbIWXCYjIp-**~BA@fPh4Ae&2Y! 
zpZPy|qG8O_nEC=Gam`7D&zWBOn)7B)Hj%(q94+DS6nZn)jyg36ZZe# z;|$t6l=Q!g9i+@hlNrB!lOdG(cb>)zSNFV%>DM+GT#W61BjX1*82n*eq!0KA!`{>O z9Q>~WehlJzx%p{`;8_egPI>Sfi z{l7E*jTVC!KV}ygeNH;Y;3>8+9pg3Lz~0{)rvC!-Y39sDjK4o_5VWqP<5b3f?rf93 zobhKf{v@`mZ6;fFGG1^gv+>NCNuse_5-5-dd6SZZRio=Mjy2GDA~E<3HBO; zR?Uo;n@i=a&o+%`BF||6{EBIv^LoUCRq7ocUkCX*^eE09%)#dhuJ=#e;X;2W}Te4Chb3Q>tW>oc&xJjw6-=Lhg+a=7D8*2x{vF2*KEq`ajDP0@gP6s5 z+US+&U)*C5;-Bx)cqa0k_T|7I$bo-?`5(r9@@D4s0^?_}0yZ=LFN}ZtJcHorN6{wP zV?A;RNc+NO!ylCc-;e{}r12s36wAqCsrDT7Tbcek?yt!_06w3C{>B{mdznvOt7#Dr zV|6eG{m+^HLLQ)(vm*YGgP!)Xrv9D4^K6XiDGGTuy&b9XAvJ~-AoJQ1#(#H>Vel;T zS;6?HPBn;?jHmF-RB!MSgLs(npJn{7_(DqL`JBcxk>|9R>BFq&BRKC);K~0S9FAxV zvr`Y|;PVXA7r$=k+5M}Z0p8T8#dLnEnEs=q z4eA?~IUmRP02{ve$@PpM$@4pRr)t-Dhphu_W%`9z8B_`L-^KU}USRIx64&P7bEoiO zdBh$bW&9mU!$AD=4}hnB8TWcbXYjC4&oX^&%FzD^>wY?Z!uWH!T}K+UdQIaw;W@nr ze1#`1;P_(Ff9CUoF&ZCIm$1UVzy)X*N9wP`XBb2S<7YAcP{bgvVEjVHf1T%3nO{~h zem(PfiyOR=@xNg|+`xP`X*?%9r|nFC|4xJaE$3au_&Dod>isJ4r04MK5lx9@r#`-+ z@tp9S9@Bh02nJtdJ`FsMgwLOH(EDRWwDzZxDa1rVAU6a&i+3O_S-d}*>Y>;?2}EQ< zFoWJ`M|W=v0*bXEjs*f03=Ap>Xw+6cf8N3c*@^H~eWELhxa+a=5x6VWe?9^cAXr7T zZJ?*;d=&Ad5Ui#vl}5@TL@AcT+I1&1HEoEVxPHy?O)E}W6OC5a*UYO|6sHEU{`%uR zU~XbbpbefN7b%4G>FTC-#oA&~1T+Acm0SJ%np_#wv@G@4FQAw`Su+;q$XH0hezKKV zI6r&F+!_P~%2r}-O^!O|*5;^VZf&kQ>gtVloDiz5Dwb^R>WZ4UUC`K;P6Rec_QhIt zR5RA74ueE-7bvu%6MP^VJ#pRAlTV1Aux5EQ3fa>-6|oDJZ&E?=jjC#^nZ+0xa~Q%*ebv=f@5O-q*{h8YME8L>6mnjFyPl_o94TjQW2 z677gWUoW@kuC+g@7V`t&p>)w+|r(DPk3ChU!dEED{POVj_S<<2qIj3mK1Z;xK#Zl_8W0 z!hfXWJUmU{6pD>!V(l1iI_QaGCZ3Qfh#Z2zEjhV4(ZX~$ISe=gqvq-lh34|A=ZJhR zCe7^i(ABoe;!~g`Zhd}Zr5>%RU%-S z8Nea~nTW9v9W%`xy0BqgT!#uWa#HLUEfs~{p>QzCzU;k9Ly?%KOT*4gyFdiF>GG9z zqQ0e1C|$|sSr|ZY` zYpTqw(bGwer-^Rq?cXGt@eKiPxKtNL-0>_EuB-+;{B(UfSfa}ls_TpQw_-UN zr7@x7k|78bNNF0%5FO-HA&*}4UC%%ddb1^o_`d!@=2n+Qnf@lgSx}^49hVkCvYhaW zt^{nD)MnT?l}CIx6Vr^e+|lEbra6DtXX*%H(%)@eNoT>}coA{#7#ccs7zLVg+48zE zTcBNN8LBZ;w@2fSsioVkRw2c4#y)D8(OXl#*+R$0a_gn$Fw?Yv#GZ=GT#VrmM|dwS zi#x?>pQg~u{k{F^Ok0R39ffwoa@y08kQZx#bR1XC%Pv?T1>MYOiKAIDM>IF);{+5c 
z$lwlrqMx;op;^XXDoX%CqD8#73{GS+@S#|=@xlwMHoZ>DN=cn#)M?_1azn|Cq!)Z7 zX8GeZHCrEP{bXCvIi*NIip%N=&PGw$d?IqZH4{iTJue!?Lf5$t@x*<0$!G|1v^2d& z{&y>6j1TEEkDci;?KLwG8G5O0pwF$1&DI?i=uvY4VjM5%v)H7kGLCMJ<%SOPM_V4E zOr2Tjf|-m~AhA;QJ6gnr9YmB>gbVM+P|FaD%!{TTu1~Iv(gPlQgfTugo!pr~oR*P* z_)D+N>0^(wJe(8S@p9ZkoS>BS@@NS~>5sP1`p?apgUa9oGl)IGX+6Z~dR3h~zX!1% zamI>7pF0IYtjIXRs#hlvy0d!evK3V+JghJ&Y-CeSkvZF{5c0a3Qk^kO5!G$yC&15~ z5j|1oY{oSPg^u(lqR8uycgLszLw(&TRjt)u4Y0bS7fHkf$1mMK0TE_{R7_RJJNb@D zCt}PyDO1$kIx!^#MhysC-wK&@aZF398nkF_OhUkn@X? zwBo8EJ*nkG8Ie*}5u;@+1cm@C-5$Q2RIC&2?_%fwj zdod|DEqv%bDm>zF$``*J(WJ$nvi$3yCfgG;k?#q@TwW9-_9^vBv#v*3(w|fQ)d*79 zRPR-NQg2eG=aBMpehYct^3MB4!=_NgoSa`NCx}A;2q*Q+^S8(_Q~2tSFj`LU0U(kZ zLGW41SNib6PR<`AUtllkieZzw$%_Q+r=`y|{WvP}$6@-#wv_z5&!jSGOLLT;$=`uK zEgi=U{B_O9QT`A=0KCAbaDVwZT;8w0ZvL{fnvBzPNO?IQ%jG>J8oc2sLlKh)SWF=0 zgx-7yASw6!ua$ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" -#include "xgpu.h" - -#define N_INTS 128 - -// global variables -int DEBUG = 0; -const int n_all = 3194880; - -// to extract autocorrelation data -void auto_extract(float *output, float *specs); - -void auto_extract(float *output, float *specs) { - - int bctr = 0, idx, oidx = 0; - for (int a1=0;a1<63;a1++) { - for (int a2=0;a2<=a1;a2++) { - - if (a1==a2) { - for (int f=0;f<384;f++) { - for (int pol=0;pol<2;pol++) { - idx = 2*((bctr*384+f)*2+pol); - specs[oidx] += output[idx]; - } - oidx++; - } - } - bctr++; - - } - } - - -} - -// for extracting data -// assumes TRIANGULAR_ORDER for mat (f, baseline, pol, ri) -void simple_extract(Complex *mat, float *output); - -void simple_extract(Complex *mat, float 
*output) { - - int in_idx, out_idx; - for (int bctr=0;bctr<2080;bctr++) { - for (int pol1=0;pol1<2;pol1++) { - - for (int f=0;f<384;f++) { - - out_idx = 2*((bctr*384+f)*2+pol1); - in_idx = (2*f*2080+bctr)*4+pol1*3; - output[out_idx] = 0.5*(mat[in_idx].real + mat[in_idx+8320].real); - output[out_idx+1] = 0.5*(mat[in_idx].imag + mat[in_idx+8320].imag); - - } - } - } - -} - - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) -{ - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - - if (dada_hdu_unlock_write (out) < 0) - { - syslog(LOG_ERR, "could not unlock write on hdu_out"); - } - dada_hdu_destroy (out); - -} - -void usage() -{ - fprintf (stdout, - "dsaX_fake [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -i in_key [default TEST_BLOCK_KEY]\n" - " -o out_key [default REORDER_BLOCK_KEY2]\n" - " -h print usage\n"); -} - -// MAIN - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_wrangle", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - dada_hdu_t* hdu_out = 0; - - // data block HDU keys - key_t in_key = TEST_BLOCK_KEY; - key_t out_key = REORDER_BLOCK_KEY2; - - // command line arguments - int core = -1; - int arg = 0; - int output_specs = 0; - - while ((arg=getopt(argc,argv,"c:i:o:sdh")) != -1) - { - switch (arg) - { - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'i': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - 
break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'd': - DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - case 's': - output_specs=1; - syslog (LOG_INFO, "Will output spectra files"); - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - // Bind to cpu core - if (core >= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (0); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - char * header_out = 
ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); - uint64_t bytes_read = 0; - char * block; - uint64_t written, block_id; - Complex * cblock; - float *data = (float *)malloc(sizeof(float)*n_all); - - // spectra outputs - FILE *fout, *fmjd; - char fnam[100]; - float *specs = (float *)malloc(sizeof(float)*63*384); - float mjd; - int ctr = 0; - - // set up - - int observation_complete=0; - int blocks = 0, started = 0; - - syslog(LOG_INFO, "starting observation"); - - while (!observation_complete) { - - // open block - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - cblock = (Complex *)(block); - - if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - - if (!(fmjd = fopen("/home/ubuntu/tmp/mjd.dat","r"))) { - syslog(LOG_ERR,"could not open fmjd"); - } - fscanf(fmjd,"%f",&mjd); - fclose(fmjd); - sprintf(fnam,"/home/ubuntu/data/specs_%f.dat",mjd); - - } - - // DO STUFF - from block to summed_vis - - if (DEBUG) syslog(LOG_DEBUG,"extracting..."); - simple_extract((Complex *)(block), data); - if (DEBUG) syslog(LOG_DEBUG,"extracted!"); - - // write to file if needed - if (output_specs==1) { - - if (ctr==0) - for (int i=0;i<63*384;i++) specs[i] = 0.; - - 
auto_extract(data, specs); - ctr += 1; - - if (ctr==N_INTS) { - fout = fopen(fnam,"a"); - for (int i=0;i<63*384;i++) - fprintf(fout, "%f\n", specs[i]); - fclose(fout); - ctr=0; - } - - } - - - // write to output - written = ipcio_write (hdu_out->data_block, (char *)data, block_out); - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - if (DEBUG) { - syslog(LOG_DEBUG, "written block %d",blocks); - for (int i=0;i<10;i++) { - syslog(LOG_INFO, "%g", data[i]); - printf("%g ", data[i]); - printf("\n"); - } - } - blocks++; - - - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - free(data); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - -} - - diff --git a/src/dsaX_wrangleAndWrite.c b/src/dsaX_wrangleAndWrite.c deleted file mode 100644 index 6cd4a33..0000000 --- a/src/dsaX_wrangleAndWrite.c +++ /dev/null @@ -1,365 +0,0 @@ -/* will reorder raw data for input to xgpu */ -#define __USE_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" -#include "xgpu.h" - -// global variables -int DEBUG = 0; -const int n_all = 3194880; -const int nbl = 2080; - -// for lookup table generation -// index is position to extract from xgpu array to output (Greg-style) array -void gen_lookup(int * idx_xgpu_in_greg); -void gen_lookup(int * idx_xgpu_in_greg) { - - // get antenna order in xgpu - int xgpu_ant_1[nbl], xgpu_ant_2[nbl], ct=0; - for (int i=0;i<64;i++) { - for (int j=0;j<=i;j++) 
{ - xgpu_ant_1[ct] = j; - xgpu_ant_2[ct] = i; - ct++; - } - } - - // get antenna order in Greg - int gh_ant_1[nbl], gh_ant_2[nbl]; - ct=0; - for (int i=0;i<64;i++) { - for (int j=i;j<64;j++) { - gh_ant_1[ct] = i; - gh_ant_2[ct] = j; - ct++; - } - } - - // match antenna orders - for (int i=0;i= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header 
block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out); - uint64_t bytes_read = 0; - char * block; - uint64_t written, block_id; - Complex * cblock; - float *data = (float *)malloc(sizeof(float)*n_all); - - - // set up - - int observation_complete=0; - int blocks = 0, started = 0; - - syslog(LOG_INFO, "starting observation"); - - while (!observation_complete) { - - // open block - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - cblock = (Complex *)(block); - - if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - } - - // DO STUFF - from block to summed_vis - - if (DEBUG) syslog(LOG_DEBUG,"extracting..."); - simple_extract((Complex *)(block), data); - if (DEBUG) syslog(LOG_DEBUG,"extracted!"); - - // write to output - written = ipcio_write (hdu_out->data_block, (char *)data, block_out); - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - if (DEBUG) { - syslog(LOG_DEBUG, "written block %d",blocks); - for (int i=0;i<10;i++) { - syslog(LOG_INFO, "%g", data[i]); - printf("%g ", data[i]); - printf("\n"); - } - } - blocks++; - - - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - free(data); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - -} - - diff --git a/src/dsaX_writeFil.c b/src/dsaX_writeFil.c deleted file mode 100644 index 751db9d..0000000 --- a/src/dsaX_writeFil.c +++ /dev/null @@ -1,486 +0,0 @@ -/* This works 
pretty much like the trigger code. receives a control UDP message -to store some data for a fixed amount of time. -Message format: length(s)-NAME -Will ignore messages until data recording is over -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "multilog.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - -#include -#include - - -FILE *output; - -void send_string(char *string) /* includefile */ -{ - int len; - len=strlen(string); - fwrite(&len, sizeof(int), 1, output); - fwrite(string, sizeof(char), len, output); -} - -void send_float(char *name,float floating_point) /* includefile */ -{ - send_string(name); - fwrite(&floating_point,sizeof(float),1,output); -} - -void send_double (char *name, double double_precision) /* includefile */ -{ - send_string(name); - fwrite(&double_precision,sizeof(double),1,output); -} - -void send_int(char *name, int integer) /* includefile */ -{ - send_string(name); - fwrite(&integer,sizeof(int),1,output); -} - -void send_char(char *name, char integer) /* includefile */ -{ - send_string(name); - fwrite(&integer,sizeof(char),1,output); -} - - -void send_long(char *name, long integer) /* includefile */ -{ - send_string(name); - fwrite(&integer,sizeof(long),1,output); -} - -void send_coords(double raj, double dej, double az, double za) /*includefile*/ -{ - if ((raj != 0.0) || (raj != -1.0)) send_double("src_raj",raj); - if ((dej != 0.0) || (dej != -1.0)) send_double("src_dej",dej); - if ((az != 0.0) || (az != -1.0)) send_double("az_start",az); - if ((za != 0.0) || (za != -1.0)) send_double("za_start",za); -} - - -/* global variables */ -int quit_threads = 0; -int dump_pending = 0; -int trignum = 0; -int dumpnum = 0; -char iP[100]; 
-char srcnam[1024]; -float reclen; -int DEBUG = 0; - -void dsaX_dbgpu_cleanup (dada_hdu_t * in); -void convert_block(char * b1, char * b2); - -void usage() -{ - fprintf (stdout, - "dsaX_image [options]\n" - " -c core bind process to CPU core\n" - " -b write one beam\n" - " -f filename base [default test.fil]\n" - " -k in_key [BF_BLOCK_KEY]\n" - " -i IP to listen to [no default]\n" - " -s integrate N ints MUST BE FACTOR OF 16384 [default 1]\n" - " -m get mjd from file\n" - " -d DEBUG\n" - " -h print usage\n"); -} - -void dsaX_dbgpu_cleanup (dada_hdu_t * in) { - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - -} - -// Thread to control the dumping of data - -void control_thread (void * arg) { - - udpdb_t * ctx = (udpdb_t *) arg; - syslog(LOG_INFO, "control_thread: starting"); - - // port on which to listen for control commands - int port = WRITEVIS_CONTROL_PORT; - char sport[10]; - sprintf(sport,"%d",port); - - // buffer for incoming command strings, and setup of socket - int bufsize = 1024; - char* buffer = (char *) malloc (sizeof(char) * bufsize); - memset(buffer, '\0', bufsize); - const char* whitespace = " "; - char * command = 0; - char * args = 0; - - struct addrinfo hints; - struct addrinfo* res=0; - memset(&hints,0,sizeof(hints)); - struct sockaddr_storage src_addr; - socklen_t src_addr_len=sizeof(src_addr); - hints.ai_family=AF_INET; - hints.ai_socktype=SOCK_DGRAM; - getaddrinfo(iP,sport,&hints,&res); - int fd; - ssize_t ct; - char tmpstr; - char cmpstr = 'p'; - char *endptr; - float tmp_reclen; - - syslog(LOG_INFO, "control_thread: created socket on port %d", port); - - while (!quit_threads) { - - fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol); - bind(fd,res->ai_addr,res->ai_addrlen); - memset(buffer,'\0',sizeof(buffer)); - syslog(LOG_INFO, "control_thread: waiting for packet"); - ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len); - - 
syslog(LOG_INFO, "control_thread: received buffer string %s",buffer); - trignum++; - - // interpret buffer string - char * rest = buffer; - tmp_reclen = (float)(strtof(strtok(rest, "-"),&endptr)); - char * tmp_srcnam = strtok(NULL, "-"); - - if (!dump_pending) { - reclen = tmp_reclen; - strcpy(srcnam,tmp_srcnam); - syslog(LOG_INFO, "control_thread: received command to dump %f s for SRC %s",reclen,srcnam); - } - - if (dump_pending) - syslog(LOG_ERR, "control_thread: BACKED UP - CANNOT dump %f s for SRC %s",tmp_reclen,tmp_srcnam); - - if (!dump_pending) dump_pending = 1; - - close(fd); - - } - - free (buffer); - - if (ctx->verbose) - syslog(LOG_INFO, "control_thread: exiting"); - - /* return 0 */ - int thread_result = 0; - pthread_exit((void *) &thread_result); - -} - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_writeFil", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - /* DADA defs */ - dada_hdu_t* hdu_in = 0; - multilog_t* log = 0; - key_t in_key = BF_BLOCK_KEY; - - /* actual struct with info */ - udpdb_t udpdb; - - // command line - int arg = 0; - int core = -1; - float fch1 = 1530.0; - char fnam[300], foutnam[400]; - sprintf(fnam,"/home/dsa/alltest"); - - // for getting MJD - FILE *fmjd; - int get_mjd = 0; - int sumi=1; - int onebeam=0; - - while ((arg=getopt(argc,argv,"c:f:o:i:k:s:bmdh")) != -1) - { - switch (arg) - { - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - printf ("ERROR: -c flag requires argument\n"); - return EXIT_FAILURE; - } - case 'k': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-k flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'f': - strcpy(fnam,optarg); - break; - case 'i': - strcpy(iP,optarg); - break; - case 'd': - 
DEBUG=1; - break; - case 'b': - onebeam=1; - break; - case 'm': - get_mjd=1; - break; - case 's': - sumi = atoi(optarg); - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - // DADA stuff - - udpdb.verbose = 1; - - syslog (LOG_INFO, "dsaX_writefil: creating hdu"); - - hdu_in = dada_hdu_create (); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"dsaX_writefil: could not connect to dada buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"dsaX_writespec: could not lock to dada buffer"); - return EXIT_FAILURE; - } - - // Bind to cpu core - if (core >= 0) - { - syslog(LOG_INFO,"binding to core %d", core); - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"dsaX_writefil: failed to bind to core %d", core); - } - - int observation_complete=0; - - // more DADA stuff - deal with headers - - uint64_t header_size = 0; - - // read the headers from the input HDUs and mark as cleared - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "main: could not read next header"); - dsaX_dbgpu_cleanup (hdu_in); - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in); - return EXIT_FAILURE; - } - - - // start control thread - int rval = 0; - pthread_t control_thread_id; - syslog(LOG_INFO, "starting control_thread()"); - rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb); - if (rval != 0) { - syslog(LOG_INFO, "Error creating control_thread: %s", strerror(rval)); - return -1; - } - - // set up - int fctr = 0, integration = 0; - char tstamp[100]; - double mjd=55000.; - int rownum = 1; - int dfwrite = 0; - float mytsamp = 4.*8.*8.192e-6; - int NINTS, midx; - - // data stuff - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t 
bytes_read = 0, block_id; - char *block; - float *hoblock = (float *)malloc(sizeof(float)*64*1024*16384/sumi); - - // start things - - syslog(LOG_INFO, "dsaX_writespec: starting observation"); - int nblocks = 0; - - while (!observation_complete) { - - // read block - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - if (DEBUG) for (int i=0;i<48;i++) syslog(LOG_INFO,"%hu",((unsigned char *)(block))[i]); - - for (int i=0;i<64*1024*16384/sumi;i++) hoblock[i] = 0.; - - // for writing sum - /* for (int i=0;i<256*48;i++) oblock[i] = 0.; - for (int i=0;i<128;i++) { - for (int j=0;j<256*48;j++) oblock[j] += (float)(block[i*256*48+j]); - }*/ - - syslog(LOG_INFO,"read block %d",nblocks); - - // check for dump_pending - if (dump_pending) { - - // if file writing hasn't started - if (dfwrite==0) { - - syslog(LOG_INFO, "beginning file write for SRC %s for %f s",srcnam,reclen); - - NINTS = (int)(floor(reclen/(mytsamp*16384.))); - //NINTS = (int)(floor(reclen/(0.134217728))); - sprintf(foutnam,"%s_%s_%d_%d.fil",fnam,srcnam,fctr,nblocks); - syslog(LOG_INFO, "main: opening new file %s",foutnam); - - if (!(output = fopen(foutnam,"wb"))) { - printf("Couldn't open output file\n"); - return 0; - } - - if (get_mjd==1) { - if (!(fmjd = fopen("/home/ubuntu/tmp/mjd.dat","r"))) { - syslog(LOG_ERR,"could not open fmjd"); - } - fscanf(fmjd,"%lf",&mjd); - mjd += nblocks*4.294967296/86400.; - fclose(fmjd); - } - - - send_string("HEADER_START"); - send_string("source_name"); - send_string(srcnam); - send_int("machine_id",1); - send_int("telescope_id",82); - send_int("data_type",1); // filterbank data - send_double("fch1",1530.0); // THIS IS CHANNEL 0 :) - send_double("foff",-0.244140625); - send_int("nchans",1024); - if (sumi==1) send_int("nbits",8); - else send_int("nbits",32); - send_double("tstart",mjd); - send_double("tsamp",8.192e-6*8.*4.*sumi); - send_int("nifs",1); - send_string("HEADER_END"); - - syslog(LOG_INFO, "main: opened new file %s",foutnam); - - 
dfwrite=1; - - - } - - // write data to file - syslog(LOG_INFO,"writing"); - - - for (int i=0;i<64;i++) { - for (int j=0;j<16384/sumi;j++) { - for (int k=0;kdata_block, bytes_read); - nblocks += 1; - - } - - // close control thread - syslog(LOG_INFO, "joining control_thread"); - quit_threads = 1; - void* result=0; - pthread_join (control_thread_id, &result); - - free(hoblock); - dsaX_dbgpu_cleanup(hdu_in); - -} diff --git a/src/dsaX_writevis.c b/src/dsaX_writevis.c deleted file mode 100644 index 02cebb7..0000000 --- a/src/dsaX_writevis.c +++ /dev/null @@ -1,428 +0,0 @@ -/* This works pretty much like the trigger code. receives a control UDP message -to store some data for a fixed amount of time. -Message format: length(s)-NAME -Will ignore messages until data recording is over -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "multilog.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" -#include "fitsio.h" -#include "xgpu.h" - -/* global variables */ -int quit_threads = 0; -int dump_pending = 0; -int trignum = 0; -int dumpnum = 0; -char iP[100]; -char srcnam[1024]; -float reclen; -int DEBUG = 0; - -// assumes that only first 78 baselines are written and 384 channels and 2 pols -const int n = 9216; -float summed_vis[9216]; -const int n_all = 3194880; - -// for extracting data -// assumes TRIANGULAR_ORDER for mat (f, baseline, pol, ri) -void simple_extract(Complex *mat, float *output); - -void simple_extract(Complex *mat, float *output) { - - int in_idx, out_idx; - for (int bctr=0;bctr<2080;bctr++) { - for (int pol1=0;pol1<2;pol1++) { - - for (int f=0;f<384;f++) { - - out_idx = 2*((bctr*384+f)*2+pol1); - in_idx = (2*f*2080+bctr)*4+pol1*3; - output[out_idx] = 
0.5*(mat[in_idx].real + mat[in_idx+8320].real); - output[out_idx+1] = 0.5*(mat[in_idx].imag + mat[in_idx+8320].imag); - - } - } - } - -} - - - - -void dsaX_dbgpu_cleanup (dada_hdu_t * in); - -void usage() -{ - fprintf (stdout, - "dsaX_image [options]\n" - " -c core bind process to CPU core\n" - " -d debug [default no]\n" - " -k in_key [default XGPU_BLOCK_KEY]\n" - " -f filename base [default test.fits]\n" - " -o freq of chan 1 [default 1494.84375]\n" - " -i IP to listen to [no default]\n" - " -h print usage\n"); -} - -void dsaX_dbgpu_cleanup (dada_hdu_t * in) { - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - -} - -// Thread to control the dumping of data - -void control_thread (void * arg) { - - udpdb_t * ctx = (udpdb_t *) arg; - syslog(LOG_INFO, "control_thread: starting"); - - // port on which to listen for control commands - int port = WRITEVIS_CONTROL_PORT; - char sport[10]; - sprintf(sport,"%d",port); - - // buffer for incoming command strings, and setup of socket - int bufsize = 1024; - char* buffer = (char *) malloc (sizeof(char) * bufsize); - memset(buffer, '\0', bufsize); - const char* whitespace = " "; - char * command = 0; - char * args = 0; - - struct addrinfo hints; - struct addrinfo* res=0; - memset(&hints,0,sizeof(hints)); - struct sockaddr_storage src_addr; - socklen_t src_addr_len=sizeof(src_addr); - hints.ai_family=AF_INET; - hints.ai_socktype=SOCK_DGRAM; - getaddrinfo(iP,sport,&hints,&res); - int fd; - ssize_t ct; - char tmpstr; - char cmpstr = 'p'; - char *endptr; - float tmp_reclen; - - syslog(LOG_INFO, "control_thread: created socket on port %d", port); - - while (!quit_threads) { - - fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol); - bind(fd,res->ai_addr,res->ai_addrlen); - memset(buffer,'\0',sizeof(buffer)); - syslog(LOG_INFO, "control_thread: waiting for packet"); - ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len); - - 
syslog(LOG_INFO, "control_thread: received buffer string %s",buffer); - trignum++; - - // interpret buffer string - char * rest = buffer; - tmp_reclen = (float)(strtof(strtok(rest, "-"),&endptr)); - char * tmp_srcnam = strtok(NULL, "-"); - - if (!dump_pending) { - reclen = tmp_reclen; - strcpy(srcnam,tmp_srcnam); - syslog(LOG_INFO, "control_thread: received command to dump %f s for SRC %s",reclen,srcnam); - } - - if (dump_pending) - syslog(LOG_ERR, "control_thread: BACKED UP - CANNOT dump %f s for SRC %s",tmp_reclen,tmp_srcnam); - - if (!dump_pending) dump_pending = 1; - - close(fd); - - } - - free (buffer); - - if (ctx->verbose) - syslog(LOG_INFO, "control_thread: exiting"); - - /* return 0 */ - int thread_result = 0; - pthread_exit((void *) &thread_result); - -} - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_writevis", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - /* DADA defs */ - dada_hdu_t* hdu_in = 0; - multilog_t* log = 0; - key_t in_key = XGPU_BLOCK_KEY; - - /* actual struct with info */ - udpdb_t udpdb; - - // command line - int arg = 0; - int core = -1; - float fch1 = 1500.0; - int nchans = 384; - char fnam[300], foutnam[400]; - sprintf(fnam,"/home/ubuntu/alltest"); - - while ((arg=getopt(argc,argv,"c:f:o:i:k:dh")) != -1) - { - switch (arg) - { - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - printf ("ERROR: -c flag requires argument\n"); - return EXIT_FAILURE; - } - case 'k': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-k flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'f': - strcpy(fnam,optarg); - break; - case 'd': - DEBUG=1; - break; - case 'o': - fch1 = atof(optarg); - break; - case 'i': - strcpy(iP,optarg); - break; - case 
'h': - usage(); - return EXIT_SUCCESS; - } - } - - // DADA stuff - - udpdb.verbose = 1; - - syslog (LOG_INFO, "dsaX_writevis: creating hdu"); - - hdu_in = dada_hdu_create (); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"dsaX_writevis: could not connect to dada buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"dsaX_writevis: could not lock to dada buffer"); - return EXIT_FAILURE; - } - - // Bind to cpu core - if (core >= 0) - { - syslog(LOG_INFO,"binding to core %d", core); - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"dsaX_writevis: failed to bind to core %d", core); - } - - int observation_complete=0; - - // more DADA stuff - deal with headers - - uint64_t header_size = 0; - - // read the headers from the input HDUs and mark as cleared - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "main: could not read next header"); - dsaX_dbgpu_cleanup (hdu_in); - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in); - return EXIT_FAILURE; - } - - - // start control thread - int rval = 0; - pthread_t control_thread_id; - syslog(LOG_INFO, "starting control_thread()"); - rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb); - if (rval != 0) { - syslog(LOG_INFO, "Error creating control_thread: %s", strerror(rval)); - return -1; - } - - // set up - int fctr = 0, integration = 0; - fitsfile *fptr; - int rownum = 1; - int fwrite = 0; - int status=0; - float mytsamp = 4096*4*8.192e-6; - int NINTS; - - // data stuff - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t bytes_read = 0, block_id; - char *block; - float *data = (float *)malloc(sizeof(float)*n_all); - int si1, si2; - int nblocks = 0; - Complex * cblock; - - 
// start things - - syslog(LOG_INFO, "dsaX_writevis: starting observation"); - - while (!observation_complete) { - - // read block - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - cblock = (Complex *)(block); - - if (DEBUG) { - if (nblocks==20) { - for (int i=100;i<200;i++) { - syslog(LOG_DEBUG,"MAT %d %f %f",i,(float)(cblock[i].real),(float)(cblock[i].imag)); - } - } - } - - // DO STUFF - from block to summed_vis - - if (DEBUG) syslog(LOG_DEBUG,"extracting..."); - simple_extract((Complex *)(block), data); - for (int i=0;idata_block, bytes_read); - nblocks++; - - if (DEBUG) syslog(LOG_DEBUG,"Finished block %d",nblocks); - - } - - // close control thread - syslog(LOG_INFO, "joining control_thread"); - quit_threads = 1; - void* result=0; - pthread_join (control_thread_id, &result); - - free(data); - dsaX_dbgpu_cleanup(hdu_in); - -} diff --git a/src/dsaX_xgpu.cu b/src/dsaX_xgpu.cu deleted file mode 100644 index d065848..0000000 --- a/src/dsaX_xgpu.cu +++ /dev/null @@ -1,375 +0,0 @@ -// -*- c++ -*- -/* will run xgpu */ -/* assumes input block size is appropriate */ -#define THRUST_IGNORE_CUB_VERSION_CHECK - -#include -#include -using std::cout; -using std::cerr; -using std::endl; -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -//#include "dada_cuda.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "multilog.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_def.h" -#include "cube/cube.h" -#include "xgpu.h" - -/* global variables */ -int DEBUG = 0; - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) -{ - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on 
hdu_in"); - } - dada_hdu_destroy (in); - - if (dada_hdu_unlock_write (out) < 0) - { - syslog(LOG_ERR, "could not unlock write on hdu_out"); - } - dada_hdu_destroy (out); - -} - -// kernel for fluffing -// run with 6291456 blocks of 32 threads -__global__ void promoter(char *input, char *output) { - - int idx = blockIdx.x*32 + threadIdx.x; - char v = input[idx]; - - //output[2*idx] = ((v<<4) & 240) >> 4; - //output[2*idx+1] = v >> 4; - output[2*idx] = (char)(((unsigned char)(v) & (unsigned char)(15)) << 4) >> 4; - output[2*idx+1] = (char)(((unsigned char)(v) & (unsigned char)(240))) >> 4; - -} - -void usage() -{ -fprintf (stdout, - "dsaX_xgpu [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -i in_key [default REORDER_BLOCK_KEY]\n" - " -o out_key [default XGPU_BLOCK_KEY]\n" - " -h print usage\n"); -} - - -// MAIN - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_xgpu", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - dada_hdu_t* hdu_out = 0; - - // data block HDU keys - key_t in_key = REORDER_BLOCK_KEY; - key_t out_key = XGPU_BLOCK_KEY; - - // command line arguments - int core = -1; - int arg = 0; - - while ((arg=getopt(argc,argv,"c:i:o:dh")) != -1) - { - switch (arg) - { - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'i': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", 
optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'd': - DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - // Bind to cpu core - if (core >= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (0); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog 
(LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %d %d\n",block_size,block_out); - uint64_t bytes_read = 0; - char * block; - char * output_buffer; - output_buffer = (char *)malloc(sizeof(char)*block_out); - uint64_t written, block_id; - - - // set up xgpu - - // register input hdu with gpu - //dada_cuda_dbregister(hdu_in); - - // structures and definitions - XGPUInfo xgpu_info; - int syncOp = SYNCOP_DUMP; - int xgpu_error = 0; - xgpuInfo(&xgpu_info); - XGPUContext context; - context.array_h = NULL; - context.matrix_h = NULL; - xgpu_error = xgpuInit(&context, 0); - if(xgpu_error) { - syslog(LOG_ERR, "xGPU error %d", xgpu_error); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - ComplexInput *array_h = context.array_h; // this is pinned memory - Complex *cuda_matrix_h = context.matrix_h; - memset((char *)array_h,0,2*context.array_len); - - syslog(LOG_INFO,"Set up xgpu with input size %d output size %d",context.array_len,context.matrix_len); - - // set up data input for fluffing - char * h_din = (char *)malloc(sizeof(char)*context.array_len); - char *d_din, *d_dout; - cudaMalloc((void **)&d_din, context.array_len*sizeof(char)); - cudaMalloc((void **)&d_dout, 2*context.array_len*sizeof(char)); - - // do prestart - syslog(LOG_INFO, "pre-starting..."); - char * tmp_data = (char *)malloc(sizeof(char)*context.array_len); - memset(tmp_data, 1, context.array_len); - for (int i=0;i<10;i++) { - - cudaMemcpy(d_din, tmp_data, context.array_len*sizeof(char),cudaMemcpyHostToDevice); - promoter<<<6291456,32>>>(d_din,d_dout); - //xgpu_error = xgpuCudaXengine(&context, 
(ComplexInput *)d_dout, syncOp); - xgpu_error = xgpuCudaXengine(&context, syncOp); - xgpuClearDeviceIntegrationBuffer(&context); - - } - - free(tmp_data); - syslog(LOG_INFO, "finished with pre-start"); - - // get things started - bool observation_complete=0; - bool started = 0; - syslog(LOG_INFO, "starting observation"); - int blocks = 0; - - while (!observation_complete) { - - if (DEBUG) syslog(LOG_DEBUG,"reading block"); - - // open block - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - - // DO STUFF - - for (int myint=0;myint>>(d_din,d_dout); - //cudaMemcpy((char *)(array_h),d_dout,2*context.array_len*sizeof(char),cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); - - // run xgpu - //xgpu_error = xgpuCudaXengine(&context, (ComplexInput *)d_dout, syncOp); - xgpu_error = xgpuCudaXengine(&context, syncOp); - if(xgpu_error) { - syslog(LOG_ERR, "xGPU error %d\n", xgpu_error); - return EXIT_FAILURE; - } - - if (started==0 && blocks==20) { - syslog(LOG_INFO,"now in RUN state"); - if (DEBUG) { - for (int i=100;i<200;i++) { - syslog(LOG_DEBUG,"INPUT %hhi %hhi",array_h[i].real,array_h[i].imag); - syslog(LOG_DEBUG,"OUTPUT %g %g",(float)(cuda_matrix_h[i].real),(float)(cuda_matrix_h[i].imag)); - } - } - started=1; - } - - // clear device - xgpuClearDeviceIntegrationBuffer(&context); - - // write to output - - written = ipcio_write (hdu_out->data_block, (char *)(cuda_matrix_h), block_out); - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks); - blocks++; - - } - - // finish up - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - // finish up - free(output_buffer); - free(h_din); - cudaFree(d_din); - cudaFree(d_dout); - //dada_cuda_dbunregister(hdu_in); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); 
- -} - - diff --git a/src/dumpfil.c b/src/dumpfil.c deleted file mode 100644 index 0be913c..0000000 --- a/src/dumpfil.c +++ /dev/null @@ -1,294 +0,0 @@ -//E_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" - -// global variables -int DEBUG = 0; - -void usage() -{ - fprintf (stdout, - "dumpfil [options]\n" - " -d send debug messages to syslog\n" - " -p no header\n" - " -f file to dump to [default none]\n" - " -n blocks to dump [default 30]\n" - " -i in_key [default TEST_BLOCK_KEY]\n" - " -g ignore first block\n" - " -h print usage\n"); -} - - -void dsaX_dbgpu_cleanup (dada_hdu_t * in); - - -void dsaX_dbgpu_cleanup (dada_hdu_t * in) -{ - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - -} - -FILE *output; - -void send_string(char *string) /* includefile */ -{ - int len; - len=strlen(string); - fwrite(&len, sizeof(int), 1, output); - fwrite(string, sizeof(char), len, output); -} - -void send_float(char *name,float floating_point) /* includefile */ -{ - send_string(name); - fwrite(&floating_point,sizeof(float),1,output); -} - -void send_double (char *name, double double_precision) /* includefile */ -{ - send_string(name); - fwrite(&double_precision,sizeof(double),1,output); -} - -void send_int(char *name, int integer) /* includefile */ -{ - send_string(name); - fwrite(&integer,sizeof(int),1,output); -} - -void send_char(char *name, char integer) /* includefile */ -{ - send_string(name); - fwrite(&integer,sizeof(char),1,output); -} - - -void send_long(char *name, long integer) /* includefile */ -{ - send_string(name); - 
fwrite(&integer,sizeof(long),1,output); -} - -void send_coords(double raj, double dej, double az, double za) /*includefile*/ -{ - if ((raj != 0.0) || (raj != -1.0)) send_double("src_raj",raj); - if ((dej != 0.0) || (dej != -1.0)) send_double("src_dej",dej); - if ((az != 0.0) || (az != -1.0)) send_double("az_start",az); - if ((za != 0.0) || (za != -1.0)) send_double("za_start",za); -} - - - -// MAIN - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dumpfil", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - - // data block HDU keys - key_t in_key = 0x0000aaae; - - // command line arguments - char fnam[100]; - sprintf(fnam,"/home/ubuntu/dumpfil.fil"); - int nbl = 30; - int arg = 0; - int nhd = 0; - int igblock = 0; - - while ((arg=getopt(argc,argv,"f:i:n:pdgh")) != -1) - { - switch (arg) - { - case 'i': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'f': - if (optarg) - { - strcpy(fnam,optarg); - break; - } - else - { - syslog(LOG_ERR,"-f flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'n': - if (optarg) - { - nbl = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-n flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'p': - nhd=1; - syslog (LOG_INFO, "Will not write a header"); - break; - case 'g': - igblock=1; - syslog (LOG_INFO, "Will ignore first block"); - break; - case 'd': - DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - syslog(LOG_INFO,"will use %d blocks",nbl); - - // DADA stuff - - syslog (LOG_INFO, "creating in hdus"); - - hdu_in = 
dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in); - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in); - return EXIT_FAILURE; - } - - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - syslog(LOG_INFO, "main: have input block size %lu\n",block_size); - uint64_t bytes_read = 0; - uint64_t npackets = 1; - char * block, * output_buffer; - uint64_t written, block_id; - - // fill output buffer if file exists - output=fopen(fnam,"wb"); - if(output == NULL) - { - syslog(LOG_ERR,"Error opening file"); - exit(1); - } - - if (!nhd) { - send_string("HEADER_START"); - send_string("source_name"); - send_string("TESTSRC"); - send_int("machine_id",1); - send_int("telescope_id",82); - send_int("data_type",1); // filterbank data - send_double("fch1",1530.0); // THIS IS CHANNEL 0 :) - send_double("foff",-0.244140625); - send_int("nchans",1024); - send_int("nbits",8); - send_double("tstart",55000.0); - send_double("tsamp",8.192e-6*8.*16.); - send_int("nifs",1); - send_string("HEADER_END"); - } - - int observation_complete=0; - int blocks = 0, started = 0; - - syslog(LOG_INFO, "starting observation"); - - - while (blocks < nbl) { - - // open block - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - 
- if (!igblock || started!=0) { - fwrite(block, sizeof(char), bytes_read, output); - blocks++; - } - - if (started==0) started=1; - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - fclose(output); - dsaX_dbgpu_cleanup (hdu_in); - -} diff --git a/src/fil2dada.c b/src/fil2dada.c deleted file mode 100644 index c49f2b5..0000000 --- a/src/fil2dada.c +++ /dev/null @@ -1,521 +0,0 @@ -//E_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -//#include "ascii_header.h" -//#include "dsaX_capture.h" -//#include "dsaX_def.h" - -// global variables -int DEBUG = 0; - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); -int dada_bind_thread_to_core (int core); - -/* read fil file header variables */ -char rawdatafile[80], source_name[80]; -int machine_id, telescope_id, data_type, nchans, nbits, nifs, scan_number, - barycentric,pulsarcentric; /* these two added Aug 20, 2004 DRL */ -double tstart,mjdobs,tsamp,fch1,foff,refdm,az_start,za_start,src_raj,src_dej; -double gal_l,gal_b,header_tobs,raw_fch1,raw_foff; -int nbeams, ibeam; -/* added 20 December 2000 JMC */ -double srcl,srcb; -double ast0, lst0; -long wapp_scan_number; -char project[8]; -char culprits[24]; -double analog_power[2]; -/* added frequency table for use with non-contiguous data */ -double frequency_table[4096]; /* note limited number of channels */ -long int npuls; /* added for binary pulse profile format */ - - -int nbins; -double period; - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) -{ - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - - if 
(dada_hdu_unlock_write (out) < 0) - { - syslog(LOG_ERR, "could not unlock write on hdu_out"); - } - dada_hdu_destroy (out); - -} - -/* -void get_string(FILE *inputfile, int *nbytes, char string[]) -{ - int nchar; - size_t nRead; - strcpy(string,"ERROR"); - nRead = fread(&nchar, sizeof(int), 1, inputfile); - if (feof(inputfile)) exit(0); - if (nchar>80 || nchar<1) return; - *nbytes=sizeof(int); - nRead = fread(string, nchar, 1, inputfile); - string[nchar]='\0'; - *nbytes+=nchar; -} -*/ - -int read_header(FILE *inputfile); -/* -int read_header(FILE *inputfile) -{ - size_t nRead; - char string[80], message[80]; - int itmp,nbytes,totalbytes,expecting_rawdatafile=0,expecting_source_name=0; - int expecting_frequency_table=0,channel_index; - - - - get_string(inputfile,&nbytes,string); - if (!strcmp(string,"HEADER_START")) - rewind(inputfile); - return 0; - } - totalbytes=nbytes; - - while (1) { - get_string(inputfile,&nbytes,string); - if (strcmp(string,"HEADER_END")) break; - totalbytes+=nbytes; - if (strcmp(string,"rawdatafile")) { - expecting_rawdatafile=1; - } else if (strcmp(string,"source_name")) { - expecting_source_name=1; - } else if (strcmp(string,"FREQUENCY_START")) { - expecting_frequency_table=1; - channel_index=0; - } else if (strcmp(string,"FREQUENCY_END")) { - expecting_frequency_table=0; - } else if (strcmp(string,"az_start")) { - nRead = fread(&az_start,sizeof(az_start),1,inputfile); - totalbytes+=sizeof(az_start); - } else if (strcmp(string,"za_start")) { - nRead = fread(&za_start,sizeof(za_start),1,inputfile); - totalbytes+=sizeof(za_start); - } else if (strcmp(string,"src_raj")) { - nRead = fread(&src_raj,sizeof(src_raj),1,inputfile); - totalbytes+=sizeof(src_raj); - } else if (strcmp(string,"src_dej")) { - nRead = fread(&src_dej,sizeof(src_dej),1,inputfile); - totalbytes+=sizeof(src_dej); - } else if (strcmp(string,"tstart")) { - nRead = fread(&tstart,sizeof(tstart),1,inputfile); - totalbytes+=sizeof(tstart); - } else if (strcmp(string,"tsamp")) { - 
nRead = fread(&tsamp,sizeof(tsamp),1,inputfile); - totalbytes+=sizeof(tsamp); - } else if (strcmp(string,"period")) { - nRead = fread(&period,sizeof(period),1,inputfile); - totalbytes+=sizeof(period); - } else if (strcmp(string,"fch1")) { - nRead = fread(&fch1,sizeof(fch1),1,inputfile); - totalbytes+=sizeof(fch1); - } else if (strcmp(string,"fchannel")) { - nRead = fread(&frequency_table[channel_index++],sizeof(double),1,inputfile); - totalbytes+=sizeof(double); - fch1=foff=0.0; - } else if (strcmp(string,"foff")) { - nRead = fread(&foff,sizeof(foff),1,inputfile); - totalbytes+=sizeof(foff); - } else if (strcmp(string,"nchans")) { - nRead = fread(&nchans,sizeof(nchans),1,inputfile); - totalbytes+=sizeof(nchans); - } else if (strcmp(string,"telescope_id")) { - nRead = fread(&telescope_id,sizeof(telescope_id),1,inputfile); - totalbytes+=sizeof(telescope_id); - } else if (strcmp(string,"machine_id")) { - nRead = fread(&machine_id,sizeof(machine_id),1,inputfile); - totalbytes+=sizeof(machine_id); - } else if (strcmp(string,"data_type")) { - nRead = fread(&data_type,sizeof(data_type),1,inputfile); - totalbytes+=sizeof(data_type); - } else if (strcmp(string,"ibeam")) { - nRead = fread(&ibeam,sizeof(ibeam),1,inputfile); - totalbytes+=sizeof(ibeam); - } else if (strcmp(string,"nbeams")) { - nRead = fread(&nbeams,sizeof(nbeams),1,inputfile); - totalbytes+=sizeof(nbeams); - } else if (strcmp(string,"nbits")) { - nRead = fread(&nbits,sizeof(nbits),1,inputfile); - totalbytes+=sizeof(nbits); - } else if (strcmp(string,"barycentric")) { - nRead = fread(&barycentric,sizeof(barycentric),1,inputfile); - totalbytes+=sizeof(barycentric); - } else if (strcmp(string,"pulsarcentric")) { - nRead = fread(&pulsarcentric,sizeof(pulsarcentric),1,inputfile); - totalbytes+=sizeof(pulsarcentric); - } else if (strcmp(string,"nbins")) { - nRead = fread(&nbins,sizeof(nbins),1,inputfile); - totalbytes+=sizeof(nbins); - } else if (strcmp(string,"nsamples")) { - nRead = 
fread(&itmp,sizeof(itmp),1,inputfile); - totalbytes+=sizeof(itmp); - } else if (strcmp(string,"nifs")) { - nRead = fread(&nifs,sizeof(nifs),1,inputfile); - totalbytes+=sizeof(nifs); - } else if (strcmp(string,"npuls")) { - nRead = fread(&npuls,sizeof(npuls),1,inputfile); - totalbytes+=sizeof(npuls); - } else if (strcmp(string,"refdm")) { - nRead = fread(&refdm,sizeof(refdm),1,inputfile); - totalbytes+=sizeof(refdm); - } else if (expecting_rawdatafile) { - strcpy(rawdatafile,string); - expecting_rawdatafile=0; - } else if (expecting_source_name) { - strcpy(source_name,string); - expecting_source_name=0; - } else { - sprintf(message,"read_header - unknown parameter: %s\n",string); - fprintf(stderr,"ERROR: %s\n",message); - exit(1); - } - } - - - totalbytes+=nbytes; - - return totalbytes; -} -*/ - -void usage() -{ - fprintf (stdout, - "dsaX_fake [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -f file to read packet from [default none]\n" - " -i in_key [default TEST_BLOCK_KEY]\n" - " -o out_key [default REORDER_BLOCK_KEY2]\n" - " -n will not read header\n" - " -b number of blocks to stop after\n" - " -h print usage\n"); -} - -// MAIN - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_fake", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - dada_hdu_t* hdu_out = 0; - - // data block HDU keys - key_t in_key = 0x0000dada; - key_t out_key = 0x0000caca; - - // command line arguments - int core = -1; - int useZ = 1; - char fnam[100]; - int arg = 0; - int rhead = 1; - int nblocks = -1; - - while ((arg=getopt(argc,argv,"c:f:i:o:nb:dh")) != -1) - { - switch (arg) - { - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'i': 
- if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'f': - if (optarg) - { - useZ = 0; - strcpy(fnam,optarg); - break; - } - else - { - syslog(LOG_ERR,"-f flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'b': - if (optarg) - { - nblocks = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-b flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'd': - DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - case 'n': - rhead=0; - syslog (LOG_INFO, "Will not read header"); - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - // Bind to cpu core - if (core >= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (0); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - 
uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); - uint64_t bytes_read = 0; - uint64_t npackets = 1; - char * block, * output_buffer; - char * packet; - packet = (char *)malloc(sizeof(char)*block_size); - output_buffer = (char *)malloc(sizeof(char)*block_out); - memset(output_buffer,0,block_out); - uint64_t written, block_id; - - // fill output buffer if file exists - FILE *fin; - if (!useZ) { - - if (!(fin=fopen(fnam,"rb"))) { - syslog(LOG_ERR, "cannot open file - will write zeros"); - } - else { - - // DMH: FIXME - //if (rhead) read_header(fin); - - // fread(packet,block_out,1,fin); - // fclose(fin); - - // syslog(LOG_INFO,"Read packet, npackets %llu",npackets); - - // for (int i=0;idata_block, &bytes_read, &block_id); - - if 
(started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - } - - // DO STUFF - // no need to do anything here - output_buffer is ready to go - - // fread goes here - // count blocks, increment, stop loop and reopen file (or rewind) - - // write to output - written = ipcio_write (hdu_out->data_block, packet, block_out); - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - if (DEBUG) { - syslog(LOG_DEBUG, "written block %d",blocks); - } - blocks++; - - if (blocks==nblocks) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - fclose(fin); - free(packet); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - -} diff --git a/src/flagger.c b/src/flagger.c deleted file mode 100644 index 5262015..0000000 --- a/src/flagger.c +++ /dev/null @@ -1,484 +0,0 @@ -#define __USE_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" - -#define NTIMES_P 4096 // # of time samples (assuming 1ms sampling period) -#define NCHAN_P 1024 // # of channels on BF node side -#define NBEAMS_P 64 // # of beams on BF side -#define M_P NTIMES_P -#define N_P 32 -#define HDR_SIZE 4096 -#define BUF_SIZE NTIMES_P*NCHAN_P*NBEAMS_P // size of TCP packet - -// global variables -int DEBUG = 0; -double skarray[NBEAMS_P*NCHAN_P+1]; // array with SK values -- size NCHANS * NBEAMS -double avgspec[NBEAMS_P*NCHAN_P+1]; // spectrum over all beams to estimate median filter -double baselinecorrec[NBEAMS_P*NCHAN_P+1]; // spectrum over all beams to estimate 
median filter -int cores[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25}; - -void swap(char *p,char *q) { - char t; - - t=*p; - *p=*q; - *q=t; -} - -double medval(double a[],int n) { - int i,j; - char tmp[n]; - for (i = 0;i < n;i++) - tmp[i] = a[i]; - - for(i = 0;i < n-1;i++) { - for(j = 0;j < n-i-1;j++) { - if(tmp[j] > tmp[j+1]) - swap(&tmp[j],&tmp[j+1]); - } - } - return tmp[(n+1)/2-1]; -} - -/* THREAD FUNCTION */ - -struct data { - unsigned char * indata; - double * inSK; - unsigned char * output; - int cnt; - double nThreshUp; - int n_threads; - int thread_id; - int debug; -}; - -void noise_inject(void *args) { - - struct data *d = args; - int thread_id = d->thread_id; - int dbg = d->debug; - // set affinity - const pthread_t pid = pthread_self(); - const int core_id = cores[thread_id]; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); - const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); - if (CPU_ISSET(core_id, &cpuset)) - if (dbg) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id); - - - // noise injection - - unsigned char *indata = (unsigned char *)d->indata; - double *inSK = (double *)d->inSK; - unsigned char *output = (unsigned char *)d->output; - int * cnt = (int *)d->cnt; - double nThreshUp = (double)d->nThreshUp; - int nthreads = d->n_threads; - int i, j, k; - - // copy from input to output - //memcpy(output,indata,(NBEAMS_P/nthreads)*NTIMES_P*NCHAN_P); - - //cnt[thread_id] = 0; - - for (i = 0; i < (int)(NBEAMS_P/nthreads); i++){ - for (k = 0; k < NCHAN_P; k++){ - if (inSK[i*(int)(NCHAN_P) + k] > nThreshUp){ - cnt[thread_id]++; - //if (dbg) syslog(LOG_DEBUG,"thread %d: flagging %d %d: sk %g",thread_id,i,k,inSK[i*(int)(NCHAN_P) 
+ k]); - //for (j = 0; j < NTIMES_P; j++){ - //output[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = (unsigned char)(20. * rand() / ( (double)RAND_MAX ) + 10.); - //indata[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = (unsigned char)(20. * 1. / ( (double)RAND_MAX ) + 10.); - //} - - // copy from lookup table - for (j = 0; j < NTIMES_P; j++) - indata[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = output[k*NTIMES_P+j]; - - } - /*else{ - for (j = 0; j < NTIMES_P; j++){ - output[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = indata[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k]; - } - }*/ - } - } - - - - if (dbg) syslog(LOG_DEBUG,"thread %d: done - freeing",thread_id); - int thread_result = 0; - pthread_exit((void *) &thread_result); -} - -/* END THREAD FUNCTION */ - -void usage() -{ - fprintf (stdout, - "flagger [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -i in_key [default dada]\n" - " -o out_key [default caca]\n" - " -n use noise generation rather than zeros\n" - " -t SK threshold [default 5.0]\n" - " -b compute and apply baseline correction\n" - " -h print usage\n"); -} - - -int main(int argc, char**argv) -{ - - // syslog start - openlog ("flagger", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - // threads initialization - int nthreads = 16; - pthread_t threads[nthreads]; - pthread_attr_t attr; - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - void* result=0; - - // read command line args - - // data block HDU keys - key_t in_key = 0x0000dada; - key_t out_key = 0x0000caca; - - // command line arguments - int core = -1; - int arg = 0; - int noise = 0; - double skthresh = 5.0; - int bcorr = 0; - - while ((arg=getopt(argc,argv,"c:t:i:o:bndh")) != -1) - { - switch (arg) - { - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - 
usage(); - return EXIT_FAILURE; - } - case 'i': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 't': - if (optarg) - { - skthresh = atof(optarg); - syslog(LOG_INFO,"modified SKTHRESH to %g",skthresh); - break; - } - else - { - syslog(LOG_ERR,"-t flag requires argument"); - usage(); - return EXIT_FAILURE; - } - - case 'd': - DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - case 'n': - noise=1; - syslog (LOG_INFO, "Will generate noise samples"); - break; - case 'b': - bcorr=1; - syslog (LOG_INFO, "Will calculate and apply baseline correction"); - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - // Bind to cpu core - if (core >= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // CONNECT AND READ FROM BUFFER - - dada_hdu_t* hdu_in = 0; // header and data unit - uint64_t blocksize = NTIMES_P*NCHAN_P*NBEAMS_P; // size of buffer - hdu_in = dada_hdu_create (); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to input buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to input buffer"); - return EXIT_FAILURE; - } - - uint64_t header_size = 0; - // read the header from the input HDU - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - - // mark the input header as cleared - if 
(ipcbuf_mark_cleared (hdu_in->header_block) < 0){ - syslog (LOG_ERR,"could not mark header as cleared"); - return EXIT_FAILURE; - } - - uint64_t block_id, bytes_read = 0; - unsigned char *in_data; - char *cin_data; - - // OUTPUT BUFFER - dada_hdu_t* hdu_out = 0; - hdu_out = dada_hdu_create (); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"flagged_data: could not connect to dada buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write (hdu_out) < 0) { - syslog (LOG_ERR,"flagged_data: could not lock to dada buffer"); - return EXIT_FAILURE; - } - - /* //read fake header for now - char head_dada[4096]; - FILE *f = fopen("/home/dsa/dsa110-xengine/src/correlator_header_dsaX.txt", "rb"); - fread(head_dada, sizeof(char), 4096, f); - fclose(f); */ - - //// OUTPUT BUFFER - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - header_size = HDR_SIZE; - if (!header_out) - { - syslog(LOG_ERR,"couldn't read header_out"); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - return EXIT_FAILURE; - } - uint64_t written=0; - - //////////////// - - double S1 = 0; - double S2 = 0; - double sampval; - double nThreshUp = skthresh; // Threshold to apply to SK (empirical estimation) - struct data args[16]; - int * flag_counts = (int *)malloc(sizeof(int)*nthreads); - //unsigned char * output = (unsigned char *)malloc(sizeof(char)*NBEAMS_P*NCHAN_P*NTIMES_P); - int nFiltSize = 21; - int cnt = 0; - - // make array of random numbers - unsigned char * lookup_rand = (unsigned char *)malloc(sizeof(unsigned char)*NTIMES_P*NCHAN_P); - for (int i=0;idata_block, &bytes_read, &block_id); - in_data = (unsigned char *)(cin_data); - - // compute SK and averaged spectrum - S1 = 0; - S2 = 0; - sampval = 0; - - for (int i = 0; i < NBEAMS_P; i++){ - for (int k = 0; k < NCHAN_P; 
k++){ - for (int j = 0; j < NTIMES_P; j++){ - sampval = (double)in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k]; - avgspec[i*(int)(NCHAN_P) + k] += sampval / NTIMES_P; - S1 += sampval; - S2 += sampval * sampval; - skarray[i*(int)(NCHAN_P) + k] = (double)((M_P*N_P+1) / (M_P-1) * ( (M_P*S2)/(S1*S1) - 1 )); - } - S1 = 0; - S2 = 0; - } - } - if (DEBUG) syslog (LOG_DEBUG,"has computed SK."); - if (DEBUG) syslog(LOG_DEBUG,"example SK value : %g", (double)skarray[10]); - - // compute baseline correction - if (bcorr) { - for (int i = 0; i < NBEAMS_P*NCHAN_P-nFiltSize; i++) - baselinecorrec[i] = medval(&avgspec[i],nFiltSize); - } - - - // compare SK values to threshold and - // replace thresholded channels with noise or 0 - - if (noise){ - - for (int i=0;i nThreshUp){ - cnt++; - for (int j = 0; j < NTIMES_P; j++){ - in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = 0; - } - } - } - } - } - syslog (LOG_INFO,"%d channels*baselines flagged",cnt); - - // apply baseline correction - if (bcorr) { - for (int i = 0; i < NBEAMS_P; i++){ - for (int k = 0; k < NCHAN_P; k++){ - for (int j = 0; j < NTIMES_P; j++){ - //in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = (unsigned char)(in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] / (unsigned char)baselinecorrec[i*(int)NCHAN_P+k]); - in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = (unsigned char)((double)(in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k]) / baselinecorrec[i*(int)NCHAN_P+k]); - } - } - } - - syslog (LOG_DEBUG,"baseline correction applied"); - } - - // close block after reading - ipcio_close_block_read (hdu_in->data_block, bytes_read); - if (DEBUG) syslog(LOG_DEBUG,"closed read block"); - - written = ipcio_write (hdu_out->data_block, (char *)(in_data), BUF_SIZE); - if (written < BUF_SIZE) - { - syslog(LOG_ERR,"write error"); - return EXIT_FAILURE; - } - - if (DEBUG) syslog (LOG_DEBUG,"write flagged data done."); - - - } - - free(lookup_rand); - return 0; -} diff --git a/src/gpu_flagger.cu 
b/src/gpu_flagger.cu deleted file mode 100644 index 07e6f5c..0000000 --- a/src/gpu_flagger.cu +++ /dev/null @@ -1,1547 +0,0 @@ -// -*- c++ -*- -/*#include -#include -#include -#include -1;95;0c#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -*/ -#include -#include -using std::cout; -using std::cerr; -using std::endl; -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "multilog.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" - -#include -#include - - -#define NTIMES_P 16384 // # of time samples (assuming 1ms sampling period) -#define NCHAN_P 1024 // # of channels on BF node side -#define NBEAMS_P 64 // # of beams on BF side -#define M_P NTIMES_P -#define N_P 32 -#define HDR_SIZE 4096 -#define BUF_SIZE NTIMES_P*NCHAN_P*NBEAMS_P // size of TCP packet -#define NTHREADS_GPU 32 -#define MN 48.0 -#define SIG 6.0 -#define RMAX 16384 -//#define NPERMFLAGS 58 -#define NPERMFLAGS 1 -#define TBIN 128 -#define FBIN 8 - -// global variables -int DEBUG = 0; -//int flagchannels[58] = {737,738,753,754,721,722,723,724,725,726,727,728,729,627,628,629,630,631,632,633,634,603,604,605,606,607,608,609,610,578,579,580,581,582,583,584,585,590,591,592,593,594,595,596,597,598,680,681,682,683,684,685,686,687,688,327,328,329}; -int flagchannels[1] = {10}; -/* global variables */ -int quit_threads = 0; -int dump_pending = 0; -int trignum = 0; -char iP[100]; -char footer_buf[1024]; 
-char flnam[1024]; -int dumpbm; - -// structure for pulse injection -typedef struct { - - int verbose; - float * block; - -} dsaX_pulse_t; - - - - -// kernel to calculate median spectrum -// only works on =naver && thread_id<2*naver) { - - tid=thread_id-naver; - vec[thread_id] = v0[tid*NBEAMS_P*NCHAN_P + block_id]; - - } - - __syncthreads(); - - if (thread_id=naver && thread_id<2*naver) { - for (int i=naver;i<2*naver;i++) { - if (i!=thread_id) { - if (vec[i]<=vec[thread_id]) ct_lt++; - } - } - } - - __syncthreads(); - - - if (thread_id=naver && thread_id<2*naver) - if (ct_lt==place) v0[block_id] = vec[thread_id]; - -} - -// kernel to calculate mean spectrum -// launch with NBEAMS_P*NCHAN_P blocks of NTHREADS_GPU threads -__global__ -void calc_spectrum(unsigned char *data, float * spectrum) { - - int block_id = blockIdx.x; - int thread_id = threadIdx.x; - __shared__ float csum[NTHREADS_GPU]; - csum[thread_id] = 0.; - - int bm =(int)( block_id/NCHAN_P); - int ch = (int)(block_id % (NCHAN_P)); - int tm0 = (int)(thread_id*(NTIMES_P/NTHREADS_GPU)); - - // find sum of local times - int idx0 = bm*NTIMES_P*NCHAN_P + tm0*NCHAN_P + ch; - for (int tm=0; tm0) { - csum[thread_id] += csum[thread_id+act_maxn]; - act_maxn = (int)(act_maxn/2); - } - } - */ - - if (thread_id==0) { - spectrum[bm*NCHAN_P+ch] = csum[thread_id] / (1.*NTIMES_P); - } - -} - - -// kernel to calculate variance spectrum -// launch with NBEAMS_P*NCHAN_P blocks of NTHREADS_GPU threads -__global__ -void calc_varspec(unsigned char *data, float * spectrum, float * varspec) { - - int block_id = blockIdx.x; - int thread_id = threadIdx.x; - __shared__ float csum[NTHREADS_GPU]; - csum[thread_id] = 0.; - - int bm =(int)( block_id/NCHAN_P); - int ch = (int)(block_id % (NCHAN_P)); - int tm0 = (int)(thread_id*(NTIMES_P/NTHREADS_GPU)); - float val; - - // find sum of local times - int idx0 = bm*NTIMES_P*NCHAN_P + tm0*NCHAN_P + ch; - for (int tm=0; tm0) { - csum[thread_id] += csum[thread_id+act_maxn]; - act_maxn = 
(int)(act_maxn/2); - } - }*/ - - if (thread_id==0) { - varspec[bm*NCHAN_P+ch] = csum[thread_id] / (1.*NTIMES_P); - } - -} - -// kernel to calculate maximum value -// launch with NBEAMS_P*NCHAN_P blocks of NTHREADS_GPU threads -__global__ -void calc_maxspec(unsigned char *data, float * maxspec) { - - int block_id = blockIdx.x; - int thread_id = threadIdx.x; - __shared__ float csum[NTHREADS_GPU]; - csum[thread_id] = 0.; - - int bm =(int)( block_id/NCHAN_P); - int ch = (int)(block_id % (NCHAN_P)); - int tm0 = (int)(thread_id*(NTIMES_P/NTHREADS_GPU)); - float val=0.; - - // find max of local times - int idx0 = bm*NTIMES_P*NCHAN_P + tm0*NCHAN_P + ch; - for (int i=idx0;ival) val = (float)(data[i]); - } - csum[thread_id] = val; - - __syncthreads(); - - // sum into shared memory - int maxn = NTHREADS_GPU/2; - int act_maxn = maxn; - if (thread_id0) { - if (csum[thread_id]val) val = vv; - } - csum[thread_id] = val; - - __syncthreads(); - - // sum into shared memory - int maxn = NTHREADS_GPU/2; - int act_maxn = maxn; - float v1; - if (thread_id0) { - if (csum[thread_id]0) { - if (csum[thread_id]>csum[thread_id+act_maxn]) - csum[thread_id]=csum[thread_id+act_maxn]; - act_maxn = (int)(act_maxn/2); - } - } - if (thread_id==0) - ppspec[bm*NCHAN_P+ch] = v1-csum[thread_id]; - -} - - -// kernel to scale data -// launch with NBEAMS_P*NTIMES_P*NCHAN_P/NTHREADS_GPU blocks of NTHREADS_GPU threads -__global__ -void scaley(unsigned char *data, float *spectrum, float *varspec) { - - int idx = blockIdx.x*NTHREADS_GPU + threadIdx.x; - int bm = (int)(idx / (NTIMES_P*NCHAN_P)); - int ch = (int)(idx % NCHAN_P); - int spidx = bm*NCHAN_P+ch; - - float val = (float)(data[idx]); - val = (val-spectrum[spidx])*(SIG/sqrtf(varspec[spidx])) + MN; - data[idx] = (unsigned char)((__float2uint_rn(2.*val))/2); - - -} - -// kernel to add pulse to data -// launch with NBEAMS_P*NTIMES_P*NCHAN_P/NTHREADS_GPU blocks of NTHREADS_GPU threads -__global__ -void sumpulse(unsigned char *data, float *summand) { - - int 
idx = blockIdx.x*NTHREADS_GPU + threadIdx.x; - float val = (float)(data[idx]); - val += summand[idx]; - data[idx] = (unsigned char)((__float2uint_rn(2.*val))/2); - -} - - - - -// kernel to make time series from data -// run with NBEAMS_P*NTIMES_P blocks of 32 threads -__global__ -void make_ts(unsigned char *data, float *ts) { - - int block_id = blockIdx.x; - int thread_id = threadIdx.x; - int idx = blockIdx.x*NTHREADS_GPU + threadIdx.x; - int bm = (int)(blockIdx.x/NTIMES_P); - int tm = (int)(blockIdx.x % NTIMES_P); - int ch0 = (int)(thread_id*(NCHAN_P/NTHREADS_GPU)); - - __shared__ float csum[NTHREADS_GPU]; - csum[thread_id] = 0.; - - // find sum of local chans - int idx0 = bm*NTIMES_P*NCHAN_P + tm*NCHAN_P + ch0; - for (int ch=0; chthresh) mask[i] = 1; - } - -} - - - -float medval(float *a,int n); - -float medval(float *a,int n) { - int i,j; - float tmp[n], tt; - for (i = 0;i < n;i++) - tmp[i] = a[i]; - - for(i = 0;i < n-1;i++) { - for(j = 0;j < n-i-1;j++) { - if(tmp[j] > tmp[j+1]) { - - tt = tmp[j+1]; - tmp[j+1] = tmp[j]; - tmp[j] = tt; - - } - } - } - - return tmp[(int)((n+1)/2-1)]; -} - -void channflag(float* spec, float Thr, int * mask); -void simple_channflag(float* spec, float Thr, int * mask); -void simple_tsflag(float* ts, float Thr, int * mask); - -void simple_channflag(float* spec, float Thr, int * mask) { - - int i, j; - float* medspec; // median values for each beam spectrum - float* madspec; // mad for each beam spectrum - float* normspec; // corrected spec - median value (for MAD calculation) - - medspec = (float *)malloc(sizeof(float)*NBEAMS_P); - madspec = (float *)malloc(sizeof(float)*NBEAMS_P); - normspec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); - - int ZeroChannels = 128; - int nFilt, idx; - - // calculate median value for each beam - for (i = 0; i < NBEAMS_P; i++) - medspec[i] = medval(spec + i*NCHAN_P + ZeroChannels,NCHAN_P-2*ZeroChannels); - - // compute MAD for each beam - for (i = 0; i < NBEAMS_P; i++){ - for (j = ZeroChannels; j < 
NCHAN_P-ZeroChannels; j++){ - normspec[j-ZeroChannels] = fabs(spec[i*NCHAN_P+j]-medspec[i]); - } - madspec[i] = medval(normspec,NCHAN_P-2*ZeroChannels); - } - - // mask - float vv; - float mythr = Thr/sqrt(1.*FBIN); - for (i = 0; i < NBEAMS_P; i++){ - - // implement FBIN - for (j = ZeroChannels; j < NCHAN_P-ZeroChannels-FBIN; j++) { - vv = 0.; - for (int k=0;k mythr*madspec[i]) mask[i*NCHAN_P+j] = 1; - - } - - } - - free(medspec); - free(madspec); - free(normspec); - -} - -void simple_tsflag(float* spec, float Thr, int * mask) { - - int i, j; - float* medspec; // median values for each beam spectrum - float* madspec; // mad for each beam spectrum - float* normspec; // corrected spec - median value (for MAD calculation) - - medspec = (float *)malloc(sizeof(float)*NBEAMS_P); - madspec = (float *)malloc(sizeof(float)*NBEAMS_P); - normspec = (float *)malloc(sizeof(float)*NBEAMS_P*NTIMES_P); - - int nFilt, idx; - - // calculate median value for each beam - for (i = 0; i < NBEAMS_P; i++) - medspec[i] = medval(spec + i*NTIMES_P,NTIMES_P/16); - - // compute MAD for each beam - for (i = 0; i < NBEAMS_P; i++){ - for (j = 0; j < NTIMES_P/16; j++){ - normspec[j] = fabs(spec[i*NTIMES_P+j]-medspec[i]); - } - madspec[i] = medval(normspec,NTIMES_P/16); - } - - // mask - float vv; - float mythr = Thr; - for (i = 0; i < NBEAMS_P; i++){ - - for (j = 0; j < NTIMES_P; j++) { - - vv = spec[i*NTIMES_P+j]-medspec[i]; - if (vv > mythr*madspec[i]) mask[i*NTIMES_P+j] = 1; - - } - - } - - free(medspec); - free(madspec); - free(normspec); - -} - - -void channflag(float* spec, float Thr, int * mask) { - - int i, j; - float* baselinecorrec; // baseline correction - float* CorrecSpec; // corrected spectrum - float* medspec; // median values for each beam spectrum - float* madspec; // mad for each beam spectrum - float* normspec; // corrected spec - median value (for MAD calculation) - - baselinecorrec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); - CorrecSpec = (float 
*)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); - medspec = (float *)malloc(sizeof(float)*NBEAMS_P); - madspec = (float *)malloc(sizeof(float)*NBEAMS_P); - normspec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); - - - int ZeroChannels = 128; - int nFiltSize = 21; - int nFilt, idx; - - // calculate median filtered spectrum - for (i=0;i=nFiltSize) - CorrecSpec[i*NCHAN_P+j] = spec[i*NCHAN_P+j] - medval(spec + i*NCHAN_P+j,nFiltSize); - else - CorrecSpec[i*NCHAN_P+j] = spec[i*NCHAN_P+j] - medval(spec + i*NCHAN_P+NCHAN_P-ZeroChannels-nFiltSize,nFiltSize); - - } - } - - // calculate median value for each beam - for (i = 0; i < NBEAMS_P; i++) - medspec[i] = medval(CorrecSpec + i*NCHAN_P + ZeroChannels,NCHAN_P-2*ZeroChannels); - - // compute MAD for each beam - for (i = 0; i < NBEAMS_P; i++){ - for (j = ZeroChannels; j < NCHAN_P-ZeroChannels; j++){ - normspec[j-ZeroChannels] = fabs(CorrecSpec[i*NCHAN_P+j]-medspec[i]); - } - madspec[i] = medval(normspec,NCHAN_P-2*ZeroChannels); - } - - // mask - for (i = 0; i < NBEAMS_P; i++){ - for (j = ZeroChannels; j < NCHAN_P-ZeroChannels; j++){ - if (CorrecSpec[i*NCHAN_P+j] > Thr * madspec[i] || CorrecSpec[i*NCHAN_P+j] < - Thr * madspec[i]) - mask[i*NCHAN_P+j] = 1; - - // for permanent flagging - for (int kk=0;kk arr[(j+1)*stride+chan]) { - - tt = arr[(j+1)*stride+chan]; - arr[(j+1)*stride+chan] = arr[(j)*stride+chan]; - arr[(j)*stride+chan] = tt; - - } - } - } - - } - - for (int i=0;iai_family,res->ai_socktype,res->ai_protocol); - bind(fd,res->ai_addr,res->ai_addrlen); - memset(buffer,'\0',sizeof(buffer)); - syslog(LOG_INFO, "control_thread: waiting for packet"); - ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len); - - syslog(LOG_INFO, "control_thread: received buffer string %s",buffer); - strcpy(tbuf,buffer); - trignum++; - - // interpret buffer string - char * rest = buffer; - int tmp_dumpbm = (float)(strtof(strtok(rest, "-"),&endptr)); - if (tmp_dumpbm<0 || tmp_dumpbm>63) tmp_dumpbm=32; - char * tmp_flnam = 
strtok(NULL, "-"); - - if (!dump_pending) { - strcpy(flnam,tmp_flnam); - dumpbm = tmp_dumpbm; - syslog(LOG_INFO, "control_thread: received command to add pulse %s to beam %d",flnam,dumpbm); - if (!(fin=fopen(flnam,"rb"))) { - syslog(LOG_INFO,"cannot open %s",flnam); - } - else { - fread(tmpblock,sizeof(double),1024*16384,fin); - - // do manipulation of data - maxval = 0.; - for (int i=0;i<16384*1024;i++) { - if (tmpblock[i]>maxval) maxval = tmpblock[i]; - } - for (int i=0;i<16384;i++) { - for (int j=0;j<1024;j++) { - //ctx->block[i*1024+j] = (float)(tmpblock[j*16384+i]*2.*SIG/maxval); - ctx->block[i*1024+j] = (float)(tmpblock[j*16384+i]); - } - } - - fclose(fin); - syslog(LOG_INFO, "control_thread: finished processing pulse - setting dump_pending"); - } - } - - if (dump_pending) { - syslog(LOG_ERR, "control_thread: BACKED UP - ignoring %s",tbuf); - } - - if (!dump_pending) dump_pending = 1; - - close(fd); - - } - - free (buffer); - free (tbuf); - free(tmpblock); - - if (ctx->verbose) - syslog(LOG_INFO, "control_thread: exiting"); - -} - - -void usage() -{ - fprintf (stdout, - "flagger [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -i in_key [default dada]\n" - " -o out_key [default caca]\n" - " -t flagging threshold [default 5.0]\n" - " -f output spectra file\n" - " -g output beam power file\n" - " -n number of blocks in baseline spec aver (must be <=16 and >=1, default 5)\n" - " -p adjust noise level according to power\n" - " -m generate random data\n" - " -s time-series flagging and threshold [no default]\n" - " -q modulation index threshold for tot pwr flagging [default 0.0005]\n" - " -h print usage\n"); -} - - -int main(int argc, char**argv) -{ - - // syslog start - openlog ("gpu_flagger", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - // set cuda device - cudaSetDevice(1); - - // read command line args - - // data block HDU keys - 
key_t in_key = 0x0000dada; - key_t out_key = 0x0000caca; - - // command line arguments - int core = -1; - int arg = 0; - double thresh = 5.0; - float mod_thresh = 0.0005; - int naver = 5; - char * fnam; - char * fnam2; - FILE *fout; - FILE *fout2; - FILE *f0; - - fnam = (char *)malloc(sizeof(char)*200); - fnam2 = (char *)malloc(sizeof(char)*200); - int fwrite = 0; - int fwrite2 = 0; - int pwr = 0; - int mkrand = 0; - int tsflag = 0; - float tsthresh = 10.; - - while ((arg=getopt(argc,argv,"c:t:i:o:f:g:a:k:s:mdph")) != -1) - { - switch (arg) - { - case 'k': - strcpy(iP,optarg); - break; - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'f': - if (optarg) - { - strcpy(fnam,optarg); - fwrite = 1; - break; - } - else - { - syslog(LOG_ERR,"-f flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'q': - if (optarg) - { - mod_thresh = atof(optarg); - break; - } - else - { - syslog(LOG_ERR,"-q flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'a': - if (optarg) - { - naver = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-a flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'g': - if (optarg) - { - //strcpy(fnam2,optarg); - sprintf(fnam2,"%s_%f.dat",optarg,40587.0+time(NULL)/86400.0); - fwrite2 = 1; - break; - } - else - { - syslog(LOG_ERR,"-g flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'i': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires 
argument"); - usage(); - return EXIT_FAILURE; - } - case 't': - if (optarg) - { - thresh = atof(optarg); - syslog(LOG_INFO,"modified THRESH to %g",thresh); - break; - } - else - { - syslog(LOG_ERR,"-t flag requires argument"); - usage(); - return EXIT_FAILURE; - } - - case 'd': - DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - case 's': - if (optarg) - { - tsthresh = atof(optarg); - tsflag=1; - syslog(LOG_INFO,"TSTHRESH is %g",tsthresh); - break; - } - else - { - syslog(LOG_ERR,"-s flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'p': - pwr=1; - break; - case 'm': - mkrand=1; - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - // Bind to cpu core - if (core >= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - dsaX_pulse_t udpdb; - udpdb.verbose = DEBUG; - float * pulsedata = (float *)malloc(sizeof(float)*256*16384*1024); - udpdb.block = pulsedata; - - // CONNECT AND READ FROM BUFFER - - dada_hdu_t* hdu_in = 0; // header and data unit - hdu_in = dada_hdu_create (); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to input buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to input buffer"); - return EXIT_FAILURE; - } - - if (DEBUG) syslog(LOG_INFO,"connected to input buffer"); - - uint64_t header_size = 0; - // read the header from the input HDU - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - - // mark the input header as cleared - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0){ - syslog (LOG_ERR,"could not mark header as cleared"); - return EXIT_FAILURE; - } - - uint64_t block_id, bytes_read = 0; - unsigned char *in_data; - char *cin_data; - - // OUTPUT BUFFER - dada_hdu_t* hdu_out = 0; - hdu_out = dada_hdu_create (); - 
dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"flagged_data: could not connect to dada buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write (hdu_out) < 0) { - syslog (LOG_ERR,"flagged_data: could not lock to dada buffer"); - return EXIT_FAILURE; - } - - if (DEBUG) syslog(LOG_INFO,"connected to output"); - - - //// OUTPUT BUFFER - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - header_size = HDR_SIZE; - if (!header_out) - { - syslog(LOG_ERR,"couldn't read header_out"); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - return EXIT_FAILURE; - } - uint64_t written=0; - - if (DEBUG) syslog(LOG_INFO,"copied header"); - - //////////////// - - // declare stuff for host and GPU - unsigned char * d_data; - float * d_pulse; - unsigned char * h_bm0 = (unsigned char *)malloc(sizeof(unsigned char)*NTIMES_P*NCHAN_P); - cudaMalloc((void **)&d_data, NBEAMS_P*NTIMES_P*NCHAN_P*sizeof(unsigned char)); - cudaMalloc((void **)&d_pulse, NBEAMS_P*NTIMES_P*NCHAN_P*sizeof(float)); - unsigned char * h_data = (unsigned char *)malloc(sizeof(unsigned char)*NBEAMS_P*NTIMES_P*NCHAN_P); - int * h_mask = (int *)malloc(sizeof(int)*NBEAMS_P*NCHAN_P); - int * d_mask; - cudaMalloc((void **)&d_mask, NBEAMS_P*NCHAN_P*sizeof(int)); - int * h_tsmask = (int *)malloc(sizeof(int)*NBEAMS_P*NTIMES_P); - int * d_tsmask; - cudaMalloc((void **)&d_tsmask, NBEAMS_P*NTIMES_P*sizeof(int)); - float * d_spec, * d_oldspec; - cudaMalloc((void **)&d_spec, NBEAMS_P*NCHAN_P*sizeof(float)); - cudaMalloc((void **)&d_oldspec, NBEAMS_P*NCHAN_P*sizeof(float)); - float * d_ts; - cudaMalloc((void **)&d_ts, NBEAMS_P*NTIMES_P*sizeof(float)); - float * h_bpwr = (float *)malloc(sizeof(float)*NBEAMS_P); - float * d_bpwr; - cudaMalloc((void **)&d_bpwr, NBEAMS_P*sizeof(float)); - float * h_spec = 
(float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); - float * h_ts = (float *)malloc(sizeof(float)*NBEAMS_P*NTIMES_P); - float * h_beam = (float *)malloc(sizeof(float)*NBEAMS_P); - float * h_bmask = (float *)malloc(sizeof(float)*NBEAMS_P); - float * h_subspec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); - float * h_var = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); - float * h_max = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); - float * h_pp = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); - float * h_oldspec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); - float *h_spec0 = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); - float *h_var0 = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P); - float *d_spec0, *d_var0; - cudaMalloc((void **)&d_spec0, NBEAMS_P*NCHAN_P*naver*sizeof(float)); - cudaMalloc((void **)&d_var0, NBEAMS_P*NCHAN_P*naver*sizeof(float)); - for (int i=0;i>>(d_repval,time(NULL)); - for (int i=0;idata_block, &bytes_read, &block_id); - in_data = (unsigned char *)(cin_data); - gotDada=1; - blockn++; - } - else - in_data = (unsigned char *)(tmp_indata); - - // deal with bm0 - /*memcpy(h_data+NTIMES_P*NCHAN_P,in_data+NTIMES_P*NCHAN_P,(NBEAMS_P-1)*NTIMES_P*NCHAN_P); - memcpy(h_bm0,in_data,NTIMES_P*NCHAN_P); - memcpy(h_data,h_data+NTIMES_P*NCHAN_P,NTIMES_P*NCHAN_P);*/ - - - if (DEBUG) syslog(LOG_INFO,"read block"); - - /* - if not first block, correct data - 1 - measure spectrum - 2 - measure varspec - if first block, proceed. 
- else - 3 - measure maximum value - 4 - use three spectra to derive channel flags - 5 - flag - */ - - // copy data to device - cudaMemcpy(d_data, in_data, NBEAMS_P*NTIMES_P*NCHAN_P*sizeof(unsigned char), cudaMemcpyHostToDevice); - //cudaMemset(d_data, 8, NBEAMS_P*NTIMES_P*NCHAN_P); - - // if not first block, correct data - if (started==1 || prestart==1) - scaley<<>>(d_data, d_spec0, d_var0); - - if (DEBUG) syslog(LOG_INFO,"copied data and scaled"); - - // measure spectrum and varspec - calc_spectrum<<>>(d_data, d_spec); - calc_varspec<<>>(d_data, d_spec, d_var); - cudaMemcpy(h_spec, d_spec, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToHost); - cudaMemcpy(h_var, d_var, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToHost); - if (started==0) { - for (int i=0;i= mod_thresh) { - - syslog(LOG_INFO,"mod_idx %f (threshold %f), noise replacement",fabs(tpwr-prev_tpwr)/prev_tpwr,mod_thresh); - - for (int i=0;i>>(d_data, d_max); - calc_ppspec<<>>(d_data, d_pp); - - // derive channel flags - cudaMemcpy(h_max, d_max, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToHost); - cudaMemcpy(h_pp, d_pp, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToHost); - for (int i=0;i>>(d_data, d_pulse); - syslog(LOG_INFO, "added %s to beam %d", flnam, dumpbm); - - dump_pending=0; - - } - - if (mkrand==0) - flag<<>>(d_data, d_idx, d_repval, d_bpwr); - - // ts flagging if needed - if (tsflag) { - - make_ts<<>>(d_data,d_ts); - syslog(LOG_INFO,"made ts"); - cudaMemcpy(h_ts, d_ts, NBEAMS_P*NTIMES_P*sizeof(float), cudaMemcpyDeviceToHost); - syslog(LOG_INFO,"copied ts"); - for (int i=0;i>>(d_data, d_tsidx, d_repval, d_bpwr); - syslog(LOG_INFO,"flagged ts"); - - } - - } - - } - - // deal with tpwr - prev_tpwr = tpwr; - - // copy data to host and write to buffer - cudaMemcpy(h_data, d_data, NBEAMS_P*NTIMES_P*NCHAN_P*sizeof(unsigned char), cudaMemcpyDeviceToHost); - - // deal with bm0 - //memcpy(h_data,h_bm0,NTIMES_P*NCHAN_P); - - // close block after reading - if (prestart==0) { - 
ipcio_close_block_read (hdu_in->data_block, bytes_read); - if (DEBUG) syslog(LOG_DEBUG,"closed read block"); - written = ipcio_write (hdu_out->data_block, (char *)(h_data), BUF_SIZE); - if (written < BUF_SIZE) - { - syslog(LOG_ERR,"write error"); - return EXIT_FAILURE; - } - } - - if (prestart==1) { - syslog(LOG_INFO,"Finishing with pre-start run-through"); - prestart=0; - - // search for spec0 and var0 file - if (f0=fopen("/home/ubuntu/data/specvar0.dat","r")) { - - //f0=fopen("/home/ubuntu/data/specvar0.dat","r"); - for (int i=0;i0) { - cudaMemcpy(d_spec0 + (blockn-1)*NBEAMS_P*NCHAN_P, d_spec, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToDevice); - cudaMemcpy(d_var0 + (blockn-1)*NBEAMS_P*NCHAN_P, d_var, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToDevice); - } - if (blockn==0) { - cudaMemcpy(d_spec0, d_spec, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToDevice); - cudaMemcpy(d_var0, d_var, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToDevice); - } - } - if (prestart==0 && gotDada==1 && blockn >= naver) { - started=1; - if (naver>1) fix_zspec<<>>(d_spec0, d_var0, naver); - cudaMemcpy(h_spec0, d_spec0, NBEAMS_P*NCHAN_P*sizeof(float),cudaMemcpyDeviceToHost); - cudaMemcpy(h_var0, d_var0, NBEAMS_P*NCHAN_P*sizeof(float),cudaMemcpyDeviceToHost); - median_calc(h_spec0); - median_calc(h_var0); - cudaMemcpy(d_spec0, h_spec0, NBEAMS_P*NCHAN_P*sizeof(float),cudaMemcpyHostToDevice); - cudaMemcpy(d_var0, h_var0, NBEAMS_P*NCHAN_P*sizeof(float),cudaMemcpyHostToDevice); - syslog(LOG_INFO,"writing out weights..."); - - // write out weights - f0=fopen("/home/ubuntu/data/specvar.dat","w"); - for (int i=0;islCYQ(NpTFr%_a_$F|s7<*5BiTiY|VDp;+eSaN>vTI-$JvuA>R zp7TA=_s_Rs_FC&*zxTSn8E3hg{eQ~eF9K(E@)Xwh6cG~JGp_~zLZ!CFq#I4;>jD)V(;uHy6rMdeZmwAyn>>v3?pVeBfwOoQe+PEYvZ7Y5s zZ>9RY@v@m!#uZs`$$U3}kL>(gH$A*!rPeo6T=r@Exzz58+Hk04&aB#SYjrpjN%U0r z%$rj^XI4$Dt7eAuo8%Mrn&oSxNi^PCl+TXp08@tq>6C@%!3B5q~(eB}nqR 
zqoGK=jU=K)`bb4I?(6i2ATSyXiffvi7GLd~Q8TkbHse(WZ^pCtg=J&_u)5t=A!@yEWb6cFTUIbNl$kicV_;ABO}y zVWKk+(UT^65fSuu+C(p9BFZx+dYOsdr|AWtQ5<-?+eD{zfVVv+IlpT8$RaU@wDTJ7@qyd3*2IO%A44iUEF{gwL6IAuFrmxO&YhZ~wdhLAEy!#2nsjZ}z~> zR`0I7DOI3(`*(WC@cN--8zg~ie|}T{)5#DT-u`Er26lS-(0$#&w>^D;U)TTN-u^s~ zci>Ke#o61x#N%k%HO@P*=n>4fACQCI{+HkVM%kB#CW@UyWq1A(J~r)m*3q=%nY!lr z(>;k7n;~gGaLv1Rk-g2kcHB)|10@MaczinSRJQh?svl||c+_(Yl%x~ZH20rs9teB3 zCck?QrVT9d^enCaxwk*;SwUu05hYGFH`ThSR_TTH`$#$(UTTm=YBUfgOf?tPI;iF# zfV2gTdDQ6B=sLo*di(P{+UUlWu=gb#d5`Z&+xx+i{?(q=`k}@fv`@)3($Lxe4q$Wt z2g#{uJ`OxAc=A7JZos3XJ3YHmqe<=U&tEsrEs|x#71%53-d!bL&=B8V2$6wFp86~L zca_wEE~QyS(Q%XfPr{^S6KWqeD8<5cnrCGuBeEg~^D$hzd#OmU9^CvEZe zpGs~}ya%kj@Qi{<{%R*_x+d#0McoEAc=srdUgcf66t@)vu&d;{44KLIKbDTq&7i6n z?u2MMNt(i$JWB9%4206OCZ*{iDQ>H$4w`;9#_V}=x6<^J464%92Q4O_=#WgV&(w4; z*x=I{465>Jk=9fNO}!a1lapmveg&*T=emr}^z@*~LI>(W(st@>hF?`m+uoG6pHcs= zt)8_I=u8Q85bL9|EB`ydqVw$vv(wqy_7lstOl^;Y2fkgN(&pFNRzToaQUay6WwNW} zb15p%`6ZsMbor?Vi&nwpg_&wrDK$UFWL6>k=p*Hn8*1ic2u%K3c9pm@sN@Y+2(m+~ zp|vUbuY^s{i8n_}Z$c?GU({+Uq2_1kOdnv$9@&-O3oJUnm(iJy9XUfIMXOG7X{NT% zDs4te+Z3(M3xWQWKpDFtrL96~TaxPJ5UK-Ht+#4fOTO}9hHoyV?LHiJDumyse$0yo zXnVJueUsYmmt7^lN>N!2mQ>SZCu*`gQ_VNQg9$M|rDlaz{{&!ul;o6 zj;D;qZ6DF0TXyFh8C;w=n*0Ad=^_TKV-A${L*8BY%hO@2cjiQSek{8aYJ_*&({$Fn zWqsqi#`TT2Hu{cu2WCHs*m?V3NiGMcv;pfn!29zdPM%tlPk(^Z)-If6n){C?kD}r2 ze;fXK2R^T0r@j3j(e*~lJLeo-!kaJ8soS2SgX5Vq<=X`o2ieug2FB)sS1Npb3^v*xy?f-pzJT9Fsqb4q$O4bjbXsao^ zcqf7*#QG!TMY>HBf`9|FXGk&A&Vk?i4gi==D&!xLFV+qEb;CUh=e}`Uw@LW zZ&VvsL}zg==rUae;rx$FpqY?_+4T~ncn7w6^3umYt_kvzaq14N_T&L`2pB!;zfRsF z`5*)HCQr_r#3)=^x^6j=JpGs9VXe>o%tz%*JjEGWE1~rXKp^o}t#wR>)-lO*)D5j; zfLUPH`g_GURr3|6`I67eE?fd-8svsSK6J^bq{k~|-&xoPA5zzb5CHowQhNTQL`B7F z->uZG?Yj?{R8*EKzPmJ^L)j-+oXccaPWyiJ-iY?~$8$vQ4&V_P_XySvQMb14<6@I_ z4=KJRXf)R5=H<*{wB{# zkvR?8gD$qmivVR0->u4?VQ22)dw{xifL;S8HGFp|zN4DYJbW8vS5A8bx?0{@?>+Sf zjPvLM^^VJ#*GoT|leehc(Znd&W}Fhcz!qrZjmI zLIQ^Tm$4$F`?++_I_mkb<|PZmGYBCu{Ig;>mS*@C^?X?KdIm$Hj=;3uqw6?>6Bl~1 
zl6zCfQ`wzGFb@a$+9P`~{SFkV=z63mc}@Pz+46Os23&@Cc4mHub;XJni@LUc{4swI|z8 zLFb)s#ohI<@%gX0|D)#q_m}jaY8)Q(x_8GB!#n?1i9gc43ey$Hp7i?t?h&ExZ2*BP zu5xT577h5K{@Y~P8oW*Tx2St*VvAo_BDmKkqAj9xV{0htuI_eEiTS3)gxuHCzO|+e zw+c762=CIyB}-TOR;_Maxmv`!645}=7x8z}9m7CJC=&FAT17k<4#onw+enR8f86hj z_jcpsY#c2^^5yX}X1h#qY}v zvTXR7YQOf!hA+K5HR-IWzsi_kUl>Pc2|nJphlhtyIuOnz%1V^)VYXG_+GY>h^Kdal zmwyLQdQf`lLK9^YWf&!0-_y5w2fm3sigF*y<5Xh4t;KQbz}3Tkl(daeg)p|P6h@EB zm{45c*kcs9h&~aYJ-8gWTw|!njTd-!qwsxiDceP6y&@ z6qZW(=pcgL+$h)KQwJZriPl)^+F`q<)NS(@mAV>B9gR-m2DuNPt-uVTWK5lHdj{s~ z=sNQWnCF0rBYyW}$mz?F^CtRE07EfP>)MuqaUrhFi2F#GO0doX<}ffHQp~R{-C%p8 zsMN98DK?jB(;S$Gi|ZijCLK#tI=rOA zOFFJAZHVR)Chs@Xy1M~#^wc3~|9|^`6al_(&-dvuG;-szShwY!Sk!4NXd+WRY8kzW>d0&Y>OTb%@T>v|XSIu|X?l`hIN~+dZUlbhnE(*8eHhO!R+lg9 z@(;RvUzekFfX3-^sxIf~(yHf^+Weg8nyar~;I3NJl8D3;?zuIyYwD_JCSSj$-RB;6)szFv7yY&O8oArRM(1yr`GKcnNw|YR1@iN{+fAMmR1f5Ct<73`zjyxJ8 z`*hkJUx>ZUQSb;Ah9eARHd{e80Jgk>MepJ0Ynth!JU7W;EtldKu`Y1RCg8K1;P1$rFvyiFXEMd zj_U3rayWmO>Q&Acs4fg@)Hws#=Nqyy&sjwpY_iedY#>;kYNUVm|7vq-iHcTKgqkwAiC=XlGmW>tH77vDC-4|pQ7P-2ar*;D?bdzg3pl+*!je~i2Lxh5pz=S#iIU$@ zXZZ%~O5UJq3Hi4T6)DpY|e0N>%~|fP-UZzfKq7RaKH_S5M%pb9uvg@H_^mc`j%c+K=KOh&$-=16JswS zc1_&P#386-;@wQV+k|}mfSb6**oRCqyBRs#guI`L#fTouJbu7U++ys%kev=?(uo6Z zqKUDS2&st;%>H9iR5%U&h4vvP9y1a9nE1~mQzK9Bcauml_C6EQ%g8U1of>(FiJvyf z+{47z$Rv$CcEC*{#n^o&cE^L7_!N^sl zNE44S@?H~iKNIgUAvZ8_ADQG-LEgv2(WG4yS2K~a&dQ39!rDUn3MMWk-%FL9hYn~r zYfZ$%OuQ6~Wn~I^nAz*dBuzZX!~$Y>(R_v?W{(iD{6@^ZLi=%zl+*nwbjSnf0zB0u z6Gwj!ltTMy%~?V!|C>RE7`ux+u6RX)hRmQ67&C4m(8Fb7Y%hgi+{Y4if-QTMh!-f! 
zdYE_$@{G~VU@o+GXkvkbPPwCqctIRH<7$eq^$Nte|FN++8+cOyU_|A6&}m#vi{X4U z$8Uj8##8B#djxirWkdX&HY<1MoSdnDtlr`7}t@x4}!;im5b=0szN4SOrZH;tRc*^ zz)WI{hKdOd6p|aM|BvXOd`T$KA&i@-jm`5eWIa`;8vHH6Sovb15L2oy!eAnDQzoLa zj4CcvI;oO}${;FJU)QY9!s1JfHMULIuQuv0HEzgrJbJa~Qe$P_;E5i@rHYlj4CHBu z2d>cm8NOhjdQuDejS{4}+;f4Xtq~$<`%ipRpqR9g{g1+eX|rL<0*z__(dg(y+@`Dm z`5KJVl%=S|P?`Fg2ED0G{cOrqd(j>PmKNFv*c>tKtBA`VH3yx;jE!%T*oQ&B7t$^x z+U46IyQ&;?+Rj4m3#d~V{$9$Sg)kWPVrsn!f!;#@Y&iECWO@fom|?tYE(*l=ULYNHfgu^EA2ZqSAiS%E4B7bh{@MY zI#1-ZYRH5joAtz@VB9)E2=2a(gAqB3%j78ubd_U1Pq_`1xv0pXBEzFev3q*B48)FZZx7 zkFhTdPWuuiF!}Nu^2OS(e7V_V#|q%HeWCP#3}2qM`m#3@kvDichc67eT}cy0x35(D zQi63zj>}9`B|i! zg!JV#t1sb9MBd=OUOC6+sntbz$d{M~wX=EA57-yF0G7Vc*H<|%3~pwy=VkU~4KS0e z{mPdIOm;j9eD=8f$b^u-oUr=xY9=CYaF4|oI`$cq@+EczCo_5MZzEsm*nfz9VQ|Wq z%N`^!jms~{7i+)rWfYDP+Gt$*fY0`2stF-|`MuSb#!Q4fV`Uxt3`)(h_&vB2B7GTV zU#8$1LyilB)4oIqOupOz%p_~S@}<*cM>Fu*z6_WU(w8@_z8uU%dT)p5qX2hEOU%5f*7=f$ED|XT=B_JzeDC(`;~cRSdM6;!})sP zv(2kDA*6YE`j$Cgf>rUw9$lZ_t7XVb#6tU5v@4?OdXtzZ6e$0WzP|lir5G&8_T`H` zV)`u_Mi;ZfLszJ8!g$)~TLf)JU@9*4f{90Ug;~HF4eOdQTM(?evS6-Eisu@~K4SF_fK70!iRejp> z1?ad(b=bDzg6?h;{UJ@qh2HcHSUhQ4=EknTu^4;#MUcM8$ggjg8;bBNCfs9)>eX2K zT!Y6DK-%PkG?d2uw_#9f=-IHba2g2PVPy?HB38{LHT_$3s|A;DHCz0;^((aGiEs#d zX!EN=7!BvfA-=Rot;I!wT%@+5A`ff^FJrLh*94{m`yeorto>@XozyC5wmktn&9=cM zrlo)CC{Sr5gq&@i)`MYECc-wjf%r2{=~WuiCq*GVS3QTvBGQwW*poBt34?E7u;-fu zCQm%Y@Wk4$JXvS5;&Z@DPoyRM)uq>jke;+!J=vRyunn%w^5md~+|9;a`2|Rko~(dz za!lf=$T4B?O$_$L2uz;rCr_;X%9G!kthfRAY)^(w2t0xyC0<`H-vkk7u@}y2f z_9~<>7OlgSm7cu7o}6Y+7`&dro<#&EPbT6lHp$wrJn1%BF#-5&PXB(xVCkHbT zwn4Ailj%R!-SVKVrdy+~FE!|caSt1Ne=BYWRDKResxkYAAbMePu^C2|l zc&*oriP^{mRQ0%~jyW-4X3&4C~A`O{Xy!4>{8NRvS1nogcqKu)&jMJD$wo=-7#tzhPLGBD~}!gL;A z>3Qw1VSsC|L03X`a)_(Q(K@@}$YcrGi}IWf5PN&eendY zIcXbI1(tC-k*oMkB%Z?*Ke|tDOr&QI7vNMP<9ALorsDUh2pDDj763EJ+OOhg$E6f) z6u)}lRU|~!WKE~|tz0F9E}xgweGAOx3+q4-Gk%Y{ZM=Mr zWv6R~nb+a`4Z`=)IrAFqbmuIeFT(Yie}Guqg~m!c)z>eh#>J%Hz8(T*jh@cSar(vp{;zS z)y1tT_ptf*(+yO4rT#gXFR$JIz`kCg!6{!ayO+S^Yav2B$=a`cZPZ-k>&?KkuNyR- 
zeEp23XZqS}LRfs=Yw9}8%o)eYBN`!noljHx`?x(~D>kUQU>0`tDM&78$AqL$!O?{e z0f%Tj482S zaSC+$jG1X$^geXaM;PVCl+p^Py>vq9nhVMe7tVk-k8nD#1PMpWI6To`;9Tfzg7tZt z4srQqfZ)-=TtOWL)S`-Gs%#fl81_=9llTl!i%L>L=7GtnoB2%@)GTHdxIi|HrV8RH zodh|x=q#%MD>>wvrDTBVs_{f9*97QQgwd1WO~nO9TJM;&UOehD<88Pu7$B*hiRoTyUSw)Q z{ijS|G%t1zqcL7!?5!L3{WJIP0knOl6as9MU_jb#8c z8nuyN9GSjxMZ~)U{9%?}i&SIW1L+QjkQR(gU%ZPV=xYi3J7YDVqbJ@P>Y7o5$6>PG z8f-%yXGp10k>Nt9OI4ROH7`XCS1^(fRjf;~-k2{KjfU}>0EO!G$2(M4A_8Kos<#K@ zUEOhCpeq_wFcJ}oS0}KGC`yP*#5^AABn)JC;Qa={DC#)qVxji#Xject;&{bDyRWS) z8V|LFf}{f;T+xyUh2zzs$d$@B!YDyd20v6C=L{R4`t4(k?eE>}xase1cIx`t_k*H$97GMIIfTw`)DFL9@%d1a9pb>m6Gz{cDtV`7fZ^4ir}^H z+Avqu|7jJ-J2C7xIrdQL_!jMV(C%hQFLpdG%ZZM!%JQZ-6$<94rFteVW~;^FpF-r2 z6tmTv5wB#(uurtjHEc+sG7uPkCTxF77|U1h-5qz#`1Fle+AgoOO>^bh?5o>g_Yq;6 zZ#X8S`R{jJkw=2<_M^hqW#r*S52*j(j<}{iCv3R;L9|Eic!SD!WaH=J+nPw8*~uHb z<9BkukoMn6QSC1Yhh(%re22u6b9RSBB~mCE8oT3{7T(dCwNu9Nf6;H7ES*ql>+9>PV;#Y8m>WI8NIMooxj@qIG3X{6-f+_Hmb*j* z%S|NM>ZUwM`o#o(Cy|KbcNNet7nxARhhof%zh7nf8u!)p^(qzAUES%A z209jPjD%6`?Cill(p5dPHPW>i)mSX3_A^v5Qx>rxm4KmpmLJtXPyHO~@prVL=qP6Y^W?nb4plVHH3GE3EZx~QY2Go+facCrsm zG6%r86yV=PRP42pvuaY6dUS-hC~d$;<62=ym<)`zMkB2#+v?{8sED=4Xq046EQkoh z)+m07LXq&dl9INbSQJH1%)g0Xvc#!_`Zx8=uAf2fLyu|%3970Al9lS{*0Anquq_ye z{5aVe2)Bh3m|RkvluSm+3dKt}&e<-eJLm<2*80`YRdt*ZNMq9f>aH+c6AN6;-!}Zkx{GumVO-I3^ znTWo%gU;=$cMe9DduN#0>Sxu>tAq4-A`)EG9*hK|A^6|!uD%|Vtxdj%s2WF1JQ#>~ zMXT`!A{-E=9}M|?bZXv0zfDRXDst-6F+e1N_YmP0K$kos<$;bK95b!K7#+chIX-`D zYqWk&x;}FTc*6enSn42_9Dzg>*9zf8XGDZZDXwAUOO1rDE8I#~GBI55U}#%AMJO_d zghGE;cMIO6B-+AVeq0c>`?|ZxIJJb#oPjmOk9Q<>5q)zoieEd?br9I&!FJiCA3xDb zf}?dK2=W3Y<7ZUrhf$rkwRW|{&@JE3#J}aD*Eqr9PJuO`8_NI*qU$AJHzu5H$#*u% zg)c;!<%JVQAd1zFR=Re7*cTRZ;d5s!KO~8c1+}`3a4iS#;kk+=BHG#2DqjzEdmNE0s5)fH{kbp#(HE>eNRIVCDuKqc{~_niWz1W}o~49L2pw|Jn_2%fkJu z7Jsxi5R5=#K=?Mv*zt@bUv&sE&OU)tfSN_>Rb0|6IsM>_Je<<6_1f(3?)Jq3{)jJ< z=xo9KqIl566%<9BAhbqIs1^!0{%beM6;Q)EK=pHE`1G+fkFYe0+5+-LVRXX7;VXQR zU=PiGL|aCbESCqQJV&b2u@8%50KOX3DrTu-cKU~l zzC$#d4tnET^p78LML}BBWR*VaA34f_M^w^3w&ZKx 
zn{mH`Hbbo8DnH!I6Ym;p9#GxsJu|ka7*})kpK!m9Hbbo9>c46d$y)sYw{F2bE)C?W zS?TI5rC-p>?oBCO!PRf4l&)j(EX$BTMpUZw{yr_>!L%T*eiP3bX*0x)TxEfFF}S~% z8*DLfTxEq4r=+C!oB{b-%Fq>DWq$CT=t*;^8k_orHr0k3^;Qh0PoC1Ub78pe$_pn|F!v0FMigYr4m3(jl#uvrLObDw{ORLo3!0w_3#&>*PdE`h+-68qE^N z7*6yAmQy^lu$;!pDUmsX`)jS5@?B}m6O`4c>ghcT;Cw_V`ziO>drL`k-Fs)bBSSx24LLz4%W@%@M zoU?d7>~uUirmaqwhFGBp={>9Dq(F${guyJ;ERItQ&xgHFtNiN7s#zsx)Il8Q_hqSO zahwr&KJ5EiWj>x3(}oe38RM$Od)CT1aFECud0E<7BIod(54&D#4X3pu_l>o3$yzy2 z4iY(sE=xO0 zah#!+rJBWYPTTpg`?bnPbcqp1E|@YWy=SeQ@&$>UQI+vCf*7CGgY!jOW*E2MOJ*`vKvNZ>CYOctoIbhX{ zllZ|qa;gUZC#!DFsd`K<)vY;HAIhy7{8rVk>(NZ=(k2EsSTUSmc8ShS!t~-yu@U8Q zZrS;;H)(md=n}fuT6N!``@Wvm&ADA!x>+7)gF*L**k5URze(%PI*Yxc`*=D-9%o}^ z>1KJHzjZ$BMm;-u;NZ8{>UU6McnU!t=TBwnW_g@(bv`U7*!XoS4!%cm_4yQO6<6tZ zOth)5R@~wv1>c)gcY2>Kes#K2)i~3N)tW5eq^TT&IsNHHJcFbSTwGm{wwtR9GwkLT z8}~Pf&$65Iq*$%V0#13#A(#_LY&!h{TwL9lBCXicfma`AMwS zWC5od88JBo^OthIQsD29@9V1z{y8Rple7kuf4azDU#;H_t$0pT&)471tX~wZc%E*2 zlIFWM{EgO%|Kv8kv3?$&l3O=B9-Z^wu`u`9@yI-$6Q7rQ-%KVxKlPrOO!}zQ`)D%h z1%ltxl1X=n4S1k#-SB<@yb9GyMG}xGO1*E!+-nz3^#*h^5p}yL7JW!yvTk;~ou@%x z1DgBn$Rp+GXVS}3@A1i`yTl4g{jh9yQJ(s{ALcGQ5~B`LQj}%0i;C2Hfy`ZYF(&ms zpiKH${5=!5nI!9WF;4LNd(547dRvb>7rqkYT=X19LZDqB8#c#g7f6`PvDs5UgU;-< z3*;K+*z5wyc{w(_xHt!u>~Uw&4N&sLJHyrv8m3(IOF*|_-1q4FoXj^x(~nu`^EBP9 zmwm=BN5v57c#<8o(RT?W$2jf;o%C+7$k`+D+4Er}`yY|`(c;Y&A<$j0-=*nn z)b}->-^0Lip3w9@jc5APpy%@IB1z9)59xh}x$tj-PI0^2t^8#>-yZ=#T8|gMKZWrZ zgPu$NOpV{8<+J_sCB8H(k9V=g@3Yv`r0Mrt=*zVnz6j^}wpQc!TlD%h{h)>3rsbT_ z$NQijnA;_tM=4!?)lhLeWRbH+%Rg+Pe`y3c-_-aa3;q!3G*5)4v*sr>{y9zO_|uz_ zXdI7Oye~wSAK7!-BBzY# zNCb7TUd=F3;?w7A_Um%cX-lcW?Leq~~}l5Z31Z* zsTOi3Lh%f!&$n!4<8@1Yco(maQaCaI$j?}E^YX^)ny$|5QF$vqAO4n{Z>d*H@Ge;? 
zeAhHzzqqm4cm1+utCp_zt!`Y5gca~2S2o}a#1hg}nbFqJz<)A~{y3N=V}X(}WcH8D zf6%EI=lcU-6Z?WP9Z#j6=u{k)`j9C}rRQYQ(yuMWJCeRqCW+1?QW*;}FNG2|r1kWIWtq;9kr*PQE5}6sMYc@g zG7VCt7G`$n*D_N!1zzB+Q<2i|cs8d*a1ICpoRI*eQeDy+OBsK~O}+0K;$vMtybZdQ z(kUn@gwnKql-!Y-bfZ%yO>>2&qR#2ewA`G4la&R7ETmX0n-1>PAc zRcSE&y**JA>+M8V5z2T}l^tB8WVC3vsEHs4ucmQvQ+3?mu3PwFK@BBrwN~RL-Zj+f z@W(pv>+s$PW92Yfm*lJL%Y41WaPH97Uxp*W5 zP?tQ`YJwelQR+apQmUm`)#{`;xDE~eP9#s&D0U36sKLtEiPesN&EAq|2htyD$1))ptinetnUglKkMc8*DU-?^u*#)CmAt3>-hPm=I8O_`-fa6Q=F>8b~E{3Ed18L zGaP52F0JwZk%gb{M{?P~hG~AQe)`^0iVqz_wDEl*{`@SnX}F -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -FILE *output; - -void send_string(char *string) /* includefile */ -{ - int len; - len=strlen(string); - fwrite(&len, sizeof(int), 1, output); - fwrite(string, sizeof(char), len, output); -} - -void send_float(char *name,float floating_point) /* includefile */ -{ - send_string(name); - fwrite(&floating_point,sizeof(float),1,output); -} - -void send_double (char *name, double double_precision) /* includefile */ -{ - send_string(name); - fwrite(&double_precision,sizeof(double),1,output); -} - -void send_int(char *name, int integer) /* includefile */ -{ - send_string(name); - fwrite(&integer,sizeof(int),1,output); -} - -void send_char(char *name, char integer) /* includefile */ -{ - send_string(name); - fwrite(&integer,sizeof(char),1,output); -} - - -void send_long(char *name, long integer) /* includefile */ -{ - send_string(name); - fwrite(&integer,sizeof(long),1,output); -} - -void send_coords(double raj, double dej, double az, double za) /*includefile*/ -{ - if ((raj != 0.0) || (raj != -1.0)) send_double("src_raj",raj); - if ((dej != 0.0) || (dej != -1.0)) send_double("src_dej",dej); - if ((az != 0.0) || (az != -1.0)) send_double("az_start",az); - if ((za != 0.0) || (za != -1.0)) send_double("za_start",za); -} - -int main(int argc, char * argv[]) { - - // memory - uint64_t bsize 
= 2013265920, bls = 94371840; - unsigned char * allbeams = (unsigned char *)malloc(sizeof(unsigned char)*bsize); - memset(allbeams,0,bsize); - unsigned char * data = (unsigned char *)malloc(sizeof(unsigned char)*bls); - FILE *fin; - - // load in data if present - for (int i=0;i<16;i++) { - - if (strcmp(argv[i+1],"none")!=0) { - - fin=fopen(argv[i+1],"rb"); - fread(data,sizeof(unsigned char),bls,fin); - fclose(fin); - - for (int ibeam=0;ibeam<256;ibeam++) { - for (int itime=0;itime<15*512;itime++) { - for (int ich=0;ich<48;ich++) { - allbeams[ibeam*15*512*1024 + itime*1024 + i*48 + ich + 128] = data[itime*256*48 + ibeam*48 + ich]; - } - } - } - } - - } - - // make files - - char cmd[300], foutnam[400]; - sprintf(cmd,"mkdir -p %s_%s",argv[17],argv[18]); - system(cmd); - - for (int i=0;i<256;i++) { - - sprintf(foutnam,"%s_%s/%s_%d.fil",argv[17],argv[18],argv[18],i); - output = fopen(foutnam,"wb"); - - send_string("HEADER_START"); - send_string("source_name"); - send_string(argv[18]); - send_int("machine_id",1); - send_int("telescope_id",82); - send_int("data_type",1); // filterbank data - send_double("fch1",1530.0); // THIS IS CHANNEL 0 :) - send_double("foff",-0.244140625); - send_int("nchans",1024); - send_int("nbits",8); - send_double("tstart",55000.0); - send_double("tsamp",8.192e-6*8.*4.); - send_int("nifs",1); - send_string("HEADER_END"); - - fwrite(allbeams + i*15*512*1024,sizeof(unsigned char),15*512*1024,output); - - fclose(output); - - } - - - free(allbeams); - free(data); - -} diff --git a/src/test_read.c b/src/test_read.c deleted file mode 100644 index 2b5730a..0000000 --- a/src/test_read.c +++ /dev/null @@ -1,279 +0,0 @@ -/* will reorder raw data for input to xgpu */ -#define __USE_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include 
"dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - -#include -#include -#include - -#define S 4096 - -/* global variables */ -int DEBUG = 0; - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write) -{ - - if (write==0) { - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - - } - - if (write==1) { - - if (dada_hdu_unlock_write (in) < 0) - { - syslog(LOG_ERR, "could not unlock write on hdu_in"); - } - dada_hdu_destroy (in); - - } - -} - -void usage() -{ - fprintf (stdout, - "dsaX_reorder_raw [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -t number of threads [default 4]\n" - " -b connect to bf hdu\n" - " -i input key [default CAPTURED_BLOCK_KEY]\n" - " -o output key [default REORDER_BLOCK_KEY]\n" - " -q quitting after testing\n" - " -h print usage\n"); -} - -// MAIN - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("test_read", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - // TESTING and initialization - // threads - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - - // data block HDU keys - key_t in_key = CAPTURED_BLOCK_KEY; - key_t out_key = REORDER_BLOCK_KEY; - key_t out_key2 = REORDER_BLOCK_KEY2; - - // command line arguments - int core = -1; - int nthreads = 1; - int bf = 0; - int arg = 0; - - while ((arg=getopt(argc,argv,"c:t:i:o:dbqh")) != -1) - { - switch (arg) - { - case 'i': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - 
else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 't': - if (optarg) - { - nthreads = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-t flag requires argument"); - usage(); - return EXIT_FAILURE; - } - - case 'd': - DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - - case 'q': - syslog (LOG_INFO, "Quit here"); - return EXIT_SUCCESS; - - case 'b': - bf=1; - syslog (LOG_INFO, "Will write to bf dada hdu"); - break; - - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - - // Bind to cpu core - if (core >= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in,0); - - - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in,0); - 
return EXIT_FAILURE; - } - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t bytes_read = 0; - char * block, * output_buffer; - uint64_t written, block_id; - - // set up - - int observation_complete=0; - int blocks = 0; - int started = 0; - - - - syslog(LOG_INFO, "starting observation"); - - while (!observation_complete) { - - // open block - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - - if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - } - - // DO STUFF - for (int i=0;idata_block, bytes_read); - - } - - - dsaX_dbgpu_cleanup (hdu_in,0); - -} - - diff --git a/src/test_write.c b/src/test_write.c deleted file mode 100644 index 32dd25d..0000000 --- a/src/test_write.c +++ /dev/null @@ -1,452 +0,0 @@ -/* will reorder raw data for input to xgpu */ -#define __USE_GNU -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "sock.h" -#include "tmutil.h" -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "ipcio.h" -// Forward declaration to keep compiler happy -// Possible minor bug in PSRDada -int ipcio_check_pending_sod (ipcio_t* ); -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_capture.h" -#include "dsaX_def.h" - -#include -#include -#include - -#define S 4096 - -// data to pass to threads -struct data { - char * in; - int n_threads; - int thread_id; - ipcio_t * out; -}; - -/* global variables */ -int DEBUG = 0; -int cores[16] = {4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}; - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); -int dada_bind_thread_to_core (int core); - -void dsaX_dbgpu_cleanup 
(dada_hdu_t * in, int write) -{ - - if (write==0) { - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } - dada_hdu_destroy (in); - - } - - if (write==1) { - - if (dada_hdu_unlock_write (in) < 0) - { - syslog(LOG_ERR, "could not unlock write on hdu_in"); - } - dada_hdu_destroy (in); - - } - -} - -void usage() -{ - fprintf (stdout, - "dsaX_reorder_raw [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -t number of threads [default 4]\n" - " -b connect to bf hdu\n" - " -i input key [default CAPTURED_BLOCK_KEY]\n" - " -o output key [default REORDER_BLOCK_KEY]\n" - " -q quitting after testing\n" - " -h print usage\n"); -} - -/* thread for data massaging */ -void * massage(void *args) { - - // basic stuff - struct data *d = args; - int thread_id = d->thread_id; - - - // set affinity - const pthread_t pid = pthread_self(); - const int core_id = cores[thread_id]; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (set_result != 0) - syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id); - const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset); - if (get_affinity != 0) - syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id); - if (CPU_ISSET(core_id, &cpuset)) - if (DEBUG) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id); - - // extract from input data structure - char *in = (char *)d->in; - //char *out = (char *)d->out; - int nthreads = d->n_threads; - - // place in out - int i = thread_id*(S/nthreads); - //syslog(LOG_INFO,"thread %d: %d",thread_id,i); - memcpy (d->out->curbuf + i, in + i, S/nthreads); - - /* return 0 */ - int thread_result = 0; - pthread_exit((void *) &thread_result); - -} - - -// MAIN - -int main (int argc, char *argv[]) { - - // startup syslog message - // using LOG_LOCAL0 - openlog 
("test_write", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - // TESTING and initialization - // threads - struct data args[16]; - pthread_t threads[16]; - pthread_attr_t attr; - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - void* result=0; - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - dada_hdu_t* hdu_out = 0; - dada_hdu_t* hdu_out2 = 0; - - // data block HDU keys - key_t in_key = CAPTURED_BLOCK_KEY; - key_t out_key = REORDER_BLOCK_KEY; - key_t out_key2 = REORDER_BLOCK_KEY2; - - // command line arguments - int core = -1; - int nthreads = 1; - int bf = 0; - int arg = 0; - - while ((arg=getopt(argc,argv,"c:t:i:o:dbqh")) != -1) - { - switch (arg) - { - case 'i': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 't': - if (optarg) - { - nthreads = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-t flag requires argument"); - usage(); - return EXIT_FAILURE; - } - - case 'd': - DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - - case 'q': - syslog (LOG_INFO, "Quit here"); - return EXIT_SUCCESS; - - case 'b': - bf=1; - syslog (LOG_INFO, "Will write to bf dada hdu"); - break; - - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - - // Bind to cpu core - if (core >= 0) 
- { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (0); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - - - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, 
"could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - - if (bf) { - header_out = ipcbuf_get_next_write (hdu_out2->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header2 block [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block2 filled [output]"); - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - return EXIT_FAILURE; - } - } - - - // record STATE info - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); - uint64_t bytes_read = 0; - char * block, * output_buffer, * blockie; - output_buffer = (char *)malloc(sizeof(char)*block_out); - memset(output_buffer,1,block_out); - uint64_t written, block_id; - - // set up - - int observation_complete=0; - int blocks = 0; - int started = 0; - - - - syslog(LOG_INFO, "starting observation"); - - while (!observation_complete) { - - // open block - block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - - if (started==0) { - syslog(LOG_INFO,"now in RUN state"); - started=1; - } - - // DO STUFF - - // sort out write - hdu_out->data_block->curbuf = ipcbuf_get_next_write 
((ipcbuf_t*)hdu_out->data_block); - hdu_out->data_block->marked_filled = 0; - //blockie = ipcio_open_block_write (hdu_out->data_block, &block_id); - - // set up data structure - for (int i=0; idata_block; - } - - if (DEBUG) syslog(LOG_DEBUG,"creating %d threads",nthreads); - - for(int i=0; idata_block, output_buffer, block_out); - - // finish write - ipcbuf_mark_filled ((ipcbuf_t*)hdu_out->data_block, block_out); - ipcio_check_pending_sod (hdu_out->data_block); - hdu_out->data_block->marked_filled = 1; - //ipcio_close_block_write(hdu_out->data_block, block_out); - - if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks); - blocks++; - - - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - free(output_buffer); - - dsaX_dbgpu_cleanup (hdu_in,0); - dsaX_dbgpu_cleanup (hdu_out,1); - if (bf) dsaX_dbgpu_cleanup (hdu_out2,1); - //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2); - -} - - From 75ee37b6eaa059047ce032d82add58dee686cc5a Mon Sep 17 00:00:00 2001 From: cpviolator Date: Sat, 15 Jun 2024 22:48:15 -0700 Subject: [PATCH 12/30] Remove backups --- legacy/11_planar_complex_array.cu~ | 628 ----------------------------- legacy/CMakeLists.txt~ | 120 ------ legacy/dsaX_cutlass_interface.cu~ | 315 --------------- legacy/dsaX_cutlass_interface.h~ | 174 -------- legacy/planar_complex.cu~ | 85 ---- 5 files changed, 1322 deletions(-) delete mode 100644 legacy/11_planar_complex_array.cu~ delete mode 100644 legacy/CMakeLists.txt~ delete mode 100644 legacy/dsaX_cutlass_interface.cu~ delete mode 100644 legacy/dsaX_cutlass_interface.h~ delete mode 100644 legacy/planar_complex.cu~ diff --git a/legacy/11_planar_complex_array.cu~ b/legacy/11_planar_complex_array.cu~ deleted file mode 100644 index 23722b0..0000000 --- a/legacy/11_planar_complex_array.cu~ +++ /dev/null @@ -1,628 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 
2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Planar Complex Array Example - - This example demonstrates the CUTLASS Library's exposure of planar complex GEMM kernels which - execute a batch of matrix products, loading problem sizes and matrix base pointers from arrays - in global memory. 
- - These kernels represent complex matrices by storing the real and imaginary parts of the matrix in - disjoint regions in memory. These real-valued matrices are stored using existing cuBLAS layouts - as either column-major or row-major layouts with a single leading dimension indicating the stride - between columns or rows. - - The CUTLASS Library collects multiple template instantiations in a data structure and offers - a BLAS-like dispatch API to invoke the appropriate kernel on the Volta or Turing architectures. - - CUTLASS decouples matrix layout from complex transformation, so four possible transformations - are possible on the A and B operands: - - n: column-major - c: column-major complex conjugate - t: row-major - h: row-major complex conjugate - - To build strictly the planar complex kernels needed for general application, execute the following - CMake command in an empty build directory. - - $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \ - -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex - - This builds all planar complex GEMM variants for Volta and Turing architectures. - - To build strictly the kernels needed for this example, an even narrower filter string may be - specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for - the 'CN' layout configuration (conjugate A operand with both A and B as column-major). - - $ cmake .. 
-DCUTLASS_NVCC_ARCHS="70;75;80" \ - -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_array_f16*cn - - $ make 11_planar_complex_array - - $ ./examples/11_planar_complex_array/11_planar_complex_array --m=2048 --n=1024 --k=512 --batch=10 -*/ - -#include -#include - -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" - -#include "cutlass/util/command_line.h" -#include "cutlass/util/distribution.h" -#include "cutlass/util/device_memory.h" -#include "cutlass/util/tensor_view_io.h" -#include "cutlass/util/host_tensor_planar_complex.h" - -#include "cutlass/util/reference/device/tensor_fill.h" - -#include "cutlass/util/reference/device/gemm_planar_complex.h" -#include "cutlass/util/reference/device/tensor_compare.h" - -#include "cutlass/library/handle.h" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// Result structure -struct Result { - - double runtime_ms; - double gflops; - cutlass::Status status; - cudaError_t error; - bool passed; - - // - // Methods - // - - Result( - double runtime_ms = 0, - double gflops = 0, - cutlass::Status status = cutlass::Status::kSuccess, - cudaError_t error = cudaSuccess - ): - runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { } -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Command line options parsing -struct Options { - - bool help; - - cutlass::gemm::GemmCoord problem_size; - int batch_count; - cutlass::complex alpha; - cutlass::complex beta; - - bool reference_check; - int iterations; - - Options(): - help(false), - problem_size({1024, 1024, 1024}), - batch_count(1), - reference_check(true), - iterations(20), - alpha(1), - beta() { } - - bool valid() { - return true; - } - - // Parses the command line - void parse(int argc, char const **args) { - cutlass::CommandLine cmd(argc, args); - - if (cmd.check_cmd_line_flag("help")) { - help = true; - } - - 
cmd.get_cmd_line_argument("m", problem_size.m()); - cmd.get_cmd_line_argument("n", problem_size.n()); - cmd.get_cmd_line_argument("k", problem_size.k()); - cmd.get_cmd_line_argument("batch", batch_count); - - cmd.get_cmd_line_argument("alpha", alpha.real()); - cmd.get_cmd_line_argument("alpha_i", alpha.imag()); - cmd.get_cmd_line_argument("beta", beta.real()); - cmd.get_cmd_line_argument("beta_i", beta.imag()); - - cmd.get_cmd_line_argument("iterations", iterations); - } - - /// Prints the usage statement. - std::ostream & print_usage(std::ostream &out) const { - - out << "11_planar_complex_array example\n\n" - << " This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n" - << "Options:\n\n" - << " --help If specified, displays this usage statement.\n\n" - << " --m= GEMM M dimension\n" - << " --n= GEMM N dimension\n" - << " --k= GEMM K dimension\n" - << " --batch= Number of GEMM operations executed in one batch\n" - << " --alpha= Epilogue scalar alpha (real part)\n" - << " --alpha_i= Epilogue scalar alpha (imaginary part)\n" - << " --beta= Epilogue scalar beta (real part)\n\n" - << " --beta_i= Epilogue scalar beta (imaginary part)\n\n" - << " --iterations= Number of profiling iterations to perform.\n"; - - out << "\n\nExamples:\n\n" - << "$ ./examples/11_planar_complex_array/11_planar_complex_array\n\n"; - - return out; - } - - /// Compute performance in GFLOP/s - double gflops(double runtime_s) const { - - // Number of real-valued multiply-adds - int64_t fmas = problem_size.product() * batch_count * 4; - - // Two flops per multiply-add - return 2.0 * double(fmas) / double(1.0e9) / runtime_s; - } -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Performance test environment for planar complex -class TestbedPlanarComplex { -public: - - // Half-precision input and output - using Element = cutlass::half_t; - - // Configurations for layouts and internal computation - 
using LayoutA = cutlass::layout::ColumnMajor; - using LayoutB = cutlass::layout::ColumnMajor; - using LayoutC = cutlass::layout::ColumnMajor; - using ElementCompute = float; - using ElementAccumulator = float; - - // - // Data members - // - - cutlass::library::Handle handle; - - cutlass::gemm::GemmCoord problem_size; - int batch_count; - cutlass::DeviceAllocation tensor_A; - cutlass::DeviceAllocation tensor_B; - cutlass::DeviceAllocation tensor_C; - cutlass::DeviceAllocation tensor_D; - cutlass::DeviceAllocation tensor_D_ref; - - cutlass::DeviceAllocation ptr_A_real; - cutlass::DeviceAllocation ptr_A_imag; - cutlass::DeviceAllocation ptr_B_real; - cutlass::DeviceAllocation ptr_B_imag; - cutlass::DeviceAllocation ptr_C_real; - cutlass::DeviceAllocation ptr_C_imag; - cutlass::DeviceAllocation ptr_D_real; - cutlass::DeviceAllocation ptr_D_imag; - - // - // Methods - // - - TestbedPlanarComplex( - Options const &options - ): - problem_size(options.problem_size), batch_count(options.batch_count) { - - // Allocate device memory for batched planar complex GEMM - tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2); - tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2); - tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); - tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); - tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); - - ptr_A_real.reset(batch_count); - ptr_A_imag.reset(batch_count); - ptr_B_real.reset(batch_count); - ptr_B_imag.reset(batch_count); - ptr_C_real.reset(batch_count); - ptr_C_imag.reset(batch_count); - ptr_D_real.reset(batch_count); - ptr_D_imag.reset(batch_count); - - } - - void initialize_rand() { - - uint64_t seed = 1073; - - // Use small integers to simplify correctness checking - int scope_max = 6; - int scope_min = -6; - - cutlass::reference::device::BlockFillRandomUniform( - tensor_A.get(), 
tensor_A.size(), seed, Element(scope_max), Element(scope_min), 0); - - cutlass::reference::device::BlockFillRandomUniform( - tensor_B.get(), tensor_B.size(), seed * 2019, Element(scope_max), Element(scope_min), 0); - - cutlass::reference::device::BlockFillRandomUniform( - tensor_C.get(), tensor_C.size(), seed * 2020, Element(scope_max), Element(scope_min), 0); - } - - Result profile(Options const &options) { - - Result result; - - initialize(); - - Element *ptr_A = tensor_A.get(); - Element *ptr_B = tensor_B.get(); - Element *ptr_C = tensor_C.get(); - Element *ptr_D = tensor_D.get(); - - int64_t batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2; - int64_t batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2; - int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2; - int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2; - - typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0); - typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0); - typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); - typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); - - - int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k(); - int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n(); - int64_t imag_stride_C = int64_t(problem_size.m()) * problem_size.n(); - int64_t imag_stride_D = int64_t(problem_size.m()) * problem_size.n(); - - // - // Configure pointers in global memory - // - - struct { - Element *base; - void **ptr_real; - void **ptr_imag; - int64_t batch_stride; - int64_t imag_stride; - } tensors[] = { - { tensor_A.get(), ptr_A_real.get(), ptr_A_imag.get(), batch_stride_A, imag_stride_A}, - { tensor_B.get(), ptr_B_real.get(), ptr_B_imag.get(), batch_stride_B, imag_stride_B}, - { tensor_C.get(), 
ptr_C_real.get(), ptr_C_imag.get(), batch_stride_C, imag_stride_C}, - { tensor_D.get(), ptr_D_real.get(), ptr_D_imag.get(), batch_stride_D, imag_stride_D} - }; - - for (auto const &tensor : tensors) { - for (int idx = 0; idx < batch_count; ++idx) { - - void *ptr_real = tensor.base + idx * tensor.batch_stride; - void *ptr_imag = tensor.base + idx * tensor.batch_stride + tensor.imag_stride; - - cudaError_t error = cudaMemcpy( - tensor.ptr_real + idx, - &ptr_real, - sizeof(void *), - cudaMemcpyHostToDevice); - - if (error != cudaSuccess) { - throw std::runtime_error("Failed to copy pointer to device memory"); - } - - error = cudaMemcpy( - tensor.ptr_imag + idx, - &ptr_imag, - sizeof(void *), - cudaMemcpyHostToDevice); - - if (error != cudaSuccess) { - throw std::runtime_error("Failed to copy pointer to device memory"); - } - } - } - - // - // Construct events - // - - cudaEvent_t events[2]; - - for (auto & event : events) { - result.error = cudaEventCreate(&event); - if (result.error != cudaSuccess) { - std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; - return -1; - } - } - - // Record an event at the start of a series of GEMM operations - result.error = cudaEventRecord(events[0]); - if (result.error != cudaSuccess) { - std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; - return result; - } - - // - // Run profiling loop - // - - for (int iter = 0; iter < options.iterations; ++iter) { - - // - // Execute the planar complex array GEMM kernel via the CUTLASS Library's - // dispatch routines. - // - // Note, for planar complex array GEMM kernels, all numeric type arguments - // specify the data type of the base real types. These are understood to - // apply to planar complex representations of matrices in memory and to complex - // structures for scalars. - // - // See tools/library/include/cutlass/library/handle.h for more details. 
- // - - result.status = handle.gemm_planar_complex_array( - - problem_size.m(), // expected GEMM M dimension - problem_size.n(), // expected GEMM N dimension - problem_size.k(), // expected GEMM K dimension - batch_count, // Number of batched elements - - nullptr, - nullptr, - nullptr, - - cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued accumulation - cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued alpha/beta scalars - - &options.alpha, // Pointer to alpha scalar, of type complex - - cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued A matrix - cutlass::library::LayoutTypeID::kColumnMajor, // Layout of A matrix - cutlass::library::ComplexTransform::kConjugate, // Complex transformation on A matrix operand - - ptr_A_real.get(), // Pointer to array of pointers to real part of A matrix - ptr_A_imag.get(), // Pointer to array of pointers to imaginary part of A matrix - - lda, // Leading dimension of real part of A matrix - lda, // Leading dimension of imaginary part of A matrix - - cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued B matrix - cutlass::library::LayoutTypeID::kColumnMajor, // Layout of B matrix - cutlass::library::ComplexTransform::kNone, // Complex transformation on B matrix operand - - ptr_B_real.get(), // Pointer to array of pointers to real part of B matrix - ptr_B_imag.get(), // Pointer to array of pointers to imaginary part of B matrix - - ldb, // Leading dimension of real part of B matrix - ldb, // Leading dimension of imaginary part of B matrix - - &options.beta, // Pointer to beta scalar, of type complex - - cutlass::library::NumericTypeID::kF16, // Base data type of complex valued C and D matrices - - ptr_C_real.get(), // Pointer to array of pointers to real part of C matrix - ptr_C_imag.get(), // Pointer to array of pointers to imaginary part of C matrix - - ldc, // Leading dimension of real part of C matrix - ldc, // Leading dimension of 
imaginary part of C matrix - - ptr_D_real.get(), // Pointer to array of pointers to real part of D matrix - ptr_D_imag.get(), // Pointer to array of pointers to imaginary part of D matrix - - ldd, // Leading dimension of real part of D matrix - ldd // Leading dimension of imaginary part of D matrix - ); - - if (result.status != cutlass::Status::kSuccess) { - std::cerr << "CUTLASS internal error - configuration not supported" << std::endl; - return result; - } - } - - // - // Stop profiling loop - // - - // Record an event when the GEMM operations have been launched. - result.error = cudaEventRecord(events[1]); - if (result.error != cudaSuccess) { - std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; - return result; - } - - // Wait for work on the device to complete. - result.error = cudaEventSynchronize(events[1]); - if (result.error != cudaSuccess) { - std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; - return result; - } - - // Measure elapsed runtime - float runtime_ms = 0; - result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); - if (result.error != cudaSuccess) { - std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; - return result; - } - - // Compute average runtime and GFLOPs. 
- result.runtime_ms = double(runtime_ms) / double(options.iterations); - result.gflops = options.gflops(result.runtime_ms / 1000.0); - - // Cleanup - for (auto event : events) { - (void)cudaEventDestroy(event); - } - - if (handle.get_last_operation()) { - std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl; - } - - // - // Compute reference in device code - // - - if (options.reference_check) { - - result.passed = true; - - for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) { - cutlass::reference::device::GemmPlanarComplex< - Element, LayoutA, - Element, LayoutB, - Element, LayoutC, - ElementAccumulator - >( - problem_size, - options.alpha, - {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A}, - cutlass::ComplexTransform::kConjugate, - {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B}, - cutlass::ComplexTransform::kNone, - options.beta, - {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C}, - {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D} - ); - - Element epsilon = 0.1_hf; - Element nonzero_floor = 0.1_hf; - - result.passed = cutlass::reference::device::BlockCompareRelativelyEqual( - tensor_D.get() + idx * batch_stride_D, - tensor_D_ref.get() + idx * batch_stride_D, - batch_stride_D, - epsilon, - nonzero_floor - ); - } - - if (result.passed) { - std::cout << "Reference check passed." << std::endl; - } - else { - std::cerr << "Error - reference check failed." << std::endl; - } - } - - std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; - std::cout << " GFLOPs: " << result.gflops << std::endl; - - return result; - } -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -int main(int argc, char const **args) { - - // - // This example uses mma.sync to directly access Tensor Cores to achieve peak performance. 
- // - // Volta Tensor Core operations are first available in CUDA 10.1 Toolkit. - // - // Turing Tensor Core operations are first available in CUDA 10.2 Toolkit. - // - - cudaDeviceProp props; - - cudaError_t error = cudaGetDeviceProperties(&props, 0); - if (error != cudaSuccess) { - std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; - return -1; - } - - if (props.major < 7) { - std::cerr << "Tensor Core operations must be run on a machine with compute capability at least 70." - << std::endl; - - // Returning zero so this passes on older architectures. Its actions are no-op. - return 0; - } - else if (props.major == 7 && props.minor <= 2) { - // - // If running on the Volta architecture, at least CUDA 10.1 Toolkit is required to run this example. - // - if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { - std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl; - - // Returning zero so this passes on older Toolkits. Its actions are no-op. - return 0; - } - } - else if (props.major == 7 && props.minor >= 5) { - // - // If running on the Turing architecture, at least CUDA 10.2 Toolkit is required to run this example. - // - if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { - std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; - - // Returning zero so this passes on older Toolkits. Its actions are no-op. - return 0; - } - } - else { - // NVIDIA Ampere Architecture GPUs (SM80 and later) are fully supported on CUDA 11 Toolkit and beyond. 
- // - // fall through - } - - // - // Parse options - // - - Options options; - - options.parse(argc, args); - - if (options.help) { - options.print_usage(std::cout) << std::endl; - return 0; - } - - // Execute one problem size - if (!options.valid()) { - std::cerr << "Invalid problem." << std::endl; - return -1; - } - - TestbedPlanarComplex testbed(options); - - Result result = testbed.profile(options); - - return result.passed ? 0 : -1; -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/legacy/CMakeLists.txt~ b/legacy/CMakeLists.txt~ deleted file mode 100644 index 0783d51..0000000 --- a/legacy/CMakeLists.txt~ +++ /dev/null @@ -1,120 +0,0 @@ -enable_language(CUDA) - -include_directories(${PSRDada_SOURCE_DIR}/src) -include_directories(${xGPU_SOURCE_DIR}/src) - -set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so) -set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a) - -# DSA Fast Time Domain functions -#------------------------------- -add_executable(test_write test_write.c) -target_link_libraries(test_write ${PSRDada_LIB}) - -add_executable(test_read test_read.c) -target_link_libraries(test_read ${PSRDada_LIB}) - -add_executable(dsaX_trigger dsaX_trigger.c) -target_link_libraries(dsaX_trigger ${PSRDada_LIB}) - -add_executable(dsaX_filTrigger dsaX_filTrigger.c) -target_link_libraries(dsaX_filTrigger ${PSRDada_LIB}) - -# DMH: Has a 'sigproc' dependency, low priority -if(0) - add_executable(splice_offline_beams splice_offline_beams.c) - target_link_libraries(splice_offline_beams ${PSRDada_LIB}) - - add_executable(dsaX_writeFil dsaX_writeFil.c) - target_link_libraries(dsaX_writeFil ${PSRDada_LIB}) - - add_executable(dsaX_splice dsaX_splice.c) - target_link_libraries(dsaX_splice ${PSRDada_LIB}) - - add_executable(gpu_flagger gpu_flagger.cu) - target_link_libraries(gpu_flagger ${PSRDada_LIB}) -endif() - -add_executable(dsaX_store dsaX_store.c) -target_link_libraries(dsaX_store ${PSRDada_LIB}) - 
-add_executable(dsaX_fluff dsaX_fluff.c) -target_link_libraries(dsaX_fluff ${PSRDada_LIB}) - -# DMH: intrinsics compilation error -#add_executable(dsaX_reorder dsaX_reorder.c) -#target_link_libraries(dsaX_reorder ${PSRDada_LIB}) - -# DMH: /scratch/CPviolator/work/DSA110/dsa110-xengine/src/dsaX_nicdb.c: In function ‘process’: -#/scratch/CPviolator/work/DSA110/dsa110-xengine/src/dsaX_nicdb.c:145:65: warning: integer overflow in expression of type ‘int’ results in ‘-1073741824’ [-Woverflow] -# 145 | uint64_t shifty = (bdepth-1)*NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NCHAN_FIL; -add_executable(dsaX_nicdb dsaX_nicdb.c) -target_link_libraries(dsaX_nicdb ${PSRDada_LIB}) - -add_executable(dsaX_dbnic dsaX_dbnic.c) -target_link_libraries(dsaX_dbnic ${PSRDada_LIB}) - -add_executable(dsaX_capture dsaX_capture.c) -target_link_libraries(dsaX_capture ${PSRDada_LIB}) - -add_executable(dsaX_capture_thread dsaX_capture_thread.c) -target_link_libraries(dsaX_capture_thread ${PSRDada_LIB}) - -add_executable(dsaX_capture_manythread dsaX_capture_manythread.c) -target_link_libraries(dsaX_capture_manythread ${PSRDada_LIB}) - -add_executable(dsaX_split dsaX_split.c) -target_link_libraries(dsaX_split ${PSRDada_LIB} -lm) - -add_executable(dsaX_merge dsaX_merge.c) -target_link_libraries(dsaX_merge ${PSRDada_LIB}) - -add_executable(dsaX_simplesplit dsaX_simplesplit.c) -target_link_libraries(dsaX_simplesplit ${PSRDada_LIB}) - -add_executable(dsaX_fake dsaX_fake.c) -target_link_libraries(dsaX_fake ${PSRDada_LIB}) - -add_executable(dsaX_splitup dsaX_splitup.c) -target_link_libraries(dsaX_splitup ${PSRDada_LIB}) - -add_executable(dsaX_copydb dsaX_copydb.c) -target_link_libraries(dsaX_copydb ${PSRDada_LIB}) - -# DMH: fitsio dependency -if(0) - add_executable(dsaX_writevis dsaX_writevis.c) - target_link_libraries(dsaX_writevis ${PSRDada_LIB}) -endif() - -# DMH: XGPU dependencies -add_executable(dsaX_wrangle dsaX_wrangle.c) -target_link_libraries(dsaX_wrangle ${PSRDada_LIB} ${CUDA_nvml_LIBRARY} 
${XGPU_LIB}) - -add_executable(dsaX_testdada dsaX_testdada.c) -target_link_libraries(dsaX_testdada ${PSRDada_LIB}) - -add_executable(dsaX_xgpu dsaX_xgpu.cu) -target_link_libraries(dsaX_xgpu ${PSRDada_LIB} ${XGPU_LIB} ${CUDA_nvml_LIBRARY}) - -add_executable(dsaX_cuda_correlator dsaX_cuda_correlator.cu) -target_link_libraries(dsaX_cuda_correlator ${XGPU_LIB} ${CUDA_nvml_LIBRARY} ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) - -add_executable(dsaX_reorder_raw dsaX_reorder_raw.c) -target_link_libraries(dsaX_reorder_raw ${PSRDada_LIB}) - -add_executable(fil2dada fil2dada.c) -target_link_libraries(fil2dada ${PSRDada_LIB}) - -add_executable(dumpfil dumpfil.c) -target_link_libraries(dumpfil ${PSRDada_LIB}) - -add_executable(dsaX_beamformer dsaX_beamformer.cu) -target_link_libraries(dsaX_beamformer ${PSRDada_LIB}) - -add_executable(dsaX_beamformer_passon dsaX_beamformer_passon.cu) -target_link_libraries(dsaX_beamformer_passon ${PSRDada_LIB}) - -add_executable(dsaX_beamformer_offline dsaX_beamformer_offline.cu) -target_link_libraries(dsaX_beamformer_offline ${PSRDada_LIB}) -#------------------------------------------------------ diff --git a/legacy/dsaX_cutlass_interface.cu~ b/legacy/dsaX_cutlass_interface.cu~ deleted file mode 100644 index a51d5a2..0000000 --- a/legacy/dsaX_cutlass_interface.cu~ +++ /dev/null @@ -1,315 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - **************************************************************************************************/ - -#include "dsaX_cutlass_interface.h" - -DSA_FTD_ComplexGEMM_CUTLASS::DSA_FTD_ComplexGEMM_CUTLASS(Options const &options): - problem_size(options.problem_size), batch_count(options.batch_count) { - - // Allocate device memory for batched planar complex GEMM - tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2); - tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2); - tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); - tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); - tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); - - ptr_A_real.reset(batch_count); - ptr_A_imag.reset(batch_count); - ptr_B_real.reset(batch_count); - ptr_B_imag.reset(batch_count); - ptr_C_real.reset(batch_count); - ptr_C_imag.reset(batch_count); - ptr_D_real.reset(batch_count); - ptr_D_imag.reset(batch_count); -} - -// DMH: Replace this with data from DSA-FTD -void DSA_FTD_ComplexGEMM_CUTLASS::initialize() { - - if(testing) { - uint64_t seed = 1234; - - // Use small integers to simplify correctness checking - int scope_max = 6; - int scope_min = -6; - - BlockFillRandomUniform(tensor_A.get(), tensor_A.size(), seed, Element(scope_max), Element(scope_min), 0); - BlockFillRandomUniform(tensor_B.get(), tensor_B.size(), seed * 2019, Element(scope_max), Element(scope_min), 0); - BlockFillRandomUniform(tensor_C.get(), tensor_C.size(), seed * 2020, Element(scope_max), Element(scope_min), 0); - } else { - // DMH: construct DSA-FTD interface data transfer interface - } - - ptr_A = tensor_A.get(); - ptr_B = tensor_B.get(); - ptr_C = tensor_C.get(); - ptr_D = tensor_D.get(); - - batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2; - batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2; - batch_stride_C = int64_t(problem_size.m()) * 
problem_size.n() * 2; - batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2; - - lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0); - ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0); - ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); - ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); - - imag_stride_A = int64_t(problem_size.m()) * problem_size.k(); - imag_stride_B = int64_t(problem_size.k()) * problem_size.n(); - imag_stride_C = int64_t(problem_size.m()) * problem_size.n(); - imag_stride_D = int64_t(problem_size.m()) * problem_size.n(); - -} - -Result DSA_FTD_ComplexGEMM_CUTLASS::run(Options const &options) { - - Result result; - - initialize(); - - // Configure pointers in global memory - struct { - Element *base; - void **ptr_real; - void **ptr_imag; - int64_t batch_stride; - int64_t imag_stride; - } tensors[] = {{ tensor_A.get(), ptr_A_real.get(), ptr_A_imag.get(), batch_stride_A, imag_stride_A}, - { tensor_B.get(), ptr_B_real.get(), ptr_B_imag.get(), batch_stride_B, imag_stride_B}, - { tensor_C.get(), ptr_C_real.get(), ptr_C_imag.get(), batch_stride_C, imag_stride_C}, - { tensor_D.get(), ptr_D_real.get(), ptr_D_imag.get(), batch_stride_D, imag_stride_D}}; - - for (auto const &tensor : tensors) { - for (int idx = 0; idx < batch_count; ++idx) { - - cudaError_t error; - void *ptr_real = tensor.base + idx * tensor.batch_stride; - void *ptr_imag = tensor.base + idx * tensor.batch_stride + tensor.imag_stride; - - error = cudaMemcpy(tensor.ptr_real + idx, &ptr_real, sizeof(void *), cudaMemcpyHostToDevice); - if (error != cudaSuccess) throw std::runtime_error("Failed to copy pointer to device memory"); - - error = cudaMemcpy(tensor.ptr_imag + idx, &ptr_imag, sizeof(void *), cudaMemcpyHostToDevice); - if (error != cudaSuccess) throw std::runtime_error("Failed to copy pointer to device memory"); - - } - } - - - cudaEvent_t events[2]; - for (auto & event : events) { - 
result.error = cudaEventCreate(&event); - if (result.error != cudaSuccess) { - std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; - return -1; - } - } - - // Record an event at the start of a series of GEMM operations - result.error = cudaEventRecord(events[0]); - if (result.error != cudaSuccess) { - std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; - return result; - } - - // Run profiling loop - //------------------- - // Execute the planar complex array GEMM kernel via the CUTLASS Library's - // dispatch routines. - // - // Note, for planar complex array GEMM kernels, all numeric type arguments - // specify the data type of the base real types. These are understood to - // apply to planar complex representations of matrices in memory and to complex - // structures for scalars. - // - // See tools/library/include/cutlass/library/handle.h for more details. - // - for (int iter = 0; iter < options.iterations; ++iter) { - - result.status = handle.gemm_planar_complex_array( - problem_size.m(), // expected GEMM M dimension - problem_size.n(), // expected GEMM N dimension - problem_size.k(), // expected GEMM K dimension - batch_count, // Number of batched elements - - nullptr, - nullptr, - nullptr, - - cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued accumulation - cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued alpha/beta scalars - - &options.alpha, // Pointer to alpha scalar, of type complex - - cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued A matrix - cutlass::library::LayoutTypeID::kColumnMajor, // Layout of A matrix - cutlass::library::ComplexTransform::kConjugate, // Complex transformation on A matrix operand - - ptr_A_real.get(), // Pointer to array of pointers to real part of A matrix - ptr_A_imag.get(), // Pointer to array of pointers to imaginary part of A matrix - - lda, // Leading dimension of 
real part of A matrix - lda, // Leading dimension of imaginary part of A matrix - - cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued B matrix - cutlass::library::LayoutTypeID::kColumnMajor, // Layout of B matrix - cutlass::library::ComplexTransform::kNone, // Complex transformation on B matrix operand - - ptr_B_real.get(), // Pointer to array of pointers to real part of B matrix - ptr_B_imag.get(), // Pointer to array of pointers to imaginary part of B matrix - - ldb, // Leading dimension of real part of B matrix - ldb, // Leading dimension of imaginary part of B matrix - - &options.beta, // Pointer to beta scalar, of type complex - - cutlass::library::NumericTypeID::kF16, // Base data type of complex valued C and D matrices - - ptr_C_real.get(), // Pointer to array of pointers to real part of C matrix - ptr_C_imag.get(), // Pointer to array of pointers to imaginary part of C matrix - - ldc, // Leading dimension of real part of C matrix - ldc, // Leading dimension of imaginary part of C matrix - - ptr_D_real.get(), // Pointer to array of pointers to real part of D matrix - ptr_D_imag.get(), // Pointer to array of pointers to imaginary part of D matrix - - ldd, // Leading dimension of real part of D matrix - ldd // Leading dimension of imaginary part of D matrix - ); - - if (result.status != cutlass::Status::kSuccess) { - std::cerr << "CUTLASS internal error - configuration not supported" << std::endl; - return result; - } - } - - // Record an event when the GEMM operations have been launched. - result.error = cudaEventRecord(events[1]); - if (result.error != cudaSuccess) { - std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; - return result; - } - - // Wait for work on the device to complete. 
- result.error = cudaEventSynchronize(events[1]); - if (result.error != cudaSuccess) { - std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; - return result; - } - - // Measure elapsed runtime - float runtime_ms = 0; - result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); - if (result.error != cudaSuccess) { - std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; - return result; - } - - // Compute average runtime and GFLOPs. - result.runtime_ms = double(runtime_ms) / double(options.iterations); - result.gflops = options.gflops(result.runtime_ms / 1000.0); - - // Cleanup - for (auto event : events) { - (void)cudaEventDestroy(event); - } - - if (handle.get_last_operation()) { - std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl; - } - - // Compute reference in device code - if (options.reference_check) { - - result.passed = true; - - for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) { - // Define the GEMM through templates - GemmPlanarComplex - (problem_size, options.alpha, - {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A}, - cutlass::ComplexTransform::kConjugate, - {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B}, - cutlass::ComplexTransform::kNone, - options.beta, - {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C}, - {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D} - ); - - Element epsilon = 0.1_hf; - Element nonzero_floor = 0.1_hf; - - result.passed = BlockCompareRelativelyEqual - ( - tensor_D.get() + idx * batch_stride_D, - tensor_D_ref.get() + idx * batch_stride_D, - batch_stride_D, - epsilon, - nonzero_floor - ); - } - - if (result.passed) std::cout << "Reference check passed." << std::endl; - else std::cerr << "Error - reference check failed." 
<< std::endl; - } - - std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; - std::cout << " GFLOPs: " << result.gflops << std::endl; - - return result; -} - - int main(int argc, char const **args) { - cudaDeviceProp props; - cudaError_t error = cudaGetDeviceProperties(&props, 0); - if (error != cudaSuccess) { - std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; - return -1; - } - - Options options; - options.parse(argc, args); - - if (options.help) { - options.print_usage(std::cout) << std::endl; - return 0; - } - - // Compute GEMM - testing = true; - DSA_FTD_ComplexGEMM_CUTLASS gemm(options); - Result result = gemm.run(options); - - return result.passed ? 0 : -1; -} - diff --git a/legacy/dsaX_cutlass_interface.h~ b/legacy/dsaX_cutlass_interface.h~ deleted file mode 100644 index 42a3e8a..0000000 --- a/legacy/dsaX_cutlass_interface.h~ +++ /dev/null @@ -1,174 +0,0 @@ -#pragma once - -#include -#include -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/util/command_line.h" -#include "cutlass/util/distribution.h" -#include "cutlass/util/device_memory.h" -#include "cutlass/util/tensor_view_io.h" -#include "cutlass/util/host_tensor_planar_complex.h" -#include "cutlass/util/reference/device/tensor_fill.h" -#include "cutlass/util/reference/device/gemm_planar_complex.h" -#include "cutlass/util/reference/device/tensor_compare.h" -#include "cutlass/library/handle.h" - -using namespace cutlass; -using namespace gemm; -using namespace library; -using namespace layout; -using namespace reference; -using namespace device; - -// Result structure -struct Result { - - double runtime_ms; - double gflops; - Status status; - cudaError_t error; - bool passed; - - Result(double runtime_ms = 0, double gflops = 0, Status status = Status::kSuccess, cudaError_t error = cudaSuccess): - runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { } -}; - -// Command line 
options parsing (testing) -struct Options { - - bool help; - GemmCoord problem_size; - int batch_count; - complex alpha; - complex beta; - bool reference_check; - int iterations; - - Options(): - help(false), - problem_size({1024, 1024, 1024}), - batch_count(1), - reference_check(false), - iterations(20), - alpha(1), - beta() { } - - // Parses the command line - void parse(int argc, char const **args) { - - CommandLine cmd(argc, args); - if (cmd.check_cmd_line_flag("help")) { - help = true; - } - - cmd.get_cmd_line_argument("m", problem_size.m()); - cmd.get_cmd_line_argument("n", problem_size.n()); - cmd.get_cmd_line_argument("k", problem_size.k()); - cmd.get_cmd_line_argument("batch", batch_count); - - cmd.get_cmd_line_argument("alpha", alpha.real()); - cmd.get_cmd_line_argument("alpha_i", alpha.imag()); - cmd.get_cmd_line_argument("beta", beta.real()); - cmd.get_cmd_line_argument("beta_i", beta.imag()); - - cmd.get_cmd_line_argument("iterations", iterations); - } - - /// Prints the usage statement. 
- std::ostream & print_usage(std::ostream &out) const { - - out << "dsaX_cutlass_interface\n\n" - << " This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n" - << "Options:\n\n" - << " --help If specified, displays this usage statement.\n\n" - << " --m= GEMM M dimension\n" - << " --n= GEMM N dimension\n" - << " --k= GEMM K dimension\n" - << " --batch= Number of GEMM operations executed in one batch\n" - << " --alpha= Epilogue scalar alpha (real part)\n" - << " --alpha_i= Epilogue scalar alpha (imaginary part)\n" - << " --beta= Epilogue scalar beta (real part)\n\n" - << " --beta_i= Epilogue scalar beta (imaginary part)\n\n" - << " --iterations= Number of profiling iterations to perform.\n"; - - return out; - } - - /// Compute performance in GFLOP/s - double gflops(double runtime_s) const { - - // Number of real-valued multiply-adds - int64_t fmas = problem_size.product() * batch_count * 4; - - // Two flops per multiply-add - return 2.0 * double(fmas) / double(1.0e9) / runtime_s; - } -}; - -/// Performance test environment for planar complex -class DSA_FTD_ComplexGEMM_CUTLASS { - - // Half-precision input and output - using Element = half_t; - - // Configurations for layouts and internal computation - using LayoutA = ColumnMajor; - using LayoutB = ColumnMajor; - using LayoutC = ColumnMajor; - using ElementCompute = float; - using ElementAccumulator = float; - - Handle handle; - - GemmCoord problem_size; - int batch_count; - DeviceAllocation tensor_A; - DeviceAllocation tensor_B; - DeviceAllocation tensor_C; - DeviceAllocation tensor_D; - DeviceAllocation tensor_D_ref; - - DeviceAllocation ptr_A_real; - DeviceAllocation ptr_A_imag; - DeviceAllocation ptr_B_real; - DeviceAllocation ptr_B_imag; - DeviceAllocation ptr_C_real; - DeviceAllocation ptr_C_imag; - DeviceAllocation ptr_D_real; - DeviceAllocation ptr_D_imag; - - Element *ptr_A; - Element *ptr_B; - Element *ptr_C; - Element *ptr_D; - - int64_t batch_stride_A; - int64_t 
batch_stride_B; - int64_t batch_stride_C; - int64_t batch_stride_D; - - typename LayoutA::Stride::Index lda; - typename LayoutB::Stride::Index ldb; - typename LayoutC::Stride::Index ldc; - typename LayoutC::Stride::Index ldd; - - int64_t imag_stride_A; - int64_t imag_stride_B; - int64_t imag_stride_C; - int64_t imag_stride_D; - - bool testing; - -public: - // Constructors - DSA_FTD_ComplexGEMM_CUTLASS(Options const &options); - DSA_FTD_ComplexGEMM_CUTLASS(); - - // Methods - void initialize(); - Result run(Options const &options); - - -}; - diff --git a/legacy/planar_complex.cu~ b/legacy/planar_complex.cu~ deleted file mode 100644 index db94a64..0000000 --- a/legacy/planar_complex.cu~ +++ /dev/null @@ -1,85 +0,0 @@ -/* -#include -#include -#include -#include - -int main() { - - cutlass::half_t x = 2.25_hf; - - std::cout << x << std::endl; - - return 0; -} -*/ - -#include -#include - -#include - -int main() { - - // Define the GEMM operation - using Gemm = cutlass::gemm::device::Gemm< - cutlass::half_t, // ElementA - cutlass::layout::ColumnMajor, // LayoutA - cutlass::half_t, // ElementB - cutlass::layout::ColumnMajor, // LayoutB - cutlass::half_t, // ElementOutput - cutlass::layout::ColumnMajor, // LayoutOutput - float, // ElementAccumulator - cutlass::arch::OpClassTensorOp, // tag indicating Tensor Cores - cutlass::arch::Sm75 // tag indicating target GPU compute architecture - >; - - Gemm gemm_op; - cutlass::Status status; - - // - // Define the problem size - // - int M = 512; - int N = 256; - int K = 128; - - float alpha = 1.25f; - float beta = -1.25f; - - // - // Allocate device memory - // - - cutlass::HostTensor A({M, K}); - cutlass::HostTensor B({K, N}); - cutlass::HostTensor C({M, N}); - - cutlass::half_t const *ptrA = A.device_data(); - cutlass::half_t const *ptrB = B.device_data(); - cutlass::half_t const *ptrC = C.device_data(); - cutlass::half_t *ptrD = C.device_data(); - - int lda = A.device_ref().stride(0); - int ldb = B.device_ref().stride(0); - int 
ldc = C.device_ref().stride(0); - int ldd = C.device_ref().stride(0); - // - // Launch GEMM on the device - // - - status = gemm_op({ - {M, N, K}, - {ptrA, lda}, // TensorRef to A device tensor - {ptrB, ldb}, // TensorRef to B device tensor - {ptrC, ldc}, // TensorRef to C device tensor - {ptrD, ldd}, // TensorRef to D device tensor - may be the same as C - {alpha, beta} // epilogue operation arguments - }); - - if (status != cutlass::Status::kSuccess) { - return -1; - } - - return 0; -} From 0cad89c35b03f5b7ee01b672da07132a45cfc92f Mon Sep 17 00:00:00 2001 From: cpviolator Date: Sat, 15 Jun 2024 22:50:15 -0700 Subject: [PATCH 13/30] Remove CPM from CMakeLists --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d4328d9..451d6fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,6 @@ set(CMAKE_CXX_FLAGS "-pthread") # add a directory for cmake modules list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") -include(cmake/CPM.cmake) # DSA_XENGINE may be built to run using CUDA. 
Future version may be # written for HIP or SYCL, which we call the From 03b30e9cee9a863e18e7a2a622b27d6e6934d525 Mon Sep 17 00:00:00 2001 From: cpviolator Date: Sat, 15 Jun 2024 23:17:10 -0700 Subject: [PATCH 14/30] clean up, add utils to install --- CMakeLists.txt | 3 +- include/dsaX.h | 82 ++++++++++++++++++++++ src/dsaX_bfCorr.cu | 171 +++++++-------------------------------------- 3 files changed, 108 insertions(+), 148 deletions(-) create mode 100644 include/dsaX.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 451d6fe..4d276ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -168,8 +168,9 @@ else() find_package(xGPU REQUIRED) endif() -# Add src, legacy +# Add src, utils, legacy add_subdirectory(src) +add_subdirectory(utils) option(DSA_XENGINE_BUILD_LEGACY "Build legacy code (will not install if built)" OFF) if(DSA_XENGINE_BUILD_LEGACY) add_subdirectory(legacy) diff --git a/include/dsaX.h b/include/dsaX.h new file mode 100644 index 0000000..c7afcb0 --- /dev/null +++ b/include/dsaX.h @@ -0,0 +1,82 @@ +// -*- c++ -*- +/* assumes input and output block size is appropriate - will seg fault otherwise*/ +/* +Workflow is similar for BF and corr applications + - copy data to GPU, convert to half-precision and calibrate while reordering + - do matrix operations to populate large output vector + */ +#include +#include +using std::cout; +using std::cerr; +using std::endl; +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "multilog.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_def.h" + +#include +#include "cuda_fp16.h" +#include +#include + +// define structure that carries around device memory +typedef struct dmem { + + // initial data and streams + char * h_input; // host input pointer + char * d_input, * d_tx; // [NPACKETS_PER_BLOCK, NANTS, 
NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] + + // correlator pointers + // giant array for r and i: [NCHAN_PER_PACKET, 2 pol, NANTS_PROCESS, NPACKETS_PER_BLOCK * 2 times] + half * d_r, * d_i; + // arrays for matrix multiply output: input [NANTS_PROCESS, NANTS_PROCESS] + half * d_outr, *d_outi, *d_tx_outr, *d_tx_outi; + // giant output array: [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex] + float * d_output; + + // beamformer pointers + char * d_big_input; + half * d_br, * d_bi; + half * weights_r, * weights_i; //weights: [arm, tactp, b] + half * d_bigbeam_r, * d_bigbeam_i; //output: [tc, b] + unsigned char * d_bigpower; //output: [b, tc] + float * d_scf; // scale factor per beam + float * d_chscf; + float * h_winp; + int * flagants, nflags; + float * h_freqs, * d_freqs; + + // timing + float cp, prep, cubl, outp; + +} dmem; + +// Initialise device memory +void initialize(dmem * d, int bf); + +// Deallocate device memory +void deallocate(dmem * d, int bf); + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); + +// ? 
+int dada_bind_thread_to_core(int core); diff --git a/src/dsaX_bfCorr.cu b/src/dsaX_bfCorr.cu index 25b9262..e992394 100644 --- a/src/dsaX_bfCorr.cu +++ b/src/dsaX_bfCorr.cu @@ -32,6 +32,7 @@ using std::endl; #include "dada_affinity.h" #include "ascii_header.h" #include "dsaX_def.h" +#include "dsaX.h" #include #include "cuda_fp16.h" @@ -47,39 +48,6 @@ using std::endl; /* global variables */ int DEBUG = 1; -// define structure that carries around device memory -typedef struct dmem { - - // initial data and streams - char * h_input; // host input pointer - char * d_input, * d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] - - // correlator pointers - // giant array for r and i: [NCHAN_PER_PACKET, 2 pol, NANTS_PROCESS, NPACKETS_PER_BLOCK * 2 times] - half * d_r, * d_i; - // arrays for matrix multiply output: input [NANTS_PROCESS, NANTS_PROCESS] - half * d_outr, *d_outi, *d_tx_outr, *d_tx_outi; - // giant output array: [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex] - float * d_output; - - // beamformer pointers - char * d_big_input; - half * d_br, * d_bi; - half * weights_r, * weights_i; //weights: [arm, tactp, b] - half * d_bigbeam_r, * d_bigbeam_i; //output: [tc, b] - unsigned char * d_bigpower; //output: [b, tc] - float * d_scf; // scale factor per beam - float * d_chscf; - float * h_winp; - int * flagants, nflags; - float * h_freqs, * d_freqs; - - // timing - float cp, prep, cubl, outp; - -} dmem; - - // allocate device memory void initialize(dmem * d, int bf) { @@ -161,9 +129,6 @@ void deallocate(dmem * d, int bf) { } -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); -int dada_bind_thread_to_core (int core); - void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) { @@ -181,7 +146,6 @@ void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) } - void usage() { fprintf (stdout, @@ -211,64 +175,11 @@ __global__ void corr_input_copy(char *input, half *inr, half *ini) { } - -// arbitrary transpose kernel -// assume 
breakdown into tiles of 32x32, and run with 32x8 threads per block -// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32) -// here, width is the dimension of the fastest index -__global__ void transpose_matrix_char(char * idata, char * odata) { - - __shared__ char tile[32][33]; - - int x = blockIdx.x * 32 + threadIdx.x; - int y = blockIdx.y * 32 + threadIdx.y; - int width = gridDim.x * 32; - - for (int j = 0; j < 32; j += 8) - tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; - - __syncthreads(); - - x = blockIdx.y * 32 + threadIdx.x; // transpose block offset - y = blockIdx.x * 32 + threadIdx.y; - width = gridDim.y * 32; - - for (int j = 0; j < 32; j += 8) - odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; - -} - -// arbitrary transpose kernel -// assume breakdown into tiles of 32x32, and run with 32x8 threads per block -// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32) -// here, width is the dimension of the fastest index -__global__ void transpose_matrix_float(half * idata, half * odata) { - - __shared__ half tile[32][33]; - - int x = blockIdx.x * 32 + threadIdx.x; - int y = blockIdx.y * 32 + threadIdx.y; - int width = gridDim.x * 32; - - for (int j = 0; j < 32; j += 8) - tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; - - __syncthreads(); - - x = blockIdx.y * 32 + threadIdx.x; // transpose block offset - y = blockIdx.x * 32 + threadIdx.y; - width = gridDim.y * 32; - - for (int j = 0; j < 32; j += 8) - odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; - -} - -// arbitrary transpose kernel +// transpose kernel // assume breakdown into tiles of 32x32, and run with 32x8 threads per block // launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32) // here, width is the dimension of the fastest index -template __global__ void transpose_matrix_template(in_prec * idata, out_prec * odata) { +template __global__ void transpose_matrix(in_prec * idata, out_prec * odata) { 
__shared__ in_prec tile[32][33]; @@ -300,43 +211,8 @@ void reorder_input(char *input, char * tx, half *inr, half *ini) { // transpose input data dim3 dimBlock(32, 8), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32); - transpose_matrix_char<<>>(input,tx); - /* - // set up for geam - cublasHandle_t cublasH = NULL; - cudaStream_t stream = NULL; - cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); - cublasSetStream(cublasH, stream); - - // transpose input matrix into tx - cublasOperation_t transa = CUBLAS_OP_T; - cublasOperation_t transb = CUBLAS_OP_N; - const int m = NPACKETS_PER_BLOCK * NANTS; - const int n = NCHAN_PER_PACKET*2*2/8; // columns in output - const double alpha = 1.0; - const double beta = 0.0; - const int lda = n; - const int ldb = m; - const int ldc = ldb; - cublasDgeam(cublasH,transa,transb,m,n, - &alpha,(double *)(input), - lda,&beta,(double *)(tx), - ldb,(double *)(tx),ldc); - */ - // now we just need to fluff to half-precision + transpose_matrix<<>>(input,tx); corr_input_copy<<>>(tx,inr,ini); - - // look at output - /*char * odata = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4*2); - cudaMemcpy(odata,inr,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4*2,cudaMemcpyDeviceToHost); - FILE *fout; - fout=fopen("test.test","wb"); - fwrite(odata,1,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4*2,fout); - fclose(fout);*/ - - // destroy stream - //cudaStreamDestroy(stream); - } // kernel to help with reordering output @@ -377,8 +253,8 @@ void reorder_output(dmem * d) { // transpose input data dim3 dimBlock(32, 8), dimGrid((NANTS*NANTS)/32,(NCHAN_PER_PACKET*2*2*halfFac)/32); - transpose_matrix_float<<>>(d->d_outr,d->d_tx_outr); - transpose_matrix_float<<>>(d->d_outi,d->d_tx_outi); + transpose_matrix<<>>(d->d_outr,d->d_tx_outr); + transpose_matrix<<>>(d->d_outi,d->d_tx_outi); // look at output /*char * odata = (char *)malloc(sizeof(char)*384*4*NANTS*NANTS*2*halfFac); @@ -617,7 +493,7 @@ __global__ void 
sum_beam(unsigned char * input, float * output) { __shared__ float summ[512]; int bidx = blockIdx.x; int tidx = threadIdx.x; - int idx = bidx*256+tidx; + //int idx = bidx*256+tidx; int bm = (int)(bidx/48); int ch = (int)(bidx % 48); @@ -675,7 +551,7 @@ void dbeamformer(dmem * d) { const long long int strideB = (NBEAMS/2)*4*(NANTS/2)*8*2*2; const long long int strideC = (NPACKETS_PER_BLOCK/4)*NBEAMS/2; const int batchCount = NCHAN_PER_PACKET/8; - long long int i1, i2, o1; + long long int i1, i2;//, o1; // create streams cudaStream_t stream; @@ -790,13 +666,13 @@ __global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, floa int idx = (int)(iidx % (128*(NANTS/2)*(NBEAMS/2))); int bm = (int)(idx / (128*(NANTS/2))); int tactp = (int)(idx % (128*(NANTS/2))); - int t = (int)(tactp / (32*(NANTS/2))); + //int t = (int)(tactp / (32*(NANTS/2))); int actp = (int)(tactp % (32*(NANTS/2))); int a = (int)(actp / 32); int ctp = (int)(actp % 32); - int c = (int)(ctp / 4); + //int c = (int)(ctp / 4); int tp = (int)(ctp % 4); - int t2 = (int)(tp / 2); + //int t2 = (int)(tp / 2); int pol = (int)(tp % 2); int widx = (a+48*iArm)*(NCHAN_PER_PACKET/8)*2*2 + fq*2*2 + pol*2; @@ -843,18 +719,19 @@ void calc_weights(dmem * d) { cudaMalloc((void **)(&d_calibs), sizeof(float)*NANTS*(NCHAN_PER_PACKET/8)*2*2); // deal with antpos and calibs - int iant, found; + //int iant; + //int found; for (int i=0;ih_winp[2*i]; antpos_n[i] = d->h_winp[2*i+1]; } for (int i=0;inflags;j++) - if (d->flagants[j]==iant) found = 1; + // DEBUG CODE? 
+ //iant = (int)(i/((NCHAN_PER_PACKET/8)*2)); + //found = 0; + //for (int j=0;jnflags;j++) + //if (d->flagants[j]==iant) found = 1; calibs[2*i] = d->h_winp[2*NANTS+2*i]; calibs[2*i+1] = d->h_winp[2*NANTS+2*i+1]; @@ -1087,7 +964,7 @@ int main (int argc, char *argv[]) { // test mode FILE *fin, *fout; uint64_t output_size; - char * output_data, * o1; + char * output_data;//, * o1; if (test) { // read one block of input data @@ -1135,7 +1012,7 @@ int main (int argc, char *argv[]) { // free free(d.h_input); free(output_data); - free(o1); + //free(o1); deallocate(&d,bf); exit(1); @@ -1213,18 +1090,18 @@ int main (int argc, char *argv[]) { else syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS); uint64_t bytes_read = 0; - char * block; + //char * block; char * output_buffer; output_buffer = (char *)malloc(block_out); uint64_t written, block_id; // get things started bool observation_complete=0; - bool started = 0; + //bool started = 0; syslog(LOG_INFO, "starting observation"); int blocks = 0; - clock_t begin, end; - double time_spent; + //clock_t begin, end; + //double time_spent; while (!observation_complete) { From e260867d558da934b996c25504c79b5d9c7bc1d7 Mon Sep 17 00:00:00 2001 From: cpviolator Date: Fri, 21 Jun 2024 16:24:16 -0700 Subject: [PATCH 15/30] Add more dependencies, clean CMake defaults to cuBLAS only --- CMakeLists.txt | 220 +++++++++++++++++++++++++++++++-------------- README.md | 4 + include/dsaX.h | 44 +++++++-- src/CMakeLists.txt | 62 +++++++------ src/dsaX_bfCorr.cu | 9 +- 5 files changed, 231 insertions(+), 108 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4d276ea..2da1445 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,45 +25,21 @@ set(CMAKE_CXX_FLAGS "-pthread") # add a directory for cmake modules list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") -# DSA_XENGINE may be built to run using CUDA. 
Future version may be +# DSA_XENGINE may be built to run using CUDA or CPU. Future version may be # written for HIP or SYCL, which we call the # Target type. By default, the target is CUDA. +#--------------------------------------------- + +# Set by environment variable if visible if(DEFINED ENV{DSA_XENGINE_TARGET}) set(DEFTARGET $ENV{DSA_XENGINE_TARGET}) else() set(DEFTARGET "CUDA") endif() -set(VALID_TARGET_TYPES CUDA) #HIP SYCL -set(DSA_XENGINE_TARGET_TYPE - "${DEFTARGET}" - CACHE STRING "Choose the type of target, options are: ${VALID_TARGET_TYPES}") -set_property(CACHE DSA_XENGINE_TARGET_TYPE PROPERTY STRINGS CUDA) - -# CUDA specific part of CMakeLists -#set(CMAKE_CUDA_EXTENSIONS OFF) -find_package(CUDAToolkit REQUIRED) - -if(DEFINED ENV{DSA_XENGINE_GPU_ARCH}) - set(DSA_XENGINE_DEFAULT_GPU_ARCH $ENV{DSA_XENGINE_GPU_ARCH}) -else() - set(DSA_XENGINE_DEFAULT_GPU_ARCH sm_70) -endif() -if(NOT DSA_XENGINE_GPU_ARCH) - message(STATUS "Building DSA_XENGINE for GPU ARCH " "${DSA_XENGINE_DEFAULT_GPU_ARCH}") -endif() - -set(DSA_XENGINE_GPU_ARCH - ${DSA_XENGINE_DEFAULT_GPU_ARCH} - CACHE STRING "set the GPU architecture (sm_60, sm_70, sm_80 sm_90)") -set_property(CACHE DSA_XENGINE_GPU_ARCH PROPERTY STRINGS sm_60 sm_70 sm_80 sm_90) -set(DSA_XENGINE_GPU_ARCH_SUFFIX - "" - CACHE STRING "set the GPU architecture suffix (virtual, real). 
Leave empty for no suffix.") -set_property(CACHE DSA_XENGINE_GPU_ARCH_SUFFIX PROPERTY STRINGS "real" "virtual" " ") -#set(CMAKE_CUDA_ARCHITECTURES ${DSA_XENGINE_GPU_ARCH}) -#mark_as_advanced(DSA_XENGINE_GPU_ARCH_SUFFIX) -#mark_as_advanced(CMAKE_CUDA_ARCHITECTURES) +set(VALID_TARGET_TYPES CUDA CPU) #HIP SYCL +set(DSA_XENGINE_TARGET_TYPE "${DEFTARGET}" CACHE STRING "Choose the type of target, options are: ${VALID_TARGET_TYPES}") +set_property(CACHE DSA_XENGINE_TARGET_TYPE PROPERTY STRINGS "CUDA" "CPU") # HIP SYCL string(TOUPPER ${DSA_XENGINE_TARGET_TYPE} CHECK_TARGET_TYPE) list(FIND VALID_TARGET_TYPES ${CHECK_TARGET_TYPE} TARGET_TYPE_VALID) @@ -72,7 +48,8 @@ if(TARGET_TYPE_VALID LESS 0) message(SEND_ERROR "Please specify a valid DSA_XENGINE_TARGET_TYPE type! Valid target types are:" "${VALID_TARGET_TYPES}") endif() -# Git +# Git helpers +#------------ find_package(Git) if(GIT_FOUND) execute_process( @@ -106,66 +83,173 @@ endif(GIT_FOUND) # Use ExternalProject_Add for CUTLASS (long build time, version 2.11.0 for sm_8x arch) include(ExternalProject) -# Get TCC dependency -option(DSA_XENGINE_USE_TCC "Use TensorCoreCorrelators for correlatorss" ON) -if(DSA_XENGINE_USE_TCC) - option(DSA_XENGINE_DOWNLOAD_TCC "Download, build, link (and install) TCC" OFF) - if(DSA_XENGINE_DOWNLOAD_TCC) - ExternalProject_Add(TCC - GIT_REPOSITORY https://git.astron.nl/RD/tensor-core-correlator - #GIT_TAG 11d8a4a504d7073a2a33b81e1e387b12e58a420c - ) +# Use FetchContent for lightweight dependencies +include(FetchContent) + +# CUDA based dependencies and options +#------------------------------------ +if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA) + + # CUDA specific part of CMakeLists + #set(CMAKE_CUDA_EXTENSIONS OFF) + find_package(CUDAToolkit REQUIRED) + + # Get GPU architecture from environmen, or set default (sm_80) + if(DEFINED ENV{DSA_XENGINE_GPU_ARCH}) + set(DSA_XENGINE_DEFAULT_GPU_ARCH $ENV{DSA_XENGINE_GPU_ARCH}) else() - find_package(libtcc REQUIRED) + set(DSA_XENGINE_DEFAULT_GPU_ARCH 
sm_80) endif() -endif() -# Get CUTLASS dependency -option(DSA_XENGINE_USE_CUTLASS "Use CUTLASS for GEMMs" ON) -if(DSA_XENGINE_USE_CUTLASS) - option(DSA_XENGINE_DOWNLOAD_CUTLASS "Download, build (only the required kernels) link (and install) CUTLASS" OFF) - if(DSA_XENGINE_DOWNLOAD_CUTLASS) - # Custom CUTLASS build - ExternalProject_Add(NvidiaCutlass - GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git - GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc + if(NOT DSA_XENGINE_GPU_ARCH) + message(STATUS "Building DSA_XENGINE for GPU ARCH " "${DSA_XENGINE_DEFAULT_GPU_ARCH}") + endif() + + set(DSA_XENGINE_GPU_ARCH + ${DSA_XENGINE_DEFAULT_GPU_ARCH} + CACHE STRING "set the GPU architecture (sm_60, sm_70, sm_80 sm_90)") + set_property(CACHE DSA_XENGINE_GPU_ARCH PROPERTY STRINGS sm_60 sm_70 sm_80 sm_90) + set(DSA_XENGINE_GPU_ARCH_SUFFIX + "" + CACHE STRING "set the GPU architecture suffix (virtual, real). Leave empty for no suffix.") + set_property(CACHE DSA_XENGINE_GPU_ARCH_SUFFIX PROPERTY STRINGS "real" "virtual" " ") + #set(CMAKE_CUDA_ARCHITECTURES ${DSA_XENGINE_GPU_ARCH}) + #mark_as_advanced(DSA_XENGINE_GPU_ARCH_SUFFIX) + #mark_as_advanced(CMAKE_CUDA_ARCHITECTURES) + + # Set CUDA based methods and dependencies + #---------------------------------------- + + # This is the default GPU method + option(DSA_XENGINE_ENABLE_CUBLAS "Use cuBLAS for correlatorss" ON) + + # All other GPU methods can be enabled at compile time and + # toggled for use at run time, if enabled. 
+ + # Get TCC dependency + option(DSA_XENGINE_ENABLE_TCC "Use TensorCoreCorrelators for correlatorss" OFF) + if(DSA_XENGINE_ENABLE_TCC) + option(DSA_XENGINE_DOWNLOAD_TCC "Download, build, link (and install) TCC" OFF) + if(DSA_XENGINE_DOWNLOAD_TCC) + ExternalProject_Add(TCC + GIT_REPOSITORY https://git.astron.nl/RD/tensor-core-correlator + #GIT_TAG 11d8a4a504d7073a2a33b81e1e387b12e58a420c + CMAKE_ARGS "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}" + ) + else() + find_package(libtcc REQUIRED) + endif() + endif() + + # Get CUTLASS dependency + option(DSA_XENGINE_ENABLE_CUTLASS "Use CUTLASS for GEMMs" OFF) + if(DSA_XENGINE_ENABLE_CUTLASS) + option(DSA_XENGINE_DOWNLOAD_CUTLASS "Download, build (only the required kernels) link (and install) CUTLASS" OFF) + if(DSA_XENGINE_DOWNLOAD_CUTLASS) + # Custom CUTLASS build + ExternalProject_Add(NvidiaCutlass + GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git + GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc + CMAKE_ARGS + "-DCUTLASS_NVCC_ARCHS_ENABLED=89" + "-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex" + "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}" + ) + else() + find_package(NvidiaCutlass REQUIRED) + endif() + endif() + + # Get MAGMA dependency + option(DSA_XENGINE_ENABLE_MAGMA "Use MAGMA for GEMMs" OFF) + if(DSA_XENGINE_ENABLE_MAGMA) + option(DSA_XENGINE_DOWNLOAD_MAGMA "Download, build (only the required kernels) link (and install) MAGMA" OFF) + if(DSA_XENGINE_DOWNLOAD_MAGMA) + # Custom MAGMA build + ExternalProject_Add(Magma + URL https://icl.utk.edu/projectsfiles/magma/downloads/magma-2.8.0.tar.gz + CMAKE_ARGS + "-DMAGMA_ENABLE_CUDA=ON" + "-DGPU_TARGET=sm_80" + "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}" + ) + else() + find_package(Magma REQUIRED) + endif() + endif() + + # Get XGPU dependency (fix install) + option(DSA_XENGINE_ENABLE_XGPU "Use xGPU for correlatorss" OFF) + if(DSA_XENGINE_ENABLE_XGPU) + option(DSA_XENGINE_DOWNLOAD_XGPU "Download and build xGPU" OFF) + 
if(DSA_XENGINE_DOWNLOAD_XGPU) + # Download, build and install + FetchContent_Declare( + xGPU + GIT_REPOSITORY https://github.com/cpviolator/xGPU.git + #GIT_TAG 13b7fff1eac497236eb9c38e179aed3b532a88f2 + ) + FetchContent_MakeAvailable(XGPU) + else() + # Find and link to local install + find_package(xGPU REQUIRED) + endif() + endif() + +endif() # CUDA functionality + +# Get CPU based dependencies +# Get OPENBLAS dependency +option(DSA_XENGINE_ENABLE_OPENBLAS "Use OPENBLAS for GEMMs" OFF) +if(DSA_XENGINE_ENABLE_OPENBLAS) + option(DSA_XENGINE_DOWNLOAD_OPENBLAS "Download, build, link, and install OPENBLAS" OFF) + if(DSA_XENGINE_DOWNLOAD_OPENBLAS) + # Custom OPENBLAS build + ExternalProject_Add(Openblas + GIT_REPOSITORY https://github.com/OpenMathLib/OpenBLAS.git + GIT_TAG ce3f668 CMAKE_ARGS - "-DCUTLASS_NVCC_ARCHS_ENABLED=89" - "-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex" + #"-DOPENBLAS_ENABLE_CUDA=ON" + #"-DGPU_TARGET=sm_80" "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}" ) else() - find_package(NvidiaCutlass REQUIRED) + find_package(Openblas REQUIRED) endif() endif() -# Use FetchContent for lightweight dependencies -include(FetchContent) - # Get psrdada dependency +option(DSA_XENGINE_ENABLE_PSRDADA "Use PSRDada for correlatorss" ON) option(DSA_XENGINE_DOWNLOAD_PSRDADA "Download and build PSRDada" ON) -if(DSA_XENGINE_DOWNLOAD_PSRDADA) +if(DSA_XENGINE_DOWNLOAD_PSRDADA) + # Download, build and install FetchContent_Declare( PSRDada GIT_REPOSITORY git://git.code.sf.net/p/psrdada/code - #GIT_TAG 008afa70393ae2df11efba0cc8d0b95cda599c02 ) FetchContent_MakeAvailable(PSRDada) else() + # Find and link to local install find_package(psrdada REQUIRED) endif() -# Get XGPU dependency -option(DSA_XENGINE_DOWNLOAD_XGPU "Download and build XGPU" ON) -if(DSA_XENGINE_DOWNLOAD_XGPU) +# Get command line util + +# Get cli11 dependency +# FIX ME: het static .hpp version and ship with package +option(DSA_XENGINE_ENABLE_CLI11 "Enable CLI11 (required)" ON) 
+option(DSA_XENGINE_DOWNLOAD_CLI11 "Download and build Cli11" ON) +if(DSA_XENGINE_DOWNLOAD_CLI11) + # Download, build and install FetchContent_Declare( - xGPU - GIT_REPOSITORY https://github.com/cpviolator/xGPU.git - #GIT_TAG 13b7fff1eac497236eb9c38e179aed3b532a88f2 + CLI11 + GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git + GIT_TAG main ) - FetchContent_MakeAvailable(XGPU) + FetchContent_MakeAvailable(CLI11) else() - find_package(xGPU REQUIRED) + # Find and link to local install + find_package(CLI11 REQUIRED) endif() # Add src, utils, legacy diff --git a/README.md b/README.md index 03fe5e3..4a27ba5 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@ # dsa110-xengine +

+ GitHub last commit + GitHub commit activity the past week +

This repo contains code used for the DSA X-engine. The requirements are to: - capture SNAP F-engine packets on an ethernet interface, and place them in a psrdada buffer diff --git a/include/dsaX.h b/include/dsaX.h index c7afcb0..2fe6246 100644 --- a/include/dsaX.h +++ b/include/dsaX.h @@ -1,15 +1,10 @@ -// -*- c++ -*- -/* assumes input and output block size is appropriate - will seg fault otherwise*/ -/* -Workflow is similar for BF and corr applications - - copy data to GPU, convert to half-precision and calibrate while reordering - - do matrix operations to populate large output vector - */ #include #include +#include using std::cout; using std::cerr; using std::endl; +#include #include #include #include @@ -32,6 +27,7 @@ using std::endl; #include "dada_affinity.h" #include "ascii_header.h" #include "dsaX_def.h" +#include "dsaX_enums.h" #include #include "cuda_fp16.h" @@ -70,8 +66,40 @@ typedef struct dmem { } dmem; +// Structure that carries BLAS parameters +typedef struct dsaBLASParam_s { + size_t struct_size; /**< Size of this struct in bytes. Used to ensure that the host application and DSA see the same struct*/ + + dsaBLASType blas_type; /**< Type of BLAS computation to perfrom */ + + // GEMM params + dsaBLASOperation trans_a; /**< operation op(A) that is non- or (conj.) transpose. */ + dsaBLASOperation trans_b; /**< operation op(B) that is non- or (conj.) transpose. */ + int m; /**< number of rows of matrix op(A) and C. */ + int n; /**< number of columns of matrix op(B) and C. */ + int k; /**< number of columns of op(A) and rows of op(B). */ + int lda; /**< leading dimension of two-dimensional array used to store the matrix A. */ + int ldb; /**< leading dimension of two-dimensional array used to store matrix B. */ + int ldc; /**< leading dimension of two-dimensional array used to store matrix C. */ + int a_offset; /**< position of the A array from which begin read/write. */ + int b_offset; /**< position of the B array from which begin read/write. 
*/ + int c_offset; /**< position of the C array from which begin read/write. */ + int a_stride; /**< stride of the A array in strided(batched) mode */ + int b_stride; /**< stride of the B array in strided(batched) mode */ + int c_stride; /**< stride of the C array in strided(batched) mode */ + std::complex alpha; /**< scalar used for multiplication. */ + std::complex beta; /**< scalar used for multiplication. If beta==0, C does not have to be a valid input. */ + + // Common params + int batch_count; /**< number of pointers contained in arrayA, arrayB and arrayC. */ + dsaBLASDataType data_type; /**< Specifies if using S(C) or D(Z) BLAS type */ + dsaBLASDataOrder data_order; /**< Specifies if using Row or Column major */ + +} dsaBLASParam; + + // Initialise device memory -void initialize(dmem * d, int bf); +void initialize_device_memeory(dmem * d, int bf); // Deallocate device memory void deallocate(dmem * d, int bf); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 748f00b..aabd03f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,32 +1,40 @@ enable_language(CUDA) -include_directories(..//include) -include_directories(${PSRDada_SOURCE_DIR}/src) -include_directories(${xGPU_SOURCE_DIR}/src) -include_directories(${NvidiaCutlass_DIR}/../../../include) -include_directories(${NvidiaCutlass_DIR}/../../../include/cutlass/util) - -set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so) -set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a) -set(NvidiaCutlass_LIB ${NvidiaCutlass_DIR}/../../../lib64/libcutlass.so) - -# Some simple CUTLASS examples to test linking/benching -#------------------------------------------------------ -add_executable(planar_complex planar_complex.cu) -target_link_libraries(planar_complex ${NvidiaCutlass_LIB}) - -add_executable(10_planar_complex 10_planar_complex.cu) -target_link_libraries(10_planar_complex ${NvidiaCutlass_LIB}) - -add_executable(11_planar_complex_array 11_planar_complex_array.cu) 
-target_link_libraries(11_planar_complex_array ${NvidiaCutlass_LIB}) -#------------------------------------------------------ - -# DSA Fast Time Domain CUTLASS interface -#--------------------------------------- -add_executable(dsaX_cutlass_interface dsaX_cutlass_interface.cu) -target_link_libraries(dsaX_cutlass_interface ${NvidiaCutlass_LIB}) -#--------------------------------------- +include_directories(../include) + +if(DSA_XENGINE_ENABLE_PSRDADA) + include_directories(${PSRDada_SOURCE_DIR}/src) + set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so) +endif() + +if(DSA_XENGINE_ENABLE_XGPU) + include_directories(${xGPU_SOURCE_DIR}/src) + set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a) +endif() + +if(DSA_XENGINE_ENABLE_CUTLASS) + include_directories(${NvidiaCutlass_DIR}/../../../include) + include_directories(${NvidiaCutlass_DIR}/../../../include/cutlass/util) + set(NvidiaCutlass_LIB ${NvidiaCutlass_DIR}/../../../lib64/libcutlass.so) + + # Some simple CUTLASS examples to test linking/benching + #------------------------------------------------------ + add_executable(planar_complex planar_complex.cu) + target_link_libraries(planar_complex ${NvidiaCutlass_LIB}) + + add_executable(10_planar_complex 10_planar_complex.cu) + target_link_libraries(10_planar_complex ${NvidiaCutlass_LIB}) + + add_executable(11_planar_complex_array 11_planar_complex_array.cu) + target_link_libraries(11_planar_complex_array ${NvidiaCutlass_LIB}) + #------------------------------------------------------ + + # DSA Fast Time Domain CUTLASS interface + #--------------------------------------- + add_executable(dsaX_cutlass_interface dsaX_cutlass_interface.cu) + target_link_libraries(dsaX_cutlass_interface ${NvidiaCutlass_LIB}) + #--------------------------------------- +endif() # DSA Fast Time Domain #--------------------- diff --git a/src/dsaX_bfCorr.cu b/src/dsaX_bfCorr.cu index e992394..d1b7a70 100644 --- a/src/dsaX_bfCorr.cu +++ b/src/dsaX_bfCorr.cu @@ -49,7 +49,7 @@ using std::endl; int 
DEBUG = 1; // allocate device memory -void initialize(dmem * d, int bf) { +void initialize_device_memory(dmem * d, int bf) { // for correlator if (bf==0) { @@ -337,7 +337,7 @@ void dcorrelator(dmem * d) { cudaMemcpy(d->d_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,cudaMemcpyHostToDevice); // reorder input - reorder_input(d->d_input,d->d_tx,d->d_r,d->d_i); + reorder_input(d->d_input, d->d_tx, d->d_r, d->d_i); // not sure if essential cudaDeviceSynchronize(); @@ -771,7 +771,7 @@ void calc_weights(dmem * d) { int main (int argc, char *argv[]) { - cudaSetDevice(1); + cudaSetDevice(0); // startup syslog message // using LOG_LOCAL0 @@ -793,7 +793,6 @@ int main (int argc, char *argv[]) { int test = 0; char ftest[200], fflagants[200], fcalib[200]; float sfreq = 1498.75; - while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:bdh")) != -1) { @@ -927,7 +926,7 @@ int main (int argc, char *argv[]) { // allocate device memory dmem d; - initialize(&d,bf); + initialize_device_memory(&d,bf); // set up for beamformer FILE *ff; From 0656729e526569115a94f4c85cb1b3a940cd1733 Mon Sep 17 00:00:00 2001 From: cpviolator Date: Fri, 21 Jun 2024 17:40:51 -0700 Subject: [PATCH 16/30] Add test directory, googletest, rename file --- CMakeLists.txt | 30 +++++++++++++++---- src/CMakeLists.txt | 26 ++++++++-------- ...fCorr.cu => dsaX_beamformer_correlator.cu} | 0 3 files changed, 37 insertions(+), 19 deletions(-) rename src/{dsaX_bfCorr.cu => dsaX_beamformer_correlator.cu} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2da1445..f29ca32 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -233,12 +233,10 @@ else() find_package(psrdada REQUIRED) endif() -# Get command line util - -# Get cli11 dependency -# FIX ME: het static .hpp version and ship with package +# Get CLI11 dependency +# FIX ME: get static .hpp version and ship with package option(DSA_XENGINE_ENABLE_CLI11 "Enable CLI11 (required)" ON) -option(DSA_XENGINE_DOWNLOAD_CLI11 "Download and build Cli11" ON) 
+option(DSA_XENGINE_DOWNLOAD_CLI11 "Download and build CLI11" ON) if(DSA_XENGINE_DOWNLOAD_CLI11) # Download, build and install FetchContent_Declare( @@ -252,9 +250,29 @@ else() find_package(CLI11 REQUIRED) endif() -# Add src, utils, legacy +# Get CLI11 dependency +# FIX ME: get static .hpp version and ship with package +option(DSA_XENGINE_ENABLE_GOOGLETEST "Enable GOOGLETEST (required)" ON) +option(DSA_XENGINE_DOWNLOAD_GOOGLETEST "Download and build GOOGLETEST" ON) +if(DSA_XENGINE_DOWNLOAD_GOOGLETEST) + # Download, build and install + FetchContent_Declare( + GOOGLETEST + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG main + ) + FetchContent_MakeAvailable(GOOGLETEST) +else() + # Find and link to local install + find_package(GOOGLETEST REQUIRED) +endif() + + +# Add src, utils, include, tests, and legacy add_subdirectory(src) add_subdirectory(utils) +add_subdirectory(include) +add_subdirectory(tests) option(DSA_XENGINE_BUILD_LEGACY "Build legacy code (will not install if built)" OFF) if(DSA_XENGINE_BUILD_LEGACY) add_subdirectory(legacy) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index aabd03f..290b414 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -38,28 +38,28 @@ endif() # DSA Fast Time Domain #--------------------- -add_executable(dsaX_bfCorr dsaX_bfCorr.cu) -target_link_libraries(dsaX_bfCorr ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) +add_library(dsa OBJECT dsaX_beamformer_correlator.cu) + +add_executable(dsaX_beamformer_correlator dsaX_beamformer_correlator.cu) +target_link_libraries(dsaX_beamformer_correlator ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) #--------------------- -# install step for header files -#------------------------------ -set(DSA_XENGINE_HEADERS +# install step for libraray +#----------------------------- +install(TARGETS # cmake-format: sortable - dsaX_capture.h - dsaX_capture_manythread.h - dsaX_capture_pcap.h - dsaX_def.h - dsaX_cutlass_interface.h + dsa + LIBRARY DESTINATION + lib ) -install(FILES 
${DSA_XENGINE_HEADERS} DESTINATION include) -#------------------------------ +#----------------------------- + # install step for executables #----------------------------- install(TARGETS # cmake-format: sortable - dsaX_bfCorr + dsaX_beamformer_correlator RUNTIME DESTINATION bin ) diff --git a/src/dsaX_bfCorr.cu b/src/dsaX_beamformer_correlator.cu similarity index 100% rename from src/dsaX_bfCorr.cu rename to src/dsaX_beamformer_correlator.cu From a50da91b41124c0ddfc7fb13e2ff6822e45561b3 Mon Sep 17 00:00:00 2001 From: cpviolator Date: Fri, 21 Jun 2024 20:55:21 -0700 Subject: [PATCH 17/30] Restructure headers, create dsalib, move executable out to tests --- include/dsaX.h | 29 +- include/dsaX_def.h | 6 +- src/CMakeLists.txt | 25 +- src/dsaX_beamformer_correlator.cu | 586 ++---------------------------- 4 files changed, 68 insertions(+), 578 deletions(-) diff --git a/include/dsaX.h b/include/dsaX.h index 2fe6246..ddd351a 100644 --- a/include/dsaX.h +++ b/include/dsaX.h @@ -1,9 +1,8 @@ +#pragma once + #include #include #include -using std::cout; -using std::cerr; -using std::endl; #include #include #include @@ -18,21 +17,17 @@ using std::endl; #include #include -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "multilog.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" -#include "dsaX_def.h" -#include "dsaX_enums.h" +#include "dsaX_cuda_headers.h" +#include "dsaX_psrdada_headers.h" + +// required to prevent overflow in corr matrix multiply +#define halfFac 4 + +// beam sep +#define sep 1.0 // arcmin -#include -#include "cuda_fp16.h" -#include -#include +/* global variables */ +//#define DEBUG; // define structure that carries around device memory typedef struct dmem { diff --git a/include/dsaX_def.h b/include/dsaX_def.h index 415e83b..c23ed15 100644 --- a/include/dsaX_def.h +++ b/include/dsaX_def.h @@ -1,5 +1,4 @@ -#ifndef __DSAX_DEF_H -#define __DSAX_DEF_H +#pragma once #include 
"dada_def.h" @@ -93,6 +92,3 @@ #define NBMS 256 #define P_SIZE 4108 #define NWAIT 100000 - -#endif - diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 290b414..a55c8fe 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,21 +2,32 @@ enable_language(CUDA) include_directories(../include) +# DSA Fast Time Domain library +#----------------------------- +add_library(dsa SHARED dsaX_cuda_interface.cu dsaX_blas_interface.cu dsaX_beamformer_correlator.cu) + +if(CUDAToolkit_FOUND) + target_link_libraries(dsa CUDA::cudart) +endif() + if(DSA_XENGINE_ENABLE_PSRDADA) include_directories(${PSRDada_SOURCE_DIR}/src) set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so) + target_link_libraries(dsa ${PSRDada_LIB}) endif() if(DSA_XENGINE_ENABLE_XGPU) include_directories(${xGPU_SOURCE_DIR}/src) set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a) + target_link_libraries(dsa PUBLIC ${XGPU_LIB}) endif() if(DSA_XENGINE_ENABLE_CUTLASS) include_directories(${NvidiaCutlass_DIR}/../../../include) include_directories(${NvidiaCutlass_DIR}/../../../include/cutlass/util) set(NvidiaCutlass_LIB ${NvidiaCutlass_DIR}/../../../lib64/libcutlass.so) - + target_link_libraries(dsa PUBLIC ${NvidiaCutlass_LIB}) + # Some simple CUTLASS examples to test linking/benching #------------------------------------------------------ add_executable(planar_complex planar_complex.cu) @@ -36,12 +47,10 @@ if(DSA_XENGINE_ENABLE_CUTLASS) #--------------------------------------- endif() -# DSA Fast Time Domain -#--------------------- -add_library(dsa OBJECT dsaX_beamformer_correlator.cu) - -add_executable(dsaX_beamformer_correlator dsaX_beamformer_correlator.cu) -target_link_libraries(dsaX_beamformer_correlator ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) +if(CUDAToolkit_FOUND) + #add_executable(dsaX_beamformer_correlator dsaX_beamformer_correlator.cu) + #target_link_libraries(dsaX_beamformer_correlator ${dsa} ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) +endif() #--------------------- # install step for libraray @@ 
-59,7 +68,7 @@ install(TARGETS #----------------------------- install(TARGETS # cmake-format: sortable - dsaX_beamformer_correlator + #dsaX_beamformer_correlator RUNTIME DESTINATION bin ) diff --git a/src/dsaX_beamformer_correlator.cu b/src/dsaX_beamformer_correlator.cu index d1b7a70..c91c1b7 100644 --- a/src/dsaX_beamformer_correlator.cu +++ b/src/dsaX_beamformer_correlator.cu @@ -5,340 +5,61 @@ Workflow is similar for BF and corr applications - copy data to GPU, convert to half-precision and calibrate while reordering - do matrix operations to populate large output vector */ -#include -#include -using std::cout; -using std::cerr; -using std::endl; -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "dada_client.h" -#include "dada_def.h" -#include "dada_hdu.h" -#include "multilog.h" -#include "ipcio.h" -#include "ipcbuf.h" -#include "dada_affinity.h" -#include "ascii_header.h" #include "dsaX_def.h" #include "dsaX.h" +#include "dsaX_blas_interface.h" -#include -#include "cuda_fp16.h" -#include -#include +//#include +//#include "cuda_fp16.h" +//#include +//#include -// required to prevent overflow in corr matrix multiply -#define halfFac 4 +#include "dsaX_cuda_interface.h" -// beam sep -#define sep 1.0 // arcmin - -/* global variables */ int DEBUG = 1; -// allocate device memory -void initialize_device_memory(dmem * d, int bf) { - - // for correlator - if (bf==0) { - cudaMalloc((void **)(&d->d_input), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); - cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2); - cudaMalloc((void **)(&d->d_i), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2); - cudaMalloc((void **)(&d->d_tx), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); - cudaMalloc((void **)(&d->d_output), sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2); - cudaMalloc((void **)(&d->d_outr), 
sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); - cudaMalloc((void **)(&d->d_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); - cudaMalloc((void **)(&d->d_tx_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); - cudaMalloc((void **)(&d->d_tx_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); - } - - // for beamformer - if (bf==1) { - cudaMalloc((void **)(&d->d_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2); - cudaMalloc((void **)(&d->d_big_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS)*NCHAN_PER_PACKET*2*2); - cudaMalloc((void **)(&d->d_tx), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2); - cudaMalloc((void **)(&d->d_br), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2); - cudaMalloc((void **)(&d->d_bi), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2); - cudaMalloc((void **)(&d->weights_r), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); - cudaMalloc((void **)(&d->weights_i), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); - cudaMalloc((void **)(&d->d_bigbeam_r), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)); - cudaMalloc((void **)(&d->d_bigbeam_i), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)); - cudaMalloc((void **)(&d->d_bigpower), sizeof(unsigned char)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS)); - cudaMalloc((void **)(&d->d_scf), sizeof(float)*(NBEAMS/2)); // beam scale factor - cudaMalloc((void **)(&d->d_chscf), sizeof(float)*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); // beam scale factor - - // input weights: first is [NANTS, E/N], then [NANTS, 48, 2pol, R/I] - d->h_winp = (float *)malloc(sizeof(float)*(NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2)); - d->flagants = (int *)malloc(sizeof(int)*NANTS); - d->h_freqs = (float *)malloc(sizeof(float)*(NCHAN_PER_PACKET/8)); - cudaMalloc((void **)(&d->d_freqs), sizeof(float)*(NCHAN_PER_PACKET/8)); - - // timers - 
d->cp = 0.; - d->prep = 0.; - d->outp = 0.; - d->cubl = 0.; - - } - -} - -// deallocate device memory -void deallocate(dmem * d, int bf) { - - cudaFree(d->d_input); - - if (bf==0) { - cudaFree(d->d_r); - cudaFree(d->d_i); - cudaFree(d->d_tx); - cudaFree(d->d_output); - cudaFree(d->d_outr); - cudaFree(d->d_outi); - cudaFree(d->d_tx_outr); - cudaFree(d->d_tx_outi); - } - if (bf==1) { - cudaFree(d->d_tx); - cudaFree(d->d_br); - cudaFree(d->d_bi); - cudaFree(d->weights_r); - cudaFree(d->weights_i); - cudaFree(d->d_bigbeam_r); - cudaFree(d->d_bigbeam_i); - cudaFree(d->d_bigpower); - cudaFree(d->d_scf); - cudaFree(d->d_chscf); - free(d->h_winp); - free(d->flagants); - cudaFree(d->d_freqs); - free(d->h_freqs); - } - -} - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out) +void dsaX_dbgpu_cleanup(dada_hdu_t * in, dada_hdu_t * out) { - - if (dada_hdu_unlock_read (in) < 0) - { - syslog(LOG_ERR, "could not unlock read on hdu_in"); - } + if (dada_hdu_unlock_read (in) < 0) syslog(LOG_ERR, "could not unlock read on hdu_in"); dada_hdu_destroy (in); - - if (dada_hdu_unlock_write (out) < 0) - { - syslog(LOG_ERR, "could not unlock write on hdu_out"); - } - dada_hdu_destroy (out); - -} - -void usage() -{ -fprintf (stdout, - "dsaX_bfCorr [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -i in_key [default REORDER_BLOCK_KEY]\n" - " -o out_key [default XGPU_BLOCK_KEY]\n" - " -b run beamformer [default is to run correlator]\n" - " -h print usage\n" - " -t binary file for test mode\n" - " -f flagants file\n" - " -a calib file\n" - " -s start frequency (assumes -0.244140625MHz BW)\n"); -} - -// kernel to fluff input -// run with 128 threads and NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 blocks -__global__ void corr_input_copy(char *input, half *inr, half *ini) { - - int bidx = blockIdx.x; // assume NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 - int tidx = threadIdx.x; // assume 128 - int iidx = bidx*128+tidx; - - 
inr[iidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(15)) << 4) >> 4)); - ini[iidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4)); - -} - -// transpose kernel -// assume breakdown into tiles of 32x32, and run with 32x8 threads per block -// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32) -// here, width is the dimension of the fastest index -template __global__ void transpose_matrix(in_prec * idata, out_prec * odata) { - - __shared__ in_prec tile[32][33]; - int x = blockIdx.x * 32 + threadIdx.x; - int y = blockIdx.y * 32 + threadIdx.y; - int width = gridDim.x * 32; - - for (int j = 0; j < 32; j += 8) - tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; - - __syncthreads(); - - x = blockIdx.y * 32 + threadIdx.x; // transpose block offset - y = blockIdx.x * 32 + threadIdx.y; - width = gridDim.y * 32; - - for (int j = 0; j < 32; j += 8) - odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; - -} - - -// function to copy and reorder d_input to d_r and d_i -// input is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] -// output is [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS] -// starts by running transpose on [NPACKETS_PER_BLOCK * NANTS, NCHAN_PER_PACKET * 2 * 2] matrix in doubleComplex form. 
-// then fluffs using simple kernel -void reorder_input(char *input, char * tx, half *inr, half *ini) { - - // transpose input data - dim3 dimBlock(32, 8), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32); - transpose_matrix<<>>(input,tx); - corr_input_copy<<>>(tx,inr,ini); -} - -// kernel to help with reordering output -// outr and outi are [NANTS, NANTS, NCHAN_PER_PACKET, 2time, 2pol, halfFac] -// run with NCHAN_PER_PACKET*2*NBASE/128 blocks of 128 threads -__global__ void corr_output_copy(half *outr, half *outi, float *output, int *indices_lookup) { - - int bidx = blockIdx.x; // assume NCHAN_PER_PACKET*2*NBASE/128 - int tidx = threadIdx.x; // assume 128 - int idx = bidx*128+tidx; - - int baseline = (int)(idx / (NCHAN_PER_PACKET * 2)); - int chpol = (int)(idx % (NCHAN_PER_PACKET * 2)); - int ch = (int)(chpol / 2); - int base_idx = indices_lookup[baseline]; - int iidx = base_idx * NCHAN_PER_PACKET + ch; - int pol = (int)(chpol % 2); - - float v1=0., v2=0.; - - for (int i=0;i>>(d->d_outr,d->d_tx_outr); - transpose_matrix<<>>(d->d_outi,d->d_tx_outi); - - // look at output - /*char * odata = (char *)malloc(sizeof(char)*384*4*NANTS*NANTS*2*halfFac); - cudaMemcpy(odata,d->d_tx_outr,384*4*NANTS*NANTS*2*halfFac,cudaMemcpyDeviceToHost); - FILE *fout; - fout=fopen("test2.test","wb"); - fwrite(odata,sizeof(char),384*4*NANTS*NANTS*2*halfFac,fout); - fclose(fout);*/ - - - /* - // set up for geam - cublasHandle_t cublasH = NULL; - cudaStream_t stream = NULL; - cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); - cublasSetStream(cublasH, stream); - - // transpose output matrices into tx_outr and tx_outi - cublasOperation_t transa = CUBLAS_OP_T; - cublasOperation_t transb = CUBLAS_OP_N; - const int m = NCHAN_PER_PACKET*2*2; - const int n = NANTS*NANTS/16; // columns in output - const double alpha = 1.0; - const double beta = 0.0; - const int lda = n; - const int ldb = m; - const int ldc = ldb; - cublasDgeam(cublasH,transa,transb,m,n, - &alpha,(double 
*)(d->d_outr), - lda,&beta,(double *)(d->d_tx_outr), - ldb,(double *)(d->d_tx_outr),ldc); - cublasDgeam(cublasH,transa,transb,m,n, - &alpha,(double *)(d->d_outi), - lda,&beta,(double *)(d->d_tx_outi), - ldb,(double *)(d->d_tx_outi),ldc); - */ - // now run kernel to sum into output - int * h_idxs = (int *)malloc(sizeof(int)*NBASE); - int * d_idxs; - cudaMalloc((void **)(&d_idxs), sizeof(int)*NBASE); - int ii = 0; - // upper triangular order (column major) to match xGPU (not the same as CASA!) - for (int i=0;i>>(d->d_tx_outr,d->d_tx_outi,d->d_output,d_idxs); - - /*char * odata = (char *)malloc(sizeof(char)*384*4*NBASE*4); - cudaMemcpy(odata,d->d_output,384*4*NBASE*4,cudaMemcpyDeviceToHost); - FILE *fout; - fout=fopen("test3.test","wb"); - fwrite(odata,sizeof(char),384*4*NBASE*4,fout); - fclose(fout);*/ - + if (dada_hdu_unlock_write (out) < 0) syslog(LOG_ERR, "could not unlock write on hdu_out"); + dada_hdu_destroy (out); - cudaFree(d_idxs); - free(h_idxs); - //cudaStreamDestroy(stream); +} +void usage() { + fprintf (stdout, + "dsaX_beamformer_correlator [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -i in_key [default REORDER_BLOCK_KEY]\n" + " -o out_key [default XGPU_BLOCK_KEY]\n" + " -b run beamformer [default is to run correlator]\n" + " -h print usage\n" + " -t binary file for test mode\n" + " -f flagants file\n" + " -a calib file\n" + " -s start frequency (assumes -0.244140625MHz BW)\n"); } - - // correlator function // workflow: copy to device, reorder, stridedBatchedGemm, reorder -void dcorrelator(dmem * d) { +void dcorrelator(dmem *d) { // zero out output arrays - cudaMemset(d->d_outr,0,NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half)); - cudaMemset(d->d_outi,0,NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half)); - cudaMemset(d->d_output,0,NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float)); + cudaMemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half)); + 
cudaMemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half)); + cudaMemset(d->d_output, 0, NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float)); // copy to device - cudaMemcpy(d->d_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,cudaMemcpyHostToDevice); + cudaMemcpy(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2, cudaMemcpyHostToDevice); // reorder input - reorder_input(d->d_input, d->d_tx, d->d_r, d->d_i); + reorder_input_device(d->d_input, d->d_tx, d->d_r, d->d_i); + // ABSTRACT HERE START // not sure if essential cudaDeviceSynchronize(); @@ -401,120 +122,10 @@ void dcorrelator(dmem * d) { // destroy stream cudaStreamDestroy(stream); cublasDestroy(cublasH); + // ABSTRACT HERE END // reorder output data - reorder_output(d); - -} - -// kernels to reorder and fluff input data for beamformer -// initial data is [NPACKETS_PER_BLOCK, (NANTS/2), NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] -// want [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, (NANTS/2), 8chan, 2 times, 2 pol, 4-bit complex] // run as 16x16 tiled transpose with 32-byte words -// launch with dim3 dimBlock(16, 8) and dim3 dimGrid(Width/16, Height/16) -// here, width=NCHAN_PER_PACKET/8 is the dimension of the fastest input index -// dim3 dimBlock1(16, 8), dimGrid1(NCHAN_PER_PACKET/8/16, (NPACKETS_PER_BLOCK)*(NANTS/2)/16); -__global__ void transpose_input_bf(double * idata, double * odata) { - - __shared__ double tile[16][17][4]; - - int x = blockIdx.x * 16 + threadIdx.x; - int y = blockIdx.y * 16 + threadIdx.y; - int width = gridDim.x * 16; - - for (int j = 0; j < 16; j += 8) { - tile[threadIdx.y+j][threadIdx.x][0] = idata[4*((y+j)*width + x)]; - tile[threadIdx.y+j][threadIdx.x][1] = idata[4*((y+j)*width + x)+1]; - tile[threadIdx.y+j][threadIdx.x][2] = idata[4*((y+j)*width + x)+2]; - tile[threadIdx.y+j][threadIdx.x][3] = idata[4*((y+j)*width + x)+3]; - } - - __syncthreads(); - - x = blockIdx.y * 16 + threadIdx.x; // transpose block offset - y = 
blockIdx.x * 16 + threadIdx.y; - width = gridDim.y * 16; - - for (int j = 0; j < 16; j += 8) { - odata[4*((y+j)*width + x)] = tile[threadIdx.x][threadIdx.y + j][0]; - odata[4*((y+j)*width + x)+1] = tile[threadIdx.x][threadIdx.y + j][1]; - odata[4*((y+j)*width + x)+2] = tile[threadIdx.x][threadIdx.y + j][2]; - odata[4*((y+j)*width + x)+3] = tile[threadIdx.x][threadIdx.y + j][3]; - } - -} - -// kernel to fluff input bf data -// run with NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 blocks of 128 threads -__global__ void fluff_input_bf(char * input, half * dr, half * di) { - - int bidx = blockIdx.x; // assume NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 - int tidx = threadIdx.x; // assume 128 - int idx = bidx*128+tidx; - - dr[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(15)) << 4) >> 4))); - di[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(240))) >> 4))); - -} - -// transpose, add and scale kernel for bf -// assume breakdown into tiles of 16x16, and run with 16x8 threads per block -// launch with dim3 dimBlock(16, 8) and dim3 dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16) -// scf is a per-beam scale factor to enable recasting as unsigned char -__global__ void transpose_scale_bf(half * ir, half * ii, unsigned char * odata) { - - __shared__ float tile[16][17]; - - int x = blockIdx.x * 16 + threadIdx.x; - int y = blockIdx.y * 16 + threadIdx.y; - int width = gridDim.x * 16; - float dr, di; - - for (int j = 0; j < 16; j += 8) { - dr = (float)(ir[(y+j)*width + x]); - di = (float)(ii[(y+j)*width + x]); - tile[threadIdx.y+j][threadIdx.x] = (dr*dr+di*di); - } - - __syncthreads(); - - x = blockIdx.y * 16 + threadIdx.x; // transpose block offset - y = blockIdx.x * 16 + threadIdx.y; - width = gridDim.y * 16; - - for (int j = 0; j < 16; j += 8) - odata[(y+j)*width + x] = (unsigned char)(tile[threadIdx.x][threadIdx.y + j]/128.); - -} - -// sum over 
all times in output beam array -// run with (NCHAN_PER_PACKET/8)*(NBEAMS/2) blocks of (NPACKETS_PER_BLOCK/4) threads -__global__ void sum_beam(unsigned char * input, float * output) { - - __shared__ float summ[512]; - int bidx = blockIdx.x; - int tidx = threadIdx.x; - //int idx = bidx*256+tidx; - int bm = (int)(bidx/48); - int ch = (int)(bidx % 48); - - summ[tidx] = (float)(input[bm*256*48 + tidx*48 + ch]); - - __syncthreads(); - - if (tidx<256) { - summ[tidx] += summ[tidx+256]; - summ[tidx] += summ[tidx+128]; - summ[tidx] += summ[tidx+64]; - summ[tidx] += summ[tidx+32]; - summ[tidx] += summ[tidx+16]; - summ[tidx] += summ[tidx+8]; - summ[tidx] += summ[tidx+4]; - summ[tidx] += summ[tidx+2]; - summ[tidx] += summ[tidx+1]; - } - - if (tidx==0) output[bidx] = summ[tidx]; - + reorder_output_device(d); } /* @@ -635,13 +246,9 @@ void dbeamformer(dmem * d) { transpose_scale_bf<<>>(d->d_bigbeam_r,d->d_bigbeam_i,d->d_bigpower+iArm*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)); end = clock(); d->outp += (float)(end - begin) / CLOCKS_PER_SEC; - - } cudaStreamDestroy(stream); - - cublasDestroy(cublasH); // form sum over times @@ -649,126 +256,9 @@ void dbeamformer(dmem * d) { } -// kernel to populate an instance of weights matrix [2, (NCHAN_PER_PACKET/8), NBEAMS/2, 4times*(NANTS/2)*8chan*2tim*2pol] -// run with 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128 blocks of 128 threads -__global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs) { - - int bidx = blockIdx.x; - int tidx = threadIdx.x; - int inidx = bidx*128+tidx; - - // 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2) - - // get indices - int iArm = (int)(inidx / ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2))); - int iidx = (int)(inidx % ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2))); - int fq = (int)(iidx / (128*(NANTS/2)*(NBEAMS/2))); - int idx = (int)(iidx % (128*(NANTS/2)*(NBEAMS/2))); - int bm = (int)(idx / (128*(NANTS/2))); - int 
tactp = (int)(idx % (128*(NANTS/2))); - //int t = (int)(tactp / (32*(NANTS/2))); - int actp = (int)(tactp % (32*(NANTS/2))); - int a = (int)(actp / 32); - int ctp = (int)(actp % 32); - //int c = (int)(ctp / 4); - int tp = (int)(ctp % 4); - //int t2 = (int)(tp / 2); - int pol = (int)(tp % 2); - int widx = (a+48*iArm)*(NCHAN_PER_PACKET/8)*2*2 + fq*2*2 + pol*2; - - // calculate weights - float theta, afac, twr, twi; - if (iArm==0) { - theta = sep*(127.-bm*1.)*PI/10800.; // radians - afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate - twr = cos(afac*antpos_e[a+48*iArm]); - twi = sin(afac*antpos_e[a+48*iArm]); - wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1])); - wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1])); - //wr[inidx] = __float2half(calibs[widx]); - //wi[inidx] = __float2half(calibs[widx+1]); - } - if (iArm==1) { - theta = sep*(127.-bm*1.)*PI/10800.; // radians - afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate - twr = cos(afac*antpos_n[a+48*iArm]); - twi = sin(afac*antpos_n[a+48*iArm]); - wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1])); - wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1])); - //wr[inidx] = __float2half(calibs[widx]); - //wi[inidx] = __float2half(calibs[widx+1]); - } - -} - -// GPU-powered function to populate weights matrix for beamformer -// file format: -// sequential pairs of eastings and northings -// then [NANTS, 48, R/I] calibs - -void calc_weights(dmem * d) { - - // allocate - float *antpos_e = (float *)malloc(sizeof(float)*NANTS); - float *antpos_n = (float *)malloc(sizeof(float)*NANTS); - float *calibs = (float *)malloc(sizeof(float)*NANTS*(NCHAN_PER_PACKET/8)*2*2); - float *d_antpos_e, *d_antpos_n, *d_calibs; - float wnorm; - cudaMalloc((void **)(&d_antpos_e), sizeof(float)*NANTS); - cudaMalloc((void **)(&d_antpos_n), sizeof(float)*NANTS); - cudaMalloc((void **)(&d_calibs), sizeof(float)*NANTS*(NCHAN_PER_PACKET/8)*2*2); - - // deal with antpos and calibs - 
//int iant; - //int found; - for (int i=0;ih_winp[2*i]; - antpos_n[i] = d->h_winp[2*i+1]; - } - for (int i=0;inflags;j++) - //if (d->flagants[j]==iant) found = 1; - - calibs[2*i] = d->h_winp[2*NANTS+2*i]; - calibs[2*i+1] = d->h_winp[2*NANTS+2*i+1]; - - wnorm = sqrt(calibs[2*i]*calibs[2*i] + calibs[2*i+1]*calibs[2*i+1]); - if (wnorm!=0.0) { - calibs[2*i] /= wnorm; - calibs[2*i+1] /= wnorm; - } - - //if (found==1) { - //calibs[2*i] = 0.; - //calibs[2*i+1] = 0.; - //} - } - - //for (int i=0;i>>(d_antpos_e,d_antpos_n,d_calibs,d->weights_r,d->weights_i,d->d_freqs); - - // free stuff - cudaFree(d_antpos_e); - cudaFree(d_antpos_n); - cudaFree(d_calibs); - free(antpos_e); - free(antpos_n); - free(calibs); - -} // MAIN - +#if 0 int main (int argc, char *argv[]) { cudaSetDevice(0); @@ -903,7 +393,7 @@ int main (int argc, char *argv[]) { return EXIT_FAILURE; } case 'd': - DEBUG=1; + //DEBUG=1; syslog (LOG_DEBUG, "Will excrete all debug messages"); break; case 'b': @@ -1012,7 +502,7 @@ int main (int argc, char *argv[]) { free(d.h_input); free(output_data); //free(o1); - deallocate(&d,bf); + deallocate_device_memory(&d,bf); exit(1); } @@ -1154,9 +644,9 @@ int main (int argc, char *argv[]) { // finish up free(output_buffer); - deallocate(&d,bf); + deallocate_device_memory(&d,bf); dsaX_dbgpu_cleanup (hdu_in, hdu_out); } - +#endif From 85555743eb1f7c3e223da7325f3c8d87b4cb774c Mon Sep 17 00:00:00 2001 From: cpviolator Date: Sat, 22 Jun 2024 01:02:25 -0700 Subject: [PATCH 18/30] Begin work on fully arch-independent correlator and beamformer, add a skeleton test suite --- CMakeLists.txt | 6 +- include/CMakeLists.txt | 15 + include/dsaX.h | 10 +- include/dsaX_beamformer_correlator.h | 9 + include/dsaX_blas_interface.h | 5 + include/dsaX_cublas_interface.h | 5 + include/dsaX_cuda_headers.h | 6 + include/dsaX_cuda_interface.h | 31 + include/dsaX_enums.h | 33 ++ include/dsaX_psrdada_headers.h | 12 + include/dsaX_utils.h | 9 + src/10_planar_complex.cu | 567 +++++++++++++++++++ 
src/11_planar_complex_array.cu | 628 +++++++++++++++++++++ src/CMakeLists.txt | 14 +- src/dsaX_beamformer_correlator.cu | 122 ++-- src/dsaX_blas_interface.cu | 11 + src/dsaX_cublas_interface.cu | 92 +++ src/dsaX_cuda_interface.cu | 467 +++++++++++++++ src/dsaX_utils.cpp | 30 + src/planar_complex.cu | 87 +++ tests/CMakeLists.txt | 5 + tests/CMakeLists.txt~ | 5 + tests/dsaX_beamformer_correlator_test.cpp | 399 +++++++++++++ tests/dsaX_beamformer_correlator_test.cpp~ | 398 +++++++++++++ {utils => tests/utils}/.gitignore | 0 tests/utils/CMakeLists.txt | 11 + tests/utils/CMakeLists.txt~ | 22 + {utils => tests/utils}/gen_packet.py | 0 {utils => tests/utils}/get_rms.py | 0 {utils => tests/utils}/get_rms_packet.py | 0 {utils => tests/utils}/packet.out | Bin {utils => tests/utils}/sockets.py | 0 {utils => tests/utils}/test.out | Bin 33 files changed, 2909 insertions(+), 90 deletions(-) create mode 100644 include/CMakeLists.txt create mode 100644 include/dsaX_beamformer_correlator.h create mode 100644 include/dsaX_blas_interface.h create mode 100644 include/dsaX_cublas_interface.h create mode 100644 include/dsaX_cuda_headers.h create mode 100644 include/dsaX_cuda_interface.h create mode 100644 include/dsaX_enums.h create mode 100644 include/dsaX_psrdada_headers.h create mode 100644 include/dsaX_utils.h create mode 100644 src/10_planar_complex.cu create mode 100644 src/11_planar_complex_array.cu create mode 100644 src/dsaX_blas_interface.cu create mode 100644 src/dsaX_cublas_interface.cu create mode 100644 src/dsaX_cuda_interface.cu create mode 100644 src/dsaX_utils.cpp create mode 100644 src/planar_complex.cu create mode 100644 tests/CMakeLists.txt create mode 100644 tests/CMakeLists.txt~ create mode 100644 tests/dsaX_beamformer_correlator_test.cpp create mode 100644 tests/dsaX_beamformer_correlator_test.cpp~ rename {utils => tests/utils}/.gitignore (100%) create mode 100644 tests/utils/CMakeLists.txt create mode 100644 tests/utils/CMakeLists.txt~ rename {utils => 
tests/utils}/gen_packet.py (100%) rename {utils => tests/utils}/get_rms.py (100%) rename {utils => tests/utils}/get_rms_packet.py (100%) rename {utils => tests/utils}/packet.out (100%) rename {utils => tests/utils}/sockets.py (100%) rename {utils => tests/utils}/test.out (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index f29ca32..441ae7f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,7 +22,7 @@ project("DSA_XENGINE" VERSION 1.0.0 LANGUAGES CXX CUDA C) set(CMAKE_C_FLAGS "-pthread") set(CMAKE_CXX_FLAGS "-pthread") -# add a directory for cmake modules +# add a directory for cmake modules list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") # DSA_XENGINE may be built to run using CUDA or CPU. Future version may be @@ -48,6 +48,7 @@ if(TARGET_TYPE_VALID LESS 0) message(SEND_ERROR "Please specify a valid DSA_XENGINE_TARGET_TYPE type! Valid target types are:" "${VALID_TARGET_TYPES}") endif() + # Git helpers #------------ find_package(Git) @@ -268,9 +269,8 @@ else() endif() -# Add src, utils, include, tests, and legacy +# Add src, include, tests, and legacy add_subdirectory(src) -add_subdirectory(utils) add_subdirectory(include) add_subdirectory(tests) option(DSA_XENGINE_BUILD_LEGACY "Build legacy code (will not install if built)" OFF) diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt new file mode 100644 index 0000000..1bbdfda --- /dev/null +++ b/include/CMakeLists.txt @@ -0,0 +1,15 @@ +enable_language(CUDA) + +# install step for header files +#------------------------------ +set(DSA_XENGINE_HEADERS + # cmake-format: sortable + dsaX_cuda_headers.h + dsaX_capture.h + dsaX_capture_manythread.h + dsaX_capture_pcap.h + dsaX_def.h + dsaX_cutlass_interface.h + ) +install(FILES ${DSA_XENGINE_HEADERS} DESTINATION include) +#------------------------------ diff --git a/include/dsaX.h b/include/dsaX.h index ddd351a..2ee856a 100644 --- a/include/dsaX.h +++ b/include/dsaX.h @@ -62,10 +62,10 @@ typedef struct dmem { } dmem; // Structure that carries 
BLAS parameters -typedef struct dsaBLASParam_s { +typedef struct dsaXBLASParam_s { size_t struct_size; /**< Size of this struct in bytes. Used to ensure that the host application and DSA see the same struct*/ - dsaBLASType blas_type; /**< Type of BLAS computation to perfrom */ + dsaXBLASType blas_type; /**< Type of BLAS computation to perfrom */ // GEMM params dsaBLASOperation trans_a; /**< operation op(A) that is non- or (conj.) transpose. */ @@ -87,10 +87,10 @@ typedef struct dsaBLASParam_s { // Common params int batch_count; /**< number of pointers contained in arrayA, arrayB and arrayC. */ - dsaBLASDataType data_type; /**< Specifies if using S(C) or D(Z) BLAS type */ - dsaBLASDataOrder data_order; /**< Specifies if using Row or Column major */ + dsaXBLASDataType data_type; /**< Specifies if using S(C) or D(Z) BLAS type */ + dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */ -} dsaBLASParam; +} dsaXBLASParam; // Initialise device memory diff --git a/include/dsaX_beamformer_correlator.h b/include/dsaX_beamformer_correlator.h new file mode 100644 index 0000000..7001f4a --- /dev/null +++ b/include/dsaX_beamformer_correlator.h @@ -0,0 +1,9 @@ +#pragma once + +// correlator function +// workflow: copy to device, reorder, stridedBatchedGemm, reorder +void dcorrelator(dmem *d); + +// beamformer function +void dbeamformer(dmem * d); + diff --git a/include/dsaX_blas_interface.h b/include/dsaX_blas_interface.h new file mode 100644 index 0000000..3cf5c4a --- /dev/null +++ b/include/dsaX_blas_interface.h @@ -0,0 +1,5 @@ +#pragma once + +#include "dsaX.h" + +void dsaXHgemmStridedBatched(void *real_in, void *imag_in, void *real_out, void *imag_out, dsaXBLASParam param); diff --git a/include/dsaX_cublas_interface.h b/include/dsaX_cublas_interface.h new file mode 100644 index 0000000..9265f37 --- /dev/null +++ b/include/dsaX_cublas_interface.h @@ -0,0 +1,5 @@ +#pragma once +#include "dsaX.h" +#include "dsaX_cuda_headers.h" + +void 
dsaXHgemmStridedBatchedCuda(void *real_in, void *imag_in, void *real_out, void *imag_out, dsaXBLASParam param); diff --git a/include/dsaX_cuda_headers.h b/include/dsaX_cuda_headers.h new file mode 100644 index 0000000..acc838d --- /dev/null +++ b/include/dsaX_cuda_headers.h @@ -0,0 +1,6 @@ +#pragma once + +#include +#include "cuda_fp16.h" +#include +#include diff --git a/include/dsaX_cuda_interface.h b/include/dsaX_cuda_interface.h new file mode 100644 index 0000000..99b1db2 --- /dev/null +++ b/include/dsaX_cuda_interface.h @@ -0,0 +1,31 @@ +#pragma once + +#include "dsaX.h" +#include "dsaX_def.h" + +void initialize_device_memory(dmem * d, int bf); + +void deallocate_device_memory(dmem * d, int bf); + +void reorder_output_device(dmem * d); + +__global__ void corr_input_copy(char *input, half *inr, half *ini); + +template __global__ void transpose_matrix(in_prec * idata, out_prec * odata); + +void reorder_input_device(char *input, char * tx, half *inr, half *ini); + +__global__ void corr_output_copy(half *outr, half *outi, float *output, int *indices_lookup); + +__global__ void transpose_input_bf(double * idata, double * odata); + +__global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs); + +void calc_weights(dmem * d); + +__global__ void fluff_input_bf(char * input, half * dr, half * di); + +__global__ void transpose_scale_bf(half * ir, half * ii, unsigned char * odata); + +__global__ void sum_beam(unsigned char * input, float * output); + diff --git a/include/dsaX_enums.h b/include/dsaX_enums.h new file mode 100644 index 0000000..b188019 --- /dev/null +++ b/include/dsaX_enums.h @@ -0,0 +1,33 @@ +#pragma once + +#define DSA_INVALID_ENUM (-0x7fffffff - 1) + +typedef enum dsaError_t { DSA_SUCCESS = 0, DSA_ERROR = 1, DSA_ERROR_UNINITIALIZED = 2 } dsaError_t; + +typedef enum dsaBLASOperation_s { + DSA_BLAS_OP_N = 0, // No transpose + DSA_BLAS_OP_T = 1, // Transpose only + DSA_BLAS_OP_C = 2, // 
Conjugate transpose + DSA_BLAS_OP_INVALID = DSA_INVALID_ENUM +} dsaBLASOperation; + +typedef enum dsaXBLASType_s { + DSA_BLAS_GEMM = 0, + DSA_BLAS_INVALID = DSA_INVALID_ENUM +} dsaXBLASType; + +typedef enum dsaXBLASDataType_s { + DSA_BLAS_DATATYPE_H = 0, // Half + DSA_BLAS_DATATYPE_S = 1, // Single + DSA_BLAS_DATATYPE_D = 2, // Double + DSA_BLAS_DATATYPE_HC = 3, // Complex(half) + DSA_BLAS_DATATYPE_C = 4, // Complex(single) + DSA_BLAS_DATATYPE_Z = 5, // Complex(double) + DSA_BLAS_DATATYPE_INVALID = DSA_INVALID_ENUM +} dsaXBLASDataType; + +typedef enum dsaXBLASDataOrder_s { + DSA_BLAS_DATAORDER_ROW = 0, + DSA_BLAS_DATAORDER_COL = 1, + DSA_BLAS_DATAORDER_INVALID = DSA_INVALID_ENUM +} dsaXBLASDataOrder; diff --git a/include/dsaX_psrdada_headers.h b/include/dsaX_psrdada_headers.h new file mode 100644 index 0000000..325dcb8 --- /dev/null +++ b/include/dsaX_psrdada_headers.h @@ -0,0 +1,12 @@ +#pragma once + +#include "dada_client.h" +#include "dada_def.h" +#include "dada_hdu.h" +#include "multilog.h" +#include "ipcio.h" +#include "ipcbuf.h" +#include "dada_affinity.h" +#include "ascii_header.h" +#include "dsaX_def.h" +#include "dsaX_enums.h" diff --git a/include/dsaX_utils.h b/include/dsaX_utils.h new file mode 100644 index 0000000..3976db7 --- /dev/null +++ b/include/dsaX_utils.h @@ -0,0 +1,9 @@ +#pragma once + +#include "dsaX.h" + +void dsaXmemset(void *array, int ch, size_t n); + +void dsaXmemcpyHostToDevice(void *array_device, void *array_host, size_t n); +void dsaXmemcpyDeviceToHost(void *array_host, void *array_device, size_t n); + diff --git a/src/10_planar_complex.cu b/src/10_planar_complex.cu new file mode 100644 index 0000000..9e0915d --- /dev/null +++ b/src/10_planar_complex.cu @@ -0,0 +1,567 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Planar Complex GEMM + + This example demonstrates the CUTLASS Library's exposure of planar complex GEMM kernels supporting + the batched strided mode. + + These kernels represent complex matrices by storing the real and imaginary parts of the matrix in + disjoint regions in memory. 
These real-valued matrices are stored using existing cuBLAS layouts + as either column-major or row-major layouts with a single leading dimension indicating the stride + between columns or rows. + + The CUTLASS Library collects multiple template instantiations in a data structure and offers + a BLAS-like dispatch API to invoke the appropriate kernel on the Volta or Turing architectures. + + CUTLASS decouples matrix layout from complex transformation, so four possible transformations + are possible on the A and B operands: + + n: column-major + c: column-major complex conjugate + t: row-major + h: row-major complex conjugate + + The CUTLASS Library contains many kernel instances specialized for architecture, data type, tile + size, and alignment. This can result in long compile times. + + To build strictly the planar complex kernels needed for general application, execute the following + CMake command in an empty build directory. + + $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \ + -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex + + This builds all planar complex GEMM variants for Volta and Turing architectures. + + To build strictly the kernels needed for this example, an even narrower filter string may be + specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for + the 'CN' layout configuration (conjugate A operand with both A and B as column-major). + + $ cmake .. 
-DCUTLASS_NVCC_ARCHS="70;75;80" \ + -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_f16*cn + + $ make 10_planar_complex + + $ ./examples/10_planar_complex/10_planar_complex --m=2048 --n=1024 --k=512 --batch=10 +*/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor_planar_complex.h" + +#include "cutlass/util/reference/device/tensor_fill.h" + +#include "cutlass/util/reference/device/gemm_planar_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" + +#include "cutlass/library/handle.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Result structure +struct Result { + + double runtime_ms; + double gflops; + cutlass::Status status; + cudaError_t error; + bool passed; + + // + // Methods + // + + Result( + double runtime_ms = 0, + double gflops = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = cudaSuccess + ): + runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + + cutlass::gemm::GemmCoord problem_size; + int batch_count; + cutlass::complex alpha; + cutlass::complex beta; + + bool reference_check; + int iterations; + + Options(): + help(false), + problem_size({1024, 1024, 1024}), + batch_count(1), + reference_check(true), + iterations(20), + alpha(1), + beta() { } + + bool valid() { + return true; + } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + 
cmd.get_cmd_line_argument("m", problem_size.m()); + cmd.get_cmd_line_argument("n", problem_size.n()); + cmd.get_cmd_line_argument("k", problem_size.k()); + cmd.get_cmd_line_argument("batch", batch_count); + + cmd.get_cmd_line_argument("alpha", alpha.real()); + cmd.get_cmd_line_argument("alpha_i", alpha.imag()); + cmd.get_cmd_line_argument("beta", beta.real()); + cmd.get_cmd_line_argument("beta_i", beta.imag()); + + cmd.get_cmd_line_argument("iterations", iterations); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "10_planar_complex example\n\n" + << " This example uses the CUTLASS Library to execute Planar Complex GEMM computations.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --m= GEMM M dimension\n" + << " --n= GEMM N dimension\n" + << " --k= GEMM K dimension\n" + << " --batch= Number of GEMM operations executed in one batch\n" + << " --alpha= Epilogue scalar alpha (real part)\n" + << " --alpha_i= Epilogue scalar alpha (imaginary part)\n" + << " --beta= Epilogue scalar beta (real part)\n\n" + << " --beta_i= Epilogue scalar beta (imaginary part)\n\n" + << " --iterations= Number of profiling iterations to perform.\n\n"; + + out << "\n\nExamples:\n\n" + << "$ ./examples/10_planar_complex/10_planar_complex --batch=7 --m=1024 --n=512 --k=1024 \\\n" + << " --alpha=2 --alpha_i=-2 --beta=0.707 --beta_i=-.707\n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of real-valued multiply-adds + int64_t fmas = problem_size.product() * batch_count * 4; + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Performance test environment for planar complex +class TestbedPlanarComplex { +public: + + using ElementA = cutlass::half_t; + using LayoutA = 
cutlass::layout::ColumnMajor; + using ElementB = cutlass::half_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = cutlass::half_t; + using LayoutC = cutlass::layout::ColumnMajor; + using ElementCompute = float; + using ElementAccumulator = float; + + // + // Data members + // + + cutlass::library::Handle handle; + + cutlass::gemm::GemmCoord problem_size; + int batch_count; + cutlass::DeviceAllocation tensor_A; + cutlass::DeviceAllocation tensor_B; + cutlass::DeviceAllocation tensor_C; + cutlass::DeviceAllocation tensor_D; + cutlass::DeviceAllocation tensor_D_ref; + + // + // Methods + // + + TestbedPlanarComplex( + Options const &options + ): + problem_size(options.problem_size), batch_count(options.batch_count) { + + // Allocate device memory for batched strided GEMM + tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2); + tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2); + tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + } + + void initialize() { + + uint64_t seed = 1073; + + // Use small integers to simplify correctness checking + int scope_max = 6; + int scope_min = -6; + + cutlass::reference::device::BlockFillRandomUniform( + tensor_A.get(), tensor_A.size(), seed, ElementA(scope_max), ElementA(scope_min), 0); + + cutlass::reference::device::BlockFillRandomUniform( + tensor_B.get(), tensor_B.size(), seed * 2019, ElementB(scope_max), ElementB(scope_min), 0); + + cutlass::reference::device::BlockFillRandomUniform( + tensor_C.get(), tensor_C.size(), seed * 2020, ElementC(scope_max), ElementC(scope_min), 0); + } + + Result profile(Options const &options) { + + Result result; + + initialize(); + + ElementA *ptr_A = tensor_A.get(); + ElementB *ptr_B = tensor_B.get(); + ElementC *ptr_C = 
tensor_C.get(); + ElementC *ptr_D = tensor_D.get(); + + int64_t batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2; + int64_t batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2; + int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2; + int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2; + + typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0); + typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0); + typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + + int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k(); + int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n(); + int64_t imag_stride_C = int64_t(problem_size.m()) * problem_size.n(); + int64_t imag_stride_D = int64_t(problem_size.m()) * problem_size.n(); + + // + // Construct events + // + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // Record an event at the start of a series of GEMMs + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // + // Run profiling loop + // + + for (int iter = 0; iter < options.iterations; ++iter) { + + // + // Execute the planar complex GEMM kernel via the CUTLASS Library's + // dispatch routines. + // + // Note, for planar complex GEMM kernels, all numeric type arguments + // specify the data type of the base real types. 
These are understood to + // apply to planar complex representations of matrices in memory and to complex + // structures for scalars. + // + // See tools/library/include/cutlass/library/handle.h for more details. + // + + result.status = handle.gemm_planar_complex( + problem_size.m(), // GEMM M dimension + problem_size.n(), // GEMM N dimension + problem_size.k(), // GEMM K dimension + + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued accumulation + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued alpha/beta scalars + + &options.alpha, // Pointer to alpha scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued A matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of A matrix + cutlass::library::ComplexTransform::kConjugate, // Complex transformation on A matrix operand + ptr_A, // Pointer to real part of A matrix + ptr_A + imag_stride_A, // Pointer to imaginary part of A matrix + lda, // Leading dimension of real part of A matrix + lda, // Leading dimension of imaginary part of A matrix + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued B matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of B matrix + cutlass::library::ComplexTransform::kNone, // Complex transformation on B matrix operand + ptr_B, // Pointer to real part of B matrix + ptr_B + imag_stride_B, // Pointer to imaginary part of B matrix + ldb, // Leading dimension of real part of B matrix + ldb, // Leading dimension of imaginary part of B matrix + + &options.beta, // Pointer to beta scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex valued C and D matrices + + ptr_C, // Pointer to real part of C matrix + ptr_C + imag_stride_C, // Pointer to imaginary part of C matrix + ldc, // Leading dimension of real part of C matrix + ldc, // Leading dimension of imaginary part of C matrix + + ptr_D, // Pointer to real 
part of D matrix + ptr_D + imag_stride_D, // Pointer to imaginary part of D matrix + ldd, // Leading dimension of real part of D matrix + ldd, // Leading dimension of imaginary part of D matrix + + batch_count, // Number of batched elements + + batch_stride_A, // Stride between batches of real parts of A matrix + batch_stride_A, // Stride between batches of imaginary parts of A matrix + + batch_stride_B, // Stride between batches of real parts of B matrix + batch_stride_B, // Stride between batches of imaginary parts of B matrix + + batch_stride_C, // Stride between batches of real parts of C matrix + batch_stride_C, // Stride between batches of imaginary parts of C matrix + + batch_stride_D, // Stride between batches of real parts of D matrix + batch_stride_D // Stride between batches of imaginary parts of D matrix + ); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "CUTLASS internal error - configuration not supported" << std::endl; + return result; + } + } + + // + // Stop profiling loop + // + + // Record an event when the GEMMs are complete + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. 
+ result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + if (handle.get_last_operation()) { + std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl; + } + + // + // Compute reference in device code + // + + if (options.reference_check) { + + result.passed = true; + + for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) { + cutlass::reference::device::GemmPlanarComplex< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator + >( + problem_size, + options.alpha, + {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A}, + cutlass::ComplexTransform::kConjugate, + {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B}, + cutlass::ComplexTransform::kNone, + options.beta, + {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C}, + {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D} + ); + + ElementC epsilon = 0.1_hf; + ElementC nonzero_floor = 0.1_hf; + + result.passed = cutlass::reference::device::BlockCompareRelativelyEqual( + tensor_D.get() + idx * batch_stride_D, + tensor_D_ref.get() + idx * batch_stride_D, + batch_stride_D, + epsilon, + nonzero_floor + ); + } + + if (result.passed) { + std::cout << "Reference check passed." << std::endl; + } + else { + std::cerr << "Error - reference check failed." << std::endl; + } + } + + std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " GFLOPs: " << result.gflops << std::endl; + + return result; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // + // This example uses mma.sync to directly access Tensor Cores to achieve peak performance. 
+ // + // Volta Tensor Core operations are first available in CUDA 10.1 Toolkit. + // + // Turing Tensor Core operations are first available in CUDA 10.2 Toolkit. + // + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (props.major < 7) { + std::cerr << "Volta Tensor Core operations must be run on a machine with compute capability at least 70." + << std::endl; + + // Returning zero so this test passes on older architectures even though its actions are no-op. + return 0; + } + else if (props.major == 7 && props.minor <= 2) { + // + // If running on the Volta architecture, at least CUDA 10.1 Toolkit is required to run this example. + // + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { + std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl; + + // Returning zero so this test passes on older Toolkits even though its actions are no-op. + return 0; + } + } + else if (props.major == 7 && props.minor >= 5) { + // + // If running on the Turing architecture, at least CUDA 10.2 Toolkit is required to run this example. + // + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { + std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + + // Returning zero so this test passes on older Toolkits even though its actions are no-op. + return 0; + } + } + else { + // NVIDIA Ampere Architecture GPUs (SM80 and later) are fully supported on CUDA 11 Toolkit and beyond. 
+ // + // fall through + } + + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // Execute one problem size + if (!options.valid()) { + std::cerr << "Invalid problem." << std::endl; + return -1; + } + + TestbedPlanarComplex testbed(options); + + Result result = testbed.profile(options); + + return result.passed ? 0 : -1; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/src/11_planar_complex_array.cu b/src/11_planar_complex_array.cu new file mode 100644 index 0000000..ba94b60 --- /dev/null +++ b/src/11_planar_complex_array.cu @@ -0,0 +1,628 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Planar Complex Array Example + + This example demonstrates the CUTLASS Library's exposure of planar complex GEMM kernels which + execute a batch of matrix products, loading problem sizes and matrix base pointers from arrays + in global memory. + + These kernels represent complex matrices by storing the real and imaginary parts of the matrix in + disjoint regions in memory. These real-valued matrices are stored using existing cuBLAS layouts + as either column-major or row-major layouts with a single leading dimension indicating the stride + between columns or rows. + + The CUTLASS Library collects multiple template instantiations in a data structure and offers + a BLAS-like dispatch API to invoke the appropriate kernel on the Volta or Turing architectures. + + CUTLASS decouples matrix layout from complex transformation, so four transformations + are possible on the A and B operands: + + n: column-major + c: column-major complex conjugate + t: row-major + h: row-major complex conjugate + + To build strictly the planar complex kernels needed for general application, execute the following + CMake command in an empty build directory. + + $ cmake ..
-DCUTLASS_NVCC_ARCHS="70;75;80" \ + -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex + + This builds all planar complex GEMM variants for Volta and Turing architectures. + + To build strictly the kernels needed for this example, an even narrower filter string may be + specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for + the 'CN' layout configuration (conjugate A operand with both A and B as column-major). + + $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \ + -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_array_f16*cn + + $ make 11_planar_complex_array + + $ ./examples/11_planar_complex_array/11_planar_complex_array --m=2048 --n=1024 --k=512 --batch=10 +*/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor_planar_complex.h" + +#include "cutlass/util/reference/device/tensor_fill.h" + +#include "cutlass/util/reference/device/gemm_planar_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" + +#include "cutlass/library/handle.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Result structure +struct Result { + + double runtime_ms; + double gflops; + cutlass::Status status; + cudaError_t error; + bool passed; + + // + // Methods + // + + Result( + double runtime_ms = 0, + double gflops = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = cudaSuccess + ): + runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + + cutlass::gemm::GemmCoord problem_size; + int 
batch_count; + cutlass::complex alpha; + cutlass::complex beta; + + bool reference_check; + int iterations; + + Options(): + help(false), + problem_size({1024, 1024, 1024}), + batch_count(1), + reference_check(true), + iterations(20), + alpha(1), + beta() { } + + bool valid() { + return true; + } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + cmd.get_cmd_line_argument("m", problem_size.m()); + cmd.get_cmd_line_argument("n", problem_size.n()); + cmd.get_cmd_line_argument("k", problem_size.k()); + cmd.get_cmd_line_argument("batch", batch_count); + + cmd.get_cmd_line_argument("alpha", alpha.real()); + cmd.get_cmd_line_argument("alpha_i", alpha.imag()); + cmd.get_cmd_line_argument("beta", beta.real()); + cmd.get_cmd_line_argument("beta_i", beta.imag()); + + cmd.get_cmd_line_argument("iterations", iterations); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "11_planar_complex_array example\n\n" + << " This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --m= GEMM M dimension\n" + << " --n= GEMM N dimension\n" + << " --k= GEMM K dimension\n" + << " --batch= Number of GEMM operations executed in one batch\n" + << " --alpha= Epilogue scalar alpha (real part)\n" + << " --alpha_i= Epilogue scalar alpha (imaginary part)\n" + << " --beta= Epilogue scalar beta (real part)\n\n" + << " --beta_i= Epilogue scalar beta (imaginary part)\n\n" + << " --iterations= Number of profiling iterations to perform.\n"; + + out << "\n\nExamples:\n\n" + << "$ ./examples/11_planar_complex_array/11_planar_complex_array\n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of real-valued multiply-adds + int64_t 
fmas = problem_size.product() * batch_count * 4; + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Performance test environment for planar complex +class TestbedPlanarComplex { +public: + + // Half-precision input and output + using Element = cutlass::half_t; + + // Configurations for layouts and internal computation + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using ElementCompute = float; + using ElementAccumulator = float; + + // + // Data members + // + + cutlass::library::Handle handle; + + cutlass::gemm::GemmCoord problem_size; + int batch_count; + cutlass::DeviceAllocation tensor_A; + cutlass::DeviceAllocation tensor_B; + cutlass::DeviceAllocation tensor_C; + cutlass::DeviceAllocation tensor_D; + cutlass::DeviceAllocation tensor_D_ref; + + cutlass::DeviceAllocation ptr_A_real; + cutlass::DeviceAllocation ptr_A_imag; + cutlass::DeviceAllocation ptr_B_real; + cutlass::DeviceAllocation ptr_B_imag; + cutlass::DeviceAllocation ptr_C_real; + cutlass::DeviceAllocation ptr_C_imag; + cutlass::DeviceAllocation ptr_D_real; + cutlass::DeviceAllocation ptr_D_imag; + + // + // Methods + // + + TestbedPlanarComplex( + Options const &options + ): + problem_size(options.problem_size), batch_count(options.batch_count) { + + // Allocate device memory for batched planar complex GEMM + tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2); + tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2); + tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2); + + ptr_A_real.reset(batch_count); 
+ ptr_A_imag.reset(batch_count); + ptr_B_real.reset(batch_count); + ptr_B_imag.reset(batch_count); + ptr_C_real.reset(batch_count); + ptr_C_imag.reset(batch_count); + ptr_D_real.reset(batch_count); + ptr_D_imag.reset(batch_count); + + } + + void initialize() { + + uint64_t seed = 1073; + + // Use small integers to simplify correctness checking + int scope_max = 6; + int scope_min = -6; + + cutlass::reference::device::BlockFillRandomUniform( + tensor_A.get(), tensor_A.size(), seed, Element(scope_max), Element(scope_min), 0); + + cutlass::reference::device::BlockFillRandomUniform( + tensor_B.get(), tensor_B.size(), seed * 2019, Element(scope_max), Element(scope_min), 0); + + cutlass::reference::device::BlockFillRandomUniform( + tensor_C.get(), tensor_C.size(), seed * 2020, Element(scope_max), Element(scope_min), 0); + } + + Result profile(Options const &options) { + + Result result; + + initialize(); + + Element *ptr_A = tensor_A.get(); + Element *ptr_B = tensor_B.get(); + Element *ptr_C = tensor_C.get(); + Element *ptr_D = tensor_D.get(); + + int64_t batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2; + int64_t batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2; + int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2; + int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2; + + typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0); + typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0); + typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); + + + int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k(); + int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n(); + int64_t imag_stride_C = int64_t(problem_size.m()) * 
problem_size.n(); + int64_t imag_stride_D = int64_t(problem_size.m()) * problem_size.n(); + + // + // Configure pointers in global memory + // + + struct { + Element *base; + void **ptr_real; + void **ptr_imag; + int64_t batch_stride; + int64_t imag_stride; + } tensors[] = { + { tensor_A.get(), ptr_A_real.get(), ptr_A_imag.get(), batch_stride_A, imag_stride_A}, + { tensor_B.get(), ptr_B_real.get(), ptr_B_imag.get(), batch_stride_B, imag_stride_B}, + { tensor_C.get(), ptr_C_real.get(), ptr_C_imag.get(), batch_stride_C, imag_stride_C}, + { tensor_D.get(), ptr_D_real.get(), ptr_D_imag.get(), batch_stride_D, imag_stride_D} + }; + + for (auto const &tensor : tensors) { + for (int idx = 0; idx < batch_count; ++idx) { + + void *ptr_real = tensor.base + idx * tensor.batch_stride; + void *ptr_imag = tensor.base + idx * tensor.batch_stride + tensor.imag_stride; + + cudaError_t error = cudaMemcpy( + tensor.ptr_real + idx, + &ptr_real, + sizeof(void *), + cudaMemcpyHostToDevice); + + if (error != cudaSuccess) { + throw std::runtime_error("Failed to copy pointer to device memory"); + } + + error = cudaMemcpy( + tensor.ptr_imag + idx, + &ptr_imag, + sizeof(void *), + cudaMemcpyHostToDevice); + + if (error != cudaSuccess) { + throw std::runtime_error("Failed to copy pointer to device memory"); + } + } + } + + // + // Construct events + // + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // Record an event at the start of a series of GEMM operations + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // + // Run profiling loop + // + + for (int iter = 0; iter < options.iterations; ++iter) { + + // + // Execute the planar complex 
array GEMM kernel via the CUTLASS Library's + // dispatch routines. + // + // Note, for planar complex array GEMM kernels, all numeric type arguments + // specify the data type of the base real types. These are understood to + // apply to planar complex representations of matrices in memory and to complex + // structures for scalars. + // + // See tools/library/include/cutlass/library/handle.h for more details. + // + + result.status = handle.gemm_planar_complex_array( + + problem_size.m(), // expected GEMM M dimension + problem_size.n(), // expected GEMM N dimension + problem_size.k(), // expected GEMM K dimension + batch_count, // Number of batched elements + + nullptr, + nullptr, + nullptr, + + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued accumulation + cutlass::library::NumericTypeID::kF32, // Base data type of complex-valued alpha/beta scalars + + &options.alpha, // Pointer to alpha scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued A matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of A matrix + cutlass::library::ComplexTransform::kConjugate, // Complex transformation on A matrix operand + + ptr_A_real.get(), // Pointer to array of pointers to real part of A matrix + ptr_A_imag.get(), // Pointer to array of pointers to imaginary part of A matrix + + lda, // Leading dimension of real part of A matrix + lda, // Leading dimension of imaginary part of A matrix + + cutlass::library::NumericTypeID::kF16, // Base data type of complex-valued B matrix + cutlass::library::LayoutTypeID::kColumnMajor, // Layout of B matrix + cutlass::library::ComplexTransform::kNone, // Complex transformation on B matrix operand + + ptr_B_real.get(), // Pointer to array of pointers to real part of B matrix + ptr_B_imag.get(), // Pointer to array of pointers to imaginary part of B matrix + + ldb, // Leading dimension of real part of B matrix + ldb, // Leading dimension of imaginary part of B 
matrix + + &options.beta, // Pointer to beta scalar, of type complex + + cutlass::library::NumericTypeID::kF16, // Base data type of complex valued C and D matrices + + ptr_C_real.get(), // Pointer to array of pointers to real part of C matrix + ptr_C_imag.get(), // Pointer to array of pointers to imaginary part of C matrix + + ldc, // Leading dimension of real part of C matrix + ldc, // Leading dimension of imaginary part of C matrix + + ptr_D_real.get(), // Pointer to array of pointers to real part of D matrix + ptr_D_imag.get(), // Pointer to array of pointers to imaginary part of D matrix + + ldd, // Leading dimension of real part of D matrix + ldd // Leading dimension of imaginary part of D matrix + ); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "CUTLASS internal error - configuration not supported" << std::endl; + return result; + } + } + + // + // Stop profiling loop + // + + // Record an event when the GEMM operations have been launched. + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. 
+ result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + if (handle.get_last_operation()) { + std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl; + } + + // + // Compute reference in device code + // + + if (options.reference_check) { + + result.passed = true; + + for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) { + cutlass::reference::device::GemmPlanarComplex< + Element, LayoutA, + Element, LayoutB, + Element, LayoutC, + ElementAccumulator + >( + problem_size, + options.alpha, + {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A}, + cutlass::ComplexTransform::kConjugate, + {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B}, + cutlass::ComplexTransform::kNone, + options.beta, + {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C}, + {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D} + ); + + Element epsilon = 0.1_hf; + Element nonzero_floor = 0.1_hf; + + result.passed = cutlass::reference::device::BlockCompareRelativelyEqual( + tensor_D.get() + idx * batch_stride_D, + tensor_D_ref.get() + idx * batch_stride_D, + batch_stride_D, + epsilon, + nonzero_floor + ); + } + + if (result.passed) { + std::cout << "Reference check passed." << std::endl; + } + else { + std::cerr << "Error - reference check failed." << std::endl; + } + } + + std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " GFLOPs: " << result.gflops << std::endl; + + return result; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // + // This example uses mma.sync to directly access Tensor Cores to achieve peak performance. 
+ // + // Volta Tensor Core operations are first available in CUDA 10.1 Toolkit. + // + // Turing Tensor Core operations are first available in CUDA 10.2 Toolkit. + // + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (props.major < 7) { + std::cerr << "Tensor Core operations must be run on a machine with compute capability at least 70." + << std::endl; + + // Returning zero so this passes on older architectures. Its actions are no-op. + return 0; + } + else if (props.major == 7 && props.minor <= 2) { + // + // If running on the Volta architecture, at least CUDA 10.1 Toolkit is required to run this example. + // + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { + std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl; + + // Returning zero so this passes on older Toolkits. Its actions are no-op. + return 0; + } + } + else if (props.major == 7 && props.minor >= 5) { + // + // If running on the Turing architecture, at least CUDA 10.2 Toolkit is required to run this example. + // + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { + std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + + // Returning zero so this passes on older Toolkits. Its actions are no-op. + return 0; + } + } + else { + // NVIDIA Ampere Architecture GPUs (SM80 and later) are fully supported on CUDA 11 Toolkit and beyond. 
+ // + // fall through + } + + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // Execute one problem size + if (!options.valid()) { + std::cerr << "Invalid problem." << std::endl; + return -1; + } + + TestbedPlanarComplex testbed(options); + + Result result = testbed.profile(options); + + return result.passed ? 0 : -1; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a55c8fe..e23827b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -4,7 +4,19 @@ include_directories(../include) # DSA Fast Time Domain library #----------------------------- -add_library(dsa SHARED dsaX_cuda_interface.cu dsaX_blas_interface.cu dsaX_beamformer_correlator.cu) +add_library(dsa SHARED + dsaX_cuda_interface.cu + dsaX_blas_interface.cu + dsaX_beamformer_correlator.cu + dsaX_utils.cpp + ) + +if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA) + add_compile_definitions(DSA_XENGINE_TARGET_CUDA) +endif() +if(DSA_XENGINE_TARGET_TYPE STREQUAL CPU) + add_compile_definitions(DSA_XENGINE_TARGET_CPU) +endif() if(CUDAToolkit_FOUND) target_link_libraries(dsa CUDA::cudart) diff --git a/src/dsaX_beamformer_correlator.cu b/src/dsaX_beamformer_correlator.cu index c91c1b7..ddbc73c 100644 --- a/src/dsaX_beamformer_correlator.cu +++ b/src/dsaX_beamformer_correlator.cu @@ -9,13 +9,11 @@ Workflow is similar for BF and corr applications #include "dsaX_def.h" #include "dsaX.h" #include "dsaX_blas_interface.h" - -//#include -//#include "cuda_fp16.h" -//#include -//#include - +#include "dsaX_utils.h" +#include "dsaX_blas_interface.h" +#ifdef DSA_XENGINE_TARGET_CUDA #include "dsaX_cuda_interface.h" +#endif int DEBUG = 1; @@ -48,81 +46,43 @@ void usage() { // workflow: copy to device, reorder, stridedBatchedGemm, reorder void dcorrelator(dmem *d) { + // copy to device + 
dsaXmemcpyHostToDevice(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); + // zero out output arrays - cudaMemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half)); - cudaMemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half)); - cudaMemset(d->d_output, 0, NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float)); + dsaXmemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half)); + dsaXmemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half)); + dsaXmemset(d->d_output, 0, NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float)); - // copy to device - cudaMemcpy(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2, cudaMemcpyHostToDevice); - // reorder input reorder_input_device(d->d_input, d->d_tx, d->d_r, d->d_i); - // ABSTRACT HERE START - // not sure if essential - cudaDeviceSynchronize(); - - // set up for gemm - cublasHandle_t cublasH = NULL; - cudaStream_t stream = NULL; - cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); - cublasCreate(&cublasH); - cublasSetStream(cublasH, stream); - + dsaXBLASParam blas_param; // gemm settings // input: [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS] // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS] - cublasOperation_t transa = CUBLAS_OP_N; - cublasOperation_t transb = CUBLAS_OP_T; - const int m = NANTS; - const int n = NANTS; - const int k = NPACKETS_PER_BLOCK/halfFac; - const half alpha = 1.; - const half malpha = -1.; - const int lda = m; - const int ldb = n; - const half beta0 = 0.; - const half beta1 = 1.; - const int ldc = m; - const long long int strideA = NPACKETS_PER_BLOCK*NANTS/halfFac; - const long long int strideB = NPACKETS_PER_BLOCK*NANTS/halfFac; - const long long int strideC = NANTS*NANTS; - const int batchCount = NCHAN_PER_PACKET*2*2*halfFac; - - // run strided batched gemm - // ac - cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, - &alpha,d->d_r,lda,strideA, - 
d->d_r,ldb,strideB,&beta0, - d->d_outr,ldc,strideC, - batchCount); - // bd - cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, - &alpha,d->d_i,lda,strideA, - d->d_i,ldb,strideB,&beta1, - d->d_outr,ldc,strideC, - batchCount); - // -bc - cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, - &malpha,d->d_i,lda,strideA, - d->d_r,ldb,strideB,&beta0, - d->d_outi,ldc,strideC, - batchCount); - // ad - cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, - &alpha,d->d_r,lda,strideA, - d->d_i,ldb,strideB,&beta1, - d->d_outi,ldc,strideC, - batchCount); - - // shown to be essential - cudaDeviceSynchronize(); - - // destroy stream - cudaStreamDestroy(stream); - cublasDestroy(cublasH); + blas_param.trans_a = DSA_BLAS_OP_N; + blas_param.trans_b = DSA_BLAS_OP_T; + blas_param.m = NANTS; + blas_param.n = NANTS; + blas_param.k = NPACKETS_PER_BLOCK/halfFac; + blas_param.alpha = 1.0; + blas_param.lda = blas_param.m; + blas_param.ldb = blas_param.n; + blas_param.beta = 0.; + blas_param.ldc = blas_param.m; + blas_param.a_stride = NPACKETS_PER_BLOCK*NANTS/halfFac; + blas_param.b_stride = NPACKETS_PER_BLOCK*NANTS/halfFac; + blas_param.c_stride = NANTS*NANTS; + blas_param.batch_count = NCHAN_PER_PACKET*2*2*halfFac; + + // Perform GEMM accoring to back end configuration + dsaXHgemmStridedBatched(d->d_r, d->d_i, d->d_outr, d->d_outi, blas_param); + + /* + // ABSTRACT HERE START // ABSTRACT HERE END + */ // reorder output data reorder_output_device(d); @@ -174,7 +134,7 @@ void dbeamformer(dmem * d) { // do big memcpy begin = clock(); - cudaMemcpy(d->d_big_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4,cudaMemcpyHostToDevice); + dsaXmemcpyHostToDevice(d->d_big_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4); end = clock(); d->cp += (float)(end - begin) / CLOCKS_PER_SEC; @@ -182,8 +142,8 @@ void dbeamformer(dmem * d) { for (int iArm=0;iArm<2;iArm++) { // zero out output arrays - 
cudaMemset(d->d_bigbeam_r,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half)); - cudaMemset(d->d_bigbeam_i,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half)); + dsaXmemset(d->d_bigbeam_r,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half)); + dsaXmemset(d->d_bigbeam_i,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half)); cudaDeviceSynchronize(); // copy data to device @@ -443,8 +403,8 @@ int main (int argc, char *argv[]) { for (iii=0;iii<(NCHAN_PER_PACKET/8);iii++) d.h_freqs[iii] = 1e6*(sfreq-iii*250./1024.); - cudaMemcpy(d.d_freqs,d.h_freqs,sizeof(float)*(NCHAN_PER_PACKET/8),cudaMemcpyHostToDevice); - + dsaXmemcpyHostToDevice(d.d_freqs, d.h_freqs, sizeof(float)*(NCHAN_PER_PACKET/8)); + // calculate weights calc_weights(&d); @@ -471,7 +431,7 @@ int main (int argc, char *argv[]) { if (DEBUG) syslog(LOG_INFO,"copy to host"); output_size = NBASE*NCHAN_PER_PACKET*2*2*4; output_data = (char *)malloc(output_size); - cudaMemcpy(output_data,d.d_output,output_size,cudaMemcpyDeviceToHost); + dsaXmemcpyDeviceToHost(output_data, d.d_output, output_size); fout = fopen("output.dat","wb"); fwrite((float *)output_data,sizeof(float),NBASE*NCHAN_PER_PACKET*2*2,fout); @@ -483,7 +443,7 @@ int main (int argc, char *argv[]) { if (DEBUG) syslog(LOG_INFO,"copy to host"); output_size = (NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS; output_data = (char *)malloc(output_size); - cudaMemcpy(output_data,d.d_bigpower,output_size,cudaMemcpyDeviceToHost); + dsaXmemcpyDeviceToHost(output_data, d.d_bigpower, output_size); /*output_size = 2*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8); o1 = (char *)malloc(output_size); @@ -606,13 +566,13 @@ int main (int argc, char *argv[]) { if (DEBUG) syslog(LOG_INFO,"run correlator"); dcorrelator(&d); if (DEBUG) syslog(LOG_INFO,"copy to host"); - cudaMemcpy(output_buffer,d.d_output,block_out,cudaMemcpyDeviceToHost); + dsaXmemcpyDeviceToHost(output_buffer, d.d_output, block_out); 
} else { if (DEBUG) syslog(LOG_INFO,"run beamformer"); dbeamformer(&d); if (DEBUG) syslog(LOG_INFO,"copy to host"); - cudaMemcpy(output_buffer,d.d_bigpower,block_out,cudaMemcpyDeviceToHost); + dsaMXmemcpyDeviceToHost(output_buffer, d.d_bigpower, block_out); } //end = clock(); //time_spent = (double)(end - begin) / CLOCKS_PER_SEC; diff --git a/src/dsaX_blas_interface.cu b/src/dsaX_blas_interface.cu new file mode 100644 index 0000000..430ba9e --- /dev/null +++ b/src/dsaX_blas_interface.cu @@ -0,0 +1,11 @@ +#include +#include "dsaX_cublas_interface.h" + +void dsaXHgemmStridedBatched(void *real_in, void *imag_in, void *real_out, void *imag_out, dsaXBLASParam param) { +#ifdef DSA_XENGINE_TARGET_CUDA + dsaXHgemmStridedBatchedCuda(real_in, imag_in, real_out, imag_out, param); +#else + std::cout "Not implemented" << std::endl; + exit(0); +#endif +} diff --git a/src/dsaX_cublas_interface.cu b/src/dsaX_cublas_interface.cu new file mode 100644 index 0000000..4631516 --- /dev/null +++ b/src/dsaX_cublas_interface.cu @@ -0,0 +1,92 @@ +#include "dsaX_cublas_interface.h" + +void dsaXHgemmStridedBatchedCuda(void *real_in, void *imag_in, void *real_out, void *imag_out, dsaXBLASParam param) { +#ifdef DSA_XENGINE_TARGET_CUDA + + // not sure if essential + cudaDeviceSynchronize(); + + // Set up for gemm + cublasHandle_t cublasH = NULL; + cudaStream_t stream = NULL; + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + cublasCreate(&cublasH); + cublasSetStream(cublasH, stream); + + // Transfer params + cublasOperation_t transa; + cublasOperation_t transb; + switch (blas_param.trans_a) { + case DSA_BLAS_OP_N: + transa = CUBLAS_OP_N; break; + case DSA_BLAS_OP_T: + transa = CUBLAS_OP_T; break; + case DSA_BLAS_OP_C: + transa = CUBLAS_OP_C; break; + default: + std::cout << "Unknown cublas transpose" << std::end; + } + + switch (blas_param.trans_b) { + case DSA_BLAS_OP_N: + transb = CUBLAS_OP_N; break; + case DSA_BLAS_OP_T: + transb = CUBLAS_OP_T; break; + case DSA_BLAS_OP_C: + 
transb = CUBLAS_OP_C; break; + default: + std::cout << "Unknown cublas transpose" << std::end; + } + + const int m = blas_param.m; + const int n = blas_param.n; + const int k = blas_param.k; + const half alpha = blas_param.alpha.real(); + const half malpha = -1.0 * alpha; + const int lda = blas_param.lda; + const int ldb = blas_param.ldb; + const half beta0 = blas_param.beta.real(); + const half beta1 = 1.0; + const int ldc = blas_param.ldc; + const long long int strideA = blas_param.a_stride; + const long long int strideB = blas_param.b_stride; + const long long int strideC = blas_param.c_stride; + const int batchCount = blas_param.batch_count; + + // run strided batched gemm for datatype (a + ib)(c + id) + // ac + cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, + &alpha,d->d_r,lda,strideA, + d->d_r,ldb,strideB,&beta0, + d->d_outr,ldc,strideC, + batchCount); + // bd + cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, + &alpha,d->d_i,lda,strideA, + d->d_i,ldb,strideB,&beta1, + d->d_outr,ldc,strideC, + batchCount); + // -bc + cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, + &malpha,d->d_i,lda,strideA, + d->d_r,ldb,strideB,&beta0, + d->d_outi,ldc,strideC, + batchCount); + // ad + cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, + &alpha,d->d_r,lda,strideA, + d->d_i,ldb,strideB,&beta1, + d->d_outi,ldc,strideC, + batchCount); + + // shown to be essential + cudaDeviceSynchronize(); + + // destroy stream + cudaStreamDestroy(stream); + cublasDestroy(cublasH); +#else + std::cout "Not implemented" << std::endl; + exit(0); +#endif +} diff --git a/src/dsaX_cuda_interface.cu b/src/dsaX_cuda_interface.cu new file mode 100644 index 0000000..31e44d0 --- /dev/null +++ b/src/dsaX_cuda_interface.cu @@ -0,0 +1,467 @@ +#include "dsaX_cuda_interface.h" + +// allocate device memory +void initialize_device_memory(dmem * d, int bf) { + + // for correlator + if (bf==0) { + cudaMalloc((void **)(&d->d_input), 
sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); + cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2); + cudaMalloc((void **)(&d->d_i), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2); + cudaMalloc((void **)(&d->d_tx), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); + cudaMalloc((void **)(&d->d_output), sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2); + cudaMalloc((void **)(&d->d_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); + cudaMalloc((void **)(&d->d_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); + cudaMalloc((void **)(&d->d_tx_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); + cudaMalloc((void **)(&d->d_tx_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); + } + + // for beamformer + if (bf==1) { + cudaMalloc((void **)(&d->d_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2); + cudaMalloc((void **)(&d->d_big_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS)*NCHAN_PER_PACKET*2*2); + cudaMalloc((void **)(&d->d_tx), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2); + cudaMalloc((void **)(&d->d_br), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2); + cudaMalloc((void **)(&d->d_bi), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2); + cudaMalloc((void **)(&d->weights_r), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); + cudaMalloc((void **)(&d->weights_i), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); + cudaMalloc((void **)(&d->d_bigbeam_r), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)); + cudaMalloc((void **)(&d->d_bigbeam_i), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)); + cudaMalloc((void **)(&d->d_bigpower), sizeof(unsigned char)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS)); + cudaMalloc((void **)(&d->d_scf), sizeof(float)*(NBEAMS/2)); // beam scale factor + 
cudaMalloc((void **)(&d->d_chscf), sizeof(float)*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); // beam scale factor + + // input weights: first is [NANTS, E/N], then [NANTS, 48, 2pol, R/I] + d->h_winp = (float *)malloc(sizeof(float)*(NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2)); + d->flagants = (int *)malloc(sizeof(int)*NANTS); + d->h_freqs = (float *)malloc(sizeof(float)*(NCHAN_PER_PACKET/8)); + cudaMalloc((void **)(&d->d_freqs), sizeof(float)*(NCHAN_PER_PACKET/8)); + + // timers + d->cp = 0.; + d->prep = 0.; + d->outp = 0.; + d->cubl = 0.; + + } +} + +// deallocate device memory +void deallocate_device_memory(dmem * d, int bf) { + + cudaFree(d->d_input); + + if (bf==0) { + cudaFree(d->d_r); + cudaFree(d->d_i); + cudaFree(d->d_tx); + cudaFree(d->d_output); + cudaFree(d->d_outr); + cudaFree(d->d_outi); + cudaFree(d->d_tx_outr); + cudaFree(d->d_tx_outi); + } + if (bf==1) { + cudaFree(d->d_tx); + cudaFree(d->d_br); + cudaFree(d->d_bi); + cudaFree(d->weights_r); + cudaFree(d->weights_i); + cudaFree(d->d_bigbeam_r); + cudaFree(d->d_bigbeam_i); + cudaFree(d->d_bigpower); + cudaFree(d->d_scf); + cudaFree(d->d_chscf); + free(d->h_winp); + free(d->flagants); + cudaFree(d->d_freqs); + free(d->h_freqs); + } +} + +// function to copy d_outr and d_outi to d_output +// inputs are [NCHAN_PER_PACKET, 2 time, 2 pol, NANTS, NANTS] +// the corr matrices are column major order +// output needs to be [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex] +// start with transpose to get [NANTS*NANTS, NCHAN_PER_PACKET*2*2], then sum into output using kernel +void reorder_output_device(dmem * d) { + + // transpose input data + dim3 dimBlock(32, 8), dimGrid((NANTS*NANTS)/32,(NCHAN_PER_PACKET*2*2*halfFac)/32); + transpose_matrix<<>>(d->d_outr,d->d_tx_outr); + transpose_matrix<<>>(d->d_outi,d->d_tx_outi); + + // look at output + /*char * odata = (char *)malloc(sizeof(char)*384*4*NANTS*NANTS*2*halfFac); + cudaMemcpy(odata,d->d_tx_outr,384*4*NANTS*NANTS*2*halfFac,cudaMemcpyDeviceToHost); + FILE *fout; + 
fout=fopen("test2.test","wb"); + fwrite(odata,sizeof(char),384*4*NANTS*NANTS*2*halfFac,fout); + fclose(fout);*/ + + + /* + // set up for geam + cublasHandle_t cublasH = NULL; + cudaStream_t stream = NULL; + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + cublasSetStream(cublasH, stream); + + // transpose output matrices into tx_outr and tx_outi + cublasOperation_t transa = CUBLAS_OP_T; + cublasOperation_t transb = CUBLAS_OP_N; + const int m = NCHAN_PER_PACKET*2*2; + const int n = NANTS*NANTS/16; // columns in output + const double alpha = 1.0; + const double beta = 0.0; + const int lda = n; + const int ldb = m; + const int ldc = ldb; + cublasDgeam(cublasH,transa,transb,m,n, + &alpha,(double *)(d->d_outr), + lda,&beta,(double *)(d->d_tx_outr), + ldb,(double *)(d->d_tx_outr),ldc); + cublasDgeam(cublasH,transa,transb,m,n, + &alpha,(double *)(d->d_outi), + lda,&beta,(double *)(d->d_tx_outi), + ldb,(double *)(d->d_tx_outi),ldc); + */ + // now run kernel to sum into output + int * h_idxs = (int *)malloc(sizeof(int)*NBASE); + int * d_idxs; + cudaMalloc((void **)(&d_idxs), sizeof(int)*NBASE); + int ii = 0; + // upper triangular order (column major) to match xGPU (not the same as CASA!) 
+ for (int i=0;i>>(d->d_tx_outr,d->d_tx_outi,d->d_output,d_idxs); + + /*char * odata = (char *)malloc(sizeof(char)*384*4*NBASE*4); + cudaMemcpy(odata,d->d_output,384*4*NBASE*4,cudaMemcpyDeviceToHost); + FILE *fout; + fout=fopen("test3.test","wb"); + fwrite(odata,sizeof(char),384*4*NBASE*4,fout); + fclose(fout);*/ + + + cudaFree(d_idxs); + free(h_idxs); + //cudaStreamDestroy(stream); + +} + +// kernel to fluff input +// run with 128 threads and NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 blocks +__global__ void corr_input_copy(char *input, half *inr, half *ini) { + + int bidx = blockIdx.x; // assume NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 + int tidx = threadIdx.x; // assume 128 + int iidx = bidx*128+tidx; + + inr[iidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(15)) << 4) >> 4)); + ini[iidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4)); + +} + +// transpose kernel +// assume breakdown into tiles of 32x32, and run with 32x8 threads per block +// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32) +// here, width is the dimension of the fastest index +template __global__ void transpose_matrix(in_prec * idata, out_prec * odata) { + + __shared__ in_prec tile[32][33]; + + int x = blockIdx.x * 32 + threadIdx.x; + int y = blockIdx.y * 32 + threadIdx.y; + int width = gridDim.x * 32; + + for (int j = 0; j < 32; j += 8) + tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; + + __syncthreads(); + + x = blockIdx.y * 32 + threadIdx.x; // transpose block offset + y = blockIdx.x * 32 + threadIdx.y; + width = gridDim.y * 32; + + for (int j = 0; j < 32; j += 8) + odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; + +} + + +// function to copy and reorder d_input to d_r and d_i +// input is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] +// output is [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS] +// starts by 
running transpose on [NPACKETS_PER_BLOCK * NANTS, NCHAN_PER_PACKET * 2 * 2] matrix in doubleComplex form. +// then fluffs using simple kernel +void reorder_input_device(char *input, char * tx, half *inr, half *ini) { + + // transpose input data + dim3 dimBlock(32, 8), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32); + transpose_matrix<<>>(input,tx); + corr_input_copy<<>>(tx,inr,ini); +} + +// kernel to help with reordering output +// outr and outi are [NANTS, NANTS, NCHAN_PER_PACKET, 2time, 2pol, halfFac] +// run with NCHAN_PER_PACKET*2*NBASE/128 blocks of 128 threads +__global__ void corr_output_copy(half *outr, half *outi, float *output, int *indices_lookup) { + + int bidx = blockIdx.x; // assume NCHAN_PER_PACKET*2*NBASE/128 + int tidx = threadIdx.x; // assume 128 + int idx = bidx*128+tidx; + + int baseline = (int)(idx / (NCHAN_PER_PACKET * 2)); + int chpol = (int)(idx % (NCHAN_PER_PACKET * 2)); + int ch = (int)(chpol / 2); + int base_idx = indices_lookup[baseline]; + int iidx = base_idx * NCHAN_PER_PACKET + ch; + int pol = (int)(chpol % 2); + + float v1=0., v2=0.; + + for (int i=0;ih_winp[2*i]; + antpos_n[i] = d->h_winp[2*i+1]; + } + for (int i=0;inflags;j++) + //if (d->flagants[j]==iant) found = 1; + + calibs[2*i] = d->h_winp[2*NANTS+2*i]; + calibs[2*i+1] = d->h_winp[2*NANTS+2*i+1]; + + wnorm = sqrt(calibs[2*i]*calibs[2*i] + calibs[2*i+1]*calibs[2*i+1]); + if (wnorm!=0.0) { + calibs[2*i] /= wnorm; + calibs[2*i+1] /= wnorm; + } + + //if (found==1) { + //calibs[2*i] = 0.; + //calibs[2*i+1] = 0.; + //} + } + + //for (int i=0;i>>(d_antpos_e,d_antpos_n,d_calibs,d->weights_r,d->weights_i,d->d_freqs); + + // free stuff + cudaFree(d_antpos_e); + cudaFree(d_antpos_n); + cudaFree(d_calibs); + free(antpos_e); + free(antpos_n); + free(calibs); + +} + +// kernel to fluff input bf data +// run with NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 blocks of 128 threads +__global__ void fluff_input_bf(char * input, half * dr, half * di) { + + int bidx 
= blockIdx.x; // assume NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 + int tidx = threadIdx.x; // assume 128 + int idx = bidx*128+tidx; + + dr[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(15)) << 4) >> 4))); + di[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(240))) >> 4))); + +} + +// transpose, add and scale kernel for bf +// assume breakdown into tiles of 16x16, and run with 16x8 threads per block +// launch with dim3 dimBlock(16, 8) and dim3 dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16) +// scf is a per-beam scale factor to enable recasting as unsigned char +__global__ void transpose_scale_bf(half * ir, half * ii, unsigned char * odata) { + + __shared__ float tile[16][17]; + + int x = blockIdx.x * 16 + threadIdx.x; + int y = blockIdx.y * 16 + threadIdx.y; + int width = gridDim.x * 16; + float dr, di; + + for (int j = 0; j < 16; j += 8) { + dr = (float)(ir[(y+j)*width + x]); + di = (float)(ii[(y+j)*width + x]); + tile[threadIdx.y+j][threadIdx.x] = (dr*dr+di*di); + } + + __syncthreads(); + + x = blockIdx.y * 16 + threadIdx.x; // transpose block offset + y = blockIdx.x * 16 + threadIdx.y; + width = gridDim.y * 16; + + for (int j = 0; j < 16; j += 8) + odata[(y+j)*width + x] = (unsigned char)(tile[threadIdx.x][threadIdx.y + j]/128.); + +} + +// sum over all times in output beam array +// run with (NCHAN_PER_PACKET/8)*(NBEAMS/2) blocks of (NPACKETS_PER_BLOCK/4) threads +__global__ void sum_beam(unsigned char * input, float * output) { + + __shared__ float summ[512]; + int bidx = blockIdx.x; + int tidx = threadIdx.x; + //int idx = bidx*256+tidx; + int bm = (int)(bidx/48); + int ch = (int)(bidx % 48); + + summ[tidx] = (float)(input[bm*256*48 + tidx*48 + ch]); + + __syncthreads(); + + if (tidx<256) { + summ[tidx] += summ[tidx+256]; + summ[tidx] += summ[tidx+128]; + summ[tidx] += summ[tidx+64]; + summ[tidx] += summ[tidx+32]; + summ[tidx] += 
summ[tidx+16]; + summ[tidx] += summ[tidx+8]; + summ[tidx] += summ[tidx+4]; + summ[tidx] += summ[tidx+2]; + summ[tidx] += summ[tidx+1]; + } + + if (tidx==0) output[bidx] = summ[tidx]; + +} diff --git a/src/dsaX_utils.cpp b/src/dsaX_utils.cpp new file mode 100644 index 0000000..46abfc9 --- /dev/null +++ b/src/dsaX_utils.cpp @@ -0,0 +1,30 @@ +#include "dsaX_utils.h" +#ifdef DSA_XENGINE_TARGET_CUDA +#include "dsaX_cuda_headers.h" +#endif + +void dsaXmemset(void *array, int ch, size_t n){ +#ifdef DSA_XENGINE_TARGET_CUDA + cudaMemset(array, ch, n); +#else + emset(array, ch, n); +#endif +} + +void dsaXmemcpyHostToDevice(void *array_device, void *array_host, size_t n){ +#ifdef DSA_XENGINE_TARGET_CUDA + // Perform host to device memcopy on data + cudaMemcpy(array_device, array_host, n, cudaMemcpyHostToDevice); +#else + memcpy(array_device, array_host, n); +#endif +} + +void dsaXmemcpyDeviceToHost(void *array_host, void *array_device, size_t n){ +#ifdef DSA_XENGINE_TARGET_CUDA + // Perform host to device memcopy on data + cudaMemcpy(array_host, array_device, n, cudaMemcpyDeviceToHost); +#else + memcpy(array_host, array_device, n); +#endif +} diff --git a/src/planar_complex.cu b/src/planar_complex.cu new file mode 100644 index 0000000..3fb8175 --- /dev/null +++ b/src/planar_complex.cu @@ -0,0 +1,87 @@ +/* +#include +#include +#include +#include + +int main() { + + cutlass::half_t x = 2.25_hf; + + std::cout << x << std::endl; + + return 0; +} +*/ + +#include +#include + +#include + +int main() { + + // Define the GEMM operation + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, // ElementA + cutlass::layout::ColumnMajor, // LayoutA + cutlass::half_t, // ElementB + cutlass::layout::ColumnMajor, // LayoutB + cutlass::half_t, // ElementOutput + cutlass::layout::ColumnMajor, // LayoutOutput + float, // ElementAccumulator + cutlass::arch::OpClassTensorOp, // tag indicating Tensor Cores + cutlass::arch::Sm75 // tag indicating target GPU compute architecture + >; + + Gemm 
gemm_op; + cutlass::Status status; + + // + // Define the problem size + // + int M = 512; + int N = 256; + int K = 128; + + float alpha = 1.25f; + float beta = -1.25f; + + // + // Allocate device memory + // + + cutlass::HostTensor A({M, K}); + cutlass::HostTensor B({K, N}); + cutlass::HostTensor C({M, N}); + + cutlass::half_t const *ptrA = A.device_data(); + cutlass::half_t const *ptrB = B.device_data(); + cutlass::half_t const *ptrC = C.device_data(); + cutlass::half_t *ptrD = C.device_data(); + + int lda = A.device_ref().stride(0); + int ldb = B.device_ref().stride(0); + int ldc = C.device_ref().stride(0); + int ldd = C.device_ref().stride(0); + // + // Launch GEMM on the device + // + + status = gemm_op({ + {M, N, K}, + {ptrA, lda}, // TensorRef to A device tensor + {ptrB, ldb}, // TensorRef to B device tensor + {ptrC, ldc}, // TensorRef to C device tensor + {ptrD, ldd}, // TensorRef to D device tensor - may be the same as C + {alpha, beta} // epilogue operation arguments + }); + + if (status != cutlass::Status::kSuccess) { + return -1; + } else { + std::cout << "CUTLASS Success! 
" << std::endl; + } + + return 0; +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 0000000..9d29854 --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,5 @@ + +#include_directories(../include) +include_directories(${CLI11_SOURCE_DIR}/include/CLI) +add_executable(dsaX_beamformer_correlator_test dsaX_beamformer_correlator_test.cpp) + diff --git a/tests/CMakeLists.txt~ b/tests/CMakeLists.txt~ new file mode 100644 index 0000000..f72156b --- /dev/null +++ b/tests/CMakeLists.txt~ @@ -0,0 +1,5 @@ + +#include_directories(../include) +include_directories(${CLI11_SOURCE_DIR}/src) +add_executable(dsaX_beamformer_correlator_test dsaX_beamformer_correlator_test.cpp) + diff --git a/tests/dsaX_beamformer_correlator_test.cpp b/tests/dsaX_beamformer_correlator_test.cpp new file mode 100644 index 0000000..3e723d0 --- /dev/null +++ b/tests/dsaX_beamformer_correlator_test.cpp @@ -0,0 +1,399 @@ +#include +#include +#include +#include +#include + +// Include the dsaX.h header in your application +//#include + +int main(int argc, char **argv) { + + /* + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_bfCorr", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + // DADA Header plus Data Unit + dada_hdu_t* hdu_in = 0; + dada_hdu_t* hdu_out = 0; + + // data block HDU keys + key_t in_key = REORDER_BLOCK_KEY; + key_t out_key = XGPU_BLOCK_KEY; + + // command line arguments + int core = -1; + int arg = 0; + int bf = 0; + int test = 0; + char ftest[200], fflagants[200], fcalib[200]; + float sfreq = 1498.75; + + while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:bdh")) != -1) + { + switch (arg) + { + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); 
+ return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 't': + if (optarg) + { + test = 1; + syslog(LOG_INFO, "test mode"); + if (sscanf (optarg, "%s", &ftest) != 1) { + syslog(LOG_ERR, "could not read test file name from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-t flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'a': + if (optarg) + { + syslog(LOG_INFO, "read calib file %s",optarg); + if (sscanf (optarg, "%s", &fcalib) != 1) { + syslog(LOG_ERR, "could not read calib file name from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-a flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'f': + if (optarg) + { + syslog(LOG_INFO, "reading flag ants file %s",optarg); + if (sscanf (optarg, "%s", &fflagants) != 1) { + syslog(LOG_ERR, "could not read flagants file name from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-f flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 's': + if (optarg) + { + sfreq = atof(optarg); + syslog(LOG_INFO, "start freq %g",sfreq); + break; + } + else + { + syslog(LOG_ERR,"-s flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'd': + //DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + case 'b': + bf=1; + syslog (LOG_NOTICE, "Running beamformer, NOT correlator"); + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + // Bind to cpu core + if (core >= 0) { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + 
syslog(LOG_NOTICE,"bound to core %d", core); + } + + + // allocate device memory + dmem d; + initialize_device_memory(&d,bf); + + // set up for beamformer + FILE *ff; + int iii; + if (bf) { + + if (!(ff=fopen(fflagants,"r"))) { + syslog(LOG_ERR,"could not open flagants file\n"); + exit(1); + } + d.nflags=0; + while (!feof(ff)) { + fscanf(ff,"%d\n",&d.flagants[iii]); + d.nflags++; + } + fclose(ff); + + if (!(ff=fopen(fcalib,"rb"))) { + syslog(LOG_ERR,"could not open calibss file\n"); + exit(1); + } + fread(d.h_winp,NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2,4,ff); + fclose(ff); + + for (iii=0;iii<(NCHAN_PER_PACKET/8);iii++) + d.h_freqs[iii] = 1e6*(sfreq-iii*250./1024.); + cudaMemcpy(d.d_freqs,d.h_freqs,sizeof(float)*(NCHAN_PER_PACKET/8),cudaMemcpyHostToDevice); + + // calculate weights + calc_weights(&d); + + } + + // test mode + FILE *fin, *fout; + uint64_t output_size; + char * output_data;//, * o1; + if (test) { + + // read one block of input data + d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); + for (int i=0;i<512;i++) { + fin = fopen(ftest,"rb"); + fread(d.h_input+i*4*NANTS*NCHAN_PER_PACKET*2*2,4*NANTS*NCHAN_PER_PACKET*2*2,1,fin); + fclose(fin); + } + + // run correlator or beamformer, and output data + if (bf==0) { + if (DEBUG) syslog(LOG_INFO,"run correlator"); + dcorrelator(&d); + if (DEBUG) syslog(LOG_INFO,"copy to host"); + output_size = NBASE*NCHAN_PER_PACKET*2*2*4; + output_data = (char *)malloc(output_size); + cudaMemcpy(output_data,d.d_output,output_size,cudaMemcpyDeviceToHost); + + fout = fopen("output.dat","wb"); + fwrite((float *)output_data,sizeof(float),NBASE*NCHAN_PER_PACKET*2*2,fout); + fclose(fout); + } + else { + if (DEBUG) syslog(LOG_INFO,"run beamformer"); + dbeamformer(&d); + if (DEBUG) syslog(LOG_INFO,"copy to host"); + output_size = (NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS; + output_data = (char *)malloc(output_size); + cudaMemcpy(output_data,d.d_bigpower,output_size,cudaMemcpyDeviceToHost); 
+ + // output_size = 2*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8); + // o1 = (char *)malloc(output_size); + // cudaMemcpy(o1,d.weights_r,output_size,cudaMemcpyDeviceToHost); + + + + fout = fopen("output.dat","wb"); + fwrite((unsigned char *)output_data,sizeof(unsigned char),output_size,fout); + //fwrite(o1,1,output_size,fout); + fclose(fout); + } + + + // free + free(d.h_input); + free(output_data); + //free(o1); + deallocate_device_memory(&d,bf); + + exit(1); + } + + + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (0); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + 
syslog (LOG_ERR, "could not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); + if (bf==0) + syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,NBASE*NCHAN_PER_PACKET*2*2*4); + else + syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS); + uint64_t bytes_read = 0; + //char * block; + char * output_buffer; + output_buffer = (char *)malloc(block_out); + uint64_t written, block_id; + + // get things started + bool observation_complete=0; + //bool started = 0; + syslog(LOG_INFO, "starting observation"); + int blocks = 0; + //clock_t begin, end; + //double time_spent; + + while (!observation_complete) { + + if (DEBUG) syslog(LOG_INFO,"reading block"); + + // open block + d.h_input = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + + // do stuff + //begin = clock(); + // loop + if (bf==0) { + if (DEBUG) syslog(LOG_INFO,"run correlator"); + dcorrelator(&d); + if (DEBUG) syslog(LOG_INFO,"copy to host"); + cudaMemcpy(output_buffer,d.d_output,block_out,cudaMemcpyDeviceToHost); + } + else { + if (DEBUG) syslog(LOG_INFO,"run beamformer"); + dbeamformer(&d); + if (DEBUG) syslog(LOG_INFO,"copy to host"); + cudaMemcpy(output_buffer,d.d_bigpower,block_out,cudaMemcpyDeviceToHost); + } + //end = clock(); + //time_spent = (double)(end - begin) / CLOCKS_PER_SEC; + cout << "spent time " << d.cp << " " << d.prep << " " << d.cubl << " " << d.outp << " s" << endl; + + // write to output + + 
// write to host + written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out); + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + if (DEBUG) syslog(LOG_INFO, "written block %d",blocks); + blocks++; + // loop end + + + // finish up + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + // finish up + free(output_buffer); + deallocate_device_memory(&d,bf); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + + return 0; + */ +} diff --git a/tests/dsaX_beamformer_correlator_test.cpp~ b/tests/dsaX_beamformer_correlator_test.cpp~ new file mode 100644 index 0000000..30184b3 --- /dev/null +++ b/tests/dsaX_beamformer_correlator_test.cpp~ @@ -0,0 +1,398 @@ +#include +#include +#include +#include +#include + +// Include the dsaX.h header in your application +//#include + +int main(int argc, char **argv) { + + // startup syslog message + // using LOG_LOCAL0 + openlog ("dsaX_bfCorr", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); + syslog (LOG_NOTICE, "Program started by User %d", getuid ()); + + // DADA Header plus Data Unit + dada_hdu_t* hdu_in = 0; + dada_hdu_t* hdu_out = 0; + + // data block HDU keys + key_t in_key = REORDER_BLOCK_KEY; + key_t out_key = XGPU_BLOCK_KEY; + + // command line arguments + int core = -1; + int arg = 0; + int bf = 0; + int test = 0; + char ftest[200], fflagants[200], fcalib[200]; + float sfreq = 1498.75; + + while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:bdh")) != -1) + { + switch (arg) + { + case 'c': + if (optarg) + { + core = atoi(optarg); + break; + } + else + { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'i': + if (optarg) + { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + 
syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'o': + if (optarg) + { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 't': + if (optarg) + { + test = 1; + syslog(LOG_INFO, "test mode"); + if (sscanf (optarg, "%s", &ftest) != 1) { + syslog(LOG_ERR, "could not read test file name from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-t flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'a': + if (optarg) + { + syslog(LOG_INFO, "read calib file %s",optarg); + if (sscanf (optarg, "%s", &fcalib) != 1) { + syslog(LOG_ERR, "could not read calib file name from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-a flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'f': + if (optarg) + { + syslog(LOG_INFO, "reading flag ants file %s",optarg); + if (sscanf (optarg, "%s", &fflagants) != 1) { + syslog(LOG_ERR, "could not read flagants file name from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else + { + syslog(LOG_ERR,"-f flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 's': + if (optarg) + { + sfreq = atof(optarg); + syslog(LOG_INFO, "start freq %g",sfreq); + break; + } + else + { + syslog(LOG_ERR,"-s flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'd': + //DEBUG=1; + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + case 'b': + bf=1; + syslog (LOG_NOTICE, "Running beamformer, NOT correlator"); + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + // Bind to cpu core + if (core >= 0) { + if (dada_bind_thread_to_core(core) < 0) + syslog(LOG_ERR,"failed to bind to core %d", core); + syslog(LOG_NOTICE,"bound to core %d", core); + } + + /* + // 
allocate device memory + dmem d; + initialize_device_memory(&d,bf); + + // set up for beamformer + FILE *ff; + int iii; + if (bf) { + + if (!(ff=fopen(fflagants,"r"))) { + syslog(LOG_ERR,"could not open flagants file\n"); + exit(1); + } + d.nflags=0; + while (!feof(ff)) { + fscanf(ff,"%d\n",&d.flagants[iii]); + d.nflags++; + } + fclose(ff); + + if (!(ff=fopen(fcalib,"rb"))) { + syslog(LOG_ERR,"could not open calibss file\n"); + exit(1); + } + fread(d.h_winp,NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2,4,ff); + fclose(ff); + + for (iii=0;iii<(NCHAN_PER_PACKET/8);iii++) + d.h_freqs[iii] = 1e6*(sfreq-iii*250./1024.); + cudaMemcpy(d.d_freqs,d.h_freqs,sizeof(float)*(NCHAN_PER_PACKET/8),cudaMemcpyHostToDevice); + + // calculate weights + calc_weights(&d); + + } + + // test mode + FILE *fin, *fout; + uint64_t output_size; + char * output_data;//, * o1; + if (test) { + + // read one block of input data + d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); + for (int i=0;i<512;i++) { + fin = fopen(ftest,"rb"); + fread(d.h_input+i*4*NANTS*NCHAN_PER_PACKET*2*2,4*NANTS*NCHAN_PER_PACKET*2*2,1,fin); + fclose(fin); + } + + // run correlator or beamformer, and output data + if (bf==0) { + if (DEBUG) syslog(LOG_INFO,"run correlator"); + dcorrelator(&d); + if (DEBUG) syslog(LOG_INFO,"copy to host"); + output_size = NBASE*NCHAN_PER_PACKET*2*2*4; + output_data = (char *)malloc(output_size); + cudaMemcpy(output_data,d.d_output,output_size,cudaMemcpyDeviceToHost); + + fout = fopen("output.dat","wb"); + fwrite((float *)output_data,sizeof(float),NBASE*NCHAN_PER_PACKET*2*2,fout); + fclose(fout); + } + else { + if (DEBUG) syslog(LOG_INFO,"run beamformer"); + dbeamformer(&d); + if (DEBUG) syslog(LOG_INFO,"copy to host"); + output_size = (NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS; + output_data = (char *)malloc(output_size); + cudaMemcpy(output_data,d.d_bigpower,output_size,cudaMemcpyDeviceToHost); + + // output_size = 
2*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8); + // o1 = (char *)malloc(output_size); + // cudaMemcpy(o1,d.weights_r,output_size,cudaMemcpyDeviceToHost); + + + + fout = fopen("output.dat","wb"); + fwrite((unsigned char *)output_data,sizeof(unsigned char),output_size,fout); + //fwrite(o1,1,output_size,fout); + fclose(fout); + } + + + // free + free(d.h_input); + free(output_data); + //free(o1); + deallocate_device_memory(&d,bf); + + exit(1); + } + + + + + // DADA stuff + + syslog (LOG_INFO, "creating in and out hdus"); + + hdu_in = dada_hdu_create (0); + dada_hdu_set_key (hdu_in, in_key); + if (dada_hdu_connect (hdu_in) < 0) { + syslog (LOG_ERR,"could not connect to dada buffer in"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_read (hdu_in) < 0) { + syslog (LOG_ERR,"could not lock to dada buffer in"); + return EXIT_FAILURE; + } + + hdu_out = dada_hdu_create (0); + dada_hdu_set_key (hdu_out, out_key); + if (dada_hdu_connect (hdu_out) < 0) { + syslog (LOG_ERR,"could not connect to output buffer"); + return EXIT_FAILURE; + } + if (dada_hdu_lock_write(hdu_out) < 0) { + syslog (LOG_ERR, "could not lock to output buffer"); + return EXIT_FAILURE; + } + + uint64_t header_size = 0; + + // deal with headers + char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); + if (!header_in) + { + syslog(LOG_ERR, "could not read next header"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) + { + syslog (LOG_ERR, "could not mark header block cleared"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + char * header_out = ipcbuf_get_next_write (hdu_out->header_block); + if (!header_out) + { + syslog(LOG_ERR, "could not get next header block [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + memcpy (header_out, header_in, header_size); + if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) + { + syslog (LOG_ERR, "could 
not mark header block filled [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); + + // get block sizes and allocate memory + uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); + uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); + syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); + if (bf==0) + syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,NBASE*NCHAN_PER_PACKET*2*2*4); + else + syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS); + uint64_t bytes_read = 0; + //char * block; + char * output_buffer; + output_buffer = (char *)malloc(block_out); + uint64_t written, block_id; + + // get things started + bool observation_complete=0; + //bool started = 0; + syslog(LOG_INFO, "starting observation"); + int blocks = 0; + //clock_t begin, end; + //double time_spent; + + while (!observation_complete) { + + if (DEBUG) syslog(LOG_INFO,"reading block"); + + // open block + d.h_input = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); + + // do stuff + //begin = clock(); + // loop + if (bf==0) { + if (DEBUG) syslog(LOG_INFO,"run correlator"); + dcorrelator(&d); + if (DEBUG) syslog(LOG_INFO,"copy to host"); + cudaMemcpy(output_buffer,d.d_output,block_out,cudaMemcpyDeviceToHost); + } + else { + if (DEBUG) syslog(LOG_INFO,"run beamformer"); + dbeamformer(&d); + if (DEBUG) syslog(LOG_INFO,"copy to host"); + cudaMemcpy(output_buffer,d.d_bigpower,block_out,cudaMemcpyDeviceToHost); + } + //end = clock(); + //time_spent = (double)(end - begin) / CLOCKS_PER_SEC; + cout << "spent time " << d.cp << " " << d.prep << " " << d.cubl << " " << d.outp << " s" << endl; + + // write to output + + // write to host + 
written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out); + if (written < block_out) + { + syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + return EXIT_FAILURE; + } + + if (DEBUG) syslog(LOG_INFO, "written block %d",blocks); + blocks++; + // loop end + + + // finish up + if (bytes_read < block_size) + observation_complete = 1; + + ipcio_close_block_read (hdu_in->data_block, bytes_read); + + } + + // finish up + free(output_buffer); + deallocate_device_memory(&d,bf); + dsaX_dbgpu_cleanup (hdu_in, hdu_out); + + return 0; + */ +} diff --git a/utils/.gitignore b/tests/utils/.gitignore similarity index 100% rename from utils/.gitignore rename to tests/utils/.gitignore diff --git a/tests/utils/CMakeLists.txt b/tests/utils/CMakeLists.txt new file mode 100644 index 0000000..226c9de --- /dev/null +++ b/tests/utils/CMakeLists.txt @@ -0,0 +1,11 @@ +# install step for utils +#------------------------------ +set(DSA_XENGINE_UTILS + # cmake-format: sortable + gen_packet.py + get_rms_packet.py + get_rms.py + sockets.py + ) +install(FILES ${DSA_XENGINE_UTILS} DESTINATION utils) +#------------------------------ diff --git a/tests/utils/CMakeLists.txt~ b/tests/utils/CMakeLists.txt~ new file mode 100644 index 0000000..ab053c5 --- /dev/null +++ b/tests/utils/CMakeLists.txt~ @@ -0,0 +1,22 @@ +# install step for utils +#------------------------------ +set(DSA_XENGINE_UTILS + # cmake-format: sortable +/home/dmhowart/DSA110/dsa110-xengine/src/dsaX_bfCorr.cu dsaX_capture.h + dsaX_capture_manythread.h + dsaX_capture_pcap.h + dsaX_def.h + dsaX_cutlass_interface.h + ) +install(FILES ${DSA_XENGINE_HEADERS} DESTINATION include) +#------------------------------ + +# install step for executables +#----------------------------- +install(TARGETS + # cmake-format: sortable + dsaX_bfCorr + RUNTIME DESTINATION + bin + ) +#----------------------------- diff --git a/utils/gen_packet.py b/tests/utils/gen_packet.py 
similarity index 100% rename from utils/gen_packet.py rename to tests/utils/gen_packet.py diff --git a/utils/get_rms.py b/tests/utils/get_rms.py similarity index 100% rename from utils/get_rms.py rename to tests/utils/get_rms.py diff --git a/utils/get_rms_packet.py b/tests/utils/get_rms_packet.py similarity index 100% rename from utils/get_rms_packet.py rename to tests/utils/get_rms_packet.py diff --git a/utils/packet.out b/tests/utils/packet.out similarity index 100% rename from utils/packet.out rename to tests/utils/packet.out diff --git a/utils/sockets.py b/tests/utils/sockets.py similarity index 100% rename from utils/sockets.py rename to tests/utils/sockets.py diff --git a/utils/test.out b/tests/utils/test.out similarity index 100% rename from utils/test.out rename to tests/utils/test.out From b7789e216bb6e56286fada15eabe874e69b803c5 Mon Sep 17 00:00:00 2001 From: cpviolator Date: Mon, 24 Jun 2024 15:17:19 -0700 Subject: [PATCH 19/30] Rename psrdada header file, split beamformer and correlator files (correlator is now pure cpp code, independent of platform). 
Create a test file independent of platform and psrdada, restore accidentally deleted utils --- README.md | 9 +- include/CMakeLists.txt | 1 + include/dsaX.h | 32 +- include/dsaX_cublas_interface.h | 2 +- include/dsaX_cuda_interface.h | 27 +- include/dsaX_def.h | 3 +- ...psrdada_headers.h => dsaX_psrdada_utils.h} | 4 + include/dsaX_utils.h | 2 +- src/CMakeLists.txt | 20 +- src/dsaX_beamformer.cu | 168 +++++ src/dsaX_beamformer_correlator.cu | 612 ------------------ src/dsaX_blas_interface.cu | 2 +- src/dsaX_correlator.cpp | 59 ++ src/dsaX_cublas_interface.cu | 35 +- src/dsaX_cuda_interface.cu | 66 +- src/dsaX_utils.cpp | 9 + tests/CMakeLists.txt | 6 +- tests/dsaX_beamformer_correlator_test.cpp | 399 ------------ tests/dsaX_correlator_test.cpp | 195 ++++++ utils/gen_packet.py | 228 +++++++ utils/gen_testblock.py | 49 ++ utils/get_rms.py | 141 ++++ utils/get_rms_packet.py | 36 ++ utils/sockets.py | 31 + 24 files changed, 1024 insertions(+), 1112 deletions(-) rename include/{dsaX_psrdada_headers.h => dsaX_psrdada_utils.h} (70%) create mode 100644 src/dsaX_beamformer.cu delete mode 100644 src/dsaX_beamformer_correlator.cu create mode 100644 src/dsaX_correlator.cpp delete mode 100644 tests/dsaX_beamformer_correlator_test.cpp create mode 100644 tests/dsaX_correlator_test.cpp create mode 100644 utils/gen_packet.py create mode 100644 utils/gen_testblock.py create mode 100644 utils/get_rms.py create mode 100644 utils/get_rms_packet.py create mode 100644 utils/sockets.py diff --git a/README.md b/README.md index 4a27ba5..f771017 100644 --- a/README.md +++ b/README.md @@ -71,11 +71,4 @@ Finally, `dsaX_dbnic` and `dsaX_nicdb` implement the corner turn to feed `mbheim ### scripts and utils -The "scripts" dir contains some useful scripts to test various aspects of the system (corr, bf, cornerturn). The "utils" dir includes functionality to generate fake data and beamforming weights. 
- - - - - - - +The "scripts" dir contains some useful scripts to test various aspects of the system (corr, bf, cornerturn). The "utils" dir includes functionality to generate fake data and beamforming weights. diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt index 1bbdfda..a056a0f 100644 --- a/include/CMakeLists.txt +++ b/include/CMakeLists.txt @@ -4,6 +4,7 @@ enable_language(CUDA) #------------------------------ set(DSA_XENGINE_HEADERS # cmake-format: sortable + dsaX_cuda_interface.h dsaX_cuda_headers.h dsaX_capture.h dsaX_capture_manythread.h diff --git a/include/dsaX.h b/include/dsaX.h index 2ee856a..7cf23dc 100644 --- a/include/dsaX.h +++ b/include/dsaX.h @@ -1,24 +1,9 @@ #pragma once -#include -#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "dsaX_enums.h" #include "dsaX_cuda_headers.h" -#include "dsaX_psrdada_headers.h" // required to prevent overflow in corr matrix multiply #define halfFac 4 @@ -26,9 +11,6 @@ // beam sep #define sep 1.0 // arcmin -/* global variables */ -//#define DEBUG; - // define structure that carries around device memory typedef struct dmem { @@ -91,15 +73,3 @@ typedef struct dsaXBLASParam_s { dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */ } dsaXBLASParam; - - -// Initialise device memory -void initialize_device_memeory(dmem * d, int bf); - -// Deallocate device memory -void deallocate(dmem * d, int bf); - -void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); - -// ? 
-int dada_bind_thread_to_core(int core); diff --git a/include/dsaX_cublas_interface.h b/include/dsaX_cublas_interface.h index 9265f37..7ad8b31 100644 --- a/include/dsaX_cublas_interface.h +++ b/include/dsaX_cublas_interface.h @@ -2,4 +2,4 @@ #include "dsaX.h" #include "dsaX_cuda_headers.h" -void dsaXHgemmStridedBatchedCuda(void *real_in, void *imag_in, void *real_out, void *imag_out, dsaXBLASParam param); +void dsaXHgemmStridedBatchedCuda(half *real_in, half *imag_in, half *real_out, half *imag_out, dsaXBLASParam param); diff --git a/include/dsaX_cuda_interface.h b/include/dsaX_cuda_interface.h index 99b1db2..c8ea8aa 100644 --- a/include/dsaX_cuda_interface.h +++ b/include/dsaX_cuda_interface.h @@ -1,31 +1,32 @@ #pragma once -#include "dsaX.h" #include "dsaX_def.h" +#include "dsaX.h" -void initialize_device_memory(dmem * d, int bf); +#ifdef DSA_XENGINE_TARGET_CUDA +void initialize_device_memory(dmem *d, int bf); -void deallocate_device_memory(dmem * d, int bf); +void deallocate_device_memory(dmem *d, int bf); -void reorder_output_device(dmem * d); +void reorder_output_device(dmem *d); __global__ void corr_input_copy(char *input, half *inr, half *ini); -template __global__ void transpose_matrix(in_prec * idata, out_prec * odata); +template __global__ void transpose_matrix(in_prec *idata, out_prec *odata); -void reorder_input_device(char *input, char * tx, half *inr, half *ini); +void reorder_input_device(char *input, char *tx, half *inr, half *ini); __global__ void corr_output_copy(half *outr, half *outi, float *output, int *indices_lookup); -__global__ void transpose_input_bf(double * idata, double * odata); - -__global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs); +__global__ void transpose_input_bf(double *idata, double *odata); -void calc_weights(dmem * d); +__global__ void populate_weights_matrix(float *antpos_e, float *antpos_n, float *calibs, half *wr, half *wi, float *fqs); -__global__ 
void fluff_input_bf(char * input, half * dr, half * di); +void calc_weights(dmem *d); -__global__ void transpose_scale_bf(half * ir, half * ii, unsigned char * odata); +__global__ void fluff_input_bf(char *input, half *dr, half *di); -__global__ void sum_beam(unsigned char * input, float * output); +__global__ void transpose_scale_bf(half *ir, half *ii, unsigned char *odata); +__global__ void sum_beam(unsigned char *input, float *output); +#endif diff --git a/include/dsaX_def.h b/include/dsaX_def.h index c23ed15..257f493 100644 --- a/include/dsaX_def.h +++ b/include/dsaX_def.h @@ -1,7 +1,5 @@ #pragma once -#include "dada_def.h" - // default dada block keys #define TEST_BLOCK_KEY 0x0000aada // for capture program. // 128*3*384*32*2=9437184 for 1 CHANG 1 SNAP 1 REORDER @@ -38,6 +36,7 @@ #define XGPU_IN_INC 1 // size of input increment #define NBASE 4656 // nant*(nant+1)/2 #define NPOL 2 +#define NCOMPLEX 2 // two reals per complex #define NCHAN 1536 // regardless of NCHANG // default port for packet capture diff --git a/include/dsaX_psrdada_headers.h b/include/dsaX_psrdada_utils.h similarity index 70% rename from include/dsaX_psrdada_headers.h rename to include/dsaX_psrdada_utils.h index 325dcb8..2dc3dec 100644 --- a/include/dsaX_psrdada_headers.h +++ b/include/dsaX_psrdada_utils.h @@ -10,3 +10,7 @@ #include "ascii_header.h" #include "dsaX_def.h" #include "dsaX_enums.h" + +void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); + +int dada_bind_thread_to_core(int core); diff --git a/include/dsaX_utils.h b/include/dsaX_utils.h index 3976db7..5d39861 100644 --- a/include/dsaX_utils.h +++ b/include/dsaX_utils.h @@ -6,4 +6,4 @@ void dsaXmemset(void *array, int ch, size_t n); void dsaXmemcpyHostToDevice(void *array_device, void *array_host, size_t n); void dsaXmemcpyDeviceToHost(void *array_host, void *array_device, size_t n); - +void dsaXmemcpyDeviceToDevice(void *array_device_to, void *array_device_from, size_t n); diff --git a/src/CMakeLists.txt 
b/src/CMakeLists.txt index e23827b..c73743a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -6,9 +6,12 @@ include_directories(../include) #----------------------------- add_library(dsa SHARED dsaX_cuda_interface.cu + dsaX_cublas_interface.cu dsaX_blas_interface.cu - dsaX_beamformer_correlator.cu + dsaX_beamformer.cu + dsaX_correlator.cpp dsaX_utils.cpp + dsaX_psrdada_utils.cpp ) if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA) @@ -19,13 +22,13 @@ if(DSA_XENGINE_TARGET_TYPE STREQUAL CPU) endif() if(CUDAToolkit_FOUND) - target_link_libraries(dsa CUDA::cudart) + target_link_libraries(dsa PUBLIC CUDA::cudart) endif() if(DSA_XENGINE_ENABLE_PSRDADA) include_directories(${PSRDada_SOURCE_DIR}/src) set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so) - target_link_libraries(dsa ${PSRDada_LIB}) + target_link_libraries(dsa PUBLIC ${PSRDada_LIB}) endif() if(DSA_XENGINE_ENABLE_XGPU) @@ -58,11 +61,6 @@ if(DSA_XENGINE_ENABLE_CUTLASS) target_link_libraries(dsaX_cutlass_interface ${NvidiaCutlass_LIB}) #--------------------------------------- endif() - -if(CUDAToolkit_FOUND) - #add_executable(dsaX_beamformer_correlator dsaX_beamformer_correlator.cu) - #target_link_libraries(dsaX_beamformer_correlator ${dsa} ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) -endif() #--------------------- # install step for libraray @@ -75,7 +73,6 @@ install(TARGETS ) #----------------------------- - # install step for executables #----------------------------- install(TARGETS @@ -85,3 +82,8 @@ install(TARGETS bin ) #----------------------------- + +if(CUDAToolkit_FOUND) + add_executable(dsaX_beamformer_correlator_exe dsaX_beamformer_correlator_exe.cu) + target_link_libraries(dsaX_beamformer_correlator_exe PUBLIC dsa ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) +endif() diff --git a/src/dsaX_beamformer.cu b/src/dsaX_beamformer.cu new file mode 100644 index 0000000..0d7b1df --- /dev/null +++ b/src/dsaX_beamformer.cu @@ -0,0 +1,168 @@ +// -*- c++ -*- +/* assumes input and output block size is appropriate - 
will seg fault otherwise*/ +/* +Workflow is similar for BF and corr applications + - copy data to GPU, convert to half-precision and calibrate while reordering + - do matrix operations to populate large output vector + */ + +#include + +#include "dsaX_def.h" +#include "dsaX.h" +#include "dsaX_blas_interface.h" +#include "dsaX_utils.h" +#include "dsaX_psrdada_utils.h" +#ifdef DSA_XENGINE_TARGET_CUDA +#include "dsaX_cuda_interface.h" +#endif + +using namespace std; + +int DEBUG = 1; + +void usage() { + fprintf (stdout, + "dsaX_beamformer_correlator [options]\n" + " -c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -i in_key [default REORDER_BLOCK_KEY]\n" + " -o out_key [default XGPU_BLOCK_KEY]\n" + " -b run beamformer [default is to run correlator]\n" + " -h print usage\n" + " -t binary file for test mode\n" + " -f flagants file\n" + " -a calib file\n" + " -s start frequency (assumes -0.244140625MHz BW)\n"); +} + + +/* +Beamformer: + - initial data is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] + - split into EW and NS antennas via cudaMemcpy: [NPACKETS_PER_BLOCK, NANTS/2, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] + - want [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, NANTS/2, 8chan, 2 times, 2 pol, 4-bit complex] +(single transpose operation) + - weights are [NCHAN_PER_PACKET/8, NBEAMS, 4tim, NANTS/2, 8chan, 2 times, 2 pol] x 2 + - then fluff and run beamformer: output is [NCHAN_PER_PACKET/8, NBEAMS, NPACKETS_PER_BLOCK/4] (w column-major) + - transpose and done! 
+ +*/ +// beamformer function +void dbeamformer(dmem *d) { + + // gemm settings - recall column major order assumed + // stride over 48 chans + cublasHandle_t cublasH = NULL; + cublasCreate(&cublasH); + cublasOperation_t transa = CUBLAS_OP_T; + cublasOperation_t transb = CUBLAS_OP_N; + const int m = NPACKETS_PER_BLOCK/4; + const int n = NBEAMS/2; + const int k = 4*(NANTS/2)*8*2*2; + const half alpha = 1.; + const half malpha = -1.; + const int lda = k; + const int ldb = k; + const half beta0 = 0.; + const half beta1 = 1.; + const int ldc = m; + const long long int strideA = (NPACKETS_PER_BLOCK)*(NANTS/2)*8*2*2; + const long long int strideB = (NBEAMS/2)*4*(NANTS/2)*8*2*2; + const long long int strideC = (NPACKETS_PER_BLOCK/4)*NBEAMS/2; + const int batchCount = NCHAN_PER_PACKET/8; + long long int i1, i2;//, o1; + + // create streams + cudaStream_t stream; + cudaStreamCreate(&stream); + + // timing + // copy, prepare, cublas, output + clock_t begin, end; + + // do big memcpy + begin = clock(); + dsaXmemcpyHostToDevice(d->d_big_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4); + end = clock(); + d->cp += (float)(end - begin) / CLOCKS_PER_SEC; + + // loop over halves of the array + for (int iArm=0;iArm<2;iArm++) { + + // zero out output arrays + dsaXmemset(d->d_bigbeam_r,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half)); + dsaXmemset(d->d_bigbeam_i,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half)); + cudaDeviceSynchronize(); + + // copy data to device + // initial data: [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] + // final data: need to split by NANTS. 
+ begin = clock(); + for (i1=0; i1d_input+i1*(NANTS/2)*NCHAN_PER_PACKET*4, + d->d_big_input+i1*(NANTS)*NCHAN_PER_PACKET*4+iArm*(NANTS/2)*NCHAN_PER_PACKET*4, + (NANTS/2)*NCHAN_PER_PACKET*4); + end = clock(); + d->cp += (float)(end - begin) / CLOCKS_PER_SEC; + + // do reorder and fluff of data to real and imag + begin = clock(); + + dim3 dimBlock1(16, 8), dimGrid1(NCHAN_PER_PACKET/8/16, (NPACKETS_PER_BLOCK)*(NANTS/2)/16); + transpose_input_bf<<< dimGrid1, dimBlock1 >>>((double *)(d->d_input), (double *)(d->d_tx)); + fluff_input_bf<<>>(d->d_tx, d->d_br, d->d_bi); + + end = clock(); + d->prep += (float)(end - begin) / CLOCKS_PER_SEC; + + // large matrix multiply to get real and imag outputs + // set up for gemm + cublasSetStream(cublasH, stream); + i2 = iArm*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8); // weights offset + + // run strided batched gemm + begin = clock(); + // ac + cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, + &alpha,d->d_br,lda,strideA, + d->weights_r+i2,ldb,strideB,&beta0, + d->d_bigbeam_r,ldc,strideC, + batchCount); + // -bd + cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, + &malpha,d->d_bi,lda,strideA, + d->weights_i+i2,ldb,strideB,&beta1, + d->d_bigbeam_r,ldc,strideC, + batchCount); + // bc + cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, + &alpha,d->d_bi,lda,strideA, + d->weights_r+i2,ldb,strideB,&beta0, + d->d_bigbeam_i,ldc,strideC, + batchCount); + // ad + cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, + &alpha,d->d_br,lda,strideA, + d->weights_i+i2,ldb,strideB,&beta1, + d->d_bigbeam_i,ldc,strideC, + batchCount); + + cudaDeviceSynchronize(); + end = clock(); + d->cubl += (float)(end - begin) / CLOCKS_PER_SEC; + + // simple formation of total power and scaling to 8-bit in transpose kernel + begin = clock(); + dim3 dimBlock(16, 8), dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16); + 
transpose_scale_bf<<>>(d->d_bigbeam_r,d->d_bigbeam_i,d->d_bigpower+iArm*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)); + end = clock(); + d->outp += (float)(end - begin) / CLOCKS_PER_SEC; + } + + cudaStreamDestroy(stream); + cublasDestroy(cublasH); + + // form sum over times + //sum_beam<<<24576,512>>>(d->d_bigpower,d->d_chscf); +} diff --git a/src/dsaX_beamformer_correlator.cu b/src/dsaX_beamformer_correlator.cu deleted file mode 100644 index ddbc73c..0000000 --- a/src/dsaX_beamformer_correlator.cu +++ /dev/null @@ -1,612 +0,0 @@ -// -*- c++ -*- -/* assumes input and output block size is appropriate - will seg fault otherwise*/ -/* -Workflow is similar for BF and corr applications - - copy data to GPU, convert to half-precision and calibrate while reordering - - do matrix operations to populate large output vector - */ - -#include "dsaX_def.h" -#include "dsaX.h" -#include "dsaX_blas_interface.h" -#include "dsaX_utils.h" -#include "dsaX_blas_interface.h" -#ifdef DSA_XENGINE_TARGET_CUDA -#include "dsaX_cuda_interface.h" -#endif - -int DEBUG = 1; - -void dsaX_dbgpu_cleanup(dada_hdu_t * in, dada_hdu_t * out) -{ - if (dada_hdu_unlock_read (in) < 0) syslog(LOG_ERR, "could not unlock read on hdu_in"); - dada_hdu_destroy (in); - - if (dada_hdu_unlock_write (out) < 0) syslog(LOG_ERR, "could not unlock write on hdu_out"); - dada_hdu_destroy (out); - -} - -void usage() { - fprintf (stdout, - "dsaX_beamformer_correlator [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -i in_key [default REORDER_BLOCK_KEY]\n" - " -o out_key [default XGPU_BLOCK_KEY]\n" - " -b run beamformer [default is to run correlator]\n" - " -h print usage\n" - " -t binary file for test mode\n" - " -f flagants file\n" - " -a calib file\n" - " -s start frequency (assumes -0.244140625MHz BW)\n"); -} - -// correlator function -// workflow: copy to device, reorder, stridedBatchedGemm, reorder -void dcorrelator(dmem *d) { - - // copy to device 
- dsaXmemcpyHostToDevice(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); - - // zero out output arrays - dsaXmemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half)); - dsaXmemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half)); - dsaXmemset(d->d_output, 0, NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float)); - - // reorder input - reorder_input_device(d->d_input, d->d_tx, d->d_r, d->d_i); - - dsaXBLASParam blas_param; - // gemm settings - // input: [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS] - // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS] - blas_param.trans_a = DSA_BLAS_OP_N; - blas_param.trans_b = DSA_BLAS_OP_T; - blas_param.m = NANTS; - blas_param.n = NANTS; - blas_param.k = NPACKETS_PER_BLOCK/halfFac; - blas_param.alpha = 1.0; - blas_param.lda = blas_param.m; - blas_param.ldb = blas_param.n; - blas_param.beta = 0.; - blas_param.ldc = blas_param.m; - blas_param.a_stride = NPACKETS_PER_BLOCK*NANTS/halfFac; - blas_param.b_stride = NPACKETS_PER_BLOCK*NANTS/halfFac; - blas_param.c_stride = NANTS*NANTS; - blas_param.batch_count = NCHAN_PER_PACKET*2*2*halfFac; - - // Perform GEMM accoring to back end configuration - dsaXHgemmStridedBatched(d->d_r, d->d_i, d->d_outr, d->d_outi, blas_param); - - /* - // ABSTRACT HERE START - // ABSTRACT HERE END - */ - - // reorder output data - reorder_output_device(d); -} - -/* -Beamformer: - - initial data is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] - - split into EW and NS antennas via cudaMemcpy: [NPACKETS_PER_BLOCK, NANTS/2, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] - - want [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, NANTS/2, 8chan, 2 times, 2 pol, 4-bit complex] -(single transpose operation) - - weights are [NCHAN_PER_PACKET/8, NBEAMS, 4tim, NANTS/2, 8chan, 2 times, 2 pol] x 2 - - then fluff and run beamformer: output is [NCHAN_PER_PACKET/8, NBEAMS, NPACKETS_PER_BLOCK/4] (w column-major) - - 
transpose and done! - -*/ -// beamformer function -void dbeamformer(dmem * d) { - - // gemm settings - recall column major order assumed - // stride over 48 chans - cublasHandle_t cublasH = NULL; - cublasCreate(&cublasH); - cublasOperation_t transa = CUBLAS_OP_T; - cublasOperation_t transb = CUBLAS_OP_N; - const int m = NPACKETS_PER_BLOCK/4; - const int n = NBEAMS/2; - const int k = 4*(NANTS/2)*8*2*2; - const half alpha = 1.; - const half malpha = -1.; - const int lda = k; - const int ldb = k; - const half beta0 = 0.; - const half beta1 = 1.; - const int ldc = m; - const long long int strideA = (NPACKETS_PER_BLOCK)*(NANTS/2)*8*2*2; - const long long int strideB = (NBEAMS/2)*4*(NANTS/2)*8*2*2; - const long long int strideC = (NPACKETS_PER_BLOCK/4)*NBEAMS/2; - const int batchCount = NCHAN_PER_PACKET/8; - long long int i1, i2;//, o1; - - // create streams - cudaStream_t stream; - cudaStreamCreate(&stream); - - // timing - // copy, prepare, cublas, output - clock_t begin, end; - - // do big memcpy - begin = clock(); - dsaXmemcpyHostToDevice(d->d_big_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4); - end = clock(); - d->cp += (float)(end - begin) / CLOCKS_PER_SEC; - - // loop over halves of the array - for (int iArm=0;iArm<2;iArm++) { - - // zero out output arrays - dsaXmemset(d->d_bigbeam_r,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half)); - dsaXmemset(d->d_bigbeam_i,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half)); - cudaDeviceSynchronize(); - - // copy data to device - // initial data: [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] - // final data: need to split by NANTS. 
- begin = clock(); - for (i1=0;i1d_input+i1*(NANTS/2)*NCHAN_PER_PACKET*4,d->d_big_input+i1*(NANTS)*NCHAN_PER_PACKET*4+iArm*(NANTS/2)*NCHAN_PER_PACKET*4,(NANTS/2)*NCHAN_PER_PACKET*4,cudaMemcpyDeviceToDevice); - end = clock(); - d->cp += (float)(end - begin) / CLOCKS_PER_SEC; - - // do reorder and fluff of data to real and imag - begin = clock(); - dim3 dimBlock1(16, 8), dimGrid1(NCHAN_PER_PACKET/8/16, (NPACKETS_PER_BLOCK)*(NANTS/2)/16); - transpose_input_bf<<>>((double *)(d->d_input),(double *)(d->d_tx)); - fluff_input_bf<<>>(d->d_tx,d->d_br,d->d_bi); - end = clock(); - d->prep += (float)(end - begin) / CLOCKS_PER_SEC; - - // large matrix multiply to get real and imag outputs - // set up for gemm - cublasSetStream(cublasH, stream); - i2 = iArm*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8); // weights offset - - // run strided batched gemm - begin = clock(); - // ac - cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, - &alpha,d->d_br,lda,strideA, - d->weights_r+i2,ldb,strideB,&beta0, - d->d_bigbeam_r,ldc,strideC, - batchCount); - // -bd - cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, - &malpha,d->d_bi,lda,strideA, - d->weights_i+i2,ldb,strideB,&beta1, - d->d_bigbeam_r,ldc,strideC, - batchCount); - // bc - cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, - &alpha,d->d_bi,lda,strideA, - d->weights_r+i2,ldb,strideB,&beta0, - d->d_bigbeam_i,ldc,strideC, - batchCount); - // ad - cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, - &alpha,d->d_br,lda,strideA, - d->weights_i+i2,ldb,strideB,&beta1, - d->d_bigbeam_i,ldc,strideC, - batchCount); - - cudaDeviceSynchronize(); - end = clock(); - d->cubl += (float)(end - begin) / CLOCKS_PER_SEC; - - - // simple formation of total power and scaling to 8-bit in transpose kernel - begin = clock(); - dim3 dimBlock(16, 8), dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16); - 
transpose_scale_bf<<>>(d->d_bigbeam_r,d->d_bigbeam_i,d->d_bigpower+iArm*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)); - end = clock(); - d->outp += (float)(end - begin) / CLOCKS_PER_SEC; - } - - cudaStreamDestroy(stream); - cublasDestroy(cublasH); - - // form sum over times - //sum_beam<<<24576,512>>>(d->d_bigpower,d->d_chscf); - -} - - -// MAIN -#if 0 -int main (int argc, char *argv[]) { - - cudaSetDevice(0); - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_bfCorr", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - /* DADA Header plus Data Unit */ - dada_hdu_t* hdu_in = 0; - dada_hdu_t* hdu_out = 0; - - // data block HDU keys - key_t in_key = REORDER_BLOCK_KEY; - key_t out_key = XGPU_BLOCK_KEY; - - // command line arguments - int core = -1; - int arg = 0; - int bf = 0; - int test = 0; - char ftest[200], fflagants[200], fcalib[200]; - float sfreq = 1498.75; - - while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:bdh")) != -1) - { - switch (arg) - { - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'i': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 't': - if (optarg) - { - test = 1; - syslog(LOG_INFO, "test mode"); - if (sscanf (optarg, "%s", &ftest) != 1) { - syslog(LOG_ERR, "could not read test file name from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - 
syslog(LOG_ERR,"-t flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'a': - if (optarg) - { - syslog(LOG_INFO, "read calib file %s",optarg); - if (sscanf (optarg, "%s", &fcalib) != 1) { - syslog(LOG_ERR, "could not read calib file name from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-a flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'f': - if (optarg) - { - syslog(LOG_INFO, "reading flag ants file %s",optarg); - if (sscanf (optarg, "%s", &fflagants) != 1) { - syslog(LOG_ERR, "could not read flagants file name from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-f flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 's': - if (optarg) - { - sfreq = atof(optarg); - syslog(LOG_INFO, "start freq %g",sfreq); - break; - } - else - { - syslog(LOG_ERR,"-s flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'd': - //DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - case 'b': - bf=1; - syslog (LOG_NOTICE, "Running beamformer, NOT correlator"); - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - // Bind to cpu core - if (core >= 0) - { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - // allocate device memory - dmem d; - initialize_device_memory(&d,bf); - - // set up for beamformer - FILE *ff; - int iii; - if (bf) { - - if (!(ff=fopen(fflagants,"r"))) { - syslog(LOG_ERR,"could not open flagants file\n"); - exit(1); - } - d.nflags=0; - while (!feof(ff)) { - fscanf(ff,"%d\n",&d.flagants[iii]); - d.nflags++; - } - fclose(ff); - - if (!(ff=fopen(fcalib,"rb"))) { - syslog(LOG_ERR,"could not open calibss file\n"); - exit(1); - } - fread(d.h_winp,NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2,4,ff); - fclose(ff); - - for (iii=0;iii<(NCHAN_PER_PACKET/8);iii++) - d.h_freqs[iii] = 
1e6*(sfreq-iii*250./1024.); - dsaXmemcpyHostToDevice(d.d_freqs, d.h_freqs, sizeof(float)*(NCHAN_PER_PACKET/8)); - - // calculate weights - calc_weights(&d); - - } - - // test mode - FILE *fin, *fout; - uint64_t output_size; - char * output_data;//, * o1; - if (test) { - - // read one block of input data - d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); - for (int i=0;i<512;i++) { - fin = fopen(ftest,"rb"); - fread(d.h_input+i*4*NANTS*NCHAN_PER_PACKET*2*2,4*NANTS*NCHAN_PER_PACKET*2*2,1,fin); - fclose(fin); - } - - // run correlator or beamformer, and output data - if (bf==0) { - if (DEBUG) syslog(LOG_INFO,"run correlator"); - dcorrelator(&d); - if (DEBUG) syslog(LOG_INFO,"copy to host"); - output_size = NBASE*NCHAN_PER_PACKET*2*2*4; - output_data = (char *)malloc(output_size); - dsaXmemcpyDeviceToHost(output_data, d.d_output, output_size); - - fout = fopen("output.dat","wb"); - fwrite((float *)output_data,sizeof(float),NBASE*NCHAN_PER_PACKET*2*2,fout); - fclose(fout); - } - else { - if (DEBUG) syslog(LOG_INFO,"run beamformer"); - dbeamformer(&d); - if (DEBUG) syslog(LOG_INFO,"copy to host"); - output_size = (NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS; - output_data = (char *)malloc(output_size); - dsaXmemcpyDeviceToHost(output_data, d.d_bigpower, output_size); - - /*output_size = 2*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8); - o1 = (char *)malloc(output_size); - cudaMemcpy(o1,d.weights_r,output_size,cudaMemcpyDeviceToHost);*/ - - - - fout = fopen("output.dat","wb"); - fwrite((unsigned char *)output_data,sizeof(unsigned char),output_size,fout); - //fwrite(o1,1,output_size,fout); - fclose(fout); - } - - - // free - free(d.h_input); - free(output_data); - //free(o1); - deallocate_device_memory(&d,bf); - - exit(1); - } - - - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog 
(LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (0); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); - if (bf==0) - syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,NBASE*NCHAN_PER_PACKET*2*2*4); - else - syslog(LOG_INFO, 
"main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS); - uint64_t bytes_read = 0; - //char * block; - char * output_buffer; - output_buffer = (char *)malloc(block_out); - uint64_t written, block_id; - - // get things started - bool observation_complete=0; - //bool started = 0; - syslog(LOG_INFO, "starting observation"); - int blocks = 0; - //clock_t begin, end; - //double time_spent; - - while (!observation_complete) { - - if (DEBUG) syslog(LOG_INFO,"reading block"); - - // open block - d.h_input = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - - // do stuff - //begin = clock(); - // loop - if (bf==0) { - if (DEBUG) syslog(LOG_INFO,"run correlator"); - dcorrelator(&d); - if (DEBUG) syslog(LOG_INFO,"copy to host"); - dsaXmemcpyDeviceToHost(output_buffer, d.d_output, block_out); - } - else { - if (DEBUG) syslog(LOG_INFO,"run beamformer"); - dbeamformer(&d); - if (DEBUG) syslog(LOG_INFO,"copy to host"); - dsaMXmemcpyDeviceToHost(output_buffer, d.d_bigpower, block_out); - } - //end = clock(); - //time_spent = (double)(end - begin) / CLOCKS_PER_SEC; - cout << "spent time " << d.cp << " " << d.prep << " " << d.cubl << " " << d.outp << " s" << endl; - - // write to output - - // write to host - written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out); - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - if (DEBUG) syslog(LOG_INFO, "written block %d",blocks); - blocks++; - // loop end - - - // finish up - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - // finish up - free(output_buffer); - deallocate_device_memory(&d,bf); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - -} -#endif - diff --git a/src/dsaX_blas_interface.cu 
b/src/dsaX_blas_interface.cu index 430ba9e..7e49fcb 100644 --- a/src/dsaX_blas_interface.cu +++ b/src/dsaX_blas_interface.cu @@ -3,7 +3,7 @@ void dsaXHgemmStridedBatched(void *real_in, void *imag_in, void *real_out, void *imag_out, dsaXBLASParam param) { #ifdef DSA_XENGINE_TARGET_CUDA - dsaXHgemmStridedBatchedCuda(real_in, imag_in, real_out, imag_out, param); + dsaXHgemmStridedBatchedCuda((half*)real_in, (half*)imag_in, (half*)real_out, (half*)imag_out, param); #else std::cout "Not implemented" << std::endl; exit(0); diff --git a/src/dsaX_correlator.cpp b/src/dsaX_correlator.cpp new file mode 100644 index 0000000..d2223f5 --- /dev/null +++ b/src/dsaX_correlator.cpp @@ -0,0 +1,59 @@ +// -*- c++ -*- +/* assumes input and output block size is appropriate - will seg fault otherwise*/ +/* +Workflow is similar for BF and corr applications + - copy data to GPU, convert to half-precision and calibrate while reordering + - do matrix operations to populate large output vector + */ + +#include + +#include "dsaX_def.h" +#include "dsaX.h" +#include "dsaX_blas_interface.h" +#include "dsaX_utils.h" +#include "dsaX_psrdada_utils.h" +#include "dsaX_cuda_interface.h" + +// correlator function +// workflow: copy to device, reorder, stridedBatchedGemm, reorder +// DMH CUDA references excised +void dcorrelator(dmem *d) { + + // copy to device + dsaXmemcpyHostToDevice(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); + + // zero out output arrays + dsaXmemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half)); + dsaXmemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half)); + dsaXmemset(d->d_output, 0, NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float)); + + // reorder input + reorder_input_device(d->d_input, d->d_tx, d->d_r, d->d_i); + + dsaXBLASParam blas_param; + // gemm settings + // input: [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS] + // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS] + blas_param.trans_a = 
DSA_BLAS_OP_N; + blas_param.trans_b = DSA_BLAS_OP_T; + blas_param.m = NANTS; + blas_param.n = NANTS; + blas_param.k = NPACKETS_PER_BLOCK/halfFac; + blas_param.alpha = 1.0; + blas_param.lda = blas_param.m; + blas_param.ldb = blas_param.n; + blas_param.beta = 0.; + blas_param.ldc = blas_param.m; + blas_param.a_stride = NPACKETS_PER_BLOCK*NANTS/halfFac; + blas_param.b_stride = NPACKETS_PER_BLOCK*NANTS/halfFac; + blas_param.c_stride = NANTS*NANTS; + blas_param.batch_count = NCHAN_PER_PACKET*2*2*halfFac; + + // Perform GEMM accoring to back end configuration + dsaXHgemmStridedBatched(d->d_r, d->d_i, d->d_outr, d->d_outi, blas_param); + + // reorder output data + reorder_output_device(d); + +} diff --git a/src/dsaX_cublas_interface.cu b/src/dsaX_cublas_interface.cu index 4631516..df6b3de 100644 --- a/src/dsaX_cublas_interface.cu +++ b/src/dsaX_cublas_interface.cu @@ -1,6 +1,9 @@ +#include #include "dsaX_cublas_interface.h" -void dsaXHgemmStridedBatchedCuda(void *real_in, void *imag_in, void *real_out, void *imag_out, dsaXBLASParam param) { +using namespace std; + +void dsaXHgemmStridedBatchedCuda(half *real_in, half *imag_in, half *real_out, half *imag_out, dsaXBLASParam blas_param) { #ifdef DSA_XENGINE_TARGET_CUDA // not sure if essential @@ -24,7 +27,7 @@ void dsaXHgemmStridedBatchedCuda(void *real_in, void *imag_in, void *real_out, v case DSA_BLAS_OP_C: transa = CUBLAS_OP_C; break; default: - std::cout << "Unknown cublas transpose" << std::end; + std::cout << "Unknown cublas transpose" << std::endl; } switch (blas_param.trans_b) { @@ -35,14 +38,14 @@ void dsaXHgemmStridedBatchedCuda(void *real_in, void *imag_in, void *real_out, v case DSA_BLAS_OP_C: transb = CUBLAS_OP_C; break; default: - std::cout << "Unknown cublas transpose" << std::end; + std::cout << "Unknown cublas transpose" << std::endl; } const int m = blas_param.m; const int n = blas_param.n; const int k = blas_param.k; const half alpha = blas_param.alpha.real(); - const half malpha = -1.0 * alpha; + const 
half malpha = -1.0 * blas_param.alpha.real(); const int lda = blas_param.lda; const int ldb = blas_param.ldb; const half beta0 = blas_param.beta.real(); @@ -56,27 +59,27 @@ void dsaXHgemmStridedBatchedCuda(void *real_in, void *imag_in, void *real_out, v // run strided batched gemm for datatype (a + ib)(c + id) // ac cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, - &alpha,d->d_r,lda,strideA, - d->d_r,ldb,strideB,&beta0, - d->d_outr,ldc,strideC, + &alpha,real_in,lda,strideA, + real_in,ldb,strideB,&beta0, + real_out,ldc,strideC, batchCount); // bd cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, - &alpha,d->d_i,lda,strideA, - d->d_i,ldb,strideB,&beta1, - d->d_outr,ldc,strideC, + &alpha,imag_in,lda,strideA, + imag_in,ldb,strideB,&beta1, + real_out,ldc,strideC, batchCount); // -bc cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, - &malpha,d->d_i,lda,strideA, - d->d_r,ldb,strideB,&beta0, - d->d_outi,ldc,strideC, + &malpha,imag_in,lda,strideA, + real_in,ldb,strideB,&beta0, + imag_out,ldc,strideC, batchCount); // ad cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, - &alpha,d->d_r,lda,strideA, - d->d_i,ldb,strideB,&beta1, - d->d_outi,ldc,strideC, + &alpha,real_in,lda,strideA, + imag_in,ldb,strideB,&beta1, + imag_out,ldc,strideC, batchCount); // shown to be essential diff --git a/src/dsaX_cuda_interface.cu b/src/dsaX_cuda_interface.cu index 31e44d0..d1f77a4 100644 --- a/src/dsaX_cuda_interface.cu +++ b/src/dsaX_cuda_interface.cu @@ -1,7 +1,7 @@ #include "dsaX_cuda_interface.h" // allocate device memory -void initialize_device_memory(dmem * d, int bf) { +void initialize_device_memory(dmem *d, int bf) { // for correlator if (bf==0) { @@ -45,9 +45,8 @@ void initialize_device_memory(dmem * d, int bf) { } } - // deallocate device memory -void deallocate_device_memory(dmem * d, int bf) { +void deallocate_device_memory(dmem *d, int bf) { cudaFree(d->d_input); @@ -149,25 +148,49 @@ void reorder_output_device(dmem * d) { fout=fopen("test3.test","wb"); 
fwrite(odata,sizeof(char),384*4*NBASE*4,fout); fclose(fout);*/ - cudaFree(d_idxs); free(h_idxs); //cudaStreamDestroy(stream); - } // kernel to fluff input // run with 128 threads and NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 blocks __global__ void corr_input_copy(char *input, half *inr, half *ini) { - int bidx = blockIdx.x; // assume NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 - int tidx = threadIdx.x; // assume 128 + int bidx = blockIdx.x; // assume NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 + int tidx = threadIdx.x; // assume 128 threads per block int iidx = bidx*128+tidx; - - inr[iidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(15)) << 4) >> 4)); - ini[iidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4)); + // 15 in unsigned char binary is 00001111. Perform bitwise & on 15 and input char data iiiirrrr + // to get real part 4 bit data. + // 0000rrrr + // Bit shift this result by 4 to the left. + // rrrr0000 + // Cast to signed char. + // +-rrr0000 + // Bitshift mantisa only to the right by 4 bits + // +-0000rrr + // Cast to float and use CUDA intrinsic to cast to signed half + inr[iidx] = __float2half((float)((char)(( (unsigned char)(input[iidx]) & (unsigned char)(15) ) << 4) >> 4)); + + // 240 in unsigned char binary is 11110000. Perform bitwise & on 240 and input char data iiiirrrr + // to get imag part 4 bit data + // iiii0000. + // Cast to signed char + // +-iii0000 + // Bitshift mantisa only to the right by 4 bits + // +-0000iii + // Cast to float and use CUDA intrinsic to cast to signed half + ini[iidx] = __float2half((float)((char)(( (unsigned char)(input[iidx]) & (unsigned char)(240) )) >> 4)); + + // Both results should be half (FP16) integers between -8 and 7. 
+ half re = inr[iidx]; + half im = ini[iidx]; + half lim = 2.; + if( (re > lim || re < -lim) || (im > lim || im < -lim)) { + //printf("re = %f, im = %f\n", __half2float(re), __half2float(im)); + } } // transpose kernel @@ -206,8 +229,8 @@ void reorder_input_device(char *input, char * tx, half *inr, half *ini) { // transpose input data dim3 dimBlock(32, 8), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32); - transpose_matrix<<>>(input,tx); - corr_input_copy<<>>(tx,inr,ini); + transpose_matrix<<>>(input, tx); + corr_input_copy<<>>(tx, inr, ini); } // kernel to help with reordering output @@ -227,7 +250,8 @@ __global__ void corr_output_copy(half *outr, half *outi, float *output, int *ind int pol = (int)(chpol % 2); float v1=0., v2=0.; - + + // Use CUDA casting intrinsic __half2float for (int i=0;i> 4))); di[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(240))) >> 4))); + + // Both results should be half (FP16) integers between -8 and 7. 
+ //half re = dr[idx]; + //half im = di[idx]; + //half lim = 0; + //if( (re > lim || re < -lim) || (im > lim || im < -lim)) { + //printf("re = %f, im = %f\n", __half2float(re), __half2float(im)); + //} + } diff --git a/src/dsaX_utils.cpp b/src/dsaX_utils.cpp index 46abfc9..fc0345a 100644 --- a/src/dsaX_utils.cpp +++ b/src/dsaX_utils.cpp @@ -28,3 +28,12 @@ void dsaXmemcpyDeviceToHost(void *array_host, void *array_device, size_t n){ memcpy(array_host, array_device, n); #endif } + +void dsaXmemcpyDeviceToDevice(void *array_copy_to, void *array_copy_from, size_t n){ +#ifdef DSA_XENGINE_TARGET_CUDA + // Perform device to device memcopy on data + cudaMemcpy(array_copy_to, array_copy_from, n, cudaMemcpyDeviceToDevice); +#else + memcpy(array_copy_to, array_copy_from, n); +#endif +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9d29854..4a45a24 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,5 +1,5 @@ - -#include_directories(../include) +#DMH: fix include path +include_directories(../include) include_directories(${CLI11_SOURCE_DIR}/include/CLI) -add_executable(dsaX_beamformer_correlator_test dsaX_beamformer_correlator_test.cpp) +add_executable(dsaX_correlator_test dsaX_correlator_test.cpp) diff --git a/tests/dsaX_beamformer_correlator_test.cpp b/tests/dsaX_beamformer_correlator_test.cpp deleted file mode 100644 index 3e723d0..0000000 --- a/tests/dsaX_beamformer_correlator_test.cpp +++ /dev/null @@ -1,399 +0,0 @@ -#include -#include -#include -#include -#include - -// Include the dsaX.h header in your application -//#include - -int main(int argc, char **argv) { - - /* - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_bfCorr", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - // DADA Header plus Data Unit - dada_hdu_t* hdu_in = 0; - dada_hdu_t* hdu_out = 0; - - // data block HDU keys - key_t in_key = REORDER_BLOCK_KEY; - key_t out_key = XGPU_BLOCK_KEY; - - // 
command line arguments - int core = -1; - int arg = 0; - int bf = 0; - int test = 0; - char ftest[200], fflagants[200], fcalib[200]; - float sfreq = 1498.75; - - while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:bdh")) != -1) - { - switch (arg) - { - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'i': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 't': - if (optarg) - { - test = 1; - syslog(LOG_INFO, "test mode"); - if (sscanf (optarg, "%s", &ftest) != 1) { - syslog(LOG_ERR, "could not read test file name from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-t flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'a': - if (optarg) - { - syslog(LOG_INFO, "read calib file %s",optarg); - if (sscanf (optarg, "%s", &fcalib) != 1) { - syslog(LOG_ERR, "could not read calib file name from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-a flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'f': - if (optarg) - { - syslog(LOG_INFO, "reading flag ants file %s",optarg); - if (sscanf (optarg, "%s", &fflagants) != 1) { - syslog(LOG_ERR, "could not read flagants file name from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-f flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 's': - if (optarg) - { - sfreq = 
atof(optarg); - syslog(LOG_INFO, "start freq %g",sfreq); - break; - } - else - { - syslog(LOG_ERR,"-s flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'd': - //DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - case 'b': - bf=1; - syslog (LOG_NOTICE, "Running beamformer, NOT correlator"); - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - // Bind to cpu core - if (core >= 0) { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - - // allocate device memory - dmem d; - initialize_device_memory(&d,bf); - - // set up for beamformer - FILE *ff; - int iii; - if (bf) { - - if (!(ff=fopen(fflagants,"r"))) { - syslog(LOG_ERR,"could not open flagants file\n"); - exit(1); - } - d.nflags=0; - while (!feof(ff)) { - fscanf(ff,"%d\n",&d.flagants[iii]); - d.nflags++; - } - fclose(ff); - - if (!(ff=fopen(fcalib,"rb"))) { - syslog(LOG_ERR,"could not open calibss file\n"); - exit(1); - } - fread(d.h_winp,NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2,4,ff); - fclose(ff); - - for (iii=0;iii<(NCHAN_PER_PACKET/8);iii++) - d.h_freqs[iii] = 1e6*(sfreq-iii*250./1024.); - cudaMemcpy(d.d_freqs,d.h_freqs,sizeof(float)*(NCHAN_PER_PACKET/8),cudaMemcpyHostToDevice); - - // calculate weights - calc_weights(&d); - - } - - // test mode - FILE *fin, *fout; - uint64_t output_size; - char * output_data;//, * o1; - if (test) { - - // read one block of input data - d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); - for (int i=0;i<512;i++) { - fin = fopen(ftest,"rb"); - fread(d.h_input+i*4*NANTS*NCHAN_PER_PACKET*2*2,4*NANTS*NCHAN_PER_PACKET*2*2,1,fin); - fclose(fin); - } - - // run correlator or beamformer, and output data - if (bf==0) { - if (DEBUG) syslog(LOG_INFO,"run correlator"); - dcorrelator(&d); - if (DEBUG) syslog(LOG_INFO,"copy to host"); - output_size = NBASE*NCHAN_PER_PACKET*2*2*4; - output_data = (char 
*)malloc(output_size); - cudaMemcpy(output_data,d.d_output,output_size,cudaMemcpyDeviceToHost); - - fout = fopen("output.dat","wb"); - fwrite((float *)output_data,sizeof(float),NBASE*NCHAN_PER_PACKET*2*2,fout); - fclose(fout); - } - else { - if (DEBUG) syslog(LOG_INFO,"run beamformer"); - dbeamformer(&d); - if (DEBUG) syslog(LOG_INFO,"copy to host"); - output_size = (NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS; - output_data = (char *)malloc(output_size); - cudaMemcpy(output_data,d.d_bigpower,output_size,cudaMemcpyDeviceToHost); - - // output_size = 2*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8); - // o1 = (char *)malloc(output_size); - // cudaMemcpy(o1,d.weights_r,output_size,cudaMemcpyDeviceToHost); - - - - fout = fopen("output.dat","wb"); - fwrite((unsigned char *)output_data,sizeof(unsigned char),output_size,fout); - //fwrite(o1,1,output_size,fout); - fclose(fout); - } - - - // free - free(d.h_input); - free(output_data); - //free(o1); - deallocate_device_memory(&d,bf); - - exit(1); - } - - - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (0); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return 
EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); - if (bf==0) - syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,NBASE*NCHAN_PER_PACKET*2*2*4); - else - syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS); - uint64_t bytes_read = 0; - //char * block; - char * output_buffer; - output_buffer = (char *)malloc(block_out); - uint64_t written, block_id; - - // get things started - bool observation_complete=0; - //bool started = 0; - syslog(LOG_INFO, "starting observation"); - int blocks = 0; - //clock_t begin, end; - //double time_spent; - - while (!observation_complete) { - - if (DEBUG) syslog(LOG_INFO,"reading block"); - - // open block - d.h_input = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - - // do stuff - //begin = clock(); - // loop - if (bf==0) { - if (DEBUG) syslog(LOG_INFO,"run correlator"); 
- dcorrelator(&d); - if (DEBUG) syslog(LOG_INFO,"copy to host"); - cudaMemcpy(output_buffer,d.d_output,block_out,cudaMemcpyDeviceToHost); - } - else { - if (DEBUG) syslog(LOG_INFO,"run beamformer"); - dbeamformer(&d); - if (DEBUG) syslog(LOG_INFO,"copy to host"); - cudaMemcpy(output_buffer,d.d_bigpower,block_out,cudaMemcpyDeviceToHost); - } - //end = clock(); - //time_spent = (double)(end - begin) / CLOCKS_PER_SEC; - cout << "spent time " << d.cp << " " << d.prep << " " << d.cubl << " " << d.outp << " s" << endl; - - // write to output - - // write to host - written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out); - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - if (DEBUG) syslog(LOG_INFO, "written block %d",blocks); - blocks++; - // loop end - - - // finish up - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - // finish up - free(output_buffer); - deallocate_device_memory(&d,bf); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - - return 0; - */ -} diff --git a/tests/dsaX_correlator_test.cpp b/tests/dsaX_correlator_test.cpp new file mode 100644 index 0000000..b0560fc --- /dev/null +++ b/tests/dsaX_correlator_test.cpp @@ -0,0 +1,195 @@ +#include //DMH: replace with CLI +#include +#include +#include +#include +#include +#include +#include + +// Include the dsaX_interface.h header in your application +#include + +using namespace std; + +void usage() { + fprintf (stdout, + "dsaX_beamformer_correlator [options]\n" + " -c if dsaX is CUDA enabled, use this GPU" + " -d send debug messages to syslog\n" + " -i in_key [default REORDER_BLOCK_KEY]\n" + " -o out_key [default XGPU_BLOCK_KEY]\n" + " -h print usage\n" + " -t binary file for test mode\n" + " -f flagants file\n" + " -a calib file\n" + " -s start frequency (assumes -0.244140625MHz BW)\n"); +} + 
+int main(int argc, char **argv) { + + // data block HDU keys + key_t in_key = REORDER_BLOCK_KEY; + key_t out_key = XGPU_BLOCK_KEY; + + // command line arguments + int device_ordinal = 0; + int arg = 0; + int bf = 0; + char ftest[200], fflagants[200], fcalib[200]; + float sfreq = 1498.75; + + while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:bdh")) != -1) { + switch (arg) { + case 'c': + if (optarg) { + device_ordinal = atoi(optarg); + break; + } + else { + syslog(LOG_ERR,"-c flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'i': + if (optarg) { + if (sscanf (optarg, "%x", &in_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } else { + syslog(LOG_ERR,"-i flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'o': + if (optarg) { + if (sscanf (optarg, "%x", &out_key) != 1) { + syslog(LOG_ERR, "could not parse key from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } else { + syslog(LOG_ERR,"-o flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 't': + if (optarg) { + syslog(LOG_INFO, "test mode"); + if (sscanf (optarg, "%s", &ftest) != 1) { + syslog(LOG_ERR, "could not read test file name from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } else { + syslog(LOG_ERR,"-t flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'a': + if (optarg) { + syslog(LOG_INFO, "read calib file %s",optarg); + if (sscanf (optarg, "%s", &fcalib) != 1) { + syslog(LOG_ERR, "could not read calib file name from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } + else { + syslog(LOG_ERR,"-a flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'f': + if (optarg) { + syslog(LOG_INFO, "reading flag ants file %s",optarg); + if (sscanf (optarg, "%s", &fflagants) != 1) { + syslog(LOG_ERR, "could not read flagants file name from %s\n", optarg); + return EXIT_FAILURE; + } + break; + } else + { + syslog(LOG_ERR,"-f flag requires 
argument"); + usage(); + return EXIT_FAILURE; + } + case 's': + if (optarg) { + sfreq = atof(optarg); + syslog(LOG_INFO, "start freq %g",sfreq); + break; + } + else { + syslog(LOG_ERR,"-s flag requires argument"); + usage(); + return EXIT_FAILURE; + } + case 'd': + syslog (LOG_DEBUG, "Will excrete all debug messages"); + break; + case 'h': + usage(); + return EXIT_SUCCESS; + } + } + + std::cout << "NPACKETS_PER_BLOCK = " << NPACKETS_PER_BLOCK << std::endl; + std::cout << "NCHAN = " << NCHAN << std::endl; + std::cout << "NCHAN_PER_PACKET = " << NCHAN_PER_PACKET << std::endl; + std::cout << "NPOL = " << NPOL << std::endl; + std::cout << "NARM = " << 2 << std::endl; + unsigned long long size = sizeof(char); + size *= NPACKETS_PER_BLOCK; + size *= NANTS; + size *= NCHAN_PER_PACKET; + size *= NPOL; + size *= NCOMPLEX; + std::cout << "(bytes) char size * NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*NPOL*NCOMPLEX = " << size << std::endl; + std::cout << "Expected size of data array = " << (unsigned long long)(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*NPOL*NCOMPLEX) << std::endl; + std::cout << "Expected size of input array = " << (unsigned long long)(sizeof(char)*4*NANTS*NCHAN_PER_PACKET*NPOL*NCOMPLEX) << std::endl; + +#if 0 + dsaX_init(); + + // allocate device memory + dmem d; + initialize_device_memory(&d, bf); + + FILE *fin, *fout; + uint64_t output_size; + char * output_data; + + // read one block of input data + d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); + for (int i=0;i<512;i++) { + fin = fopen(ftest,"rb"); + fread(d.h_input+i*4*NANTS*NCHAN_PER_PACKET*2*2,4*NANTS*NCHAN_PER_PACKET*2*2,1,fin); + fclose(fin); + } + + // run correlator or beamformer, and output data + syslog(LOG_INFO,"run correlator"); + dcorrelator(&d); + syslog(LOG_INFO,"copy to host"); + output_size = NBASE*NCHAN_PER_PACKET*2*2*4; + output_data = (char *)malloc(output_size); + 
cudaMemcpy(output_data,d.d_output,output_size,cudaMemcpyDeviceToHost); + + fout = fopen("output.dat","wb"); + fwrite((float *)output_data,sizeof(float),NBASE*NCHAN_PER_PACKET*2*2,fout); + fclose(fout); + + // free + free(d.h_input); + free(output_data); + //free(o1); + deallocate_device_memory(&d,bf); + dsaX_end(); + + return 0; +#endif +} + diff --git a/utils/gen_packet.py b/utils/gen_packet.py new file mode 100644 index 0000000..7ae8ab4 --- /dev/null +++ b/utils/gen_packet.py @@ -0,0 +1,228 @@ +import numpy as np, struct +import matplotlib.pyplot as plt + + +''' The aim here is to make two types of data packets: + - one with a tone at a particular frequency and set of antennas + - one with pure noise + +Structure is 3 ant, 384 chan, 2 time, 2 pol, r/i +4608 bytes long + +''' + + +def make_spectrum(packet,ant=0,pol=0): + + spec = np.zeros(384*2) + + d = np.asarray(struct.unpack('>4608B',packet)) + + # order is 3 antennas x 384 channels x 2 times x 2 pols x real/imag, with every 8 flipped + d = (d.reshape((3,384,2,2)))[ant,:,:,pol].ravel() + + d_r = ((d & 15) << 4) + d_i = d & 240 + d_r = d_r.astype(np.int8)/16 + d_i = d_i.astype(np.int8)/16 + + spec += d_r**2.+d_i**2. + spec = spec.reshape((384,2)).mean(axis=1) + return(spec) + +def plot_spectrum(data,ant=0,pol=0): + + spec = make_spectrum(data,ant=ant,pol=pol) + plt.plot(spec) + plt.xlabel('Channel') + plt.ylabel('Power') + plt.show() + +def make_histogram(packet): + ''' Makes histogram of packet - tested + ''' + + histo = np.zeros(16) + rms = 0. + + d = np.asarray(struct.unpack('>4608B',packet)) + + # order is 3 antennas x 384 channels x 2 times x 2 pols x real/imag, with every 8 flipped + d = (d.reshape((3,384,2,2))).ravel() + + d_r = ((d & 15) << 4) + d_i = d & 240 + d_r = d_r.astype(np.int8)/16 + d_i = d_i.astype(np.int8)/16 + + rms += 0.5*(np.std(d_r)**2.+np.std(d_i)**2.) + + hx = np.arange(16)-8 + + for i in range(384*2): + + histo[int(d_r[i])+8] += 1. + histo[int(d_i[i])+8] += 1. 
+ + return(hx,histo/np.max(histo),np.sqrt(rms)) + +def histo_test(data): + + hx,histo,rms = make_histogram(data) + print('HISTOGRAM: ') + for i in range(16): + print(hx[i],histo[i]) + print() + print('RMS = ',rms) + print() + + +########## MAIN ############ + +# defaults +outfile = 'packet.out' +n_packet = 4608 # 4608 for single packet + +# decide which sort of packet to make +noise = False +tone = True +x16 = False + +# if tone +if tone is True: + + # defaults: + chans = np.arange(384)#np.asarray([10,100,190]) + #ant = 1 + amp_A = 9.0 + amp_B = 4. + + # derived quantities + amp_A = 16.*np.sqrt(amp_A) + amp_B = 16.*np.sqrt(amp_B) + ph = 2.*np.pi*np.random.uniform() + ramp_A = amp_A*np.cos(ph) + iamp_A = amp_A*np.sin(ph) + ph = 2.*np.pi*np.random.uniform() + ramp_B = amp_B*np.cos(ph) + iamp_B = amp_B*np.sin(ph) + + # make packet + real_part = np.zeros(n_packet,dtype='int8') + imag_part = np.zeros(n_packet,dtype='int8') + for ant in [0,1,2]: # 3 antennae + for i in chans: # 384 channels + + # time 1 pol A + j = int(1536*ant + i*4) + real_part[j] = round(ramp_A) + imag_part[j] = round(iamp_A) + + # time 1 pol B + j = int(1536*ant + i*4 + 1) + real_part[j] = round(ramp_B) + imag_part[j] = round(iamp_B) + + # time 2 pol A + j = int(1536*ant + i*4 + 2) + real_part[j] = round(ramp_A) + imag_part[j] = round(iamp_A) + + # time 2 pol B + j = int(1536*ant + i*4 + 3) + real_part[j] = round(ramp_B) + imag_part[j] = round(iamp_B) + + + # make 4-bit versions + real_part = np.cast['uint8'](real_part) + imag_part = np.cast['uint8'](imag_part) + for i in range(n_packet): + real_part[i] = real_part[i] >> 4 + imag_part[i] = (imag_part[i] >> 4) << 4 + + # finish packet + packet = np.zeros(n_packet,dtype='uint8') + for i in range(n_packet): + packet[i] = real_part[i] | imag_part[i] + + # if x16 + if (x16): + + p2 = np.zeros(21*n_packet,dtype='uint8') + for i in range(21): + p2[i*n_packet:(i+1)*n_packet] = packet + + out_str = p2.tobytes() + + else: + + out_str = packet.tobytes() + +# if 
noise +if noise is True: + + # defaults + rms = 1.5 # 4-bit + erms = rms*16. + + # make real and imag parts + real_part = np.zeros(n_packet,dtype='int8') + imag_part = np.zeros(n_packet,dtype='int8') + + for ant in [0, 1, 2]: + for i in np.arange(384): + + # time 1 pol A + j = int(1536*ant + i*4) + real_part[j] = round(np.random.normal()*erms) + imag_part[j] = round(np.random.normal()*erms) + + # time 1 pol B + j = int(1536*ant + i*4 + 1) + real_part[j] = round(np.random.normal()*erms) + imag_part[j] = round(np.random.normal()*erms) + + # time 2 pol A + j = int(1536*ant + i*4 + 2) + real_part[j] = round(np.random.normal()*erms) + imag_part[j] = round(np.random.normal()*erms) + + # time 2 pol B + j = int(1536*ant + i*4 + 3) + real_part[j] = round(np.random.normal()*erms) + imag_part[j] = round(np.random.normal()*erms) + + # make 4-bit versions + real_part = np.cast['uint8'](real_part) + imag_part = np.cast['uint8'](imag_part) + for i in range(n_packet): + real_part[i] = real_part[i] >> 4 + imag_part[i] = (imag_part[i] >> 4) << 4 + + # finish packet + packet = np.zeros(n_packet,dtype='uint8') + for i in range(n_packet): + packet[i] = real_part[i] | imag_part[i] + + out_str = packet.tobytes() + + +newFile = open(outfile, "wb") +newFile.write(out_str) +newFile.close() + + +#plot_spectrum(out_str,pol=1,ant=1) + + + + + + + + + + + + diff --git a/utils/gen_testblock.py b/utils/gen_testblock.py new file mode 100644 index 0000000..b9a3c9e --- /dev/null +++ b/utils/gen_testblock.py @@ -0,0 +1,49 @@ +import numpy as np, struct +import matplotlib.pyplot as plt +import os + + +''' The aim here is to make data blocks to test the bfCorr code. + +Structure of a packet is 3 ant, 384 chan, 2 time, 2 pol, r/i +4608 bytes long + +Structure of a block is [2048 packets, 32 channel groups, ...] 
+ +We want the real and imagniary parts to be random integers over +the range of [-8, 7] +''' + +# defaults +outfile = 'block.out' +if os.path.exists(outfile): + os.remove(outfile) + + +num_packets = 4 +n_antennae = 3 +n_chans = 384 +n_changs = 32 + +# make values in the range vals = [-8, 7] +# [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, NANTS/2, 8chan, 2 times, 2 pol, 4-bit complex] + + +for ipacket in np.arange(num_packets): + + print(ipacket) + for ichang in np.arange(n_changs): + + packet = np.zeros(num_packets*n_changs, dtype='uint8') + for i in np.arange(n_antennae): + for j in np.arange(n_chans): + for k in np.arange(num_packets): + + # we now make a randon integer iunt8 format + idx = ichang + n_changs*ipacket + packet[idx] = np.random.randint(0, 256) + + out_str = packet.tobytes() + newFile = open(outfile, "ab") + newFile.write(out_str) + newFile.close() diff --git a/utils/get_rms.py b/utils/get_rms.py new file mode 100644 index 0000000..8854a36 --- /dev/null +++ b/utils/get_rms.py @@ -0,0 +1,141 @@ +import numpy as np +import sockets as s +import struct +import sys +import matplotlib.pyplot as plt + +# for file writing + +def write_bin(data,fl='test.dat'): + + f = open(fl,'w+b') + for packet in data: + d = bytearray(np.asarray(struct.unpack('>4616B',packet))[8:].astype(np.int8)) + print(len(d)) + f.write(d) + + f.close() + + +# for making histogram of input + +def make_histogram(data,ant=0,pol=0): + + histo = np.zeros(16) + rms = 0. + + for packet in data: + + d = np.asarray(struct.unpack('>4616B',packet))[8:] + + # order is 3 antennas x 384 channels x 2 times x 2 pols x real/imag, with every 8 flipped + d = (d.reshape((3,384,2,2)))[ant,:,:,pol].ravel() + + d_r = ((d & 15) << 4) + d_i = d & 240 + d_r = d_r.astype(np.int8)/16 + d_i = d_i.astype(np.int8)/16 + + rms += 0.5*(np.std(d_r)**2.+np.std(d_i)**2.) + + for i in range(384*2): + + histo[int(d_r[i])+8] += 1. + histo[int(d_i[i])+8] += 1. 
+ + return histo/np.max(histo),np.sqrt(rms) + +# for making spectrum from data +def decode_data(data,ant=0,pol=0): + + spec = np.zeros(384*2) + + for packet in data: + + d = np.asarray(struct.unpack('>4616B',packet))[8:] + + # order is 3 antennas x 384 channels x 2 times x 2 pols x real/imag, with every 8 flipped + d = (d.reshape((3,384,2,2)))[ant,:,:,pol].ravel() + + d_r = ((d & 15) << 4) + d_i = d & 240 + d_r = d_r.astype(np.int8)/16 + d_i = d_i.astype(np.int8)/16 + + spec += d_r**2.+d_i**2. + + spec = spec.reshape((384,2)).mean(axis=1) + return(spec) + +# for decoding packets +def decode_header(data): + + min_s = 10000 + max_s = 0 + + for packet in data: + + d = np.asarray(struct.unpack('>4616B',packet)) + + # packet id + p = 0 + p = p | ((d[4] & 224) >> 5) + p = p | (d[3] << 3) + p = p | (d[2] << 11) + p = p | (d[1] << 19) + p = p | (d[0] << 27) + + # spectrum id + sp = 0 + sp = sp | ((d[4] & 31) << 8) + sp = sp | d[5] + + if (spmax_s): + max_s = sp + + print(p,sp) + + print(min_s,max_s) + +# MAIN + +n = 10000 +ip = '10.41.0.62' +port=4011 +data = s.capture(ip=ip,port=port,n=n) +ant=0 +pol=0 + +#decode_header(data) + +histo,rms = make_histogram(data,ant=ant,pol=pol) +print() +print('RMS:',rms/np.sqrt(1.*n)) +for i in np.arange(16): + print(histo[i],' ',) + +sys.exit() + +spec = decode_data(data,ant=ant,pol=pol) +spec = np.sqrt(spec/n/2.) 
+print() +print('Have spectral points',len(spec)) +print() +#for i in np.arange(len(spec)): +# print(spec[i],' ',) + +plt.plot(spec) +plt.show() + + + + + + + + + + + diff --git a/utils/get_rms_packet.py b/utils/get_rms_packet.py new file mode 100644 index 0000000..f75d278 --- /dev/null +++ b/utils/get_rms_packet.py @@ -0,0 +1,36 @@ +import socket, numpy as np +from progress.bar import Bar +import sockets as s +import struct +import sys +import matplotlib.pyplot as plt + +# ip as string, port as int, buf as int +def capture(n=100,ip=None,port=None,buf=4616): + + if ip is None: + print('No IP') + return() + + if port is None: + print('No port') + return() + + sock = socket.socket(socket.AF_INET,socket.SOCK_DGRAM) + sock.bind((ip,port)) + + captured=0 + packs = [] + bar = Bar('Capturing '+str(n)+' packets...', max=n) + while captured Date: Tue, 25 Jun 2024 21:45:10 -0700 Subject: [PATCH 20/30] Major code refactorisation --- CMakeLists.txt | 15 +- include/CMakeLists.txt | 4 +- include/dsaX.h | 105 +++--- include/dsaX_blas_interface.h | 4 +- include/dsaX_cublas_interface.h | 3 +- include/dsaX_cuda_interface.h | 33 +- include/dsaX_cuda_kernels.h | 260 ++++++++++++++ include/dsaX_cutlass_interface.h | 6 +- include/dsaX_enums.h | 30 +- include/dsaX_ftd.h | 5 + include/dsaX_interface.h | 12 + include/dsaX_magma_interface.h | 4 + include/dsaX_utils.h | 6 +- src/11_planar_complex_array.cu | 1 - src/CMakeLists.txt | 72 +++- src/dsaX_beamformer.cpp | 120 +++++++ src/dsaX_beamformer.cu | 168 --------- src/dsaX_blas_interface.cpp | 28 ++ src/dsaX_blas_interface.cu | 11 - src/dsaX_correlator.cpp | 18 +- src/dsaX_cublas_interface.cu | 60 ++-- src/dsaX_cuda_interface.cu | 318 ++++------------ src/dsaX_interface.cpp | 69 ++++ src/dsaX_magma_interface.cu | 23 ++ src/dsaX_psrdada_utils.cpp | 11 + src/dsaX_utils.cpp | 32 +- src/version.cpp | 5 + tests/CMakeLists.txt | 5 +- tests/CMakeLists.txt~ | 5 - tests/command_line_params.cpp | 17 + tests/dsaX_beamformer_correlator_test.cpp~ | 398 
--------------------- tests/dsaX_correlator_test.cpp | 58 ++- 32 files changed, 889 insertions(+), 1017 deletions(-) create mode 100644 include/dsaX_cuda_kernels.h create mode 100644 include/dsaX_ftd.h create mode 100644 include/dsaX_interface.h create mode 100644 include/dsaX_magma_interface.h create mode 100644 src/dsaX_beamformer.cpp delete mode 100644 src/dsaX_beamformer.cu create mode 100644 src/dsaX_blas_interface.cpp delete mode 100644 src/dsaX_blas_interface.cu create mode 100644 src/dsaX_interface.cpp create mode 100644 src/dsaX_magma_interface.cu create mode 100644 src/dsaX_psrdada_utils.cpp create mode 100644 src/version.cpp delete mode 100644 tests/CMakeLists.txt~ create mode 100644 tests/command_line_params.cpp delete mode 100644 tests/dsaX_beamformer_correlator_test.cpp~ diff --git a/CMakeLists.txt b/CMakeLists.txt index 441ae7f..acfd1a3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) set(CMAKE_CXX_EXTENSIONS ON) # Define the project -project("DSA_XENGINE" VERSION 1.0.0 LANGUAGES CXX CUDA C) +project("DSA_XENGINE" VERSION 1.0.0 LANGUAGES C CXX) # For GCC 8 and lower, set -pthread flag manually set(CMAKE_C_FLAGS "-pthread") @@ -80,6 +80,12 @@ if(GIT_FOUND) endif() endif(GIT_FOUND) + +option(DSA_XENGINE_BUILD_ALL_TESTS "build tests by default" ON) +option(DSA_XENGINE_INSTALL_ALL_TESTS "install tests by default" ON) +option(DSA_XENGINE_BUILD_SHAREDLIB "build dsaXengine as a shared lib" ON) + + # Use ExternalProject_Add for libtcc (borks with FetchContent) # Use ExternalProject_Add for CUTLASS (long build time, version 2.11.0 for sm_8x arch) include(ExternalProject) @@ -92,7 +98,7 @@ include(FetchContent) if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA) # CUDA specific part of CMakeLists - #set(CMAKE_CUDA_EXTENSIONS OFF) + enable_language(CUDA) find_package(CUDAToolkit REQUIRED) # Get GPU architecture from environmen, or set default (sm_80) @@ -130,6 +136,7 @@ if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA) # 
Get TCC dependency option(DSA_XENGINE_ENABLE_TCC "Use TensorCoreCorrelators for correlatorss" OFF) if(DSA_XENGINE_ENABLE_TCC) + add_compile_definitions(DSA_XENGINE_ENABLE_TCC) option(DSA_XENGINE_DOWNLOAD_TCC "Download, build, link (and install) TCC" OFF) if(DSA_XENGINE_DOWNLOAD_TCC) ExternalProject_Add(TCC @@ -145,6 +152,7 @@ if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA) # Get CUTLASS dependency option(DSA_XENGINE_ENABLE_CUTLASS "Use CUTLASS for GEMMs" OFF) if(DSA_XENGINE_ENABLE_CUTLASS) + add_compile_definitions(DSA_XENGINE_ENABLE_CUTLASS) option(DSA_XENGINE_DOWNLOAD_CUTLASS "Download, build (only the required kernels) link (and install) CUTLASS" OFF) if(DSA_XENGINE_DOWNLOAD_CUTLASS) # Custom CUTLASS build @@ -164,6 +172,7 @@ if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA) # Get MAGMA dependency option(DSA_XENGINE_ENABLE_MAGMA "Use MAGMA for GEMMs" OFF) if(DSA_XENGINE_ENABLE_MAGMA) + add_compile_definitions(DSA_XENGINE_ENABLE_MAGMA) option(DSA_XENGINE_DOWNLOAD_MAGMA "Download, build (only the required kernels) link (and install) MAGMA" OFF) if(DSA_XENGINE_DOWNLOAD_MAGMA) # Custom MAGMA build @@ -182,6 +191,7 @@ if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA) # Get XGPU dependency (fix install) option(DSA_XENGINE_ENABLE_XGPU "Use xGPU for correlatorss" OFF) if(DSA_XENGINE_ENABLE_XGPU) + add_compile_definitions(DSA_XENGINE_ENABLE_XGPU) option(DSA_XENGINE_DOWNLOAD_XGPU "Download and build xGPU" OFF) if(DSA_XENGINE_DOWNLOAD_XGPU) # Download, build and install @@ -203,6 +213,7 @@ endif() # CUDA functionality # Get OPENBLAS dependency option(DSA_XENGINE_ENABLE_OPENBLAS "Use OPENBLAS for GEMMs" OFF) if(DSA_XENGINE_ENABLE_OPENBLAS) + add_compile_definitions(DSA_XENGINE_ENABLE_OPENBLAS) option(DSA_XENGINE_DOWNLOAD_OPENBLAS "Download, build, link, and install OPENBLAS" OFF) if(DSA_XENGINE_DOWNLOAD_OPENBLAS) # Custom OPENBLAS build diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt index a056a0f..e8ec2d6 100644 --- a/include/CMakeLists.txt +++ b/include/CMakeLists.txt @@ -4,12 
+4,14 @@ enable_language(CUDA) #------------------------------ set(DSA_XENGINE_HEADERS # cmake-format: sortable + dsaX.h + dsaX_def.h + dsaX_ftd.h dsaX_cuda_interface.h dsaX_cuda_headers.h dsaX_capture.h dsaX_capture_manythread.h dsaX_capture_pcap.h - dsaX_def.h dsaX_cutlass_interface.h ) install(FILES ${DSA_XENGINE_HEADERS} DESTINATION include) diff --git a/include/dsaX.h b/include/dsaX.h index 7cf23dc..699fe37 100644 --- a/include/dsaX.h +++ b/include/dsaX.h @@ -2,70 +2,34 @@ #include +#include "dsaX_def.h" #include "dsaX_enums.h" -#include "dsaX_cuda_headers.h" - -// required to prevent overflow in corr matrix multiply -#define halfFac 4 - -// beam sep -#define sep 1.0 // arcmin - -// define structure that carries around device memory -typedef struct dmem { - - // initial data and streams - char * h_input; // host input pointer - char * d_input, * d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] - - // correlator pointers - // giant array for r and i: [NCHAN_PER_PACKET, 2 pol, NANTS_PROCESS, NPACKETS_PER_BLOCK * 2 times] - half * d_r, * d_i; - // arrays for matrix multiply output: input [NANTS_PROCESS, NANTS_PROCESS] - half * d_outr, *d_outi, *d_tx_outr, *d_tx_outi; - // giant output array: [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex] - float * d_output; - - // beamformer pointers - char * d_big_input; - half * d_br, * d_bi; - half * weights_r, * weights_i; //weights: [arm, tactp, b] - half * d_bigbeam_r, * d_bigbeam_i; //output: [tc, b] - unsigned char * d_bigpower; //output: [b, tc] - float * d_scf; // scale factor per beam - float * d_chscf; - float * h_winp; - int * flagants, nflags; - float * h_freqs, * d_freqs; - - // timing - float cp, prep, cubl, outp; - -} dmem; // Structure that carries BLAS parameters typedef struct dsaXBLASParam_s { size_t struct_size; /**< Size of this struct in bytes. 
Used to ensure that the host application and DSA see the same struct*/ dsaXBLASType blas_type; /**< Type of BLAS computation to perfrom */ + + dsaXBLASLib blas_lib; /**< Which BLAS library to use for BLAS ops */ // GEMM params - dsaBLASOperation trans_a; /**< operation op(A) that is non- or (conj.) transpose. */ - dsaBLASOperation trans_b; /**< operation op(B) that is non- or (conj.) transpose. */ + dsaXBLASOperation trans_a; /**< operation op(A) that is non- or (conj.) transpose. */ + dsaXBLASOperation trans_b; /**< operation op(B) that is non- or (conj.) transpose. */ int m; /**< number of rows of matrix op(A) and C. */ int n; /**< number of columns of matrix op(B) and C. */ int k; /**< number of columns of op(A) and rows of op(B). */ int lda; /**< leading dimension of two-dimensional array used to store the matrix A. */ int ldb; /**< leading dimension of two-dimensional array used to store matrix B. */ int ldc; /**< leading dimension of two-dimensional array used to store matrix C. */ - int a_offset; /**< position of the A array from which begin read/write. */ - int b_offset; /**< position of the B array from which begin read/write. */ - int c_offset; /**< position of the C array from which begin read/write. */ - int a_stride; /**< stride of the A array in strided(batched) mode */ - int b_stride; /**< stride of the B array in strided(batched) mode */ - int c_stride; /**< stride of the C array in strided(batched) mode */ - std::complex alpha; /**< scalar used for multiplication. */ - std::complex beta; /**< scalar used for multiplication. If beta==0, C does not have to be a valid input. */ + long long int a_offset; /**< position of the A array from which begin read/write. */ + long long int b_offset; /**< position of the B array from which begin read/write. */ + long long int c_offset; /**< position of the C array from which begin read/write. 
*/ + long long int a_stride; /**< stride of the A array in strided(batched) mode */ + long long int b_stride; /**< stride of the B array in strided(batched) mode */ + long long int c_stride; /**< stride of the C array in strided(batched) mode */ + std::complex alpha; /**< scalar used for multiplication. */ + std::complex beta; /**< scalar used for multiplication. If beta==0, C does not have to be a valid input. */ // Common params int batch_count; /**< number of pointers contained in arrayA, arrayB and arrayC. */ @@ -73,3 +37,46 @@ typedef struct dsaXBLASParam_s { dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */ } dsaXBLASParam; + +// required to prevent overflow in corr matrix multiply +#define halfFac 4 + +// beam sep +#define sep 1.0 // arcmin + +// define structure that carries around device memory pointers +typedef struct dmem { + + // initial data and streams + char *h_input; // host input pointer + char *d_input, *d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] + + // correlator pointers + // giant array for r and i: [NCHAN_PER_PACKET, 2 pol, NANTS_PROCESS, NPACKETS_PER_BLOCK *2 times] + void *d_r, *d_i; //half + // arrays for matrix multiply output: input [NANTS_PROCESS, NANTS_PROCESS] + void *d_outr, *d_outi, *d_tx_outr, *d_tx_outi; //half + // giant output array: [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex] + float *d_output; + + // beamformer pointers + char *d_big_input; + void *d_br, *d_bi; //half + void *weights_r, *weights_i; //weights: [arm, tactp, b] //half + void *d_bigbeam_r, *d_bigbeam_i; //output: [tc, b] //half + unsigned char *d_bigpower; //output: [b, tc] + float *d_scf; // scale factor per beam + float *d_chscf; + float *h_winp; + int *flagants, nflags; + float *h_freqs, *d_freqs; + + // timing + float cp, prep, cubl, outp; + +} dmem; + +void dsaXCorrelator(void *output_data, void *input_data); + +void reorderOutput(dmem *d); +void reorderInput(dmem *d); diff --git 
a/include/dsaX_blas_interface.h b/include/dsaX_blas_interface.h index 3cf5c4a..49564b5 100644 --- a/include/dsaX_blas_interface.h +++ b/include/dsaX_blas_interface.h @@ -1,5 +1,5 @@ #pragma once -#include "dsaX.h" +#include "dsaX_interface.h" -void dsaXHgemmStridedBatched(void *real_in, void *imag_in, void *real_out, void *imag_out, dsaXBLASParam param); +void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param); diff --git a/include/dsaX_cublas_interface.h b/include/dsaX_cublas_interface.h index 7ad8b31..5aea5ef 100644 --- a/include/dsaX_cublas_interface.h +++ b/include/dsaX_cublas_interface.h @@ -1,5 +1,4 @@ #pragma once #include "dsaX.h" -#include "dsaX_cuda_headers.h" -void dsaXHgemmStridedBatchedCuda(half *real_in, half *imag_in, half *real_out, half *imag_out, dsaXBLASParam param); +void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param); diff --git a/include/dsaX_cuda_interface.h b/include/dsaX_cuda_interface.h index c8ea8aa..cee1581 100644 --- a/include/dsaX_cuda_interface.h +++ b/include/dsaX_cuda_interface.h @@ -1,32 +1,35 @@ #pragma once +#include + #include "dsaX_def.h" +#include "dsaX_enums.h" #include "dsaX.h" -#ifdef DSA_XENGINE_TARGET_CUDA -void initialize_device_memory(dmem *d, int bf); -void deallocate_device_memory(dmem *d, int bf); -void reorder_output_device(dmem *d); +void initializeCudaMemory(dmem *d, int bf); + +void deallocateCudaMemory(dmem *d, int bf); + +void dsaXmemsetCuda(void *array, int ch, size_t n); -__global__ void corr_input_copy(char *input, half *inr, half *ini); +void dsaXmemcpyCuda(void *array_device, void *array_host, size_t n, dsaXMemcpyKind kind); -template __global__ void transpose_matrix(in_prec *idata, out_prec *odata); +void dsaXDeviceSynchronizeCuda(); -void reorder_input_device(char *input, char *tx, half *inr, half *ini); +void reorderOutputCuda(dmem *d); 
-__global__ void corr_output_copy(half *outr, half *outi, float *output, int *indices_lookup); +void calcWeightsCuda(dmem *d); -__global__ void transpose_input_bf(double *idata, double *odata); +void reorderInputCuda(dmem *d); -__global__ void populate_weights_matrix(float *antpos_e, float *antpos_n, float *calibs, half *wr, half *wi, float *fqs); +template void transposeMatrixCuda(in_prec *idata, out_prec *odata); -void calc_weights(dmem *d); +void transposeInputBeamformerCuda(double *idata, double *odata, std::vector &dim_block_in, std::vector &dim_grid_in); -__global__ void fluff_input_bf(char *input, half *dr, half *di); +void transposeScaleBeamformerCuda(void *real, void *imag, unsigned char *output, std::vector &dim_block_in, std::vector &dim_grid_in); -__global__ void transpose_scale_bf(half *ir, half *ii, unsigned char *odata); +void fluffInputBeamformerCuda(char *input, void *b_real, void *b_imag, int blocks, int tpb); -__global__ void sum_beam(unsigned char *input, float *output); -#endif +void sumBeamCuda(unsigned char *input, float *output, int blocks, int tpb); diff --git a/include/dsaX_cuda_kernels.h b/include/dsaX_cuda_kernels.h new file mode 100644 index 0000000..db09baa --- /dev/null +++ b/include/dsaX_cuda_kernels.h @@ -0,0 +1,260 @@ +#pragma once + +#include "dsaX_cuda_headers.h" + +// KERNELS +// DMH: Abstract hardcoded launch parameters +__global__ void transpose_input_beamformer(double *idata, double *odata) { + + __shared__ double tile[16][17][4]; + + int x = blockIdx.x * 16 + threadIdx.x; + int y = blockIdx.y * 16 + threadIdx.y; + int width = gridDim.x * 16; + + for (int j = 0; j < 16; j += 8) { + tile[threadIdx.y+j][threadIdx.x][0] = idata[4*((y+j)*width + x)]; + tile[threadIdx.y+j][threadIdx.x][1] = idata[4*((y+j)*width + x)+1]; + tile[threadIdx.y+j][threadIdx.x][2] = idata[4*((y+j)*width + x)+2]; + tile[threadIdx.y+j][threadIdx.x][3] = idata[4*((y+j)*width + x)+3]; + } + + __syncthreads(); + + x = blockIdx.y * 16 + threadIdx.x; // 
transpose block offset + y = blockIdx.x * 16 + threadIdx.y; + width = gridDim.y * 16; + + for (int j = 0; j < 16; j += 8) { + odata[4*((y+j)*width + x)] = tile[threadIdx.x][threadIdx.y + j][0]; + odata[4*((y+j)*width + x)+1] = tile[threadIdx.x][threadIdx.y + j][1]; + odata[4*((y+j)*width + x)+2] = tile[threadIdx.x][threadIdx.y + j][2]; + odata[4*((y+j)*width + x)+3] = tile[threadIdx.x][threadIdx.y + j][3]; + } +} + +// kernel to help with reordering output +// outr and outi are [NANTS, NANTS, NCHAN_PER_PACKET, 2time, 2pol, halfFac] +// run with NCHAN_PER_PACKET*2*NBASE/128 blocks of 128 threads +__global__ void corr_output_copy(half *outr, half *outi, float *output, int *indices_lookup) { + + int bidx = blockIdx.x; // assume NCHAN_PER_PACKET*2*NBASE/128 + int tidx = threadIdx.x; // assume 128 + int idx = bidx*128+tidx; + + int baseline = (int)(idx / (NCHAN_PER_PACKET * 2)); + int chpol = (int)(idx % (NCHAN_PER_PACKET * 2)); + int ch = (int)(chpol / 2); + int base_idx = indices_lookup[baseline]; + int iidx = base_idx * NCHAN_PER_PACKET + ch; + int pol = (int)(chpol % 2); + + float v1=0., v2=0.; + + // Use CUDA casting intrinsic __half2float + for (int i=0;i __global__ void transpose_matrix(in_prec * idata, out_prec * odata) { + + __shared__ in_prec tile[32][33]; + + int x = blockIdx.x * 32 + threadIdx.x; + int y = blockIdx.y * 32 + threadIdx.y; + int width = gridDim.x * 32; + + for (int j = 0; j < 32; j += 8) + tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; + + __syncthreads(); + + x = blockIdx.y * 32 + threadIdx.x; // transpose block offset + y = blockIdx.x * 32 + threadIdx.y; + width = gridDim.y * 32; + + for (int j = 0; j < 32; j += 8) + odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; + +} + +// kernel to fluff input +// run with 128 threads and NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 blocks +__global__ void corr_input_copy(char *input, half *inr, half *ini) { + + int bidx = blockIdx.x; // assume 
NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 + int tidx = threadIdx.x; // assume 128 threads per block + int iidx = bidx*128+tidx; + + // 15 in unsigned char binary is 00001111. Perform bitwise & on 15 and input char data iiiirrrr + // to get real part 4 bit data. + // 0000rrrr + // Bit shift this result by 4 to the left. + // rrrr0000 + // Cast to signed char. + // +-rrr0000 + // Bitshift mantisa only to the right by 4 bits + // +-0000rrr + // Cast to float and use CUDA intrinsic to cast to signed half + inr[iidx] = __float2half((float)((char)(( (unsigned char)(input[iidx]) & (unsigned char)(15) ) << 4) >> 4)); + + // 240 in unsigned char binary is 11110000. Perform bitwise & on 240 and input char data iiiirrrr + // to get imag part 4 bit data + // iiii0000. + // Cast to signed char + // +-iii0000 + // Bitshift mantisa only to the right by 4 bits + // +-0000iii + // Cast to float and use CUDA intrinsic to cast to signed half + ini[iidx] = __float2half((float)((char)(( (unsigned char)(input[iidx]) & (unsigned char)(240) )) >> 4)); + + // Both results should be half (FP16) integers between -8 and 7. 
+ half re = inr[iidx]; + half im = ini[iidx]; + half lim = 2.; + if( (re > lim || re < -lim) || (im > lim || im < -lim)) { + //printf("re = %f, im = %f\n", __half2float(re), __half2float(im)); + } +} + +// kernel to populate an instance of weights matrix +// [2, (NCHAN_PER_PACKET/8), NBEAMS/2, 4times*(NANTS/2)*8chan*2tim*2pol] +// run with 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128 blocks of 128 threads +__global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs) { + + int bidx = blockIdx.x; + int tidx = threadIdx.x; + int inidx = bidx*128+tidx; + + // 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2) + + // get indices + int iArm = (int)(inidx / ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2))); + int iidx = (int)(inidx % ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2))); + int fq = (int)(iidx / (128*(NANTS/2)*(NBEAMS/2))); + int idx = (int)(iidx % (128*(NANTS/2)*(NBEAMS/2))); + int bm = (int)(idx / (128*(NANTS/2))); + int tactp = (int)(idx % (128*(NANTS/2))); + //int t = (int)(tactp / (32*(NANTS/2))); + int actp = (int)(tactp % (32*(NANTS/2))); + int a = (int)(actp / 32); + int ctp = (int)(actp % 32); + //int c = (int)(ctp / 4); + int tp = (int)(ctp % 4); + //int t2 = (int)(tp / 2); + int pol = (int)(tp % 2); + int widx = (a+48*iArm)*(NCHAN_PER_PACKET/8)*2*2 + fq*2*2 + pol*2; + + // calculate weights + float theta, afac, twr, twi; + if (iArm==0) { + theta = sep*(127.-bm*1.)*PI/10800.; // radians + afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate + twr = cos(afac*antpos_e[a+48*iArm]); + twi = sin(afac*antpos_e[a+48*iArm]); + wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1])); + wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1])); + //wr[inidx] = __float2half(calibs[widx]); + //wi[inidx] = __float2half(calibs[widx+1]); + } + if (iArm==1) { + theta = sep*(127.-bm*1.)*PI/10800.; // radians + afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate + twr = 
cos(afac*antpos_n[a+48*iArm]); + twi = sin(afac*antpos_n[a+48*iArm]); + wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1])); + wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1])); + //wr[inidx] = __float2half(calibs[widx]); + //wi[inidx] = __float2half(calibs[widx+1]); + } +} + +// kernel to fluff input bf data +// run with NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 blocks of 128 threads +__global__ void fluff_input_beamformer(char * input, half * dr, half * di) { + + int bidx = blockIdx.x; // assume NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 + int tidx = threadIdx.x; // assume 128 + int idx = bidx*128+tidx; + + dr[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(15)) << 4) >> 4))); + di[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(240))) >> 4))); + + // Both results should be half (FP16) integers between -8 and 7. + //half re = dr[idx]; + //half im = di[idx]; + //half lim = 0; + //if( (re > lim || re < -lim) || (im > lim || im < -lim)) { + //printf("re = %f, im = %f\n", __half2float(re), __half2float(im)); + //} +} + +// transpose, add and scale kernel for bf +// assume breakdown into tiles of 16x16, and run with 16x8 threads per block +// launch with dim3 dimBlock(16, 8) and dim3 dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16) +// scf is a per-beam scale factor to enable recasting as unsigned char +__global__ void transpose_scale_beamformer(half * ir, half * ii, unsigned char * odata) { + + __shared__ float tile[16][17]; + + int x = blockIdx.x * 16 + threadIdx.x; + int y = blockIdx.y * 16 + threadIdx.y; + int width = gridDim.x * 16; + float dr, di; + + for (int j = 0; j < 16; j += 8) { + dr = (float)(ir[(y+j)*width + x]); + di = (float)(ii[(y+j)*width + x]); + tile[threadIdx.y+j][threadIdx.x] = (dr*dr+di*di); + } + + __syncthreads(); + + x = blockIdx.y * 16 + threadIdx.x; // transpose block offset + y = 
blockIdx.x * 16 + threadIdx.y; + width = gridDim.y * 16; + + for (int j = 0; j < 16; j += 8) + odata[(y+j)*width + x] = (unsigned char)(tile[threadIdx.x][threadIdx.y + j]/128.); + +} + +// sum over all times in output beam array +// run with (NCHAN_PER_PACKET/8)*(NBEAMS/2) blocks of (NPACKETS_PER_BLOCK/4) threads +__global__ void sum_beam(unsigned char *input, float *output) { + + __shared__ float summ[512]; + int bidx = blockIdx.x; + int tidx = threadIdx.x; + //int idx = bidx*256+tidx; + int bm = (int)(bidx/48); + int ch = (int)(bidx % 48); + + summ[tidx] = (float)(input[bm*256*48 + tidx*48 + ch]); + + __syncthreads(); + + if (tidx<256) { + summ[tidx] += summ[tidx+256]; + summ[tidx] += summ[tidx+128]; + summ[tidx] += summ[tidx+64]; + summ[tidx] += summ[tidx+32]; + summ[tidx] += summ[tidx+16]; + summ[tidx] += summ[tidx+8]; + summ[tidx] += summ[tidx+4]; + summ[tidx] += summ[tidx+2]; + summ[tidx] += summ[tidx+1]; + } + + if (tidx==0) output[bidx] = summ[tidx]; +} diff --git a/include/dsaX_cutlass_interface.h b/include/dsaX_cutlass_interface.h index 5aa753e..f95eeaa 100644 --- a/include/dsaX_cutlass_interface.h +++ b/include/dsaX_cutlass_interface.h @@ -48,11 +48,11 @@ struct Options { Options(): help(false), problem_size({1024, 1024, 1024}), - batch_count(1), + batch_count(256), reference_check(false), - iterations(20), + iterations(2), alpha(1), - beta() { } + beta(0) { } // Parses the command line void parse(int argc, char const **args) { diff --git a/include/dsaX_enums.h b/include/dsaX_enums.h index b188019..30fe3c6 100644 --- a/include/dsaX_enums.h +++ b/include/dsaX_enums.h @@ -2,21 +2,35 @@ #define DSA_INVALID_ENUM (-0x7fffffff - 1) -typedef enum dsaError_t { DSA_SUCCESS = 0, DSA_ERROR = 1, DSA_ERROR_UNINITIALIZED = 2 } dsaError_t; +typedef enum dsaXError_t { + DSA_SUCCESS = 0, + DSA_ERROR = 1, + DSA_ERROR_UNINITIALIZED = 2, + DSA_ERROR_INVALID = DSA_INVALID_ENUM +} dsaXError; -typedef enum dsaBLASOperation_s { +typedef enum dsaXBLASOperation_s { DSA_BLAS_OP_N 
= 0, // No transpose DSA_BLAS_OP_T = 1, // Transpose only DSA_BLAS_OP_C = 2, // Conjugate transpose DSA_BLAS_OP_INVALID = DSA_INVALID_ENUM -} dsaBLASOperation; +} dsaXBLASOperation; typedef enum dsaXBLASType_s { DSA_BLAS_GEMM = 0, DSA_BLAS_INVALID = DSA_INVALID_ENUM } dsaXBLASType; -typedef enum dsaXBLASDataType_s { +typedef enum dsaXBLASLib_s { + DSA_BLAS_LIB_CUBLAS = 0, + DSA_BLAS_LIB_MAGMA = 1, + DSA_BLAS_LIB_CUTLASS = 2, + DSA_BLAS_LIB_TCC = 3, + DSA_BLAS_LIB_OPENBLAS = 4, + DSA_BLAS_LIB_INVALID = DSA_INVALID_ENUM +} dsaXBLASLib; + +typedef enum dsaXBLASDataLib_s { DSA_BLAS_DATATYPE_H = 0, // Half DSA_BLAS_DATATYPE_S = 1, // Single DSA_BLAS_DATATYPE_D = 2, // Double @@ -31,3 +45,11 @@ typedef enum dsaXBLASDataOrder_s { DSA_BLAS_DATAORDER_COL = 1, DSA_BLAS_DATAORDER_INVALID = DSA_INVALID_ENUM } dsaXBLASDataOrder; + +typedef enum dsaXMemcpyKind_s { + dsaXMemcpyHostToHost = 0, + dsaXMemcpyHostToDevice = 1, + dsaXMemcpyDeviceToHost = 2, + dsaXMemcpyDeviceToDevice = 3, + dsaXMemcpyInvalid = DSA_INVALID_ENUM +} dsaXMemcpyKind; diff --git a/include/dsaX_ftd.h b/include/dsaX_ftd.h new file mode 100644 index 0000000..f7363f1 --- /dev/null +++ b/include/dsaX_ftd.h @@ -0,0 +1,5 @@ +#pragma once + +#include "dsaX.h" + +void dcorrelator(dmem *d); diff --git a/include/dsaX_interface.h b/include/dsaX_interface.h new file mode 100644 index 0000000..06a2364 --- /dev/null +++ b/include/dsaX_interface.h @@ -0,0 +1,12 @@ +#pragma once + +#include + +// DMH: decorate these with Doxygen +void dsaXCorrelator(void *input_data, void *output_data); +void reorderInput(dmem *d); +void reorderOutput(dmem *d); +void transposeInputBeamformer(double *input, double *output, std::vector &dimBlock, std::vector &dimGrid); +void transposeScaleBeamformer(void *array_real, void *array_imag, unsigned char *output, std::vector &dimBlock, std::vector &dimGrid); +void fluffInputBeamformer(char *input, void *array_real, void *array_imag, int blocks, int tpb); +void sumBeam(unsigned char *input, float 
*output, int blocks, int tpb); diff --git a/include/dsaX_magma_interface.h b/include/dsaX_magma_interface.h new file mode 100644 index 0000000..12f0cc7 --- /dev/null +++ b/include/dsaX_magma_interface.h @@ -0,0 +1,4 @@ +#pragma once +#include "dsaX.h" + +void dsaXHgemmStridedBatchedMagma(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param); diff --git a/include/dsaX_utils.h b/include/dsaX_utils.h index 5d39861..f2dbc0c 100644 --- a/include/dsaX_utils.h +++ b/include/dsaX_utils.h @@ -3,7 +3,5 @@ #include "dsaX.h" void dsaXmemset(void *array, int ch, size_t n); - -void dsaXmemcpyHostToDevice(void *array_device, void *array_host, size_t n); -void dsaXmemcpyDeviceToHost(void *array_host, void *array_device, size_t n); -void dsaXmemcpyDeviceToDevice(void *array_device_to, void *array_device_from, size_t n); +void dsaXmemcpy(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind); +void dsaXDeviceSynchronize(); diff --git a/src/11_planar_complex_array.cu b/src/11_planar_complex_array.cu index ba94b60..94dcc55 100644 --- a/src/11_planar_complex_array.cu +++ b/src/11_planar_complex_array.cu @@ -302,7 +302,6 @@ public: typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0); - int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k(); int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n(); int64_t imag_stride_C = int64_t(problem_size.m()) * problem_size.n(); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c73743a..aaacfa5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,47 +1,87 @@ -enable_language(CUDA) +include_directories(${CMAKE_SOURCE_DIR}/include) -include_directories(../include) +if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA) + add_compile_definitions(DSA_XENGINE_TARGET_CUDA) +endif() + +if(DSA_XENGINE_TARGET_TYPE 
STREQUAL CPU) + add_compile_definitions(DSA_XENGINE_TARGET_CPU) +endif() # DSA Fast Time Domain library #----------------------------- -add_library(dsa SHARED +set(DSAX_OBJS dsaX_cuda_interface.cu dsaX_cublas_interface.cu - dsaX_blas_interface.cu - dsaX_beamformer.cu + dsaX_magma_interface.cu + dsaX_blas_interface.cpp + dsaX_beamformer.cpp dsaX_correlator.cpp + dsaX_interface.cpp dsaX_utils.cpp dsaX_psrdada_utils.cpp ) -if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA) - add_compile_definitions(DSA_XENGINE_TARGET_CUDA) +# split source into cu and cpp files +foreach(item ${DSAX_OBJS}) + string(REGEX MATCH ".+\\.cu$" item_match ${item}) + if(item_match) + list(APPEND DSAX_CU_OBJS ${item}) + endif(item_match) +endforeach(item ${DSAX_OBJS}) + +list(REMOVE_ITEM DSAX_OBJS ${DSAX_CU_OBJS}) + +# DSAX_CU_OBJS should contain all cuda files now and DSAX_OBJS all cpp. +# If we have a git version, make version.cpp depend on git head so that it is +# rebuilt if the git sha changed +if(GITVERSION) + find_path( + DSAX_GITDIR NAME HEAD + PATHS ${CMAKE_SOURCE_DIR}/.git/logs + NO_DEFAULT_PATH) + include(AddFileDependencies) + if(DSAX_GITDIR) + add_file_dependencies(version.cpp ${DSAX_GITDIR}/HEAD) + endif() endif() -if(DSA_XENGINE_TARGET_TYPE STREQUAL CPU) - add_compile_definitions(DSA_XENGINE_TARGET_CPU) +mark_as_advanced(DSAX_GITDIR) + +# generate a cmake object library for all cpp files first +add_library(dsax_cpp OBJECT ${DSAX_OBJS}) + +if(DSA_XENGINE_BUILD_SHAREDLIB) + set_target_properties(dsax_cpp PROPERTIES POSITION_INDEPENDENT_CODE TRUE) + add_library(dsax SHARED) +else() + add_library(dsax STATIC) endif() +add_library(DSA_XENGINE::dsax ALIAS dsax) + +# make one library +target_sources(dsax PRIVATE $ ${DSAX_CU_OBJS}) if(CUDAToolkit_FOUND) - target_link_libraries(dsa PUBLIC CUDA::cudart) + target_link_libraries(dsax INTERFACE CUDA::cudart_static ${CUDA_cublas_LIBRARY}) endif() if(DSA_XENGINE_ENABLE_PSRDADA) include_directories(${PSRDada_SOURCE_DIR}/src) set(PSRDada_LIB 
${PSRDada_BINARY_DIR}/src/libpsrdada.so) - target_link_libraries(dsa PUBLIC ${PSRDada_LIB}) + target_link_libraries(dsax PUBLIC ${PSRDada_LIB}) endif() if(DSA_XENGINE_ENABLE_XGPU) include_directories(${xGPU_SOURCE_DIR}/src) set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a) - target_link_libraries(dsa PUBLIC ${XGPU_LIB}) + target_link_libraries(dsax PUBLIC ${XGPU_LIB}) endif() if(DSA_XENGINE_ENABLE_CUTLASS) include_directories(${NvidiaCutlass_DIR}/../../../include) include_directories(${NvidiaCutlass_DIR}/../../../include/cutlass/util) set(NvidiaCutlass_LIB ${NvidiaCutlass_DIR}/../../../lib64/libcutlass.so) - target_link_libraries(dsa PUBLIC ${NvidiaCutlass_LIB}) + target_link_libraries(dsax PUBLIC ${NvidiaCutlass_LIB}) # Some simple CUTLASS examples to test linking/benching #------------------------------------------------------ @@ -67,7 +107,7 @@ endif() #----------------------------- install(TARGETS # cmake-format: sortable - dsa + dsax LIBRARY DESTINATION lib ) @@ -84,6 +124,6 @@ install(TARGETS #----------------------------- if(CUDAToolkit_FOUND) - add_executable(dsaX_beamformer_correlator_exe dsaX_beamformer_correlator_exe.cu) - target_link_libraries(dsaX_beamformer_correlator_exe PUBLIC dsa ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) + #add_executable(dsaX_beamformer_correlator_exe dsaX_beamformer_correlator_exe.cu) + #target_link_libraries(dsaX_beamformer_correlator_exe PUBLIC dsax ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) endif() diff --git a/src/dsaX_beamformer.cpp b/src/dsaX_beamformer.cpp new file mode 100644 index 0000000..f395b0e --- /dev/null +++ b/src/dsaX_beamformer.cpp @@ -0,0 +1,120 @@ +// -*- c++ -*- +/* assumes input and output block size is appropriate - will seg fault otherwise*/ +/* +Workflow is similar for BF and corr applications + - copy data to GPU, convert to half-precision and calibrate while reordering + - do matrix operations to populate large output vector + */ + +#include +#include + +#include "dsaX_def.h" +#include "dsaX.h" +#include 
"dsaX_blas_interface.h" +#include "dsaX_utils.h" +#include "dsaX_psrdada_utils.h" + +using namespace std; + +/* +Beamformer: + - initial data is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] + - split into EW and NS antennas via cudaMemcpy: [NPACKETS_PER_BLOCK, NANTS/2, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] + - want [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, NANTS/2, 8chan, 2 times, 2 pol, 4-bit complex] +(single transpose operation) + - weights are [NCHAN_PER_PACKET/8, NBEAMS, 4tim, NANTS/2, 8chan, 2 times, 2 pol] x 2 + - then fluff and run beamformer: output is [NCHAN_PER_PACKET/8, NBEAMS, NPACKETS_PER_BLOCK/4] (w column-major) + - transpose and done! + +*/ +// beamformer function +void dbeamformer(dmem *d) { + + dsaXBLASParam blas_param; + blas_param.trans_a = DSA_BLAS_OP_T; + blas_param.trans_b = DSA_BLAS_OP_N; + blas_param.m = NPACKETS_PER_BLOCK/4; + blas_param.n = NBEAMS/2; + blas_param.k = 4*(NANTS/2)*8*2*2; + blas_param.alpha = 1.0; + blas_param.lda = blas_param.k; + blas_param.ldb = blas_param.k; + blas_param.beta = 0.0; + blas_param.ldc = blas_param.m; + blas_param.a_stride = (NPACKETS_PER_BLOCK)*(NANTS/2)*8*2*2; + blas_param.b_stride = (NBEAMS/2)*4*(NANTS/2)*8*2*2; + blas_param.c_stride = (NPACKETS_PER_BLOCK/4)*NBEAMS/2; + blas_param.batch_count = NCHAN_PER_PACKET/8; + + long long int i1, i2; + + // timing + // copy, prepare, cublas, output + clock_t begin, end; + + // do big memcpy + begin = clock(); + dsaXmemcpy(d->d_big_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4, dsaXMemcpyHostToDevice); + end = clock(); + d->cp += (float)(end - begin) / CLOCKS_PER_SEC; + + // loop over halves of the array + for (int iArm=0;iArm<2;iArm++) { + + // zero out output arrays + dsaXmemset(d->d_bigbeam_r,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(short)); + dsaXmemset(d->d_bigbeam_i,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(short)); + dsaXDeviceSynchronize(); + + // 
copy data to device + // initial data: [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] + // final data: need to split by NANTS. + begin = clock(); + for (i1=0; i1d_input + i1*(NANTS/2)*NCHAN_PER_PACKET*4, + d->d_big_input + i1*(NANTS)*NCHAN_PER_PACKET*4+iArm*(NANTS/2)*NCHAN_PER_PACKET*4, + (NANTS/2)*NCHAN_PER_PACKET*4, dsaXMemcpyDeviceToDevice); + end = clock(); + d->cp += (float)(end - begin) / CLOCKS_PER_SEC; + + // do reorder and fluff of data to real and imag + begin = clock(); + + // DMH: Abstract the launch parameters + std::vector dimBlock = {16, 8}; + std::vector dimGrid = {NCHAN_PER_PACKET/8/16, (NPACKETS_PER_BLOCK)*(NANTS/2)/16}; + transposeInputBeamformer((double *)(d->d_input), (double *)(d->d_tx), dimBlock, dimGrid); + + int blocks = NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128; + int tpb = 128; + fluffInputBeamformer(d->d_tx, d->d_br, d->d_bi, blocks, tpb); + end = clock(); + d->prep += (float)(end - begin) / CLOCKS_PER_SEC; + + // set up for gemm + i2 = iArm*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8); // weights offset + blas_param.b_offset = i2; + // large matrix multiply to get real and imag outputs + begin = clock(); + dsaXHgemmStridedBatched(d->d_br, d->d_bi, d->weights_r, d->weights_i, d->d_bigbeam_r, d->d_bigbeam_i, blas_param); + end = clock(); + d->cubl += (float)(end - begin) / CLOCKS_PER_SEC; + + // simple formation of total power and scaling to 8-bit in transpose kernel + // Reuse dimBlock + //DMH: Abstract kernel launch parameters + dimGrid[0] = (NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16; + dimGrid[0] = (NCHAN_PER_PACKET/8)/16; + begin = clock(); + transposeScaleBeamformer(d->d_bigbeam_r, d->d_bigbeam_i, d->d_bigpower + iArm*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2), dimBlock, dimGrid); + end = clock(); + d->outp += (float)(end - begin) / CLOCKS_PER_SEC; + } + + // form sum over times + int blocks = 24576; + int tpb = 512; + // COMMENT OUT WHEN DONE!!! 
+ //sumBeam(d->d_bigpower, d->d_chscf, blocks, tpb); +} diff --git a/src/dsaX_beamformer.cu b/src/dsaX_beamformer.cu deleted file mode 100644 index 0d7b1df..0000000 --- a/src/dsaX_beamformer.cu +++ /dev/null @@ -1,168 +0,0 @@ -// -*- c++ -*- -/* assumes input and output block size is appropriate - will seg fault otherwise*/ -/* -Workflow is similar for BF and corr applications - - copy data to GPU, convert to half-precision and calibrate while reordering - - do matrix operations to populate large output vector - */ - -#include - -#include "dsaX_def.h" -#include "dsaX.h" -#include "dsaX_blas_interface.h" -#include "dsaX_utils.h" -#include "dsaX_psrdada_utils.h" -#ifdef DSA_XENGINE_TARGET_CUDA -#include "dsaX_cuda_interface.h" -#endif - -using namespace std; - -int DEBUG = 1; - -void usage() { - fprintf (stdout, - "dsaX_beamformer_correlator [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -i in_key [default REORDER_BLOCK_KEY]\n" - " -o out_key [default XGPU_BLOCK_KEY]\n" - " -b run beamformer [default is to run correlator]\n" - " -h print usage\n" - " -t binary file for test mode\n" - " -f flagants file\n" - " -a calib file\n" - " -s start frequency (assumes -0.244140625MHz BW)\n"); -} - - -/* -Beamformer: - - initial data is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] - - split into EW and NS antennas via cudaMemcpy: [NPACKETS_PER_BLOCK, NANTS/2, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] - - want [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, NANTS/2, 8chan, 2 times, 2 pol, 4-bit complex] -(single transpose operation) - - weights are [NCHAN_PER_PACKET/8, NBEAMS, 4tim, NANTS/2, 8chan, 2 times, 2 pol] x 2 - - then fluff and run beamformer: output is [NCHAN_PER_PACKET/8, NBEAMS, NPACKETS_PER_BLOCK/4] (w column-major) - - transpose and done! 
- -*/ -// beamformer function -void dbeamformer(dmem *d) { - - // gemm settings - recall column major order assumed - // stride over 48 chans - cublasHandle_t cublasH = NULL; - cublasCreate(&cublasH); - cublasOperation_t transa = CUBLAS_OP_T; - cublasOperation_t transb = CUBLAS_OP_N; - const int m = NPACKETS_PER_BLOCK/4; - const int n = NBEAMS/2; - const int k = 4*(NANTS/2)*8*2*2; - const half alpha = 1.; - const half malpha = -1.; - const int lda = k; - const int ldb = k; - const half beta0 = 0.; - const half beta1 = 1.; - const int ldc = m; - const long long int strideA = (NPACKETS_PER_BLOCK)*(NANTS/2)*8*2*2; - const long long int strideB = (NBEAMS/2)*4*(NANTS/2)*8*2*2; - const long long int strideC = (NPACKETS_PER_BLOCK/4)*NBEAMS/2; - const int batchCount = NCHAN_PER_PACKET/8; - long long int i1, i2;//, o1; - - // create streams - cudaStream_t stream; - cudaStreamCreate(&stream); - - // timing - // copy, prepare, cublas, output - clock_t begin, end; - - // do big memcpy - begin = clock(); - dsaXmemcpyHostToDevice(d->d_big_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4); - end = clock(); - d->cp += (float)(end - begin) / CLOCKS_PER_SEC; - - // loop over halves of the array - for (int iArm=0;iArm<2;iArm++) { - - // zero out output arrays - dsaXmemset(d->d_bigbeam_r,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half)); - dsaXmemset(d->d_bigbeam_i,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half)); - cudaDeviceSynchronize(); - - // copy data to device - // initial data: [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] - // final data: need to split by NANTS. 
- begin = clock(); - for (i1=0; i1d_input+i1*(NANTS/2)*NCHAN_PER_PACKET*4, - d->d_big_input+i1*(NANTS)*NCHAN_PER_PACKET*4+iArm*(NANTS/2)*NCHAN_PER_PACKET*4, - (NANTS/2)*NCHAN_PER_PACKET*4); - end = clock(); - d->cp += (float)(end - begin) / CLOCKS_PER_SEC; - - // do reorder and fluff of data to real and imag - begin = clock(); - - dim3 dimBlock1(16, 8), dimGrid1(NCHAN_PER_PACKET/8/16, (NPACKETS_PER_BLOCK)*(NANTS/2)/16); - transpose_input_bf<<< dimGrid1, dimBlock1 >>>((double *)(d->d_input), (double *)(d->d_tx)); - fluff_input_bf<<>>(d->d_tx, d->d_br, d->d_bi); - - end = clock(); - d->prep += (float)(end - begin) / CLOCKS_PER_SEC; - - // large matrix multiply to get real and imag outputs - // set up for gemm - cublasSetStream(cublasH, stream); - i2 = iArm*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8); // weights offset - - // run strided batched gemm - begin = clock(); - // ac - cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, - &alpha,d->d_br,lda,strideA, - d->weights_r+i2,ldb,strideB,&beta0, - d->d_bigbeam_r,ldc,strideC, - batchCount); - // -bd - cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, - &malpha,d->d_bi,lda,strideA, - d->weights_i+i2,ldb,strideB,&beta1, - d->d_bigbeam_r,ldc,strideC, - batchCount); - // bc - cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, - &alpha,d->d_bi,lda,strideA, - d->weights_r+i2,ldb,strideB,&beta0, - d->d_bigbeam_i,ldc,strideC, - batchCount); - // ad - cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, - &alpha,d->d_br,lda,strideA, - d->weights_i+i2,ldb,strideB,&beta1, - d->d_bigbeam_i,ldc,strideC, - batchCount); - - cudaDeviceSynchronize(); - end = clock(); - d->cubl += (float)(end - begin) / CLOCKS_PER_SEC; - - // simple formation of total power and scaling to 8-bit in transpose kernel - begin = clock(); - dim3 dimBlock(16, 8), dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16); - 
transpose_scale_bf<<>>(d->d_bigbeam_r,d->d_bigbeam_i,d->d_bigpower+iArm*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)); - end = clock(); - d->outp += (float)(end - begin) / CLOCKS_PER_SEC; - } - - cudaStreamDestroy(stream); - cublasDestroy(cublasH); - - // form sum over times - //sum_beam<<<24576,512>>>(d->d_bigpower,d->d_chscf); -} diff --git a/src/dsaX_blas_interface.cpp b/src/dsaX_blas_interface.cpp new file mode 100644 index 0000000..e370e87 --- /dev/null +++ b/src/dsaX_blas_interface.cpp @@ -0,0 +1,28 @@ +#include + +#include "dsaX.h" +#include "dsaX_cublas_interface.h" +#include "dsaX_magma_interface.h" + +void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param) { + switch (param.blas_lib) { + case DSA_BLAS_LIB_CUBLAS: + dsaXHgemmStridedBatchedCuda(real_a, imag_a, real_b, imag_b, real_c, imag_c, param); + break; + case DSA_BLAS_LIB_MAGMA: + dsaXHgemmStridedBatchedMagma(real_a, imag_a, real_b, imag_b, real_c, imag_c, param); + break; + case DSA_BLAS_LIB_CUTLASS: + //dsaXHgemmStridedBatchedCutlass(real_a, imag_a, real_b, imag_b, real_c, imag_c, param); + break; + case DSA_BLAS_LIB_OPENBLAS: + //dsaXHgemmStridedBatchedOpenblas(real_a, imag_a, real_b, imag_b, real_c, imag_c, param); + break; + case DSA_BLAS_LIB_TCC: + //dsaXHgemmStridedBatchedTcc(real_a, imag_a, real_b, imag_b, real_c, imag_c, param); + break; + default: + std::cout << "dsaX Error: Unknown blas_lib " << param.blas_lib << " given." 
<< std::endl; + exit(0); + } +} diff --git a/src/dsaX_blas_interface.cu b/src/dsaX_blas_interface.cu deleted file mode 100644 index 7e49fcb..0000000 --- a/src/dsaX_blas_interface.cu +++ /dev/null @@ -1,11 +0,0 @@ -#include -#include "dsaX_cublas_interface.h" - -void dsaXHgemmStridedBatched(void *real_in, void *imag_in, void *real_out, void *imag_out, dsaXBLASParam param) { -#ifdef DSA_XENGINE_TARGET_CUDA - dsaXHgemmStridedBatchedCuda((half*)real_in, (half*)imag_in, (half*)real_out, (half*)imag_out, param); -#else - std::cout "Not implemented" << std::endl; - exit(0); -#endif -} diff --git a/src/dsaX_correlator.cpp b/src/dsaX_correlator.cpp index d2223f5..4611939 100644 --- a/src/dsaX_correlator.cpp +++ b/src/dsaX_correlator.cpp @@ -13,23 +13,22 @@ Workflow is similar for BF and corr applications #include "dsaX_blas_interface.h" #include "dsaX_utils.h" #include "dsaX_psrdada_utils.h" -#include "dsaX_cuda_interface.h" // correlator function // workflow: copy to device, reorder, stridedBatchedGemm, reorder -// DMH CUDA references excised +// DMH CUDA references excised. 
void dcorrelator(dmem *d) { // copy to device - dsaXmemcpyHostToDevice(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); + dsaXmemcpy(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2, dsaXMemcpyHostToDevice); // zero out output arrays - dsaXmemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half)); - dsaXmemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half)); + dsaXmemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short + dsaXmemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short dsaXmemset(d->d_output, 0, NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float)); - // reorder input - reorder_input_device(d->d_input, d->d_tx, d->d_r, d->d_i); + // reorder input into real and imaginary arrays of 2 byte data + reorderInput(d); dsaXBLASParam blas_param; // gemm settings @@ -51,9 +50,8 @@ void dcorrelator(dmem *d) { blas_param.batch_count = NCHAN_PER_PACKET*2*2*halfFac; // Perform GEMM accoring to back end configuration - dsaXHgemmStridedBatched(d->d_r, d->d_i, d->d_outr, d->d_outi, blas_param); + dsaXHgemmStridedBatched(d->d_r, d->d_i, d->d_r, d->d_i, d->d_outr, d->d_outi, blas_param); // reorder output data - reorder_output_device(d); - + reorderOutput(d); } diff --git a/src/dsaX_cublas_interface.cu b/src/dsaX_cublas_interface.cu index df6b3de..17a2c9b 100644 --- a/src/dsaX_cublas_interface.cu +++ b/src/dsaX_cublas_interface.cu @@ -1,9 +1,10 @@ #include -#include "dsaX_cublas_interface.h" +#include "dsaX.h" +#include "dsaX_cuda_headers.h" using namespace std; -void dsaXHgemmStridedBatchedCuda(half *real_in, half *imag_in, half *real_out, half *imag_out, dsaXBLASParam blas_param) { +void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam blas_param) { #ifdef DSA_XENGINE_TARGET_CUDA // not sure if essential @@ -45,51 +46,60 @@ void 
dsaXHgemmStridedBatchedCuda(half *real_in, half *imag_in, half *real_out, h const int n = blas_param.n; const int k = blas_param.k; const half alpha = blas_param.alpha.real(); - const half malpha = -1.0 * blas_param.alpha.real(); + const half malpha = (-1.0 * blas_param.alpha.real()); const int lda = blas_param.lda; const int ldb = blas_param.ldb; const half beta0 = blas_param.beta.real(); const half beta1 = 1.0; const int ldc = blas_param.ldc; + const long long int a_offset = blas_param.a_offset; + const long long int b_offset = blas_param.b_offset; + const long long int c_offset = blas_param.c_offset; const long long int strideA = blas_param.a_stride; const long long int strideB = blas_param.b_stride; const long long int strideC = blas_param.c_stride; const int batchCount = blas_param.batch_count; - // run strided batched gemm for datatype (a + ib)(c + id) + // Run strided batched gemm for datatype + // (a + ib)(c + id) = (ac - bd) + i(bc + ad) + // on matrices alpha * op(A) * op(B) + beta * C + // where op(M) is defined by the transposition variable + // cublasOperation_t transM + + // Accumulate results into C matrix // ac - cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, - &alpha,real_in,lda,strideA, - real_in,ldb,strideB,&beta0, - real_out,ldc,strideC, + cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &alpha, + (half *)real_a + a_offset, lda, strideA, + (half *)real_b + b_offset, ldb, strideB, &beta0, + (half *)real_c + c_offset, ldc, strideC, batchCount); - // bd - cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, - &alpha,imag_in,lda,strideA, - imag_in,ldb,strideB,&beta1, - real_out,ldc,strideC, + // -bd + cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &malpha, + (half*)imag_a + a_offset, lda, strideA, + (half*)imag_b + b_offset, ldb, strideB, &beta1, + (half*)real_c + c_offset, ldc, strideC, batchCount); - // -bc - cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, - &malpha,imag_in,lda,strideA, - 
real_in,ldb,strideB,&beta0, - imag_out,ldc,strideC, + // bc + cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &alpha, + (half*)imag_a + a_offset, lda, strideA, + (half*)real_b + b_offset, ldb, strideB, &beta0, + (half*)imag_c + c_offset, ldc, strideC, batchCount); // ad - cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, - &alpha,real_in,lda,strideA, - imag_in,ldb,strideB,&beta1, - imag_out,ldc,strideC, + cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &alpha, + (half*)real_a + a_offset, lda, strideA, + (half*)imag_b + b_offset, ldb, strideB, &beta1, + (half*)imag_c + c_offset, ldc, strideC, batchCount); - + // shown to be essential cudaDeviceSynchronize(); - + // destroy stream cudaStreamDestroy(stream); cublasDestroy(cublasH); #else - std::cout "Not implemented" << std::endl; + std::cout "dsaX not built with CUDA target." << std::endl; exit(0); #endif } diff --git a/src/dsaX_cuda_interface.cu b/src/dsaX_cuda_interface.cu index d1f77a4..8eda8ae 100644 --- a/src/dsaX_cuda_interface.cu +++ b/src/dsaX_cuda_interface.cu @@ -1,7 +1,14 @@ +#include +#include + +#include "dsaX_cuda_headers.h" #include "dsaX_cuda_interface.h" +#include "dsaX_cuda_kernels.h" + +using namespace std; // allocate device memory -void initialize_device_memory(dmem *d, int bf) { +void initializeCudaMemory(dmem *d, int bf) { // for correlator if (bf==0) { @@ -46,7 +53,7 @@ void initialize_device_memory(dmem *d, int bf) { } } // deallocate device memory -void deallocate_device_memory(dmem *d, int bf) { +void deallocateCudaMemory(dmem *d, int bf) { cudaFree(d->d_input); @@ -83,13 +90,13 @@ void deallocate_device_memory(dmem *d, int bf) { // the corr matrices are column major order // output needs to be [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex] // start with transpose to get [NANTS*NANTS, NCHAN_PER_PACKET*2*2], then sum into output using kernel -void reorder_output_device(dmem * d) { +void reorderOutputCuda(dmem * d) { // transpose input data - dim3 dimBlock(32, 8), 
dimGrid((NANTS*NANTS)/32,(NCHAN_PER_PACKET*2*2*halfFac)/32); - transpose_matrix<<>>(d->d_outr,d->d_tx_outr); - transpose_matrix<<>>(d->d_outi,d->d_tx_outi); - + dim3 dimBlock(32, 8), dimGrid((NANTS*NANTS)/32, (NCHAN_PER_PACKET*2*2*halfFac)/32); + transpose_matrix<<>>((half*)d->d_outr, (half*)d->d_tx_outr); + transpose_matrix<<>>((half*)d->d_outi, (half*)d->d_tx_outi); + // look at output /*char * odata = (char *)malloc(sizeof(char)*384*4*NANTS*NANTS*2*halfFac); cudaMemcpy(odata,d->d_tx_outr,384*4*NANTS*NANTS*2*halfFac,cudaMemcpyDeviceToHost); @@ -140,8 +147,8 @@ void reorder_output_device(dmem * d) { cudaMemcpy(d_idxs,h_idxs,sizeof(int)*NBASE,cudaMemcpyHostToDevice); // run kernel to finish things - corr_output_copy<<>>(d->d_tx_outr,d->d_tx_outi,d->d_output,d_idxs); - + corr_output_copy<<>>((half*)d->d_tx_outr, (half*)d->d_tx_outi, d->d_output, d_idxs); + /*char * odata = (char *)malloc(sizeof(char)*384*4*NBASE*4); cudaMemcpy(odata,d->d_output,384*4*NBASE*4,cudaMemcpyDeviceToHost); FILE *fout; @@ -154,70 +161,7 @@ void reorder_output_device(dmem * d) { //cudaStreamDestroy(stream); } -// kernel to fluff input -// run with 128 threads and NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 blocks -__global__ void corr_input_copy(char *input, half *inr, half *ini) { - - int bidx = blockIdx.x; // assume NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 - int tidx = threadIdx.x; // assume 128 threads per block - int iidx = bidx*128+tidx; - - // 15 in unsigned char binary is 00001111. Perform bitwise & on 15 and input char data iiiirrrr - // to get real part 4 bit data. - // 0000rrrr - // Bit shift this result by 4 to the left. - // rrrr0000 - // Cast to signed char. - // +-rrr0000 - // Bitshift mantisa only to the right by 4 bits - // +-0000rrr - // Cast to float and use CUDA intrinsic to cast to signed half - inr[iidx] = __float2half((float)((char)(( (unsigned char)(input[iidx]) & (unsigned char)(15) ) << 4) >> 4)); - - // 240 in unsigned char binary is 11110000. 
Perform bitwise & on 240 and input char data iiiirrrr - // to get imag part 4 bit data - // iiii0000. - // Cast to signed char - // +-iii0000 - // Bitshift mantisa only to the right by 4 bits - // +-0000iii - // Cast to float and use CUDA intrinsic to cast to signed half - ini[iidx] = __float2half((float)((char)(( (unsigned char)(input[iidx]) & (unsigned char)(240) )) >> 4)); - - // Both results should be half (FP16) integers between -8 and 7. - half re = inr[iidx]; - half im = ini[iidx]; - half lim = 2.; - if( (re > lim || re < -lim) || (im > lim || im < -lim)) { - //printf("re = %f, im = %f\n", __half2float(re), __half2float(im)); - } -} - -// transpose kernel -// assume breakdown into tiles of 32x32, and run with 32x8 threads per block -// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32) -// here, width is the dimension of the fastest index -template __global__ void transpose_matrix(in_prec * idata, out_prec * odata) { - - __shared__ in_prec tile[32][33]; - - int x = blockIdx.x * 32 + threadIdx.x; - int y = blockIdx.y * 32 + threadIdx.y; - int width = gridDim.x * 32; - - for (int j = 0; j < 32; j += 8) - tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; - - __syncthreads(); - - x = blockIdx.y * 32 + threadIdx.x; // transpose block offset - y = blockIdx.x * 32 + threadIdx.y; - width = gridDim.y * 32; - for (int j = 0; j < 32; j += 8) - odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; - -} // function to copy and reorder d_input to d_r and d_i @@ -225,42 +169,14 @@ template __global__ void transpose_matrix( // output is [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS] // starts by running transpose on [NPACKETS_PER_BLOCK * NANTS, NCHAN_PER_PACKET * 2 * 2] matrix in doubleComplex form. 
// then fluffs using simple kernel -void reorder_input_device(char *input, char * tx, half *inr, half *ini) { - +void reorderInputCuda(dmem *d) { + // transpose input data dim3 dimBlock(32, 8), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32); - transpose_matrix<<>>(input, tx); - corr_input_copy<<>>(tx, inr, ini); + transpose_matrix<<>>(d->d_input, d->d_tx); + corr_input_copy<<>>(d->d_tx, (half*)d->d_r, (half*)d->d_i); } -// kernel to help with reordering output -// outr and outi are [NANTS, NANTS, NCHAN_PER_PACKET, 2time, 2pol, halfFac] -// run with NCHAN_PER_PACKET*2*NBASE/128 blocks of 128 threads -__global__ void corr_output_copy(half *outr, half *outi, float *output, int *indices_lookup) { - - int bidx = blockIdx.x; // assume NCHAN_PER_PACKET*2*NBASE/128 - int tidx = threadIdx.x; // assume 128 - int idx = bidx*128+tidx; - - int baseline = (int)(idx / (NCHAN_PER_PACKET * 2)); - int chpol = (int)(idx % (NCHAN_PER_PACKET * 2)); - int ch = (int)(chpol / 2); - int base_idx = indices_lookup[baseline]; - int iidx = base_idx * NCHAN_PER_PACKET + ch; - int pol = (int)(chpol % 2); - - float v1=0., v2=0.; - - // Use CUDA casting intrinsic __half2float - for (int i=0;i &dim_block_in, + std::vector &dim_grid_in) { - x = blockIdx.y * 16 + threadIdx.x; // transpose block offset - y = blockIdx.x * 16 + threadIdx.y; - width = gridDim.y * 16; - - for (int j = 0; j < 16; j += 8) { - odata[4*((y+j)*width + x)] = tile[threadIdx.x][threadIdx.y + j][0]; - odata[4*((y+j)*width + x)+1] = tile[threadIdx.x][threadIdx.y + j][1]; - odata[4*((y+j)*width + x)+2] = tile[threadIdx.x][threadIdx.y + j][2]; - odata[4*((y+j)*width + x)+3] = tile[threadIdx.x][threadIdx.y + j][3]; - } + // Create CUDA objects for launch + dim3 dim_block(dim_block_in[0], dim_block_in[1]); + dim3 dim_grid(dim_grid_in[0], dim_grid_in[1]); + // Launch kernel + transpose_input_beamformer<<>>(idata, odata); } -// kernel to populate an instance of weights matrix [2, (NCHAN_PER_PACKET/8), NBEAMS/2, 
4times*(NANTS/2)*8chan*2tim*2pol] -// run with 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128 blocks of 128 threads -__global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs) { - - int bidx = blockIdx.x; - int tidx = threadIdx.x; - int inidx = bidx*128+tidx; - - // 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2) - - // get indices - int iArm = (int)(inidx / ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2))); - int iidx = (int)(inidx % ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2))); - int fq = (int)(iidx / (128*(NANTS/2)*(NBEAMS/2))); - int idx = (int)(iidx % (128*(NANTS/2)*(NBEAMS/2))); - int bm = (int)(idx / (128*(NANTS/2))); - int tactp = (int)(idx % (128*(NANTS/2))); - //int t = (int)(tactp / (32*(NANTS/2))); - int actp = (int)(tactp % (32*(NANTS/2))); - int a = (int)(actp / 32); - int ctp = (int)(actp % 32); - //int c = (int)(ctp / 4); - int tp = (int)(ctp % 4); - //int t2 = (int)(tp / 2); - int pol = (int)(tp % 2); - int widx = (a+48*iArm)*(NCHAN_PER_PACKET/8)*2*2 + fq*2*2 + pol*2; - - // calculate weights - float theta, afac, twr, twi; - if (iArm==0) { - theta = sep*(127.-bm*1.)*PI/10800.; // radians - afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate - twr = cos(afac*antpos_e[a+48*iArm]); - twi = sin(afac*antpos_e[a+48*iArm]); - wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1])); - wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1])); - //wr[inidx] = __float2half(calibs[widx]); - //wi[inidx] = __float2half(calibs[widx+1]); - } - if (iArm==1) { - theta = sep*(127.-bm*1.)*PI/10800.; // radians - afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate - twr = cos(afac*antpos_n[a+48*iArm]); - twi = sin(afac*antpos_n[a+48*iArm]); - wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1])); - wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1])); - //wr[inidx] = __float2half(calibs[widx]); - //wi[inidx] = __float2half(calibs[widx+1]); - } - 
-} // GPU-powered function to populate weights matrix for beamformer // file format: // sequential pairs of eastings and northings // then [NANTS, 48, R/I] calibs -void calc_weights(dmem *d) { +void calcWeightsCuda(dmem *d) { // allocate float *antpos_e = (float *)malloc(sizeof(float)*NANTS); @@ -405,7 +251,7 @@ void calc_weights(dmem *d) { cudaMemcpy(d_calibs,calibs,NANTS*(NCHAN_PER_PACKET/8)*2*2*sizeof(float),cudaMemcpyHostToDevice); // run kernel to populate weights matrix - populate_weights_matrix<<<2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128,128>>>(d_antpos_e,d_antpos_n,d_calibs,d->weights_r,d->weights_i,d->d_freqs); + populate_weights_matrix<<<2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128, 128>>>(d_antpos_e, d_antpos_n, d_calibs, (half*)d->weights_r, (half*)d->weights_i, d->d_freqs); // free stuff cudaFree(d_antpos_e); @@ -419,83 +265,61 @@ void calc_weights(dmem *d) { // kernel to fluff input bf data // run with NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 blocks of 128 threads -__global__ void fluff_input_bf(char * input, half * dr, half * di) { - - int bidx = blockIdx.x; // assume NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 - int tidx = threadIdx.x; // assume 128 - int idx = bidx*128+tidx; +void fluffInputBeamformerCuda(char *input, void *b_real, void *b_imag, int blocks, int tpb) { - dr[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(15)) << 4) >> 4))); - di[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(240))) >> 4))); - - // Both results should be half (FP16) integers between -8 and 7. 
- //half re = dr[idx]; - //half im = di[idx]; - //half lim = 0; - //if( (re > lim || re < -lim) || (im > lim || im < -lim)) { - //printf("re = %f, im = %f\n", __half2float(re), __half2float(im)); - //} - - + // Launch kernel + fluff_input_beamformer<<>>(input, (half*)b_real, (half*)b_imag); } // transpose, add and scale kernel for bf // assume breakdown into tiles of 16x16, and run with 16x8 threads per block // launch with dim3 dimBlock(16, 8) and dim3 dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16) // scf is a per-beam scale factor to enable recasting as unsigned char -__global__ void transpose_scale_bf(half * ir, half * ii, unsigned char * odata) { - - __shared__ float tile[16][17]; +void transposeScaleBeamformerCuda(void *ir, void *ii, unsigned char *odata, std::vector &dim_block_in, + std::vector &dim_grid_in) { - int x = blockIdx.x * 16 + threadIdx.x; - int y = blockIdx.y * 16 + threadIdx.y; - int width = gridDim.x * 16; - float dr, di; - - for (int j = 0; j < 16; j += 8) { - dr = (float)(ir[(y+j)*width + x]); - di = (float)(ii[(y+j)*width + x]); - tile[threadIdx.y+j][threadIdx.x] = (dr*dr+di*di); - } - - __syncthreads(); + // Create CUDA objects for launch + dim3 dim_block(dim_block_in[0], dim_block_in[1]); + dim3 dim_grid(dim_grid_in[0], dim_grid_in[1]); + + // Launch kernel + transpose_scale_beamformer<<>>((half*)ir, (half*)ii, odata); +} - x = blockIdx.y * 16 + threadIdx.x; // transpose block offset - y = blockIdx.x * 16 + threadIdx.y; - width = gridDim.y * 16; +// sum over all times in output beam array +// run with (NCHAN_PER_PACKET/8)*(NBEAMS/2) blocks of (NPACKETS_PER_BLOCK/4) threads +void sumBeamCuda(unsigned char *input, float *output, int blocks, int tpb) { - for (int j = 0; j < 16; j += 8) - odata[(y+j)*width + x] = (unsigned char)(tile[threadIdx.x][threadIdx.y + j]/128.); + // Launch kernel + sum_beam<<>>(input, output); +} +void dsaXmemsetCuda(void *array, int ch, size_t n){ + cudaMemset(array, ch, n); } -// sum over all 
times in output beam array -// run with (NCHAN_PER_PACKET/8)*(NBEAMS/2) blocks of (NPACKETS_PER_BLOCK/4) threads -__global__ void sum_beam(unsigned char * input, float * output) { - - __shared__ float summ[512]; - int bidx = blockIdx.x; - int tidx = threadIdx.x; - //int idx = bidx*256+tidx; - int bm = (int)(bidx/48); - int ch = (int)(bidx % 48); - - summ[tidx] = (float)(input[bm*256*48 + tidx*48 + ch]); - - __syncthreads(); - - if (tidx<256) { - summ[tidx] += summ[tidx+256]; - summ[tidx] += summ[tidx+128]; - summ[tidx] += summ[tidx+64]; - summ[tidx] += summ[tidx+32]; - summ[tidx] += summ[tidx+16]; - summ[tidx] += summ[tidx+8]; - summ[tidx] += summ[tidx+4]; - summ[tidx] += summ[tidx+2]; - summ[tidx] += summ[tidx+1]; +void dsaXmemcpyCuda(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind){ + cudaError error = cudaSuccess; + switch(kind) { + case dsaXMemcpyHostToHost: + error = cudaMemcpy(array_out, array_in, n, cudaMemcpyHostToHost); + break; + case dsaXMemcpyHostToDevice: + error = cudaMemcpy(array_out, array_in, n, cudaMemcpyHostToDevice); + break; + case dsaXMemcpyDeviceToHost: + error = cudaMemcpy(array_out, array_in, n, cudaMemcpyDeviceToHost); + break; + case dsaXMemcpyDeviceToDevice: + error = cudaMemcpy(array_out, array_in, n, cudaMemcpyDeviceToDevice); + break; + default: + std::cout << "dsaX error: unknown dsaXMemcpyKind" << std::endl; } + if(error != cudaSuccess) cudaGetLastError(); +} - if (tidx==0) output[bidx] = summ[tidx]; - +void dsaXDeviceSynchronizeCuda() { + cudaDeviceSynchronize(); } + diff --git a/src/dsaX_interface.cpp b/src/dsaX_interface.cpp new file mode 100644 index 0000000..c0c461c --- /dev/null +++ b/src/dsaX_interface.cpp @@ -0,0 +1,69 @@ +#include +#include +#include + +#include "dsaX_cuda_interface.h" +#include "dsaX_ftd.h" + +using namespace std; + +void dsaXCorrelator(void *output_data, void *input_data) { + dmem d; + int bf = 0; +#if DSA_XENGINE_TARGET_CUDA + initializeCudaMemory(&d, bf); + d.h_input = (char 
*)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); + memcpy(d.h_input, (char*)input_data, sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); + dcorrelator(&d); +#else + std::cout << "dsaX error: not implemented" << std::endl; +#endif +} + +void reorderInput(dmem *d) { +#if DSA_XENGINE_TARGET_CUDA + reorderInputCuda(d); +#else + std::cout << "dsaX error: not implemented" << std::endl; +#endif +} + +void reorderOutput(dmem *d) { +#if DSA_XENGINE_TARGET_CUDA + reorderOutputCuda(d); +#else + std::cout << "dsaX error: not implemented" << std::endl; +#endif +} + +void transposeInputBeamformer(double *input, double *output, std::vector &dimBlock, std::vector &dimGrid) { +#if DSA_XENGINE_TARGET_CUDA + transposeInputBeamformerCuda(input, output, dimBlock, dimGrid); +#else + std::cout << "dsaX error: not implemented" << std::endl; +#endif +} + +void transposeScaleBeamformer(void *real, void *imag, unsigned char *output, std::vector &dimBlock, std::vector &dimGrid) { +#if DSA_XENGINE_TARGET_CUDA + transposeScaleBeamformerCuda(real, imag, output, dimBlock, dimGrid); +#else + std::cout << "dsaX error: not implemented" << std::endl; +#endif +} + +void fluffInputBeamformer(char *input, void *array_real, void *array_imag, int blocks, int tpb) { +#if DSA_XENGINE_TARGET_CUDA + fluffInputBeamformerCuda(input, array_real, array_imag, blocks, tpb); +#else + std::cout << "dsaX error: not implemented" << std::endl; +#endif +} + +void sumBeam(unsigned char *input, float *output, int blocks, int tpb) { +#if DSA_XENGINE_TARGET_CUDA + sumBeamCuda(input, output, blocks, tpb); +#else + std::cout << "dsaX error: not implemented" << std::endl; +#endif +} diff --git a/src/dsaX_magma_interface.cu b/src/dsaX_magma_interface.cu new file mode 100644 index 0000000..8f86525 --- /dev/null +++ b/src/dsaX_magma_interface.cu @@ -0,0 +1,23 @@ +#include +#include "dsaX.h" +#include "dsaX_cuda_headers.h" + +#include "magma_v2.h" + +using namespace std; + +void 
dsaXHgemmStridedBatchedMagma(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam blas_param) { +#if defined (DSA_XENGINE_TARGET_CUDA) +#if defined (DSA_XENGINE_ENABLE_MAGMA) + + // TO DO + +#else + std::cout "dsaX not built with MAGMA. Rebuild with CMake param DSA_XENGINE_ENABLE_MAGMA=ON" << std::endl; + exit(0); +#endif +#else + std::cout "dsaX not built with CUDA target. Rebuild with CMake param DSA_XENGINE_TARGET_TYPE=CUDA" << std::endl; + exit(0); +#endif +} diff --git a/src/dsaX_psrdada_utils.cpp b/src/dsaX_psrdada_utils.cpp new file mode 100644 index 0000000..07c16e6 --- /dev/null +++ b/src/dsaX_psrdada_utils.cpp @@ -0,0 +1,11 @@ +#include "dsaX_psrdada_utils.h" + +void dsaX_dbgpu_cleanup(dada_hdu_t * in, dada_hdu_t * out) +{ + if (dada_hdu_unlock_read (in) < 0) syslog(LOG_ERR, "could not unlock read on hdu_in"); + dada_hdu_destroy (in); + + if (dada_hdu_unlock_write (out) < 0) syslog(LOG_ERR, "could not unlock write on hdu_out"); + dada_hdu_destroy (out); + +} diff --git a/src/dsaX_utils.cpp b/src/dsaX_utils.cpp index fc0345a..54e849a 100644 --- a/src/dsaX_utils.cpp +++ b/src/dsaX_utils.cpp @@ -1,39 +1,29 @@ #include "dsaX_utils.h" -#ifdef DSA_XENGINE_TARGET_CUDA -#include "dsaX_cuda_headers.h" -#endif +#include "dsaX_enums.h" +#include "dsaX_cuda_interface.h" void dsaXmemset(void *array, int ch, size_t n){ #ifdef DSA_XENGINE_TARGET_CUDA - cudaMemset(array, ch, n); + dsaXmemsetCuda(array, ch, n); #else - emset(array, ch, n); + memset(array, ch, n); #endif } -void dsaXmemcpyHostToDevice(void *array_device, void *array_host, size_t n){ +void dsaXmemcpy(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind){ #ifdef DSA_XENGINE_TARGET_CUDA // Perform host to device memcopy on data - cudaMemcpy(array_device, array_host, n, cudaMemcpyHostToDevice); + dsaXmemcpyCuda(array_out, array_in, n, kind); #else - memcpy(array_device, array_host, n); + memcpy(array_out, array_in, n); #endif } -void 
dsaXmemcpyDeviceToHost(void *array_host, void *array_device, size_t n){ +void dsaXDeviceSynchronize() { #ifdef DSA_XENGINE_TARGET_CUDA // Perform host to device memcopy on data - cudaMemcpy(array_host, array_device, n, cudaMemcpyDeviceToHost); -#else - memcpy(array_host, array_device, n); -#endif -} - -void dsaXmemcpyDeviceToDevice(void *array_copy_to, void *array_copy_from, size_t n){ -#ifdef DSA_XENGINE_TARGET_CUDA - // Perform device to device memcopy on data - cudaMemcpy(array_copy_to, array_copy_from, n, cudaMemcpyDeviceToDevice); -#else - memcpy(array_copy_to, array_copy_from, n); + dsaXDeviceSynchronizeCuda(); +#else + // NO OP #endif } diff --git a/src/version.cpp b/src/version.cpp new file mode 100644 index 0000000..1c8114b --- /dev/null +++ b/src/version.cpp @@ -0,0 +1,5 @@ +#ifdef GITVERSION +const char* gitversion = GITVERSION ; +#else +const char* gitversion; +#endif diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4a45a24..9320850 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,5 +1,6 @@ #DMH: fix include path -include_directories(../include) +include_directories(${CMAKE_SOURCE_DIR}/include) include_directories(${CLI11_SOURCE_DIR}/include/CLI) -add_executable(dsaX_correlator_test dsaX_correlator_test.cpp) +add_executable(dsaX_correlator_test dsaX_correlator_test.cpp) +target_link_libraries(dsaX_correlator_test dsax) diff --git a/tests/CMakeLists.txt~ b/tests/CMakeLists.txt~ deleted file mode 100644 index f72156b..0000000 --- a/tests/CMakeLists.txt~ +++ /dev/null @@ -1,5 +0,0 @@ - -#include_directories(../include) -include_directories(${CLI11_SOURCE_DIR}/src) -add_executable(dsaX_beamformer_correlator_test dsaX_beamformer_correlator_test.cpp) - diff --git a/tests/command_line_params.cpp b/tests/command_line_params.cpp new file mode 100644 index 0000000..c067ced --- /dev/null +++ b/tests/command_line_params.cpp @@ -0,0 +1,17 @@ +#include + +void usage() { + fprintf (stdout, + "dsaX_beamformer_correlator [options]\n" + " 
-c core bind process to CPU core [no default]\n" + " -d send debug messages to syslog\n" + " -i in_key [default REORDER_BLOCK_KEY]\n" + " -o out_key [default XGPU_BLOCK_KEY]\n" + " -b run beamformer [default is to run correlator]\n" + " -h print usage\n" + " -t binary file for test mode\n" + " -f flagants file\n" + " -a calib file\n" + " -s start frequency (assumes -0.244140625MHz BW)\n"); +} + diff --git a/tests/dsaX_beamformer_correlator_test.cpp~ b/tests/dsaX_beamformer_correlator_test.cpp~ deleted file mode 100644 index 30184b3..0000000 --- a/tests/dsaX_beamformer_correlator_test.cpp~ +++ /dev/null @@ -1,398 +0,0 @@ -#include -#include -#include -#include -#include - -// Include the dsaX.h header in your application -//#include - -int main(int argc, char **argv) { - - // startup syslog message - // using LOG_LOCAL0 - openlog ("dsaX_bfCorr", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0); - syslog (LOG_NOTICE, "Program started by User %d", getuid ()); - - // DADA Header plus Data Unit - dada_hdu_t* hdu_in = 0; - dada_hdu_t* hdu_out = 0; - - // data block HDU keys - key_t in_key = REORDER_BLOCK_KEY; - key_t out_key = XGPU_BLOCK_KEY; - - // command line arguments - int core = -1; - int arg = 0; - int bf = 0; - int test = 0; - char ftest[200], fflagants[200], fcalib[200]; - float sfreq = 1498.75; - - while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:bdh")) != -1) - { - switch (arg) - { - case 'c': - if (optarg) - { - core = atoi(optarg); - break; - } - else - { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'i': - if (optarg) - { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) - { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - 
break; - } - else - { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 't': - if (optarg) - { - test = 1; - syslog(LOG_INFO, "test mode"); - if (sscanf (optarg, "%s", &ftest) != 1) { - syslog(LOG_ERR, "could not read test file name from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-t flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'a': - if (optarg) - { - syslog(LOG_INFO, "read calib file %s",optarg); - if (sscanf (optarg, "%s", &fcalib) != 1) { - syslog(LOG_ERR, "could not read calib file name from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-a flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'f': - if (optarg) - { - syslog(LOG_INFO, "reading flag ants file %s",optarg); - if (sscanf (optarg, "%s", &fflagants) != 1) { - syslog(LOG_ERR, "could not read flagants file name from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else - { - syslog(LOG_ERR,"-f flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 's': - if (optarg) - { - sfreq = atof(optarg); - syslog(LOG_INFO, "start freq %g",sfreq); - break; - } - else - { - syslog(LOG_ERR,"-s flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'd': - //DEBUG=1; - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - case 'b': - bf=1; - syslog (LOG_NOTICE, "Running beamformer, NOT correlator"); - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } - - // Bind to cpu core - if (core >= 0) { - if (dada_bind_thread_to_core(core) < 0) - syslog(LOG_ERR,"failed to bind to core %d", core); - syslog(LOG_NOTICE,"bound to core %d", core); - } - - /* - // allocate device memory - dmem d; - initialize_device_memory(&d,bf); - - // set up for beamformer - FILE *ff; - int iii; - if (bf) { - - if (!(ff=fopen(fflagants,"r"))) { - syslog(LOG_ERR,"could not open flagants file\n"); - exit(1); - } - d.nflags=0; 
- while (!feof(ff)) { - fscanf(ff,"%d\n",&d.flagants[iii]); - d.nflags++; - } - fclose(ff); - - if (!(ff=fopen(fcalib,"rb"))) { - syslog(LOG_ERR,"could not open calibss file\n"); - exit(1); - } - fread(d.h_winp,NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2,4,ff); - fclose(ff); - - for (iii=0;iii<(NCHAN_PER_PACKET/8);iii++) - d.h_freqs[iii] = 1e6*(sfreq-iii*250./1024.); - cudaMemcpy(d.d_freqs,d.h_freqs,sizeof(float)*(NCHAN_PER_PACKET/8),cudaMemcpyHostToDevice); - - // calculate weights - calc_weights(&d); - - } - - // test mode - FILE *fin, *fout; - uint64_t output_size; - char * output_data;//, * o1; - if (test) { - - // read one block of input data - d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); - for (int i=0;i<512;i++) { - fin = fopen(ftest,"rb"); - fread(d.h_input+i*4*NANTS*NCHAN_PER_PACKET*2*2,4*NANTS*NCHAN_PER_PACKET*2*2,1,fin); - fclose(fin); - } - - // run correlator or beamformer, and output data - if (bf==0) { - if (DEBUG) syslog(LOG_INFO,"run correlator"); - dcorrelator(&d); - if (DEBUG) syslog(LOG_INFO,"copy to host"); - output_size = NBASE*NCHAN_PER_PACKET*2*2*4; - output_data = (char *)malloc(output_size); - cudaMemcpy(output_data,d.d_output,output_size,cudaMemcpyDeviceToHost); - - fout = fopen("output.dat","wb"); - fwrite((float *)output_data,sizeof(float),NBASE*NCHAN_PER_PACKET*2*2,fout); - fclose(fout); - } - else { - if (DEBUG) syslog(LOG_INFO,"run beamformer"); - dbeamformer(&d); - if (DEBUG) syslog(LOG_INFO,"copy to host"); - output_size = (NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS; - output_data = (char *)malloc(output_size); - cudaMemcpy(output_data,d.d_bigpower,output_size,cudaMemcpyDeviceToHost); - - // output_size = 2*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8); - // o1 = (char *)malloc(output_size); - // cudaMemcpy(o1,d.weights_r,output_size,cudaMemcpyDeviceToHost); - - - - fout = fopen("output.dat","wb"); - fwrite((unsigned char *)output_data,sizeof(unsigned char),output_size,fout); - 
//fwrite(o1,1,output_size,fout); - fclose(fout); - } - - - // free - free(d.h_input); - free(output_data); - //free(o1); - deallocate_device_memory(&d,bf); - - exit(1); - } - - - - - // DADA stuff - - syslog (LOG_INFO, "creating in and out hdus"); - - hdu_in = dada_hdu_create (0); - dada_hdu_set_key (hdu_in, in_key); - if (dada_hdu_connect (hdu_in) < 0) { - syslog (LOG_ERR,"could not connect to dada buffer in"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_read (hdu_in) < 0) { - syslog (LOG_ERR,"could not lock to dada buffer in"); - return EXIT_FAILURE; - } - - hdu_out = dada_hdu_create (0); - dada_hdu_set_key (hdu_out, out_key); - if (dada_hdu_connect (hdu_out) < 0) { - syslog (LOG_ERR,"could not connect to output buffer"); - return EXIT_FAILURE; - } - if (dada_hdu_lock_write(hdu_out) < 0) { - syslog (LOG_ERR, "could not lock to output buffer"); - return EXIT_FAILURE; - } - - uint64_t header_size = 0; - - // deal with headers - char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size); - if (!header_in) - { - syslog(LOG_ERR, "could not read next header"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - if (ipcbuf_mark_cleared (hdu_in->header_block) < 0) - { - syslog (LOG_ERR, "could not mark header block cleared"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - char * header_out = ipcbuf_get_next_write (hdu_out->header_block); - if (!header_out) - { - syslog(LOG_ERR, "could not get next header block [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - memcpy (header_out, header_in, header_size); - if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0) - { - syslog (LOG_ERR, "could not mark header block filled [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state"); - - // get block sizes and allocate memory - uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) 
hdu_in->data_block); - uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); - if (bf==0) - syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,NBASE*NCHAN_PER_PACKET*2*2*4); - else - syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS); - uint64_t bytes_read = 0; - //char * block; - char * output_buffer; - output_buffer = (char *)malloc(block_out); - uint64_t written, block_id; - - // get things started - bool observation_complete=0; - //bool started = 0; - syslog(LOG_INFO, "starting observation"); - int blocks = 0; - //clock_t begin, end; - //double time_spent; - - while (!observation_complete) { - - if (DEBUG) syslog(LOG_INFO,"reading block"); - - // open block - d.h_input = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id); - - // do stuff - //begin = clock(); - // loop - if (bf==0) { - if (DEBUG) syslog(LOG_INFO,"run correlator"); - dcorrelator(&d); - if (DEBUG) syslog(LOG_INFO,"copy to host"); - cudaMemcpy(output_buffer,d.d_output,block_out,cudaMemcpyDeviceToHost); - } - else { - if (DEBUG) syslog(LOG_INFO,"run beamformer"); - dbeamformer(&d); - if (DEBUG) syslog(LOG_INFO,"copy to host"); - cudaMemcpy(output_buffer,d.d_bigpower,block_out,cudaMemcpyDeviceToHost); - } - //end = clock(); - //time_spent = (double)(end - begin) / CLOCKS_PER_SEC; - cout << "spent time " << d.cp << " " << d.prep << " " << d.cubl << " " << d.outp << " s" << endl; - - // write to output - - // write to host - written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out); - if (written < block_out) - { - syslog(LOG_ERR, "main: failed to write all data to datablock [output]"); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - return EXIT_FAILURE; - } - - if (DEBUG) 
syslog(LOG_INFO, "written block %d",blocks); - blocks++; - // loop end - - - // finish up - if (bytes_read < block_size) - observation_complete = 1; - - ipcio_close_block_read (hdu_in->data_block, bytes_read); - - } - - // finish up - free(output_buffer); - deallocate_device_memory(&d,bf); - dsaX_dbgpu_cleanup (hdu_in, hdu_out); - - return 0; - */ -} diff --git a/tests/dsaX_correlator_test.cpp b/tests/dsaX_correlator_test.cpp index b0560fc..a1d96c5 100644 --- a/tests/dsaX_correlator_test.cpp +++ b/tests/dsaX_correlator_test.cpp @@ -7,8 +7,8 @@ #include #include -// Include the dsaX_interface.h header in your application -#include +// Include the dsaX.h header in your application +#include using namespace std; @@ -110,12 +110,11 @@ int main(int argc, char **argv) { return EXIT_FAILURE; } break; - } else - { - syslog(LOG_ERR,"-f flag requires argument"); - usage(); - return EXIT_FAILURE; - } + } else { + syslog(LOG_ERR,"-f flag requires argument"); + usage(); + return EXIT_FAILURE; + } case 's': if (optarg) { sfreq = atof(optarg); @@ -151,45 +150,42 @@ int main(int argc, char **argv) { std::cout << "Expected size of data array = " << (unsigned long long)(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*NPOL*NCOMPLEX) << std::endl; std::cout << "Expected size of input array = " << (unsigned long long)(sizeof(char)*4*NANTS*NCHAN_PER_PACKET*NPOL*NCOMPLEX) << std::endl; -#if 0 - dsaX_init(); - - // allocate device memory - dmem d; - initialize_device_memory(&d, bf); - + //dsaX_init(); FILE *fin, *fout; - uint64_t output_size; - char * output_data; + std::cout << "Creating float output_array of size " << sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2*4 << std::endl; + uint64_t output_size = sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2*4; + std::cout << "Creating char input_array of size " << sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2 << std::endl; + uint64_t input_size = sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2; + float *output_data = (float 
*)malloc(output_size); + char *input_data = (char *)malloc(input_size); + // read one block of input data - d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); for (int i=0;i<512;i++) { fin = fopen(ftest,"rb"); - fread(d.h_input+i*4*NANTS*NCHAN_PER_PACKET*2*2,4*NANTS*NCHAN_PER_PACKET*2*2,1,fin); + fread(input_data + i*4*NANTS*NCHAN_PER_PACKET*2*2, 4*NANTS*NCHAN_PER_PACKET*2*2, 1, fin); fclose(fin); } + + // Peek at input data (delete after development is complete) + for (int i=0; i<10; i++) if(input_data[i] != 0) std::cout << "input[" << i <<"] = " << (float)input_data[i] << std::endl; - // run correlator or beamformer, and output data + // run correlator and record output data syslog(LOG_INFO,"run correlator"); - dcorrelator(&d); - syslog(LOG_INFO,"copy to host"); - output_size = NBASE*NCHAN_PER_PACKET*2*2*4; - output_data = (char *)malloc(output_size); - cudaMemcpy(output_data,d.d_output,output_size,cudaMemcpyDeviceToHost); + dsaXCorrelator((void*)output_data, (void*)input_data); + + // Peek at output data (delete after development is complete) + //for (int i=0; i Date: Tue, 25 Jun 2024 22:57:18 -0700 Subject: [PATCH 21/30] build tweaks --- CMakeLists.txt | 2 +- include/dsaX_cuda_headers.h | 2 ++ include/dsaX_magma_headers.h | 5 +++++ src/dsaX_magma_interface.cu | 7 +++---- tests/dsaX_correlator_test.cpp | 2 +- 5 files changed, 12 insertions(+), 6 deletions(-) create mode 100644 include/dsaX_magma_headers.h diff --git a/CMakeLists.txt b/CMakeLists.txt index acfd1a3..e3cf1b0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -242,7 +242,7 @@ if(DSA_XENGINE_DOWNLOAD_PSRDADA) FetchContent_MakeAvailable(PSRDada) else() # Find and link to local install - find_package(psrdada REQUIRED) + find_package(PSRDada REQUIRED) endif() # Get CLI11 dependency diff --git a/include/dsaX_cuda_headers.h b/include/dsaX_cuda_headers.h index acc838d..333a5bc 100644 --- a/include/dsaX_cuda_headers.h +++ b/include/dsaX_cuda_headers.h @@ -1,6 +1,8 @@ 
#pragma once +#if defined (DSA_XENGINE_TARGET_CUDA) #include #include "cuda_fp16.h" #include #include +#endif diff --git a/include/dsaX_magma_headers.h b/include/dsaX_magma_headers.h new file mode 100644 index 0000000..e9750c8 --- /dev/null +++ b/include/dsaX_magma_headers.h @@ -0,0 +1,5 @@ +#pragma once + +#if defined (DSA_XENGINE_ENABLE_MAGMA) +#include "magma_v2.h" +#endif diff --git a/src/dsaX_magma_interface.cu b/src/dsaX_magma_interface.cu index 8f86525..14a8f4f 100644 --- a/src/dsaX_magma_interface.cu +++ b/src/dsaX_magma_interface.cu @@ -1,8 +1,7 @@ #include #include "dsaX.h" #include "dsaX_cuda_headers.h" - -#include "magma_v2.h" +#include "dsaX_magma_headers.h" using namespace std; @@ -13,11 +12,11 @@ void dsaXHgemmStridedBatchedMagma(void *real_a, void *imag_a, void *real_b, void // TO DO #else - std::cout "dsaX not built with MAGMA. Rebuild with CMake param DSA_XENGINE_ENABLE_MAGMA=ON" << std::endl; + std::cout << "dsaX not built with MAGMA. Rebuild with CMake param DSA_XENGINE_ENABLE_MAGMA=ON" << std::endl; exit(0); #endif #else - std::cout "dsaX not built with CUDA target. Rebuild with CMake param DSA_XENGINE_TARGET_TYPE=CUDA" << std::endl; + std::cout << "dsaX not built with CUDA target. 
Rebuild with CMake param DSA_XENGINE_TARGET_TYPE=CUDA" << std::endl; exit(0); #endif } diff --git a/tests/dsaX_correlator_test.cpp b/tests/dsaX_correlator_test.cpp index a1d96c5..e9d0192 100644 --- a/tests/dsaX_correlator_test.cpp +++ b/tests/dsaX_correlator_test.cpp @@ -168,7 +168,7 @@ int main(int argc, char **argv) { } // Peek at input data (delete after development is complete) - for (int i=0; i<10; i++) if(input_data[i] != 0) std::cout << "input[" << i <<"] = " << (float)input_data[i] << std::endl; + //for (int i=0; i<10; i++) if(input_data[i] != 0) std::cout << "input[" << i <<"] = " << (float)input_data[i] << std::endl; // run correlator and record output data syslog(LOG_INFO,"run correlator"); From 95f751203a6f0bd9802da08e9927cc017482e4f1 Mon Sep 17 00:00:00 2001 From: cpviolator Date: Wed, 26 Jun 2024 16:54:34 -0700 Subject: [PATCH 22/30] fix bug in beamformer, add util to inspect char length data in test --- src/dsaX_beamformer.cpp | 2 +- tests/dsaX_correlator_test.cpp | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/dsaX_beamformer.cpp b/src/dsaX_beamformer.cpp index f395b0e..f82f677 100644 --- a/src/dsaX_beamformer.cpp +++ b/src/dsaX_beamformer.cpp @@ -105,7 +105,7 @@ void dbeamformer(dmem *d) { // Reuse dimBlock //DMH: Abstract kernel launch parameters dimGrid[0] = (NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16; - dimGrid[0] = (NCHAN_PER_PACKET/8)/16; + dimGrid[1] = (NCHAN_PER_PACKET/8)/16; begin = clock(); transposeScaleBeamformer(d->d_bigbeam_r, d->d_bigbeam_i, d->d_bigpower + iArm*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2), dimBlock, dimGrid); end = clock(); diff --git a/tests/dsaX_correlator_test.cpp b/tests/dsaX_correlator_test.cpp index a1d96c5..b705975 100644 --- a/tests/dsaX_correlator_test.cpp +++ b/tests/dsaX_correlator_test.cpp @@ -26,6 +26,13 @@ void usage() { " -s start frequency (assumes -0.244140625MHz BW)\n"); } +void inspectPackedData(char input) { + + std::cout << "vals = (" << (float)((char)(( 
(unsigned char)(input) & (unsigned char)(15) ) << 4) >> 4) << ","; + + std::cout << (float)((char)(( (unsigned char)(input) & (unsigned char)(240) )) >> 4) << ")" << std::endl; +} + int main(int argc, char **argv) { // data block HDU keys @@ -168,14 +175,14 @@ int main(int argc, char **argv) { } // Peek at input data (delete after development is complete) - for (int i=0; i<10; i++) if(input_data[i] != 0) std::cout << "input[" << i <<"] = " << (float)input_data[i] << std::endl; + //for (int i=0; i Date: Thu, 27 Jun 2024 18:54:33 -0700 Subject: [PATCH 23/30] New correlator function reproduces legacy implementation data. Added CLI for command line parsing --- include/dsaX.h | 8 + include/dsaX_cuda_interface.h | 2 +- include/dsaX_cuda_kernels.h | 73 +++++--- include/dsaX_enums.h | 3 +- legacy/Makefile | 4 +- legacy/dsaX_bfCorr.cu | 298 +++++++++++++++++++++----------- src/dsaX_correlator.cpp | 64 ++++++- src/dsaX_cublas_interface.cu | 110 ++++++++---- src/dsaX_cuda_interface.cu | 30 +++- src/dsaX_interface.cpp | 46 +++++ tests/CMakeLists.txt | 4 +- tests/command_line_params.cpp | 55 ++++-- tests/dsaX_correlator_test.cpp | 299 ++++++++++++++++++--------------- 13 files changed, 680 insertions(+), 316 deletions(-) diff --git a/include/dsaX.h b/include/dsaX.h index 699fe37..6083bb2 100644 --- a/include/dsaX.h +++ b/include/dsaX.h @@ -5,6 +5,8 @@ #include "dsaX_def.h" #include "dsaX_enums.h" +#define OLD_BLAS + // Structure that carries BLAS parameters typedef struct dsaXBLASParam_s { size_t struct_size; /**< Size of this struct in bytes. 
Used to ensure that the host application and DSA see the same struct*/ @@ -38,6 +40,8 @@ typedef struct dsaXBLASParam_s { } dsaXBLASParam; +void printDsaXBLASParam(const dsaXBLASParam param); + // required to prevent overflow in corr matrix multiply #define halfFac 4 @@ -76,6 +80,10 @@ typedef struct dmem { } dmem; +void dsaXInit(int device_ordinal = 0); + +void inspectPackedData(char input, int i, bool non_zero = false); + void dsaXCorrelator(void *output_data, void *input_data); void reorderOutput(dmem *d); diff --git a/include/dsaX_cuda_interface.h b/include/dsaX_cuda_interface.h index cee1581..d9f2278 100644 --- a/include/dsaX_cuda_interface.h +++ b/include/dsaX_cuda_interface.h @@ -6,7 +6,7 @@ #include "dsaX_enums.h" #include "dsaX.h" - +void dsaXInitCuda(int dev); void initializeCudaMemory(dmem *d, int bf); diff --git a/include/dsaX_cuda_kernels.h b/include/dsaX_cuda_kernels.h index db09baa..7fef077 100644 --- a/include/dsaX_cuda_kernels.h +++ b/include/dsaX_cuda_kernels.h @@ -2,6 +2,13 @@ #include "dsaX_cuda_headers.h" +__device__ void inspectPackedDataInKernel(char input, int i) { + float re = (float)((char)(( (unsigned char)(input) & (unsigned char)(15) ) << 4) >> 4); + float im = (float)((char)(( (unsigned char)(input) & (unsigned char)(240))) >> 4); + + if(re != 0 || im != 0) printf("val[%d] = (%f,%f)\n", i, re, im); +} + // KERNELS // DMH: Abstract hardcoded launch parameters __global__ void transpose_input_beamformer(double *idata, double *odata) { @@ -40,7 +47,7 @@ __global__ void corr_output_copy(half *outr, half *outi, float *output, int *ind int bidx = blockIdx.x; // assume NCHAN_PER_PACKET*2*NBASE/128 int tidx = threadIdx.x; // assume 128 - int idx = bidx*128+tidx; + int idx = blockDim.x * bidx + tidx; int baseline = (int)(idx / (NCHAN_PER_PACKET * 2)); int chpol = (int)(idx % (NCHAN_PER_PACKET * 2)); @@ -74,9 +81,11 @@ template __global__ void transpose_matrix( int y = blockIdx.y * 32 + threadIdx.y; int width = gridDim.x * 32; - for (int j = 0; j 
< 32; j += 8) - tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; - + for (int j = 0; j < 32; j += 8) { + tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; + //inspectPackedDataInKernel(idata[(y+j)*width + x], (y+j)*width + x); + } + __syncthreads(); x = blockIdx.y * 32 + threadIdx.x; // transpose block offset @@ -88,14 +97,43 @@ template __global__ void transpose_matrix( } +// transpose kernel +// assume breakdown into tiles of 32x32, and run with 32x8 threads per block +// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32) +// here, width is the dimension of the fastest index +__global__ void transpose_matrix_char(char * idata, char * odata) { + + __shared__ char tile[32][33]; + + int x = blockIdx.x * 32 + threadIdx.x; + int y = blockIdx.y * 32 + threadIdx.y; + int width = gridDim.x * 32; + + for (int j = 0; j < 32; j += 8) { + tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; + //inspectPackedDataInKernel(idata[(y+j)*width + x], (y+j)*width + x); + } + + __syncthreads(); + + x = blockIdx.y * 32 + threadIdx.x; // transpose block offset + y = blockIdx.x * 32 + threadIdx.y; + width = gridDim.y * 32; + + for (int j = 0; j < 32; j += 8) { + odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; + } +} + + // kernel to fluff input // run with 128 threads and NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 blocks __global__ void corr_input_copy(char *input, half *inr, half *ini) { - int bidx = blockIdx.x; // assume NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 - int tidx = threadIdx.x; // assume 128 threads per block - int iidx = bidx*128+tidx; - + int bidx = blockIdx.x; + int tidx = threadIdx.x; + int iidx = blockDim.x * bidx + tidx; + // 15 in unsigned char binary is 00001111. Perform bitwise & on 15 and input char data iiiirrrr // to get real part 4 bit data. 
// 0000rrrr @@ -118,23 +156,18 @@ __global__ void corr_input_copy(char *input, half *inr, half *ini) { // Cast to float and use CUDA intrinsic to cast to signed half ini[iidx] = __float2half((float)((char)(( (unsigned char)(input[iidx]) & (unsigned char)(240) )) >> 4)); - // Both results should be half (FP16) integers between -8 and 7. - half re = inr[iidx]; - half im = ini[iidx]; - half lim = 2.; - if( (re > lim || re < -lim) || (im > lim || im < -lim)) { - //printf("re = %f, im = %f\n", __half2float(re), __half2float(im)); - } + //if(__half2float(inr[iidx]) != 0 || __half2float(ini[iidx]) != 0) printf("corr_input_copy %i = (%f,%f)\n", iidx, __half2float(inr[iidx]), __half2float(ini[iidx])); } // kernel to populate an instance of weights matrix // [2, (NCHAN_PER_PACKET/8), NBEAMS/2, 4times*(NANTS/2)*8chan*2tim*2pol] // run with 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128 blocks of 128 threads +// TUNABLE __global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs) { int bidx = blockIdx.x; int tidx = threadIdx.x; - int inidx = bidx*128+tidx; + int inidx = 128 * bidx + tidx; // 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2) @@ -183,10 +216,10 @@ __global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, floa // run with NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 blocks of 128 threads __global__ void fluff_input_beamformer(char * input, half * dr, half * di) { - int bidx = blockIdx.x; // assume NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 - int tidx = threadIdx.x; // assume 128 - int idx = bidx*128+tidx; - + int bidx = blockIdx.x; + int tidx = threadIdx.x; + int idx = blockDim.x * bidx + tidx; + dr[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(15)) << 4) >> 4))); di[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(240))) >> 4))); diff --git a/include/dsaX_enums.h 
b/include/dsaX_enums.h index 30fe3c6..4e8351f 100644 --- a/include/dsaX_enums.h +++ b/include/dsaX_enums.h @@ -12,7 +12,8 @@ typedef enum dsaXError_t { typedef enum dsaXBLASOperation_s { DSA_BLAS_OP_N = 0, // No transpose DSA_BLAS_OP_T = 1, // Transpose only - DSA_BLAS_OP_C = 2, // Conjugate transpose + DSA_BLAS_OP_A = 2, // Adjoint imaginary, no transpose + DSA_BLAS_OP_C = 3, // Conjugate transpose DSA_BLAS_OP_INVALID = DSA_INVALID_ENUM } dsaXBLASOperation; diff --git a/legacy/Makefile b/legacy/Makefile index 0de1991..4cc2fee 100644 --- a/legacy/Makefile +++ b/legacy/Makefile @@ -4,13 +4,13 @@ CC=gcc CFLAGS1 = -g -O3 -Wall -pthread -march=native -I/usr/local/include -I/usr/local/include/src -I/usr/local/cfitsio-3.47/include/ -I/home/ubuntu/proj/dsa110-shell/dsa110-sigproc CDEPS1=dsaX_def.h dsaX_capture_manythread.h CDEPS2=dsaX_def.h dsaX_capture.h -LIBS = -L/usr/local/lib -lpsrdada -L/usr/lib/gcc/x86_64-linux-gnu/5 -lgfortran -L/usr/local/cuda/lib64 -lcudart -lcublas -lm -L/usr/local/cfitsio-3.47/lib -lcfitsio -lsigproc -lxgpu +LIBS = -L/usr/local/lib -lpsrdada -L/usr/lib/gcc/x86_64-linux-gnu/5 -lgfortran -L/usr/local/cuda/lib64 -lcudart -lcublas -lm -L/usr/local/cfitsio-3.47/lib #-lcfitsio -lsigproc -lxgpu #LIBS2 = -L/home/ubuntu/PF_RING/userland/libpcap-1.9.1 -lpcap #CDEPS3=dsaX_def.h dsaX_capture_pcap.h CCU=/usr/local/cuda/bin/nvcc -D CUDA -ccbin=g++ -CFLAGS2 = -I/home/ubuntu/proj/dsa110-shell/dsa110-xengine/src -I/home/ubuntu/proj/dsa110-shell/dsa110-xGPU/src -I/usr/local/include -I/usr/local/include/src -I/usr/local/cfitsio-3.47/include -I/home/ubuntu/proj/dsa110-shell/dsa110-sigproc -arch=sm_75 -O3 -Xcompiler="-pthread" -DMATRIX_ORDER_TRIANGULAR -std=c++14 +CFLAGS2 = -I/home/ubuntu/proj/dsa110-shell/dsa110-xengine/src -I/home/dmhowart/install/include/ -I/home/ubuntu/proj/dsa110-shell/dsa110-xGPU/src -I/usr/local/include -I/usr/local/include/src -I/usr/local/cfitsio-3.47/include -I/home/ubuntu/proj/dsa110-shell/dsa110-sigproc -arch=sm_75 -O3 
-Xcompiler="-pthread" -DMATRIX_ORDER_TRIANGULAR -std=c++14 -L/home/dmhowart/install/lib .DEFAULT_GOAL := all diff --git a/legacy/dsaX_bfCorr.cu b/legacy/dsaX_bfCorr.cu index 25b9262..265226b 100644 --- a/legacy/dsaX_bfCorr.cu +++ b/legacy/dsaX_bfCorr.cu @@ -45,13 +45,33 @@ using std::endl; #define sep 1.0 // arcmin /* global variables */ -int DEBUG = 1; +int DEBUG = 0; + +__device__ void inspectPackedDataInKernel(char input, int i) { + float re = (float)((char)(( (unsigned char)(input) & (unsigned char)(15) ) << 4) >> 4); + float im = (float)((char)(( (unsigned char)(input) & (unsigned char)(240))) >> 4); + + if(re != 0 || im != 0) printf("val[%d] = (%f,%f)\n", i, re, im); +} + +void inspectPackedData(char input, int i, bool non_zeros) { + float re = (float)((char)(( (unsigned char)(input) & (unsigned char)(15) ) << 4) >> 4); + float im = (float)((char)(( (unsigned char)(input) & (unsigned char)(240))) >> 4); + + if(non_zeros) { + if(re != 0 || im != 0) + std::cout << "val["<data_block; + + // ensure that the data blocks are SHM locked + if (ipcbuf_lock (db) < 0) + { + syslog(LOG_ERR,"dada_dbregister: ipcbuf_lock failed"); + return -1; + } + + size_t bufsz = db->sync->bufsz; + unsigned int flags = 0; + cudaError_t rval; + + // lock each data block buffer as cuda memory + uint64_t ibuf; + for (ibuf = 0; ibuf < db->sync->nbufs; ibuf++) + { + rval = cudaHostRegister ((void *) db->buffer[ibuf], bufsz, flags); + if (rval != cudaSuccess) + { + syslog(LOG_ERR,"dada_dbregister: cudaHostRegister failed"); + return -1; + } + } + + return 0; +} + // allocate device memory void initialize(dmem * d, int bf) { // for correlator if (bf==0) { + cudaMallocHost((void**)&d->h_pinned_input, sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); cudaMalloc((void **)(&d->d_input), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2); cudaMalloc((void **)(&d->d_i), 
sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2); @@ -94,6 +149,14 @@ void initialize(dmem * d, int bf) { cudaMalloc((void **)(&d->d_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); cudaMalloc((void **)(&d->d_tx_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); cudaMalloc((void **)(&d->d_tx_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); + + // timers + d->cp = 0.; + d->prep = 0.; + d->outp = 0.; + d->cubl = 0.; + + } // for beamformer @@ -141,6 +204,7 @@ void deallocate(dmem * d, int bf) { cudaFree(d->d_outi); cudaFree(d->d_tx_outr); cudaFree(d->d_tx_outi); + cudaFreeHost(d->h_pinned_input); } if (bf==1) { cudaFree(d->d_tx); @@ -195,7 +259,8 @@ fprintf (stdout, " -t binary file for test mode\n" " -f flagants file\n" " -a calib file\n" - " -s start frequency (assumes -0.244140625MHz BW)\n"); + " -s start frequency (assumes -0.244140625MHz BW)\n" + " -g observing DEC in degrees (default 71.66)\n"); } // kernel to fluff input @@ -209,6 +274,7 @@ __global__ void corr_input_copy(char *input, half *inr, half *ini) { inr[iidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(15)) << 4) >> 4)); ini[iidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4)); + //if(__half2float(inr[iidx]) != 0 || __half2float(ini[iidx]) != 0) printf("corr_input_copy %i = (%f,%f)\n", iidx, __half2float(inr[iidx]), __half2float(ini[iidx])); } @@ -224,18 +290,21 @@ __global__ void transpose_matrix_char(char * idata, char * odata) { int y = blockIdx.y * 32 + threadIdx.y; int width = gridDim.x * 32; - for (int j = 0; j < 32; j += 8) + for (int j = 0; j < 32; j += 8) { tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; - + //inspectPackedDataInKernel(idata[(y+j)*width + x], (y+j)*width + x); + } + __syncthreads(); x = blockIdx.y * 32 + threadIdx.x; // transpose block offset y = blockIdx.x * 32 + threadIdx.y; width = gridDim.y * 32; - for (int j = 0; j < 32; j += 8) + 
for (int j = 0; j < 32; j += 8) { odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; - + //inspectPackedDataInKernel(odata[(y+j)*width + x], (y+j)*width + x); + } } // arbitrary transpose kernel @@ -264,34 +333,8 @@ __global__ void transpose_matrix_float(half * idata, half * odata) { } -// arbitrary transpose kernel -// assume breakdown into tiles of 32x32, and run with 32x8 threads per block -// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32) -// here, width is the dimension of the fastest index -template __global__ void transpose_matrix_template(in_prec * idata, out_prec * odata) { - - __shared__ in_prec tile[32][33]; - - int x = blockIdx.x * 32 + threadIdx.x; - int y = blockIdx.y * 32 + threadIdx.y; - int width = gridDim.x * 32; - - for (int j = 0; j < 32; j += 8) - tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; - - __syncthreads(); - - x = blockIdx.y * 32 + threadIdx.x; // transpose block offset - y = blockIdx.x * 32 + threadIdx.y; - width = gridDim.y * 32; - - for (int j = 0; j < 32; j += 8) - odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; - -} - -// function to copy and reorder d_input to d_r and d_i +// function to copy amd reorder d_input to d_r and d_i // input is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] // output is [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS] // starts by running transpose on [NPACKETS_PER_BLOCK * NANTS, NCHAN_PER_PACKET * 2 * 2] matrix in doubleComplex form. 
@@ -300,7 +343,8 @@ void reorder_input(char *input, char * tx, half *inr, half *ini) { // transpose input data dim3 dimBlock(32, 8), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32); - transpose_matrix_char<<>>(input,tx); + transpose_matrix_char<<>>(input, tx); + // DMH good /* // set up for geam cublasHandle_t cublasH = NULL; @@ -452,21 +496,33 @@ void reorder_output(dmem * d) { // workflow: copy to device, reorder, stridedBatchedGemm, reorder void dcorrelator(dmem * d) { + // timing + // copy, prepare, cublas, output + clock_t begin, end; + // zero out output arrays cudaMemset(d->d_outr,0,NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half)); cudaMemset(d->d_outi,0,NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half)); cudaMemset(d->d_output,0,NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float)); - // copy to device + // copy to device + //memcpy(d->h_pinned_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); + begin = clock(); cudaMemcpy(d->d_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,cudaMemcpyHostToDevice); - + end = clock(); + d->cp += (float)(end - begin) / CLOCKS_PER_SEC; + // reorder input + begin = clock(); reorder_input(d->d_input,d->d_tx,d->d_r,d->d_i); - + // not sure if essential cudaDeviceSynchronize(); + end = clock(); + d->prep += (float)(end - begin) / CLOCKS_PER_SEC; // set up for gemm + begin = clock(); cublasHandle_t cublasH = NULL; cudaStream_t stream = NULL; cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); @@ -494,6 +550,10 @@ void dcorrelator(dmem * d) { const int batchCount = NCHAN_PER_PACKET*2*2*halfFac; // run strided batched gemm + // M^* M^T + // (a - ib)(a + ib)^T + // (aaT + bbT) + i(abT - bTa) + // ac cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k, &alpha,d->d_r,lda,strideA, @@ -521,13 +581,18 @@ void dcorrelator(dmem * d) { // shown to be essential cudaDeviceSynchronize(); + end = clock(); + d->cubl += (float)(end - begin) / CLOCKS_PER_SEC; // destroy stream 
cudaStreamDestroy(stream); cublasDestroy(cublasH); // reorder output data + begin = clock(); reorder_output(d); + end = clock(); + d->outp += (float)(end - begin) / CLOCKS_PER_SEC; } @@ -575,8 +640,8 @@ __global__ void fluff_input_bf(char * input, half * dr, half * di) { int tidx = threadIdx.x; // assume 128 int idx = bidx*128+tidx; - dr[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(15)) << 4) >> 4))); - di[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(240))) >> 4))); + dr[idx] = __float2half(0.035*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(15)) << 4) >> 4))); + di[idx] = __float2half(0.035*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(240))) >> 4))); } @@ -606,7 +671,7 @@ __global__ void transpose_scale_bf(half * ir, half * ii, unsigned char * odata) width = gridDim.y * 16; for (int j = 0; j < 16; j += 8) - odata[(y+j)*width + x] = (unsigned char)(tile[threadIdx.x][threadIdx.y + j]/128.); + odata[(y+j)*width + x] = (unsigned char)(tile[threadIdx.x][threadIdx.y + j]); } @@ -775,7 +840,7 @@ void dbeamformer(dmem * d) { // kernel to populate an instance of weights matrix [2, (NCHAN_PER_PACKET/8), NBEAMS/2, 4times*(NANTS/2)*8chan*2tim*2pol] // run with 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128 blocks of 128 threads -__global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs) { +__global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs, float dec) { int bidx = blockIdx.x; int tidx = threadIdx.x; @@ -813,7 +878,7 @@ __global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, floa //wi[inidx] = __float2half(calibs[widx+1]); } if (iArm==1) { - theta = sep*(127.-bm*1.)*PI/10800.; // radians + theta = sep*(127.-bm*1.)*PI/10800.-(PI/180.)*dec; // radians afac = -2.*PI*fqs[fq]*theta/CVAC; // 
factor for rotate twr = cos(afac*antpos_n[a+48*iArm]); twi = sin(afac*antpos_n[a+48*iArm]); @@ -845,8 +910,8 @@ void calc_weights(dmem * d) { // deal with antpos and calibs int iant, found; for (int i=0;ih_winp[2*i]; - antpos_n[i] = d->h_winp[2*i+1]; + antpos_e[i] = d->h_winp[i]; + antpos_n[i] = d->h_winp[i+NANTS]; } for (int i=0;i>>(d_antpos_e,d_antpos_n,d_calibs,d->weights_r,d->weights_i,d->d_freqs); + populate_weights_matrix<<<2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128,128>>>(d_antpos_e,d_antpos_n,d_calibs,d->weights_r,d->weights_i,d->d_freqs,37.23-(d->obsdec)); // free stuff cudaFree(d_antpos_e); @@ -892,7 +957,7 @@ void calc_weights(dmem * d) { // MAIN -int main (int argc, char *argv[]) { +int main (int argc, char *argv[]) { cudaSetDevice(1); @@ -914,11 +979,12 @@ int main (int argc, char *argv[]) { int arg = 0; int bf = 0; int test = 0; + float mydec = 71.66; char ftest[200], fflagants[200], fcalib[200]; float sfreq = 1498.75; - while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:bdh")) != -1) + while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:g:bdh")) != -1) { switch (arg) { @@ -1026,12 +1092,26 @@ int main (int argc, char *argv[]) { usage(); return EXIT_FAILURE; } + case 'g': + if (optarg) + { + mydec = atof(optarg); + syslog(LOG_INFO, "obs dec %g",mydec); + break; + } + else + { + syslog(LOG_ERR,"-g flag requires argument"); + usage(); + return EXIT_FAILURE; + } case 'd': DEBUG=1; syslog (LOG_DEBUG, "Will excrete all debug messages"); break; case 'b': bf=1; + cudaSetDevice(0); syslog (LOG_NOTICE, "Running beamformer, NOT correlator"); break; case 'h': @@ -1080,55 +1160,84 @@ int main (int argc, char *argv[]) { cudaMemcpy(d.d_freqs,d.h_freqs,sizeof(float)*(NCHAN_PER_PACKET/8),cudaMemcpyHostToDevice); // calculate weights + d.obsdec = mydec; calc_weights(&d); } // test mode FILE *fin, *fout; - uint64_t output_size; + uint64_t sz, output_size, in_block_size, rd_size; + in_block_size = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2; char * output_data, * o1; + int 
nreps = 1, nchunks = 1; if (test) { - // read one block of input data - d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); - for (int i=0;i<512;i++) { - fin = fopen(ftest,"rb"); - fread(d.h_input+i*4*NANTS*NCHAN_PER_PACKET*2*2,4*NANTS*NCHAN_PER_PACKET*2*2,1,fin); - fclose(fin); - } + // read one block of input data - // run correlator or beamformer, and output data - if (bf==0) { - if (DEBUG) syslog(LOG_INFO,"run correlator"); - dcorrelator(&d); - if (DEBUG) syslog(LOG_INFO,"copy to host"); - output_size = NBASE*NCHAN_PER_PACKET*2*2*4; - output_data = (char *)malloc(output_size); - cudaMemcpy(output_data,d.d_output,output_size,cudaMemcpyDeviceToHost); + // get size of file + fin=fopen(ftest,"rb"); + fseek(fin,0L,SEEK_END); + sz = ftell(fin); + rewind(fin); - fout = fopen("output.dat","wb"); - fwrite((float *)output_data,sizeof(float),NBASE*NCHAN_PER_PACKET*2*2,fout); - fclose(fout); + // figure out how many reps and chunks to read with + if (sz>in_block_size) { + nreps = (int)(sz/in_block_size); + rd_size = in_block_size; } else { - if (DEBUG) syslog(LOG_INFO,"run beamformer"); - dbeamformer(&d); - if (DEBUG) syslog(LOG_INFO,"copy to host"); - output_size = (NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS; - output_data = (char *)malloc(output_size); - cudaMemcpy(output_data,d.d_bigpower,output_size,cudaMemcpyDeviceToHost); + nchunks = (int)(in_block_size/sz); + rd_size = sz; + } - /*output_size = 2*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8); - o1 = (char *)malloc(output_size); - cudaMemcpy(o1,d.weights_r,output_size,cudaMemcpyDeviceToHost);*/ - - + // allocate input + d.h_input = (char *)malloc(sizeof(char)*in_block_size); + + std::cout << "Size of input = " << in_block_size << std::endl; + + // loop over reps and chunks + for (int reps=0; reps0) rewind(fin); + fread(d.h_input+chunks*rd_size,rd_size,1,fin); + + std::cout << "Input peek " << std::endl; + //for (int i=0; i<8; i++) inspectPackedData(d.h_input[i], i); + 
+ // run correlator or beamformer, and output data + if (bf==0) { + if (DEBUG) syslog(LOG_INFO,"run correlator"); + dcorrelator(&d); + if (DEBUG) syslog(LOG_INFO,"copy to host"); + output_size = NBASE*NCHAN_PER_PACKET*2*2*4; + output_data = (char *)malloc(output_size); + cudaMemcpy(output_data, d.d_output, output_size, cudaMemcpyDeviceToHost); + + std::cout << "Output peek " << std::endl; + for(int i=0; idata_block); uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block); - syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out); + syslog(LOG_INFO, "main: have input and output block sizes %d %d\n",block_size,block_out); if (bf==0) syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,NBASE*NCHAN_PER_PACKET*2*2*4); else @@ -1235,7 +1347,6 @@ int main (int argc, char *argv[]) { // do stuff //begin = clock(); - // loop if (bf==0) { if (DEBUG) syslog(LOG_INFO,"run correlator"); dcorrelator(&d); @@ -1250,11 +1361,10 @@ int main (int argc, char *argv[]) { } //end = clock(); //time_spent = (double)(end - begin) / CLOCKS_PER_SEC; - cout << "spent time " << d.cp << " " << d.prep << " " << d.cubl << " " << d.outp << " s" << endl; + //cout << "spent time " << d.cp << " " << d.prep << " " << d.cubl << " " << d.outp << " s" << endl; // write to output - - // write to host + written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out); if (written < block_out) { @@ -1265,13 +1375,13 @@ int main (int argc, char *argv[]) { if (DEBUG) syslog(LOG_INFO, "written block %d",blocks); blocks++; - // loop end + // finish up if (bytes_read < block_size) observation_complete = 1; - + ipcio_close_block_read (hdu_in->data_block, bytes_read); } diff --git a/src/dsaX_correlator.cpp b/src/dsaX_correlator.cpp index 4611939..7a6882c 100644 --- a/src/dsaX_correlator.cpp +++ b/src/dsaX_correlator.cpp @@ -18,23 +18,31 @@ Workflow is similar for BF and corr applications 
// workflow: copy to device, reorder, stridedBatchedGemm, reorder // DMH CUDA references excised. void dcorrelator(dmem *d) { - - // copy to device - dsaXmemcpy(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2, dsaXMemcpyHostToDevice); // zero out output arrays dsaXmemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short dsaXmemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short dsaXmemset(d->d_output, 0, NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float)); + + // copy to device + dsaXmemcpy(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2, dsaXMemcpyHostToDevice); // reorder input into real and imaginary arrays of 2 byte data reorderInput(d); - + dsaXBLASParam blas_param; + blas_param.struct_size = sizeof(blas_param); + blas_param.blas_type = DSA_BLAS_GEMM; + // gemm settings // input: [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS] - // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS] - blas_param.trans_a = DSA_BLAS_OP_N; + // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS] + +#if defined OLD_BLAS + std::cout << "Old params" << std::endl; + + blas_param.data_order = DSA_BLAS_DATAORDER_COL; + blas_param.trans_a = DSA_BLAS_OP_A; blas_param.trans_b = DSA_BLAS_OP_T; blas_param.m = NANTS; blas_param.n = NANTS; @@ -48,9 +56,53 @@ void dcorrelator(dmem *d) { blas_param.b_stride = NPACKETS_PER_BLOCK*NANTS/halfFac; blas_param.c_stride = NANTS*NANTS; blas_param.batch_count = NCHAN_PER_PACKET*2*2*halfFac; + blas_param.a_offset = 0; + blas_param.b_offset = 0; + blas_param.c_offset = 0; +#else + std::cout << "My params" << std::endl; + + blas_param.data_order = DSA_BLAS_DATAORDER_ROW; + blas_param.trans_a = DSA_BLAS_OP_C; + blas_param.trans_b = DSA_BLAS_OP_N; + blas_param.m = NANTS; + blas_param.n = NANTS; + blas_param.k = NPACKETS_PER_BLOCK/halfFac; + blas_param.alpha = 1.0; + blas_param.lda = blas_param.m; + blas_param.ldb = blas_param.n; 
+ blas_param.beta = 0.; + blas_param.ldc = blas_param.m; + blas_param.a_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;; + blas_param.b_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;; + blas_param.c_stride = NANTS*NANTS; + blas_param.batch_count = NCHAN_PER_PACKET*2*2*halfFac; + blas_param.a_offset = 0; + blas_param.b_offset = 0; + blas_param.c_offset = 0; +#endif + + // Swap A and B if in row order + if (blas_param.data_order == DSA_BLAS_DATAORDER_ROW) { + std::swap(blas_param.m, blas_param.n); + std::swap(blas_param.lda, blas_param.ldb); + std::swap(blas_param.trans_a, blas_param.trans_b); + std::swap(blas_param.a_offset, blas_param.b_offset); + std::swap(blas_param.a_stride, blas_param.b_stride); + //std::swap(A_data, B_data); + //std::swap(A_data, B_data); + } + + printDsaXBLASParam(blas_param); + + // DMH: fix me + blas_param.blas_lib = DSA_BLAS_LIB_CUBLAS; + // Perform GEMM accoring to back end configuration dsaXHgemmStridedBatched(d->d_r, d->d_i, d->d_r, d->d_i, d->d_outr, d->d_outi, blas_param); + + //for(int i=0; i<8; i++) inspectPackedData(d.h_input[i], i); // reorder output data reorderOutput(d); diff --git a/src/dsaX_cublas_interface.cu b/src/dsaX_cublas_interface.cu index 17a2c9b..597cfbd 100644 --- a/src/dsaX_cublas_interface.cu +++ b/src/dsaX_cublas_interface.cu @@ -15,38 +15,13 @@ void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void cudaStream_t stream = NULL; cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); cublasCreate(&cublasH); - cublasSetStream(cublasH, stream); + cublasSetStream(cublasH, stream); - // Transfer params - cublasOperation_t transa; - cublasOperation_t transb; - switch (blas_param.trans_a) { - case DSA_BLAS_OP_N: - transa = CUBLAS_OP_N; break; - case DSA_BLAS_OP_T: - transa = CUBLAS_OP_T; break; - case DSA_BLAS_OP_C: - transa = CUBLAS_OP_C; break; - default: - std::cout << "Unknown cublas transpose" << std::endl; - } - - switch (blas_param.trans_b) { - case DSA_BLAS_OP_N: - transb = CUBLAS_OP_N; break; - 
case DSA_BLAS_OP_T: - transb = CUBLAS_OP_T; break; - case DSA_BLAS_OP_C: - transb = CUBLAS_OP_C; break; - default: - std::cout << "Unknown cublas transpose" << std::endl; - } - + // Transfer params const int m = blas_param.m; const int n = blas_param.n; const int k = blas_param.k; - const half alpha = blas_param.alpha.real(); - const half malpha = (-1.0 * blas_param.alpha.real()); + const double alpha = blas_param.alpha.real(); const int lda = blas_param.lda; const int ldb = blas_param.ldb; const half beta0 = blas_param.beta.real(); @@ -59,7 +34,70 @@ void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void const long long int strideB = blas_param.b_stride; const long long int strideC = blas_param.c_stride; const int batchCount = blas_param.batch_count; - + + // NOTE: cublasHgemm is a real valued kernel. As a result, + // matrix conjugates must be handled by passing negative + // alpha values on the appropriate imaginary planar + // arrays. We discern these negative values while parsing + // transpose, adjoint and conjugation values. + cublasOperation_t transa; + cublasOperation_t transb; + int A_imag_alpha_sign = 1.0; + switch (blas_param.trans_a) { + case DSA_BLAS_OP_N: + transa = CUBLAS_OP_N; + break; + case DSA_BLAS_OP_T: + transa = CUBLAS_OP_T; + break; + case DSA_BLAS_OP_A: + transa = CUBLAS_OP_N; + // A array requests adjoint, hence we + // must apply supply a factor of -1 to alpha + // when dealing with the imaginary component + // of A. + A_imag_alpha_sign *= -1; + break; + case DSA_BLAS_OP_C: + transa = CUBLAS_OP_T; + // A array requests conjugation, hence we + // must apply supply a factor of -1 to alpha + // when dealing with the imaginary component + // of A. 
+ A_imag_alpha_sign *= -1; + break; + default: + std::cout << "Unknown cublas transpose" << std::endl; + } + + int B_imag_alpha_sign = alpha; + switch (blas_param.trans_b) { + case DSA_BLAS_OP_N: + transb = CUBLAS_OP_N; + break; + case DSA_BLAS_OP_T: + transb = CUBLAS_OP_T; + break; + case DSA_BLAS_OP_A: + transb = CUBLAS_OP_N; + // B array requests adjoint, hence we + // must apply supply a factor of -1 to alpha + // when dealing with the imaginary component + // of B. + B_imag_alpha_sign *= -1; + break; + case DSA_BLAS_OP_C: + transb = CUBLAS_OP_T; + // A array requests conjugation, hence we + // must apply supply a factor of -1 to alpha + // when dealing with the imaginary component + // of A. + B_imag_alpha_sign *= -1; + break; + default: + std::cout << "Unknown dsaBLAS transpose" << std::endl; + } + // Run strided batched gemm for datatype // (a + ib)(c + id) = (ac - bd) + i(bc + ad) // on matrices alpha * op(A) * op(B) + beta * C @@ -68,25 +106,29 @@ void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void // Accumulate results into C matrix // ac - cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &alpha, + half alpha_ac = alpha; + cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &(alpha_ac), (half *)real_a + a_offset, lda, strideA, (half *)real_b + b_offset, ldb, strideB, &beta0, (half *)real_c + c_offset, ldc, strideC, batchCount); - // -bd - cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &malpha, + // -bd (minus sign from i*i) + half alpha_bd = alpha * (-1.0 * A_imag_alpha_sign * B_imag_alpha_sign); + cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &(alpha_bd), (half*)imag_a + a_offset, lda, strideA, (half*)imag_b + b_offset, ldb, strideB, &beta1, (half*)real_c + c_offset, ldc, strideC, batchCount); // bc - cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &alpha, + half alpha_bc = alpha * A_imag_alpha_sign; + cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &(alpha_bc), 
(half*)imag_a + a_offset, lda, strideA, (half*)real_b + b_offset, ldb, strideB, &beta0, (half*)imag_c + c_offset, ldc, strideC, batchCount); // ad - cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &alpha, + half alpha_ad = alpha * B_imag_alpha_sign; + cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &(alpha_ad), (half*)real_a + a_offset, lda, strideA, (half*)imag_b + b_offset, ldb, strideB, &beta1, (half*)imag_c + c_offset, ldc, strideC, diff --git a/src/dsaX_cuda_interface.cu b/src/dsaX_cuda_interface.cu index 8eda8ae..1782752 100644 --- a/src/dsaX_cuda_interface.cu +++ b/src/dsaX_cuda_interface.cu @@ -7,6 +7,10 @@ using namespace std; +void dsaXInitCuda(int dev){ + cudaSetDevice(dev); +} + // allocate device memory void initializeCudaMemory(dmem *d, int bf) { @@ -93,10 +97,11 @@ void deallocateCudaMemory(dmem *d, int bf) { void reorderOutputCuda(dmem * d) { // transpose input data +#if defined (OLD_BLAS) dim3 dimBlock(32, 8), dimGrid((NANTS*NANTS)/32, (NCHAN_PER_PACKET*2*2*halfFac)/32); transpose_matrix<<>>((half*)d->d_outr, (half*)d->d_tx_outr); transpose_matrix<<>>((half*)d->d_outi, (half*)d->d_tx_outi); - +#endif // look at output /*char * odata = (char *)malloc(sizeof(char)*384*4*NANTS*NANTS*2*halfFac); cudaMemcpy(odata,d->d_tx_outr,384*4*NANTS*NANTS*2*halfFac,cudaMemcpyDeviceToHost); @@ -144,10 +149,17 @@ void reorderOutputCuda(dmem * d) { ii++; } } - cudaMemcpy(d_idxs,h_idxs,sizeof(int)*NBASE,cudaMemcpyHostToDevice); + cudaMemcpy(d_idxs, h_idxs, sizeof(int)*NBASE,cudaMemcpyHostToDevice); // run kernel to finish things - corr_output_copy<<>>((half*)d->d_tx_outr, (half*)d->d_tx_outi, d->d_output, d_idxs); + // TUNABLE + int blockDim = 128; + int blocks = NCHAN_PER_PACKET*2*NBASE/blockDim; +#if defined (OLD_BLAS) + corr_output_copy<<>>((half*)d->d_tx_outr, (half*)d->d_tx_outi, d->d_output, d_idxs); +#else + corr_output_copy<<>>((half*)d->d_outr, (half*)d->d_outi, d->d_output, d_idxs); +#endif /*char * odata = (char 
*)malloc(sizeof(char)*384*4*NBASE*4); cudaMemcpy(odata,d->d_output,384*4*NBASE*4,cudaMemcpyDeviceToHost); @@ -172,9 +184,17 @@ void reorderOutputCuda(dmem * d) { void reorderInputCuda(dmem *d) { // transpose input data +#if defined (OLD_BLAS) dim3 dimBlock(32, 8), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32); - transpose_matrix<<>>(d->d_input, d->d_tx); - corr_input_copy<<>>(d->d_tx, (half*)d->d_r, (half*)d->d_i); + + // TUNABLE + int blockDim = 128; + int blocks = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/blockDim; + transpose_matrix_char<<>>(d->d_input, d->d_tx); + corr_input_copy<<>>(d->d_tx, (half*)d->d_r, (half*)d->d_i); +#else + corr_input_copy<<>>(d->d_input, (half*)d->d_r, (half*)d->d_i); +#endif } diff --git a/src/dsaX_interface.cpp b/src/dsaX_interface.cpp index c0c461c..f17a6d8 100644 --- a/src/dsaX_interface.cpp +++ b/src/dsaX_interface.cpp @@ -3,11 +3,56 @@ #include #include "dsaX_cuda_interface.h" +#include "dsaX_utils.h" #include "dsaX_ftd.h" using namespace std; +void printDsaXBLASParam(const dsaXBLASParam param) { + + cout << "struct_size = " << param.struct_size << endl; + cout << "blas_type = " << param.blas_type << endl; + cout << "blas_lib = " << param.blas_lib << endl; + cout << "data_order = " << param.data_order << endl; + cout << "trans_a = " << param.trans_a << endl; + cout << "trans_b = " << param.trans_b << endl; + cout << "m = " << param.m << endl; + cout << "n = " << param.n << endl; + cout << "k = " << param.k << endl; + cout << "lda = " << param.lda << endl; + cout << "ldb = " << param.ldb << endl; + cout << "ldc = " << param.ldc << endl; + cout << "a_offset = " << param.a_offset << endl; + cout << "b_offset = " << param.b_offset << endl; + cout << "c_offset = " << param.c_offset << endl; + cout << "a_stride = " << param.a_stride << endl; + cout << "b_stride = " << param.b_stride << endl; + cout << "c_stride = " << param.c_stride << endl; + cout << "alpha = " << param.alpha << endl; + cout << "bets = " << 
param.alpha << endl; + cout << "batch_count = " << param.batch_count << endl; +} + +void dsaXInit(int dev){ +#if DSA_XENGINE_TARGET_CUDA + dsaXInitCuda(dev); +#endif +} + +void inspectPackedData(char input, int i, bool non_zeros) { + float re = (float)((char)(( (unsigned char)(input) & (unsigned char)(15) ) << 4) >> 4); + float im = (float)((char)(( (unsigned char)(input) & (unsigned char)(240))) >> 4); + + if(non_zeros) { + if(re != 0 || im != 0) + std::cout << "val["< -void usage() { - fprintf (stdout, - "dsaX_beamformer_correlator [options]\n" - " -c core bind process to CPU core [no default]\n" - " -d send debug messages to syslog\n" - " -i in_key [default REORDER_BLOCK_KEY]\n" - " -o out_key [default XGPU_BLOCK_KEY]\n" - " -b run beamformer [default is to run correlator]\n" - " -h print usage\n" - " -t binary file for test mode\n" - " -f flagants file\n" - " -a calib file\n" - " -s start frequency (assumes -0.244140625MHz BW)\n"); -} +// General +int core = 0; +bool debug = false; + +// Data block HDU keys +key_t in_key = REORDER_BLOCK_KEY; +key_t out_key = XGPU_BLOCK_KEY; + +// Test mode +bool run_beamformer = false; +bool run_correlator = false; +double start_frequency = 1498.75; + +// Test file +std::string test_filename; +int n_channels = 384; +int n_antennae = 63; +int n_pol = 2; +int n_times = 30720; + +std::shared_ptr make_app(std::string app_description, std::string app_name) { + auto dsaX_app = std::make_shared(app_description, app_name); + dsaX_app->option_defaults()->always_capture_default(); + + dsaX_app->add_option("--core", core, "Bind process to this CPU core [default 0]"); + dsaX_app->add_option("--debug", debug, "Send debug messages to syslog"); + dsaX_app->add_option("--in-key", in_key, "[default REORDER_BLOCK_KEY]"); + dsaX_app->add_option("--out-key", out_key, "[default XGPU_BLOCK_KEY]"); + dsaX_app->add_option("--run-beamformer", run_beamformer, "Run the beamformer [default false]"); + dsaX_app->add_option("--run-correlator", 
run_correlator, "Run the correlator [default false]"); + dsaX_app->add_option("--start-frequency", start_frequency, "start frequency (assumes 1498.75)"); + + // Input file options + dsaX_app->add_option("--test-filename", test_filename, "Name of file on which to run tests"); + dsaX_app->add_option("--n-channels", n_channels, "Number of frequency channels [default 384]"); + dsaX_app->add_option("--n-antennae", n_antennae, "Number of antennae [default 63]"); + dsaX_app->add_option("--n-pol", n_pol, "Number of polarizations [default 2]"); + dsaX_app->add_option("--n-times", n_times, "Number of times [default 30720]"); + + return dsaX_app; +} diff --git a/tests/dsaX_correlator_test.cpp b/tests/dsaX_correlator_test.cpp index b705975..966c269 100644 --- a/tests/dsaX_correlator_test.cpp +++ b/tests/dsaX_correlator_test.cpp @@ -7,140 +7,94 @@ #include #include +// Include this file to access input parameters +#include "command_line_params.h" + // Include the dsaX.h header in your application #include using namespace std; -void usage() { - fprintf (stdout, - "dsaX_beamformer_correlator [options]\n" - " -c if dsaX is CUDA enabled, use this GPU" - " -d send debug messages to syslog\n" - " -i in_key [default REORDER_BLOCK_KEY]\n" - " -o out_key [default XGPU_BLOCK_KEY]\n" - " -h print usage\n" - " -t binary file for test mode\n" - " -f flagants file\n" - " -a calib file\n" - " -s start frequency (assumes -0.244140625MHz BW)\n"); -} +// The class offers entire file content read/write in single operation +class BinaryFileVector : public std::vector +{ +public: -void inspectPackedData(char input) { - - std::cout << "vals = (" << (float)((char)(( (unsigned char)(input) & (unsigned char)(15) ) << 4) >> 4) << ","; - - std::cout << (float)((char)(( (unsigned char)(input) & (unsigned char)(240) )) >> 4) << ")" << std::endl; -} + using std::vector::vector; + + bool loadFromFile(const char *fileName) noexcept + { + // Try to open a file specified by its name + std::ifstream 
file(fileName, std::ios::in | std::ios::binary); + if (!file.is_open() || file.bad()) + return false; + + // Clear whitespace removal flag + file.unsetf(std::ios::skipws); + + // Determine size of the file + file.seekg(0, std::ios_base::end); + size_t fileSize = file.tellg(); + file.seekg(0, std::ios_base::beg); + + // Discard previous vector content + resize(0); + reserve(0); + shrink_to_fit(); + + // Order to prealocate memory to avoid unnecessary reallocations due to vector growth + reserve(fileSize); + + // Read entire file content into prealocated vector memory + insert(begin(), + std::istream_iterator(file), + std::istream_iterator()); + + // Make sure entire content is loaded + if(size() == fileSize) { + std::cout << "Successfully read file of size " << fileSize << std::endl; + return true; + } else { + std::cout << "Unexpected file size." << std::endl; + return false; + } + } + + bool saveToFile(const char *fileName) const noexcept + { + // Write entire vector content into a file specified by its name + std::ofstream file(fileName, std::ios::out | std::ios::binary); + try { + file.write((const char *) data(), size()); + } + catch (...) { + return false; + } + + // Determine number of bytes successfully stored in file + size_t fileSize = file.tellp(); + if(size() == fileSize) { + std::cout << "Successfully wrote file of size " << fileSize << std::endl; + return true; + } else { + std::cout << "Unexpected file size." 
<< std::endl; + return false; + } + } +}; int main(int argc, char **argv) { - // data block HDU keys - key_t in_key = REORDER_BLOCK_KEY; - key_t out_key = XGPU_BLOCK_KEY; + // Parse command line + auto app = make_app(); + try { + app->parse(argc, argv); + } catch (const CLI::ParseError &e) { + return app->exit(e); + } // command line arguments int device_ordinal = 0; - int arg = 0; - int bf = 0; - char ftest[200], fflagants[200], fcalib[200]; - float sfreq = 1498.75; - - while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:bdh")) != -1) { - switch (arg) { - case 'c': - if (optarg) { - device_ordinal = atoi(optarg); - break; - } - else { - syslog(LOG_ERR,"-c flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'i': - if (optarg) { - if (sscanf (optarg, "%x", &in_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } else { - syslog(LOG_ERR,"-i flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'o': - if (optarg) { - if (sscanf (optarg, "%x", &out_key) != 1) { - syslog(LOG_ERR, "could not parse key from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } else { - syslog(LOG_ERR,"-o flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 't': - if (optarg) { - syslog(LOG_INFO, "test mode"); - if (sscanf (optarg, "%s", &ftest) != 1) { - syslog(LOG_ERR, "could not read test file name from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } else { - syslog(LOG_ERR,"-t flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'a': - if (optarg) { - syslog(LOG_INFO, "read calib file %s",optarg); - if (sscanf (optarg, "%s", &fcalib) != 1) { - syslog(LOG_ERR, "could not read calib file name from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } - else { - syslog(LOG_ERR,"-a flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'f': - if (optarg) { - syslog(LOG_INFO, "reading flag ants file %s",optarg); - if (sscanf (optarg, "%s", 
&fflagants) != 1) { - syslog(LOG_ERR, "could not read flagants file name from %s\n", optarg); - return EXIT_FAILURE; - } - break; - } else { - syslog(LOG_ERR,"-f flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 's': - if (optarg) { - sfreq = atof(optarg); - syslog(LOG_INFO, "start freq %g",sfreq); - break; - } - else { - syslog(LOG_ERR,"-s flag requires argument"); - usage(); - return EXIT_FAILURE; - } - case 'd': - syslog (LOG_DEBUG, "Will excrete all debug messages"); - break; - case 'h': - usage(); - return EXIT_SUCCESS; - } - } std::cout << "NPACKETS_PER_BLOCK = " << NPACKETS_PER_BLOCK << std::endl; std::cout << "NCHAN = " << NCHAN << std::endl; @@ -159,35 +113,104 @@ int main(int argc, char **argv) { //dsaX_init(); FILE *fin, *fout; - std::cout << "Creating float output_array of size " << sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2*4 << std::endl; - uint64_t output_size = sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2*4; - std::cout << "Creating char input_array of size " << sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2 << std::endl; - uint64_t input_size = sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2; + uint64_t sz, output_size, in_block_size, rd_size; + in_block_size = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2; + char * output_data, * o1; + int nreps = 1, nchunks = 1; - float *output_data = (float *)malloc(output_size); - char *input_data = (char *)malloc(input_size); + // read one block of input data + // get size of file + std::cout << "attempting to read file " << test_filename.c_str() << std::endl; + fin=fopen(test_filename.c_str(), "rb"); + fseek(fin, 0L, SEEK_END); + sz = ftell(fin); + rewind(fin); + + // figure out how many reps and chunks to read with + if (sz > in_block_size) { + nreps = (int)(sz/in_block_size); + rd_size = in_block_size; + } + else { + nchunks = (int)(in_block_size/sz); + rd_size = sz; + } + + std::cout << "Creating char input_array of size " << sizeof(char)*in_block_size << std::endl; + 
char *input_data = (char *)malloc(in_block_size); + + // Loop over reps and chunks + for (int reps = 0; reps0) rewind(fin); + fread(input_data + chunks*rd_size, rd_size, 1, fin); + + std::cout << "Input peek " << std::endl; + //for (int i=0; i<8; i++) inspectPackedData(input_data[i], i); + + std::cout << "Creating char output_array of size " << sizeof(char)*NBASE*NCHAN_PER_PACKET*2*2*4 << std::endl; + output_size = NBASE*NCHAN_PER_PACKET*2*2*4; + output_data = (char *)malloc(output_size); + + // run correlator and record output data + syslog(LOG_INFO,"run correlator"); + dsaXCorrelator((void*)output_data, (void*)input_data); + + std::cout << "Output peek " << std::endl; + for(int i=0; i Date: Thu, 27 Jun 2024 21:56:35 -0700 Subject: [PATCH 24/30] split dmem into dmem_corr and dmem_bf, add metrics structure, abstract some kernel indexing in preparation for tuning functionality --- include/dsaX.h | 52 ++++++++++-- include/dsaX_cuda_interface.h | 14 ++-- include/dsaX_cuda_kernels.h | 35 +++++--- include/dsaX_ftd.h | 2 +- include/dsaX_interface.h | 10 ++- src/dsaX_beamformer.cpp | 2 +- src/dsaX_correlator.cpp | 6 +- src/dsaX_cuda_interface.cu | 147 +++++++++++++++++----------------- src/dsaX_interface.cpp | 16 ++-- 9 files changed, 173 insertions(+), 111 deletions(-) diff --git a/include/dsaX.h b/include/dsaX.h index 6083bb2..96a645f 100644 --- a/include/dsaX.h +++ b/include/dsaX.h @@ -40,6 +40,16 @@ typedef struct dsaXBLASParam_s { } dsaXBLASParam; +// Structure that carries BLAS parameters +typedef struct dsaXCorrParam_s { + size_t struct_size; /**< Size of this struct in bytes. 
Used to ensure that the host application and DSA see the same struct*/ + + dsaXBLASLib blas_lib; /**< Which BLAS library to use for BLAS ops */ + dsaXBLASDataType data_type; /**< Specifies if using S(C) or D(Z) BLAS type */ + dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */ + +} dsaXCorrParam; + void printDsaXBLASParam(const dsaXBLASParam param); // required to prevent overflow in corr matrix multiply @@ -48,8 +58,32 @@ void printDsaXBLASParam(const dsaXBLASParam param); // beam sep #define sep 1.0 // arcmin -// define structure that carries around device memory pointers -typedef struct dmem { +// Global timing and metrics structure for dsaX +typedef struct metrics_s { + + // Mem copy times + double mem_copy_time_H2H; + double mem_copy_time_H2D; + double mem_copy_time_D2H; + double mem_copy_time_D2D; + + // Mem copy size + double mem_copy_size_H2H; + double mem_copy_size_H2D; + double mem_copy_size_D2H; + double mem_copy_size_D2D; + + // Compute + double compute_time; + double compute_flops; + + // Initialisation + double initialisation_time; +} metrics; + +// define structure that carries around memory pointers +// and timer for the correlator +typedef struct dmem_corr_s { // initial data and streams char *h_input; // host input pointer @@ -63,7 +97,13 @@ typedef struct dmem { // giant output array: [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex] float *d_output; +} dmem_corr; + +typedef struct dmem_bf_s { + // beamformer pointers + char *h_input; // host input pointer + char *d_input, *d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] char *d_big_input; void *d_br, *d_bi; //half void *weights_r, *weights_i; //weights: [arm, tactp, b] //half @@ -78,7 +118,9 @@ typedef struct dmem { // timing float cp, prep, cubl, outp; -} dmem; +} dmem_bf; + + void dsaXInit(int device_ordinal = 0); @@ -86,5 +128,5 @@ void inspectPackedData(char input, int i, bool non_zero = false); void dsaXCorrelator(void *output_data, 
void *input_data); -void reorderOutput(dmem *d); -void reorderInput(dmem *d); +void reorderCorrelatorOutput(dmem_corr *d); +void reorderCorrelatorInput(dmem_corr *d); diff --git a/include/dsaX_cuda_interface.h b/include/dsaX_cuda_interface.h index d9f2278..54e2609 100644 --- a/include/dsaX_cuda_interface.h +++ b/include/dsaX_cuda_interface.h @@ -8,9 +8,13 @@ void dsaXInitCuda(int dev); -void initializeCudaMemory(dmem *d, int bf); +void initializeCorrCudaMemory(dmem_corr *d); -void deallocateCudaMemory(dmem *d, int bf); +void initializeBFCudaMemory(dmem_bf *d); + +void deallocateCorrCudaMemory(dmem_corr *d); + +void deallocateBFCudaMemory(dmem_bf *d); void dsaXmemsetCuda(void *array, int ch, size_t n); @@ -18,11 +22,11 @@ void dsaXmemcpyCuda(void *array_device, void *array_host, size_t n, dsaXMemcpyKi void dsaXDeviceSynchronizeCuda(); -void reorderOutputCuda(dmem *d); +void reorderCorrOutputCuda(dmem_corr *d); -void calcWeightsCuda(dmem *d); +void reorderCorrInputCuda(dmem_corr *d); -void reorderInputCuda(dmem *d); +void calcWeightsCuda(dmem_bf *d); template void transposeMatrixCuda(in_prec *idata, out_prec *odata); diff --git a/include/dsaX_cuda_kernels.h b/include/dsaX_cuda_kernels.h index 7fef077..0c2cb7c 100644 --- a/include/dsaX_cuda_kernels.h +++ b/include/dsaX_cuda_kernels.h @@ -97,6 +97,7 @@ template __global__ void transpose_matrix( } +// DMH: TUNABLE // transpose kernel // assume breakdown into tiles of 32x32, and run with 32x8 threads per block // launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32) @@ -104,32 +105,40 @@ template __global__ void transpose_matrix( __global__ void transpose_matrix_char(char * idata, char * odata) { __shared__ char tile[32][33]; + //extern __shared__ char tile[]; - int x = blockIdx.x * 32 + threadIdx.x; - int y = blockIdx.y * 32 + threadIdx.y; - int width = gridDim.x * 32; + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.x + threadIdx.y; + int width = gridDim.x * blockDim.x; 
- for (int j = 0; j < 32; j += 8) { + for (int j = 0; j < blockDim.x; j += blockDim.y) { tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; + //tile[(threadIdx.y+j)*blockDim.x + threadIdx.x] = idata[(y+j)*width + x]; //inspectPackedDataInKernel(idata[(y+j)*width + x], (y+j)*width + x); } __syncthreads(); - x = blockIdx.y * 32 + threadIdx.x; // transpose block offset - y = blockIdx.x * 32 + threadIdx.y; - width = gridDim.y * 32; + x = blockIdx.y * blockDim.x + threadIdx.x; // transpose block offset + y = blockIdx.x * blockDim.x + threadIdx.y; + width = gridDim.y * blockDim.x; - for (int j = 0; j < 32; j += 8) { - odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; + for (int j = 0; j < blockDim.x; j += blockDim.y) { + odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; + //odata[(y+j)*width + x] = tile[threadIdx.x + blockDim.x*(threadIdx.y + j)]; } } -// kernel to fluff input -// run with 128 threads and NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 blocks -__global__ void corr_input_copy(char *input, half *inr, half *ini) { - +/** + * Promote complex char riri... data to planar half rr.. ii.. 
+ * + * @param[out] inr Half precision real array + * @param[out] ini Half precision imag array + * @param[in] input Char precision complex array + */ +__global__ void promoteComplexCharToPlanarHalf(char *input, half *inr, half *ini) { + int bidx = blockIdx.x; int tidx = threadIdx.x; int iidx = blockDim.x * bidx + tidx; diff --git a/include/dsaX_ftd.h b/include/dsaX_ftd.h index f7363f1..47b562e 100644 --- a/include/dsaX_ftd.h +++ b/include/dsaX_ftd.h @@ -2,4 +2,4 @@ #include "dsaX.h" -void dcorrelator(dmem *d); +void dcorrelator(dmem_corr *d); diff --git a/include/dsaX_interface.h b/include/dsaX_interface.h index 06a2364..18ed9f0 100644 --- a/include/dsaX_interface.h +++ b/include/dsaX_interface.h @@ -4,9 +4,15 @@ // DMH: decorate these with Doxygen void dsaXCorrelator(void *input_data, void *output_data); -void reorderInput(dmem *d); -void reorderOutput(dmem *d); + +void reorderCorrInput(dmem_corr *d); + +void reorderCorrOutput(dmem_corr *d); + void transposeInputBeamformer(double *input, double *output, std::vector &dimBlock, std::vector &dimGrid); + void transposeScaleBeamformer(void *array_real, void *array_imag, unsigned char *output, std::vector &dimBlock, std::vector &dimGrid); + void fluffInputBeamformer(char *input, void *array_real, void *array_imag, int blocks, int tpb); + void sumBeam(unsigned char *input, float *output, int blocks, int tpb); diff --git a/src/dsaX_beamformer.cpp b/src/dsaX_beamformer.cpp index f82f677..61fbc5d 100644 --- a/src/dsaX_beamformer.cpp +++ b/src/dsaX_beamformer.cpp @@ -29,7 +29,7 @@ using namespace std; */ // beamformer function -void dbeamformer(dmem *d) { +void dbeamformer(dmem_bf *d) { dsaXBLASParam blas_param; blas_param.trans_a = DSA_BLAS_OP_T; diff --git a/src/dsaX_correlator.cpp b/src/dsaX_correlator.cpp index 7a6882c..2d179d1 100644 --- a/src/dsaX_correlator.cpp +++ b/src/dsaX_correlator.cpp @@ -17,7 +17,7 @@ Workflow is similar for BF and corr applications // correlator function // workflow: copy to device, reorder, 
stridedBatchedGemm, reorder // DMH CUDA references excised. -void dcorrelator(dmem *d) { +void dcorrelator(dmem_corr *d) { // zero out output arrays dsaXmemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short @@ -28,7 +28,7 @@ void dcorrelator(dmem *d) { dsaXmemcpy(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2, dsaXMemcpyHostToDevice); // reorder input into real and imaginary arrays of 2 byte data - reorderInput(d); + reorderCorrInput(d); dsaXBLASParam blas_param; blas_param.struct_size = sizeof(blas_param); @@ -105,5 +105,5 @@ void dcorrelator(dmem *d) { //for(int i=0; i<8; i++) inspectPackedData(d.h_input[i], i); // reorder output data - reorderOutput(d); + reorderCorrOutput(d); } diff --git a/src/dsaX_cuda_interface.cu b/src/dsaX_cuda_interface.cu index 1782752..0046b1e 100644 --- a/src/dsaX_cuda_interface.cu +++ b/src/dsaX_cuda_interface.cu @@ -12,89 +12,90 @@ void dsaXInitCuda(int dev){ } // allocate device memory -void initializeCudaMemory(dmem *d, int bf) { +void initializeCorrCudaMemory(dmem_corr *d) { // for correlator - if (bf==0) { - cudaMalloc((void **)(&d->d_input), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); - cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2); - cudaMalloc((void **)(&d->d_i), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2); - cudaMalloc((void **)(&d->d_tx), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); - cudaMalloc((void **)(&d->d_output), sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2); - cudaMalloc((void **)(&d->d_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); - cudaMalloc((void **)(&d->d_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); - cudaMalloc((void **)(&d->d_tx_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); - cudaMalloc((void **)(&d->d_tx_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); - } + cudaMalloc((void **)(&d->d_input), 
sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); + cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2); + cudaMalloc((void **)(&d->d_i), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2); + cudaMalloc((void **)(&d->d_tx), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); + cudaMalloc((void **)(&d->d_output), sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2); + cudaMalloc((void **)(&d->d_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); + cudaMalloc((void **)(&d->d_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); + cudaMalloc((void **)(&d->d_tx_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); + cudaMalloc((void **)(&d->d_tx_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); +} +void initializeBFCudaMemory(dmem_bf *d) { + // for beamformer - if (bf==1) { - cudaMalloc((void **)(&d->d_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2); - cudaMalloc((void **)(&d->d_big_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS)*NCHAN_PER_PACKET*2*2); - cudaMalloc((void **)(&d->d_tx), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2); - cudaMalloc((void **)(&d->d_br), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2); - cudaMalloc((void **)(&d->d_bi), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2); - cudaMalloc((void **)(&d->weights_r), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); - cudaMalloc((void **)(&d->weights_i), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); - cudaMalloc((void **)(&d->d_bigbeam_r), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)); - cudaMalloc((void **)(&d->d_bigbeam_i), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)); - cudaMalloc((void **)(&d->d_bigpower), sizeof(unsigned char)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS)); - cudaMalloc((void **)(&d->d_scf), 
sizeof(float)*(NBEAMS/2)); // beam scale factor - cudaMalloc((void **)(&d->d_chscf), sizeof(float)*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); // beam scale factor - - // input weights: first is [NANTS, E/N], then [NANTS, 48, 2pol, R/I] - d->h_winp = (float *)malloc(sizeof(float)*(NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2)); - d->flagants = (int *)malloc(sizeof(int)*NANTS); - d->h_freqs = (float *)malloc(sizeof(float)*(NCHAN_PER_PACKET/8)); - cudaMalloc((void **)(&d->d_freqs), sizeof(float)*(NCHAN_PER_PACKET/8)); - - // timers - d->cp = 0.; - d->prep = 0.; - d->outp = 0.; - d->cubl = 0.; - - } + cudaMalloc((void **)(&d->d_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2); + cudaMalloc((void **)(&d->d_big_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS)*NCHAN_PER_PACKET*2*2); + cudaMalloc((void **)(&d->d_tx), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2); + cudaMalloc((void **)(&d->d_br), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2); + cudaMalloc((void **)(&d->d_bi), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2); + cudaMalloc((void **)(&d->weights_r), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); + cudaMalloc((void **)(&d->weights_i), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); + cudaMalloc((void **)(&d->d_bigbeam_r), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)); + cudaMalloc((void **)(&d->d_bigbeam_i), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)); + cudaMalloc((void **)(&d->d_bigpower), sizeof(unsigned char)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS)); + cudaMalloc((void **)(&d->d_scf), sizeof(float)*(NBEAMS/2)); // beam scale factor + cudaMalloc((void **)(&d->d_chscf), sizeof(float)*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); // beam scale factor + + // input weights: first is [NANTS, E/N], then [NANTS, 48, 2pol, R/I] + d->h_winp = (float 
*)malloc(sizeof(float)*(NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2)); + d->flagants = (int *)malloc(sizeof(int)*NANTS); + d->h_freqs = (float *)malloc(sizeof(float)*(NCHAN_PER_PACKET/8)); + cudaMalloc((void **)(&d->d_freqs), sizeof(float)*(NCHAN_PER_PACKET/8)); + + // timers + d->cp = 0.; + d->prep = 0.; + d->outp = 0.; + d->cubl = 0.; } + // deallocate device memory -void deallocateCudaMemory(dmem *d, int bf) { +void deallocateCorrCudaMemory(dmem_corr *d) { cudaFree(d->d_input); - - if (bf==0) { - cudaFree(d->d_r); - cudaFree(d->d_i); - cudaFree(d->d_tx); - cudaFree(d->d_output); - cudaFree(d->d_outr); - cudaFree(d->d_outi); - cudaFree(d->d_tx_outr); - cudaFree(d->d_tx_outi); - } - if (bf==1) { - cudaFree(d->d_tx); - cudaFree(d->d_br); - cudaFree(d->d_bi); - cudaFree(d->weights_r); - cudaFree(d->weights_i); - cudaFree(d->d_bigbeam_r); - cudaFree(d->d_bigbeam_i); - cudaFree(d->d_bigpower); - cudaFree(d->d_scf); - cudaFree(d->d_chscf); - free(d->h_winp); - free(d->flagants); - cudaFree(d->d_freqs); - free(d->h_freqs); - } + cudaFree(d->d_r); + cudaFree(d->d_i); + cudaFree(d->d_tx); + cudaFree(d->d_output); + cudaFree(d->d_outr); + cudaFree(d->d_outi); + cudaFree(d->d_tx_outr); + cudaFree(d->d_tx_outi); } +// deallocate device memory +void deallocateBFCudaMemory(dmem_bf *d) { + + cudaFree(d->d_input); + cudaFree(d->d_tx); + cudaFree(d->d_br); + cudaFree(d->d_bi); + cudaFree(d->weights_r); + cudaFree(d->weights_i); + cudaFree(d->d_bigbeam_r); + cudaFree(d->d_bigbeam_i); + cudaFree(d->d_bigpower); + cudaFree(d->d_scf); + cudaFree(d->d_chscf); + free(d->h_winp); + free(d->flagants); + cudaFree(d->d_freqs); + free(d->h_freqs); +} + + // function to copy d_outr and d_outi to d_output // inputs are [NCHAN_PER_PACKET, 2 time, 2 pol, NANTS, NANTS] // the corr matrices are column major order // output needs to be [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex] // start with transpose to get [NANTS*NANTS, NCHAN_PER_PACKET*2*2], then sum into output using kernel -void 
reorderOutputCuda(dmem * d) { +void reorderCorrOutputCuda(dmem_corr * d) { // transpose input data #if defined (OLD_BLAS) @@ -181,19 +182,19 @@ void reorderOutputCuda(dmem * d) { // output is [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS] // starts by running transpose on [NPACKETS_PER_BLOCK * NANTS, NCHAN_PER_PACKET * 2 * 2] matrix in doubleComplex form. // then fluffs using simple kernel -void reorderInputCuda(dmem *d) { +void reorderCorrInputCuda(dmem_corr *d) { // transpose input data #if defined (OLD_BLAS) - dim3 dimBlock(32, 8), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32); + dim3 dimBlock(32, 32), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32); // TUNABLE int blockDim = 128; int blocks = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/blockDim; transpose_matrix_char<<>>(d->d_input, d->d_tx); - corr_input_copy<<>>(d->d_tx, (half*)d->d_r, (half*)d->d_i); + promoteComplexCharToPlanarHalf<<>>(d->d_tx, (half*)d->d_r, (half*)d->d_i); #else - corr_input_copy<<>>(d->d_input, (half*)d->d_r, (half*)d->d_i); + promoteComplexCharToPlanarHalf<<>>(d->d_input, (half*)d->d_r, (half*)d->d_i); #endif } @@ -222,7 +223,7 @@ void transposeInputBeamformerCuda(double *idata, double *odata, std::vector // sequential pairs of eastings and northings // then [NANTS, 48, R/I] calibs -void calcWeightsCuda(dmem *d) { +void calcWeightsCuda(dmem_bf *d) { // allocate float *antpos_e = (float *)malloc(sizeof(float)*NANTS); diff --git a/src/dsaX_interface.cpp b/src/dsaX_interface.cpp index f17a6d8..0c88ee0 100644 --- a/src/dsaX_interface.cpp +++ b/src/dsaX_interface.cpp @@ -53,30 +53,30 @@ void inspectPackedData(char input, int i, bool non_zeros) { void dsaXCorrelator(void *output_data, void *input_data) { - dmem d; - int bf = 0; + dmem_corr d; #if DSA_XENGINE_TARGET_CUDA - initializeCudaMemory(&d, bf); + initializeCorrCudaMemory(&d); d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); memcpy(d.h_input, 
(char*)input_data, sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); dcorrelator(&d); - dsaXmemcpy(output_data, d.d_output, NBASE*NCHAN_PER_PACKET*2*2*4, dsaXMemcpyDeviceToHost); + dsaXmemcpy(output_data, d.d_output, NBASE*NCHAN_PER_PACKET*2*2*4, dsaXMemcpyDeviceToHost); + deallocateCorrCudaMemory(&d); #else std::cout << "dsaX error: not implemented" << std::endl; #endif } -void reorderInput(dmem *d) { +void reorderCorrInput(dmem_corr *d) { #if DSA_XENGINE_TARGET_CUDA - reorderInputCuda(d); + reorderCorrInputCuda(d); #else std::cout << "dsaX error: not implemented" << std::endl; #endif } -void reorderOutput(dmem *d) { +void reorderCorrOutput(dmem_corr *d) { #if DSA_XENGINE_TARGET_CUDA - reorderOutputCuda(d); + reorderCorrOutputCuda(d); #else std::cout << "dsaX error: not implemented" << std::endl; #endif From a7ce185dfae584eeb30d9c1d33d58b3ad4692dd8 Mon Sep 17 00:00:00 2001 From: cpviolator Date: Fri, 28 Jun 2024 22:19:05 -0700 Subject: [PATCH 25/30] Created a Correlator class to allow for persistent memory, added missing CLI files, added timer dependency, cleaned up header tree, added enhanced parameter handling, cleaner test script --- CMakeLists.txt | 97 ++++++++++++++++++-------- include/dsaX.h | 123 +++------------------------------ include/dsaX_enums.h | 9 ++- include/dsaX_ftd.h | 76 +++++++++++++++++++- include/dsaX_interface.h | 1 + include/dsaX_params.h | 83 ++++++++++++++++++++++ include/dsaX_utils.h | 2 +- src/CMakeLists.txt | 23 +++--- src/dsaX_correlator.cpp | 112 ++++++++++++++++++++++++++++-- src/dsaX_cublas_interface.cu | 2 + src/dsaX_cuda_interface.cu | 8 +-- src/dsaX_interface.cpp | 43 +++++------- src/dsaX_magma_interface.cu | 2 + src/dsaX_params.cpp | 102 +++++++++++++++++++++++++++ src/dsaX_utils.cpp | 25 ++++++- tests/CMakeLists.txt | 2 +- tests/command_line_params.cpp | 6 +- tests/command_line_params.h | 35 ++++++++++ tests/dsaX_correlator_test.cpp | 85 ++++++++++++----------- 19 files changed, 595 insertions(+), 241 deletions(-) 
create mode 100644 include/dsaX_params.h create mode 100644 src/dsaX_params.cpp create mode 100644 tests/command_line_params.h diff --git a/CMakeLists.txt b/CMakeLists.txt index e3cf1b0..a5a2333 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,7 +48,6 @@ if(TARGET_TYPE_VALID LESS 0) message(SEND_ERROR "Please specify a valid DSA_XENGINE_TARGET_TYPE type! Valid target types are:" "${VALID_TARGET_TYPES}") endif() - # Git helpers #------------ find_package(Git) @@ -231,7 +230,7 @@ if(DSA_XENGINE_ENABLE_OPENBLAS) endif() # Get psrdada dependency -option(DSA_XENGINE_ENABLE_PSRDADA "Use PSRDada for correlatorss" ON) +option(DSA_XENGINE_ENABLE_PSRDADA "Use PSRDada for IO" ON) option(DSA_XENGINE_DOWNLOAD_PSRDADA "Download and build PSRDada" ON) if(DSA_XENGINE_DOWNLOAD_PSRDADA) # Download, build and install @@ -245,40 +244,80 @@ else() find_package(PSRDada REQUIRED) endif() +# Get HDF5 dependency +option(DSA_XENGINE_ENABLE_HDF5 "Use HDF5 for data IO" OFF) +if(DSA_XENGINE_ENABLE_HDF5) + option(DSA_XENGINE_DOWNLOAD_HDF5 "Download and build HDf5" OFF) + if(DSA_XENGINE_DOWNLOAD_HDF5) + # Download, build and install + FetchContent_Declare( + HDF5 + GIT_REPOSITORY https://github.com/HDFGroup/hdf5.git + GIT_TAG 5794814 + ) + FetchContent_MakeAvailable(HDF5) + else() + # Find and link to local install + find_package(HDF5 REQUIRED) + endif() +endif() + # Get CLI11 dependency # FIX ME: get static .hpp version and ship with package option(DSA_XENGINE_ENABLE_CLI11 "Enable CLI11 (required)" ON) -option(DSA_XENGINE_DOWNLOAD_CLI11 "Download and build CLI11" ON) -if(DSA_XENGINE_DOWNLOAD_CLI11) - # Download, build and install - FetchContent_Declare( - CLI11 - GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git - GIT_TAG main - ) - FetchContent_MakeAvailable(CLI11) -else() - # Find and link to local install - find_package(CLI11 REQUIRED) +if(DSA_XENGINE_ENABLE_CLI11) + option(DSA_XENGINE_DOWNLOAD_CLI11 "Download and build CLI11" ON) + if(DSA_XENGINE_DOWNLOAD_CLI11) + # Download, 
build and install + FetchContent_Declare( + CLI11 + GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git + GIT_TAG main + ) + FetchContent_MakeAvailable(CLI11) + else() + # Find and link to local install + find_package(CLI11 REQUIRED) + endif() endif() -# Get CLI11 dependency -# FIX ME: get static .hpp version and ship with package -option(DSA_XENGINE_ENABLE_GOOGLETEST "Enable GOOGLETEST (required)" ON) -option(DSA_XENGINE_DOWNLOAD_GOOGLETEST "Download and build GOOGLETEST" ON) -if(DSA_XENGINE_DOWNLOAD_GOOGLETEST) - # Download, build and install - FetchContent_Declare( - GOOGLETEST - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG main - ) - FetchContent_MakeAvailable(GOOGLETEST) -else() - # Find and link to local install - find_package(GOOGLETEST REQUIRED) + +# Get ZFP dependency +option(DSA_XENGINE_ENABLE_ZFP "Enable ZFP" OFF) +if(DSA_XENGINE_ENABLE_ZFP) + option(DSA_XENGINE_DOWNLOAD_ZFP "Download and build ZFP" OFF) + if(DSA_XENGINE_DOWNLOAD_ZFP) + # Download, build and install + FetchContent_Declare( + ZFP + GIT_REPOSITORY https://github.com/LLNL/zfp.git + GIT_TAG f40868a + ) + FetchContent_MakeAvailable(ZFP) + else() + # Find and link to local install + find_package(ZFP REQUIRED) + endif() endif() +# Get Timer dependency https://github.com/cpp-core/timer.git +# Get timer dependency +option(DSA_XENGINE_ENABLE_TIMER "Enable timer" ON) +if(DSA_XENGINE_ENABLE_TIMER) + option(DSA_XENGINE_DOWNLOAD_TIMER "Download and build timer" ON) + if(DSA_XENGINE_DOWNLOAD_TIMER) + # Download, build and install + FetchContent_Declare( + TIMER + GIT_REPOSITORY https://github.com/cpp-core/timer.git + GIT_TAG main + ) + FetchContent_MakeAvailable(TIMER) + else() + # Find and link to local install + find_package(TIMER REQUIRED) + endif() +endif() # Add src, include, tests, and legacy add_subdirectory(src) diff --git a/include/dsaX.h b/include/dsaX.h index 96a645f..cc3ff5c 100644 --- a/include/dsaX.h +++ b/include/dsaX.h @@ -1,132 +1,25 @@ #pragma once -#include - 
+// Expose the use to compile time definitions, +// enums, parameters, and classes #include "dsaX_def.h" #include "dsaX_enums.h" +#include "dsaX_params.h" +#include "dsaX_ftd.h" +// Use manual transpose route +// Uncomment to try new pure cuBLAS #define OLD_BLAS -// Structure that carries BLAS parameters -typedef struct dsaXBLASParam_s { - size_t struct_size; /**< Size of this struct in bytes. Used to ensure that the host application and DSA see the same struct*/ - - dsaXBLASType blas_type; /**< Type of BLAS computation to perfrom */ - - dsaXBLASLib blas_lib; /**< Which BLAS library to use for BLAS ops */ - - // GEMM params - dsaXBLASOperation trans_a; /**< operation op(A) that is non- or (conj.) transpose. */ - dsaXBLASOperation trans_b; /**< operation op(B) that is non- or (conj.) transpose. */ - int m; /**< number of rows of matrix op(A) and C. */ - int n; /**< number of columns of matrix op(B) and C. */ - int k; /**< number of columns of op(A) and rows of op(B). */ - int lda; /**< leading dimension of two-dimensional array used to store the matrix A. */ - int ldb; /**< leading dimension of two-dimensional array used to store matrix B. */ - int ldc; /**< leading dimension of two-dimensional array used to store matrix C. */ - long long int a_offset; /**< position of the A array from which begin read/write. */ - long long int b_offset; /**< position of the B array from which begin read/write. */ - long long int c_offset; /**< position of the C array from which begin read/write. */ - long long int a_stride; /**< stride of the A array in strided(batched) mode */ - long long int b_stride; /**< stride of the B array in strided(batched) mode */ - long long int c_stride; /**< stride of the C array in strided(batched) mode */ - std::complex alpha; /**< scalar used for multiplication. */ - std::complex beta; /**< scalar used for multiplication. If beta==0, C does not have to be a valid input. 
*/ - - // Common params - int batch_count; /**< number of pointers contained in arrayA, arrayB and arrayC. */ - dsaXBLASDataType data_type; /**< Specifies if using S(C) or D(Z) BLAS type */ - dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */ - -} dsaXBLASParam; - -// Structure that carries BLAS parameters -typedef struct dsaXCorrParam_s { - size_t struct_size; /**< Size of this struct in bytes. Used to ensure that the host application and DSA see the same struct*/ - - dsaXBLASLib blas_lib; /**< Which BLAS library to use for BLAS ops */ - dsaXBLASDataType data_type; /**< Specifies if using S(C) or D(Z) BLAS type */ - dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */ - -} dsaXCorrParam; - -void printDsaXBLASParam(const dsaXBLASParam param); - // required to prevent overflow in corr matrix multiply #define halfFac 4 // beam sep #define sep 1.0 // arcmin -// Global timing and metrics structure for dsaX -typedef struct metrics_s { - - // Mem copy times - double mem_copy_time_H2H; - double mem_copy_time_H2D; - double mem_copy_time_D2H; - double mem_copy_time_D2D; - - // Mem copy size - double mem_copy_size_H2H; - double mem_copy_size_H2D; - double mem_copy_size_D2H; - double mem_copy_size_D2D; - - // Compute - double compute_time; - double compute_flops; - - // Initialisation - double initialisation_time; -} metrics; - -// define structure that carries around memory pointers -// and timer for the correlator -typedef struct dmem_corr_s { - - // initial data and streams - char *h_input; // host input pointer - char *d_input, *d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] - - // correlator pointers - // giant array for r and i: [NCHAN_PER_PACKET, 2 pol, NANTS_PROCESS, NPACKETS_PER_BLOCK *2 times] - void *d_r, *d_i; //half - // arrays for matrix multiply output: input [NANTS_PROCESS, NANTS_PROCESS] - void *d_outr, *d_outi, *d_tx_outr, *d_tx_outi; //half - // giant output array: 
[NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex] - float *d_output; - -} dmem_corr; - -typedef struct dmem_bf_s { - - // beamformer pointers - char *h_input; // host input pointer - char *d_input, *d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] - char *d_big_input; - void *d_br, *d_bi; //half - void *weights_r, *weights_i; //weights: [arm, tactp, b] //half - void *d_bigbeam_r, *d_bigbeam_i; //output: [tc, b] //half - unsigned char *d_bigpower; //output: [b, tc] - float *d_scf; // scale factor per beam - float *d_chscf; - float *h_winp; - int *flagants, nflags; - float *h_freqs, *d_freqs; - - // timing - float cp, prep, cubl, outp; - -} dmem_bf; - - - void dsaXInit(int device_ordinal = 0); +void dsaXEnd(); void inspectPackedData(char input, int i, bool non_zero = false); -void dsaXCorrelator(void *output_data, void *input_data); - -void reorderCorrelatorOutput(dmem_corr *d); -void reorderCorrelatorInput(dmem_corr *d); +void dsaXCorrelator(void *output_data, void *input_data, dsaXCorrParam *param); diff --git a/include/dsaX_enums.h b/include/dsaX_enums.h index 4e8351f..9bffca0 100644 --- a/include/dsaX_enums.h +++ b/include/dsaX_enums.h @@ -27,17 +27,22 @@ typedef enum dsaXBLASLib_s { DSA_BLAS_LIB_MAGMA = 1, DSA_BLAS_LIB_CUTLASS = 2, DSA_BLAS_LIB_TCC = 3, - DSA_BLAS_LIB_OPENBLAS = 4, + DSA_BLAS_LIB_OPENBLAS = 4, + DSA_BLAS_LIB_NATIVE = 5, DSA_BLAS_LIB_INVALID = DSA_INVALID_ENUM } dsaXBLASLib; -typedef enum dsaXBLASDataLib_s { +typedef enum dsaXBLASDataType_s { DSA_BLAS_DATATYPE_H = 0, // Half DSA_BLAS_DATATYPE_S = 1, // Single DSA_BLAS_DATATYPE_D = 2, // Double DSA_BLAS_DATATYPE_HC = 3, // Complex(half) DSA_BLAS_DATATYPE_C = 4, // Complex(single) DSA_BLAS_DATATYPE_Z = 5, // Complex(double) + DSA_BLAS_DATATYPE_4b_REAL = 6, // 4b sized real + DSA_BLAS_DATATYPE_2b_REAL = 7, // 2b sized real + DSA_BLAS_DATATYPE_4b_COMPLEX = 8, // Char sized complex (4b,4b) + DSA_BLAS_DATATYPE_2b_COMPLEX = 9, // 4b sized (2b,2b) DSA_BLAS_DATATYPE_INVALID 
= DSA_INVALID_ENUM } dsaXBLASDataType; diff --git a/include/dsaX_ftd.h b/include/dsaX_ftd.h index 47b562e..9c35043 100644 --- a/include/dsaX_ftd.h +++ b/include/dsaX_ftd.h @@ -1,5 +1,79 @@ #pragma once -#include "dsaX.h" +//#include "dsaX_def.h" +#include "dsaX_enums.h" +#include "dsaX_params.h" + +// define structures that carry around memory pointers +// and metric. +// DMH: make a base and inherit into corr and bf +typedef struct dmem_corr_s { + + // initial data and streams + char *h_input; // host input pointer + char *d_input, *d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] + + // correlator pointers + // giant array for r and i: [NCHAN_PER_PACKET, 2 pol, NANTS_PROCESS, NPACKETS_PER_BLOCK *2 times] + void *d_r, *d_i; //half + // arrays for matrix multiply output: input [NANTS_PROCESS, NANTS_PROCESS] + void *d_outr, *d_outi, *d_tx_outr, *d_tx_outi; //half + // giant output array: [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex] + float *d_output; + + metrics metric_data; + +} dmem_corr; + +typedef struct dmem_bf_s { + + // beamformer pointers + char *h_input; // host input pointer + char *d_input, *d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] + char *d_big_input; + void *d_br, *d_bi; //half + void *weights_r, *weights_i; //weights: [arm, tactp, b] //half + void *d_bigbeam_r, *d_bigbeam_i; //output: [tc, b] //half + unsigned char *d_bigpower; //output: [b, tc] + float *d_scf; // scale factor per beam + float *d_chscf; + float *h_winp; + int *flagants, nflags; + float *h_freqs, *d_freqs; + + // timing (old) + float cp, prep, cubl, outp; + metrics metric_data; + +} dmem_bf; void dcorrelator(dmem_corr *d); + +class Correlator { + +private: +protected: + + dmem_corr d; + dsaXCorrParam corr_param; + dsaXBLASParam blas_param; + +public: + + // Constructor + // Initialise device memory if CUDA enabled + // make host memory if CPU + Correlator(const dsaXCorrParam *corr_param); + + // Compute the 
FX correlator on input, + // place result in output. + void compute(void *output, void *input); + + ~Correlator(); +}; + +void destroyDsaXCorrDeviceMemory(dmem_corr *d); +void initDsaXCorrDeviceMemory(dmem_corr *d); + +void reorderCorrelatorOutput(dmem_corr *d); +void reorderCorrelatorInput(dmem_corr *d); diff --git a/include/dsaX_interface.h b/include/dsaX_interface.h index 18ed9f0..a98215e 100644 --- a/include/dsaX_interface.h +++ b/include/dsaX_interface.h @@ -1,6 +1,7 @@ #pragma once #include +#include "dsaX.h" // DMH: decorate these with Doxygen void dsaXCorrelator(void *input_data, void *output_data); diff --git a/include/dsaX_params.h b/include/dsaX_params.h new file mode 100644 index 0000000..bf5f455 --- /dev/null +++ b/include/dsaX_params.h @@ -0,0 +1,83 @@ +#pragma once + +#include + +#include "dsaX_enums.h" + +// Structure that carries BLAS parameters +// This should be able to communicate to all +// backend choices of BLAS library +typedef struct dsaXBLASParam_s { + size_t struct_size; /**< Size of this struct in bytes. Used to ensure that the host application and DSA see the same struct*/ + + dsaXBLASType blas_type; /**< Type of BLAS computation to perform */ + + dsaXBLASLib blas_lib; /**< Which BLAS library to use for BLAS ops */ + + // GEMM params + dsaXBLASOperation trans_a; /**< operation op(A) that is non- or (conj.) transpose. */ + dsaXBLASOperation trans_b; /**< operation op(B) that is non- or (conj.) transpose. */ + int m; /**< number of rows of matrix op(A) and C. */ + int n; /**< number of columns of matrix op(B) and C. */ + int k; /**< number of columns of op(A) and rows of op(B). */ + int lda; /**< leading dimension of two-dimensional array used to store the matrix A. */ + int ldb; /**< leading dimension of two-dimensional array used to store matrix B. */ + int ldc; /**< leading dimension of two-dimensional array used to store matrix C. */ + long long int a_offset; /**< position of the A array from which begin read/write. 
*/ + long long int b_offset; /**< position of the B array from which begin read/write. */ + long long int c_offset; /**< position of the C array from which begin read/write. */ + long long int a_stride; /**< stride of the A array in strided(batched) mode */ + long long int b_stride; /**< stride of the B array in strided(batched) mode */ + long long int c_stride; /**< stride of the C array in strided(batched) mode */ + std::complex alpha; /**< scalar used for multiplication. */ + std::complex beta; /**< scalar used for multiplication. If beta==0, C does not have to be a valid input. */ + + // Common params + int batch_count; /**< number of pointers contained in arrayA, arrayB and arrayC. */ + dsaXBLASDataType data_type; /**< Specifies if using S(C) or D(Z) BLAS type */ + dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */ + +} dsaXBLASParam; + +// Structure that carries Correlator class parameters +typedef struct dsaXCorrParam_s { + size_t struct_size; /**< Size of this struct in bytes. 
Used to ensure that the host application and DSA see the same struct*/ + + dsaXBLASLib blas_lib; /**< Which BLAS library to use for BLAS ops */ + dsaXBLASDataType data_type; /**< Specifies if using S(C) or D(Z) BLAS type */ + dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */ + +} dsaXCorrParam; + +// Global timing and metrics structure for dsaX +typedef struct metrics_s { + + // Mem copy times + double mem_copy_time_H2H; + double mem_copy_time_H2D; + double mem_copy_time_D2H; + double mem_copy_time_D2D; + + // Mem copy size + double mem_copy_size_H2H; + double mem_copy_size_H2D; + double mem_copy_size_D2H; + double mem_copy_size_D2D; + + // Compute + double compute_time; + double compute_flops; + + // Initialisation + double initialisation_time; +} metrics; + +// Parameter struct helper functions for user +const char *getBLASLibString(dsaXBLASLib lib); +const char *getBLASDataTypeString(dsaXBLASDataType type); +const char *getBLASDataOrderString(dsaXBLASDataOrder order); +void printDsaXBLASParam(const dsaXBLASParam param); +void printDsaXCorrParam(const dsaXCorrParam param); + +// Create params +dsaXCorrParam newDsaXCorrParam(void); diff --git a/include/dsaX_utils.h b/include/dsaX_utils.h index f2dbc0c..fa22abe 100644 --- a/include/dsaX_utils.h +++ b/include/dsaX_utils.h @@ -1,6 +1,6 @@ #pragma once -#include "dsaX.h" +#include "dsaX_params.h" void dsaXmemset(void *array, int ch, size_t n); void dsaXmemcpy(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index aaacfa5..f885512 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -19,6 +19,7 @@ set(DSAX_OBJS dsaX_correlator.cpp dsaX_interface.cpp dsaX_utils.cpp + dsaX_params.cpp dsaX_psrdada_utils.cpp ) @@ -47,41 +48,41 @@ if(GITVERSION) endif() mark_as_advanced(DSAX_GITDIR) -# generate a cmake object library for all cpp files first +# generate a cmake object library for all cpp files first add_library(dsax_cpp 
OBJECT ${DSAX_OBJS}) if(DSA_XENGINE_BUILD_SHAREDLIB) set_target_properties(dsax_cpp PROPERTIES POSITION_INDEPENDENT_CODE TRUE) - add_library(dsax SHARED) + add_library(dsaX SHARED) else() - add_library(dsax STATIC) + add_library(dsaX STATIC) endif() -add_library(DSA_XENGINE::dsax ALIAS dsax) +add_library(DSA_XENGINE::dsaX ALIAS dsaX) -# make one library -target_sources(dsax PRIVATE $ ${DSAX_CU_OBJS}) +# make one library +target_sources(dsaX PRIVATE $ ${DSAX_CU_OBJS}) if(CUDAToolkit_FOUND) - target_link_libraries(dsax INTERFACE CUDA::cudart_static ${CUDA_cublas_LIBRARY}) + target_link_libraries(dsaX INTERFACE CUDA::cudart_static ${CUDA_cublas_LIBRARY}) endif() if(DSA_XENGINE_ENABLE_PSRDADA) include_directories(${PSRDada_SOURCE_DIR}/src) set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so) - target_link_libraries(dsax PUBLIC ${PSRDada_LIB}) + target_link_libraries(dsaX PUBLIC ${PSRDada_LIB}) endif() if(DSA_XENGINE_ENABLE_XGPU) include_directories(${xGPU_SOURCE_DIR}/src) set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a) - target_link_libraries(dsax PUBLIC ${XGPU_LIB}) + target_link_libraries(dsaX PUBLIC ${XGPU_LIB}) endif() if(DSA_XENGINE_ENABLE_CUTLASS) include_directories(${NvidiaCutlass_DIR}/../../../include) include_directories(${NvidiaCutlass_DIR}/../../../include/cutlass/util) set(NvidiaCutlass_LIB ${NvidiaCutlass_DIR}/../../../lib64/libcutlass.so) - target_link_libraries(dsax PUBLIC ${NvidiaCutlass_LIB}) + target_link_libraries(dsaX PUBLIC ${NvidiaCutlass_LIB}) # Some simple CUTLASS examples to test linking/benching #------------------------------------------------------ @@ -107,7 +108,7 @@ endif() #----------------------------- install(TARGETS # cmake-format: sortable - dsax + dsaX LIBRARY DESTINATION lib ) diff --git a/src/dsaX_correlator.cpp b/src/dsaX_correlator.cpp index 2d179d1..fecc184 100644 --- a/src/dsaX_correlator.cpp +++ b/src/dsaX_correlator.cpp @@ -10,15 +10,115 @@ Workflow is similar for BF and corr applications #include "dsaX_def.h" #include 
"dsaX.h" +#include "dsaX_ftd.h" #include "dsaX_blas_interface.h" #include "dsaX_utils.h" #include "dsaX_psrdada_utils.h" +Correlator::Correlator(const dsaXCorrParam *param) { + + // Transfer passed param to internal objects + corr_param = *param; + //printDsaXCorrParam(corr_param); + + // Select back end BLAS engine + blas_param.struct_size = sizeof(blas_param); + blas_param.blas_type = DSA_BLAS_GEMM; + blas_param.blas_lib = corr_param.blas_lib; + + // Initialise device memeory + initDsaXCorrDeviceMemory(&d); + + // gemm settings + // input: [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS] + // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS] +#if defined OLD_BLAS + //std::cout << "Old params" << std::endl; + blas_param.data_order = DSA_BLAS_DATAORDER_COL; + blas_param.trans_a = DSA_BLAS_OP_A; + blas_param.trans_b = DSA_BLAS_OP_T; + blas_param.m = NANTS; + blas_param.n = NANTS; + blas_param.k = NPACKETS_PER_BLOCK/halfFac; + blas_param.alpha = 1.0; + blas_param.lda = blas_param.m; + blas_param.ldb = blas_param.n; + blas_param.beta = 0.; + blas_param.ldc = blas_param.m; + blas_param.a_stride = NPACKETS_PER_BLOCK*NANTS/halfFac; + blas_param.b_stride = NPACKETS_PER_BLOCK*NANTS/halfFac; + blas_param.c_stride = NANTS*NANTS; + blas_param.batch_count = NCHAN_PER_PACKET*2*2*halfFac; + blas_param.a_offset = 0; + blas_param.b_offset = 0; + blas_param.c_offset = 0; +#else + //std::cout << "My params" << std::endl; + blas_param.data_order = DSA_BLAS_DATAORDER_ROW; + blas_param.trans_a = DSA_BLAS_OP_C; + blas_param.trans_b = DSA_BLAS_OP_N; + blas_param.m = NANTS; + blas_param.n = NANTS; + blas_param.k = NPACKETS_PER_BLOCK/halfFac; + blas_param.alpha = 1.0; + blas_param.lda = blas_param.m; + blas_param.ldb = blas_param.n; + blas_param.beta = 0.; + blas_param.ldc = blas_param.m; + blas_param.a_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;; + blas_param.b_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;; + blas_param.c_stride = NANTS*NANTS; + blas_param.batch_count = 
NCHAN_PER_PACKET*2*2*halfFac; + blas_param.a_offset = 0; + blas_param.b_offset = 0; + blas_param.c_offset = 0; +#endif + + // Swap A and B if in row order + if (blas_param.data_order == DSA_BLAS_DATAORDER_ROW) { + std::swap(blas_param.m, blas_param.n); + std::swap(blas_param.lda, blas_param.ldb); + std::swap(blas_param.trans_a, blas_param.trans_b); + std::swap(blas_param.a_offset, blas_param.b_offset); + std::swap(blas_param.a_stride, blas_param.b_stride); + //std::swap(A_data, B_data); + //std::swap(A_data, B_data); + } +} + +Correlator::~Correlator() { + destroyDsaXCorrDeviceMemory(&d); +} + +void Correlator::compute(void *output, void *input) { + + // zero out output arrays + dsaXmemset(d.d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short + dsaXmemset(d.d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short + dsaXmemset(d.d_output, 0, NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float)); + + // copy to device + dsaXmemcpy(d.d_input, input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2, dsaXMemcpyHostToDevice); + + // reorder input into real and imaginary arrays of half + reorderCorrInput(&d); + + // Perform GEMM accoring to back end configuration + dsaXHgemmStridedBatched(d.d_r, d.d_i, d.d_r, d.d_i, d.d_outr, d.d_outi, blas_param); + + // reorder output data + reorderCorrOutput(&d); + + // Pass result back to host + dsaXmemcpy(output, d.d_output, NBASE*NCHAN_PER_PACKET*2*2*4, dsaXMemcpyDeviceToHost); +} + + // correlator function -// workflow: copy to device, reorder, stridedBatchedGemm, reorder -// DMH CUDA references excised. +// workflow: copy to device, reorder, stridedBatchedGemm, reorder, copy back to host +// DMH: CUDA references excised. 
Make me a class void dcorrelator(dmem_corr *d) { - + // zero out output arrays dsaXmemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short dsaXmemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short @@ -39,7 +139,7 @@ void dcorrelator(dmem_corr *d) { // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS] #if defined OLD_BLAS - std::cout << "Old params" << std::endl; + //std::cout << "Old params" << std::endl; blas_param.data_order = DSA_BLAS_DATAORDER_COL; blas_param.trans_a = DSA_BLAS_OP_A; @@ -60,7 +160,7 @@ void dcorrelator(dmem_corr *d) { blas_param.b_offset = 0; blas_param.c_offset = 0; #else - std::cout << "My params" << std::endl; + //std::cout << "My params" << std::endl; blas_param.data_order = DSA_BLAS_DATAORDER_ROW; blas_param.trans_a = DSA_BLAS_OP_C; @@ -94,7 +194,7 @@ void dcorrelator(dmem_corr *d) { } - printDsaXBLASParam(blas_param); + //printDsaXBLASParam(blas_param); // DMH: fix me blas_param.blas_lib = DSA_BLAS_LIB_CUBLAS; diff --git a/src/dsaX_cublas_interface.cu b/src/dsaX_cublas_interface.cu index 597cfbd..0bffaea 100644 --- a/src/dsaX_cublas_interface.cu +++ b/src/dsaX_cublas_interface.cu @@ -1,5 +1,7 @@ #include + #include "dsaX.h" +#include "dsaX_params.h" #include "dsaX_cuda_headers.h" using namespace std; diff --git a/src/dsaX_cuda_interface.cu b/src/dsaX_cuda_interface.cu index 0046b1e..ec54675 100644 --- a/src/dsaX_cuda_interface.cu +++ b/src/dsaX_cuda_interface.cu @@ -319,6 +319,10 @@ void dsaXmemsetCuda(void *array, int ch, size_t n){ cudaMemset(array, ch, n); } +void dsaXDeviceSynchronizeCuda() { + cudaDeviceSynchronize(); +} + void dsaXmemcpyCuda(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind){ cudaError error = cudaSuccess; switch(kind) { @@ -340,7 +344,3 @@ void dsaXmemcpyCuda(void *array_out, void *array_in, size_t n, dsaXMemcpyKind ki if(error != cudaSuccess) cudaGetLastError(); } -void dsaXDeviceSynchronizeCuda() { - 
cudaDeviceSynchronize(); -} - diff --git a/src/dsaX_interface.cpp b/src/dsaX_interface.cpp index 0c88ee0..6358df1 100644 --- a/src/dsaX_interface.cpp +++ b/src/dsaX_interface.cpp @@ -1,42 +1,33 @@ #include #include #include +#include +#include "dsaX_params.h" #include "dsaX_cuda_interface.h" #include "dsaX_utils.h" #include "dsaX_ftd.h" using namespace std; -void printDsaXBLASParam(const dsaXBLASParam param) { - - cout << "struct_size = " << param.struct_size << endl; - cout << "blas_type = " << param.blas_type << endl; - cout << "blas_lib = " << param.blas_lib << endl; - cout << "data_order = " << param.data_order << endl; - cout << "trans_a = " << param.trans_a << endl; - cout << "trans_b = " << param.trans_b << endl; - cout << "m = " << param.m << endl; - cout << "n = " << param.n << endl; - cout << "k = " << param.k << endl; - cout << "lda = " << param.lda << endl; - cout << "ldb = " << param.ldb << endl; - cout << "ldc = " << param.ldc << endl; - cout << "a_offset = " << param.a_offset << endl; - cout << "b_offset = " << param.b_offset << endl; - cout << "c_offset = " << param.c_offset << endl; - cout << "a_stride = " << param.a_stride << endl; - cout << "b_stride = " << param.b_stride << endl; - cout << "c_stride = " << param.c_stride << endl; - cout << "alpha = " << param.alpha << endl; - cout << "bets = " << param.alpha << endl; - cout << "batch_count = " << param.batch_count << endl; -} void dsaXInit(int dev){ #if DSA_XENGINE_TARGET_CUDA dsaXInitCuda(dev); #endif + + std::cout << " --- Starting dsaX with configuration (defined in dsaX_def.h) --- " << endl; + std::cout << "NPACKETS_PER_BLOCK = " << NPACKETS_PER_BLOCK << std::endl; + std::cout << "NCHAN = " << NCHAN << std::endl; + std::cout << "NCHAN_PER_PACKET = " << NCHAN_PER_PACKET << std::endl; + std::cout << "NPOL = " << NPOL << std::endl; + std::cout << "NARM = " << 3 << std::endl; + std::cout << " --- End dsaX configuration --- " << endl; + //DMH: Add more (ask Vikram) +} + +void dsaXEnd() { + // 
output metrics } void inspectPackedData(char input, int i, bool non_zeros) { @@ -51,10 +42,10 @@ void inspectPackedData(char input, int i, bool non_zeros) { } } -void dsaXCorrelator(void *output_data, void *input_data) { +void dsaXCorrelator(void *output_data, void *input_data, dsaXCorrParam *param) { dmem_corr d; -#if DSA_XENGINE_TARGET_CUDA +#if DSA_XENGINE_TARGET_CUDA initializeCorrCudaMemory(&d); d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); memcpy(d.h_input, (char*)input_data, sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); diff --git a/src/dsaX_magma_interface.cu b/src/dsaX_magma_interface.cu index 14a8f4f..eabfdbf 100644 --- a/src/dsaX_magma_interface.cu +++ b/src/dsaX_magma_interface.cu @@ -1,5 +1,7 @@ #include + #include "dsaX.h" +#include "dsaX_params.h" #include "dsaX_cuda_headers.h" #include "dsaX_magma_headers.h" diff --git a/src/dsaX_params.cpp b/src/dsaX_params.cpp new file mode 100644 index 0000000..7ed0d5b --- /dev/null +++ b/src/dsaX_params.cpp @@ -0,0 +1,102 @@ +#include + +#include "dsaX_params.h" + +using namespace std; + +const char *getBLASLibString(dsaXBLASLib lib) +{ + const char *ret; + + switch (lib) { + case DSA_BLAS_LIB_CUBLAS: ret = "CUBLAS"; break; + case DSA_BLAS_LIB_MAGMA: ret = "MAGMA"; break; + case DSA_BLAS_LIB_CUTLASS: ret = "CUTLAS"; break; + case DSA_BLAS_LIB_OPENBLAS: ret = "OPENBLAS"; break; + case DSA_BLAS_LIB_NATIVE: ret = "NATIVE"; break; + default: ret = "unknown"; break; + } + + return ret; +} + +const char *getBLASDataTypeString(dsaXBLASDataType type) +{ + const char *ret; + + switch (type) { + case DSA_BLAS_DATATYPE_H: ret = "Half"; break; + case DSA_BLAS_DATATYPE_S: ret = "Single"; break; + case DSA_BLAS_DATATYPE_D: ret = "Double"; break; + case DSA_BLAS_DATATYPE_HC: ret = "Complex(half)"; break; + case DSA_BLAS_DATATYPE_C: ret = "Complex(single)"; break; + case DSA_BLAS_DATATYPE_Z: ret = "Complex(double)"; break; + case DSA_BLAS_DATATYPE_4b_REAL: ret = "4b sized 
real"; break; + case DSA_BLAS_DATATYPE_2b_REAL: ret = "2b sized real"; break; + case DSA_BLAS_DATATYPE_4b_COMPLEX: ret = "Char sized complex (4b,4b)"; break; + case DSA_BLAS_DATATYPE_2b_COMPLEX: ret = "4b sized (2b,2b)"; break; + default: ret = "unknown"; break; + } + + return ret; +} + +const char *getBLASDataOrderString(dsaXBLASDataOrder order) +{ + const char *ret; + + switch (order) { + case DSA_BLAS_DATAORDER_ROW: ret = "Row order"; break; + case DSA_BLAS_DATAORDER_COL: ret = "Column order"; break; + default: ret = "unknown"; break; + } + + return ret; +} + +void printDsaXCorrParam(const dsaXCorrParam param) { + + cout << "--- dsaXCorrParam begin ---" << endl; + cout << "struct_size = " << param.struct_size << endl; + cout << "blas_lib = " << getBLASLibString(param.blas_lib) << endl; + cout << "data_type = " << getBLASDataTypeString(param.data_type) << endl; + cout << "data_order = " << getBLASDataOrderString(param.data_order) << endl; + cout << " --- dsaXCorrParam end ---" << endl; +} + +void printDsaXBLASParam(const dsaXBLASParam param) { + + cout << " --- dsaXBLASParam begin ---" << endl; + cout << "struct_size = " << param.struct_size << endl; + cout << "blas_type = " << param.blas_type << endl; + cout << "blas_lib = " << param.blas_lib << endl; + cout << "data_type = " << param.data_type << endl; + cout << "data_order = " << param.data_order << endl; + cout << "trans_a = " << param.trans_a << endl; + cout << "trans_b = " << param.trans_b << endl; + cout << "m = " << param.m << endl; + cout << "n = " << param.n << endl; + cout << "k = " << param.k << endl; + cout << "lda = " << param.lda << endl; + cout << "ldb = " << param.ldb << endl; + cout << "ldc = " << param.ldc << endl; + cout << "a_offset = " << param.a_offset << endl; + cout << "b_offset = " << param.b_offset << endl; + cout << "c_offset = " << param.c_offset << endl; + cout << "a_stride = " << param.a_stride << endl; + cout << "b_stride = " << param.b_stride << endl; + cout << "c_stride = " << 
param.c_stride << endl; + cout << "alpha = " << param.alpha << endl; + cout << "beta = " << param.beta << endl; + cout << "batch_count = " << param.batch_count << endl; + cout << " --- dsaXBLASParam end ---" << endl; +} + +dsaXCorrParam newDsaXCorrParam(void) { + dsaXCorrParam new_param; + new_param.struct_size = sizeof(new_param); + new_param.blas_lib = DSA_BLAS_LIB_INVALID; + new_param.data_type = DSA_BLAS_DATATYPE_INVALID; + new_param.data_order = DSA_BLAS_DATAORDER_INVALID; + return new_param; +} diff --git a/src/dsaX_utils.cpp b/src/dsaX_utils.cpp index 54e849a..3819e98 100644 --- a/src/dsaX_utils.cpp +++ b/src/dsaX_utils.cpp @@ -1,7 +1,12 @@ +#include + #include "dsaX_utils.h" #include "dsaX_enums.h" +#include "dsaX_params.h" #include "dsaX_cuda_interface.h" +using namespace std; + void dsaXmemset(void *array, int ch, size_t n){ #ifdef DSA_XENGINE_TARGET_CUDA dsaXmemsetCuda(array, ch, n); @@ -21,9 +26,27 @@ void dsaXmemcpy(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind){ void dsaXDeviceSynchronize() { #ifdef DSA_XENGINE_TARGET_CUDA - // Perform host to device memcopy on data + // Synchronise the device dsaXDeviceSynchronizeCuda(); #else // NO OP #endif } + +void initDsaXCorrDeviceMemory(dmem_corr *d) { +#ifdef DSA_XENGINE_TARGET_CUDA + initializeCorrCudaMemory(d); +#else + cout << "dsaX Error: Not implemented." << endl; + exit(0); +#endif +} + +void destroyDsaXCorrDeviceMemory(dmem_corr *d) { +#ifdef DSA_XENGINE_TARGET_CUDA + deallocateCorrCudaMemory(d); +#else + cout << "dsaX Error: Not implemented." 
<< endl; + exit(0); +#endif +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 3722671..4a93c15 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -5,4 +5,4 @@ include_directories(${CLI11_SOURCE_DIR}/include/CLI) add_library(dsaX_tests command_line_params.cpp) add_executable(dsaX_correlator_test dsaX_correlator_test.cpp) -target_link_libraries(dsaX_correlator_test dsax dsaX_tests) +target_link_libraries(dsaX_correlator_test dsaX dsaX_tests) diff --git a/tests/command_line_params.cpp b/tests/command_line_params.cpp index fa48729..69331b3 100644 --- a/tests/command_line_params.cpp +++ b/tests/command_line_params.cpp @@ -14,7 +14,8 @@ bool run_correlator = false; double start_frequency = 1498.75; // Test file -std::string test_filename; +std::string input_filename = "input.dat"; +std::string output_filename = "output.dat"; int n_channels = 384; int n_antennae = 63; int n_pol = 2; @@ -34,7 +35,8 @@ std::shared_ptr make_app(std::string app_description, std::string app_n dsaX_app->add_option("--start-frequency", start_frequency, "start frequency (assumes 1498.75)"); // Input file options - dsaX_app->add_option("--test-filename", test_filename, "Name of file on which to run tests"); + dsaX_app->add_option("--input-filename", input_filename, "Name of file on which to run tests"); + dsaX_app->add_option("--output-filename", output_filename, "Name of file on which to write results"); dsaX_app->add_option("--n-channels", n_channels, "Number of frequency channels [default 384]"); dsaX_app->add_option("--n-antennae", n_antennae, "Number of antennae [default 63]"); dsaX_app->add_option("--n-pol", n_pol, "Number of polarizations [default 2]"); diff --git a/tests/command_line_params.h b/tests/command_line_params.h new file mode 100644 index 0000000..06e67ac --- /dev/null +++ b/tests/command_line_params.h @@ -0,0 +1,35 @@ +#pragma once + +#include +#include + +class dsaXApp : public CLI::App { + +public: + dsaXApp(std::string app_description = "", 
std::string app_name = "") : CLI::App(app_description, app_name) {}; + + virtual ~dsaXApp() {}; +}; + +std::shared_ptr make_app(std::string app_description = "dsaX internal test", std::string app_name = ""); + +// General +extern int core; +extern bool debug; + +// Data block HDU keys +extern key_t in_key; +extern key_t out_key; + +// Test mode +extern bool run_beamformer; +extern bool run_correlator; +extern double start_frequency; + +// Test file +extern std::string input_filename; +extern std::string output_filename; +extern int n_channels; +extern int n_antennae; +extern int n_pol; +extern int n_times; diff --git a/tests/dsaX_correlator_test.cpp b/tests/dsaX_correlator_test.cpp index bea7afa..2ce7390 100644 --- a/tests/dsaX_correlator_test.cpp +++ b/tests/dsaX_correlator_test.cpp @@ -93,35 +93,16 @@ int main(int argc, char **argv) { return app->exit(e); } - // command line arguments - int device_ordinal = 0; - - std::cout << "NPACKETS_PER_BLOCK = " << NPACKETS_PER_BLOCK << std::endl; - std::cout << "NCHAN = " << NCHAN << std::endl; - std::cout << "NCHAN_PER_PACKET = " << NCHAN_PER_PACKET << std::endl; - std::cout << "NPOL = " << NPOL << std::endl; - std::cout << "NARM = " << 2 << std::endl; - unsigned long long size = sizeof(char); - size *= NPACKETS_PER_BLOCK; - size *= NANTS; - size *= NCHAN_PER_PACKET; - size *= NPOL; - size *= NCOMPLEX; - std::cout << "(bytes) char size * NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*NPOL*NCOMPLEX = " << size << std::endl; - std::cout << "Expected size of data array = " << (unsigned long long)(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*NPOL*NCOMPLEX) << std::endl; - std::cout << "Expected size of input array = " << (unsigned long long)(sizeof(char)*4*NANTS*NCHAN_PER_PACKET*NPOL*NCOMPLEX) << std::endl; - - //dsaX_init(); + int device_ordinal = 0; FILE *fin, *fout; uint64_t sz, output_size, in_block_size, rd_size; in_block_size = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2; - char * output_data, * o1; int nreps = 1, 
nchunks = 1; // read one block of input data // get size of file - std::cout << "attempting to read file " << test_filename.c_str() << std::endl; - fin=fopen(test_filename.c_str(), "rb"); + std::cout << "attempting to read file " << input_filename.c_str() << std::endl; + fin=fopen(input_filename.c_str(), "rb"); fseek(fin, 0L, SEEK_END); sz = ftell(fin); rewind(fin); @@ -136,9 +117,29 @@ int main(int argc, char **argv) { rd_size = sz; } - std::cout << "Creating char input_array of size " << sizeof(char)*in_block_size << std::endl; + // Start dsaX program + //--------------------------------------- + dsaXInit(device_ordinal); + + // Create Correlator class instance. + dsaXCorrParam param = newDsaXCorrParam(); + param.blas_lib = DSA_BLAS_LIB_CUBLAS; + param.data_type = DSA_BLAS_DATATYPE_4b_COMPLEX; + param.data_order = DSA_BLAS_DATAORDER_ROW; + printDsaXCorrParam(param); + auto correlator = new Correlator(¶m); + + output_size = NBASE*NCHAN_PER_PACKET*2*2*4; + std::cout << "Creating char output_array of size " << (1.0*sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2)/pow(1024,2) << " MB." << std::endl; + char *output_data = (char *)malloc(output_size); + + std::cout << "Creating char input_array of size " << (1.0*sizeof(char)*in_block_size)/pow(1024,2) << " MB." << std::endl; char *input_data = (char *)malloc(in_block_size); - + + std::cout << "Computing " << nreps << " repetitions of " << nchunks << " chunks of input data of size " << rd_size << " bytes." << endl; + std::cout << "Total input size = " << (1.0 * nreps * nchunks * rd_size)/pow(1024,3) << " GB." << endl; + std::cout << "Expected output size = " << (1.0 * nreps * nchunks * output_size)/pow(1024,3) << " GB." 
<< endl; + // Loop over reps and chunks for (int reps = 0; reps0) rewind(fin); fread(input_data + chunks*rd_size, rd_size, 1, fin); - std::cout << "Input peek " << std::endl; + //std::cout << "Input peek " << std::endl; //for (int i=0; i<8; i++) inspectPackedData(input_data[i], i); - - std::cout << "Creating char output_array of size " << sizeof(char)*NBASE*NCHAN_PER_PACKET*2*2*4 << std::endl; - output_size = NBASE*NCHAN_PER_PACKET*2*2*4; - output_data = (char *)malloc(output_size); // run correlator and record output data - syslog(LOG_INFO,"run correlator"); - dsaXCorrelator((void*)output_data, (void*)input_data); + //dsaXCorrelator((void*)output_data, (void*)input_data, ¶m); + correlator->compute((void*)output_data, (void*)input_data); - std::cout << "Output peek " << std::endl; - for(int i=0; i Date: Fri, 28 Jun 2024 22:24:14 -0700 Subject: [PATCH 26/30] remove timer download, include header only --- CMakeLists.txt | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a5a2333..440c6f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -300,25 +300,6 @@ if(DSA_XENGINE_ENABLE_ZFP) endif() endif() -# Get Timer dependency https://github.com/cpp-core/timer.git -# Get timer dependency -option(DSA_XENGINE_ENABLE_TIMER "Enable timer" ON) -if(DSA_XENGINE_ENABLE_TIMER) - option(DSA_XENGINE_DOWNLOAD_TIMER "Download and build timer" ON) - if(DSA_XENGINE_DOWNLOAD_TIMER) - # Download, build and install - FetchContent_Declare( - TIMER - GIT_REPOSITORY https://github.com/cpp-core/timer.git - GIT_TAG main - ) - FetchContent_MakeAvailable(TIMER) - else() - # Find and link to local install - find_package(TIMER REQUIRED) - endif() -endif() - # Add src, include, tests, and legacy add_subdirectory(src) add_subdirectory(include) From fd273c365f707310ca519c2c338b76bb2030351f Mon Sep 17 00:00:00 2001 From: cpviolator Date: Thu, 4 Jul 2024 17:36:07 -0700 Subject: [PATCH 27/30] Implemented overlapping comms and compute for the 
Correlator class --- include/CMakeLists.txt | 1 + include/dsaX.h | 8 +- include/dsaX_blas_interface.h | 2 +- include/dsaX_cublas_interface.h | 2 +- include/dsaX_cuda_interface.h | 29 +++- include/dsaX_cuda_kernels.h | 50 ++++++- include/dsaX_enums.h | 4 + include/dsaX_ftd.h | 96 ++++++++++-- include/dsaX_interface.h | 8 +- include/dsaX_params.h | 29 +--- include/dsaX_utils.h | 5 +- src/CMakeLists.txt | 2 + src/dsaX_beamformer.cpp | 2 +- src/dsaX_blas_interface.cpp | 6 +- src/dsaX_correlator.cpp | 155 +++++++++++++++----- src/dsaX_cublas_interface.cu | 79 +++++++--- src/dsaX_cuda_interface.cu | 251 +++++++++++++++++++------------- src/dsaX_interface.cpp | 115 ++++++++++----- src/dsaX_params.cpp | 16 +- src/dsaX_utils.cpp | 19 ++- tests/command_line_params.cpp | 26 +++- tests/command_line_params.h | 9 +- tests/dsaX_correlator_test.cpp | 141 +++++++++++------- 23 files changed, 718 insertions(+), 337 deletions(-) diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt index e8ec2d6..58b1566 100644 --- a/include/CMakeLists.txt +++ b/include/CMakeLists.txt @@ -8,6 +8,7 @@ set(DSA_XENGINE_HEADERS dsaX_def.h dsaX_ftd.h dsaX_cuda_interface.h + dsaX_cuda_handles.h dsaX_cuda_headers.h dsaX_capture.h dsaX_capture_manythread.h diff --git a/include/dsaX.h b/include/dsaX.h index cc3ff5c..eab6f75 100644 --- a/include/dsaX.h +++ b/include/dsaX.h @@ -9,7 +9,7 @@ // Use manual transpose route // Uncomment to try new pure cuBLAS -#define OLD_BLAS +//#define OLD_BLAS // required to prevent overflow in corr matrix multiply #define halfFac 4 @@ -17,9 +17,13 @@ // beam sep #define sep 1.0 // arcmin -void dsaXInit(int device_ordinal = 0); +void dsaXInit(int device_ordinal = -1); void dsaXEnd(); +//void dsaX + +void *dsaXHostRegister(size_t size); + void inspectPackedData(char input, int i, bool non_zero = false); void dsaXCorrelator(void *output_data, void *input_data, dsaXCorrParam *param); diff --git a/include/dsaX_blas_interface.h b/include/dsaX_blas_interface.h index 
49564b5..4c6edaf 100644 --- a/include/dsaX_blas_interface.h +++ b/include/dsaX_blas_interface.h @@ -2,4 +2,4 @@ #include "dsaX_interface.h" -void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param); +void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param, int stream = 0); diff --git a/include/dsaX_cublas_interface.h b/include/dsaX_cublas_interface.h index 5aea5ef..f68eea3 100644 --- a/include/dsaX_cublas_interface.h +++ b/include/dsaX_cublas_interface.h @@ -1,4 +1,4 @@ #pragma once #include "dsaX.h" -void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param); +void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param, int stream); diff --git a/include/dsaX_cuda_interface.h b/include/dsaX_cuda_interface.h index 54e2609..4ad2aed 100644 --- a/include/dsaX_cuda_interface.h +++ b/include/dsaX_cuda_interface.h @@ -7,26 +7,39 @@ #include "dsaX.h" void dsaXInitCuda(int dev); +void dsaXDestroyCuda(int dev); -void initializeCorrCudaMemory(dmem_corr *d); +void initBLASCuda(); +void destroyBLASCuda(); -void initializeBFCudaMemory(dmem_bf *d); +void initStreamsCuda(unsigned int n); +void destroyStreamsCuda(); -void deallocateCorrCudaMemory(dmem_corr *d); +void promoteComplexCharToPlanarHalfCuda(corr_handle *d, unsigned int stream); -void deallocateBFCudaMemory(dmem_bf *d); +void initializeCorrCudaMemory(corr_handle *d, unsigned int n_streams); + +void initializeBFCudaMemory(bf_handle *d); + +void deallocateCorrCudaMemory(corr_handle *d); + +void deallocateBFCudaMemory(bf_handle *d); void dsaXmemsetCuda(void *array, int ch, size_t n); -void dsaXmemcpyCuda(void *array_device, void *array_host, size_t n, dsaXMemcpyKind kind); +void dsaXmemcpyCuda(void *array_device, void 
*array_host, size_t n, dsaXMemcpyKind kind, int stream); + +void *dsaXHostRegisterCuda(size_t size); void dsaXDeviceSynchronizeCuda(); -void reorderCorrOutputCuda(dmem_corr *d); +void reorderCorrOutputCuda(corr_handle *d, int stream); + +void computeIndicesCuda(corr_handle *d); -void reorderCorrInputCuda(dmem_corr *d); +void reorderCorrInputCuda(corr_handle *d, int stream); -void calcWeightsCuda(dmem_bf *d); +void calcWeightsCuda(bf_handle *d); template void transposeMatrixCuda(in_prec *idata, out_prec *odata); diff --git a/include/dsaX_cuda_kernels.h b/include/dsaX_cuda_kernels.h index 0c2cb7c..49e9ff0 100644 --- a/include/dsaX_cuda_kernels.h +++ b/include/dsaX_cuda_kernels.h @@ -2,11 +2,11 @@ #include "dsaX_cuda_headers.h" -__device__ void inspectPackedDataInKernel(char input, int i) { +__global__ void inspectPackedDataInKernel(char input, int i) { float re = (float)((char)(( (unsigned char)(input) & (unsigned char)(15) ) << 4) >> 4); float im = (float)((char)(( (unsigned char)(input) & (unsigned char)(240))) >> 4); - if(re != 0 || im != 0) printf("val[%d] = (%f,%f)\n", i, re, im); + if(re != 0 || im != 0) printf("K val[%d] = (%f,%f)\n", i, re, im); } // KERNELS @@ -58,15 +58,20 @@ __global__ void corr_output_copy(half *outr, half *outi, float *output, int *ind float v1=0., v2=0.; + //if(idx<1) printf("output pre (%f, %f)\n", output[2*idx], output[2*idx+1]); + // Use CUDA casting intrinsic __half2float for (int i=0;i __global__ void transpose_matrix( y = blockIdx.x * 32 + threadIdx.y; width = gridDim.y * 32; - for (int j = 0; j < 32; j += 8) - odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; + for (int j = 0; j < 32; j += 8) { + odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; + //inspectPackedDataInKernel(odata[(y+j)*width + x], (y+j)*width + x); + } +} + +// transpose kernel +// assume breakdown into tiles of 32x32, and run with 32x8 threads per block +// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32) +// here, 
width is the dimension of the fastest index +__global__ void transpose_matrix_float(half * idata, half * odata) { + + __shared__ float tile[32][33]; + + int x = blockIdx.x * 32 + threadIdx.x; + int y = blockIdx.y * 32 + threadIdx.y; + int width = gridDim.x * 32; + + for (int j = 0; j < 32; j += 8) { + tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; + //printf("K transpose_matrix_float_in[%d] = %f\n", (y+j)*width + x, __half2float(idata[(y+j)*width + x])); + } + + __syncthreads(); + + x = blockIdx.y * 32 + threadIdx.x; // transpose block offset + y = blockIdx.x * 32 + threadIdx.y; + width = gridDim.y * 32; + for (int j = 0; j < 32; j += 8) { + odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; + //printf("K transpose_matrix_float_out[%d] = %f\n", (y+j)*width + x, __half2float(odata[(y+j)*width + x])); + } } + // DMH: TUNABLE // transpose kernel // assume breakdown into tiles of 32x32, and run with 32x8 threads per block @@ -126,6 +162,7 @@ __global__ void transpose_matrix_char(char * idata, char * odata) { for (int j = 0; j < blockDim.x; j += blockDim.y) { odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; //odata[(y+j)*width + x] = tile[threadIdx.x + blockDim.x*(threadIdx.y + j)]; + //inspectPackedDataInKernel(idata[(y+j)*width + x], (y+j)*width + x); } } @@ -165,6 +202,7 @@ __global__ void promoteComplexCharToPlanarHalf(char *input, half *inr, half *ini // Cast to float and use CUDA intrinsic to cast to signed half ini[iidx] = __float2half((float)((char)(( (unsigned char)(input[iidx]) & (unsigned char)(240) )) >> 4)); + //good //if(__half2float(inr[iidx]) != 0 || __half2float(ini[iidx]) != 0) printf("corr_input_copy %i = (%f,%f)\n", iidx, __half2float(inr[iidx]), __half2float(ini[iidx])); } diff --git a/include/dsaX_enums.h b/include/dsaX_enums.h index 9bffca0..607d9d3 100644 --- a/include/dsaX_enums.h +++ b/include/dsaX_enums.h @@ -57,5 +57,9 @@ typedef enum dsaXMemcpyKind_s { dsaXMemcpyHostToDevice = 1, dsaXMemcpyDeviceToHost = 2, 
dsaXMemcpyDeviceToDevice = 3, + dsaXMemcpyHostToHostAsync = 4, + dsaXMemcpyHostToDeviceAsync = 5, + dsaXMemcpyDeviceToHostAsync = 6, + dsaXMemcpyDeviceToDeviceAsync = 7, dsaXMemcpyInvalid = DSA_INVALID_ENUM } dsaXMemcpyKind; diff --git a/include/dsaX_ftd.h b/include/dsaX_ftd.h index 9c35043..2f05432 100644 --- a/include/dsaX_ftd.h +++ b/include/dsaX_ftd.h @@ -1,18 +1,24 @@ #pragma once -//#include "dsaX_def.h" #include "dsaX_enums.h" #include "dsaX_params.h" +#include "timer.h" + +using ms = std::chrono::microseconds; +using hrc = std::chrono::high_resolution_clock; // define structures that carry around memory pointers // and metric. // DMH: make a base and inherit into corr and bf -typedef struct dmem_corr_s { +typedef struct corr_handle_s { // initial data and streams char *h_input; // host input pointer char *d_input, *d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] + // DMH: fix me + void *d_idxs; + // correlator pointers // giant array for r and i: [NCHAN_PER_PACKET, 2 pol, NANTS_PROCESS, NPACKETS_PER_BLOCK *2 times] void *d_r, *d_i; //half @@ -21,11 +27,34 @@ typedef struct dmem_corr_s { // giant output array: [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex] float *d_output; - metrics metric_data; + dsaXCorrParam corr_param; + + double device_compute_flops; + double host_compute_flops; -} dmem_corr; + double H2D_bytes; + double D2H_bytes; + double D2D_bytes; + double H2H_bytes; -typedef struct dmem_bf_s { + // See 'using' at top of file for ms, hrc + timer::Timer dev_compute_timer; + timer::Timer dev_malloc_timer; + timer::Timer dev_memset_timer; + + timer::Timer H2D_timer; + timer::Timer D2H_timer; + timer::Timer D2D_timer; + timer::Timer H2H_timer; + + timer::Timer host_compute_timer; + timer::Timer host_malloc_timer; + timer::Timer host_memset_timer; + timer::Timer host_copy_timer; + +} corr_handle; + +typedef struct bf_handle_s { // beamformer pointers char *h_input; // host input pointer @@ -43,20 +72,47 @@ typedef 
struct dmem_bf_s { // timing (old) float cp, prep, cubl, outp; - metrics metric_data; + + // See 'using' at top of file ms, hrc + timer::Timer dev_compute_timer; + timer::Timer dev_malloc_timer; + timer::Timer dev_memset_timer; + + timer::Timer H2D_timer; + timer::Timer D2H_timer; + + timer::Timer host_compute_timer; + timer::Timer host_malloc_timer; + timer::Timer host_memset_timer; + timer::Timer host_copy_timer; + +} bf_handle; + +// Deprecated function, remove after development +void dcorrelator(corr_handle *d); + +// Base class +class dsaXBase { -} dmem_bf; + private: + protected: -void dcorrelator(dmem_corr *d); + public: + dsaXBase(); + ~dsaXBase(); + +}; -class Correlator { +class Correlator : public dsaXBase { private: protected: - - dmem_corr d; + + corr_handle d; dsaXCorrParam corr_param; dsaXBLASParam blas_param; + + uint64_t flops; public: @@ -72,8 +128,18 @@ class Correlator { ~Correlator(); }; -void destroyDsaXCorrDeviceMemory(dmem_corr *d); -void initDsaXCorrDeviceMemory(dmem_corr *d); -void reorderCorrelatorOutput(dmem_corr *d); -void reorderCorrelatorInput(dmem_corr *d); +void initDsaXCorrDeviceMemory(corr_handle *d, unsigned int n_streams); +void destroyDsaXCorrDeviceMemory(corr_handle *d); +void promoteComplexCharToPlanarHalf(corr_handle *d, unsigned int n_streams); + +void initBLAS(); +void destroyBLAS(); + +void initStreams(unsigned int n); +void destroyStreams(); + +void computeIndices(corr_handle *d); +void reorderCorrelatorOutput(corr_handle *d, int stream); +void reorderCorrelatorInput(corr_handle *d, int stream); + diff --git a/include/dsaX_interface.h b/include/dsaX_interface.h index a98215e..96442d1 100644 --- a/include/dsaX_interface.h +++ b/include/dsaX_interface.h @@ -6,9 +6,9 @@ // DMH: decorate these with Doxygen void dsaXCorrelator(void *input_data, void *output_data); -void reorderCorrInput(dmem_corr *d); +void reorderCorrInput(corr_handle *d, int stream = 0); -void reorderCorrOutput(dmem_corr *d); +void 
reorderCorrOutput(corr_handle *d, int stream = 0); void transposeInputBeamformer(double *input, double *output, std::vector &dimBlock, std::vector &dimGrid); @@ -17,3 +17,7 @@ void transposeScaleBeamformer(void *array_real, void *array_imag, unsigned char void fluffInputBeamformer(char *input, void *array_real, void *array_imag, int blocks, int tpb); void sumBeam(unsigned char *input, float *output, int blocks, int tpb); + +void dsaXInitStream(unsigned int n_streams); + +//void *dsaXHostRegister(size_t size); diff --git a/include/dsaX_params.h b/include/dsaX_params.h index bf5f455..85d2858 100644 --- a/include/dsaX_params.h +++ b/include/dsaX_params.h @@ -33,7 +33,7 @@ typedef struct dsaXBLASParam_s { std::complex beta; /**< scalar used for multiplication. If beta==0, C does not have to be a valid input. */ // Common params - int batch_count; /**< number of pointers contained in arrayA, arrayB and arrayC. */ + int batch_count; /**< number of pointers contained in arrayA, arrayB and arrayC. */ dsaXBLASDataType data_type; /**< Specifies if using S(C) or D(Z) BLAS type */ dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */ @@ -41,37 +41,16 @@ typedef struct dsaXBLASParam_s { // Structure that carries Correlator class parameters typedef struct dsaXCorrParam_s { - size_t struct_size; /**< Size of this struct in bytes. Used to ensure that the host application and DSA see the same struct*/ + size_t struct_size; /**< Size of this struct in bytes. 
Used to ensure that the host application and DSA see the same struct*/ dsaXBLASLib blas_lib; /**< Which BLAS library to use for BLAS ops */ dsaXBLASDataType data_type; /**< Specifies if using S(C) or D(Z) BLAS type */ dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */ + + int n_streams; /**< The number streams over which to compute input data */ } dsaXCorrParam; -// Global timing and metrics structure for dsaX -typedef struct metrics_s { - - // Mem copy times - double mem_copy_time_H2H; - double mem_copy_time_H2D; - double mem_copy_time_D2H; - double mem_copy_time_D2D; - - // Mem copy size - double mem_copy_size_H2H; - double mem_copy_size_H2D; - double mem_copy_size_D2H; - double mem_copy_size_D2D; - - // Compute - double compute_time; - double compute_flops; - - // Initialisation - double initialisation_time; -} metrics; - // Parameter struct helper functions for user const char *getBLASLibString(dsaXBLASLib lib); const char *getBLASDataTypeString(dsaXBLASDataType type); diff --git a/include/dsaX_utils.h b/include/dsaX_utils.h index fa22abe..fbc30fc 100644 --- a/include/dsaX_utils.h +++ b/include/dsaX_utils.h @@ -1,7 +1,10 @@ #pragma once #include "dsaX_params.h" +#include "timer.h" void dsaXmemset(void *array, int ch, size_t n); -void dsaXmemcpy(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind); + +void dsaXmemcpy(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind, int stream = 0); + void dsaXDeviceSynchronize(); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f885512..d79d89f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -13,9 +13,11 @@ endif() set(DSAX_OBJS dsaX_cuda_interface.cu dsaX_cublas_interface.cu + dsaX_cuda_handles.cu dsaX_magma_interface.cu dsaX_blas_interface.cpp dsaX_beamformer.cpp + dsaX_base.cpp dsaX_correlator.cpp dsaX_interface.cpp dsaX_utils.cpp diff --git a/src/dsaX_beamformer.cpp b/src/dsaX_beamformer.cpp index 61fbc5d..2dc5aef 100644 --- a/src/dsaX_beamformer.cpp 
+++ b/src/dsaX_beamformer.cpp @@ -29,7 +29,7 @@ using namespace std; */ // beamformer function -void dbeamformer(dmem_bf *d) { +void dbeamformer(bf_handle *d) { dsaXBLASParam blas_param; blas_param.trans_a = DSA_BLAS_OP_T; diff --git a/src/dsaX_blas_interface.cpp b/src/dsaX_blas_interface.cpp index e370e87..04be79b 100644 --- a/src/dsaX_blas_interface.cpp +++ b/src/dsaX_blas_interface.cpp @@ -4,13 +4,13 @@ #include "dsaX_cublas_interface.h" #include "dsaX_magma_interface.h" -void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param) { +void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param, int stream) { switch (param.blas_lib) { case DSA_BLAS_LIB_CUBLAS: - dsaXHgemmStridedBatchedCuda(real_a, imag_a, real_b, imag_b, real_c, imag_c, param); + dsaXHgemmStridedBatchedCuda(real_a, imag_a, real_b, imag_b, real_c, imag_c, param, stream); break; case DSA_BLAS_LIB_MAGMA: - dsaXHgemmStridedBatchedMagma(real_a, imag_a, real_b, imag_b, real_c, imag_c, param); + //dsaXHgemmStridedBatchedMagma(real_a, imag_a, real_b, imag_b, real_c, imag_c, param, stream); break; case DSA_BLAS_LIB_CUTLASS: //dsaXHgemmStridedBatchedCutlass(real_a, imag_a, real_b, imag_b, real_c, imag_c, param); diff --git a/src/dsaX_correlator.cpp b/src/dsaX_correlator.cpp index fecc184..4c3fe36 100644 --- a/src/dsaX_correlator.cpp +++ b/src/dsaX_correlator.cpp @@ -7,6 +7,7 @@ Workflow is similar for BF and corr applications */ #include +#include #include "dsaX_def.h" #include "dsaX.h" @@ -15,25 +16,36 @@ Workflow is similar for BF and corr applications #include "dsaX_utils.h" #include "dsaX_psrdada_utils.h" +using namespace std; + Correlator::Correlator(const dsaXCorrParam *param) { // Transfer passed param to internal objects corr_param = *param; - //printDsaXCorrParam(corr_param); + d.corr_param = *param; // Select back end BLAS engine blas_param.struct_size 
= sizeof(blas_param); blas_param.blas_type = DSA_BLAS_GEMM; blas_param.blas_lib = corr_param.blas_lib; + // Streams will be class specific + // so launch and destroy in the class + initStreams(corr_param.n_streams); + // Initialise device memeory - initDsaXCorrDeviceMemory(&d); + d.dev_malloc_timer.start(); + initDsaXCorrDeviceMemory(&d, corr_param.n_streams); + d.dev_malloc_timer.stop(); + + // Compute indices + computeIndices(&d); // gemm settings // input: [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS] // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS] #if defined OLD_BLAS - //std::cout << "Old params" << std::endl; + //cout << "Old params" << endl; blas_param.data_order = DSA_BLAS_DATAORDER_COL; blas_param.trans_a = DSA_BLAS_OP_A; blas_param.trans_b = DSA_BLAS_OP_T; @@ -53,7 +65,7 @@ Correlator::Correlator(const dsaXCorrParam *param) { blas_param.b_offset = 0; blas_param.c_offset = 0; #else - //std::cout << "My params" << std::endl; + //cout << "My params" << endl; blas_param.data_order = DSA_BLAS_DATAORDER_ROW; blas_param.trans_a = DSA_BLAS_OP_C; blas_param.trans_b = DSA_BLAS_OP_N; @@ -73,52 +85,115 @@ Correlator::Correlator(const dsaXCorrParam *param) { blas_param.b_offset = 0; blas_param.c_offset = 0; #endif - + // Swap A and B if in row order if (blas_param.data_order == DSA_BLAS_DATAORDER_ROW) { - std::swap(blas_param.m, blas_param.n); - std::swap(blas_param.lda, blas_param.ldb); - std::swap(blas_param.trans_a, blas_param.trans_b); - std::swap(blas_param.a_offset, blas_param.b_offset); - std::swap(blas_param.a_stride, blas_param.b_stride); - //std::swap(A_data, B_data); - //std::swap(A_data, B_data); - } + swap(blas_param.m, blas_param.n); + swap(blas_param.lda, blas_param.ldb); + swap(blas_param.trans_a, blas_param.trans_b); + swap(blas_param.a_offset, blas_param.b_offset); + swap(blas_param.a_stride, blas_param.b_stride); + //swap(A_data, B_data); + //swap(A_data, B_data); + } + + printDsaXBLASParam(blas_param); + + flops = 8; // 8 
complex flops per element + flops *= blas_param.m; + flops *= blas_param.n; + flops *= blas_param.k; + flops *= blas_param.batch_count; + + cout << "Correlator flops = 2*M*N*K * batch = (" << 2 << "*"<< blas_param.m << "*" << blas_param.n << "*" << blas_param.k << "*" << blas_param.batch_count << ") = " << flops << endl; + cout << "Correlator Gflop = " << (1e-9)*flops << endl; + + // DMH: reset counters method + } Correlator::~Correlator() { + + // Clean up memory destroyDsaXCorrDeviceMemory(&d); + destroyStreams(); + + // Transfer metrics to + double device_malloc_time = (1.0*d.dev_malloc_timer.elapsed().count())/(1e6); + double host_malloc_time = (1.0*d.host_malloc_timer.elapsed().count())/(1e6); + double device_compute_time = (1.0*d.dev_compute_timer.elapsed().count())/(1e6); + cout << "Correlator malloc time device = " << device_malloc_time << " seconds." << endl; + cout << "Correlator malloc time host = " << host_malloc_time << " seconds." << endl; + cout << "Correlator compute time device = " << device_compute_time << " seconds. " << endl; + + double h2d_time = (1.0*d.H2D_timer.elapsed().count())/(1e6); + cout << "Correlator H2D time = " << h2d_time << " seconds. "; + cout << "Bandwidth " << (1.0*d.H2D_bytes)/pow(1024,3) / h2d_time << " Gbytes/second." << endl; + + double d2h_time = (1.0*d.D2H_timer.elapsed().count())/(1e6); + cout << "Correlator D2H time = " << d2h_time << " seconds. "; + cout << "Bandwidth " << (1.0*d.D2H_bytes)/pow(1024,3) / d2h_time << " Gbytes/second." << endl; + + double h2h_time = (1.0*d.H2H_timer.elapsed().count())/(1e6); + cout << "Correlator H2H time = " << h2h_time << " seconds. "; + cout << "Bandwidth " << (1.0*d.H2H_bytes)/pow(1024,3) / h2h_time << " Gbytes/second." << endl; + + double total = device_malloc_time + host_malloc_time + device_compute_time + h2d_time + d2h_time; + cout << "Correlator TOTAL time = " << total << " seconds. 
" << endl; + + double Tflops = (1.0*d.dev_compute_timer.iterations()*(1e-12*flops)/device_compute_time); + cout << "Correlator Tflops = " << Tflops << endl; } void Correlator::compute(void *output, void *input) { - // zero out output arrays - dsaXmemset(d.d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short - dsaXmemset(d.d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short - dsaXmemset(d.d_output, 0, NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float)); + uint64_t in_stream_block = sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2; + uint64_t out_stream_block = sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2; + + unsigned int n_streams = corr_param.n_streams; - // copy to device - dsaXmemcpy(d.d_input, input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2, dsaXMemcpyHostToDevice); + // Ensure output array is zero + dsaXmemset(d.d_output, 0, n_streams * out_stream_block); - // reorder input into real and imaginary arrays of half - reorderCorrInput(&d); + // Loop over the array in streams for concurrency. 
+ for(int i=0; id_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short dsaXmemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short @@ -128,7 +203,7 @@ void dcorrelator(dmem_corr *d) { dsaXmemcpy(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2, dsaXMemcpyHostToDevice); // reorder input into real and imaginary arrays of 2 byte data - reorderCorrInput(d); + reorderCorrInput(d, 0); dsaXBLASParam blas_param; blas_param.struct_size = sizeof(blas_param); @@ -139,7 +214,7 @@ void dcorrelator(dmem_corr *d) { // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS] #if defined OLD_BLAS - //std::cout << "Old params" << std::endl; + //cout << "Old params" << endl; blas_param.data_order = DSA_BLAS_DATAORDER_COL; blas_param.trans_a = DSA_BLAS_OP_A; @@ -160,7 +235,7 @@ void dcorrelator(dmem_corr *d) { blas_param.b_offset = 0; blas_param.c_offset = 0; #else - //std::cout << "My params" << std::endl; + //cout << "My params" << endl; blas_param.data_order = DSA_BLAS_DATAORDER_ROW; blas_param.trans_a = DSA_BLAS_OP_C; @@ -184,13 +259,13 @@ void dcorrelator(dmem_corr *d) { // Swap A and B if in row order if (blas_param.data_order == DSA_BLAS_DATAORDER_ROW) { - std::swap(blas_param.m, blas_param.n); - std::swap(blas_param.lda, blas_param.ldb); - std::swap(blas_param.trans_a, blas_param.trans_b); - std::swap(blas_param.a_offset, blas_param.b_offset); - std::swap(blas_param.a_stride, blas_param.b_stride); - //std::swap(A_data, B_data); - //std::swap(A_data, B_data); + swap(blas_param.m, blas_param.n); + swap(blas_param.lda, blas_param.ldb); + swap(blas_param.trans_a, blas_param.trans_b); + swap(blas_param.a_offset, blas_param.b_offset); + swap(blas_param.a_stride, blas_param.b_stride); + //swap(A_data, B_data); + //swap(A_data, B_data); } diff --git a/src/dsaX_cublas_interface.cu b/src/dsaX_cublas_interface.cu index 0bffaea..c528546 100644 --- a/src/dsaX_cublas_interface.cu +++ 
b/src/dsaX_cublas_interface.cu @@ -3,23 +3,55 @@ #include "dsaX.h" #include "dsaX_params.h" #include "dsaX_cuda_headers.h" +#include "dsaX_cuda_handles.h" +//#include "dsaX_cuda_kernels.h" // For debug using namespace std; -void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam blas_param) { +__global__ void deviceInspectHalf(half *input, int stage) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + printf("CUBLAS[%d]: device inspect half [%d] = %f\n", stage, x, __half2float(input[x])); +} + +void init_cublas_local() { + if (!cublas_init) { + //cublasError_t error = cudaStreamCreate(streams); + cublasStatus_t error = cublasCreate(&cublasH); + //cublasSetStream(handle, stream); + //cublasStatus_t error = cublasCreate(&handle); + if (error != CUBLAS_STATUS_SUCCESS) + cout << "cublasCreate failed with error " << error << endl; + else + cout << "cublasCreated successfully." << endl; + cublas_init = true; + } +} + +void destroy_cublas_local() { + if(cublas_init) + cublasDestroy(cublasH); + cublas_init = false; +} + +void initBLASCuda() { + init_cublas_local(); +} + +using namespace std; + +void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam blas_param, int stream) { #ifdef DSA_XENGINE_TARGET_CUDA // not sure if essential - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); + + cublasSetStream(cublasH, get_stream(stream)); + + bool verbose = false; // Set up for gemm - cublasHandle_t cublasH = NULL; - cudaStream_t stream = NULL; - cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); - cublasCreate(&cublasH); - cublasSetStream(cublasH, stream); - - // Transfer params + //---------------- + // Transfer params const int m = blas_param.m; const int n = blas_param.n; const int k = blas_param.k; @@ -71,9 +103,9 @@ void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void default: std::cout << 
"Unknown cublas transpose" << std::endl; } - + int B_imag_alpha_sign = alpha; - switch (blas_param.trans_b) { + switch (blas_param.trans_b) { case DSA_BLAS_OP_N: transb = CUBLAS_OP_N; break; @@ -93,7 +125,7 @@ void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void // A array requests conjugation, hence we // must apply supply a factor of -1 to alpha // when dealing with the imaginary component - // of A. + // of B. B_imag_alpha_sign *= -1; break; default: @@ -102,9 +134,11 @@ void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void // Run strided batched gemm for datatype // (a + ib)(c + id) = (ac - bd) + i(bc + ad) - // on matrices alpha * op(A) * op(B) + beta * C + // on matrices C = alpha * op(A) * op(B) + beta * C // where op(M) is defined by the transposition variable // cublasOperation_t transM + + //deviceInspectHalf<<<1, 8>>>((half *)real_a); // Accumulate results into C matrix // ac @@ -114,6 +148,9 @@ void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void (half *)real_b + b_offset, ldb, strideB, &beta0, (half *)real_c + c_offset, ldc, strideC, batchCount); + + if(verbose) deviceInspectHalf<<<1, 8>>>((half *)real_c, 0); + // -bd (minus sign from i*i) half alpha_bd = alpha * (-1.0 * A_imag_alpha_sign * B_imag_alpha_sign); cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &(alpha_bd), @@ -121,6 +158,9 @@ void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void (half*)imag_b + b_offset, ldb, strideB, &beta1, (half*)real_c + c_offset, ldc, strideC, batchCount); + + if(verbose) deviceInspectHalf<<<1, 8>>>((half *)real_c, 1); + // bc half alpha_bc = alpha * A_imag_alpha_sign; cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &(alpha_bc), @@ -128,6 +168,9 @@ void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void (half*)real_b + b_offset, ldb, strideB, &beta0, (half*)imag_c + c_offset, ldc, strideC, batchCount); + + if(verbose) 
deviceInspectHalf<<<1, 8>>>((half *)imag_c, 2); + // ad half alpha_ad = alpha * B_imag_alpha_sign; cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &(alpha_ad), @@ -135,13 +178,11 @@ void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void (half*)imag_b + b_offset, ldb, strideB, &beta1, (half*)imag_c + c_offset, ldc, strideC, batchCount); + + if(verbose) deviceInspectHalf<<<1, 8>>>((half *)imag_c, 3); - // shown to be essential - cudaDeviceSynchronize(); - - // destroy stream - cudaStreamDestroy(stream); - cublasDestroy(cublasH); + // shown to be essential (only with streams, fix me) + //cudaDeviceSynchronize(); #else std::cout "dsaX not built with CUDA target." << std::endl; exit(0); diff --git a/src/dsaX_cuda_interface.cu b/src/dsaX_cuda_interface.cu index ec54675..b8af344 100644 --- a/src/dsaX_cuda_interface.cu +++ b/src/dsaX_cuda_interface.cu @@ -4,43 +4,86 @@ #include "dsaX_cuda_headers.h" #include "dsaX_cuda_interface.h" #include "dsaX_cuda_kernels.h" +#include "dsaX_cuda_handles.h" using namespace std; +// DMH: Everything in this file is CUDA aware. + +__global__ void deviceInspectHalfCI(half *input, int stage) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + printf("CUDA_INTERFACE[%d]: device inspect half [%d] = %f\n", stage, x, __half2float(input[x])); +} + +__global__ void deviceInspectFloatCI(float *input, int stage) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + printf("CUDA_INTERFACE[%d]: device inspect float [%d] = %f\n", stage, x, input[x]); +} + void dsaXInitCuda(int dev){ - cudaSetDevice(dev); + if(dev >= 0) cudaSetDevice(dev); + else { + cout << "dsaX Error: invalid device ordinal " << dev << " passed to dsaX." 
<< endl; + exit(0); + } +} + +void initStreamsCuda(unsigned int n_streams){ + init_streams(n_streams); +} + +void destroyStreamsCuda(){ + destroy_streams(); +} + +void dsaXDestroyCuda(int dev){ + // +} + +void *dsaXHostRegisterCuda(size_t size) { + + void *ptr = malloc(size); + cudaError_t err = cudaHostRegister(ptr, size, cudaHostRegisterDefault); + if (err != cudaSuccess) { + cout << "dsaX Error: Failed to register pinned memory of size " << size << endl; + exit(0); + } + return ptr; } // allocate device memory -void initializeCorrCudaMemory(dmem_corr *d) { - +void initializeCorrCudaMemory(corr_handle *d, unsigned int n_streams) { + // for correlator - cudaMalloc((void **)(&d->d_input), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); - cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2); - cudaMalloc((void **)(&d->d_i), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2); - cudaMalloc((void **)(&d->d_tx), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2); - cudaMalloc((void **)(&d->d_output), sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2); - cudaMalloc((void **)(&d->d_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); - cudaMalloc((void **)(&d->d_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); - cudaMalloc((void **)(&d->d_tx_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); - cudaMalloc((void **)(&d->d_tx_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac); + cudaMalloc((void **)(&d->d_input), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams); + cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams); + cudaMalloc((void **)(&d->d_i), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams); + cudaMalloc((void **)(&d->d_tx), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams); + cudaMalloc((void **)(&d->d_output), 
sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2*n_streams); + cudaMalloc((void **)(&d->d_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams); + cudaMalloc((void **)(&d->d_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams); + cudaMalloc((void **)(&d->d_tx_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams); + cudaMalloc((void **)(&d->d_tx_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams); + + // DMH: fix me + cudaMalloc((void **)(&d->d_idxs), sizeof(int)*NBASE); } -void initializeBFCudaMemory(dmem_bf *d) { +void initializeBFCudaMemory(bf_handle *d, int n_streams) { // for beamformer - cudaMalloc((void **)(&d->d_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2); - cudaMalloc((void **)(&d->d_big_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS)*NCHAN_PER_PACKET*2*2); - cudaMalloc((void **)(&d->d_tx), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2); - cudaMalloc((void **)(&d->d_br), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2); - cudaMalloc((void **)(&d->d_bi), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2); - cudaMalloc((void **)(&d->weights_r), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); - cudaMalloc((void **)(&d->weights_i), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); - cudaMalloc((void **)(&d->d_bigbeam_r), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)); - cudaMalloc((void **)(&d->d_bigbeam_i), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)); - cudaMalloc((void **)(&d->d_bigpower), sizeof(unsigned char)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS)); - cudaMalloc((void **)(&d->d_scf), sizeof(float)*(NBEAMS/2)); // beam scale factor - cudaMalloc((void **)(&d->d_chscf), sizeof(float)*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); // beam scale factor + cudaMalloc((void **)(&d->d_input), 
sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2*n_streams); + cudaMalloc((void **)(&d->d_big_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS)*NCHAN_PER_PACKET*2*2*n_streams); + cudaMalloc((void **)(&d->d_tx), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2*n_streams); + cudaMalloc((void **)(&d->d_br), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2*n_streams); + cudaMalloc((void **)(&d->d_bi), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2*n_streams); + cudaMalloc((void **)(&d->weights_r), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8)*n_streams); + cudaMalloc((void **)(&d->weights_i), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8)*n_streams); + cudaMalloc((void **)(&d->d_bigbeam_r), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*n_streams); + cudaMalloc((void **)(&d->d_bigbeam_i), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*n_streams); + cudaMalloc((void **)(&d->d_bigpower), sizeof(unsigned char)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS)*n_streams); + cudaMalloc((void **)(&d->d_scf), sizeof(float)*(NBEAMS/2)*n_streams); // beam scale factor + cudaMalloc((void **)(&d->d_chscf), sizeof(float)*(NBEAMS/2)*(NCHAN_PER_PACKET/8)*n_streams); // beam scale factor // input weights: first is [NANTS, E/N], then [NANTS, 48, 2pol, R/I] d->h_winp = (float *)malloc(sizeof(float)*(NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2)); @@ -56,7 +99,7 @@ void initializeBFCudaMemory(dmem_bf *d) { } // deallocate device memory -void deallocateCorrCudaMemory(dmem_corr *d) { +void deallocateCorrCudaMemory(corr_handle *d) { cudaFree(d->d_input); cudaFree(d->d_r); @@ -67,10 +110,11 @@ void deallocateCorrCudaMemory(dmem_corr *d) { cudaFree(d->d_outi); cudaFree(d->d_tx_outr); cudaFree(d->d_tx_outi); + cudaFree(d->d_idxs); } // deallocate device memory -void deallocateBFCudaMemory(dmem_bf *d) { +void deallocateBFCudaMemory(bf_handle *d) { 
cudaFree(d->d_input); cudaFree(d->d_tx); @@ -89,89 +133,52 @@ void deallocateBFCudaMemory(dmem_bf *d) { free(d->h_freqs); } +void computeIndicesCuda(corr_handle *d) { + + // now run kernel to sum into output + int *h_idxs = (int *)malloc(sizeof(int)*NBASE); + int ii = 0; + // upper triangular order (column major) to match xGPU (not the same as CASA!) + for (int i=0; id_idxs, h_idxs, sizeof(int)*NBASE, cudaMemcpyHostToDevice); + free(h_idxs); +} + // function to copy d_outr and d_outi to d_output // inputs are [NCHAN_PER_PACKET, 2 time, 2 pol, NANTS, NANTS] // the corr matrices are column major order // output needs to be [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex] // start with transpose to get [NANTS*NANTS, NCHAN_PER_PACKET*2*2], then sum into output using kernel -void reorderCorrOutputCuda(dmem_corr * d) { +void reorderCorrOutputCuda(corr_handle *d, int stream) { + + cudaStream_t str = get_stream(stream); + + uint64_t input_offset = sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2 * stream; + uint64_t output_offset = sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2 * stream; // transpose input data #if defined (OLD_BLAS) dim3 dimBlock(32, 8), dimGrid((NANTS*NANTS)/32, (NCHAN_PER_PACKET*2*2*halfFac)/32); - transpose_matrix<<>>((half*)d->d_outr, (half*)d->d_tx_outr); - transpose_matrix<<>>((half*)d->d_outi, (half*)d->d_tx_outi); -#endif - // look at output - /*char * odata = (char *)malloc(sizeof(char)*384*4*NANTS*NANTS*2*halfFac); - cudaMemcpy(odata,d->d_tx_outr,384*4*NANTS*NANTS*2*halfFac,cudaMemcpyDeviceToHost); - FILE *fout; - fout=fopen("test2.test","wb"); - fwrite(odata,sizeof(char),384*4*NANTS*NANTS*2*halfFac,fout); - fclose(fout);*/ - + transpose_matrix_float<<>>((half*)d->d_outr, (half*)d->d_tx_outr); + transpose_matrix_float<<>>((half*)d->d_outi, (half*)d->d_tx_outi); +#endif - /* - // set up for geam - cublasHandle_t cublasH = NULL; - cudaStream_t stream = NULL; - cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); - cublasSetStream(cublasH, 
stream); - - // transpose output matrices into tx_outr and tx_outi - cublasOperation_t transa = CUBLAS_OP_T; - cublasOperation_t transb = CUBLAS_OP_N; - const int m = NCHAN_PER_PACKET*2*2; - const int n = NANTS*NANTS/16; // columns in output - const double alpha = 1.0; - const double beta = 0.0; - const int lda = n; - const int ldb = m; - const int ldc = ldb; - cublasDgeam(cublasH,transa,transb,m,n, - &alpha,(double *)(d->d_outr), - lda,&beta,(double *)(d->d_tx_outr), - ldb,(double *)(d->d_tx_outr),ldc); - cublasDgeam(cublasH,transa,transb,m,n, - &alpha,(double *)(d->d_outi), - lda,&beta,(double *)(d->d_tx_outi), - ldb,(double *)(d->d_tx_outi),ldc); - */ - // now run kernel to sum into output - int * h_idxs = (int *)malloc(sizeof(int)*NBASE); - int * d_idxs; - cudaMalloc((void **)(&d_idxs), sizeof(int)*NBASE); - int ii = 0; - // upper triangular order (column major) to match xGPU (not the same as CASA!) - for (int i=0;i>>((half*)d->d_tx_outr, (half*)d->d_tx_outi, d->d_output, d_idxs); + corr_output_copy<<>>((half*)d->d_tx_outr, (half*)d->d_tx_outi, d->d_output, (int*)d->d_idxs); #else - corr_output_copy<<>>((half*)d->d_outr, (half*)d->d_outi, d->d_output, d_idxs); -#endif - - /*char * odata = (char *)malloc(sizeof(char)*384*4*NBASE*4); - cudaMemcpy(odata,d->d_output,384*4*NBASE*4,cudaMemcpyDeviceToHost); - FILE *fout; - fout=fopen("test3.test","wb"); - fwrite(odata,sizeof(char),384*4*NBASE*4,fout); - fclose(fout);*/ - - cudaFree(d_idxs); - free(h_idxs); - //cudaStreamDestroy(stream); + corr_output_copy<<>>((half*)d->d_outr + input_offset, (half*)d->d_outi + input_offset, d->d_output + output_offset, (int*)d->d_idxs); +#endif + //deviceInspectHalfCI<<<1,8>>>((half*)d->d_outi, 0); } @@ -182,22 +189,43 @@ void reorderCorrOutputCuda(dmem_corr * d) { // output is [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS] // starts by running transpose on [NPACKETS_PER_BLOCK * NANTS, NCHAN_PER_PACKET * 2 * 2] matrix in doubleComplex form. 
// then fluffs using simple kernel -void reorderCorrInputCuda(dmem_corr *d) { +void reorderCorrInputCuda(corr_handle *d, int stream) { + + // DMH: globalise me + int offset = sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2 * stream; + + cudaStream_t str = get_stream(stream); + + // TUNABLE + int blockDim = 128; + int blocks = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/blockDim; // transpose input data #if defined (OLD_BLAS) dim3 dimBlock(32, 32), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32); - // TUNABLE - int blockDim = 128; - int blocks = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/blockDim; - transpose_matrix_char<<>>(d->d_input, d->d_tx); - promoteComplexCharToPlanarHalf<<>>(d->d_tx, (half*)d->d_r, (half*)d->d_i); + transpose_matrix_char<<>>((char*)d->d_input + offset, (char*)d->d_tx + offset); + + // DMH: These two can run concurrently + promoteComplexCharToPlanarHalf<<>>((char*)d->d_tx + offset, (half*)d->d_r + offset, (half*)d->d_i + offset); #else - promoteComplexCharToPlanarHalf<<>>(d->d_input, (half*)d->d_r, (half*)d->d_i); + promoteComplexCharToPlanarHalf<<>>((char*)d->d_input + offset, (half*)d->d_r + offset, (half*)d->d_i + offset); #endif } +void promoteComplexCharToPlanarHalfCuda(corr_handle *d, unsigned int stream) { + + // DMH: globalise me + int offset = sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2 * stream; + + cudaStream_t str = get_stream(stream); + + // TUNABLE + int blockDim = 128; + int blocks = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/blockDim; + + promoteComplexCharToPlanarHalf<<>>((char*)d->d_input + offset, (half*)d->d_r + offset, (half*)d->d_i + offset); +} // kernels to reorder and fluff input data for beamformer // initial data is [NPACKETS_PER_BLOCK, (NANTS/2), NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] @@ -223,7 +251,7 @@ void transposeInputBeamformerCuda(double *idata, double *odata, std::vector // sequential pairs of eastings and northings // then [NANTS, 48, R/I] calibs 
-void calcWeightsCuda(dmem_bf *d) { +void calcWeightsCuda(bf_handle *d) { // allocate float *antpos_e = (float *)malloc(sizeof(float)*NANTS); @@ -323,21 +351,36 @@ void dsaXDeviceSynchronizeCuda() { cudaDeviceSynchronize(); } -void dsaXmemcpyCuda(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind){ +void dsaXmemcpyCuda(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind, int stream){ + cudaError error = cudaSuccess; + cudaStream_t str = get_stream(stream); + switch(kind) { case dsaXMemcpyHostToHost: error = cudaMemcpy(array_out, array_in, n, cudaMemcpyHostToHost); break; case dsaXMemcpyHostToDevice: - error = cudaMemcpy(array_out, array_in, n, cudaMemcpyHostToDevice); - break; + error = cudaMemcpy(array_out, array_in, n, cudaMemcpyHostToDevice); + break; case dsaXMemcpyDeviceToHost: error = cudaMemcpy(array_out, array_in, n, cudaMemcpyDeviceToHost); break; case dsaXMemcpyDeviceToDevice: error = cudaMemcpy(array_out, array_in, n, cudaMemcpyDeviceToDevice); break; + case dsaXMemcpyHostToHostAsync: + error = cudaMemcpyAsync(array_out, array_in, n, cudaMemcpyHostToHost, str); + break; + case dsaXMemcpyHostToDeviceAsync: + error = cudaMemcpyAsync(array_out, array_in, n, cudaMemcpyHostToDevice, str); + break; + case dsaXMemcpyDeviceToHostAsync: + error = cudaMemcpyAsync(array_out, array_in, n, cudaMemcpyDeviceToHost, str); + break; + case dsaXMemcpyDeviceToDeviceAsync: + error = cudaMemcpyAsync(array_out, array_in, n, cudaMemcpyDeviceToDevice, str); + break; default: std::cout << "dsaX error: unknown dsaXMemcpyKind" << std::endl; } diff --git a/src/dsaX_interface.cpp b/src/dsaX_interface.cpp index 6358df1..e0f294a 100644 --- a/src/dsaX_interface.cpp +++ b/src/dsaX_interface.cpp @@ -10,82 +10,131 @@ using namespace std; +using ms = std::chrono::microseconds; +using hrc = std::chrono::high_resolution_clock; + +timer::Timer app_timer; +timer::Timer init_timer; void dsaXInit(int dev){ + app_timer.start(); #if DSA_XENGINE_TARGET_CUDA + 
init_timer.start(); dsaXInitCuda(dev); + initBLAS(); + init_timer.stop(); #endif - - std::cout << " --- Starting dsaX with configuration (defined in dsaX_def.h) --- " << endl; - std::cout << "NPACKETS_PER_BLOCK = " << NPACKETS_PER_BLOCK << std::endl; - std::cout << "NCHAN = " << NCHAN << std::endl; - std::cout << "NCHAN_PER_PACKET = " << NCHAN_PER_PACKET << std::endl; - std::cout << "NPOL = " << NPOL << std::endl; - std::cout << "NARM = " << 3 << std::endl; - std::cout << " --- End dsaX configuration --- " << endl; + cout << " --- Starting dsaX with configuration (defined in dsaX_def.h) --- " << endl; + cout << "NPACKETS_PER_BLOCK = " << NPACKETS_PER_BLOCK << endl; + cout << "NCHAN = " << NCHAN << endl; + cout << "NCHAN_PER_PACKET = " << NCHAN_PER_PACKET << endl; + cout << "NPOL = " << NPOL << endl; + cout << "NARM = " << 2 << endl; +#if DSA_XENGINE_TARGET_CUDA + cout << "CUDA is ENABLED " << endl; +#else + cout << "CUDA is DISABLED " << endl; +#endif + cout << " --- End dsaX configuration --- " << endl; //DMH: Add more (ask Vikram) } void dsaXEnd() { + app_timer.stop(); // output metrics + cout << "dsaX lifetime = " << (1.0*app_timer.elapsed().count())/(1e6) << endl; + cout << "dsaX init = " << (1.0*init_timer.elapsed().count())/(1e6) << endl; +} + +void *dsaXHostRegister(size_t size) { +#if DSA_XENGINE_TARGET_CUDA + return dsaXHostRegisterCuda(size); +#endif } void inspectPackedData(char input, int i, bool non_zeros) { float re = (float)((char)(( (unsigned char)(input) & (unsigned char)(15) ) << 4) >> 4); float im = (float)((char)(( (unsigned char)(input) & (unsigned char)(240))) >> 4); - + if(non_zeros) { if(re != 0 || im != 0) - std::cout << "val["< &dimBlock, std::vector &dimGrid) { +void transposeInputBeamformer(double *input, double *output, vector &dimBlock, vector &dimGrid) { #if DSA_XENGINE_TARGET_CUDA transposeInputBeamformerCuda(input, output, dimBlock, dimGrid); #else - std::cout << "dsaX error: not implemented" << std::endl; + cout << "dsaX error: not 
implemented" << endl; #endif } -void transposeScaleBeamformer(void *real, void *imag, unsigned char *output, std::vector &dimBlock, std::vector &dimGrid) { +void transposeScaleBeamformer(void *real, void *imag, unsigned char *output, vector &dimBlock, vector &dimGrid) { #if DSA_XENGINE_TARGET_CUDA transposeScaleBeamformerCuda(real, imag, output, dimBlock, dimGrid); #else - std::cout << "dsaX error: not implemented" << std::endl; + cout << "dsaX error: not implemented" << endl; #endif } @@ -93,7 +142,7 @@ void fluffInputBeamformer(char *input, void *array_real, void *array_imag, int b #if DSA_XENGINE_TARGET_CUDA fluffInputBeamformerCuda(input, array_real, array_imag, blocks, tpb); #else - std::cout << "dsaX error: not implemented" << std::endl; + cout << "dsaX error: not implemented" << endl; #endif } @@ -101,6 +150,6 @@ void sumBeam(unsigned char *input, float *output, int blocks, int tpb) { #if DSA_XENGINE_TARGET_CUDA sumBeamCuda(input, output, blocks, tpb); #else - std::cout << "dsaX error: not implemented" << std::endl; + cout << "dsaX error: not implemented" << endl; #endif } diff --git a/src/dsaX_params.cpp b/src/dsaX_params.cpp index 7ed0d5b..4179848 100644 --- a/src/dsaX_params.cpp +++ b/src/dsaX_params.cpp @@ -56,11 +56,13 @@ const char *getBLASDataOrderString(dsaXBLASDataOrder order) void printDsaXCorrParam(const dsaXCorrParam param) { - cout << "--- dsaXCorrParam begin ---" << endl; + cout << " --- dsaXCorrParam begin ---" << endl; cout << "struct_size = " << param.struct_size << endl; - cout << "blas_lib = " << getBLASLibString(param.blas_lib) << endl; - cout << "data_type = " << getBLASDataTypeString(param.data_type) << endl; - cout << "data_order = " << getBLASDataOrderString(param.data_order) << endl; + cout << "blas_lib = " << getBLASLibString(param.blas_lib) << endl; + cout << "data_type = " << getBLASDataTypeString(param.data_type) << endl; + cout << "data_order = " << getBLASDataOrderString(param.data_order) << endl; + cout << "n_streams = " << 
param.n_streams << endl; + cout << " --- dsaXCorrParam end ---" << endl; } @@ -69,9 +71,9 @@ void printDsaXBLASParam(const dsaXBLASParam param) { cout << " --- dsaXBLASParam begin ---" << endl; cout << "struct_size = " << param.struct_size << endl; cout << "blas_type = " << param.blas_type << endl; - cout << "blas_lib = " << param.blas_lib << endl; - cout << "data_type = " << param.data_type << endl; - cout << "data_order = " << param.data_order << endl; + cout << "blas_lib = " << getBLASLibString(param.blas_lib) << endl; + cout << "data_type = " << getBLASDataTypeString(param.data_type) << endl; + cout << "data_order = " << getBLASDataOrderString(param.data_order) << endl; cout << "trans_a = " << param.trans_a << endl; cout << "trans_b = " << param.trans_b << endl; cout << "m = " << param.m << endl; diff --git a/src/dsaX_utils.cpp b/src/dsaX_utils.cpp index 3819e98..d29e291 100644 --- a/src/dsaX_utils.cpp +++ b/src/dsaX_utils.cpp @@ -15,10 +15,11 @@ void dsaXmemset(void *array, int ch, size_t n){ #endif } -void dsaXmemcpy(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind){ +void dsaXmemcpy(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind, int stream){ + #ifdef DSA_XENGINE_TARGET_CUDA // Perform host to device memcopy on data - dsaXmemcpyCuda(array_out, array_in, n, kind); + dsaXmemcpyCuda(array_out, array_in, n, kind, stream); #else memcpy(array_out, array_in, n); #endif @@ -33,19 +34,25 @@ void dsaXDeviceSynchronize() { #endif } -void initDsaXCorrDeviceMemory(dmem_corr *d) { +void initDsaXCorrDeviceMemory(corr_handle *d, unsigned int n_streams) { + #ifdef DSA_XENGINE_TARGET_CUDA - initializeCorrCudaMemory(d); + d->dev_malloc_timer.start(); + initializeCorrCudaMemory(d, n_streams); + d->dev_malloc_timer.stop(); #else cout << "dsaX Error: Not implemented." 
<< endl; exit(0); #endif } -void destroyDsaXCorrDeviceMemory(dmem_corr *d) { +void destroyDsaXCorrDeviceMemory(corr_handle *d) { + #ifdef DSA_XENGINE_TARGET_CUDA + d->dev_malloc_timer.start(); deallocateCorrCudaMemory(d); -#else + d->dev_malloc_timer.stop(); +#else cout << "dsaX Error: Not implemented." << endl; exit(0); #endif diff --git a/tests/command_line_params.cpp b/tests/command_line_params.cpp index 69331b3..746b4cc 100644 --- a/tests/command_line_params.cpp +++ b/tests/command_line_params.cpp @@ -5,21 +5,27 @@ int core = 0; bool debug = false; // Data block HDU keys -key_t in_key = REORDER_BLOCK_KEY; -key_t out_key = XGPU_BLOCK_KEY; +key_t in_key = 0x0000eada; // REORDER_BLOCK_KEY in dsaX_def.h +key_t out_key = 0x0000fada; // XGPU_BLOCK_KEY in dsaX_def.h -// Test mode +// Test params bool run_beamformer = false; bool run_correlator = false; -double start_frequency = 1498.75; +bool input_rands = false; +bool write_output = false; +int test_iter = 1; +int n_streams = 10; -// Test file +// Test files std::string input_filename = "input.dat"; std::string output_filename = "output.dat"; + +// DSA hardware configuration int n_channels = 384; int n_antennae = 63; int n_pol = 2; int n_times = 30720; +double start_frequency = 1498.75; std::shared_ptr make_app(std::string app_description, std::string app_name) { @@ -32,15 +38,19 @@ std::shared_ptr make_app(std::string app_description, std::string app_n dsaX_app->add_option("--out-key", out_key, "[default XGPU_BLOCK_KEY]"); dsaX_app->add_option("--run-beamformer", run_beamformer, "Run the beamformer [default false]"); dsaX_app->add_option("--run-correlator", run_correlator, "Run the correlator [default false]"); - dsaX_app->add_option("--start-frequency", start_frequency, "start frequency (assumes 1498.75)"); - + dsaX_app->add_option("--test-iter", test_iter, "Run the test 'test_iter' times [default 1]"); + dsaX_app->add_option("--write-output", write_output, "Write output to disk [default true]"); + 
dsaX_app->add_option("--n-streams", n_streams, "The number of device streams [default 10]"); + // Input file options + dsaX_app->add_option("--input-rands", input_rands, "Generate random input (default false)"); dsaX_app->add_option("--input-filename", input_filename, "Name of file on which to run tests"); + dsaX_app->add_option("--output-filename", output_filename, "Name of file on which to write results"); dsaX_app->add_option("--n-channels", n_channels, "Number of frequency channels [default 384]"); dsaX_app->add_option("--n-antennae", n_antennae, "Number of antennae [default 63]"); dsaX_app->add_option("--n-pol", n_pol, "Number of polarizations [default 2]"); dsaX_app->add_option("--n-times", n_times, "Number of times [default 30720]"); - + dsaX_app->add_option("--start-frequency", start_frequency, "start frequency (assumes 1498.75)"); return dsaX_app; } diff --git a/tests/command_line_params.h b/tests/command_line_params.h index 06e67ac..fb9bd1a 100644 --- a/tests/command_line_params.h +++ b/tests/command_line_params.h @@ -1,7 +1,6 @@ #pragma once #include -#include class dsaXApp : public CLI::App { @@ -24,12 +23,16 @@ extern key_t out_key; // Test mode extern bool run_beamformer; extern bool run_correlator; -extern double start_frequency; +extern bool input_rands; +extern bool write_output; +extern int test_iter; +extern int n_streams; -// Test file +// DSA hardware configureation extern std::string input_filename; extern std::string output_filename; extern int n_channels; extern int n_antennae; extern int n_pol; extern int n_times; +extern double start_frequency; diff --git a/tests/dsaX_correlator_test.cpp b/tests/dsaX_correlator_test.cpp index 2ce7390..dfb58f0 100644 --- a/tests/dsaX_correlator_test.cpp +++ b/tests/dsaX_correlator_test.cpp @@ -6,6 +6,7 @@ #include #include #include +#include // Include this file to access input parameters #include "command_line_params.h" @@ -93,32 +94,66 @@ int main(int argc, char **argv) { return app->exit(e); } - int 
device_ordinal = 0; + int device_ordinal = 0; + int packet_size = 4608; + + // Create a data array for a single call to the correlator class FILE *fin, *fout; - uint64_t sz, output_size, in_block_size, rd_size; + uint64_t sz, in_block_size, rd_size; in_block_size = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2; - int nreps = 1, nchunks = 1; + + std::cout << "Creating char file_array of size " << (1.0*sizeof(char)*in_block_size)/pow(1024,2) << " MB." << std::endl; + char *file_data = (char *)malloc(in_block_size); // read one block of input data // get size of file - std::cout << "attempting to read file " << input_filename.c_str() << std::endl; - fin=fopen(input_filename.c_str(), "rb"); - fseek(fin, 0L, SEEK_END); - sz = ftell(fin); - rewind(fin); - - // figure out how many reps and chunks to read with - if (sz > in_block_size) { - nreps = (int)(sz/in_block_size); - rd_size = in_block_size; - } - else { - nchunks = (int)(in_block_size/sz); - rd_size = sz; - } + if(!input_rands) { + std::cout << "attempting to read file " << input_filename.c_str() << std::endl; + fin = fopen(input_filename.c_str(), "rb"); + fseek(fin, 0L, SEEK_END); + sz = ftell(fin); + if(sz != packet_size) { + cout << "Error: packet size " << packet_size << " and file size " << sz << " are unequal." 
<< endl; + exit(0); + } + rewind(fin); + + // figure out how many reps and chunks to read with + int nreps, nchunks; + if (sz > in_block_size) { + nreps = (int)(sz/in_block_size); + rd_size = in_block_size; + } + else { + nchunks = (int)(in_block_size/sz); + rd_size = sz; + } + + cout << "Packet size = " << sz << endl; + cout << "rd size = " << rd_size << endl; + for (int reps = 0; reps dis; + for (int i = 0; i < n_rand; i++) input_rand[i] = dis(gen); + //for (int i = 0; i < n_rand; i++) input_rand[i] = (uint64_t)1234; + memcpy(file_data, (void*)input_rand, n_rand); + free(input_rand); + } + // Start dsaX program //--------------------------------------- + timer::Timer test_timer; + dsaXInit(device_ordinal); // Create Correlator class instance. @@ -126,46 +161,49 @@ int main(int argc, char **argv) { param.blas_lib = DSA_BLAS_LIB_CUBLAS; param.data_type = DSA_BLAS_DATATYPE_4b_COMPLEX; param.data_order = DSA_BLAS_DATAORDER_ROW; + param.n_streams = n_streams; printDsaXCorrParam(param); + auto correlator = new Correlator(¶m); - output_size = NBASE*NCHAN_PER_PACKET*2*2*4; - std::cout << "Creating char output_array of size " << (1.0*sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2)/pow(1024,2) << " MB." << std::endl; - char *output_data = (char *)malloc(output_size); + // Create GPU registered memory if using CUDA + uint64_t input_size = n_streams*sizeof(char)*in_block_size; + std::cout << "Creating char input array of size " << input_size << " bytes." << std::endl; + void *input_data = dsaXHostRegister(input_size); + // Populate with random data. Each stream has the same data + // To ensure the concurrency does not pollute accross streams. 
+ for (int i = 0; i0) rewind(fin); - fread(input_data + chunks*rd_size, rd_size, 1, fin); - - //std::cout << "Input peek " << std::endl; - //for (int i=0; i<8; i++) inspectPackedData(input_data[i], i); - - // run correlator and record output data - //dsaXCorrelator((void*)output_data, (void*)input_data, ¶m); - correlator->compute((void*)output_data, (void*)input_data); - - //std::cout << "Output peek " << std::endl; - //for(int i=0; icompute(output_data, input_data); + test_timer.stop(); + + //std::cout << "Output peek " << std::endl; + float *p = (float*)output_data; + for(int i=0; i<8; i++) cout << "output[" << i << "] = " << p[i] << endl; + + if(write_output) { + fout = fopen(output_filename.c_str(),"ab"); + fwrite((unsigned char *)output_data, sizeof(unsigned char *), sizeof(float)*output_size, fout); + fclose(fout); } - + + delete correlator; dsaXEnd(); + + std::cout << "Test time = " << (1.0*test_timer.elapsed().count())/(1e6) << " seconds. " << endl; + // End dsaX program //--------------------------------------- @@ -174,8 +212,7 @@ int main(int argc, char **argv) { free(output_data); return 0; - /* - + /* // Read data BinaryFileVector binaryFileVector; From 48d6b6d3171525261dfd878cc4aaad3ed52e005d Mon Sep 17 00:00:00 2001 From: cpviolator Date: Thu, 4 Jul 2024 21:40:40 -0700 Subject: [PATCH 28/30] Rename files for convenient auto complete in CL --- include/CMakeLists.txt | 10 +- ...dsaX_blas_interface.h => blas_interface.h} | 2 +- ..._cublas_interface.h => cublas_interface.h} | 0 include/cuda_handles.h | 20 ++ .../{dsaX_cuda_headers.h => cuda_headers.h} | 0 ...dsaX_cuda_interface.h => cuda_interface.h} | 2 +- .../{dsaX_cuda_kernels.h => cuda_kernels.h} | 2 +- ...utlass_interface.h => cutlass_interface.h} | 0 include/dsaX.h | 6 +- include/dsaX_beamformer_correlator.h | 9 - include/{dsaX_enums.h => enums.h} | 0 include/{dsaX_ftd.h => fast_time_domain.h} | 4 +- include/{dsaX_interface.h => interface.h} | 0 .../{dsaX_magma_headers.h => magma_headers.h} | 0 
...aX_magma_interface.h => magma_interface.h} | 0 include/{dsaX_params.h => params.h} | 2 +- .../{dsaX_psrdada_utils.h => psrdada_utils.h} | 2 +- include/{dsaX_utils.h => utils.h} | 2 +- src/CMakeLists.txt | 37 ++-- src/{dsaX_beamformer.cpp => beamformer.cpp} | 6 +- src/{dsaX_correlator.cpp => correlator.cpp} | 8 +- ...ublas_interface.cu => cublas_interface.cu} | 6 +- src/cuda_handles.cu | 64 +++++++ ...aX_cuda_interface.cu => cuda_interface.cu} | 8 +- ...lass_interface.cu => cutlass_interface.cu} | 0 src/dsaX_blas_interface.cpp | 28 --- src/{dsaX_interface.cpp => interface.cpp} | 8 +- ..._magma_interface.cu => magma_interface.cu} | 6 +- src/{dsaX_params.cpp => params.cpp} | 2 +- ...aX_psrdada_utils.cpp => psrdada_utils.cpp} | 2 +- src/{dsaX_utils.cpp => utils.cpp} | 8 +- tests/CMakeLists.txt | 4 +- tests/command_line_params.cpp | 4 +- ...orrelator_test.cpp => correlator_test.cpp} | 174 ++++++++++++++---- 34 files changed, 284 insertions(+), 142 deletions(-) rename include/{dsaX_blas_interface.h => blas_interface.h} (85%) rename include/{dsaX_cublas_interface.h => cublas_interface.h} (100%) create mode 100644 include/cuda_handles.h rename include/{dsaX_cuda_headers.h => cuda_headers.h} (100%) rename include/{dsaX_cuda_interface.h => cuda_interface.h} (98%) rename include/{dsaX_cuda_kernels.h => cuda_kernels.h} (99%) rename include/{dsaX_cutlass_interface.h => cutlass_interface.h} (100%) delete mode 100644 include/dsaX_beamformer_correlator.h rename include/{dsaX_enums.h => enums.h} (100%) rename include/{dsaX_ftd.h => fast_time_domain.h} (98%) rename include/{dsaX_interface.h => interface.h} (100%) rename include/{dsaX_magma_headers.h => magma_headers.h} (100%) rename include/{dsaX_magma_interface.h => magma_interface.h} (100%) rename include/{dsaX_params.h => params.h} (99%) rename include/{dsaX_psrdada_utils.h => psrdada_utils.h} (93%) rename include/{dsaX_utils.h => utils.h} (89%) rename src/{dsaX_beamformer.cpp => beamformer.cpp} (98%) rename 
src/{dsaX_correlator.cpp => correlator.cpp} (98%) rename src/{dsaX_cublas_interface.cu => cublas_interface.cu} (98%) create mode 100644 src/cuda_handles.cu rename src/{dsaX_cuda_interface.cu => cuda_interface.cu} (99%) rename src/{dsaX_cutlass_interface.cu => cutlass_interface.cu} (100%) delete mode 100644 src/dsaX_blas_interface.cpp rename src/{dsaX_interface.cpp => interface.cpp} (97%) rename src/{dsaX_magma_interface.cu => magma_interface.cu} (86%) rename src/{dsaX_params.cpp => params.cpp} (99%) rename src/{dsaX_psrdada_utils.cpp => psrdada_utils.cpp} (90%) rename src/{dsaX_utils.cpp => utils.cpp} (91%) rename tests/{dsaX_correlator_test.cpp => correlator_test.cpp} (53%) diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt index 58b1566..65ddb04 100644 --- a/include/CMakeLists.txt +++ b/include/CMakeLists.txt @@ -6,14 +6,14 @@ set(DSA_XENGINE_HEADERS # cmake-format: sortable dsaX.h dsaX_def.h - dsaX_ftd.h - dsaX_cuda_interface.h - dsaX_cuda_handles.h - dsaX_cuda_headers.h + fast_time_domain.h + cuda_interface.h + cuda_handles.h + cuda_headers.h dsaX_capture.h dsaX_capture_manythread.h dsaX_capture_pcap.h - dsaX_cutlass_interface.h + cutlass_interface.h ) install(FILES ${DSA_XENGINE_HEADERS} DESTINATION include) #------------------------------ diff --git a/include/dsaX_blas_interface.h b/include/blas_interface.h similarity index 85% rename from include/dsaX_blas_interface.h rename to include/blas_interface.h index 4c6edaf..d643e08 100644 --- a/include/dsaX_blas_interface.h +++ b/include/blas_interface.h @@ -1,5 +1,5 @@ #pragma once -#include "dsaX_interface.h" +#include "interface.h" void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param, int stream = 0); diff --git a/include/dsaX_cublas_interface.h b/include/cublas_interface.h similarity index 100% rename from include/dsaX_cublas_interface.h rename to include/cublas_interface.h diff --git a/include/cuda_handles.h 
b/include/cuda_handles.h new file mode 100644 index 0000000..eeaf706 --- /dev/null +++ b/include/cuda_handles.h @@ -0,0 +1,20 @@ +#pragma once + +#include + +#include "utils.h" + +#ifdef DSA_XENGINE_TARGET_CUDA +#include "cuda_headers.h" + +static std::vector streams; +static cublasHandle_t cublasH = NULL; + +static bool cublas_init = false; +static bool stream_init = false; + +cudaStream_t get_stream(unsigned int i); +#endif + +void init_streams(unsigned int n_streams); +void destroy_streams(); diff --git a/include/dsaX_cuda_headers.h b/include/cuda_headers.h similarity index 100% rename from include/dsaX_cuda_headers.h rename to include/cuda_headers.h diff --git a/include/dsaX_cuda_interface.h b/include/cuda_interface.h similarity index 98% rename from include/dsaX_cuda_interface.h rename to include/cuda_interface.h index 4ad2aed..6ae59e2 100644 --- a/include/dsaX_cuda_interface.h +++ b/include/cuda_interface.h @@ -3,7 +3,7 @@ #include #include "dsaX_def.h" -#include "dsaX_enums.h" +#include "enums.h" #include "dsaX.h" void dsaXInitCuda(int dev); diff --git a/include/dsaX_cuda_kernels.h b/include/cuda_kernels.h similarity index 99% rename from include/dsaX_cuda_kernels.h rename to include/cuda_kernels.h index 49e9ff0..d57a11b 100644 --- a/include/dsaX_cuda_kernels.h +++ b/include/cuda_kernels.h @@ -1,6 +1,6 @@ #pragma once -#include "dsaX_cuda_headers.h" +#include "cuda_headers.h" __global__ void inspectPackedDataInKernel(char input, int i) { float re = (float)((char)(( (unsigned char)(input) & (unsigned char)(15) ) << 4) >> 4); diff --git a/include/dsaX_cutlass_interface.h b/include/cutlass_interface.h similarity index 100% rename from include/dsaX_cutlass_interface.h rename to include/cutlass_interface.h diff --git a/include/dsaX.h b/include/dsaX.h index eab6f75..8aff8c5 100644 --- a/include/dsaX.h +++ b/include/dsaX.h @@ -3,9 +3,9 @@ // Expose the use to compile time definitions, // enums, parameters, and classes #include "dsaX_def.h" -#include "dsaX_enums.h" 
-#include "dsaX_params.h" -#include "dsaX_ftd.h" +#include "enums.h" +#include "params.h" +#include "fast_time_domain.h" // Use manual transpose route // Uncomment to try new pure cuBLAS diff --git a/include/dsaX_beamformer_correlator.h b/include/dsaX_beamformer_correlator.h deleted file mode 100644 index 7001f4a..0000000 --- a/include/dsaX_beamformer_correlator.h +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once - -// correlator function -// workflow: copy to device, reorder, stridedBatchedGemm, reorder -void dcorrelator(dmem *d); - -// beamformer function -void dbeamformer(dmem * d); - diff --git a/include/dsaX_enums.h b/include/enums.h similarity index 100% rename from include/dsaX_enums.h rename to include/enums.h diff --git a/include/dsaX_ftd.h b/include/fast_time_domain.h similarity index 98% rename from include/dsaX_ftd.h rename to include/fast_time_domain.h index 2f05432..98ce8ff 100644 --- a/include/dsaX_ftd.h +++ b/include/fast_time_domain.h @@ -1,7 +1,7 @@ #pragma once -#include "dsaX_enums.h" -#include "dsaX_params.h" +#include "enums.h" +#include "params.h" #include "timer.h" using ms = std::chrono::microseconds; diff --git a/include/dsaX_interface.h b/include/interface.h similarity index 100% rename from include/dsaX_interface.h rename to include/interface.h diff --git a/include/dsaX_magma_headers.h b/include/magma_headers.h similarity index 100% rename from include/dsaX_magma_headers.h rename to include/magma_headers.h diff --git a/include/dsaX_magma_interface.h b/include/magma_interface.h similarity index 100% rename from include/dsaX_magma_interface.h rename to include/magma_interface.h diff --git a/include/dsaX_params.h b/include/params.h similarity index 99% rename from include/dsaX_params.h rename to include/params.h index 85d2858..08ff440 100644 --- a/include/dsaX_params.h +++ b/include/params.h @@ -2,7 +2,7 @@ #include -#include "dsaX_enums.h" +#include "enums.h" // Structure that carries BLAS parameters // This should be able to communicate to all 
diff --git a/include/dsaX_psrdada_utils.h b/include/psrdada_utils.h similarity index 93% rename from include/dsaX_psrdada_utils.h rename to include/psrdada_utils.h index 2dc3dec..2b60bf3 100644 --- a/include/dsaX_psrdada_utils.h +++ b/include/psrdada_utils.h @@ -9,7 +9,7 @@ #include "dada_affinity.h" #include "ascii_header.h" #include "dsaX_def.h" -#include "dsaX_enums.h" +#include "enums.h" void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); diff --git a/include/dsaX_utils.h b/include/utils.h similarity index 89% rename from include/dsaX_utils.h rename to include/utils.h index fbc30fc..96a7004 100644 --- a/include/dsaX_utils.h +++ b/include/utils.h @@ -1,6 +1,6 @@ #pragma once -#include "dsaX_params.h" +#include "params.h" #include "timer.h" void dsaXmemset(void *array, int ch, size_t n); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d79d89f..67f8543 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -11,18 +11,18 @@ endif() # DSA Fast Time Domain library #----------------------------- set(DSAX_OBJS - dsaX_cuda_interface.cu - dsaX_cublas_interface.cu - dsaX_cuda_handles.cu - dsaX_magma_interface.cu - dsaX_blas_interface.cpp - dsaX_beamformer.cpp + cuda_interface.cu + cublas_interface.cu + cuda_handles.cu + magma_interface.cu + blas_interface.cpp + beamformer.cpp dsaX_base.cpp - dsaX_correlator.cpp - dsaX_interface.cpp - dsaX_utils.cpp - dsaX_params.cpp - dsaX_psrdada_utils.cpp + correlator.cpp + interface.cpp + utils.cpp + params.cpp + psrdada_utils.cpp ) # split source into cu and cpp files @@ -115,18 +115,3 @@ install(TARGETS lib ) #----------------------------- - -# install step for executables -#----------------------------- -install(TARGETS - # cmake-format: sortable - #dsaX_beamformer_correlator - RUNTIME DESTINATION - bin - ) -#----------------------------- - -if(CUDAToolkit_FOUND) - #add_executable(dsaX_beamformer_correlator_exe dsaX_beamformer_correlator_exe.cu) - #target_link_libraries(dsaX_beamformer_correlator_exe PUBLIC 
dsax ${CUDA_cublas_LIBRARY} ${PSRDada_LIB}) -endif() diff --git a/src/dsaX_beamformer.cpp b/src/beamformer.cpp similarity index 98% rename from src/dsaX_beamformer.cpp rename to src/beamformer.cpp index 2dc5aef..e99a54c 100644 --- a/src/dsaX_beamformer.cpp +++ b/src/beamformer.cpp @@ -11,9 +11,9 @@ Workflow is similar for BF and corr applications #include "dsaX_def.h" #include "dsaX.h" -#include "dsaX_blas_interface.h" -#include "dsaX_utils.h" -#include "dsaX_psrdada_utils.h" +#include "blas_interface.h" +#include "utils.h" +#include "psrdada_utils.h" using namespace std; diff --git a/src/dsaX_correlator.cpp b/src/correlator.cpp similarity index 98% rename from src/dsaX_correlator.cpp rename to src/correlator.cpp index 4c3fe36..e45595d 100644 --- a/src/dsaX_correlator.cpp +++ b/src/correlator.cpp @@ -11,10 +11,10 @@ Workflow is similar for BF and corr applications #include "dsaX_def.h" #include "dsaX.h" -#include "dsaX_ftd.h" -#include "dsaX_blas_interface.h" -#include "dsaX_utils.h" -#include "dsaX_psrdada_utils.h" +#include "fast_time_domain.h" +#include "blas_interface.h" +#include "utils.h" +#include "psrdada_utils.h" using namespace std; diff --git a/src/dsaX_cublas_interface.cu b/src/cublas_interface.cu similarity index 98% rename from src/dsaX_cublas_interface.cu rename to src/cublas_interface.cu index c528546..234e18a 100644 --- a/src/dsaX_cublas_interface.cu +++ b/src/cublas_interface.cu @@ -1,9 +1,9 @@ #include #include "dsaX.h" -#include "dsaX_params.h" -#include "dsaX_cuda_headers.h" -#include "dsaX_cuda_handles.h" +#include "params.h" +#include "cuda_headers.h" +#include "cuda_handles.h" //#include "dsaX_cuda_kernels.h" // For debug using namespace std; diff --git a/src/cuda_handles.cu b/src/cuda_handles.cu new file mode 100644 index 0000000..9b65281 --- /dev/null +++ b/src/cuda_handles.cu @@ -0,0 +1,64 @@ +#include +#include +#include + +using namespace std; + +#ifdef DSA_XENGINE_TARGET_CUDA + +// CUDA stream handler functions 
+//------------------------- +void init_streams(unsigned int n_streams) { + + if(n_streams < 2 || n_streams > 9) { + cout << "dsaX Error: Must have at least 2 and fewer than 9 streams, requested " << n_streams << endl; + exit(0); + } + + if(!stream_init) { + streams.reserve(n_streams); + for (auto &s : streams) cudaStreamCreate(&s); + /* + int greatestPriority; + int leastPriority; + + // Query the device to get its built in priority range + // For CUDA, lower numerical values indicate higher priority + cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority); + for (int i=0; i #include -#include "dsaX_cuda_headers.h" -#include "dsaX_cuda_interface.h" -#include "dsaX_cuda_kernels.h" -#include "dsaX_cuda_handles.h" +#include "cuda_headers.h" +#include "cuda_interface.h" +#include "cuda_kernels.h" +#include "cuda_handles.h" using namespace std; diff --git a/src/dsaX_cutlass_interface.cu b/src/cutlass_interface.cu similarity index 100% rename from src/dsaX_cutlass_interface.cu rename to src/cutlass_interface.cu diff --git a/src/dsaX_blas_interface.cpp b/src/dsaX_blas_interface.cpp deleted file mode 100644 index 04be79b..0000000 --- a/src/dsaX_blas_interface.cpp +++ /dev/null @@ -1,28 +0,0 @@ -#include - -#include "dsaX.h" -#include "dsaX_cublas_interface.h" -#include "dsaX_magma_interface.h" - -void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param, int stream) { - switch (param.blas_lib) { - case DSA_BLAS_LIB_CUBLAS: - dsaXHgemmStridedBatchedCuda(real_a, imag_a, real_b, imag_b, real_c, imag_c, param, stream); - break; - case DSA_BLAS_LIB_MAGMA: - //dsaXHgemmStridedBatchedMagma(real_a, imag_a, real_b, imag_b, real_c, imag_c, param, stream); - break; - case DSA_BLAS_LIB_CUTLASS: - //dsaXHgemmStridedBatchedCutlass(real_a, imag_a, real_b, imag_b, real_c, imag_c, param); - break; - case DSA_BLAS_LIB_OPENBLAS: - //dsaXHgemmStridedBatchedOpenblas(real_a, imag_a, real_b, imag_b, real_c, 
imag_c, param); - break; - case DSA_BLAS_LIB_TCC: - //dsaXHgemmStridedBatchedTcc(real_a, imag_a, real_b, imag_b, real_c, imag_c, param); - break; - default: - std::cout << "dsaX Error: Unknown blas_lib " << param.blas_lib << " given." << std::endl; - exit(0); - } -} diff --git a/src/dsaX_interface.cpp b/src/interface.cpp similarity index 97% rename from src/dsaX_interface.cpp rename to src/interface.cpp index e0f294a..31dc832 100644 --- a/src/dsaX_interface.cpp +++ b/src/interface.cpp @@ -3,10 +3,10 @@ #include #include -#include "dsaX_params.h" -#include "dsaX_cuda_interface.h" -#include "dsaX_utils.h" -#include "dsaX_ftd.h" +#include "params.h" +#include "cuda_interface.h" +#include "utils.h" +#include "fast_time_domain.h" using namespace std; diff --git a/src/dsaX_magma_interface.cu b/src/magma_interface.cu similarity index 86% rename from src/dsaX_magma_interface.cu rename to src/magma_interface.cu index eabfdbf..af91a52 100644 --- a/src/dsaX_magma_interface.cu +++ b/src/magma_interface.cu @@ -1,9 +1,9 @@ #include #include "dsaX.h" -#include "dsaX_params.h" -#include "dsaX_cuda_headers.h" -#include "dsaX_magma_headers.h" +#include "params.h" +#include "cuda_headers.h" +#include "magma_headers.h" using namespace std; diff --git a/src/dsaX_params.cpp b/src/params.cpp similarity index 99% rename from src/dsaX_params.cpp rename to src/params.cpp index 4179848..723264c 100644 --- a/src/dsaX_params.cpp +++ b/src/params.cpp @@ -1,6 +1,6 @@ #include -#include "dsaX_params.h" +#include "params.h" using namespace std; diff --git a/src/dsaX_psrdada_utils.cpp b/src/psrdada_utils.cpp similarity index 90% rename from src/dsaX_psrdada_utils.cpp rename to src/psrdada_utils.cpp index 07c16e6..3978ecd 100644 --- a/src/dsaX_psrdada_utils.cpp +++ b/src/psrdada_utils.cpp @@ -1,4 +1,4 @@ -#include "dsaX_psrdada_utils.h" +#include "psrdada_utils.h" void dsaX_dbgpu_cleanup(dada_hdu_t * in, dada_hdu_t * out) { diff --git a/src/dsaX_utils.cpp b/src/utils.cpp similarity index 91% rename 
from src/dsaX_utils.cpp rename to src/utils.cpp index d29e291..cc4194d 100644 --- a/src/dsaX_utils.cpp +++ b/src/utils.cpp @@ -1,9 +1,9 @@ #include -#include "dsaX_utils.h" -#include "dsaX_enums.h" -#include "dsaX_params.h" -#include "dsaX_cuda_interface.h" +#include "utils.h" +#include "enums.h" +#include "params.h" +#include "cuda_interface.h" using namespace std; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4a93c15..64aa8db 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -4,5 +4,5 @@ include_directories(${CLI11_SOURCE_DIR}/include/CLI) add_library(dsaX_tests command_line_params.cpp) -add_executable(dsaX_correlator_test dsaX_correlator_test.cpp) -target_link_libraries(dsaX_correlator_test dsaX dsaX_tests) +add_executable(correlator_test correlator_test.cpp) +target_link_libraries(correlator_test dsaX dsaX_tests) diff --git a/tests/command_line_params.cpp b/tests/command_line_params.cpp index 746b4cc..82c02e8 100644 --- a/tests/command_line_params.cpp +++ b/tests/command_line_params.cpp @@ -11,10 +11,10 @@ key_t out_key = 0x0000fada; // XGPU_BLOCK_KEY in dsaX_def.h // Test params bool run_beamformer = false; bool run_correlator = false; -bool input_rands = false; +bool input_rands = true; bool write_output = false; int test_iter = 1; -int n_streams = 10; +int n_streams = 8; // Test files std::string input_filename = "input.dat"; diff --git a/tests/dsaX_correlator_test.cpp b/tests/correlator_test.cpp similarity index 53% rename from tests/dsaX_correlator_test.cpp rename to tests/correlator_test.cpp index dfb58f0..6f8d6df 100644 --- a/tests/dsaX_correlator_test.cpp +++ b/tests/correlator_test.cpp @@ -8,35 +8,132 @@ #include #include +using namespace std; + // Include this file to access input parameters #include "command_line_params.h" +// Include this file to access test utilities +/** + * Promote complex char riri... data to planar half rr.. ii.. 
+ * + * @param[out] inr float precision real array + * @param[out] ini float precision imag array + * @param[in] input char precision complex array + * @param[in] rows number of rows + * @param[in] cols number of cols + */ +template void promoteComplexCharToFloat(prec *output, const char *input, const int rows, const int cols) { + +#pragma omp parallel for collapse(2) + int idx = 0; + for(int i=0; i> 4); + + // 240 in unsigned char binary is 11110000. Perform bitwise & on 240 and input char data iiiirrrr + // to get imag part 4 bit data + // iiii0000. + // Cast to signed char + // +-iii0000 + // Bitshift mantisa only to the right by 4 bits + // +-0000iii + // Cast to float and use CUDA intrinsic to cast to signed half + output[2*idx+1] = (prec)((char)(( (unsigned char)(input[idx]) & (unsigned char)(240) )) >> 4); + } + } +} + +// Assume ROW ordered data in interleaved format +template void host_MdagM_gemm(const prec *A, const prec *B, prec *C, const int m, const int n, const int k) { + +#pragma omp parallel for collapse(2) + for(int i=0; i prec test_hermiticity(const prec *C, const int m, const int n) { + + prec frob_norm = 0.0; + +#pragma omp parallel for collapse(2) reduction (+:frob_norm) + for(int i=0; i -using namespace std; - // The class offers entire file content read/write in single operation -class BinaryFileVector : public std::vector +class BinaryFileVector : public vector { public: - using std::vector::vector; + using vector::vector; bool loadFromFile(const char *fileName) noexcept { // Try to open a file specified by its name - std::ifstream file(fileName, std::ios::in | std::ios::binary); + ifstream file(fileName, ios::in | ios::binary); if (!file.is_open() || file.bad()) return false; // Clear whitespace removal flag - file.unsetf(std::ios::skipws); + file.unsetf(ios::skipws); // Determine size of the file - file.seekg(0, std::ios_base::end); + file.seekg(0, ios_base::end); size_t fileSize = file.tellg(); - file.seekg(0, std::ios_base::beg); + 
file.seekg(0, ios_base::beg); // Discard previous vector content resize(0); @@ -48,15 +145,15 @@ class BinaryFileVector : public std::vector // Read entire file content into prealocated vector memory insert(begin(), - std::istream_iterator(file), - std::istream_iterator()); + istream_iterator(file), + istream_iterator()); // Make sure entire content is loaded if(size() == fileSize) { - std::cout << "Successfully read file of size " << fileSize << std::endl; + cout << "Successfully read file of size " << fileSize << endl; return true; } else { - std::cout << "Unexpected file size." << std::endl; + cout << "Unexpected file size." << endl; return false; } } @@ -64,7 +161,7 @@ class BinaryFileVector : public std::vector bool saveToFile(const char *fileName) const noexcept { // Write entire vector content into a file specified by its name - std::ofstream file(fileName, std::ios::out | std::ios::binary); + ofstream file(fileName, ios::out | ios::binary); try { file.write((const char *) data(), size()); } @@ -75,10 +172,10 @@ class BinaryFileVector : public std::vector // Determine number of bytes successfully stored in file size_t fileSize = file.tellp(); if(size() == fileSize) { - std::cout << "Successfully wrote file of size " << fileSize << std::endl; + cout << "Successfully wrote file of size " << fileSize << endl; return true; } else { - std::cout << "Unexpected file size." << std::endl; + cout << "Unexpected file size." << endl; return false; } } @@ -102,13 +199,13 @@ int main(int argc, char **argv) { uint64_t sz, in_block_size, rd_size; in_block_size = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2; - std::cout << "Creating char file_array of size " << (1.0*sizeof(char)*in_block_size)/pow(1024,2) << " MB." << std::endl; + cout << "Creating char file_array of size " << (1.0*sizeof(char)*in_block_size)/pow(1024,2) << " MB." 
<< endl; char *file_data = (char *)malloc(in_block_size); // read one block of input data // get size of file if(!input_rands) { - std::cout << "attempting to read file " << input_filename.c_str() << std::endl; + cout << "attempting to read file " << input_filename.c_str() << endl; fin = fopen(input_filename.c_str(), "rb"); fseek(fin, 0L, SEEK_END); sz = ftell(fin); @@ -140,10 +237,10 @@ int main(int argc, char **argv) { int n_rand = in_block_size/sizeof(uint64_t); uint64_t *input_rand = (uint64_t*)malloc(n_rand); - std::random_device rd; - std::mt19937_64 gen(rd()); + random_device rd; + mt19937_64 gen(rd()); gen.seed(1234); - std::uniform_int_distribution dis; + uniform_int_distribution dis; for (int i = 0; i < n_rand; i++) input_rand[i] = dis(gen); //for (int i = 0; i < n_rand; i++) input_rand[i] = (uint64_t)1234; memcpy(file_data, (void*)input_rand, n_rand); @@ -152,7 +249,7 @@ int main(int argc, char **argv) { // Start dsaX program //--------------------------------------- - timer::Timer test_timer; + timer::Timer test_timer; dsaXInit(device_ordinal); @@ -168,7 +265,7 @@ int main(int argc, char **argv) { // Create GPU registered memory if using CUDA uint64_t input_size = n_streams*sizeof(char)*in_block_size; - std::cout << "Creating char input array of size " << input_size << " bytes." << std::endl; + cout << "Creating char input array of size " << input_size << " bytes." << endl; void *input_data = dsaXHostRegister(input_size); // Populate with random data. Each stream has the same data // To ensure the concurrency does not pollute accross streams. @@ -176,20 +273,33 @@ int main(int argc, char **argv) { // Create GPU registered output array uint64_t output_size = n_streams * sizeof(float) * NBASE*NCHAN_PER_PACKET*2*2; - std::cout << "Creating float output_array of size " << output_size << " bytes." << std::endl; + cout << "Creating float output_array of size " << output_size << " bytes." 
<< endl; void *output_data = dsaXHostRegister(output_size); + /* + float *A = (float*)dsaXHostRegister(2*sizeof(float)*96*512); + float *B = (float*)dsaXHostRegister(2*sizeof(float)*96*512); + float *C = (float*)dsaXHostRegister(2*sizeof(float)*96*96); + promoteComplexCharToFloat(A, file_data, 512, 96); + promoteComplexCharToFloat(B, file_data, 512, 96); + host_MdagM_gemm(A, B, C, 96, 96, 512); + */ + // Ensure test output array is zero memset(output_data, 0, output_size); - std::cout << "Total input size = " << (1.0 * input_size)/pow(1024,3) << " GB." << endl; - std::cout << "Expected output size = " << (1.0 * output_size)/pow(1024,3) << " GB." << endl; + cout << "Total input size = " << (1.0 * input_size)/pow(1024,3) << " GB." << endl; + cout << "Expected output size = " << (1.0 * output_size)/pow(1024,3) << " GB." << endl; test_timer.start(); correlator->compute(output_data, input_data); test_timer.stop(); + + float frob_norm = test_hermiticity((float*)output_data, 96, 96); + cout << "Frobenius norm = " << frob_norm << endl; + - //std::cout << "Output peek " << std::endl; + //cout << "Output peek " << endl; float *p = (float*)output_data; for(int i=0; i<8; i++) cout << "output[" << i << "] = " << p[i] << endl; @@ -202,7 +312,7 @@ int main(int argc, char **argv) { delete correlator; dsaXEnd(); - std::cout << "Test time = " << (1.0*test_timer.elapsed().count())/(1e6) << " seconds. " << endl; + cout << "Test time = " << (1.0*test_timer.elapsed().count())/(1e6) << " seconds. " << endl; // End dsaX program //--------------------------------------- @@ -218,7 +328,7 @@ int main(int argc, char **argv) { if (!binaryFileVector.loadFromFile(test_filename.c_str())) { - std::cout << "Failed to read the file." << std::endl; + cout << "Failed to read the file." 
<< endl; return 0; } @@ -237,14 +347,14 @@ int main(int argc, char **argv) { for (int i=0; i<8; i++) inspectPackedData(input_data[i], i); // Peek at output data (delete after development is complete) - for (int i=0; i Date: Thu, 4 Jul 2024 21:46:43 -0700 Subject: [PATCH 29/30] Add untracked files --- include/dsaX.h | 3 +- include/timer.h | 85 +++++++++++++ src/blas_interface.cpp | 28 +++++ src/dsaX_base.cpp | 9 ++ tests/utils.cpp | 89 ++++++++++++++ tests/utils.h | 5 + tests/utils/.gitignore | 2 - tests/utils/CMakeLists.txt | 11 -- tests/utils/CMakeLists.txt~ | 22 ---- tests/utils/gen_packet.py | 216 ---------------------------------- tests/utils/get_rms.py | 141 ---------------------- tests/utils/get_rms_packet.py | 36 ------ tests/utils/packet.out | Bin 4608 -> 0 bytes tests/utils/sockets.py | 31 ----- tests/utils/test.out | Bin 196608 -> 0 bytes 15 files changed, 218 insertions(+), 460 deletions(-) create mode 100644 include/timer.h create mode 100644 src/blas_interface.cpp create mode 100644 src/dsaX_base.cpp create mode 100644 tests/utils.cpp create mode 100644 tests/utils.h delete mode 100644 tests/utils/.gitignore delete mode 100644 tests/utils/CMakeLists.txt delete mode 100644 tests/utils/CMakeLists.txt~ delete mode 100644 tests/utils/gen_packet.py delete mode 100644 tests/utils/get_rms.py delete mode 100644 tests/utils/get_rms_packet.py delete mode 100644 tests/utils/packet.out delete mode 100644 tests/utils/sockets.py delete mode 100644 tests/utils/test.out diff --git a/include/dsaX.h b/include/dsaX.h index 8aff8c5..ff2772c 100644 --- a/include/dsaX.h +++ b/include/dsaX.h @@ -18,9 +18,10 @@ #define sep 1.0 // arcmin void dsaXInit(int device_ordinal = -1); + void dsaXEnd(); -//void dsaX + void *dsaXHostRegister(size_t size); diff --git a/include/timer.h b/include/timer.h new file mode 100644 index 0000000..6607d5d --- /dev/null +++ b/include/timer.h @@ -0,0 +1,85 @@ +// Copyright (C) 2023 by Mark Melton +// + +#pragma once +#include +#include + +namespace 
timer { + + template + inline void doNotOptimizeAway(const T& val) { + asm volatile("" : : "r,m"(val) : "memory"); + } + +#ifdef __clang__ + template + inline void doNotOptimizeAway(T& value) { + asm volatile("" : "+r,m"(value) : : "memory"); + } +#else + template + inline void doNotOptimizeAway(T& value) { + asm volatile("" : "+m,r"(value) : : "memory"); + } +#endif + + inline void doNotReorderBarrier() { + std::atomic_signal_fence(std::memory_order_acq_rel); + } + + /// The Timer class template implements a timer designed for minimal + /// overhead, ad-hoc timing of code regions including micro-timing + /// down to single machine instructions. + template + class Timer { + public: + using TimePoint = typename Clock::time_point; + + /// Run the supplied `code` in a loop `n` times. + template + Timer& run(size_t n, Code&& code) { + start(); + for (auto i = 0ul; i < n; ++i) { + code(); + } + stop(n); + return *this; + } + + /// Start the timer. + void start() { + start_ = Clock::now(); + } + + /// Stop the timer indicating `n` operations. + auto stop(size_t n = 1) { + auto end = Clock::now(); + iterations_ += n; + elapsed_ += std::chrono::duration_cast(end - start_); + return elapsed_; + } + + /// Return the average number of nanoseconds per operation. + auto elapsed_per_iteration() const { + return iterations_ > 0 ? (double)elapsed_.count() / iterations_ : 0.0; + } + + /// Return the elapsed duration. + auto elapsed() const { + return elapsed_; + } + + /// Return the iterations. 
+ auto iterations() const { + return iterations_; + } + + private: + TimePoint start_{}; + Duration elapsed_{}; + size_t iterations_{}; + }; + +}; // timer diff --git a/src/blas_interface.cpp b/src/blas_interface.cpp new file mode 100644 index 0000000..ed76f05 --- /dev/null +++ b/src/blas_interface.cpp @@ -0,0 +1,28 @@ +#include + +#include "dsaX.h" +#include "cublas_interface.h" +#include "magma_interface.h" + +void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param, int stream) { + switch (param.blas_lib) { + case DSA_BLAS_LIB_CUBLAS: + dsaXHgemmStridedBatchedCuda(real_a, imag_a, real_b, imag_b, real_c, imag_c, param, stream); + break; + case DSA_BLAS_LIB_MAGMA: + //dsaXHgemmStridedBatchedMagma(real_a, imag_a, real_b, imag_b, real_c, imag_c, param, stream); + break; + case DSA_BLAS_LIB_CUTLASS: + //dsaXHgemmStridedBatchedCutlass(real_a, imag_a, real_b, imag_b, real_c, imag_c, param); + break; + case DSA_BLAS_LIB_OPENBLAS: + //dsaXHgemmStridedBatchedOpenblas(real_a, imag_a, real_b, imag_b, real_c, imag_c, param); + break; + case DSA_BLAS_LIB_TCC: + //dsaXHgemmStridedBatchedTcc(real_a, imag_a, real_b, imag_b, real_c, imag_c, param); + break; + default: + std::cout << "dsaX Error: Unknown blas_lib " << param.blas_lib << " given." << std::endl; + exit(0); + } +} diff --git a/src/dsaX_base.cpp b/src/dsaX_base.cpp new file mode 100644 index 0000000..80a947a --- /dev/null +++ b/src/dsaX_base.cpp @@ -0,0 +1,9 @@ +#include "fast_time_domain.h" + +dsaXBase::dsaXBase() { + +} + +dsaXBase::~dsaXBase() { + +} diff --git a/tests/utils.cpp b/tests/utils.cpp new file mode 100644 index 0000000..bc10104 --- /dev/null +++ b/tests/utils.cpp @@ -0,0 +1,89 @@ +#include "utils.h" + +/** + * Promote complex char riri... data to planar half rr.. ii.. 
+ * + * @param[out] inr float precision real array + * @param[out] ini float precision imag array + * @param[in] input char precision complex array + * @param[in] rows number of rows + * @param[in] cols number of cols + */ +template void promoteComplexCharToFloat(prec *output, const char *input, const int rows, const int cols) { + +#pragma omp parallel for collapse(2) + int idx = 0; + for(int i=0; i> 4); + + // 240 in unsigned char binary is 11110000. Perform bitwise & on 240 and input char data iiiirrrr + // to get imag part 4 bit data + // iiii0000. + // Cast to signed char + // +-iii0000 + // Bitshift mantisa only to the right by 4 bits + // +-0000iii + // Cast to float and use CUDA intrinsic to cast to signed half + output[2*idx+1] = (prec)((char)(( (unsigned char)(input[2*idx+1]) & (unsigned char)(240) )) >> 4); + } + } +} + +// Assume ROW ordered data in interleaved format +template void host_MdagM_gemm(const prec *A, const prec *B, prec *C, const int m, const int n, const int k) { + +#pragma omp parallel for collapse(2) + for(int i=0; i prec test_hermiticity(const prec *C, const int m, const int n) { + + prec frob_norm = 0.0; + +#pragma omp parallel for collapse(2) reduction (+:frob_norm) + for(int i=0; i void promoteComplexCharToFloat(prec *output, const char *input, const int rows, const int cols); +template void host_MdagM_gemm(const prec *A, const prec *B, prec *C, const int m, const int n, const int k); +template prec test_hermiticity(const prec *C, const int m, const int n); diff --git a/tests/utils/.gitignore b/tests/utils/.gitignore deleted file mode 100644 index dafcc02..0000000 --- a/tests/utils/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -antennas.out -gen_antennas.py diff --git a/tests/utils/CMakeLists.txt b/tests/utils/CMakeLists.txt deleted file mode 100644 index 226c9de..0000000 --- a/tests/utils/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -# install step for utils -#------------------------------ -set(DSA_XENGINE_UTILS - # cmake-format: 
sortable - gen_packet.py - get_rms_packet.py - get_rms.py - sockets.py - ) -install(FILES ${DSA_XENGINE_UTILS} DESTINATION utils) -#------------------------------ diff --git a/tests/utils/CMakeLists.txt~ b/tests/utils/CMakeLists.txt~ deleted file mode 100644 index ab053c5..0000000 --- a/tests/utils/CMakeLists.txt~ +++ /dev/null @@ -1,22 +0,0 @@ -# install step for utils -#------------------------------ -set(DSA_XENGINE_UTILS - # cmake-format: sortable -/home/dmhowart/DSA110/dsa110-xengine/src/dsaX_bfCorr.cu dsaX_capture.h - dsaX_capture_manythread.h - dsaX_capture_pcap.h - dsaX_def.h - dsaX_cutlass_interface.h - ) -install(FILES ${DSA_XENGINE_HEADERS} DESTINATION include) -#------------------------------ - -# install step for executables -#----------------------------- -install(TARGETS - # cmake-format: sortable - dsaX_bfCorr - RUNTIME DESTINATION - bin - ) -#----------------------------- diff --git a/tests/utils/gen_packet.py b/tests/utils/gen_packet.py deleted file mode 100644 index 2ae1bee..0000000 --- a/tests/utils/gen_packet.py +++ /dev/null @@ -1,216 +0,0 @@ -import numpy as np, struct -import matplotlib.pyplot as plt - - -''' The aim here is to make two types of data packets: - - one with a tone at a particular frequency and set of antennas - - one with pure noise - -Structure is 3 ant, 384 chan, 2 time, 2 pol, r/i -4608 bytes long - -''' - - -def make_spectrum(packet,ant=0,pol=0): - - spec = np.zeros(384*2) - - d = np.asarray(struct.unpack('>4608B',packet)) - - # order is 3 antennas x 384 channels x 2 times x 2 pols x real/imag, with every 8 flipped - d = (d.reshape((3,384,2,2)))[ant,:,:,pol].ravel() - - d_r = ((d & 15) << 4) - d_i = d & 240 - d_r = d_r.astype(np.int8)/16 - d_i = d_i.astype(np.int8)/16 - - spec += d_r**2.+d_i**2. 
- spec = spec.reshape((384,2)).mean(axis=1) - return(spec) - -def plot_spectrum(data,ant=0,pol=0): - - spec = make_spectrum(data,ant=ant,pol=pol) - plt.plot(spec) - plt.xlabel('Channel') - plt.ylabel('Power') - plt.show() - -def make_histogram(packet): - ''' Makes histogram of packet - tested - ''' - - histo = np.zeros(16) - rms = 0. - - d = np.asarray(struct.unpack('>4608B',packet)) - - # order is 3 antennas x 384 channels x 2 times x 2 pols x real/imag, with every 8 flipped - d = (d.reshape((3,384,2,2))).ravel() - - d_r = ((d & 15) << 4) - d_i = d & 240 - d_r = d_r.astype(np.int8)/16 - d_i = d_i.astype(np.int8)/16 - - rms += 0.5*(np.std(d_r)**2.+np.std(d_i)**2.) - - hx = np.arange(16)-8 - - for i in range(384*2): - - histo[int(d_r[i])+8] += 1. - histo[int(d_i[i])+8] += 1. - - return(hx,histo/np.max(histo),np.sqrt(rms)) - -def histo_test(data): - - hx,histo,rms = make_histogram(data) - print('HISTOGRAM: ') - for i in range(16): - print(hx[i],histo[i]) - print() - print('RMS = ',rms) - print() - - -########## MAIN ############ - -# defaults -outfile = 'packet.out' -n_packet = 4608 # 4608 for single packet - -# decide which sort of packet to make -noise = True -tone = False -x16 = False - -# if tone -if tone is True: - - # defaults: - chans = np.arange(384)#np.asarray([10,100,190]) - #ant = 1 - amp_A = 9.0 - amp_B = 4. 
- - # derived quantities - amp_A = 16.*np.sqrt(amp_A) - amp_B = 16.*np.sqrt(amp_B) - ph = 2.*np.pi*np.random.uniform() - ramp_A = amp_A*np.cos(ph) - iamp_A = amp_A*np.sin(ph) - ph = 2.*np.pi*np.random.uniform() - ramp_B = amp_B*np.cos(ph) - iamp_B = amp_B*np.sin(ph) - - # make packet - real_part = np.zeros(n_packet,dtype='int8') - imag_part = np.zeros(n_packet,dtype='int8') - for ant in [0,1,2]: - for i in chans: - - # time 1 pol A - j = int(1536*ant + i*4) - real_part[j] = round(ramp_A) - imag_part[j] = round(iamp_A) - - # time 1 pol B - j = int(1536*ant + i*4 + 1) - real_part[j] = round(ramp_B) - imag_part[j] = round(iamp_B) - - # time 2 pol A - j = int(1536*ant + i*4 + 2) - real_part[j] = round(ramp_A) - imag_part[j] = round(iamp_A) - - # time 2 pol B - j = int(1536*ant + i*4 + 3) - real_part[j] = round(ramp_B) - imag_part[j] = round(iamp_B) - - - # make 4-bit versions - real_part = np.cast['uint8'](real_part) - imag_part = np.cast['uint8'](imag_part) - for i in range(n_packet): - real_part[i] = real_part[i] >> 4 - imag_part[i] = (imag_part[i] >> 4) << 4 - - # finish packet - packet = np.zeros(n_packet,dtype='uint8') - for i in range(n_packet): - packet[i] = real_part[i] | imag_part[i] - - # if x16 - if (x16): - - p2 = np.zeros(21*n_packet,dtype='uint8') - for i in range(21): - p2[i*n_packet:(i+1)*n_packet] = packet - - out_str = p2.tobytes() - - else: - - out_str = packet.tobytes() - -# if noise -if noise is True: - - # defaults - rms = 1.5 # 4-bit - erms = rms*16. 
- - # make real and imag parts - real_part = np.zeros(n_packet,dtype='int8') - imag_part = np.zeros(n_packet,dtype='int8') - - for ant in [0, 1, 2]: - for i in np.arange(384): - - # time 1 pol A - j = int(1536*ant + i*4) - real_part[j] = round(np.random.normal()*erms) - imag_part[j] = round(np.random.normal()*erms) - - # time 1 pol B - j = int(1536*ant + i*4 + 1) - real_part[j] = round(np.random.normal()*erms) - imag_part[j] = round(np.random.normal()*erms) - - # time 2 pol A - j = int(1536*ant + i*4 + 2) - real_part[j] = round(np.random.normal()*erms) - imag_part[j] = round(np.random.normal()*erms) - - # time 2 pol B - j = int(1536*ant + i*4 + 3) - real_part[j] = round(np.random.normal()*erms) - imag_part[j] = round(np.random.normal()*erms) - - # make 4-bit versions - real_part = np.cast['uint8'](real_part) - imag_part = np.cast['uint8'](imag_part) - for i in range(n_packet): - real_part[i] = real_part[i] >> 4 - imag_part[i] = (imag_part[i] >> 4) << 4 - - # finish packet - packet = np.zeros(n_packet,dtype='uint8') - for i in range(n_packet): - packet[i] = real_part[i] | imag_part[i] - - out_str = packet.tobytes() - - -newFile = open(outfile, "wb") -newFile.write(out_str) -newFile.close() - - -#plot_spectrum(out_str,pol=1,ant=1) diff --git a/tests/utils/get_rms.py b/tests/utils/get_rms.py deleted file mode 100644 index 8854a36..0000000 --- a/tests/utils/get_rms.py +++ /dev/null @@ -1,141 +0,0 @@ -import numpy as np -import sockets as s -import struct -import sys -import matplotlib.pyplot as plt - -# for file writing - -def write_bin(data,fl='test.dat'): - - f = open(fl,'w+b') - for packet in data: - d = bytearray(np.asarray(struct.unpack('>4616B',packet))[8:].astype(np.int8)) - print(len(d)) - f.write(d) - - f.close() - - -# for making histogram of input - -def make_histogram(data,ant=0,pol=0): - - histo = np.zeros(16) - rms = 0. 
- - for packet in data: - - d = np.asarray(struct.unpack('>4616B',packet))[8:] - - # order is 3 antennas x 384 channels x 2 times x 2 pols x real/imag, with every 8 flipped - d = (d.reshape((3,384,2,2)))[ant,:,:,pol].ravel() - - d_r = ((d & 15) << 4) - d_i = d & 240 - d_r = d_r.astype(np.int8)/16 - d_i = d_i.astype(np.int8)/16 - - rms += 0.5*(np.std(d_r)**2.+np.std(d_i)**2.) - - for i in range(384*2): - - histo[int(d_r[i])+8] += 1. - histo[int(d_i[i])+8] += 1. - - return histo/np.max(histo),np.sqrt(rms) - -# for making spectrum from data -def decode_data(data,ant=0,pol=0): - - spec = np.zeros(384*2) - - for packet in data: - - d = np.asarray(struct.unpack('>4616B',packet))[8:] - - # order is 3 antennas x 384 channels x 2 times x 2 pols x real/imag, with every 8 flipped - d = (d.reshape((3,384,2,2)))[ant,:,:,pol].ravel() - - d_r = ((d & 15) << 4) - d_i = d & 240 - d_r = d_r.astype(np.int8)/16 - d_i = d_i.astype(np.int8)/16 - - spec += d_r**2.+d_i**2. - - spec = spec.reshape((384,2)).mean(axis=1) - return(spec) - -# for decoding packets -def decode_header(data): - - min_s = 10000 - max_s = 0 - - for packet in data: - - d = np.asarray(struct.unpack('>4616B',packet)) - - # packet id - p = 0 - p = p | ((d[4] & 224) >> 5) - p = p | (d[3] << 3) - p = p | (d[2] << 11) - p = p | (d[1] << 19) - p = p | (d[0] << 27) - - # spectrum id - sp = 0 - sp = sp | ((d[4] & 31) << 8) - sp = sp | d[5] - - if (spmax_s): - max_s = sp - - print(p,sp) - - print(min_s,max_s) - -# MAIN - -n = 10000 -ip = '10.41.0.62' -port=4011 -data = s.capture(ip=ip,port=port,n=n) -ant=0 -pol=0 - -#decode_header(data) - -histo,rms = make_histogram(data,ant=ant,pol=pol) -print() -print('RMS:',rms/np.sqrt(1.*n)) -for i in np.arange(16): - print(histo[i],' ',) - -sys.exit() - -spec = decode_data(data,ant=ant,pol=pol) -spec = np.sqrt(spec/n/2.) 
-print() -print('Have spectral points',len(spec)) -print() -#for i in np.arange(len(spec)): -# print(spec[i],' ',) - -plt.plot(spec) -plt.show() - - - - - - - - - - - diff --git a/tests/utils/get_rms_packet.py b/tests/utils/get_rms_packet.py deleted file mode 100644 index f75d278..0000000 --- a/tests/utils/get_rms_packet.py +++ /dev/null @@ -1,36 +0,0 @@ -import socket, numpy as np -from progress.bar import Bar -import sockets as s -import struct -import sys -import matplotlib.pyplot as plt - -# ip as string, port as int, buf as int -def capture(n=100,ip=None,port=None,buf=4616): - - if ip is None: - print('No IP') - return() - - if port is None: - print('No port') - return() - - sock = socket.socket(socket.AF_INET,socket.SOCK_DGRAM) - sock.bind((ip,port)) - - captured=0 - packs = [] - bar = Bar('Capturing '+str(n)+' packets...', max=n) - while capturede4S(FWz+KX5=k0#xq zA)|sjI#5k2{iJ=yq++YCR0dKIVIz(?m~kZogI6z&!a&zbZ_VX!pkwSO$i?_`U7D?Q zO6%YMTb-6B32}ihoNJcxl&B%ce@x}Pn3oTga+5A|*b1zJnj55|on7qw&Rlh;Q+HJR zojoghQ;sU#$lqxTlvKGwe5ocJ$&$6|KKn7C2Ev|MHx<4V3Hm{h)y?Kp&FueZL=#d6AId+dFrQ8 zqqFDwx?}{*DktmHlwLSX2+;Uv#8DNQJSg+ppHWyp5~0FbpJ-B|PZ`6Sq6S4!G;e&; z%I6@`3L@Z9Ku)5xd#t9$8d7nDfc>{`Wn;lZpMP{kOx+Rg^2#-?B9ULP@JaQrUixR@ zm*7p~f58eQ!P$b$i0<3AVzae6Lwnb(@msxf8IRD2h1yf!M{7jvbKxdZc<)l`BuW*g zbI!YGo$jxRmdi|z4E@nK6&@2}1PD#HT>u$bM7p?pWeWdBk3s-hFizHtv#;2M29B3^ zdLumMp-qjmsfeB7H=EQ;{MxzpUq|rzWS#JQxu9!whL@BY89ELG(&uVN>)UU?_sXH~C6B5@Wmki{`lB=a#w}MMYrezVvywXohr0YSlqqEe^+u@ZYMwlr zrLa>0TXH_5Q-bmQR1Mw`(;pb+o@^BrxaAsMr+ej*0IT0I=E3kv5}`Q4RfCIm^rR=F zm9;x7Rgz!_9%$%%hpeF?dd6sF6@?*l4**itB0%sN3O<7lY&tzQ}WBRqmN{YVYqKsJysaXubTB_Q4v7m^U&+8pNV$ zP^6s6Nl|Mpx5tZG4hrPa9(W9~;VFhqes6H9;M!AK7NRNFgGEzbBN zJqa<(e#qauBg>pf*zqtKVRzIjn_N-21+KqiP7=5MO3x#{=A zBFmyHz$CPaN;S66>70Al$)AoU)R;#JmHsui($&nILH$?buqVhIe=FurX=I93-)gv zmZty*?!M^l5q8c2@uf_>mGQGa9<+GLcag^dPi~uf)xQtU5t#b-TsEj-5a;aw3F^9Y zN*V!;-b*N?RGD%_BjM;+*LfzAR1e`2Gl@ynu0h2p^5w;m7pXD44loY(_cxOz|RU^bv=!=1iR1G`V6vm 
z2~t@iZKr%yms;WZx^&)@yAa6sl40)fL({ri$KqaoQb%OTqPQ3qu;--h-!g-wwM5nK z2dr{Y`?|4^$eFEFzT55DbdsxlFb+Fx2RQ6;d6y0uTe;22_ZZBo>dLX?PYy?8wNK}u z@8z|Zte$RmzGIRn+A*W5)rywggL{pvumhVI5spX^x`C$@_>eRAB?clUc)bv}^Z0qd zXtZp*CKg?gU*~&@PNRXozbi2CFjT+dhn_hDhGF=d{7$pHZPXAb zZ0pNh>m{IaO~O1$o_a+Ue-w6eIWtL8UJF~S_AUV6IfhYi$M3bPWH7N8Exfr?pNM}~ z|HJFB%Ds3s>$tCZ4Ytx|krLBMQ2O=WxZ_kRK|A4u{*8QHa23ftgYFBv#r(8nIL++s zdo5gT$4_ryKhhjce)8-Zfsf5H-z^Nr9~rMM=TWnfq)R;E1e?RH=u})hm?5$KTaV^h z3+Ctz-!%bvHeB_Z|If*S#SRuc6k3@r5xC8peTmD2N_eYhrWVwRG{ z=06&j)ui)8%mjG>|4QP)QPs9(cvwQtpPbk~eLG5Fl=pJ+ABANjW?_DuWWa_+8(8+g z=re@+fXkS#@@r)$c8c(ybo)4q^X#1l`SkiP2H*HJa_w%rE4vrTPE zP4X()N|>5|#~QdcL$u`YuhUvxkvJx49WRmzQ`om9;e zq#w5%v9rJmUT1#*p6PR8z%kc# z!$CuJTr!>%n(KXWo;!hYD=!rsZzr;1eweo#D!wR4%p6?^u$mx48w=sACQQxXfW^cwa>~E#w7$H=ORS0+sIwL!cznp4U z;%2mRCNT-XU<3|f@bXAQZ#fveu?@D6O%& zyk4qS%+`mMg4J&I^(Zu3Lw2AKTu7%w%#Nv%DM$*1-n}3716if`Uj*G((9nFEf-ZEp zDPo|V#tT76e-x7S1NGWn8@}WFfn8$D^swxC+^VX_V(HzhlG>YJ7=y?%Ahh@KEN8aU zP2@D@FVrfzrmWa*<=ThDE&$SBsU6v!{nMDS*1;0x8<7&4M?kiW4(|1Q(WpL5D!?WC zsW7Xa^LP#I27a>2n21dIARbD1PRu3=qjYtL+_9zEuy$qaext-6DQS6JP Q(8{YQee{VQIaCb)58tM{bpQYW diff --git a/tests/utils/sockets.py b/tests/utils/sockets.py deleted file mode 100644 index aaff3f7..0000000 --- a/tests/utils/sockets.py +++ /dev/null @@ -1,31 +0,0 @@ -import socket, numpy as np - -# ip as string, port as int, buf as int -def capture(n=100,ip=None,port=None,buf=4616): - - if ip is None: - print('No IP') - return() - - if port is None: - print('No port') - return() - - sock = socket.socket(socket.AF_INET,socket.SOCK_DGRAM) - sock.bind((ip,port)) - - captured=0 - packs = [] - while captured Date: Fri, 12 Jul 2024 06:35:38 -0700 Subject: [PATCH 30/30] mid merge --- include/CMakeLists.txt | 2 + include/cuda_interface.h | 2 +- include/dsaX.h | 40 ++- include/dsaX_api.h | 36 +++ include/dsaX_def.h | 7 + include/dsaX_malloc.h | 113 +++++++ include/dsaX_ptr.h | 102 ++++++ include/enums.h | 11 + src/CMakeLists.txt | 5 +- src/correlator.cpp | 3 +- 
src/cuda_handles.cu | 8 +- src/cuda_interface.cu | 89 ++++-- src/dsaX_api.cu | 43 +++ src/dsaX_ptr.cpp | 155 ++++++++++ src/interface.cpp | 3 + src/malloc.cu | 631 ++++++++++++++++++++++++++++++++++++++ tests/correlator_test.cpp | 10 +- 17 files changed, 1220 insertions(+), 40 deletions(-) create mode 100644 include/dsaX_api.h create mode 100644 include/dsaX_malloc.h create mode 100644 include/dsaX_ptr.h create mode 100644 src/dsaX_api.cu create mode 100644 src/dsaX_ptr.cpp create mode 100644 src/malloc.cu diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt index 65ddb04..9a7cbbd 100644 --- a/include/CMakeLists.txt +++ b/include/CMakeLists.txt @@ -6,6 +6,8 @@ set(DSA_XENGINE_HEADERS # cmake-format: sortable dsaX.h dsaX_def.h + dsaX_malloc.h + dsaX_ptr.h fast_time_domain.h cuda_interface.h cuda_handles.h diff --git a/include/cuda_interface.h b/include/cuda_interface.h index 6ae59e2..42043e2 100644 --- a/include/cuda_interface.h +++ b/include/cuda_interface.h @@ -7,7 +7,7 @@ #include "dsaX.h" void dsaXInitCuda(int dev); -void dsaXDestroyCuda(int dev); +void dsaXDestroyCuda(); void initBLASCuda(); void destroyBLASCuda(); diff --git a/include/dsaX.h b/include/dsaX.h index ff2772c..f370bc0 100644 --- a/include/dsaX.h +++ b/include/dsaX.h @@ -11,20 +11,40 @@ // Uncomment to try new pure cuBLAS //#define OLD_BLAS -// required to prevent overflow in corr matrix multiply -#define halfFac 4 - -// beam sep -#define sep 1.0 // arcmin - +/** + * Initialize the library. This function will initialise + * a device if using CUDA and any BLAS libraries that are + * enabled, such as cublas. + * @param[in] device_ordinal The GPU device to init + */ void dsaXInit(int device_ordinal = -1); +/** + * Finalize the library. This function will finalize + * a device if using CUDA and any BLAS libraries that are + * enabled, such as cublas. It will also dump any statistics + * collected, such as performance metrics. 
+ */ void dsaXEnd(); - - +/** + * This function will allocate pinned device memory of the + * given size in bytes, and return a void pointer to that + * memory. The user may delete the memory safely in their + * application code. + * @param[in] size The byte size of pinned memory to be allocated + * by dsaX. + */ void *dsaXHostRegister(size_t size); +/** + * This function allows the user to inspect the (4b,4b) char sized + * complex data at byte address i on the host. If 'non-zero' is true + * then the complex element will print only if either the real + * or imaginary element is non-zero. Useful for checking if + * an array is populated. + * @param[in] input The (4b,4b) char input array + * @param[in] i The ith element of the array + * @param[in] non-zero If true, print only elements with non-zero values + */ void inspectPackedData(char input, int i, bool non_zero = false); - -void dsaXCorrelator(void *output_data, void *input_data, dsaXCorrParam *param); diff --git a/include/dsaX_api.h b/include/dsaX_api.h new file mode 100644 index 0000000..3767600 --- /dev/null +++ b/include/dsaX_api.h @@ -0,0 +1,36 @@ +#pragma once + +#include + +#include "enums.h" + +#define STRINGIFY__(x) #x +#define __STRINGIFY__(x) STRINGIFY__(x) + +/** + @brief Wrapper around cudaMemcpy or driver API equivalent + @param[out] dst Destination pointer + @param[in] src Source pointer + @param[in] count Size of transfer + @param[in] kind Type of memory copy +*/ +void dsaXMemcpy_(void *dst, const void *src, size_t count, dsaXMemcpyKind kind, const char *func, const char *file, + const char *line); + +/** + @brief Wrapper around cudaMemcpyAsync or driver API equivalent + @param[out] dst Destination pointer + @param[in] src Source pointer + @param[in] count Size of transfer + @param[in] kind Type of memory copy + @param[in] stream Stream to issue copy +*/ +void dsaXMemcpyAsync_(void *dst, const void *src, size_t count, dsaXMemcpyKind kind, const cudaStream_t &stream, + const char *func, const 
char *file, const char *line); + + +#define dsaXMemcpy(dst, src, count, kind) \ + ::dsaXMemcpy_(dst, src, count, kind, __func__, file_name(__FILE__), __STRINGIFY__(__LINE__)) + +#define dsaXMemcpyAsync(dst, src, count, kind, stream) \ + ::dsaXMemcpyAsync_(dst, src, count, kind, stream, __func__, file_name(__FILE__), __STRINGIFY__(__LINE__)) diff --git a/include/dsaX_def.h b/include/dsaX_def.h index 257f493..5b3af78 100644 --- a/include/dsaX_def.h +++ b/include/dsaX_def.h @@ -91,3 +91,10 @@ #define NBMS 256 #define P_SIZE 4108 #define NWAIT 100000 + +// required to prevent overflow in corr matrix multiply +#define halfFac 4 + +// beam sep +#define sep 1.0 // arcmin + diff --git a/include/dsaX_malloc.h b/include/dsaX_malloc.h new file mode 100644 index 0000000..04d24b0 --- /dev/null +++ b/include/dsaX_malloc.h @@ -0,0 +1,113 @@ +#pragma once + +#include +#include +#include // for getpagesize() +#include // for backtrace +#include // for std::map + +#include +#include + + +using namespace std; + +// strip path from __FILE__ +// DMH: Place somewhere more sensible when working +constexpr const char *str_end(const char *str) { return *str ? str_end(str + 1) : str; } +constexpr bool str_slant(const char *str) { return *str == '/' ? true : (*str ? str_slant(str + 1) : false); } +constexpr const char *r_slant(const char *str) { return *str == '/' ? (str + 1) : r_slant(str - 1); } +constexpr const char *file_name(const char *str) { return str_slant(str) ? r_slant(str_end(str)) : str; } + +// Define wrappers around function. 
May wish to place _ +// methods in a dsaX namespace later +void *pinned_malloc_(const char *func, const char *file, int line, size_t size); +#define pinned_malloc(size) pinned_malloc_(__func__, file_name(__FILE__), __LINE__, size) + +void *device_malloc_(const char *func, const char *file, int line, size_t size); +#define device_malloc(size) device_malloc_(__func__, file_name(__FILE__), __LINE__, size) + +void *device_pinned_malloc_(const char *func, const char *file, int line, size_t size); +#define device_pinned_malloc(size) device_pinned_malloc_(__func__, file_name(__FILE__), __LINE__, size) + +void *safe_malloc_(const char *func, const char *file, int line, size_t size); +#define safe_malloc(size) safe_malloc_(__func__, file_name(__FILE__), __LINE__, size) + +void *mapped_malloc_(const char *func, const char *file, int line, size_t size); +#define mapped_malloc(size) mapped_malloc_(__func__, file_name(__FILE__), __LINE__, size) + +void *managed_malloc_(const char *func, const char *file, int line, size_t size); +#define managed_malloc(size) managed_malloc_(__func__, file_name(__FILE__), __LINE__, size) + +void managed_free_(const char *func, const char *file, int line, void *ptr); +#define managed_free(ptr) managed_free_(__func__, file_name(__FILE__), __LINE__, ptr) + +void device_free_(const char *func, const char *file, int line, void *ptr); +#define device_free(ptr) device_free_(__func__, file_name(__FILE__), __LINE__, ptr) + +void device_pinned_free_(const char *func, const char *file, int line, void *ptr); +#define device_pinned_free(ptr) device_pinned_free_(__func__, file_name(__FILE__), __LINE__, ptr) + +void host_free_(const char *func, const char *file, int line, void *ptr); +#define host_free(ptr) host_free_(__func__, file_name(__FILE__), __LINE__, ptr) + +/* + @brief Get device view of a host-mapped pointer +*/ +void *get_mapped_device_pointer_(const char *func, const char *file, int line, const void *ptr); +#define get_mapped_device_pointer(ptr) 
get_mapped_device_pointer_(__func__, file_name(__FILE__), __LINE__, ptr) + +// Create a mem_pool namespace to differentiate +// bewtween regular memory management methods +// and those utilising memory pooling +namespace mem_pool { + + /** + @brief Initialize the memory pool allocator + */ + void init(); + + /** + @brief Allocate device-memory. If free pre-existing allocation exists + reuse this. + @param size Size of allocation + @return Pointer to allocated memory + */ + void *device_malloc_(const char *func, const char *file, int line, size_t size); + + /** + @brief Virtual free of pinned-memory allocation. + @param ptr Pointer to be (virtually) freed + */ + void device_free_(const char *func, const char *file, int line, void *ptr); + + /** + @brief Allocate pinned-memory. + If a free pre-existing allocation exists, reuse this. + @param size Size of allocation + @return Pointer to allocated memory + */ + void *pinned_malloc_(const char *func, const char *file, int line, size_t size); + + /** + @brief Virtual free of pinned-memory allocation. + @param ptr Pointer to be (virtually) freed + */ + void pinned_free_(const char *func, const char *file, int line, void *ptr); + + /** + @brief Free all outstanding device-memory allocations. + */ + void flush_device(); + + /** + @brief Free all outstanding pinned-memory allocations. 
+ */ + void flush_pinned(); +} + +#define pool_device_malloc(size) mem_pool::device_malloc_(__func__, __FILE__, __LINE__, size) +#define pool_device_free(ptr) mem_pool::device_free_(__func__, __FILE__, __LINE__, ptr) +#define pool_pinned_malloc(size) mem_pool::pinned_malloc_(__func__, __FILE__, __LINE__, size) +#define pool_pinned_free(ptr) mem_pool::pinned_free_(__func__, __FILE__, __LINE__, ptr) + diff --git a/include/dsaX_ptr.h b/include/dsaX_ptr.h new file mode 100644 index 0000000..de452f0 --- /dev/null +++ b/include/dsaX_ptr.h @@ -0,0 +1,102 @@ +#pragma once + +#include +#include "dsaX_malloc.h" + +/** + Object that stores a memory allocation with different views for + host or device. Depending on the nature of the underlying memory + type, both views may not be defined + + type defined views + DSAX_MEMORY_DEVICE device only + DSAX_MEMORY_DEVICE_PINNED device only + DSAX_MEMORY_HOST host only + DSAX_MEMORY_HOST_PINNED both + DSAX_MEMORY_MAPPED both (pinned to host) + DSAX_MEMORY_MANAGED both +*/ +class dsaX_ptr +{ + friend std::ostream &operator<<(std::ostream &output, const dsaX_ptr &ptr); + dsaXMemoryType type = DSA_MEMORY_INVALID; /** Memory type of the allocation */ + size_t size = 0; /** Size of the allocation */ + bool pool = false; /** Is the allocation is pooled */ + void *device = nullptr; /** Device-view of the allocation */ + void *host = nullptr; /** Host-view of the allocation */ + bool reference = false; /** Is this a reference to another allocation */ + + /** + @brief Internal deallocation routine + */ + void destroy(); + +public: + dsaX_ptr() = default; + dsaX_ptr(dsaX_ptr &&) = default; + dsaX_ptr &operator=(dsaX_ptr &&); + dsaX_ptr(const dsaX_ptr &) = delete; + dsaX_ptr &operator=(const dsaX_ptr &) = delete; + + /** + @brief Constructor for dsaX_ptr + @param[in] type The memory type of the allocation + @param[in] size The size of the allocation + @param[in] pool Whether the allocation should be in the memory pool (default is true) + */ + 
dsaX_ptr(dsaXMemoryType type, size_t size, bool pool = true); + + /** + @brief Constructor for dsaX_ptr where we are wrapping a non-owned pointer + @param[in] ptr Raw base pointer + @param[in] type The memory type of the allocation + */ + dsaX_ptr(void *ptr, dsaXMemoryType type); + + /** + @brief Destructor for the dsaX_ptr + */ + virtual ~dsaX_ptr(); + + /** + @brief Specialized exchange function to use in place of + std::exchange when exchanging dsaX_ptr objects: moves obj to + *this, and moves new_value to obj + @param[in,out] obj + @param[in] new_value New value for obj to take + */ + void exchange(dsaX_ptr &obj, dsaX_ptr &&new_value); + + /** + @return Returns true if allocation is visible to the device + */ + bool is_device() const; + + /** + @return Returns true if allocation is visible to the host + */ + bool is_host() const; + + /** + Return view of the pointer. For mapped memory we return the device view. + */ + void *data() const; + + /** + Return the device view of the pointer + */ + void *data_device() const; + + /** + Return the host view of the pointer + */ + void *data_host() const; + + /** + Return if the instance is a reference rather than an allocation + */ + bool is_reference() const; +}; + +std::ostream &operator<<(std::ostream &output, const dsaX_ptr &ptr); + diff --git a/include/enums.h b/include/enums.h index 607d9d3..aa86573 100644 --- a/include/enums.h +++ b/include/enums.h @@ -2,6 +2,16 @@ #define DSA_INVALID_ENUM (-0x7fffffff - 1) +typedef enum dsaXMemoryType_s { + DSA_MEMORY_DEVICE, + DSA_MEMORY_DEVICE_PINNED, + DSA_MEMORY_HOST, + DSA_MEMORY_HOST_PINNED, + DSA_MEMORY_MAPPED, + DSA_MEMORY_MANAGED, + DSA_MEMORY_INVALID = DSA_INVALID_ENUM +} dsaXMemoryType; + typedef enum dsaXError_t { DSA_SUCCESS = 0, DSA_ERROR = 1, @@ -63,3 +73,4 @@ typedef enum dsaXMemcpyKind_s { dsaXMemcpyDeviceToDeviceAsync = 7, dsaXMemcpyInvalid = DSA_INVALID_ENUM } dsaXMemcpyKind; + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 67f8543..de05a16 100644 
--- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -13,11 +13,14 @@ endif() set(DSAX_OBJS cuda_interface.cu cublas_interface.cu + malloc.cu + dsaX_ptr.cpp cuda_handles.cu magma_interface.cu blas_interface.cpp beamformer.cpp dsaX_base.cpp + correlator.cpp interface.cpp utils.cpp @@ -65,7 +68,7 @@ add_library(DSA_XENGINE::dsaX ALIAS dsaX) target_sources(dsaX PRIVATE $ ${DSAX_CU_OBJS}) if(CUDAToolkit_FOUND) - target_link_libraries(dsaX INTERFACE CUDA::cudart_static ${CUDA_cublas_LIBRARY}) + target_link_libraries(dsaX INTERFACE CUDA::cuda_driver CUDA::cudart_static ${CUDA_cublas_LIBRARY}) endif() if(DSA_XENGINE_ENABLE_PSRDADA) diff --git a/src/correlator.cpp b/src/correlator.cpp index e45595d..2662e58 100644 --- a/src/correlator.cpp +++ b/src/correlator.cpp @@ -188,7 +188,7 @@ void Correlator::compute(void *output, void *input) { dsaXDeviceSynchronize(); } - +/* // correlator function // workflow: copy to device, reorder, stridedBatchedGemm, reorder, copy back to host // DMH: CUDA references excised. 
Make me a class @@ -282,3 +282,4 @@ void dcorrelator(corr_handle *d) { // reorder output data reorderCorrOutput(d); } +*/ diff --git a/src/cuda_handles.cu b/src/cuda_handles.cu index 9b65281..1b756d0 100644 --- a/src/cuda_handles.cu +++ b/src/cuda_handles.cu @@ -10,10 +10,10 @@ using namespace std; //------------------------- void init_streams(unsigned int n_streams) { - if(n_streams < 2 || n_streams > 9) { - cout << "dsaX Error: Must have at least 2 and fewer than 9 streams, requested " << n_streams << endl; - exit(0); - } + //if(n_streams < 2 || n_streams > 9) { + //cout << "dsaX Error: Must have at least 2 and fewer than 9 streams, requested " << n_streams << endl; + //exit(0); + //} if(!stream_init) { streams.reserve(n_streams); diff --git a/src/cuda_interface.cu b/src/cuda_interface.cu index 51ee957..854b75c 100644 --- a/src/cuda_interface.cu +++ b/src/cuda_interface.cu @@ -5,10 +5,12 @@ #include "cuda_interface.h" #include "cuda_kernels.h" #include "cuda_handles.h" +// DMH: Everything in this file is CUDA aware. -using namespace std; +//#include "dsaX_malloc.h" +#include "dsaX_ptr.h" -// DMH: Everything in this file is CUDA aware. 
+using namespace std; __global__ void deviceInspectHalfCI(half *input, int stage) { int x = blockIdx.x * blockDim.x + threadIdx.x; @@ -36,8 +38,8 @@ void destroyStreamsCuda(){ destroy_streams(); } -void dsaXDestroyCuda(int dev){ - // +void dsaXDestroyCuda(){ + cudaDeviceReset(); } void *dsaXHostRegisterCuda(size_t size) { @@ -55,16 +57,34 @@ void *dsaXHostRegisterCuda(size_t size) { void initializeCorrCudaMemory(corr_handle *d, unsigned int n_streams) { // for correlator - cudaMalloc((void **)(&d->d_input), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams); - cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams); - cudaMalloc((void **)(&d->d_i), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams); - cudaMalloc((void **)(&d->d_tx), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams); - cudaMalloc((void **)(&d->d_output), sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2*n_streams); - cudaMalloc((void **)(&d->d_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams); - cudaMalloc((void **)(&d->d_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams); - cudaMalloc((void **)(&d->d_tx_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams); - cudaMalloc((void **)(&d->d_tx_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams); + + cudaMalloc((void **)(&d->d_input), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams); + //dsaX_ptr ptr = dsaX_ptr(DSA_MEMORY_DEVICE, sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams, true); + //cout << &ptr << endl; + + //d->d_input = + + cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams); + cudaMalloc((void **)(&d->d_i), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams); + //cudaMalloc((void **)(&d->d_tx), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams); + 
cudaMalloc((void **)(&d->d_output), sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2*n_streams); + cudaMalloc((void **)(&d->d_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams); + cudaMalloc((void **)(&d->d_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams); + //cudaMalloc((void **)(&d->d_tx_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams); + //cudaMalloc((void **)(&d->d_tx_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams); + + // Total device memeory + uint64_t mem_size = sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams; + mem_size += sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams; + mem_size += sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams; + mem_size += sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2*n_streams; + mem_size += sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams; + mem_size += sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams; + mem_size += sizeof(int)*NBASE; + + cout << "mem_size = " << mem_size/pow(1024,3) << " GB" << endl; + //exit(0); // DMH: fix me cudaMalloc((void **)(&d->d_idxs), sizeof(int)*NBASE); } @@ -343,18 +363,45 @@ void sumBeamCuda(unsigned char *input, float *output, int blocks, int tpb) { sum_beam<<>>(input, output); } +// CUDA API wrappers +// DMH: Wrap all these calls around a CHECK_ERROR to save on +// lines of code +void dsaXDeviceSynchronizeCuda() { + + cudaError error = cudaSuccess; + cudaDeviceSynchronize(); + if(error != cudaSuccess) { + cudaGetLastError(); + exit(0); + } +} + void dsaXmemsetCuda(void *array, int ch, size_t n){ - cudaMemset(array, ch, n); + + cudaError error = cudaSuccess; + error = cudaMemset(array, ch, n); + if(error != cudaSuccess) { + cudaGetLastError(); + exit(0); + } + } -void dsaXDeviceSynchronizeCuda() { - cudaDeviceSynchronize(); +void dsaXmallocCuda(void *array, size_t array_length){ + + // for correlator + //cudaMalloc((void 
**)(&d->d_input), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams); + //cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams); + //cudaMalloc((void **)(&d->d_i), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams); + } void dsaXmemcpyCuda(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind, int stream){ cudaError error = cudaSuccess; cudaStream_t str = get_stream(stream); + + cout << "kind = " << dsaXMemcpyHostToHost << endl; switch(kind) { case dsaXMemcpyHostToHost: @@ -384,6 +431,12 @@ void dsaXmemcpyCuda(void *array_out, void *array_in, size_t n, dsaXMemcpyKind ki default: std::cout << "dsaX error: unknown dsaXMemcpyKind" << std::endl; } - if(error != cudaSuccess) cudaGetLastError(); + + if(error != cudaSuccess) { + const char *string = cudaGetErrorString(error); + //cudaGetLastError(); + //cudaGetErrorString(&string); + printf("dsaXmemcpyCuda failed with error %s\n", string); + exit(0); + } } - diff --git a/src/dsaX_api.cu b/src/dsaX_api.cu new file mode 100644 index 0000000..8f26a49 --- /dev/null +++ b/src/dsaX_api.cu @@ -0,0 +1,43 @@ + + + +void qudaMemcpy_(void *dst, const void *src, size_t count, qudaMemcpyKind kind, const char *func, const char *file, + const char *line) + { + if (count == 0) return; + QudaMem copy(dst, src, count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func, file, line); + } + + +void dsaMemcpyAsync_(void *dst, const void *src, size_t count, dsaMemcpyKind kind, const qudaStream_t &stream, + const char *func, const char *file, const char *line) + { + if (count == 0) return; + + if (kind == qudaMemcpyDeviceToDevice) { + QudaMem copy(dst, src, count, qudaMemcpyKindToAPI(kind), stream, true, func, file, line); + } else { +#ifdef USE_DRIVER_API + switch (kind) { + case qudaMemcpyDeviceToHost: + PROFILE(cuMemcpyDtoHAsync(dst, (CUdeviceptr)src, count, get_stream(stream)), QUDA_PROFILE_MEMCPY_D2H_ASYNC); + break; + case 
qudaMemcpyHostToDevice: + PROFILE(cuMemcpyHtoDAsync((CUdeviceptr)dst, src, count, get_stream(stream)), QUDA_PROFILE_MEMCPY_H2D_ASYNC); + break; + case qudaMemcpyDeviceToDevice: + PROFILE(cuMemcpyDtoDAsync((CUdeviceptr)dst, (CUdeviceptr)src, count, get_stream(stream)), + QUDA_PROFILE_MEMCPY_D2D_ASYNC); + break; + case qudaMemcpyDefault: + PROFILE(cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, count, get_stream(stream)), + QUDA_PROFILE_MEMCPY_DEFAULT_ASYNC); + break; + default: errorQuda("Unsupported cuMemcpyTypeAsync %d", kind); + } +#else + PROFILE(cudaMemcpyAsync(dst, src, count, qudaMemcpyKindToAPI(kind), get_stream(stream)), + kind == qudaMemcpyDeviceToHost ? QUDA_PROFILE_MEMCPY_D2H_ASYNC : QUDA_PROFILE_MEMCPY_H2D_ASYNC); +#endif + } + } diff --git a/src/dsaX_ptr.cpp b/src/dsaX_ptr.cpp new file mode 100644 index 0000000..702654d --- /dev/null +++ b/src/dsaX_ptr.cpp @@ -0,0 +1,155 @@ +#include +#include "dsaX_ptr.h" + +dsaX_ptr::dsaX_ptr(dsaXMemoryType type, size_t size, bool pool) : type(type), size(size), pool(pool) { + if (pool && (type != DSA_MEMORY_DEVICE && type != DSA_MEMORY_HOST_PINNED && type != DSA_MEMORY_HOST)) { + printf("dsaX ERROR: Memory pool not available for memory type %d", type); + exit(0); + } + + if (size > 0) { + switch (type) { + case DSA_MEMORY_DEVICE: device = pool ? pool_device_malloc(size) : device_malloc(size); break; + case DSA_MEMORY_DEVICE_PINNED: device = device_pinned_malloc(size); break; + case DSA_MEMORY_HOST: host = safe_malloc(size); break; + case DSA_MEMORY_HOST_PINNED: host = pool ? 
pool_pinned_malloc(size) : pinned_malloc(size); break; + case DSA_MEMORY_MAPPED: + host = mapped_malloc(size); + device = get_mapped_device_pointer(host); + break; + case DSA_MEMORY_MANAGED: + host = managed_malloc(size); + device = host; + break; + default: + printf("dsaX ERROR: Unknown memory type %d", type); + exit(0); + } + } +} + +dsaX_ptr::dsaX_ptr(void *ptr, dsaXMemoryType type) : type(type), reference(true) { + switch (type) { + case DSA_MEMORY_DEVICE: + case DSA_MEMORY_DEVICE_PINNED: + device = ptr; + host = nullptr; + break; + case DSA_MEMORY_HOST: + case DSA_MEMORY_HOST_PINNED: + device = nullptr; + host = ptr; + break; + case DSA_MEMORY_MANAGED: + device = ptr; + host = ptr; + break; + default: + printf("dsaX ERROR: Unsupported memory type %d", type); + exit(0); + } +} + +dsaX_ptr &dsaX_ptr::operator=(dsaX_ptr &&other) { + if (&other != this) { + if (size > 0) { + printf("dsaX ERROR: Cannot move to already initialized dsaX_ptr"); + } + type = std::exchange(other.type, DSA_MEMORY_INVALID); + size = std::exchange(other.size, 0); + pool = std::exchange(other.pool, false); + device = std::exchange(other.device, nullptr); + host = std::exchange(other.host, nullptr); + } + return *this; +} + +void dsaX_ptr::destroy() { + if (size > 0) { + switch (type) { + case DSA_MEMORY_DEVICE: pool ? pool_device_free(device) : device_free(device); break; + case DSA_MEMORY_DEVICE_PINNED: device_pinned_free(device); break; + case DSA_MEMORY_HOST: host_free(host); break; + case DSA_MEMORY_HOST_PINNED: pool ? 
pool_pinned_free(host) : host_free(host); break; + case DSA_MEMORY_MAPPED: host_free(host); break; + default: + printf("Unknown memory type %d", type); + exit(0); + } + } + + size = 0; + device = nullptr; + host = nullptr; +} + +dsaX_ptr::~dsaX_ptr() { + destroy(); +} + +void dsaX_ptr::exchange(dsaX_ptr &obj, dsaX_ptr &&new_value) { + destroy(); + *this = std::move(obj); + obj = std::move(new_value); +} + +bool dsaX_ptr::is_device() const { + switch (type) { + case DSA_MEMORY_DEVICE: + case DSA_MEMORY_DEVICE_PINNED: + case DSA_MEMORY_MAPPED: + case DSA_MEMORY_MANAGED: return true; + default: return false; + } +} + +bool dsaX_ptr::is_host() const { + switch (type) { + case DSA_MEMORY_HOST: + case DSA_MEMORY_HOST_PINNED: + case DSA_MEMORY_MANAGED: return true; + default: return false; + } +} + +void *dsaX_ptr::data() const { + void *ptr = nullptr; + + switch (type) { + case DSA_MEMORY_DEVICE: + case DSA_MEMORY_DEVICE_PINNED: + case DSA_MEMORY_MAPPED: + case DSA_MEMORY_MANAGED: ptr = device; break; + case DSA_MEMORY_HOST: + case DSA_MEMORY_HOST_PINNED: ptr = host; break; + default: + printf("Unknown memory type %d", type); + exit(0); + } + + return ptr; +} + +void *dsaX_ptr::data_device() const { + if (!device) { + printf("dsaX ERROR: Device view not defined"); + exit(0); + } + return device; +} + +void *dsaX_ptr::data_host() const { + if (!host) { + printf("dsaX ERROR: Host view not defined"); + exit(0); + } + return host; +} + +bool dsaX_ptr::is_reference() const { return reference; } + +std::ostream &operator<<(std::ostream &output, const dsaX_ptr &ptr) { + output << "{type = " << ptr.type << ", size = " << ptr.size << ", pool = " << ptr.pool + << ", device = " << ptr.device << ", host = " << ptr.host << ", reference = " << ptr.reference << "}"; + return output; +} diff --git a/src/interface.cpp b/src/interface.cpp index 31dc832..41e7caf 100644 --- a/src/interface.cpp +++ b/src/interface.cpp @@ -44,6 +44,9 @@ void dsaXEnd() { // output metrics cout << "dsaX 
lifetime = " << (1.0*app_timer.elapsed().count())/(1e6) << endl; cout << "dsaX init = " << (1.0*init_timer.elapsed().count())/(1e6) << endl; +#if DSA_XENGINE_TARGET_CUDA + dsaXDestroyCuda(); +#endif } void *dsaXHostRegister(size_t size) { diff --git a/src/malloc.cu b/src/malloc.cu new file mode 100644 index 0000000..55bad3f --- /dev/null +++ b/src/malloc.cu @@ -0,0 +1,631 @@ +#include "dsaX_malloc.h" + +#include "cuda_headers.h" +//#include "cuda_interface.h" +//#include "cuda_kernels.h" +//#include "cuda_handles.h" +// DMH: Everything in this file is CUDA aware. + +enum AllocType { DEVICE, DEVICE_PINNED, HOST, PINNED, MAPPED, MANAGED, SHMEM, N_ALLOC_TYPE }; + +class MemAlloc +{ + +public: + std::string func; + std::string file; + int line; + size_t size; + size_t base_size; + + MemAlloc() : line(-1), size(0), base_size(0) {} + + MemAlloc(std::string func, std::string file, int line) : func(func), file(file), line(line), size(0), base_size(0) + { + } + + MemAlloc(const MemAlloc &) = default; + MemAlloc(MemAlloc &&) = default; + virtual ~MemAlloc() = default; + MemAlloc &operator=(const MemAlloc &) = default; + MemAlloc &operator=(MemAlloc &&) = default; +}; + +static std::map alloc[N_ALLOC_TYPE]; +static size_t total_bytes[N_ALLOC_TYPE] = {0}; +static size_t max_total_bytes[N_ALLOC_TYPE] = {0}; +static size_t total_host_bytes, max_total_host_bytes; +static size_t total_pinned_bytes, max_total_pinned_bytes; + +size_t device_allocated() { return total_bytes[DEVICE]; } + +size_t pinned_allocated() { return total_bytes[PINNED]; } + +size_t mapped_allocated() { return total_bytes[MAPPED]; } + +size_t managed_allocated() { return total_bytes[MANAGED]; } + +size_t host_allocated() { return total_bytes[HOST]; } + +size_t device_allocated_peak() { return max_total_bytes[DEVICE]; } + +size_t pinned_allocated_peak() { return max_total_bytes[PINNED]; } + +size_t mapped_allocated_peak() { return max_total_bytes[MAPPED]; } + +size_t managed_allocated_peak() { return 
max_total_bytes[MANAGED]; } + +size_t host_allocated_peak() { return max_total_bytes[HOST]; } + +static void print_trace(void) +{ + void *array[10]; + size_t size; + char **strings; + size = backtrace(array, 10); + strings = backtrace_symbols(array, size); + printf("Obtained %zd stack frames.\n", size); + for (size_t i = 0; i < size; i++) printf("%s\n", strings[i]); + free(strings); +} + +static void print_alloc_header() +{ + printf("Type Pointer Size Location\n"); + printf("----------------------------------------------------------\n"); +} + +static void print_alloc(AllocType type) +{ + const char *type_str[] = {"Device", "Device Pinned", "Host ", "Pinned", "Mapped", "Managed", "Shmem "}; + + for (auto entry : alloc[type]) { + void *ptr = entry.first; + MemAlloc a = entry.second; + printf("%s %15p %15lu %s(), %s:%d\n", type_str[type], ptr, (unsigned long)a.base_size, a.func.c_str(), + a.file.c_str(), a.line); + } +} + +static void track_malloc(const AllocType &type, const MemAlloc &a, void *ptr) +{ + total_bytes[type] += a.base_size; + if (total_bytes[type] > max_total_bytes[type]) { max_total_bytes[type] = total_bytes[type]; } + if (type != DEVICE && type != DEVICE_PINNED && type != SHMEM) { + total_host_bytes += a.base_size; + if (total_host_bytes > max_total_host_bytes) { max_total_host_bytes = total_host_bytes; } + } + if (type == PINNED || type == MAPPED) { + total_pinned_bytes += a.base_size; + if (total_pinned_bytes > max_total_pinned_bytes) { max_total_pinned_bytes = total_pinned_bytes; } + } + alloc[type][ptr] = a; +} + +static void track_free(const AllocType &type, void *ptr) +{ + size_t size = alloc[type][ptr].base_size; + total_bytes[type] -= size; + if (type != DEVICE && type != DEVICE_PINNED && type != SHMEM) { total_host_bytes -= size; } + if (type == PINNED || type == MAPPED) { total_pinned_bytes -= size; } + alloc[type].erase(ptr); +} + +void *get_mapped_device_pointer_(const char *func, const char *file, int line, const void *host) +{ + void 
*device; + auto error = cudaHostGetDevicePointer(&device, const_cast(host), 0); + if (error != cudaSuccess) { + printf("dsaX ERROR: cudaHostGetDevicePointer failed with error %s (%s:%d in %s()", cudaGetErrorString(error), file, line, + func); + } + return device; +} + +bool use_managed_memory() { + static bool managed = false; + static bool init = false; + + if (!init) { + char *enable_managed_memory = getenv("QUDA_ENABLE_MANAGED_MEMORY"); + if (enable_managed_memory && strcmp(enable_managed_memory, "1") == 0) { + printf("dsaX ERROR: Using managed memory for CUDA allocations"); + managed = true; + + //if (!device::managed_memory_supported()) printf("dsaX WARNING: Target device does not report supporting managed memory"); + } + + init = true; + } + + return managed; +} + +/** + * Free device memory allocated with device_malloc(). This function + * should only be called via the device_free() macro, defined in + * malloc_quda.h + */ +void managed_free_(const char *func, const char *file, int line, void *ptr) { + if (!ptr) { + printf("dsaX ERROR: Attempt to free NULL managed pointer (%s:%d in %s())\n", file, line, func); + exit(0); + } + if (!alloc[MANAGED].count(ptr)) { + printf("dsaX ERROR: Attempt to free invalid managed pointer (%s:%d in %s())\n", file, line, func); + exit(0); + } + cudaError_t err = cudaFree(ptr); + if (err != cudaSuccess) { + printf("dsaX ERROR: Failed to free device memory (%s:%d in %s())\n", file, line, func); + exit(0); + } + track_free(MANAGED, ptr); +} + + +/** + * Free host memory allocated with safe_malloc(), pinned_malloc(), + * or mapped_malloc(). 
This function should only be called via the + * host_free() macro, defined in dsaX_malloc.h + */ +void host_free_(const char *func, const char *file, int line, void *ptr) { + if (!ptr) { + printf("dsaX ERROR: Attempt to free NULL host pointer (%s:%d in %s())\n", file, line, func); + exit(0); + } + if (alloc[HOST].count(ptr)) { + track_free(HOST, ptr); + free(ptr); + } else if (alloc[PINNED].count(ptr)) { + cudaError_t err = cudaHostUnregister(ptr); + if (err != cudaSuccess) { + printf("dsaX ERROR: Failed to unregister pinned memory (%s:%d in %s())\n", file, line, func); + exit(0); + } + track_free(PINNED, ptr); + free(ptr); + } else if (alloc[MAPPED].count(ptr)) { +#ifdef HOST_ALLOC + cudaError_t err = cudaFreeHost(ptr); + if (err != cudaSuccess) { + printf("dsaX ERROR: Failed to free host memory (%s:%d in %s())\n", file, line, func); + exit(0); + } + track_free(MAPPED, ptr); +#else + cudaError_t err = cudaHostUnregister(ptr); + if (err != cudaSuccess) { + printf("dsaX ERROR: Failed to unregister host-mapped memory (%s:%d in %s())\n", file, line, func); + exit(0); + } + track_free(MAPPED, ptr); + free(ptr); +#endif + } else { + printf("dsaX ERROR: Attempt to free invalid host pointer (%s:%d in %s())\n", file, line, func); + print_trace(); + printf("dsaX ERROR: Aborting"); + exit(0); + } +} + + +/** + * Perform a standard cudaMalloc() with error-checking. This + * function should only be called via the device_malloc() macro, + * defined in dsaX_malloc.h + */ +void *device_malloc_(const char *func, const char *file, int line, size_t size) { + + if (use_managed_memory()) return managed_malloc_(func, file, line, size); + + MemAlloc a(func, file, line); + void *ptr; + + a.size = a.base_size = size; + + cudaError_t err = cudaMalloc(&ptr, size); + if (err != cudaSuccess) { + printf("dsaX ERROR: Failed to allocate device memory of size %zu (%s:%d in %s())\n", size, file, line, func); + exit(0); + } + + // DMH: GET ON THIS! 
+ //if (is_prefetch_enabled()) dsaXMemPrefetchAsync(ptr, size, DSA_CUDA_FIELD_LOCATION, get_default_stream()); + track_malloc(DEVICE, a, ptr); +#ifdef HOST_DEBUG + cudaMemset(ptr, 0xff, size); +#endif + return ptr; +} + +/** + * Free device memory allocated with device_malloc(). This function + * should only be called via the device_free() macro, defined in + * dsaX_malloc.h + */ +void device_free_(const char *func, const char *file, int line, void *ptr) { + + if (use_managed_memory()) { + managed_free_(func, file, line, ptr); + return; + } + + if (!ptr) { + printf("dsaX ERROR: Attempt to free NULL device pointer (%s:%d in %s())\n", file, line, func); + exit(0); + } + if (!alloc[DEVICE].count(ptr)) { + printf("dsaX ERROR: Attempt to free invalid device pointer (%s:%d in %s())\n", file, line, func); + exit(0); + } + + cudaError_t err = cudaFree(ptr); + if (err != cudaSuccess) { + printf("dsaX ERROR: Failed to free device memory (%s:%d in %s())\n", file, line, func); + exit(0); + } + + track_free(DEVICE, ptr); +} + +/** + * Free device memory allocated with device_pinned malloc(). This + * function should only be called via the device_pinned_free() + * macro, defined in dsaX_malloc.h + */ +void device_pinned_free_(const char *func, const char *file, int line, void *ptr) { + + //DMH: I would think that we will always be using hardware with + // compute >= 2.0, but this can be implemeneted later if needed. 
+ //if (!comm_peer2peer_present()) { + //device_free_(func, file, line, ptr); + //return; + //} + + if (!ptr) { + printf("dsaX ERROR: Attempt to free NULL device pointer (%s:%d in %s())\n", file, line, func); + exit(0); + } + if (!alloc[DEVICE_PINNED].count(ptr)) { + printf("dsaX ERROR: Attempt to free invalid device pointer (%s:%d in %s())\n", file, line, func); + exit(0); + } + CUresult err = cuMemFree((CUdeviceptr)ptr); + if (err != CUDA_SUCCESS) { + printf("dsaX ERROR: Failed to free device memory (%s:%d in %s())\n", file, line, func); + exit(0); + } + track_free(DEVICE_PINNED, ptr); +} + + +/** + * Under CUDA 4.0, cudaHostRegister seems to require that both the + * beginning and end of the buffer be aligned on page boundaries. + * This local function takes care of the alignment and gets called + * by pinned_malloc_() and mapped_malloc_() + */ +static void *aligned_malloc(MemAlloc &a, size_t size) { + void *ptr = nullptr; + + a.size = size; + + // we need to manually align to page boundaries to allow us to bind a texture to mapped memory + static int page_size = 2 * getpagesize(); + a.base_size = ((size + page_size - 1) / page_size) * page_size; // round up to the nearest multiple of page_size + int align = posix_memalign(&ptr, page_size, a.base_size); + if (!ptr || align != 0) { + printf("Failed to allocate aligned host memory of size %zu (%s:%d in %s())\n", size, a.file.c_str(), a.line, + a.func.c_str()); + exit(0); + } + return ptr; +} + +/** + * Perform a standard malloc() with error-checking. 
This function + * should only be called via the safe_malloc() macro, defined in + * malloc_quda.h + */ +void *safe_malloc_(const char *func, const char *file, int line, size_t size) { + + MemAlloc a(func, file, line); + a.size = a.base_size = size; + + void *ptr = malloc(size); + if (!ptr) { + printf("dsaX ERROR: Failed to allocate host memory of size %zu (%s:%d in %s())\n", size, file, line, func); + exit(0); + } + track_malloc(HOST, a, ptr); +#ifdef HOST_DEBUG + memset(ptr, 0xff, size); +#endif + return ptr; +} + +/** + * Allocate page-locked ("pinned") host memory, and map it into the + * GPU address space. This function should only be called via the + * mapped_malloc() macro, defined in malloc_quda.h + */ +void *mapped_malloc_(const char *func, const char *file, int line, size_t size) { + + MemAlloc a(func, file, line); + + void *ptr = aligned_malloc(a, size); + cudaError_t err = cudaHostRegister(ptr, a.base_size, cudaHostRegisterMapped | cudaHostRegisterPortable); + if (err != cudaSuccess) { + printf("dsaX ERROR: Failed to register host-mapped memory of size %zu (%s:%d in %s())\n", size, file, line, func); + exit(0); + } + track_malloc(MAPPED, a, ptr); +#ifdef HOST_DEBUG + memset(ptr, 0xff, a.base_size); +#endif + return ptr; +} + +/** + * Perform a standard cudaMallocManaged() with error-checking. This + * function should only be called via the managed_malloc() macro, + * defined in dsaX_malloc.h + */ +void *managed_malloc_(const char *func, const char *file, int line, size_t size) { + + MemAlloc a(func, file, line); + void *ptr; + + a.size = a.base_size = size; + + cudaError_t err = cudaMallocManaged(&ptr, size); + if (err != cudaSuccess) { + printf("dsaX ERROR: Failed to allocate managed memory of size %zu (%s:%d in %s())\n", size, file, line, func); + exit(0); + } + track_malloc(MANAGED, a, ptr); +#ifdef HOST_DEBUG + cudaMemset(ptr, 0xff, size); +#endif + return ptr; +} + + +/** + * Perform a cuMemAlloc with error-checking. 
This function is to + * guarantee a unique memory allocation on the device. This + * should only be called via the device_pinned_malloc() macro, + * defined in dsaX_malloc.h. + */ +void *device_pinned_malloc_(const char *func, const char *file, int line, size_t size) { + + //DMH: I would think that we will always be using hardware with + // compute >= 2.0, but this can be implemeneted later if needed. + //if (!comm_peer2peer_present()) return device_malloc_(func, file, line, size); + + MemAlloc a(func, file, line); + void *ptr; + + a.size = a.base_size = size; + + CUresult err = cuMemAlloc((CUdeviceptr *)&ptr, size); + if (err != CUDA_SUCCESS) { + printf("Failed to allocate device memory of size %zu (%s:%d in %s())\n", size, file, line, func); + exit(0); + } + track_malloc(DEVICE_PINNED, a, ptr); +#ifdef HOST_DEBUG + cudaMemset(ptr, 0xff, size); +#endif + return ptr; +} + + +/** + * Allocate page-locked ("pinned") host memory. This function + * should only be called via the pinned_malloc() macro, defined in + * dsaX_malloc.h + * + * Note that we do not rely on cudaHostAlloc(), since buffers + * allocated in this way have been observed to cause problems when + * shared with MPI via GPU Direct on some systems. + */ +void *pinned_malloc_(const char *func, const char *file, int line, size_t size) { + + MemAlloc a(func, file, line); + void *ptr = aligned_malloc(a, size); + + cudaError_t err = cudaHostRegister(ptr, a.base_size, cudaHostRegisterDefault); + if (err != cudaSuccess) { + printf("Failed to register pinned memory of size %zu (%s:%d in %s())\n", size, file, line, func); + exit(0); + } + track_malloc(PINNED, a, ptr); +#ifdef HOST_DEBUG + memset(ptr, 0xff, a.base_size); +#endif + return ptr; +} + +namespace mem_pool { + + /** Cache of inactive pinned-memory allocations. We cache pinned + memory allocations so that fields can reuse these with minimal + overhead. + */ + static std::multimap pinnedCache; + + /** Sizes of active pinned-memory allocations. 
For convenience, + we keep track of the sizes of active allocations (i.e., those not + in the cache). + */ + static std::map pinnedSize; + + /** Cache of inactive device-memory allocations. We cache pinned + memory allocations so that fields can reuse these with minimal + overhead. + */ + static std::multimap deviceCache; + + /** Sizes of active device-memory allocations. For convenience, + we keep track of the sizes of active allocations (i.e., those not + in the cache). + */ + static std::map deviceSize; + + static bool pool_init = false; + + /** whether to use a memory pool allocator for device memory */ + static bool device_memory_pool = true; + + /** whether to use a memory pool allocator for pinned memory */ + static bool pinned_memory_pool = true; + + void init() { + if (!pool_init) { + // device memory pool + char *enable_device_pool = getenv("DSAX_ENABLE_DEVICE_MEMORY_POOL"); + if (!enable_device_pool || strcmp(enable_device_pool, "0") != 0) { + printf("dsaX Warning: Using device memory pool allocator"); + device_memory_pool = true; + } else { + printf("dsaX Warning: Not using device memory pool allocator"); + device_memory_pool = false; + } + + // pinned memory pool + char *enable_pinned_pool = getenv("DSAX_ENABLE_PINNED_MEMORY_POOL"); + if (!enable_pinned_pool || strcmp(enable_pinned_pool, "0") != 0) { + printf("dsaX Warning: Using pinned memory pool allocator"); + pinned_memory_pool = true; + } else { + printf("dsaX Warning: Not using pinned memory pool allocator"); + pinned_memory_pool = false; + } + pool_init = true; + } + } + void *pinned_malloc_(const char *func, const char *file, int line, size_t nbytes) { + void *ptr = nullptr; + if (pinned_memory_pool) { + if (pinnedCache.empty()) { + ptr = pinned_malloc_(func, file, line, nbytes); + } else { + auto it = pinnedCache.lower_bound(nbytes); + if (it != pinnedCache.end()) { // sufficiently large allocation found + nbytes = it->first; + ptr = it->second; + pinnedCache.erase(it); + } else { // sacrifice 
the smallest cached allocation + it = pinnedCache.begin(); + ptr = it->second; + pinnedCache.erase(it); + host_free(ptr); + ptr = pinned_malloc_(func, file, line, nbytes); + } + } + pinnedSize[ptr] = nbytes; + } else { + ptr = pinned_malloc_(func, file, line, nbytes); + } + return ptr; + } + + void pinned_free_(const char *func, const char *file, int line, void *ptr) { + if (pinned_memory_pool) { + if (!pinnedSize.count(ptr)) { + printf("dsaX Error: Attempt to free invalid pointer"); + exit(0); + } + pinnedCache.insert(std::make_pair(pinnedSize[ptr], ptr)); + pinnedSize.erase(ptr); + } else { + host_free_(func, file, line, ptr); + } + } + + void *device_malloc_(const char *func, const char *file, int line, size_t nbytes) { + void *ptr = nullptr; + if (device_memory_pool) { + if (deviceCache.empty()) { + ptr = device_malloc_(func, file, line, nbytes); + } else { + auto it = deviceCache.lower_bound(nbytes); + if (it != deviceCache.end()) { // sufficiently large allocation found + nbytes = it->first; + ptr = it->second; + deviceCache.erase(it); + } else { // sacrifice the smallest cached allocation + it = deviceCache.begin(); + ptr = it->second; + deviceCache.erase(it); + device_free_(func, file, line, ptr); + ptr = device_malloc_(func, file, line, nbytes); + } + } + deviceSize[ptr] = nbytes; + } else { + ptr = device_malloc_(func, file, line, nbytes); + } + return ptr; + } + + /** + * Free device memory allocated with device_pinned malloc(). 
This + * function should only be called via the device_pinned_free() + * macro, defined in malloc_quda.h + */ + void device_pinned_free_(const char *func, const char *file, int line, void *ptr) { + //DMH: I would think that we will always be using hardware with + // compute >= 2.0, but this can be implemeneted later if needed + //if (!comm_peer2peer_present()) { + //device_free_(func, file, line, ptr); + //return; + //} + + if (!ptr) { + printf("dsaX ERROR: Attempt to free NULL device pointer (%s:%d in %s())\n", file, line, func); + exit(0); + } + if (!alloc[DEVICE_PINNED].count(ptr)) { + printf("dsaX ERROR: Attempt to free invalid device pointer (%s:%d in %s())\n", file, line, func); + exit(0); + } + CUresult err = cuMemFree((CUdeviceptr)ptr); + if (err != CUDA_SUCCESS) { + printf("dsaX ERROR: Failed to free device memory (%s:%d in %s())\n", file, line, func); + exit(0); + } + track_free(DEVICE_PINNED, ptr); + } + + + void device_free_(const char *func, const char *file, int line, void *ptr) { + if (device_memory_pool) { + if (!deviceSize.count(ptr)) { + printf("dsaX Error: Attempt to free invalid pointer"); + exit(0); + } + deviceCache.insert(std::make_pair(deviceSize[ptr], ptr)); + deviceSize.erase(ptr); + } else { + device_free_(func, file, line, ptr); + } + } + + void flush_pinned() { + if (pinned_memory_pool) { + for (auto it : pinnedCache) { host_free(it.second); } + pinnedCache.clear(); + } + } + + void flush_device() { + if (device_memory_pool) { + for (auto it : deviceCache) { device_free(it.second); } + deviceCache.clear(); + } + } +} // namespace pool diff --git a/tests/correlator_test.cpp b/tests/correlator_test.cpp index 6f8d6df..3cdc699 100644 --- a/tests/correlator_test.cpp +++ b/tests/correlator_test.cpp @@ -13,6 +13,9 @@ using namespace std; // Include this file to access input parameters #include "command_line_params.h" +// Include the dsaX.h header in your application +#include + // Include this file to access test utilities /** * Promote 
complex char riri... data to planar half rr.. ii.. @@ -110,9 +113,7 @@ template prec test_hermiticity(const prec *C, const int m, const return frob_norm/(m*n*2); } -// Include the dsaX.h header in your application -#include - +/* // The class offers entire file content read/write in single operation class BinaryFileVector : public vector { @@ -180,7 +181,7 @@ class BinaryFileVector : public vector } } }; - +*/ int main(int argc, char **argv) { // Parse command line @@ -297,7 +298,6 @@ int main(int argc, char **argv) { float frob_norm = test_hermiticity((float*)output_data, 96, 96); cout << "Frobenius norm = " << frob_norm << endl; - //cout << "Output peek " << endl; float *p = (float*)output_data;