From 6fee8e2b2ff99fd8717932d7be3ef8d783feb78b Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth26@gmail.com>
Date: Tue, 4 Jun 2024 18:04:07 -0700
Subject: [PATCH 01/30] Added some CMake functionality, missing XGPU, sigproc,
 and fitsio dependencies

---
 CMakeLists.txt                 | 125 +++++++++++++++++++++++++++++++++
 src/CMakeLists.txt             | 119 +++++++++++++++++++++++++++++++
 src/Makefile                   |   1 -
 src/cuda_correlator.cu         |   2 +-
 src/dsaX_beamformer.cu         |   9 ++-
 src/dsaX_beamformer_offline.cu |   4 +-
 src/dsaX_beamformer_passon.cu  |  10 +--
 src/dsaX_bfCorr.cu             |   4 +-
 src/dsaX_capture.c             |   4 +-
 src/dsaX_capture_manythread.c  |   6 +-
 src/dsaX_capture_thread.c      |   8 +--
 src/dsaX_copydb.c              |   6 +-
 src/dsaX_fake.c                |   8 +--
 src/dsaX_filTrigger.c          |  16 ++---
 src/dsaX_fluff.c               |   6 +-
 src/dsaX_merge.c               |   8 +--
 src/dsaX_nicdb.c               |   4 +-
 src/dsaX_reorder.c             |   6 +-
 src/dsaX_reorder_raw.c         |  11 +--
 src/dsaX_simplesplit.c         |   8 +--
 src/dsaX_split.c               |  10 +--
 src/dsaX_splitup.c             |   6 +-
 src/dsaX_store.c               |   6 +-
 src/dsaX_trigger.c             |  18 ++---
 src/dumpfil.c                  |   4 +-
 src/fil2dada.c                 |  33 +++++----
 src/test_read.c                |   2 +-
 src/test_write.c               |   9 ++-
 utils/packet.out               | Bin 4608 -> 4608 bytes
 29 files changed, 357 insertions(+), 96 deletions(-)
 create mode 100644 CMakeLists.txt
 create mode 100644 src/CMakeLists.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..66682b6
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,125 @@
+####################################################################################
+# START 1. Basic setup for cmake
+####################################################################################
+# Require CMake 3.18 or newer
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+
+if(POLICY CMP0074)
+  cmake_policy(SET CMP0074 NEW)
+endif()
+
+set(CMAKE_INCLUDE_CURRENT_DIR ON)
+set(CMAKE_INCLUDE_DIRECTORIES_PROJECT_BEFORE ON)
+set(CMAKE_COLOR_MAKEFILE ON)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+# Enable GNU extensions for C++ (note: the value is ON, extensions are NOT disabled)
+set(CMAKE_CXX_EXTENSIONS ON)
+
+# Define the project and enable the C++, CUDA and C languages
+project("DSA_XENGINE" VERSION 1.0.0 LANGUAGES CXX CUDA C)
+
+# DSA_XENGINE currently builds for CUDA only. Future versions may be written
+# for HIP or SYCL; the backend in use is called the "target type".
+# The default target type is CUDA unless DSA_XENGINE_TARGET is set in the environment.
+set(VALID_TARGET_TYPES CUDA) #HIP SYCL
+
+set(DEFTARGET "CUDA")
+if(DEFINED ENV{DSA_XENGINE_TARGET})
+  set(DEFTARGET $ENV{DSA_XENGINE_TARGET})
+endif()
+
+set(DSA_XENGINE_TARGET_TYPE "${DEFTARGET}"
+  CACHE STRING
+  "Choose the type of target, options are: ${VALID_TARGET_TYPES}")
+set_property(CACHE DSA_XENGINE_TARGET_TYPE PROPERTY STRINGS ${VALID_TARGET_TYPES})
+
+# CUDA specific part of CMakeLists
+find_package(CUDAToolkit REQUIRED)
+
+# Default architecture: DSA_XENGINE_GPU_ARCH from the environment, else sm_70.
+if(DEFINED ENV{DSA_XENGINE_GPU_ARCH})
+  set(DSA_XENGINE_DEFAULT_GPU_ARCH "$ENV{DSA_XENGINE_GPU_ARCH}")
+else()
+  set(DSA_XENGINE_DEFAULT_GPU_ARCH sm_70)
+endif()
+set(DSA_XENGINE_GPU_ARCH "${DSA_XENGINE_DEFAULT_GPU_ARCH}"
+  CACHE STRING "set the GPU architecture (sm_60, sm_70, sm_80, sm_90)")
+set_property(CACHE DSA_XENGINE_GPU_ARCH PROPERTY STRINGS sm_60 sm_70 sm_80 sm_90)
+set(DSA_XENGINE_GPU_ARCH_SUFFIX ""
+  CACHE STRING "set the GPU architecture suffix (virtual, real). Leave empty for no suffix.")
+set_property(CACHE DSA_XENGINE_GPU_ARCH_SUFFIX PROPERTY STRINGS "real" "virtual" " ")
+mark_as_advanced(DSA_XENGINE_GPU_ARCH_SUFFIX CMAKE_CUDA_ARCHITECTURES)
+message(STATUS "Building DSA_XENGINE for GPU ARCH ${DSA_XENGINE_GPU_ARCH}")
+
+# CMAKE_CUDA_ARCHITECTURES takes bare numbers ("70"), not "sm_70": strip the prefix.
+string(REGEX REPLACE "^sm_" "" DSA_XENGINE_COMPUTE_CAPABILITY "${DSA_XENGINE_GPU_ARCH}")
+set(CMAKE_CUDA_ARCHITECTURES "${DSA_XENGINE_COMPUTE_CAPABILITY}")
+if(DSA_XENGINE_GPU_ARCH_SUFFIX MATCHES "^(real|virtual)$")
+  string(APPEND CMAKE_CUDA_ARCHITECTURES "-${DSA_XENGINE_GPU_ARCH_SUFFIX}")
+endif()
+
+# Validate the requested target type (case-insensitively) against VALID_TARGET_TYPES.
+string(TOUPPER "${DSA_XENGINE_TARGET_TYPE}" CHECK_TARGET_TYPE)
+list(FIND VALID_TARGET_TYPES "${CHECK_TARGET_TYPE}" TARGET_TYPE_VALID)
+if(TARGET_TYPE_VALID LESS 0)
+  message(SEND_ERROR "Please specify a valid DSA_XENGINE_TARGET_TYPE type! Valid target types are: ${VALID_TARGET_TYPES}")
+endif()
+
+# Git: record tag / commit-count / version strings when building from a git checkout
+find_package(Git)
+if(GIT_FOUND)
+  execute_process(
+    COMMAND "${GIT_EXECUTABLE}" show
+    WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
+    RESULT_VARIABLE IS_GIT_REPOSITORY
+    OUTPUT_QUIET ERROR_QUIET)
+  if(IS_GIT_REPOSITORY EQUAL 0)
+    execute_process(
+      COMMAND "${GIT_EXECUTABLE}" describe --abbrev=0
+      WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
+      OUTPUT_VARIABLE GITTAG
+      OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET)
+    # we use git rev-list and pipe that through wc here. Newer git versions support --count as option to rev-list but
+    # that might not always be available
+    execute_process(
+      COMMAND "${GIT_EXECUTABLE}" rev-list ${GITTAG}..HEAD
+      COMMAND wc -l
+      WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
+      OUTPUT_VARIABLE GITCOUNT
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+    execute_process(
+      COMMAND "${GIT_EXECUTABLE}" describe --match 1 --always --long --dirty
+      WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
+      OUTPUT_VARIABLE GITVERSION
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+  endif()
+endif()
+
+# EXTERNALS
+include(FetchContent)
+# Get psrdada dependency (fetched over https: the unauthenticated git:// protocol is often blocked)
+option(DSA_XENGINE_DOWNLOAD_PSRDADA "Download and build PSRDada" ON)
+if(DSA_XENGINE_DOWNLOAD_PSRDADA)
+  FetchContent_Declare(
+    PSRDada
+    GIT_REPOSITORY https://git.code.sf.net/p/psrdada/code
+    GIT_TAG        008afa70393ae2df11efba0cc8d0b95cda599c02
+    )
+  FetchContent_MakeAvailable(PSRDada)
+endif()
+# Get XGPU dependency
+option(DSA_XENGINE_DOWNLOAD_XGPU "Download and build XGPU" ON)
+if(DSA_XENGINE_DOWNLOAD_XGPU)
+  FetchContent_Declare(
+    XGPU
+    GIT_REPOSITORY https://github.com/GPU-correlators/xGPU.git
+    GIT_TAG        7e85bd5da619c026e1bfbb64325ed122323b8854
+    )
+  FetchContent_MakeAvailable(XGPU)
+endif()
+
+
+# Add the src directory, whose CMakeLists.txt defines the DSA_XENGINE executables
+add_subdirectory(src)
+
+
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..1b0a548
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,119 @@
+# NOTE(review): linking by file path adds no build-order dependency on PSRDada; prefer its CMake target if it exports one.
+# FetchContent_MakeAvailable(PSRDada) defines <lowercaseName>_SOURCE_DIR / <lowercaseName>_BINARY_DIR.
+set(PSRDada_LIB ${psrdada_BINARY_DIR}/src/libpsrdada.so)
+include_directories(${psrdada_SOURCE_DIR}/src)
+
+# Test executables
+add_executable(test_write test_write.c)
+add_executable(test_read test_read.c)
+target_link_libraries(test_write PRIVATE ${PSRDada_LIB})
+target_link_libraries(test_read PRIVATE ${PSRDada_LIB})
+
+# Trigger executables
+add_executable(dsaX_trigger dsaX_trigger.c)
+add_executable(dsaX_filTrigger dsaX_filTrigger.c)
+target_link_libraries(dsaX_trigger PRIVATE ${PSRDada_LIB})
+target_link_libraries(dsaX_filTrigger PRIVATE ${PSRDada_LIB})
+
+# DMH: disabled for now -- these targets have a 'sigproc' dependency (low priority)
+if(FALSE)
+  add_executable(splice_offline_beams splice_offline_beams.c)
+  target_link_libraries(splice_offline_beams PRIVATE ${PSRDada_LIB})
+
+  add_executable(dsaX_writeFil dsaX_writeFil.c)
+  target_link_libraries(dsaX_writeFil PRIVATE ${PSRDada_LIB})
+
+  add_executable(dsaX_splice dsaX_splice.c)
+  target_link_libraries(dsaX_splice PRIVATE ${PSRDada_LIB})
+
+  add_executable(gpu_flagger gpu_flagger.cu)
+  target_link_libraries(gpu_flagger PRIVATE ${PSRDada_LIB})
+endif()
+
+add_executable(dsaX_store dsaX_store.c)
+target_link_libraries(dsaX_store PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_fluff dsaX_fluff.c)
+target_link_libraries(dsaX_fluff PRIVATE ${PSRDada_LIB})
+
+# DMH: dsaX_reorder is disabled because of an intrinsics compilation error
+#add_executable(dsaX_reorder dsaX_reorder.c)
+#target_link_libraries(dsaX_reorder ${PSRDada_LIB})
+
+# DMH: dsaX_nicdb builds with a -Woverflow warning in process() (dsaX_nicdb.c:145):
+#   integer overflow in expression of type 'int' results in '-1073741824'
+#   145 |   uint64_t shifty = (bdepth-1)*NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NCHAN_FIL;
+add_executable(dsaX_nicdb dsaX_nicdb.c)
+target_link_libraries(dsaX_nicdb PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_capture dsaX_capture.c)
+target_link_libraries(dsaX_capture PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_capture_thread dsaX_capture_thread.c)
+target_link_libraries(dsaX_capture_thread PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_capture_manythread dsaX_capture_manythread.c)
+target_link_libraries(dsaX_capture_manythread PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_split dsaX_split.c)
+target_link_libraries(dsaX_split PRIVATE ${PSRDada_LIB} m)
+
+add_executable(dsaX_merge dsaX_merge.c)
+target_link_libraries(dsaX_merge PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_simplesplit dsaX_simplesplit.c)
+target_link_libraries(dsaX_simplesplit PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_fake dsaX_fake.c)
+target_link_libraries(dsaX_fake PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_splitup dsaX_splitup.c)
+target_link_libraries(dsaX_splitup PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_copydb dsaX_copydb.c)
+target_link_libraries(dsaX_copydb PRIVATE ${PSRDada_LIB})
+
+# DMH: disabled for now -- dsaX_writevis has a fitsio dependency
+if(FALSE)
+  add_executable(dsaX_writevis dsaX_writevis.c)
+  target_link_libraries(dsaX_writevis PRIVATE ${PSRDada_LIB})
+endif()
+
+# DMH: disabled for now -- these targets have XGPU dependencies
+if(FALSE)
+  add_executable(dsaX_wrangle dsaX_wrangle.c)
+  target_link_libraries(dsaX_wrangle PRIVATE ${PSRDada_LIB})
+
+  add_executable(dsaX_testdada dsaX_testdada.c)
+  target_link_libraries(dsaX_testdada PRIVATE ${PSRDada_LIB})
+
+  # cuBLAS is linked through the CUDA::cublas imported target from find_package(CUDAToolkit)
+  add_executable(dsaX_bfCorr dsaX_bfCorr.cu)
+  target_link_libraries(dsaX_bfCorr PRIVATE CUDA::cublas ${PSRDada_LIB})
+
+  # DMH: Fix CUBE error
+  add_executable(dsaX_xgpu dsaX_xgpu.cu)
+  target_link_libraries(dsaX_xgpu PRIVATE ${PSRDada_LIB})
+
+  add_executable(cuda_correlator cuda_correlator.cu)
+  target_link_libraries(cuda_correlator PRIVATE CUDA::cublas ${PSRDada_LIB})
+endif()
+
+add_executable(dsaX_reorder_raw dsaX_reorder_raw.c)
+target_link_libraries(dsaX_reorder_raw PRIVATE ${PSRDada_LIB})
+
+add_executable(fil2dada fil2dada.c)
+target_link_libraries(fil2dada PRIVATE ${PSRDada_LIB})
+
+add_executable(dumpfil dumpfil.c)
+target_link_libraries(dumpfil PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_beamformer dsaX_beamformer.cu)
+target_link_libraries(dsaX_beamformer PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_beamformer_passon dsaX_beamformer_passon.cu)
+target_link_libraries(dsaX_beamformer_passon PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_beamformer_offline dsaX_beamformer_offline.cu)
+target_link_libraries(dsaX_beamformer_offline PRIVATE ${PSRDada_LIB})
+
diff --git a/src/Makefile b/src/Makefile
index bbca4e0..0de1991 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -63,7 +63,6 @@ dsaX_reorder.o: dsaX_reorder.c $(CDEPS1)
 dsaX_reorder: dsaX_reorder.o
 	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
 
-
 dsaX_dbnic.o: dsaX_dbnic.c $(CDEPS1)
 	$(CC) -c -o $@ $< $(CFLAGS1)
 
diff --git a/src/cuda_correlator.cu b/src/cuda_correlator.cu
index eb0882c..9d9e66d 100644
--- a/src/cuda_correlator.cu
+++ b/src/cuda_correlator.cu
@@ -36,7 +36,7 @@ using std::endl;
 #include "dada_affinity.h"
 #include "ascii_header.h"
 #include "dsaX_def.h"
-#include "cube/cube.h"
+//#include "cube/cube.h"
 #include "xgpu.h"
  
 
diff --git a/src/dsaX_beamformer.cu b/src/dsaX_beamformer.cu
index 5efcfca..afdda70 100644
--- a/src/dsaX_beamformer.cu
+++ b/src/dsaX_beamformer.cu
@@ -30,6 +30,9 @@ Second kernel will simply add times and adjacent channels and pick leading 8 bit
 Then copy back to specific locations in host to form final [beam, time, frequency] array, to be sent to corner turn.
 
  */
+
+#define THRUST_IGNORE_CUB_VERSION_CHECK
+
 #include <iostream>
 #include <algorithm>
 using std::cout;
@@ -811,7 +814,7 @@ int main (int argc, char *argv[]) {
   
   syslog (LOG_INFO, "creating in and out hdus");
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer in");
@@ -822,7 +825,7 @@ int main (int argc, char *argv[]) {
     return EXIT_FAILURE;
   }
 
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_out, out_key);
   if (dada_hdu_connect (hdu_out) < 0) {
     syslog (LOG_ERR,"could not connect to output  buffer");
@@ -871,7 +874,7 @@ int main (int argc, char *argv[]) {
   // get block sizes and allocate memory
   uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
   uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
   uint64_t  bytes_read = 0;
   int nints = NPACKETS / 16;
   uint64_t nbytes_per_int = block_size / nints;
diff --git a/src/dsaX_beamformer_offline.cu b/src/dsaX_beamformer_offline.cu
index 13eab5e..c122d46 100644
--- a/src/dsaX_beamformer_offline.cu
+++ b/src/dsaX_beamformer_offline.cu
@@ -30,6 +30,8 @@ Second kernel will simply add times and adjacent channels and pick leading 8 bit
 Then copy back to specific locations in host to form final [beam, time, frequency] array, to be sent to corner turn.
 
  */
+#define THRUST_IGNORE_CUB_VERSION_CHECK
+
 #include <iostream>
 #include <algorithm>
 using std::cout;
@@ -723,7 +725,7 @@ int main (int argc, char *argv[]) {
   uint64_t block_out = 15*48*512*256;
   char * block;
   block = (char *)malloc(sizeof(char)*block_size);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
   int nints = NPACKETS / 16;
   uint64_t nbytes_per_int = block_size / nints;
   uint64_t nbytes_per_out = block_out / nints;  
diff --git a/src/dsaX_beamformer_passon.cu b/src/dsaX_beamformer_passon.cu
index 7c8c254..818c28a 100644
--- a/src/dsaX_beamformer_passon.cu
+++ b/src/dsaX_beamformer_passon.cu
@@ -30,6 +30,8 @@ Second kernel will simply add times and adjacent channels and pick leading 8 bit
 Then copy back to specific locations in host to form final [beam, time, frequency] array, to be sent to corner turn.
 
  */
+#define THRUST_IGNORE_CUB_VERSION_CHECK
+
 #include <iostream>
 #include <algorithm>
 using std::cout;
@@ -721,7 +723,7 @@ int main (int argc, char *argv[]) {
   
   syslog (LOG_INFO, "creating in and out hdus");
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer in");
@@ -732,7 +734,7 @@ int main (int argc, char *argv[]) {
     return EXIT_FAILURE;
   }
 
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_out, out_key);
   if (dada_hdu_connect (hdu_out) < 0) {
     syslog (LOG_ERR,"could not connect to output  buffer");
@@ -743,7 +745,7 @@ int main (int argc, char *argv[]) {
     return EXIT_FAILURE;
   }
 
-  hdu_out2  = dada_hdu_create ();
+  hdu_out2  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_out2, out_key2);
   if (dada_hdu_connect (hdu_out2) < 0) {
     syslog (LOG_ERR,"could not connect to output  buffer");
@@ -809,7 +811,7 @@ int main (int argc, char *argv[]) {
   uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
   uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
   uint64_t block_out2 = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out2->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
   uint64_t  bytes_read = 0;
   int nints = NPACKETS / 16;
   uint64_t nbytes_per_int = block_size / nints;
diff --git a/src/dsaX_bfCorr.cu b/src/dsaX_bfCorr.cu
index deca0f5..01c45e1 100644
--- a/src/dsaX_bfCorr.cu
+++ b/src/dsaX_bfCorr.cu
@@ -1122,7 +1122,7 @@ int main (int argc, char *argv[]) {
   
   syslog (LOG_INFO, "creating in and out hdus");
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer in");
@@ -1133,7 +1133,7 @@ int main (int argc, char *argv[]) {
     return EXIT_FAILURE;
   }
   
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_out, out_key);
   if (dada_hdu_connect (hdu_out) < 0) {
     syslog (LOG_ERR,"could not connect to output  buffer");
diff --git a/src/dsaX_capture.c b/src/dsaX_capture.c
index d83d8a9..054e45d 100644
--- a/src/dsaX_capture.c
+++ b/src/dsaX_capture.c
@@ -685,7 +685,7 @@ int main (int argc, char *argv[]) {
 
   if (DEBUG) syslog(LOG_INFO,"Creating HDU");
   
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   if (DEBUG) syslog(LOG_INFO,"Created hdu");
   dada_hdu_set_key (hdu_out, out_key);
   if (dada_hdu_connect (hdu_out) < 0) {
@@ -861,7 +861,7 @@ int main (int argc, char *argv[]) {
 	    } 
 	  else // we received a packet of the WRONG size, ignore it
 	    {
-	      syslog (LOG_NOTICE, "receive_obs: received %d bytes, expected %d", got, UDP_PAYLOAD);
+	      syslog (LOG_NOTICE, "receive_obs: received %lu bytes, expected %d", got, UDP_PAYLOAD);
 	    }
 	}
       timeouts = 0;
diff --git a/src/dsaX_capture_manythread.c b/src/dsaX_capture_manythread.c
index 06f508a..b9f14bd 100644
--- a/src/dsaX_capture_manythread.c
+++ b/src/dsaX_capture_manythread.c
@@ -427,7 +427,7 @@ void control_thread (void * arg) {
 /* 
  *  Thread to capture data
  */
-void recv_thread(void * arg) {
+int recv_thread(void * arg) {
 
   udpdb_t * udpdb = (udpdb_t *) arg;
   int thread_id = udpdb->thread_id;
@@ -528,7 +528,7 @@ void recv_thread(void * arg) {
 	    } 
 	  else // we received a packet of the WRONG size, ignore it
 	    {
-	      syslog (LOG_NOTICE, "receive_obs: received %d bytes, expected %d", got, UDP_PAYLOAD);
+	      syslog (LOG_NOTICE, "receive_obs: received %lu bytes, expected %d", got, UDP_PAYLOAD);
 	    }
 	}
       timeouts = 0;
@@ -953,7 +953,7 @@ int main (int argc, char *argv[]) {
 
   if (DEBUG) syslog(LOG_DEBUG,"Creating HDU");
   
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   if (DEBUG) syslog(DEBUG,"Created hdu");
   dada_hdu_set_key (hdu_out, out_key);
   if (dada_hdu_connect (hdu_out) < 0) {
diff --git a/src/dsaX_capture_thread.c b/src/dsaX_capture_thread.c
index 3cc0c96..49019be 100644
--- a/src/dsaX_capture_thread.c
+++ b/src/dsaX_capture_thread.c
@@ -518,7 +518,7 @@ void control_thread (void * arg) {
 /* 
  *  Thread to capture data
  */
-void recv_thread(void * arg) {
+int recv_thread(void * arg) {
 
   // set affinity
   const pthread_t pid = pthread_self();
@@ -604,7 +604,7 @@ void recv_thread(void * arg) {
 	    } 
 	  else // we received a packet of the WRONG size, ignore it
 	    {
-	      syslog (LOG_NOTICE, "receive_obs: received %d bytes, expected %d", got, UDP_PAYLOAD);
+	      syslog (LOG_NOTICE, "receive_obs: received %lu bytes, expected %d", got, UDP_PAYLOAD);
 	    }
 	}
       timeouts = 0;
@@ -753,7 +753,7 @@ void recv_thread(void * arg) {
 /* 
  *  Thread to write data
  */
-void write_thread(void * arg) {
+int write_thread(void * arg) {
 
   // set affinity
   const pthread_t pid = pthread_self();
@@ -964,7 +964,7 @@ int main (int argc, char *argv[]) {
 
   if (DEBUG) syslog(LOG_DEBUG,"Creating HDU");
   
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   if (DEBUG) syslog(DEBUG,"Created hdu");
   dada_hdu_set_key (hdu_out, CAPTURE_BLOCK_KEY);
   if (dada_hdu_connect (hdu_out) < 0) {
diff --git a/src/dsaX_copydb.c b/src/dsaX_copydb.c
index 054ee94..7714038 100644
--- a/src/dsaX_copydb.c
+++ b/src/dsaX_copydb.c
@@ -160,7 +160,7 @@ int main (int argc, char *argv[]) {
   
   syslog (LOG_INFO, "creating in and out hdus");
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer in");
@@ -171,7 +171,7 @@ int main (int argc, char *argv[]) {
     return EXIT_FAILURE;
   }
 
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_out, out_key);
   if (dada_hdu_connect (hdu_out) < 0) {
     syslog (LOG_ERR,"could not connect to output  buffer");
@@ -220,7 +220,7 @@ int main (int argc, char *argv[]) {
   // get block sizes and allocate memory
   uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
   uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
   uint64_t  bytes_read = 0;
   char * block;
   uint64_t written, block_id;
diff --git a/src/dsaX_fake.c b/src/dsaX_fake.c
index e68f19a..662ea37 100644
--- a/src/dsaX_fake.c
+++ b/src/dsaX_fake.c
@@ -175,7 +175,7 @@ int main (int argc, char *argv[]) {
   
   syslog (LOG_INFO, "creating in and out hdus");
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer in");
@@ -186,7 +186,7 @@ int main (int argc, char *argv[]) {
     return EXIT_FAILURE;
   }
 
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_out, out_key);
   if (dada_hdu_connect (hdu_out) < 0) {
     syslog (LOG_ERR,"could not connect to output  buffer");
@@ -235,7 +235,7 @@ int main (int argc, char *argv[]) {
   // get block sizes and allocate memory
   uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
   uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
   uint64_t  bytes_read = 0;
   uint64_t npackets = block_out / 4608;
   char * block, * output_buffer;
@@ -257,7 +257,7 @@ int main (int argc, char *argv[]) {
       fread(packet,4608,1,fin);
       fclose(fin);
 
-      syslog(LOG_INFO,"Read packet, npackets %llu",npackets);
+      syslog(LOG_INFO,"Read packet, npackets %lu",npackets);
       
       for (int i=0;i<npackets;i++)
 	memcpy(output_buffer+i*4608,packet,4608);
diff --git a/src/dsaX_filTrigger.c b/src/dsaX_filTrigger.c
index d7fa9be..55f95fd 100644
--- a/src/dsaX_filTrigger.c
+++ b/src/dsaX_filTrigger.c
@@ -188,11 +188,11 @@ void control_thread (void * arg) {
       //specnum = (uint64_t)(strtoull(buffer,&endptr,0)*16);
       specnum = tmps/4;
       strcpy(footer_buf,tnam);
-      syslog(LOG_INFO, "control_thread: received command to dump at %llu src %s",specnum,footer_buf);
+      syslog(LOG_INFO, "control_thread: received command to dump at %lu src %s",specnum,footer_buf);
     }
 	
     if (dump_pending) {
-      syslog(LOG_ERR, "control_thread: BACKED UP - using %llu src %s as next specnum",tmps,tnam);
+      syslog(LOG_ERR, "control_thread: BACKED UP - using %lu src %s as next specnum",tmps,tnam);
       next_specnum = tmps/4;
       strcpy(next_footer_buf,tnam);
     }
@@ -335,7 +335,7 @@ int main (int argc, char *argv[]) {
 
   // open connection to the in/read DBs
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer");
@@ -495,9 +495,9 @@ int main (int argc, char *argv[]) {
 	  
 	}
 	
-	syslog(LOG_INFO, "written trigger from specnum %llu TRIGNUM%d DUMPNUM%d %s", specnum, trignum-1, dumpnum, footer_buf);
+	syslog(LOG_INFO, "written trigger from specnum %lu TRIGNUM%d DUMPNUM%d %s", specnum, trignum-1, dumpnum, footer_buf);
 	ofile = fopen("/home/ubuntu/data/dumps.dat","a");
-	fprintf(ofile,"written trigger from specnum %llu TRIGNUM%d DUMPNUM%d %s\n", specnum, trignum-1, dumpnum, footer_buf);
+	fprintf(ofile,"written trigger from specnum %lu TRIGNUM%d DUMPNUM%d %s\n", specnum, trignum-1, dumpnum, footer_buf);
 	fclose(ofile);
 	
 	dumpnum++;
@@ -519,7 +519,7 @@ int main (int argc, char *argv[]) {
       
       // if trigger arrived too late
       if (specnum < current_specnum-specs_per_block && dumping==0 && dump_pending==1) {
-	syslog(LOG_INFO, "trigger arrived too late: specnum %llu, current_specnum %llu",specnum,current_specnum);
+	syslog(LOG_INFO, "trigger arrived too late: specnum %lu, current_specnum %lu",specnum,current_specnum);
 	
 	bytes_copied=0;
 	dump_pending=0;
@@ -530,14 +530,14 @@ int main (int argc, char *argv[]) {
     }
     
     // update current spec
-    if (DEBUG) syslog(LOG_INFO,"current_specnum %llu",current_specnum);
+    if (DEBUG) syslog(LOG_INFO,"current_specnum %lu",current_specnum);
     current_specnum += specs_per_block;
     
     
     // for exiting
     if (bytes_read < block_size) {
       observation_complete = 1;
-      syslog(LOG_INFO, "main: finished, with bytes_read %llu < expected %llu\n", bytes_read, block_size);
+      syslog(LOG_INFO, "main: finished, with bytes_read %lu < expected %lu\n", bytes_read, block_size);
     }
     
     // close block for reading
diff --git a/src/dsaX_fluff.c b/src/dsaX_fluff.c
index 141bf51..3e3f2d1 100644
--- a/src/dsaX_fluff.c
+++ b/src/dsaX_fluff.c
@@ -267,7 +267,7 @@ int main (int argc, char *argv[]) {
   
   syslog (LOG_INFO, "creating in and out hdus");
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer in");
@@ -278,7 +278,7 @@ int main (int argc, char *argv[]) {
     return EXIT_FAILURE;
   }
 
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_out, out_key);
   if (dada_hdu_connect (hdu_out) < 0) {
     syslog (LOG_ERR,"could not connect to output  buffer");
@@ -332,7 +332,7 @@ int main (int argc, char *argv[]) {
   // get block sizes and allocate memory
   uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
   uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
   uint64_t  bytes_read = 0;
   char * block, * output_buffer, * blockie;
   output_buffer = (char *)malloc(sizeof(char)*block_out);
diff --git a/src/dsaX_merge.c b/src/dsaX_merge.c
index 0154b80..7866d5f 100644
--- a/src/dsaX_merge.c
+++ b/src/dsaX_merge.c
@@ -255,7 +255,7 @@ int main (int argc, char *argv[]) {
   
   syslog (LOG_INFO, "creating in and out hdus");
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer in");
@@ -266,7 +266,7 @@ int main (int argc, char *argv[]) {
     return EXIT_FAILURE;
   }
 
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_out, out_key);
   if (dada_hdu_connect (hdu_out) < 0) {
     syslog (LOG_ERR,"could not connect to output  buffer");
@@ -277,7 +277,7 @@ int main (int argc, char *argv[]) {
     return EXIT_FAILURE;
   }
 
-  hdu_in2  = dada_hdu_create ();
+  hdu_in2  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in2, in_key2);
   if (dada_hdu_connect (hdu_in2) < 0) {
     syslog (LOG_ERR,"could not connect to input  buffer2");
@@ -455,7 +455,7 @@ int main (int argc, char *argv[]) {
   // get block sizes and allocate memory
   uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
   uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
   uint64_t  bytes_read = 0;
   char * block1, * block2, * o1, * o2;
   char * output = (char *)malloc(sizeof(char)*block_out);
diff --git a/src/dsaX_nicdb.c b/src/dsaX_nicdb.c
index 65cfdcc..df47ebe 100644
--- a/src/dsaX_nicdb.c
+++ b/src/dsaX_nicdb.c
@@ -369,7 +369,7 @@ int main(int argc, char ** argv)
 
   // DADA stuff
 
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_out, out_key);
   if (dada_hdu_connect (hdu_out) < 0) {
     syslog (LOG_ERR,"could not connect to output  buffer");
@@ -408,7 +408,7 @@ int main(int argc, char ** argv)
   
   // get block sizes and allocate memory
   uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have output block sizes %llu\n",block_out);
+  syslog(LOG_INFO, "main: have output block sizes %lu\n",block_out);
   uint64_t  bytes_read = 0;
   char *output1, *output2;
   output1 = (char *)malloc(sizeof(char)*block_out*bdepth);
diff --git a/src/dsaX_reorder.c b/src/dsaX_reorder.c
index ed0b440..04955da 100644
--- a/src/dsaX_reorder.c
+++ b/src/dsaX_reorder.c
@@ -369,7 +369,7 @@ int main (int argc, char *argv[]) {
   
   syslog (LOG_INFO, "creating in and out hdus");
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer in");
@@ -380,7 +380,7 @@ int main (int argc, char *argv[]) {
     return EXIT_FAILURE;
   }
 
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_out, out_key);
   if (dada_hdu_connect (hdu_out) < 0) {
     syslog (LOG_ERR,"could not connect to output  buffer");
@@ -435,7 +435,7 @@ int main (int argc, char *argv[]) {
   // get block sizes and allocate memory
   uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
   uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
   uint64_t  bytes_read = 0;
   char * block, * output_buffer;
   output_buffer = (char *)malloc(sizeof(char)*block_out);
diff --git a/src/dsaX_reorder_raw.c b/src/dsaX_reorder_raw.c
index d1a7ca3..c0f6b0c 100644
--- a/src/dsaX_reorder_raw.c
+++ b/src/dsaX_reorder_raw.c
@@ -28,6 +28,9 @@
 #include "dada_def.h"
 #include "dada_hdu.h"
 #include "ipcio.h"
+// Forward declaration to keep compiler happy
+// Possible minor bug in PSRDada
+int ipcio_check_pending_sod (ipcio_t* );
 #include "ipcbuf.h"
 #include "dada_affinity.h"
 #include "ascii_header.h"
@@ -391,7 +394,7 @@ int main (int argc, char *argv[]) {
   
   syslog (LOG_INFO, "creating in and out hdus");
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer in");
@@ -402,7 +405,7 @@ int main (int argc, char *argv[]) {
     return EXIT_FAILURE;
   }
 
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_out, out_key);
   if (dada_hdu_connect (hdu_out) < 0) {
     syslog (LOG_ERR,"could not connect to output  buffer");
@@ -414,7 +417,7 @@ int main (int argc, char *argv[]) {
   }
 
   if (bf) {
-    hdu_out2  = dada_hdu_create ();
+    hdu_out2  = dada_hdu_create (0);
     dada_hdu_set_key (hdu_out2, out_key2);
     if (dada_hdu_connect (hdu_out2) < 0) {
       syslog (LOG_ERR,"could not connect to output  buffer2");
@@ -501,7 +504,7 @@ int main (int argc, char *argv[]) {
   // get block sizes and allocate memory
   uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
   uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
   uint64_t  bytes_read = 0;
   char * block, * output_buffer, * blockie;
   output_buffer = (char *)malloc(sizeof(char)*block_out);
diff --git a/src/dsaX_simplesplit.c b/src/dsaX_simplesplit.c
index fb41432..7a80c7e 100644
--- a/src/dsaX_simplesplit.c
+++ b/src/dsaX_simplesplit.c
@@ -193,7 +193,7 @@ int main (int argc, char *argv[]) {
   
   syslog (LOG_INFO, "creating in and out hdus");
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer in");
@@ -204,7 +204,7 @@ int main (int argc, char *argv[]) {
     return EXIT_FAILURE;
   }
 
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_out, out_key);
   if (dada_hdu_connect (hdu_out) < 0) {
     syslog (LOG_ERR,"could not connect to output  buffer");
@@ -216,7 +216,7 @@ int main (int argc, char *argv[]) {
   }
 
   if (bf) {
-    hdu_out2  = dada_hdu_create ();
+    hdu_out2  = dada_hdu_create (0);
     dada_hdu_set_key (hdu_out2, out_key2);
     if (dada_hdu_connect (hdu_out2) < 0) {
       syslog (LOG_ERR,"could not connect to output  buffer2");
@@ -298,7 +298,7 @@ int main (int argc, char *argv[]) {
   // get block sizes and allocate memory
   uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
   uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
   uint64_t  bytes_read = 0;
   char * block, * output_buffer, * o1, * o2;
   output_buffer = (char *)malloc(sizeof(char)*block_out);
diff --git a/src/dsaX_split.c b/src/dsaX_split.c
index d5724cd..1361e86 100644
--- a/src/dsaX_split.c
+++ b/src/dsaX_split.c
@@ -135,7 +135,7 @@ void calc_stats(char *input) {
   }
 
   for (int i=0;i<NANT;i++) {
-    if (STATS) syslog(LOG_INFO,"RMS_ant_2pol %d %g %g",i,sqrt(rmss[2*i]/768.),sqrt(rmss[2*i+1]/768.));
+    if (STATS) syslog(LOG_INFO,"RMS_ant_2pol %d %g %g",i,sqrt(rmss[2*i]/768.0),sqrt(rmss[2*i+1]/768.0));
   }
 
 }
@@ -345,7 +345,7 @@ int main (int argc, char *argv[]) {
   
   syslog (LOG_INFO, "creating in and out hdus");
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer in");
@@ -356,7 +356,7 @@ int main (int argc, char *argv[]) {
     return EXIT_FAILURE;
   }
 
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_out, out_key);
   if (dada_hdu_connect (hdu_out) < 0) {
     syslog (LOG_ERR,"could not connect to output  buffer");
@@ -368,7 +368,7 @@ int main (int argc, char *argv[]) {
   }
 
   if (bf) {
-    hdu_out2  = dada_hdu_create ();
+    hdu_out2  = dada_hdu_create (0);
     dada_hdu_set_key (hdu_out2, out_key2);
     if (dada_hdu_connect (hdu_out2) < 0) {
       syslog (LOG_ERR,"could not connect to output  buffer2");
@@ -451,7 +451,7 @@ int main (int argc, char *argv[]) {
   uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
   uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
   uint64_t nints = block_size / block_out;
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
   uint64_t  bytes_read = 0;
   char * block, * output_buffer, * o1, * o2;
   output_buffer = (char *)malloc(sizeof(char)*block_out);
diff --git a/src/dsaX_splitup.c b/src/dsaX_splitup.c
index 3a9ab10..32f055d 100644
--- a/src/dsaX_splitup.c
+++ b/src/dsaX_splitup.c
@@ -160,7 +160,7 @@ int main (int argc, char *argv[]) {
   
   syslog (LOG_INFO, "creating in and out hdus");
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer in");
@@ -171,7 +171,7 @@ int main (int argc, char *argv[]) {
     return EXIT_FAILURE;
   }
 
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_out, out_key);
   if (dada_hdu_connect (hdu_out) < 0) {
     syslog (LOG_ERR,"could not connect to output  buffer");
@@ -220,7 +220,7 @@ int main (int argc, char *argv[]) {
   // get block sizes and allocate memory
   uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
   uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
   uint64_t  bytes_read = 0;
   uint64_t nsplits = block_size/block_out;
   char * block, * output_buffer;
diff --git a/src/dsaX_store.c b/src/dsaX_store.c
index de53134..849c27c 100644
--- a/src/dsaX_store.c
+++ b/src/dsaX_store.c
@@ -112,7 +112,7 @@ int main (int argc, char *argv[]) {
 
   // open connection to the in/read DB
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to input buffer");
@@ -167,7 +167,7 @@ int main (int argc, char *argv[]) {
   char fnam[100];
   
 
-  syslog(LOG_INFO, "have ngulps %d, blocksize %llu, bout %llu",ngulps,blocksize,bout);
+  syslog(LOG_INFO, "have ngulps %d, blocksize %lu, bout %lu",ngulps,blocksize,bout);
 
   
   // main reading loop
@@ -202,7 +202,7 @@ int main (int argc, char *argv[]) {
     // for exiting
     if (bytes_read < blocksize) {
       observation_complete = 1;
-      syslog(LOG_INFO, "main: finished, with bytes_read %llu < expected %llu", bytes_read, blocksize);
+      syslog(LOG_INFO, "main: finished, with bytes_read %lu < expected %lu", bytes_read, blocksize);
     }
 
     // close block for reading
diff --git a/src/dsaX_trigger.c b/src/dsaX_trigger.c
index 26342a4..9592389 100644
--- a/src/dsaX_trigger.c
+++ b/src/dsaX_trigger.c
@@ -186,11 +186,11 @@ void control_thread (void * arg) {
       //specnum = (uint64_t)(strtoull(buffer,&endptr,0)*16);
       specnum = tmps;
       strcpy(footer_buf,tbuf);
-      syslog(LOG_INFO, "control_thread: received command to dump at %llu",specnum);
+      syslog(LOG_INFO, "control_thread: received command to dump at %lu",specnum);
     }
 	
     if (dump_pending)
-      syslog(LOG_ERR, "control_thread: BACKED UP - CANNOT dump at %llu",tmps);
+      syslog(LOG_ERR, "control_thread: BACKED UP - CANNOT dump at %lu",tmps);
   
     if (!dump_pending) dump_pending = 1;
     
@@ -341,7 +341,7 @@ int main (int argc, char *argv[]) {
 
   // open connection to the in/read DBs
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer");
@@ -352,7 +352,7 @@ int main (int argc, char *argv[]) {
     return EXIT_FAILURE;
   }
 
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_out, out_key);
   if (dada_hdu_connect (hdu_out) < 0) {
     syslog (LOG_ERR,"could not connect to output dada buffer");
@@ -525,9 +525,9 @@ int main (int argc, char *argv[]) {
 	  // DO writing using thread
 	  docopy = 1;
 	  
-	  syslog(LOG_INFO, "written trigger from specnum %llu TRIGNUM%d DUMPNUM%d %s", specnum, trignum-1, dumpnum, footer_buf);
+	  syslog(LOG_INFO, "written trigger from specnum %lu TRIGNUM%d DUMPNUM%d %s", specnum, trignum-1, dumpnum, footer_buf);
 	  ofile = fopen("/home/ubuntu/data/dumps.dat","a");
-	  fprintf(ofile,"written trigger from specnum %llu TRIGNUM%d DUMPNUM%d %s\n", specnum, trignum-1, dumpnum, footer_buf);
+	  fprintf(ofile,"written trigger from specnum %lu TRIGNUM%d DUMPNUM%d %s\n", specnum, trignum-1, dumpnum, footer_buf);
 	  fclose(ofile);
 	  
 	  dumpnum++;
@@ -539,7 +539,7 @@ int main (int argc, char *argv[]) {
 
 	// if trigger arrived too late
 	if (specnum < current_specnum-specs_per_block && dumping==0 && dump_pending==1) {
-	  syslog(LOG_INFO, "trigger arrived too late: specnum %llu, current_specnum %llu",specnum,current_specnum);
+	  syslog(LOG_INFO, "trigger arrived too late: specnum %lu, current_specnum %lu",specnum,current_specnum);
 
 	  bytes_copied=0;
 	  dump_pending=0;
@@ -550,7 +550,7 @@ int main (int argc, char *argv[]) {
       }
 
       // update current spec
-      syslog(LOG_INFO,"current_specnum %llu",current_specnum);
+      syslog(LOG_INFO,"current_specnum %lu",current_specnum);
       if (block_count < skips) {
 	block_count++;
       }
@@ -561,7 +561,7 @@ int main (int argc, char *argv[]) {
       // for exiting
       if (bytes_read < block_size) {
 	observation_complete = 1;
-	syslog(LOG_INFO, "main: finished, with bytes_read %llu < expected %llu\n", bytes_read, block_size);
+	syslog(LOG_INFO, "main: finished, with bytes_read %lu < expected %lu\n", bytes_read, block_size);
       }
 
       // close block for reading
diff --git a/src/dumpfil.c b/src/dumpfil.c
index 0e658a5..0be913c 100644
--- a/src/dumpfil.c
+++ b/src/dumpfil.c
@@ -202,7 +202,7 @@ int main (int argc, char *argv[]) {
   
   syslog (LOG_INFO, "creating in hdus");
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer in");
@@ -236,7 +236,7 @@ int main (int argc, char *argv[]) {
   
   // get block sizes and allocate memory
   uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  syslog(LOG_INFO, "main: have input block size %llu\n",block_size);
+  syslog(LOG_INFO, "main: have input block size %lu\n",block_size);
   uint64_t  bytes_read = 0;
   uint64_t npackets = 1;
   char * block, * output_buffer;
diff --git a/src/fil2dada.c b/src/fil2dada.c
index c2235ec..c49f2b5 100644
--- a/src/fil2dada.c
+++ b/src/fil2dada.c
@@ -94,7 +94,9 @@ void get_string(FILE *inputfile, int *nbytes, char string[])
 }
 */
 
-/*int read_header(FILE *inputfile)
+int read_header(FILE *inputfile);
+/*
+int read_header(FILE *inputfile)
 {
   size_t nRead;
   char string[80], message[80];
@@ -353,7 +355,7 @@ int main (int argc, char *argv[]) {
   
   syslog (LOG_INFO, "creating in and out hdus");
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer in");
@@ -364,7 +366,7 @@ int main (int argc, char *argv[]) {
     return EXIT_FAILURE;
   }
 
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_out, out_key);
   if (dada_hdu_connect (hdu_out) < 0) {
     syslog (LOG_ERR,"could not connect to output  buffer");
@@ -413,7 +415,7 @@ int main (int argc, char *argv[]) {
   // get block sizes and allocate memory
   uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
   uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
   uint64_t  bytes_read = 0;
   uint64_t npackets = 1;
   char * block, * output_buffer;
@@ -431,17 +433,19 @@ int main (int argc, char *argv[]) {
       syslog(LOG_ERR, "cannot open file - will write zeros");
     }
     else {
-		
-      if (rhead) read_header(fin);
-//		fread(packet,block_out,1,fin);
-//		fclose(fin);
 
-//		syslog(LOG_INFO,"Read packet, npackets %llu",npackets);
+      // DMH: FIXME
+      //if (rhead) read_header(fin);
       
-//      for (int i=0;i<npackets;i++)
-//		memcpy(output_buffer,packet,block_out);
-
-//		syslog(LOG_INFO, "Using input packet");
+      //		fread(packet,block_out,1,fin);
+      //		fclose(fin);
+      
+      //		syslog(LOG_INFO,"Read packet, npackets %llu",npackets);
+      
+      //      for (int i=0;i<npackets;i++)
+      //		memcpy(output_buffer,packet,block_out);
+      
+      //		syslog(LOG_INFO, "Using input packet");
       
     }
 
@@ -470,7 +474,8 @@ int main (int argc, char *argv[]) {
     else{
       fclose(fin);
       fin=fopen(fnam,"rb");
-      if (rhead) read_header(fin);
+      // DMH: FIXME
+      //if (rhead) read_header(fin);
       fread(packet,block_out,1,fin);
     }
 
diff --git a/src/test_read.c b/src/test_read.c
index 0eefdc2..2b5730a 100644
--- a/src/test_read.c
+++ b/src/test_read.c
@@ -204,7 +204,7 @@ int main (int argc, char *argv[]) {
   
   syslog (LOG_INFO, "creating in and out hdus");
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer in");
diff --git a/src/test_write.c b/src/test_write.c
index b74e66b..32dd25d 100644
--- a/src/test_write.c
+++ b/src/test_write.c
@@ -28,6 +28,9 @@
 #include "dada_def.h"
 #include "dada_hdu.h"
 #include "ipcio.h"
+// Forward declaration to keep compiler happy
+// Possible minor bug in PSRDada
+int ipcio_check_pending_sod (ipcio_t* );
 #include "ipcbuf.h"
 #include "dada_affinity.h"
 #include "ascii_header.h"
@@ -261,7 +264,7 @@ int main (int argc, char *argv[]) {
   
   syslog (LOG_INFO, "creating in and out hdus");
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer in");
@@ -272,7 +275,7 @@ int main (int argc, char *argv[]) {
     return EXIT_FAILURE;
   }
 
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_out, out_key);
   if (dada_hdu_connect (hdu_out) < 0) {
     syslog (LOG_ERR,"could not connect to output  buffer");
@@ -358,7 +361,7 @@ int main (int argc, char *argv[]) {
   // get block sizes and allocate memory
   uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
   uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
   uint64_t  bytes_read = 0;
   char * block, * output_buffer, * blockie;
   output_buffer = (char *)malloc(sizeof(char)*block_out);
diff --git a/utils/packet.out b/utils/packet.out
index 435ed746680340c61a20bff1c7e2a38931ec8d31..de3b9a47bdebd485332ff433d1644d6d6ff77d33 100644
GIT binary patch
literal 4608
zcmWkyyPMoNaxbvu#5=^JXB~M6kgZfr&TOt3uP?Pv=l{^(=iJs^Zr@10a96|by-?Ex
zT6@u|fdYFSZ4>319K@sQ_bLJm7ny4)ql5q}+Nd(3EcGN3C6vm^lnbJ9<0H6gDSZM<
zsIhDmWRZd14u!vsy^l&SS;?eYojN1S+L)<G_;9LDL0ftbA?6G_^LPX4OuZtqckqSx
zUR={;)G0>Fy@bys#@$q?w4&11cS4K!bJ+uu$jU|m(*0U&jG*xjYrGd(I2{ZAaEj~0
zfmFK5d@-`MUuICk%e5r6K8MvXPD<ozSLA|TPpR(;j&d4ezn{EX{S>p0TEWRpWI{Wv
zv87<791j&AkhE@sR*OCmeO6;3)fHGC2WSycEj}NdBaM@z_Fnv!y{faXp@ptn!z4fq
z)d|PH0SYGqh<0m@@V@>c{l})%W_cv=x{7TEC!T~Dr5Hw9rlip4Hl!lxo<B`^qR=ZZ
z$yO?F!<!MQtYq=SdY775OQ9x}`eI;5MlC91nr`_j`@~)Bt-s|R{NSBYS(Z~CmNqwv
zH_xi7LFM|t{n@87;idr@`K{T&UMq#mk0KpkQ(LOD3fa`A_(^_Wf23_G7L_FTE!UvD
zcTl;qNoyt5PHVlLtr~Z#8&8@Ne71uG?lasv-G-sO)xT||1XKFFg5=Q&bhL=Cqk=1P
zS>?6w<-_)rA+5;!vNJmmUyW%D9jMa+f^^oW)?r+mIsd3jp6#jh)M|VFd!cH^;O)aM
z&Z<7B3UCb6p0Bb~)qxh=0fxL6igqy$V-Z&7W4k4B>`p0TUdV$canLln;IGKU$~g6(
zc9Ge`snSXtC6t+u@xg}~*KO&t!xSG~RHtWYQs73{m<~||U+mKum7;it`?=P|!6}Eo
z(texP`83mX)wnk4iTkF-X6?%bty}80x#T7tkLz?@qzN>y^Naz;R5!mT2e!72*?di6
zubidSsl}~(LQj5HsOQ4>e`t^?Rhu+VTpR|qc6}F`SGb#-M6*MF9y1C{`YyeCli}G9
z4UUL5hf9KnRL$Q{KThb@@;dONC(IIan{%wFm0uEO0*Y~`e&?5ORZuE%Q-l#%g^!ss
z<eflP4GF!n2V<Kh5vj518fp)B)w+98j5kg#!V}I{7D!j#qxn@!l1LM^v%B_kIs$A?
zyh=j}()e+m#b*=w-~>APV{G66^H39u(5hVIIPTS_6j03SGVE5^nUX7@snWM`r3(B{
zb3na&J_ofeaUjj5bc;8s%BN~yPvn$rNob4SUz%N2AI9OHiJ7`Xc5c@Cgwm~BW}Yai
z%O$NzxIF6g1Q8=_gx1%6sY6EFrk0<2qjV!YQ@{5^E~5Zo0Jnc*!sV{b!JWS#)YBu<
zCUu0Q_2&Sr#|%<#<jQ^NujlQ;s6)gO&_ZM#`T=k~uYFwN45ef>@MSu2nW6uH%x8a6
zxI04B*M!@rw4^HJ+LoEPyNjaB)yHl?H?o3_Yu=W(a5uKQ1f%yXf=WES$sMj>6eQ?}
z-I@TdD~%LDYuGFh;aD1F|9L%%_8KXTHdcD0+ZfJw+Dn;y>p&D>QqjiSS@M}nLrIS?
z?8a|da_y~(_++q5a(+?>vZ`+_Gb(P)Y$%L5EyJv;dl6oX1^IvnUay6S-}NOAnF!^(
z;Z(HD3}Intakjn{A@_P!xn~oG6PE#EF&$p-9jc<TTS3k$Xb(g7a^u_n;=EvmU)myk
zL9OoTt5^7Fmfs=Glb@7-u|Exq87waLFH>B&LKtPaKlzBQ#?G&k=+3e#Iv)QujCw>T
zLBGEivDb{%^u4I7;Sv9#;wK8ktQjo7be|IQ#$<6sd6T_ceMnxz{gbuqI0W63VQhGR
zT<0WQ$k#8dsZYCv6xrR_eY2V~1$6+nXnQNoC&RX}(p{YWyk(Ni?_B555*+_P3>xL~
zWTVQrm$fL*eQGAuH2b4%;5v0pjZ!~CR!65N^3P2-Mau@N^E&|ws=Q7{Ib6LmxYS~@
zg#|w=S1aEvLM4?%E5h?#J-~pP37P~|q@KS`cxJ4}#<A(JYCW1R+z8!h@^S_p-obUS
zL=+OKeBg7U^KH8-)<++|sl%YWQ?ZOo?uThUnYsi!VyJn6vOA||6u{V8+6^5K{QL#X
zStZim??-ljwc|q%2d5jm^KL7?YS?-N4W}EI&+Rv^IXbKQxi#dDQF^BK4kV-$OYi?o
zOz9W99o()`svXD=?O1|2IsXb52&gTj2^xK3jMS<}*PHv!_DVKvO*weX)eBukAf=pP
zBM@r`juVzIMjdtv)H-^gLjFyi6AfN`BB|;U!N*HhEBesX^8}-M#S-Vz_XBIP3#Xe?
zJn4EwJE;MM5_bA}(dn$bor=bs8c-*x!XltYk$Z9VXP=kkZ#%3#b5CeY-Afpm8Bqx4
z8>=&y5JPc9nbG#>H+zAyLp^Y0!*k?F!=3^#6K3x9Y93j%jhe>39NomIjyGQPQS{5s
z4)gTE(s&i(+~R0UCA{$4ujTe&blWmZ-!q(M;Idd9SdtnG_W(%~R<tF_j>$2h^AB!%
zqrNLtM<QsFRBum7JS|%eb<KDraZ`ROgttbeYPXiaI;UZs!W}#Qe(#_t85GwxbEII#
zv7-|MCgII4yk9X?{#a=J?`$}pLKzaOhx{S&ZlNl9sywjSiGa%dI?DB7dAdM*2KRGp
z6*U}K2Pc||S(k|+^RVU2CRFd;*PBuoUc$`$KOpjnqX4=Wx?;WWK~3NKwkWR;Jqeq)
zX{7Km^{o+LI9;g`_Qk(ryj{B?1yzyrE<pwn2j^a*QM;C-4}#O+V*eUTK}gug6ob;>
ze%wl3e`%BSHjXdp-5JNAc2$;s+eAN8@hTkpc$pJ_Sr+D>hq@<k$7dz+7$WW*dG>Oz
z#I=JP+0IV`>wQL6qc?FC>G)4+7`{5Wre9jfeSy3`Zj#*notFi|0=anEiO5>1lrH<I
z{&daUbQ6}`vLW2+aJ;MiknC(YQ9V9BJl5?#SW)ogj`*r}HLuGoD~vXLn0UX>N$LSO
z9;N4q33|i(s}`iyF`_r!zAWWRS+dK&DUv((>;bV!Z>C}&&p($>z8l7Nd`>4(nunv4
z$R?(p0q>LY++cmD)*0JbEf2Isua<>*%I$hwKQQELrSfusLDHNRFgVtndj`=)T!?ml
zKJ}w6kRHH)j01#f(Mv>Dz$Y~h2li<7n_`YHz$r35eaP~^aPFC39LCZHkMfqG)ESXO
z@zo1ldSK3FZW{T|<xM%huNw4t&Y&7G2~vH{p>_H_k{WcsX{<YV1(*X@ixjx!ka>gA
zbKHVC(UM1R#N*WlR&qLr{!TE?#YUIQ9Kg*ska56v#1Vo45h^7`xJ<xV+%HW2t7mqh
z)_UMXPr0h!=t6VS&&0Vu8V}_@jqqqz8s2XQJncNsJasC6m?D#r+7NiSU_ItE9|te1
zwUg8=mDFjXu}nMR;~mcxDjm1jOanj0sKM_a;bAw(ufPme6{FZZiLSpO)<O!+F035j
z+2t5`eA0Czi<%GWEh842rm)3$M>adCrL0z&<J^<-$mJ&Nd60E}F)zl6)5MX(ZMfv%
zU(<ufnuLsyW+2w3sN^)If!S}~&L+9+`hIy1uCReH(WJObYRcR#5Y^l;yLrqjsnF?}
z&%hwWQ4vh5G^^O(|1aOBcv0q`%2_>O)ff=DgoAj^G1e5=%@?Ws-GQyVrsZK-ELLE!
zQ#>?sg!k&WH;uy5=5_N)?ofO<h6_u|>k&wrQT7YZVpHWfqLj$$?3opd=Z?Xgb6cdo
zmG2*UPS|9b-eytmAWsbDX69F-@WU5(O$V<zQIztVhNqwVvGo(;_B`Ri4G`P(WrG#6
z)4U8%3NQ8(hv1?_P3s4Cn+0KRRVPYNZ1B0R@!(O05w48RATe=7L=bNH3;t<4=eF4h
zAnRGB*c~}}>Bs}K7+srx>NzBx%GHqxKpbkWovXFCctch@@VjN>Sr$3Xrm4#GT>ga^
znr$v-(Cla(7IcroTXXow<>eQF{ge_}Vl-C2m1~1jeOv=&Hw1@A^!Gf3lGirpjOQNd
zKTZqk4qMKd@2Qe3UfCCcqT+#@3Q|uGoB<G3eM|3NhJ*VBegV8mcU89!S%}-RW9+?v
zgsl5g!Y)YfrUPd<p;OAx0P3g2seZOgMV*x&6VE+<m-`zQh&*7>V-j4U?!d$Oahv^;
zJIa65g=6cGNNSK5;+SL&P3*e0(rlhC@f?!15?g%-p@91WgDc3x=7NWhj&-4j<doI^
Ztv}FbpV-oGQckS3_OrwC8b``A{XdgG9`67E

literal 4608
zcmWldL3iY~a)k@*>@1>yma~b604X~wTV!kAG8xIvI>&w|`~0}~3*sc(oR=AA>ru1i
z?T7?g$!dl`f%48GkeaR3-9#1Ye)kqX#TW*vicUaU5l(-F+FbiiA!RJ2MX+}%0G(V#
zCn)dgl9er$k7>L@^KdwI@rhbY<dih?I6j@BV6A>lZH|&;7HzJu2-!aQd@5!6Xh2>I
zfU^9G(<^7!va<kta#IS9lIX@$UBCXj3Y!9G;()faH-t&y4cz`|pioL`F7L?Rq6JbA
zw7Xo|SKB22W(huevHoK^rtfjh;zqAmT+zL<{i7;AqOpU!nwMt^XYuyL-+cR)E;|vo
z*m&BBe@?=q6y?quAM-p2(GIq*VMGbr*Wf8fQGZvYFc!h0E-4Dz$t*{6iJ+adA(h(X
zM1+D5XXk=Nml$<{Ra9p`g{*CCMGV{c5Y2{2h(=VxC#8!R9_!(`*{Wg1r6fPA`H#{=
z8docTWI2ZfNv?_}nCl#MaC#_ksGLxqhBzp2K}EZemGK#yr2==hF3x1FAF-9FPbZPe
z|DK<14l+{-9eyDrh;W2;yav9BlF<|ZY4r;z?oXiuE{%LFgKtgnd5|GWom6c>YW|6R
zwTwrR0-}h@u1KXZdj0mA?+j?oUDUOQ)bDST%VzT+p0M|aOmF5`vePQ}B@IF9QE;=J
z&8yJ?T!%nG@<H@qzk@Ze=|f#L;pgF5G&H$g%e5Rul~IY6CxD!_>5~)BukbH?RjQ1q
z^RV4kl|5`VPww^8@D$@|8ovguF|{^C@mnY0Hb$yemC}{cZ+W<dLcoB(XToCK1Y)ME
z@oIIrN3p)?fY96SXic_739e9@L)ksx{gUe`4eC4y**&Wi?=)u9$1Uip!4h7tKa=Iz
zz_v#FFbt4u9YtFl78S>W1g%Yx6;j+A)xVtPFKZF@QC`;-Jm#c4)P3P&kEUMKg<D3b
z8{E5=piaTFYL`Prq@?c>mbargZ?u^rh?2W7%knygWJ~@KOq-x2Dlv7rGD8hlm04XR
z2`=UH(5*3{{XCFgltZ!aA3WsD$L8O`Y%y%|PwjbBqDfcqLs>3R$%-zq9+t6Ht?JCS
zbYek8Zj2Jel@!17c-xAVvT&)ceKbKqELvWN5vVRSV3kAnXrB#Uez4O{?$!Nz1`~`@
z<4y^J{cLGV;4V+I|2BV1=MqUbyVi_H^ZS*kD?yjv4_qy!cY>;yV0g#}sjPVG-Z{Rc
zZfuDwt{oIwe_W+6hEhvBiI_#Sw_4x&k5=%Z7dK1DMI;b5&-?vg?(?|s+!f86=GhWn
z47bHq!YQj-e?w4a;%Uo{!uB?;B|lfLu|i-k9-4fRSL9u9C->)w@)J7cypsSgmP?SW
zie=)dRWbfFWqUSl6!`ZJ9}3RD1E_gX{IT&&7oE?x8eRQz(!=fUmKqi*W!IMI^h71)
zBF~HF5&A9`wU1)k^4&8P1~ld{s7ixPE{Wu`5s<5<FV2`LGuij2?twMy#;F{;XK2pq
z)2Vzw#~18=b4OiPzf?HdC`FP<o~x=7Rul2RdvfG@griS2d^Olslntl?BDbw^ggc6u
zaod{aE1xr9q-W88d?*{4+$IX}aKEu#+=?a^P*9V(csZBn|9N2|8Z4F=C?Qf%wv|mD
z+jR|2F-d2{GG(!wQrU8hYI?I|lNIeT+A#?%XMd9OC0zX?Z_$J5vw;6=3_*36WEaCM
z#aN$;O?6j5P+P`QpTkgAYH(NHFOrBk3z_wcnB|GYA-nRLc`!g4sJKs7&KpI&SwFj$
zKY=hQ#)Pk&_Y8WM8$M@BGs4V;do}Af?|DdSf0jAoOhVjof%tEWNgd6(bNybCTGM5y
z6oD{pixXLJ;&K?6MyrSg)$(d<-|l^vAtpA@thybfSA)k!<>eRVuuM_5lyhwV2`{2_
z4(#3yZs2y$*2h6Sn4<iY7&=2MUOyjF#eSi|i$G0uQhsrI9y`O9c+j?1GBcZ`m>kAE
z-2YcmW$(I5T9wr|lqs&+GYezHmmztfPA9wXtz>&FJ!b@a>m0t6)?vL;wMnwEp~NGz
zlT3k7B0J_Ta9L##q$3-;cLgY_kz#QP*N+k|cCHU9+da+UWbry+l(iGIqemOH&DOt*
z%%y`=6UcNqDpP|jMi!M;;#+`_B_zlorC0?t*{V{ecwc@-Hwtqp_0Of8$#vXoSfzZb
zle54Q&gCz((91=8RyGTNpbfA*J9*DfW9eUo*vL}2K-cNg-L=sny+nO4B!`g<jbI(u
zOaUnH;NI_Kq>H9toGye0V1eR=h4J!7RlnmSerks8#)CW+Wtp2-c;(gy<s5s<kY*OF
z%TzaH7FW1{-IB?olj~V<7y<wR;lBZ4TkJ_un%S$%wt&k5C3ZKmMe?ywPaTHxLVL0~
z@+7~gkM#KJ7~8L*J@M>wnw$Ma)|1RjHGY|$IO{u9J4T3nd|w_+uT38Glqg=KBpzyE
z#V%W!lVn_Z92oD%*tCbhl(=QTh-<`~XBHE>BxlY{Q*m2lE8wO;6P5DW7fIztp(o9E
zWMxB0kGIb(Yha10lVy>b+I2Iba&XRvRDaV|L&kJ1gfH(39RZy)IVU)_!92^Kw5SQS
zl5;X>xfdi40Cx@*RO*8X*p)vP#m5&#<yfvsZ>*A*%DfcYUY_qE*@L7j+e;t8*^0rw
zZc}iA@vZHSAjNc-wt!=ZkGTauiv6m!m{n>G1@gW;{_udxrVcD;7tO7?PSy6k?BS%C
zz2@kOl<J;*pNx7`92X%#nAr}r3vSF#9r-bleJ>f^aOCXM-erU$XEkPf<Q~;~nMf$X
ziK{M-;h8Ta*x8xRoZ?`wV%UAZA#;C(3cgT@Q%+!sY5OL!<LbU+XGZV9KTt&+UQ>oa
ztHskx0*GI6rx}S-Ybcz-0<##Wt6h7{6#k|C#62BV*njeRy%c^?lB!NBWnrT_WX0d>
z6pZ633Z}rg=A|CYC)T-minL0Rx5psaTC?Kv`yrmB9Ic42c!2rJSILBQ)pT&p@Mb(e
z_R_$RZag2sejuiZGM_loQ>FGB<TQ-J>zH5<$RWhBabvj}TcHb~)x%lxs7iZ^xfD(}
z_^!X*e$svgIGK`9p+fP}-GGZdsEnfU$DP_=Z+-rnID9A;AGvu4U|^|qVG^)295n|-
z;W%awDdd_o?+RA3x-X^7rtUaOvfk%M7*)&pUlgsvzy^ICS2ueOQ9P6OEpT{DD&nA#
zjMM4e;2nF;D#`;~I`d>ODt0J~GqSZY@{&-=ol6GIYsDm<T;Jj*%_`)*oP2)@=QAVA
z1eYIVo5>aNpKiAQQ6T!kP2zLdP4&vL8j=d(doBs!KKhh72&=gACa3HDey)xlBE-!-
zb4MJ|;GBOY)kZyki=9xGuTva%3=FMI<mf6{OEyvUe8v<>SdK7rYaMTC%;oI$X)VG^
zKkTD!)X-}{F_o=igz$y1vQ?d+kva30M`mZOxo@}W<LK}^UYTC)c^5zsEJMC-D;1on
zQ`9~}*H)92HJ*<bytsYWMzsAXAxdeS6&EUcA4OBdc^zYyI37SKFqK7d&}S|G@B`-e
zHB)hnjHfjm>{!F&zUAW~Y6?w5Ge-x>Q}K(JV&mjnaHxys+c~PbA1`aMVQyM&wO(Jg
zhRJ^ru{Zy&$2mdeeH%8KHJ<WZ+WS><DuzuRI16wjuZ3uY0-U?+&qG(D2M7t}z~&Io
zy*c7!E}raDIbS5))Aet>8;-p=JxWvMnw`^Ffj>A?FqgLFHNsnVc3TT|YvQTR1J#KE
zyB|OQu@%P|pt50<Pobn2My@UkMy>YKsTFh~?E@NKWfN_-dgKn+XD$|hBrkfpIXj+h
zFCg+l#*0l<a~(!rXPBgpCy^)U&vp`bg+r%xU$Ty_d!brfyX?;U@_Cmkf-_ozbMH^{
zN%VT=$eYmp#?w<{ohAzFbv~h8?|1=u!-4OrYV_xoEK=&$>6_POcqtc_H{@gtqq~qG
zEajk-R`+ZM^y7+{xUG2KF&M=^aLDFY5_2lNqq3oj{lJg<{4|DcF)QGDa4D64tyIh1
zOvP|_Y$P{W!6<4I3XG*&zbp$0A4-ew#PfWFL6~I?5Odx$85fEKzj9J-`8~-Ao|)t7
zYTvpVs>1YPI`0!0YhQ9I;g6#i9?nXYRlVgG0pCR-w$zMOzX&R3zb-BrcM}`lJP!L*
zKC#$$nxh2Tnqxelc;<A-zL#T<3@JK1@UmZ<@seC_Kj(z=lNg(vkjDOnB4Az8_2-?k
iOxL{k>!{rRZ+Qbk4ksGj1=lYPNu~d)mSo8toBsz)I=hts


From 693803ae25199b5a3edd87a06402e720765d314f Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth26@gmail.com>
Date: Tue, 4 Jun 2024 19:36:00 -0700
Subject: [PATCH 02/30] Add xGPU dependency (using a personal fork for now)

---
 CMakeLists.txt                                | 10 +++---
 src/CMakeLists.txt                            | 33 ++++++++++---------
 ..._correlator.cu => dsaX_cuda_correlator.cu} |  5 ++-
 src/dsaX_testdada.c                           |  2 +-
 src/dsaX_wrangle.c                            |  6 ++--
 src/dsaX_xgpu.cu                              | 12 ++++---
 6 files changed, 37 insertions(+), 31 deletions(-)
 rename src/{cuda_correlator.cu => dsaX_cuda_correlator.cu} (97%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 66682b6..f3e491c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -108,18 +108,16 @@ if(DSA_XENGINE_DOWNLOAD_PSRDADA)
   FetchContent_MakeAvailable(PSRDada)
 endif()
 
+# Get XGPU dependency
 option(DSA_XENGINE_DOWNLOAD_XGPU "Download and build XGPU" ON)
 if(DSA_XENGINE_DOWNLOAD_XGPU) 
   FetchContent_Declare(
-    XGPU
-    GIT_REPOSITORY https://github.com/GPU-correlators/xGPU.git
-    GIT_TAG        7e85bd5da619c026e1bfbb64325ed122323b8854
+    xGPU
+    GIT_REPOSITORY https://github.com/cpviolator/xGPU.git
+    GIT_TAG        13b7fff1eac497236eb9c38e179aed3b532a88f2
     )
   FetchContent_MakeAvailable(XGPU)
 endif()
 
-
 # Add src
 add_subdirectory(src)
-
-
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1b0a548..de025f6 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,7 +1,11 @@
 #enable_language(CUDA)
 
-set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so)
 include_directories(${PSRDada_SOURCE_DIR}/src)
+include_directories(${xGPU_SOURCE_DIR}/src)
+
+set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so)
+set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a)
+
 
 add_executable(test_write test_write.c)
 target_link_libraries(test_write ${PSRDada_LIB})
@@ -80,24 +84,21 @@ if(0)
 endif()
 
 # DMH: XGPU dependencies
-if(0)
-  add_executable(dsaX_wrangle dsaX_wrangle.c)
-  target_link_libraries(dsaX_wrangle ${PSRDada_LIB})
-  
-  add_executable(dsaX_testdada dsaX_testdada.c)
-  target_link_libraries(dsaX_testdada ${PSRDada_LIB})
-  
-  add_executable(dsaX_bfCorr dsaX_bfCorr.cu)
-  target_link_libraries(dsaX_bfCorr ${CUDA_cublas_LIBRARY} ${PSRDada_LIB})
+add_executable(dsaX_wrangle dsaX_wrangle.c)
+target_link_libraries(dsaX_wrangle ${PSRDada_LIB} ${XGPU_LIB})
 
-  # DMH: Fix CUBE error
-  add_executable(dsaX_xgpu dsaX_xgpu.cu)
-  target_link_libraries(dsaX_xgpu ${PSRDada_LIB})
+add_executable(dsaX_testdada dsaX_testdada.c)
+target_link_libraries(dsaX_testdada ${PSRDada_LIB})
 
-  add_executable(cuda_correlator cuda_correlator.cu)
-  target_link_libraries(cuda_correlator ${CUDA_cublas_LIBRARY} ${PSRDada_LIB})
+add_executable(dsaX_bfCorr dsaX_bfCorr.cu)
+target_link_libraries(dsaX_bfCorr ${CUDA_cublas_LIBRARY} ${PSRDada_LIB})
 
-endif()
+# DMH: Fix CUBE error
+add_executable(dsaX_xgpu dsaX_xgpu.cu)
+target_link_libraries(dsaX_xgpu ${PSRDada_LIB} ${XGPU_LIB} ${CUDA_nvml_LIBRARY})
+
+add_executable(dsaX_cuda_correlator dsaX_cuda_correlator.cu)
+target_link_libraries(dsaX_cuda_correlator ${XGPU_LIB} ${CUDA_nvml_LIBRARY} ${CUDA_cublas_LIBRARY} ${PSRDada_LIB})
 
 add_executable(dsaX_reorder_raw dsaX_reorder_raw.c)
 target_link_libraries(dsaX_reorder_raw ${PSRDada_LIB})
diff --git a/src/cuda_correlator.cu b/src/dsaX_cuda_correlator.cu
similarity index 97%
rename from src/cuda_correlator.cu
rename to src/dsaX_cuda_correlator.cu
index 9d9e66d..3bebd09 100644
--- a/src/cuda_correlator.cu
+++ b/src/dsaX_cuda_correlator.cu
@@ -1,6 +1,8 @@
 // -*- c++ -*-
 /* will run xgpu */
 /* assumes input block size is appropriate */
+#define THRUST_IGNORE_CUB_VERSION_CHECK
+
 #include <iostream>
 #include <algorithm>
 using std::cout;
@@ -222,7 +224,8 @@ int main(int argc, char** argv) {
 #ifdef RUNTIME_STATS
       clock_gettime(CLOCK_MONOTONIC, &tic);
 #endif
-      xgpu_error = xgpuCudaXengine(&context, array_hd, i==count-1 ? finalSyncOp : syncOp);
+      //xgpu_error = xgpuCudaXengine(&context, array_hd, i==count-1 ? finalSyncOp : syncOp);
+      xgpu_error = xgpuCudaXengine(&context, i==count-1 ? finalSyncOp : syncOp);
 #ifdef RUNTIME_STATS
       clock_gettime(CLOCK_MONOTONIC, &toc);
 #endif
diff --git a/src/dsaX_testdada.c b/src/dsaX_testdada.c
index c12d704..bbe7640 100644
--- a/src/dsaX_testdada.c
+++ b/src/dsaX_testdada.c
@@ -114,7 +114,7 @@ int main (int argc, char *argv[]) {
     }
   
   // DADA stuff  
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   dada_hdu_connect (hdu_in);
 
diff --git a/src/dsaX_wrangle.c b/src/dsaX_wrangle.c
index 5825ec6..19507d4 100644
--- a/src/dsaX_wrangle.c
+++ b/src/dsaX_wrangle.c
@@ -217,7 +217,7 @@ int main (int argc, char *argv[]) {
   
   syslog (LOG_INFO, "creating in and out hdus");
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer in");
@@ -228,7 +228,7 @@ int main (int argc, char *argv[]) {
     return EXIT_FAILURE;
   }
 
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_out, out_key);
   if (dada_hdu_connect (hdu_out) < 0) {
     syslog (LOG_ERR,"could not connect to output  buffer");
@@ -277,7 +277,7 @@ int main (int argc, char *argv[]) {
   // get block sizes and allocate memory
   uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
   uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
   uint64_t  bytes_read = 0;
   char * block;
   uint64_t written, block_id;
diff --git a/src/dsaX_xgpu.cu b/src/dsaX_xgpu.cu
index a64217b..d065848 100644
--- a/src/dsaX_xgpu.cu
+++ b/src/dsaX_xgpu.cu
@@ -1,6 +1,8 @@
 // -*- c++ -*-
 /* will run xgpu */
 /* assumes input block size is appropriate */
+#define THRUST_IGNORE_CUB_VERSION_CHECK
+
 #include <iostream>
 #include <algorithm>
 using std::cout;
@@ -177,7 +179,7 @@ int main (int argc, char *argv[]) {
   
   syslog (LOG_INFO, "creating in and out hdus");
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer in");
@@ -188,7 +190,7 @@ int main (int argc, char *argv[]) {
     return EXIT_FAILURE;
   }
   
-  hdu_out  = dada_hdu_create ();
+  hdu_out  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_out, out_key);
   if (dada_hdu_connect (hdu_out) < 0) {
     syslog (LOG_ERR,"could not connect to output  buffer");
@@ -283,7 +285,8 @@ int main (int argc, char *argv[]) {
 
     cudaMemcpy(d_din, tmp_data, context.array_len*sizeof(char),cudaMemcpyHostToDevice);
     promoter<<<6291456,32>>>(d_din,d_dout);
-    xgpu_error = xgpuCudaXengine(&context, (ComplexInput *)d_dout, syncOp);
+    //xgpu_error = xgpuCudaXengine(&context, (ComplexInput *)d_dout, syncOp);
+    xgpu_error = xgpuCudaXengine(&context, syncOp);
     xgpuClearDeviceIntegrationBuffer(&context);
 
   }
@@ -315,7 +318,8 @@ int main (int argc, char *argv[]) {
       cudaDeviceSynchronize();
     
       // run xgpu
-      xgpu_error = xgpuCudaXengine(&context, (ComplexInput *)d_dout, syncOp);
+      //xgpu_error = xgpuCudaXengine(&context, (ComplexInput *)d_dout, syncOp);
+      xgpu_error = xgpuCudaXengine(&context, syncOp);
       if(xgpu_error) {
 	syslog(LOG_ERR, "xGPU error %d\n", xgpu_error);
 	return EXIT_FAILURE;

From bb142b1b6f0dc1196bca68d2d0815f4718e2f23c Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth26@gmail.com>
Date: Wed, 5 Jun 2024 15:46:01 -0700
Subject: [PATCH 03/30] WIP: CMake upgrade, add TCC dependency

---
 CMakeLists.txt      |  36 +++++++++++++++++++++++++----
 src/CMakeLists.txt  |  55 ++++++++++++++++++++++++++++++++++++++++++--
 src/dsaX_bfCorr.cu  |  40 +++++++++++++++++++++++++++-----
 src/dsaX_dbnic.c    |   4 ++--
 utils/gen_packet.py |  12 ----------
 utils/packet.out    | Bin 4608 -> 4608 bytes
 6 files changed, 121 insertions(+), 26 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f3e491c..ae509fb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,8 +55,8 @@ set(DSA_XENGINE_GPU_ARCH_SUFFIX
   CACHE STRING "set the GPU architecture suffix (virtual, real). Leave empty for no suffix.")
 set_property(CACHE DSA_XENGINE_GPU_ARCH_SUFFIX PROPERTY STRINGS "real" "virtual" " ")
 #set(CMAKE_CUDA_ARCHITECTURES ${DSA_XENGINE_GPU_ARCH})
-mark_as_advanced(DSA_XENGINE_GPU_ARCH_SUFFIX)
-mark_as_advanced(CMAKE_CUDA_ARCHITECTURES)
+#mark_as_advanced(DSA_XENGINE_GPU_ARCH_SUFFIX)
+#mark_as_advanced(CMAKE_CUDA_ARCHITECTURES)
 
 string(TOUPPER ${DSA_XENGINE_TARGET_TYPE} CHECK_TARGET_TYPE)
 list(FIND VALID_TARGET_TYPES ${CHECK_TARGET_TYPE} TARGET_TYPE_VALID)
@@ -103,7 +103,7 @@ if(DSA_XENGINE_DOWNLOAD_PSRDADA)
   FetchContent_Declare(
     PSRDada
     GIT_REPOSITORY git://git.code.sf.net/p/psrdada/code
-    GIT_TAG        008afa70393ae2df11efba0cc8d0b95cda599c02
+    #GIT_TAG        008afa70393ae2df11efba0cc8d0b95cda599c02
     )
   FetchContent_MakeAvailable(PSRDada)
 endif()
@@ -114,10 +114,38 @@ if(DSA_XENGINE_DOWNLOAD_XGPU)
   FetchContent_Declare(
     xGPU
     GIT_REPOSITORY https://github.com/cpviolator/xGPU.git
-    GIT_TAG        13b7fff1eac497236eb9c38e179aed3b532a88f2
+    #GIT_TAG        13b7fff1eac497236eb9c38e179aed3b532a88f2
     )
   FetchContent_MakeAvailable(XGPU)
 endif()
 
+# Get TCC dependency
+option(DSA_XENGINE_DOWNLOAD_TCC "Download and build TCC" ON)
+if(DSA_XENGINE_DOWNLOAD_TCC) 
+  FetchContent_Declare(
+    TCC
+    GIT_REPOSITORY https://git.astron.nl/RD/tensor-core-correlator
+    #GIT_TAG        11d8a4a504d7073a2a33b81e1e387b12e58a420c
+    )
+  FetchContent_MakeAvailable(TCC)
+endif()
+add_custom_command(
+  OUTPUT "file.txt"
+  WORKING_DIRECTORY ${TCC_SOURCE_DIR}
+  COMMAND "sed -i 's/libtcc\///g' libtcc/*.h libtcc/*.cc"
+  )  
+
+
 # Add src
 add_subdirectory(src)
+
+# Install project cmake targets
+include(CMakePackageConfigHelpers)
+write_basic_package_version_file(
+  ${PROJECT_NAME}-config-version.cmake
+  VERSION ${DSA_XENGINE_VERSION}
+  COMPATIBILITY AnyNewerVersion
+  )
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
+  )
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index de025f6..54467d7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -4,7 +4,7 @@ include_directories(${PSRDada_SOURCE_DIR}/src)
 include_directories(${xGPU_SOURCE_DIR}/src)
 
 set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so)
-set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a)
+set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.so)
 
 
 add_executable(test_write test_write.c)
@@ -50,6 +50,9 @@ target_link_libraries(dsaX_fluff ${PSRDada_LIB})
 add_executable(dsaX_nicdb dsaX_nicdb.c)
 target_link_libraries(dsaX_nicdb ${PSRDada_LIB})
 
+add_executable(dsaX_dbnic dsaX_dbnic.c)
+target_link_libraries(dsaX_dbnic ${PSRDada_LIB})
+
 add_executable(dsaX_capture dsaX_capture.c)
 target_link_libraries(dsaX_capture ${PSRDada_LIB})
 
@@ -85,7 +88,7 @@ endif()
 
 # DMH: XGPU dependencies
 add_executable(dsaX_wrangle dsaX_wrangle.c)
-target_link_libraries(dsaX_wrangle ${PSRDada_LIB} ${XGPU_LIB})
+target_link_libraries(dsaX_wrangle ${XGPU_LIB} ${PSRDada_LIB} )
 
 add_executable(dsaX_testdada dsaX_testdada.c)
 target_link_libraries(dsaX_testdada ${PSRDada_LIB})
@@ -118,3 +121,51 @@ target_link_libraries(dsaX_beamformer_passon ${PSRDada_LIB})
 add_executable(dsaX_beamformer_offline dsaX_beamformer_offline.cu)
 target_link_libraries(dsaX_beamformer_offline ${PSRDada_LIB})
 
+# install step for header files
+set(DSA_XENGINE_HEADERS
+  # cmake-format: sortable
+  dsaX_capture.h
+  dsaX_capture_manythread.h
+  dsaX_capture_pcap.h
+  dsaX_def.h
+  )
+install(FILES ${DSA_XENGINE_HEADERS} DESTINATION include)
+
+# install step for executables
+install(TARGETS
+  # cmake-format: sortable
+  dsaX_beamformer
+  dsaX_beamformer_passon
+  dsaX_xgpu
+  dsaX_reorder_raw
+  dsaX_fake
+  dsaX_capture
+  dsaX_capture_thread
+  dsaX_capture_manythread
+  dsaX_dbnic
+  dsaX_nicdb
+  dsaX_split
+  dsaX_wrangle
+  fil2dada
+  dumpfil
+  dsaX_simplesplit
+  dsaX_store
+  dsaX_trigger
+  dsaX_filTrigger
+  dsaX_beamformer_offline
+  dsaX_splitup
+  cuda_correlator
+  dsaX_copydb
+  dsaX_bfCorr
+  dsaX_merge
+  
+  #fitsio dep
+  # dsaX_writevis
+  
+  #sigproc dep 
+  # dsaX_writeFil
+  # dsaX_splice
+  # gpu_flagger
+  RUNTIME DESTINATION
+  bin
+  )
diff --git a/src/dsaX_bfCorr.cu b/src/dsaX_bfCorr.cu
index 01c45e1..25b9262 100644
--- a/src/dsaX_bfCorr.cu
+++ b/src/dsaX_bfCorr.cu
@@ -45,7 +45,7 @@ using std::endl;
 #define sep 1.0 // arcmin
 
 /* global variables */
-int DEBUG = 0;
+int DEBUG = 1;
 
 // define structure that carries around device memory
 typedef struct dmem {
@@ -264,8 +264,34 @@ __global__ void transpose_matrix_float(half * idata, half * odata) {
 
 }
 
+// Tiled matrix transpose: each block copies one 32x32 tile of idata into shared memory and writes it back transposed to odata.
+// Launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32); each thread then handles 4 rows of its tile (j = 0, 8, 16, 24).
+// Width is the extent of the fastest index. Row pitches are derived from gridDim * 32 and nothing is bounds-checked,
+// so Width and Height are assumed to be exact multiples of 32. Values go from in_prec to out_prec by plain assignment on the final store.
+template <typename in_prec, typename out_prec> __global__ void transpose_matrix_template(in_prec * idata, out_prec * odata) {
 
-// function to copy amd reorder d_input to d_r and d_i
+  __shared__ in_prec tile[32][33];
+  // NOTE(review): the 33rd column is presumably padding against shared-memory bank conflicts -- confirm
+  int x = blockIdx.x * 32 + threadIdx.x;
+  int y = blockIdx.y * 32 + threadIdx.y;
+  int width = gridDim.x * 32;
+  // stage the tile: tile row (threadIdx.y + j) is read from input row (y + j), with the input row pitch in `width`
+  for (int j = 0; j < 32; j += 8)
+     tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
+  // every thread of the block must finish writing the tile before any thread reads it back transposed
+  __syncthreads();
+
+  x = blockIdx.y * 32 + threadIdx.x;  // transpose block offset
+  y = blockIdx.x * 32 + threadIdx.y;
+  width = gridDim.y * 32;
+
+  for (int j = 0; j < 32; j += 8)
+     odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
+
+}
+
+
+// function to copy and reorder d_input to d_r and d_i
 // input is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
 // output is [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS]
 // starts by running transpose on [NPACKETS_PER_BLOCK * NANTS, NCHAN_PER_PACKET * 2 * 2] matrix in doubleComplex form.
@@ -1181,7 +1207,7 @@ int main (int argc, char *argv[]) {
   // get block sizes and allocate memory
   uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
   uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %d %d\n",block_size,block_out);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
   if (bf==0) 
     syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,NBASE*NCHAN_PER_PACKET*2*2*4);
   else
@@ -1209,6 +1235,7 @@ int main (int argc, char *argv[]) {
 
     // do stuff
     //begin = clock();
+    // loop
     if (bf==0) {
       if (DEBUG) syslog(LOG_INFO,"run correlator");
       dcorrelator(&d);
@@ -1226,7 +1253,8 @@ int main (int argc, char *argv[]) {
     cout << "spent time " << d.cp << " " << d.prep << " " << d.cubl << " " << d.outp << " s" << endl;
     
     // write to output
-    
+
+    // write to host
     written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out);
     if (written < block_out)
       {
@@ -1237,13 +1265,13 @@ int main (int argc, char *argv[]) {
     
     if (DEBUG) syslog(LOG_INFO, "written block %d",blocks);	    
     blocks++;
-
+    // loop end
     
       
     // finish up
     if (bytes_read < block_size)
       observation_complete = 1;
-
+    
     ipcio_close_block_read (hdu_in->data_block, bytes_read);
     
   }
diff --git a/src/dsaX_dbnic.c b/src/dsaX_dbnic.c
index 40407ee..83e3e4a 100644
--- a/src/dsaX_dbnic.c
+++ b/src/dsaX_dbnic.c
@@ -261,7 +261,7 @@ int main (int argc, char *argv[]) {
   
   syslog (LOG_INFO, "creating in and out hdus");
   
-  hdu_in  = dada_hdu_create ();
+  hdu_in  = dada_hdu_create (0);
   dada_hdu_set_key (hdu_in, in_key);
   if (dada_hdu_connect (hdu_in) < 0) {
     syslog (LOG_ERR,"could not connect to dada buffer in");
@@ -294,7 +294,7 @@ int main (int argc, char *argv[]) {
   
   // get block sizes and allocate memory
   uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu",block_size);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu",block_size);
   uint64_t  bytes_read = 0;
   char *block;
   uint64_t written, block_id;
diff --git a/utils/gen_packet.py b/utils/gen_packet.py
index 8803832..2ae1bee 100644
--- a/utils/gen_packet.py
+++ b/utils/gen_packet.py
@@ -214,15 +214,3 @@ def histo_test(data):
 
     
 #plot_spectrum(out_str,pol=1,ant=1)
-
-
-    
-
-
-
-    
-        
-    
-    
-        
-    
diff --git a/utils/packet.out b/utils/packet.out
index de3b9a47bdebd485332ff433d1644d6d6ff77d33..34e6909992a277b32cd475dc5c7f9f04da910749 100644
GIT binary patch
literal 4608
zcmW+)&v)E5k}go*Q$+me%x&EO*`C{4r0iT1wY}%v&3pTodiT8ZA~|PEWarH(RuiCd
z8#hp3=VlSL<vDdfu?ZAVUwuCWE*0v0@BHjc!5nlz*{lKasl@<AnX%mn=g?X=<Tw~T
z#YG9+I+@!fbx`F0rzk6<o+OnbN^+yS=`}=QzAo#KL-N7_<heT0q3Ysf5A$4fHp={o
z=TSK=C@TIuK{1PV(WCWyo!ylLqv7$o1G$SNh6)A}u!SB|?t>e4S(FWz+KX5=k0#xq
zA)|sjI#5k2{iJ=yq++YCR0dKIVIz(?m~kZogI6z&!a&zbZ_VX!pkwSO$i?_`U7D?Q
zO6%YMTb-6B32}ihoNJcxl&B%ce@x}Pn3oTga+5A|*b1zJnj55|on7qw&Rlh;Q+HJR
zojoghQ;sU#$lqxTlvKGwe5ocJ$&$6|KKn7<YdYEocjHc1r8(B#xeF0zKG5XESEYpC
zxVPh5EdnV2QaMNiV!?}PtaGvADD%IAD|lVVr?=-J8ei8cm)^{satahGebjYQ|N9vf
z2%`H<wynxl;c7IdO(#tOaYC<~Cll8rWV?8hUhVnXrp(V~-!I&%gC@1Y)CMd3w5W*O
zN$HY8@nY2_32=2SNKM@yh0aa>C2Ev|MHx<4V3Hm{h)y?Kp&FueZL=#d6AId+dFrQ8
zqqFDwx?}{*DktmHlwLSX2+;Uv#8DNQJSg+ppHWyp5~0FbpJ-B|PZ`6Sq6S4!G;e&;
z%I6@`3L@Z9Ku)5xd#t9$8d7nDfc>{`Wn;lZpMP{kOx+Rg^2#-<E$V%pI)k-t;xOs1
z5anKBKR7Y{%-_9PHt5y{K#H8StlPmiH$GkYpn*V-h{)rT`RcrmNxc3MljNS%{jza?
zcq2%2^R!Csq_kM|m!pNSb_FNl)Y|JHms8ee?RP@$wWkk#u9{>?B9ULP@JaQrUixR@
zm*7p~f58eQ!P$b$i0<3AVzae6Lwnb(@msxf8IRD2h1yf!M{7jvbKxdZc<)l`BuW*g
zbI!YGo$jxRmdi|z4E@nK6&@2}1PD#HT>u$bM7p?pWeWdBk3s-hFizHtv#;2M29B3^
zdLumMp-qjmsfeB7H=EQ;{MxzpUq|rzWS#JQxu9!whL@BY89ELG(&uVN>)UU<kT=AW
zixK6clzhf3^=FACf?rqXmI5eAYYegoZGaj{Ls;y?Jls&u@3bMFSKCPS7WK~<{{@Dn
z7?izr2wJYK!qPZwzW)v+oaiLAC{C<KrIT5*M1LtQXolWqFRTjU7L*Vh)pv=ha8ZkS
zV7km)aK?>?_sXH~C6B5@Wmki{`lB=a#w}MMYrezVvywXohr0YSlqqEe^+u@ZYMwlr
zrLa>0TXH_5Q-bmQR1Mw`(;pb+o@^BrxaAsMr+ej*0IT0I=E3kv5}`Q4RfCIm^rR=F
zm9;x7Rgz!_9%$%%hp<kveyQP&>eF?dd6sF6@?*l4<T!t$|I=rvqpFxdmc$?_!(?My
zRhO{s6b>**itB0%sN3O<7lY&tzQ}WBRqmN{YVYqKsJysaXubTB_Q4v7m^U&+8pNV$
zP^6s6Nl|Mpx5tZG4hrPa9(W9~;VFhqes6H9;<AD+eDec7SHGA+@lgzqzACEql~9cK
zpZt+&yHYdRHP_C#c^W`33XIH)(Ym?))f`Z7@{$u|NXpze8B1kxJ#N(@AD@z_7^}ir
zW=#iH>M!AK7NRNFg<DgA`c(2xiDD&Y?TBROL?xW0jt|Uxwc)WkiTgK(68SIkLXVQM
zx6KjCIzRmvKI5uRFdh9zNZBCiel0fLdRJUFzGCJ~lC@{mS}#r?$9E_frCD>GEzbBN
zJqa<(e#qauBg>pf*zqtKVRzIjn_N-21+KqiP7=5MO<?3=a2~K;^&&2gXt0>3x#{=A
zBFmyHz$CPaN;S66>70Al$)AoU)R;#JmHs<Vz+s_AGuvcmEG;;t%4VcM<3Czis8?y+
zxYH_P?vIU!l+L;rv~~<pATRmF8oJdsNm|=VAH<0lN`6ShO%R_YIUXX}xKi#I1oe^i
zKy6*WGJ__j^5dBF)xo=@A(Q6=3ud^79OU(+<iZplo0?jYb6xJt_Iwr#L{`V3PE0_v
zSeR&(4!KJ^W_q05a+=(&*8}t1MT~uvx-9>ui($&nILH$?buqVhIe=FurX=I93-)gv
zmZty*?!M^l5q8c2@uf_>mGQGa9<+GLcag^dPi~uf)xQtU5t#b-TsEj-5a;aw3F^9Y
zN*V!;-b*N?RGD%_BjM;+*LfzAR1e`2Gl@ynu0h2p^5w;<G$WXvbH~c%-V1d#sS;9c
z7C&TOrIZ$IJ&dmgc4C*JRWAm?lRGtYC(kz2(lO1SXcId|q-GQ3G&_nfMYpW0MH<TT
zLt0LHSGFs8*t79ickVN)4J%2$OZZS>m7pXD44loY(_cxOz|RU^bv=!=1iR1G`V6vm
z2~t@iZKr%yms;WZx^&)@yAa6sl40)fL({ri$KqaoQb%OTqPQ3qu;--h-!g-wwM5nK
z2dr{Y`?|4^$eFEFzT55DbdsxlFb+Fx2RQ6;d6y0uTe;22_ZZBo>dLX?PYy?8wNK}u
z@8z|Zte$RmzGIRn+A*W5)rywggL{pvumhVI5spX^x`C$@_>eRAB?clUc)bv}^Z0qd
zXtZp*CKg?g<x0G4)Xz<}$+S2>U*~&@PNRXozbi2CFjT+dhn_hDhGF=d{7$pHZPXAb
zZ0pNh>m{IaO~O1$o_a+Ue-w6eIWtL8UJF~S_AUV6IfhYi$M3bPWH7N8Exfr?pNM}~
z|HJFB%Ds3s>$tCZ4Ytx|krLBMQ2O=WxZ_kRK|A4u{*8QHa23ftgYFBv#r(8nIL++s
zdo5gT$4_ryKhhjce)8-Zfsf5H-z^Nr9~rMM=TWnfq)R;E1e?RH=u})hm?5$KTaV^h
z3+Ctz-!%bvHeB_Z|If*S#S<?!vTZJUyGVY>Ruc6k3@r5xC8peTmD2N_eYhrWVwRG{
z=06&j)ui)8%mjG>|4QP)QPs9(cvwQtpPbk~eLG5Fl=pJ+ABANjW?_DuWWa_+8(8+g
z=re@+fXkS#@@r)$c8c(ybo)4q^X#1l`SkiP2H*HJa_<DD$I~M(;$P_>w%rE4vrTPE
zP4X()N|>5|#~QdcL$u`YuhUvxkv<Pg2zl?CW9P}AU^BCrv(W|>Jx49WRmzQ`om9;e
zq#w5%v9rJmUT1#*p6PR8<ow25AxDPK&6$;l$$?p&O%{9Mi8Df<ynH7pa4?X=GROnZ
zN1ZwIh;(K)$=U9USQGNjWs><arfJ`d7ALgC0z$9D2<=!3O~ZjT9(o?EsqO->z%kc#
z!$CuJTr!>%n(KXWo;!hYD=!rsZzr;1eweo#D!w<OK19~@yBi)B7y@G$Ow$%*eP?UW
zLNC4*rpAcxU>R4%p6?^u$mx48w=sACQQxXfW^cwa>~E#w7$H=ORS0+sIwL!cznp4U
z;%2mR<SgpznclCYegnm4nK=hB%DNwdSM!pdP`A1C?^GjLBo$NAV%;){_+Cdwz02J@
zHs6*@bsHH^)DcS$ThG0A=9K)OU_ATU+(gmnYQGdtJ+f?PwxLPn$LEKx?m5FS$#bq_
z&rqd6xGT3%8ICAadt_1*m+w3$qy{ZOKMBJeCK@J#w5)7;n{8nFBz(Q$L<~`G81lJN
zRIWInCG&B|OI-UqccX6Q#=9YiR<Nh>CNT-XU<3|f@bXAQZ#fv<ebbV(utj`Bac@5|
zbz}|<nk^kJu@xM5XPe{~a~erbD!Eg6X<7EBZ;1&wsK5L^FX&@nug}!wLE@(Kj)eMN
z2nE}FX4P~RZY0ppT&9q)=A*b{)NIv7!$7KWF$164Ulb(g^C|}y-Ggvi>eu?@D6O%&
zyk4qS%+`mMg4J&I^(Zu3Lw2AKTu7%w%#Nv%DM$*1-n}3716if`Uj*G((9nFEf-ZEp
zDPo|V#tT76e-x7S1NGWn8@}WFfn8$D^swxC+^VX_V(HzhlG>YJ7=y?%Ahh@KEN8aU
zP2@D@FVrfzrmWa*<=ThDE&$SBsU6v!{nMDS*1;0x8<7&4M?kiW4(|1Q(WpL5D!?WC
zsW7Xa^LP#I27a>2n21dIARbD1PRu3=qjYtL+_9zEuy<v9<tjTKm}hz&=@D{1vfq_^
z&*4J!S$$wndJhZtDrdCp_c_w})Nlx{IWF*PwFGqj1~h)sH_4V+>$qaext-6DQS6JP
Q(8{YQee{VQIaCb)58tM{bpQYW

literal 4608
zcmWkyyPMoNaxbvu#5=^JXB~M6kgZfr&TOt3uP?Pv=l{^(=iJs^Zr@10a96|by-?Ex
zT6@u|fdYFSZ4>319K@sQ_bLJm7ny4)ql5q}+Nd(3EcGN3C6vm^lnbJ9<0H6gDSZM<
zsIhDmWRZd14u!vsy^l&SS;?eYojN1S+L)<G_;9LDL0ftbA?6G_^LPX4OuZtqckqSx
zUR={;)G0>Fy@bys#@$q?w4&11cS4K!bJ+uu$jU|m(*0U&jG*xjYrGd(I2{ZAaEj~0
zfmFK5d@-`MUuICk%e5r6K8MvXPD<ozSLA|TPpR(;j&d4ezn{EX{S>p0TEWRpWI{Wv
zv87<791j&AkhE@sR*OCmeO6;3)fHGC2WSycEj}NdBaM@z_Fnv!y{faXp@ptn!z4fq
z)d|PH0SYGqh<0m@@V@>c{l})%W_cv=x{7TEC!T~Dr5Hw9rlip4Hl!lxo<B`^qR=ZZ
z$yO?F!<!MQtYq=SdY775OQ9x}`eI;5MlC91nr`_j`@~)Bt-s|R{NSBYS(Z~CmNqwv
zH_xi7LFM|t{n@87;idr@`K{T&UMq#mk0KpkQ(LOD3fa`A_(^_Wf23_G7L_FTE!UvD
zcTl;qNoyt5PHVlLtr~Z#8&8@Ne71uG?lasv-G-sO)xT||1XKFFg5=Q&bhL=Cqk=1P
zS>?6w<-_)rA+5;!vNJmmUyW%D9jMa+f^^oW)?r+mIsd3jp6#jh)M|VFd!cH^;O)aM
z&Z<7B3UCb6p0Bb~)qxh=0fxL6igqy$V-Z&7W4k4B>`p0TUdV$canLln;IGKU$~g6(
zc9Ge`snSXtC6t+u@xg}~*KO&t!xSG~RHtWYQs73{m<~||U+mKum7;it`?=P|!6}Eo
z(texP`83mX)wnk4iTkF-X6?%bty}80x#T7tkLz?@qzN>y^Naz;R5!mT2e!72*?di6
zubidSsl}~(LQj5HsOQ4>e`t^?Rhu+VTpR|qc6}F`SGb#-M6*MF9y1C{`YyeCli}G9
z4UUL5hf9KnRL$Q{KThb@@;dONC(IIan{%wFm0uEO0*Y~`e&?5ORZuE%Q-l#%g^!ss
z<eflP4GF!n2V<Kh5vj518fp)B)w+98j5kg#!V}I{7D!j#qxn@!l1LM^v%B_kIs$A?
zyh=j}()e+m#b*=w-~>APV{G66^H39u(5hVIIPTS_6j03SGVE5^nUX7@snWM`r3(B{
zb3na&J_ofeaUjj5bc;8s%BN~yPvn$rNob4SUz%N2AI9OHiJ7`Xc5c@Cgwm~BW}Yai
z%O$NzxIF6g1Q8=_gx1%6sY6EFrk0<2qjV!YQ@{5^E~5Zo0Jnc*!sV{b!JWS#)YBu<
zCUu0Q_2&Sr#|%<#<jQ^NujlQ;s6)gO&_ZM#`T=k~uYFwN45ef>@MSu2nW6uH%x8a6
zxI04B*M!@rw4^HJ+LoEPyNjaB)yHl?H?o3_Yu=W(a5uKQ1f%yXf=WES$sMj>6eQ?}
z-I@TdD~%LDYuGFh;aD1F|9L%%_8KXTHdcD0+ZfJw+Dn;y>p&D>QqjiSS@M}nLrIS?
z?8a|da_y~(_++q5a(+?>vZ`+_Gb(P)Y$%L5EyJv;dl6oX1^IvnUay6S-}NOAnF!^(
z;Z(HD3}Intakjn{A@_P!xn~oG6PE#EF&$p-9jc<TTS3k$Xb(g7a^u_n;=EvmU)myk
zL9OoTt5^7Fmfs=Glb@7-u|Exq87waLFH>B&LKtPaKlzBQ#?G&k=+3e#Iv)QujCw>T
zLBGEivDb{%^u4I7;Sv9#;wK8ktQjo7be|IQ#$<6sd6T_ceMnxz{gbuqI0W63VQhGR
zT<0WQ$k#8dsZYCv6xrR_eY2V~1$6+nXnQNoC&RX}(p{YWyk(Ni?_B555*+_P3>xL~
zWTVQrm$fL*eQGAuH2b4%;5v0pjZ!~CR!65N^3P2-Mau@N^E&|ws=Q7{Ib6LmxYS~@
zg#|w=S1aEvLM4?%E5h?#J-~pP37P~|q@KS`cxJ4}#<A(JYCW1R+z8!h@^S_p-obUS
zL=+OKeBg7U^KH8-)<++|sl%YWQ?ZOo?uThUnYsi!VyJn6vOA||6u{V8+6^5K{QL#X
zStZim??-ljwc|q%2d5jm^KL7?YS?-N4W}EI&+Rv^IXbKQxi#dDQF^BK4kV-$OYi?o
zOz9W99o()`svXD=?O1|2IsXb52&gTj2^xK3jMS<}*PHv!_DVKvO*weX)eBukAf=pP
zBM@r`juVzIMjdtv)H-^gLjFyi6AfN`BB|;U!N*HhEBesX^8}-M#S-Vz_XBIP3#Xe?
zJn4EwJE;MM5_bA}(dn$bor=bs8c-*x!XltYk$Z9VXP=kkZ#%3#b5CeY-Afpm8Bqx4
z8>=&y5JPc9nbG#>H+zAyLp^Y0!*k?F!=3^#6K3x9Y93j%jhe>39NomIjyGQPQS{5s
z4)gTE(s&i(+~R0UCA{$4ujTe&blWmZ-!q(M;Idd9SdtnG_W(%~R<tF_j>$2h^AB!%
zqrNLtM<QsFRBum7JS|%eb<KDraZ`ROgttbeYPXiaI;UZs!W}#Qe(#_t85GwxbEII#
zv7-|MCgII4yk9X?{#a=J?`$}pLKzaOhx{S&ZlNl9sywjSiGa%dI?DB7dAdM*2KRGp
z6*U}K2Pc||S(k|+^RVU2CRFd;*PBuoUc$`$KOpjnqX4=Wx?;WWK~3NKwkWR;Jqeq)
zX{7Km^{o+LI9;g`_Qk(ryj{B?1yzyrE<pwn2j^a*QM;C-4}#O+V*eUTK}gug6ob;>
ze%wl3e`%BSHjXdp-5JNAc2$;s+eAN8@hTkpc$pJ_Sr+D>hq@<k$7dz+7$WW*dG>Oz
z#I=JP+0IV`>wQL6qc?FC>G)4+7`{5Wre9jfeSy3`Zj#*notFi|0=anEiO5>1lrH<I
z{&daUbQ6}`vLW2+aJ;MiknC(YQ9V9BJl5?#SW)ogj`*r}HLuGoD~vXLn0UX>N$LSO
z9;N4q33|i(s}`iyF`_r!zAWWRS+dK&DUv((>;bV!Z>C}&&p($>z8l7Nd`>4(nunv4
z$R?(p0q>LY++cmD)*0JbEf2Isua<>*%I$hwKQQELrSfusLDHNRFgVtndj`=)T!?ml
zKJ}w6kRHH)j01#f(Mv>Dz$Y~h2li<7n_`YHz$r35eaP~^aPFC39LCZHkMfqG)ESXO
z@zo1ldSK3FZW{T|<xM%huNw4t&Y&7G2~vH{p>_H_k{WcsX{<YV1(*X@ixjx!ka>gA
zbKHVC(UM1R#N*WlR&qLr{!TE?#YUIQ9Kg*ska56v#1Vo45h^7`xJ<xV+%HW2t7mqh
z)_UMXPr0h!=t6VS&&0Vu8V}_@jqqqz8s2XQJncNsJasC6m?D#r+7NiSU_ItE9|te1
zwUg8=mDFjXu}nMR;~mcxDjm1jOanj0sKM_a;bAw(ufPme6{FZZiLSpO)<O!+F035j
z+2t5`eA0Czi<%GWEh842rm)3$M>adCrL0z&<J^<-$mJ&Nd60E}F)zl6)5MX(ZMfv%
zU(<ufnuLsyW+2w3sN^)If!S}~&L+9+`hIy1uCReH(WJObYRcR#5Y^l;yLrqjsnF?}
z&%hwWQ4vh5G^^O(|1aOBcv0q`%2_>O)ff=DgoAj^G1e5=%@?Ws-GQyVrsZK-ELLE!
zQ#>?sg!k&WH;uy5=5_N)?ofO<h6_u|>k&wrQT7YZVpHWfqLj$$?3opd=Z?Xgb6cdo
zmG2*UPS|9b-eytmAWsbDX69F-@WU5(O$V<zQIztVhNqwVvGo(;_B`Ri4G`P(WrG#6
z)4U8%3NQ8(hv1?_P3s4Cn+0KRRVPYNZ1B0R@!(O05w48RATe=7L=bNH3;t<4=eF4h
zAnRGB*c~}}>Bs}K7+srx>NzBx%GHqxKpbkWovXFCctch@@VjN>Sr$3Xrm4#GT>ga^
znr$v-(Cla(7IcroTXXow<>eQF{ge_}Vl-C2m1~1jeOv=&Hw1@A^!Gf3lGirpjOQNd
zKTZqk4qMKd@2Qe3UfCCcqT+#@3Q|uGoB<G3eM|3NhJ*VBegV8mcU89!S%}-RW9+?v
zgsl5g!Y)YfrUPd<p;OAx0P3g2seZOgMV*x&6VE+<m-`zQh&*7>V-j4U?!d$Oahv^;
zJIa65g=6cGNNSK5;+SL&P3*e0(rlhC@f?!15?g%-p@91WgDc3x=7NWhj&-4j<doI^
Ztv}FbpV-oGQckS3_OrwC8b``A{XdgG9`67E


From 18352cfa9d43ce50b7cc4895dfd04bb0fe7fb456 Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Fri, 14 Jun 2024 16:00:13 -0700
Subject: [PATCH 04/30] WAR for pthread detection

---
 CMakeLists.txt     | 2 ++
 src/CMakeLists.txt | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ae509fb..08a326c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,6 +17,8 @@ set(CMAKE_CXX_EXTENSIONS ON)
 
 # Define the project
 project("DSA_XENGINE" VERSION 1.0.0 LANGUAGES CXX CUDA C)
+set(CMAKE_C_FLAGS "-pthread")
+set(CMAKE_CXX_FLAGS "-pthread")
 
 # DSA_XENGINE may be built to run using CUDA. Future version may be
 # written for HIP or SYCL, which we call the
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 54467d7..fbd3f3a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -88,7 +88,7 @@ endif()
 
 # DMH: XGPU dependencies
 add_executable(dsaX_wrangle dsaX_wrangle.c)
-target_link_libraries(dsaX_wrangle ${XGPU_LIB} ${PSRDada_LIB} )
+target_link_libraries(dsaX_wrangle ${XGPU_LIB} ${PSRDada_LIB} ${CUDA_nvml_LIBRARY})
 
 add_executable(dsaX_testdada dsaX_testdada.c)
 target_link_libraries(dsaX_testdada ${PSRDada_LIB})

From 4082f9b0fe443698d4e8adcf613000e1f94acc17 Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Sat, 15 Jun 2024 22:35:39 -0700
Subject: [PATCH 05/30] Create CMakeLists for automatic linking and building of
 dependencies

---
 CMakeLists.txt     |  73 +++++++++++++++-----
 src/CMakeLists.txt | 169 ++++++++-------------------------------------
 2 files changed, 82 insertions(+), 160 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 08a326c..d4328d9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,9 +17,15 @@ set(CMAKE_CXX_EXTENSIONS ON)
 
 # Define the project
 project("DSA_XENGINE" VERSION 1.0.0 LANGUAGES CXX CUDA C)
+
+# GCC 8 and lower need -pthread passed explicitly; append it so flags supplied by the user or toolchain (e.g. -DCMAKE_C_FLAGS=...) are kept
-set(CMAKE_C_FLAGS "-pthread")
-set(CMAKE_CXX_FLAGS "-pthread")
+string(APPEND CMAKE_C_FLAGS " -pthread")
+string(APPEND CMAKE_CXX_FLAGS " -pthread")
 
+# Add this project's cmake/ directory to the module search path (PROJECT_SOURCE_DIR stays correct when built as a subproject)
+list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
+include(cmake/CPM.cmake)
+
 # DSA_XENGINE may be built to run using CUDA. Future version may be
 # written for HIP or SYCL, which we call the
 # Target type. By default, the target is CUDA.
@@ -97,8 +103,46 @@ if(GIT_FOUND)
   endif()
 endif(GIT_FOUND)
 
-# EXTERNALS
+# Use ExternalProject_Add for libtcc (borks with FetchContent)
+# Use ExternalProject_Add for CUTLASS (long build time, version 2.11.0 for sm_8x arch)
+include(ExternalProject)
+
+# Get TCC dependency: fetch and build it with ExternalProject_Add, or require an already installed libtcc package
+option(DSA_XENGINE_USE_TCC "Use TensorCoreCorrelators for correlators" ON)
+if(DSA_XENGINE_USE_TCC)
+  option(DSA_XENGINE_DOWNLOAD_TCC "Download, build, link (and install) TCC" OFF)
+  if(DSA_XENGINE_DOWNLOAD_TCC)
+    ExternalProject_Add(TCC
+      GIT_REPOSITORY https://git.astron.nl/RD/tensor-core-correlator
+      #GIT_TAG        11d8a4a504d7073a2a33b81e1e387b12e58a420c
+      )
+  else()
+    find_package(libtcc REQUIRED)
+  endif()
+endif()
+  
+# Get CUTLASS dependency: build a trimmed-down copy with ExternalProject_Add, or require an already installed NvidiaCutlass package
+option(DSA_XENGINE_USE_CUTLASS "Use CUTLASS for GEMMs" ON)
+if(DSA_XENGINE_USE_CUTLASS)
+  option(DSA_XENGINE_DOWNLOAD_CUTLASS "Download, build (only the required kernels) link (and install) CUTLASS" OFF)
+  if(DSA_XENGINE_DOWNLOAD_CUTLASS)
+    # Pinned CUTLASS commit; passes arch 89, the planar-complex GEMM kernel filter and our install prefix. NOTE(review): NvidiaCutlass_DIR is not set on this path -- confirm src/ still finds it
+    ExternalProject_Add(NvidiaCutlass
+      GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
+      GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
+      CMAKE_ARGS
+      "-DCUTLASS_NVCC_ARCHS_ENABLED=89"
+      "-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex"
+      "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
+      )
+  else()
+    find_package(NvidiaCutlass REQUIRED)
+  endif()
+endif()
+
+# Use FetchContent for lightweight dependencies
 include(FetchContent)
+
 # Get psrdada dependency
 option(DSA_XENGINE_DOWNLOAD_PSRDADA "Download and build PSRDada" ON)
 if(DSA_XENGINE_DOWNLOAD_PSRDADA) 
@@ -108,6 +152,8 @@ if(DSA_XENGINE_DOWNLOAD_PSRDADA)
     #GIT_TAG        008afa70393ae2df11efba0cc8d0b95cda599c02
     )
   FetchContent_MakeAvailable(PSRDada)
+else()
+  find_package(psrdada REQUIRED)
 endif()
 
 # Get XGPU dependency
@@ -119,27 +165,16 @@ if(DSA_XENGINE_DOWNLOAD_XGPU)
     #GIT_TAG        13b7fff1eac497236eb9c38e179aed3b532a88f2
     )
   FetchContent_MakeAvailable(XGPU)
+else()
+  find_package(xGPU REQUIRED)
 endif()
 
-# Get TCC dependency
-option(DSA_XENGINE_DOWNLOAD_TCC "Download and build TCC" ON)
-if(DSA_XENGINE_DOWNLOAD_TCC) 
-  FetchContent_Declare(
-    TCC
-    GIT_REPOSITORY https://git.astron.nl/RD/tensor-core-correlator
-    #GIT_TAG        11d8a4a504d7073a2a33b81e1e387b12e58a420c
-    )
-  FetchContent_MakeAvailable(TCC)
-endif()
-add_custom_command(
-  OUTPUT "file.txt"
-  WORKING_DIRECTORY ${TCC_SOURCE_DIR}
-  COMMAND "sed -i 's/libtcc\///g' libtcc/*.h libtcc/*.cc"
-  )  
-
-
-# Add src
+# Add src, legacy
 add_subdirectory(src)
+option(DSA_XENGINE_BUILD_LEGACY "Build legacy code (will not install if built)" OFF)
+if(DSA_XENGINE_BUILD_LEGACY)
+  add_subdirectory(legacy)
+endif()
 
 # Install project cmake targets
 include(CMakePackageConfigHelpers)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index fbd3f3a..748f00b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,171 +1,58 @@
-#enable_language(CUDA)
+enable_language(CUDA)
 
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)  # project headers; same directory as the old "..//include", spelled explicitly
 include_directories(${PSRDada_SOURCE_DIR}/src)
 include_directories(${xGPU_SOURCE_DIR}/src)
+include_directories(${NvidiaCutlass_DIR}/../../../include)
+include_directories(${NvidiaCutlass_DIR}/../../../include/cutlass/util)
 
 set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so)
-set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.so)
+set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a)
+set(NvidiaCutlass_LIB ${NvidiaCutlass_DIR}/../../../lib64/libcutlass.so)
 
+# Small CUTLASS example programs, built only to test linking/benching against the CUTLASS library
+#------------------------------------------------------
+add_executable(planar_complex planar_complex.cu)
+target_link_libraries(planar_complex PRIVATE ${NvidiaCutlass_LIB})
 
-add_executable(test_write test_write.c)
-target_link_libraries(test_write ${PSRDada_LIB})
+add_executable(10_planar_complex 10_planar_complex.cu)
+target_link_libraries(10_planar_complex PRIVATE ${NvidiaCutlass_LIB})
 
-add_executable(test_read test_read.c)
-target_link_libraries(test_read ${PSRDada_LIB})
+add_executable(11_planar_complex_array 11_planar_complex_array.cu)
+target_link_libraries(11_planar_complex_array PRIVATE ${NvidiaCutlass_LIB})
+#------------------------------------------------------
 
-add_executable(dsaX_trigger dsaX_trigger.c)
-target_link_libraries(dsaX_trigger ${PSRDada_LIB})
-
-add_executable(dsaX_filTrigger dsaX_filTrigger.c)
-target_link_libraries(dsaX_filTrigger ${PSRDada_LIB})
-
-# DMH: Has a 'sigproc' dependency, low priority
-if(0)
-  add_executable(splice_offline_beams splice_offline_beams.c)
-  target_link_libraries(splice_offline_beams ${PSRDada_LIB})
-
-  add_executable(dsaX_writeFil dsaX_writeFil.c)
-  target_link_libraries(dsaX_writeFil ${PSRDada_LIB})
-  
-  add_executable(dsaX_splice dsaX_splice.c)
-  target_link_libraries(dsaX_splice ${PSRDada_LIB})
-
-  add_executable(gpu_flagger gpu_flagger.cu)
-  target_link_libraries(gpu_flagger ${PSRDada_LIB})
-endif()
-
-add_executable(dsaX_store dsaX_store.c)
-target_link_libraries(dsaX_store ${PSRDada_LIB})
-
-add_executable(dsaX_fluff dsaX_fluff.c)
-target_link_libraries(dsaX_fluff ${PSRDada_LIB})
-
-# DMH: intrinsics compilation error
-#add_executable(dsaX_reorder dsaX_reorder.c)
-#target_link_libraries(dsaX_reorder ${PSRDada_LIB})
-
-# DMH: /scratch/CPviolator/work/DSA110/dsa110-xengine/src/dsaX_nicdb.c: In function ‘process’:
-#/scratch/CPviolator/work/DSA110/dsa110-xengine/src/dsaX_nicdb.c:145:65: warning: integer overflow in expression of type ‘int’ results in ‘-1073741824’ [-Woverflow]
-#  145 |   uint64_t shifty = (bdepth-1)*NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NCHAN_FIL;
-add_executable(dsaX_nicdb dsaX_nicdb.c)
-target_link_libraries(dsaX_nicdb ${PSRDada_LIB})
-
-add_executable(dsaX_dbnic dsaX_dbnic.c)
-target_link_libraries(dsaX_dbnic ${PSRDada_LIB})
-
-add_executable(dsaX_capture dsaX_capture.c)
-target_link_libraries(dsaX_capture ${PSRDada_LIB})
-
-add_executable(dsaX_capture_thread dsaX_capture_thread.c)
-target_link_libraries(dsaX_capture_thread ${PSRDada_LIB})
-
-add_executable(dsaX_capture_manythread dsaX_capture_manythread.c)
-target_link_libraries(dsaX_capture_manythread ${PSRDada_LIB})
-
-add_executable(dsaX_split dsaX_split.c)
-target_link_libraries(dsaX_split ${PSRDada_LIB} -lm)
-
-add_executable(dsaX_merge dsaX_merge.c)
-target_link_libraries(dsaX_merge ${PSRDada_LIB})
-
-add_executable(dsaX_simplesplit dsaX_simplesplit.c)
-target_link_libraries(dsaX_simplesplit ${PSRDada_LIB})
-
-add_executable(dsaX_fake dsaX_fake.c)
-target_link_libraries(dsaX_fake ${PSRDada_LIB})
-
-add_executable(dsaX_splitup dsaX_splitup.c)
-target_link_libraries(dsaX_splitup ${PSRDada_LIB})
-
-add_executable(dsaX_copydb dsaX_copydb.c)
-target_link_libraries(dsaX_copydb ${PSRDada_LIB})
-
-# DMH: fitsio dependency
-if(0)
-  add_executable(dsaX_writevis dsaX_writevis.c)
-  target_link_libraries(dsaX_writevis ${PSRDada_LIB})
-endif()
-
-# DMH: XGPU dependencies
-add_executable(dsaX_wrangle dsaX_wrangle.c)
-target_link_libraries(dsaX_wrangle ${XGPU_LIB} ${PSRDada_LIB} ${CUDA_nvml_LIBRARY})
-
-add_executable(dsaX_testdada dsaX_testdada.c)
-target_link_libraries(dsaX_testdada ${PSRDada_LIB})
+# DSA Fast Time Domain CUTLASS interface (executable linked only against the CUTLASS library)
+#---------------------------------------
+add_executable(dsaX_cutlass_interface dsaX_cutlass_interface.cu)
+target_link_libraries(dsaX_cutlass_interface PRIVATE ${NvidiaCutlass_LIB})
+#---------------------------------------
 
+# DSA Fast Time Domain
+#---------------------
 add_executable(dsaX_bfCorr dsaX_bfCorr.cu)
 target_link_libraries(dsaX_bfCorr ${CUDA_cublas_LIBRARY} ${PSRDada_LIB})
-
-# DMH: Fix CUBE error
-add_executable(dsaX_xgpu dsaX_xgpu.cu)
-target_link_libraries(dsaX_xgpu ${PSRDada_LIB} ${XGPU_LIB} ${CUDA_nvml_LIBRARY})
-
-add_executable(dsaX_cuda_correlator dsaX_cuda_correlator.cu)
-target_link_libraries(dsaX_cuda_correlator ${XGPU_LIB} ${CUDA_nvml_LIBRARY} ${CUDA_cublas_LIBRARY} ${PSRDada_LIB})
-
-add_executable(dsaX_reorder_raw dsaX_reorder_raw.c)
-target_link_libraries(dsaX_reorder_raw ${PSRDada_LIB})
-
-add_executable(fil2dada fil2dada.c)
-target_link_libraries(fil2dada ${PSRDada_LIB})
-
-add_executable(dumpfil dumpfil.c)
-target_link_libraries(dumpfil ${PSRDada_LIB})
-
-add_executable(dsaX_beamformer dsaX_beamformer.cu)
-target_link_libraries(dsaX_beamformer ${PSRDada_LIB})
-
-add_executable(dsaX_beamformer_passon dsaX_beamformer_passon.cu)
-target_link_libraries(dsaX_beamformer_passon ${PSRDada_LIB})
-
-add_executable(dsaX_beamformer_offline dsaX_beamformer_offline.cu)
-target_link_libraries(dsaX_beamformer_offline ${PSRDada_LIB})
+#---------------------
 
 # install step for header files
+#------------------------------
 set(DSA_XENGINE_HEADERS
   # cmake-format: sortable
   dsaX_capture.h
   dsaX_capture_manythread.h
   dsaX_capture_pcap.h
   dsaX_def.h
+  dsaX_cutlass_interface.h
   )
 install(FILES ${DSA_XENGINE_HEADERS} DESTINATION include)
+#------------------------------
 
 # install step for executables
+#-----------------------------
 install(TARGETS
   # cmake-format: sortable
-  dsaX_beamformer
-  dsaX_beamformer_passon
-  dsaX_xgpu
-  dsaX_reorder_raw
-  dsaX_fake
-  dsaX_capture
-  dsaX_capture_thread
-  dsaX_capture_manythread
-  dsaX_dbnic
-  dsaX_nicdb
-  dsaX_split
-  dsaX_wrangle
-  fil2dada
-  dumpfil
-  dsaX_simplesplit
-  dsaX_store
-  dsaX_trigger
-  dsaX_filTrigger
-  dsaX_beamformer_offline
-  dsaX_splitup
-  cuda_correlator
-  dsaX_copydb
   dsaX_bfCorr
-  dsaX_merge
-  
-  #fitsio dep
-  # dsaX_writevis
-  
-  #sigproc dep 
-  # dsaX_writeFil
-  # dsaX_splice
-  # gpu_flagger
   RUNTIME DESTINATION
   bin
   )
+#-----------------------------

From 3ce38717c8f7c99db5c245d67f668c06e929c4cc Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Sat, 15 Jun 2024 22:36:57 -0700
Subject: [PATCH 06/30] Move headers to include directory

---
 include/dsaX_cutlass_interface.h | 172 +++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 include/dsaX_cutlass_interface.h

diff --git a/include/dsaX_cutlass_interface.h b/include/dsaX_cutlass_interface.h
new file mode 100644
index 0000000..5aa753e
--- /dev/null
+++ b/include/dsaX_cutlass_interface.h
@@ -0,0 +1,172 @@
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/host_tensor_planar_complex.h"
+#include "cutlass/util/reference/device/tensor_fill.h"
+#include "cutlass/util/reference/device/gemm_planar_complex.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/library/handle.h"
+
+using namespace cutlass;
+using namespace gemm;
+using namespace library;
+using namespace layout;
+using namespace reference;
+using namespace device;
+
+// Result structure
+struct Result {
+  // Outcome record for a GEMM run; every field can be seeded through the constructor except `passed`.
+  double runtime_ms;  // runtime in milliseconds (per the name; whether it is a per-iteration average is decided by the caller — confirm)
+  double gflops;      // throughput figure; Options::gflops() in this header shows the GFLOP/s formula
+  Status status;      // cutlass::Status, reachable unqualified through the using-directives above
+  cudaError_t error;  // CUDA runtime error code; defaults to cudaSuccess
+  bool passed;        // not a constructor parameter: unconditionally initialised to true
+  
+  Result(double runtime_ms = 0, double gflops = 0, Status status = Status::kSuccess, cudaError_t error = cudaSuccess):
+    runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { }
+};
+
+// Command line options parsing (testing)
+struct Options {
+
+  bool help;
+  GemmCoord problem_size;
+  int batch_count;
+  complex<float> alpha;
+  complex<float> beta;
+  bool reference_check;
+  int iterations;
+  
+  Options():  // initializers listed in member declaration order (members are always built in that order; avoids -Wreorder)
+    help(false),
+    problem_size({1024, 1024, 1024}),
+    batch_count(1),
+    alpha(1),
+    beta(),
+    reference_check(false),
+    iterations(20) { }
+
+  // Parses the command line
+  void parse(int argc, char const **args) {
+    
+    CommandLine cmd(argc, args);
+    if (cmd.check_cmd_line_flag("help")) {
+      help = true;
+    }
+    
+    cmd.get_cmd_line_argument("m", problem_size.m());
+    cmd.get_cmd_line_argument("n", problem_size.n());
+    cmd.get_cmd_line_argument("k", problem_size.k());
+    cmd.get_cmd_line_argument("batch", batch_count);
+
+    cmd.get_cmd_line_argument("alpha", alpha.real());
+    cmd.get_cmd_line_argument("alpha_i", alpha.imag());
+    cmd.get_cmd_line_argument("beta", beta.real());
+    cmd.get_cmd_line_argument("beta_i", beta.imag());
+    
+    cmd.get_cmd_line_argument("iterations", iterations);
+  }
+
+  /// Prints the usage statement.
+  std::ostream & print_usage(std::ostream &out) const {
+
+    out << "dsaX_cutlass_interface\n\n"
+	<< "  This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n"
+	<< "Options:\n\n"
+	<< "  --help                      If specified, displays this usage statement.\n\n"
+	<< "  --m=<int>                   GEMM M dimension\n"
+	<< "  --n=<int>                   GEMM N dimension\n"
+	<< "  --k=<int>                   GEMM K dimension\n"
+	<< "  --batch=<int>               Number of GEMM operations executed in one batch\n"
+	<< "  --alpha=<f32>               Epilogue scalar alpha (real part)\n"
+	<< "  --alpha_i=<f32>             Epilogue scalar alpha (imaginary part)\n"
+	<< "  --beta=<f32>                Epilogue scalar beta (real part)\n\n"
+	<< "  --beta_i=<f32>              Epilogue scalar beta (imaginary part)\n\n"
+	<< "  --iterations=<int>          Number of profiling iterations to perform.\n";
+    
+    return out;
+  }
+
+  /// Compute performance in GFLOP/s
+  double gflops(double runtime_s) const {
+    
+    // Number of real-valued multiply-adds 
+    int64_t fmas = problem_size.product() * batch_count * 4;
+    
+    // Two flops per multiply-add
+    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
+  }
+};
+
+/// Performance test environment for planar complex
+class DSA_FTD_ComplexGEMM_CUTLASS {
+  // Everything declared before `public:` is private (class default access).
+  // Half-precision input and output
+  using Element = half_t;
+  
+  // Configurations for layouts and internal computation
+  using LayoutA = ColumnMajor;
+  using LayoutB = ColumnMajor;
+  using LayoutC = ColumnMajor;
+  using ElementCompute = float;
+  using ElementAccumulator = float;
+  // CUTLASS library handle (cutlass::library::Handle, reachable through the using-directives in this header)
+  Handle handle;
+  
+  GemmCoord problem_size;  // GEMM extents, same type as Options::problem_size
+  int batch_count;         // same type as Options::batch_count
+  DeviceAllocation<Element> tensor_A;
+  DeviceAllocation<Element> tensor_B;
+  DeviceAllocation<Element> tensor_C;
+  DeviceAllocation<Element> tensor_D;
+  DeviceAllocation<Element> tensor_D_ref;  // presumably the reference output used for checking — TODO confirm in the .cu
+  // Allocations of untyped pointers, one real/imag pair per operand; presumably per-batch plane pointers — TODO confirm in the .cu
+  DeviceAllocation<void *> ptr_A_real;
+  DeviceAllocation<void *> ptr_A_imag;
+  DeviceAllocation<void *> ptr_B_real;
+  DeviceAllocation<void *> ptr_B_imag;
+  DeviceAllocation<void *> ptr_C_real;
+  DeviceAllocation<void *> ptr_C_imag;
+  DeviceAllocation<void *> ptr_D_real;
+  DeviceAllocation<void *> ptr_D_imag;
+  // Raw element pointers per operand; they carry no in-class initialiser, so the implementation must assign them before use
+  Element *ptr_A;
+  Element *ptr_B;
+  Element *ptr_C;
+  Element *ptr_D;
+  
+  int64_t batch_stride_A;  // the strides and leading dimensions below are only declared here; values come from the implementation
+  int64_t batch_stride_B;
+  int64_t batch_stride_C;
+  int64_t batch_stride_D;
+  
+  typename LayoutA::Stride::Index lda;
+  typename LayoutB::Stride::Index ldb;
+  typename LayoutC::Stride::Index ldc;
+  typename LayoutC::Stride::Index ldd;  // note: typed from LayoutC, there is no separate LayoutD alias
+  
+  int64_t imag_stride_A;
+  int64_t imag_stride_B;
+  int64_t imag_stride_C;
+  int64_t imag_stride_D;
+  
+public:  
+  // Constructors
+  DSA_FTD_ComplexGEMM_CUTLASS(Options const &options);
+  DSA_FTD_ComplexGEMM_CUTLASS();
+  
+  // Methods
+  void initialize();  
+  Result run(Options const &options);
+  
+  bool testing;  // public flag with no in-class initialiser; NOTE(review): confirm both constructors set it
+};
+  

From 8a50bd400aff1201fe8fdddcc765505b44dd8142 Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Sat, 15 Jun 2024 22:37:49 -0700
Subject: [PATCH 07/30] Move some code to a legacy folder, exempt from
 installation

---
 legacy/10_planar_complex.cu          |  567 ++++++++++
 legacy/11_planar_complex_array.cu    |  628 +++++++++++
 legacy/11_planar_complex_array.cu~   |  628 +++++++++++
 legacy/CMakeLists.txt                |  121 ++
 legacy/CMakeLists.txt~               |  120 ++
 legacy/Makefile                      |  208 ++++
 legacy/correlator_header_dsaX.txt    |   38 +
 legacy/cuda_correlator               |  Bin 0 -> 34272 bytes
 legacy/dsaX_beamformer.cu            | 1128 +++++++++++++++++++
 legacy/dsaX_beamformer.cu.wrk1       | 1003 +++++++++++++++++
 legacy/dsaX_beamformer_offline.cu    |  933 ++++++++++++++++
 legacy/dsaX_beamformer_passon        |  Bin 0 -> 178600 bytes
 legacy/dsaX_beamformer_passon.cu     | 1057 ++++++++++++++++++
 legacy/dsaX_bfCorr.cu                | 1286 +++++++++++++++++++++
 legacy/dsaX_bigfake.c                |  320 ++++++
 legacy/dsaX_capture.c                | 1080 ++++++++++++++++++
 legacy/dsaX_capture.h                |  131 +++
 legacy/dsaX_capture_manythread.c     | 1115 +++++++++++++++++++
 legacy/dsaX_capture_manythread.c.bak | 1053 ++++++++++++++++++
 legacy/dsaX_capture_manythread.h     |  119 ++
 legacy/dsaX_capture_pcap.c           |  852 ++++++++++++++
 legacy/dsaX_capture_pcap.h           |   83 ++
 legacy/dsaX_capture_thread.c         | 1107 ++++++++++++++++++
 legacy/dsaX_copydb.c                 |  273 +++++
 legacy/dsaX_cuda_correlator.cu       |  309 +++++
 legacy/dsaX_cutlass_interface.cu     |  315 ++++++
 legacy/dsaX_cutlass_interface.cu~    |  315 ++++++
 legacy/dsaX_cutlass_interface.h      |  172 +++
 legacy/dsaX_cutlass_interface.h~     |  174 +++
 legacy/dsaX_dbnic.c                  |  435 ++++++++
 legacy/dsaX_dbnic.c.bak              |  381 +++++++
 legacy/dsaX_def.h                    |   98 ++
 legacy/dsaX_fake.c                   |  320 ++++++
 legacy/dsaX_filTrigger.c             |  559 ++++++++++
 legacy/dsaX_fluff.c                  |  415 +++++++
 legacy/dsaX_makeFil.c                |  276 +++++
 legacy/dsaX_merge.c                  |  580 ++++++++++
 legacy/dsaX_nicdb.c                  |  483 ++++++++
 legacy/dsaX_nicdb.c.bak              |  434 ++++++++
 legacy/dsaX_reorder.c                |  515 +++++++++
 legacy/dsaX_reorder_raw.c            |  613 ++++++++++
 legacy/dsaX_reorder_raw.c.bak        |  672 +++++++++++
 legacy/dsaX_reorder_raw.c.bak2       |  608 ++++++++++
 legacy/dsaX_simplesplit.c            |  362 ++++++
 legacy/dsaX_splice.c                 |  201 ++++
 legacy/dsaX_split.c                  |  601 ++++++++++
 legacy/dsaX_splitup.c                |  285 +++++
 legacy/dsaX_store.c                  |  218 ++++
 legacy/dsaX_testdada.c               |  161 +++
 legacy/dsaX_trigger.c                |  585 ++++++++++
 legacy/dsaX_wrangle                  |  Bin 0 -> 99600 bytes
 legacy/dsaX_wrangle.c                |  378 +++++++
 legacy/dsaX_wrangleAndWrite.c        |  365 ++++++
 legacy/dsaX_writeFil.c               |  486 ++++++++
 legacy/dsaX_writevis.c               |  428 +++++++
 legacy/dsaX_xgpu.cu                  |  375 +++++++
 legacy/dumpfil.c                     |  294 +++++
 legacy/fil2dada.c                    |  521 +++++++++
 legacy/flagger.c                     |  484 ++++++++
 legacy/gpu_flagger.cu                | 1547 ++++++++++++++++++++++++++
 legacy/planar_complex.cu             |   87 ++
 legacy/planar_complex.cu~            |   85 ++
 legacy/spectrometer_header.txt       |   38 +
 legacy/splice_offline_beams          |  Bin 0 -> 32432 bytes
 legacy/splice_offline_beams.c        |  132 +++
 legacy/test_read.c                   |  279 +++++
 legacy/test_write.c                  |  452 ++++++++
 67 files changed, 29888 insertions(+)
 create mode 100644 legacy/10_planar_complex.cu
 create mode 100644 legacy/11_planar_complex_array.cu
 create mode 100644 legacy/11_planar_complex_array.cu~
 create mode 100644 legacy/CMakeLists.txt
 create mode 100644 legacy/CMakeLists.txt~
 create mode 100644 legacy/Makefile
 create mode 100644 legacy/correlator_header_dsaX.txt
 create mode 100755 legacy/cuda_correlator
 create mode 100644 legacy/dsaX_beamformer.cu
 create mode 100644 legacy/dsaX_beamformer.cu.wrk1
 create mode 100644 legacy/dsaX_beamformer_offline.cu
 create mode 100755 legacy/dsaX_beamformer_passon
 create mode 100644 legacy/dsaX_beamformer_passon.cu
 create mode 100644 legacy/dsaX_bfCorr.cu
 create mode 100644 legacy/dsaX_bigfake.c
 create mode 100644 legacy/dsaX_capture.c
 create mode 100644 legacy/dsaX_capture.h
 create mode 100644 legacy/dsaX_capture_manythread.c
 create mode 100644 legacy/dsaX_capture_manythread.c.bak
 create mode 100644 legacy/dsaX_capture_manythread.h
 create mode 100644 legacy/dsaX_capture_pcap.c
 create mode 100644 legacy/dsaX_capture_pcap.h
 create mode 100644 legacy/dsaX_capture_thread.c
 create mode 100644 legacy/dsaX_copydb.c
 create mode 100644 legacy/dsaX_cuda_correlator.cu
 create mode 100644 legacy/dsaX_cutlass_interface.cu
 create mode 100644 legacy/dsaX_cutlass_interface.cu~
 create mode 100644 legacy/dsaX_cutlass_interface.h
 create mode 100644 legacy/dsaX_cutlass_interface.h~
 create mode 100644 legacy/dsaX_dbnic.c
 create mode 100644 legacy/dsaX_dbnic.c.bak
 create mode 100644 legacy/dsaX_def.h
 create mode 100644 legacy/dsaX_fake.c
 create mode 100644 legacy/dsaX_filTrigger.c
 create mode 100644 legacy/dsaX_fluff.c
 create mode 100644 legacy/dsaX_makeFil.c
 create mode 100644 legacy/dsaX_merge.c
 create mode 100644 legacy/dsaX_nicdb.c
 create mode 100644 legacy/dsaX_nicdb.c.bak
 create mode 100644 legacy/dsaX_reorder.c
 create mode 100644 legacy/dsaX_reorder_raw.c
 create mode 100644 legacy/dsaX_reorder_raw.c.bak
 create mode 100644 legacy/dsaX_reorder_raw.c.bak2
 create mode 100644 legacy/dsaX_simplesplit.c
 create mode 100644 legacy/dsaX_splice.c
 create mode 100644 legacy/dsaX_split.c
 create mode 100644 legacy/dsaX_splitup.c
 create mode 100644 legacy/dsaX_store.c
 create mode 100644 legacy/dsaX_testdada.c
 create mode 100644 legacy/dsaX_trigger.c
 create mode 100755 legacy/dsaX_wrangle
 create mode 100644 legacy/dsaX_wrangle.c
 create mode 100644 legacy/dsaX_wrangleAndWrite.c
 create mode 100644 legacy/dsaX_writeFil.c
 create mode 100644 legacy/dsaX_writevis.c
 create mode 100644 legacy/dsaX_xgpu.cu
 create mode 100644 legacy/dumpfil.c
 create mode 100644 legacy/fil2dada.c
 create mode 100644 legacy/flagger.c
 create mode 100644 legacy/gpu_flagger.cu
 create mode 100644 legacy/planar_complex.cu
 create mode 100644 legacy/planar_complex.cu~
 create mode 100644 legacy/spectrometer_header.txt
 create mode 100755 legacy/splice_offline_beams
 create mode 100644 legacy/splice_offline_beams.c
 create mode 100644 legacy/test_read.c
 create mode 100644 legacy/test_write.c

diff --git a/legacy/10_planar_complex.cu b/legacy/10_planar_complex.cu
new file mode 100644
index 0000000..9e0915d
--- /dev/null
+++ b/legacy/10_planar_complex.cu
@@ -0,0 +1,567 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Planar Complex GEMM
+
+  This example demonstrates the CUTLASS Library's exposure of planar complex GEMM kernels supporting
+  the batched strided mode.
+
+  These kernels represent complex matrices by storing the real and imaginary parts of the matrix in
+  disjoint regions in memory. These real-valued matrices are stored using existing cuBLAS layouts
+  as either column-major or row-major layouts with a single leading dimension indicating the stride
+  between columns or rows.
+
+  The CUTLASS Library collects multiple template instantiations in a data structure and offers
+  a BLAS-like dispatch API to invoke the appropriate kernel on the Volta or Turing architectures.
+
+  CUTLASS decouples matrix layout from complex transformation, so four possible transformations
+  are possible on the A and B operands:
+
+    n:  column-major
+    c:  column-major complex conjugate
+    t:  row-major
+    h:  row-major complex conjugate
+
+  The CUTLASS Library contains many kernel instances specialized for architecture, data type, tile
+  size, and alignment. This can result in long compile times.
+
+  To build strictly the planar complex kernels needed for general application, execute the following
+  CMake command in an empty build directory.
+    
+    $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
+  	  -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex
+
+  This builds all planar complex GEMM variants for the Volta (70), Turing (75), and Ampere (80) architectures.
+
+  To build strictly the kernels needed for this example, an even narrower filter string may be
+  specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for
+  the 'CN' layout configuration (conjugate A operand with both A and B as column-major).
+
+    $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
+  	  -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_f16*cn
+
+    $ make 10_planar_complex
+
+    $ ./examples/10_planar_complex/10_planar_complex --m=2048 --n=1024 --k=512 --batch=10
+*/
+
+#include <iostream>
+#include <sstream>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/host_tensor_planar_complex.h"
+
+#include "cutlass/util/reference/device/tensor_fill.h"
+
+#include "cutlass/util/reference/device/gemm_planar_complex.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+
+#include "cutlass/library/handle.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Result structure
+struct Result {
+
+  double runtime_ms;
+  double gflops;
+  cutlass::Status status;
+  cudaError_t error;
+  bool passed;
+
+  //
+  // Methods
+  //
+
+  Result(
+    double runtime_ms = 0,
+    double gflops = 0,
+    cutlass::Status status = cutlass::Status::kSuccess,
+    cudaError_t error = cudaSuccess
+  ):
+    runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+
+  bool help;
+
+  cutlass::gemm::GemmCoord problem_size;
+  int batch_count;
+  cutlass::complex<float> alpha;
+  cutlass::complex<float> beta;
+
+  bool reference_check;
+  int iterations;
+  
+  Options():  // initializers listed in member declaration order (members are always built in that order; avoids -Wreorder)
+    help(false),
+    problem_size({1024, 1024, 1024}),
+    batch_count(1),
+    alpha(1),
+    beta(),
+    reference_check(true),
+    iterations(20) { }
+
+  bool valid() {
+    return true;
+  }
+
+  // Parses the command line
+  void parse(int argc, char const **args) {
+    cutlass::CommandLine cmd(argc, args);
+
+    if (cmd.check_cmd_line_flag("help")) {
+      help = true;
+    }
+
+    cmd.get_cmd_line_argument("m", problem_size.m());
+    cmd.get_cmd_line_argument("n", problem_size.n());
+    cmd.get_cmd_line_argument("k", problem_size.k());
+    cmd.get_cmd_line_argument("batch", batch_count);
+
+    cmd.get_cmd_line_argument("alpha", alpha.real());
+    cmd.get_cmd_line_argument("alpha_i", alpha.imag());
+    cmd.get_cmd_line_argument("beta", beta.real());
+    cmd.get_cmd_line_argument("beta_i", beta.imag());
+    
+    cmd.get_cmd_line_argument("iterations", iterations);
+  }
+
+  /// Prints the usage statement.
+  std::ostream & print_usage(std::ostream &out) const {
+
+    out << "10_planar_complex example\n\n"
+      << "  This example uses the CUTLASS Library to execute Planar Complex GEMM computations.\n\n"
+      << "Options:\n\n"
+      << "  --help                      If specified, displays this usage statement.\n\n"
+      << "  --m=<int>                   GEMM M dimension\n"
+      << "  --n=<int>                   GEMM N dimension\n"
+      << "  --k=<int>                   GEMM K dimension\n"
+      << "  --batch=<int>               Number of GEMM operations executed in one batch\n"
+      << "  --alpha=<f32>               Epilogue scalar alpha (real part)\n"
+      << "  --alpha_i=<f32>             Epilogue scalar alpha (imaginary part)\n"
+      << "  --beta=<f32>                Epilogue scalar beta (real part)\n\n"
+      << "  --beta_i=<f32>              Epilogue scalar beta (imaginary part)\n\n"
+      << "  --iterations=<int>          Number of profiling iterations to perform.\n\n";
+
+    out << "\n\nExamples:\n\n"
+      << "$ ./examples/10_planar_complex/10_planar_complex  --batch=7 --m=1024 --n=512 --k=1024 \\\n"
+      << "     --alpha=2 --alpha_i=-2 --beta=0.707 --beta_i=-.707\n\n";
+
+    return out;
+  }
+
+  /// Compute performance in GFLOP/s
+  double gflops(double runtime_s) const {
+
+    // Number of real-valued multiply-adds 
+    int64_t fmas = problem_size.product() * batch_count * 4;
+    
+    // Two flops per multiply-add
+    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Performance test environment for planar complex
+class TestbedPlanarComplex {
+public:
+
+  using ElementA = cutlass::half_t;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = cutlass::half_t;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = cutlass::half_t;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementCompute = float;
+  using ElementAccumulator = float;
+
+  //
+  // Data members
+  //
+
+  cutlass::library::Handle handle;
+
+  cutlass::gemm::GemmCoord problem_size;
+  int batch_count;
+  cutlass::DeviceAllocation<ElementA> tensor_A;
+  cutlass::DeviceAllocation<ElementB> tensor_B;
+  cutlass::DeviceAllocation<ElementC> tensor_C;
+  cutlass::DeviceAllocation<ElementC> tensor_D;
+  cutlass::DeviceAllocation<ElementC> tensor_D_ref;
+
+  //
+  // Methods
+  //
+
+  TestbedPlanarComplex(
+    Options const &options
+  ): 
+    problem_size(options.problem_size), batch_count(options.batch_count) {
+
+    // Allocate device memory for batched strided GEMM
+    tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2);
+    tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2);
+    tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
+    tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
+    tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
+  }
+
+  void initialize() {
+
+    uint64_t seed = 1073;
+
+    // Use small integers to simplify correctness checking
+    int scope_max = 6;
+    int scope_min = -6;
+
+    cutlass::reference::device::BlockFillRandomUniform(
+        tensor_A.get(), tensor_A.size(), seed, ElementA(scope_max), ElementA(scope_min), 0);
+
+    cutlass::reference::device::BlockFillRandomUniform(
+        tensor_B.get(), tensor_B.size(), seed * 2019, ElementB(scope_max), ElementB(scope_min), 0);
+
+    cutlass::reference::device::BlockFillRandomUniform(
+        tensor_C.get(), tensor_C.size(), seed * 2020, ElementC(scope_max), ElementC(scope_min), 0);
+  }
+
+  Result profile(Options const &options) {
+
+    Result result;
+
+    initialize();
+
+    ElementA *ptr_A = tensor_A.get();
+    ElementB *ptr_B = tensor_B.get();
+    ElementC *ptr_C = tensor_C.get();
+    ElementC *ptr_D = tensor_D.get();
+
+    int64_t batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2;
+    int64_t batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2;
+    int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2;
+    int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2;
+
+    typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
+    typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
+    typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+    typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+
+    int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
+    int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n();
+    int64_t imag_stride_C = int64_t(problem_size.m()) * problem_size.n();
+    int64_t imag_stride_D = int64_t(problem_size.m()) * problem_size.n();
+
+    //
+    // Construct events
+    //
+
+    cudaEvent_t events[2];
+
+    for (auto & event : events) {
+      result.error = cudaEventCreate(&event);
+      if (result.error != cudaSuccess) {
+        std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
+        return result;  // was `return -1`, which implicitly built Result(runtime_ms = -1) and dropped result.error
+      }
+    }
+
+    // Record an event at the start of a series of GEMMs
+    result.error = cudaEventRecord(events[0]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    //
+    // Run profiling loop
+    //
+
+    for (int iter = 0; iter < options.iterations; ++iter) {
+
+      //
+      // Execute the planar complex GEMM kernel via the CUTLASS Library's
+      // dispatch routines.
+      //
+      // Note, for planar complex GEMM kernels, all numeric type arguments 
+      // specify the data type of the base real types. These are understood to
+      // apply to planar complex representations of matrices in memory and to complex<T>
+      // structures for scalars.
+      //
+      // See tools/library/include/cutlass/library/handle.h for more details.
+      //
+
+      result.status = handle.gemm_planar_complex(
+        problem_size.m(),                                 // GEMM M dimension
+        problem_size.n(),                                 // GEMM N dimension
+        problem_size.k(),                                 // GEMM K dimension
+
+        cutlass::library::NumericTypeID::kF32,            // Base data type of complex-valued accumulation
+        cutlass::library::NumericTypeID::kF32,            // Base data type of complex-valued alpha/beta scalars
+
+        &options.alpha,                                   // Pointer to alpha scalar, of type complex<T>
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex-valued A matrix
+        cutlass::library::LayoutTypeID::kColumnMajor,     // Layout of A matrix
+        cutlass::library::ComplexTransform::kConjugate,   // Complex transformation on A matrix operand
+        ptr_A,                                            // Pointer to real part of A matrix
+        ptr_A + imag_stride_A,                            // Pointer to imaginary part of A matrix
+        lda,                                              // Leading dimension of real part of A matrix
+        lda,                                              // Leading dimension of imaginary part of A matrix
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex-valued B matrix
+        cutlass::library::LayoutTypeID::kColumnMajor,     // Layout of B matrix
+        cutlass::library::ComplexTransform::kNone,        // Complex transformation on B matrix operand
+        ptr_B,                                            // Pointer to real part of B matrix
+        ptr_B + imag_stride_B,                            // Pointer to imaginary part of B matrix
+        ldb,                                              // Leading dimension of real part of B matrix
+        ldb,                                              // Leading dimension of imaginary part of B matrix
+
+        &options.beta,                                    // Pointer to beta scalar, of type complex<T>
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex valued C and D matrices
+
+        ptr_C,                                            // Pointer to real part of C matrix
+        ptr_C + imag_stride_C,                            // Pointer to imaginary part of C matrix
+        ldc,                                              // Leading dimension of real part of C matrix
+        ldc,                                              // Leading dimension of imaginary part of C matrix
+
+        ptr_D,                                            // Pointer to real part of D matrix
+        ptr_D + imag_stride_D,                            // Pointer to imaginary part of D matrix
+        ldd,                                              // Leading dimension of real part of D matrix
+        ldd,                                              // Leading dimension of imaginary part of D matrix
+
+        batch_count,                                      // Number of batched elements
+
+        batch_stride_A,                                   // Stride between batches of real parts of A matrix
+        batch_stride_A,                                   // Stride between batches of imaginary parts of A matrix
+
+        batch_stride_B,                                   // Stride between batches of real parts of B matrix
+        batch_stride_B,                                   // Stride between batches of imaginary parts of B matrix
+
+        batch_stride_C,                                   // Stride between batches of real parts of C matrix
+        batch_stride_C,                                   // Stride between batches of imaginary parts of C matrix
+
+        batch_stride_D,                                   // Stride between batches of real parts of D matrix
+        batch_stride_D                                    // Stride between batches of imaginary parts of D matrix
+      );
+
+      if (result.status != cutlass::Status::kSuccess) {
+        std::cerr << "CUTLASS internal error - configuration not supported" << std::endl;
+        return result;
+      }
+    }
+    
+    //
+    // Stop profiling loop
+    //
+
+    // Record an event when the GEMMs are complete
+    result.error = cudaEventRecord(events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Wait for work on the device to complete.
+    result.error = cudaEventSynchronize(events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Measure elapsed runtime
+    float runtime_ms = 0;
+    result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Compute average runtime and GFLOPs.
+    result.runtime_ms = double(runtime_ms) / double(options.iterations);
+    result.gflops = options.gflops(result.runtime_ms / 1000.0);
+
+    // Cleanup
+    for (auto event : events) {
+      (void)cudaEventDestroy(event);
+    }
+
+    if (handle.get_last_operation()) {
+      std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl;
+    }
+
+    //
+    // Compute reference in device code
+    //
+
+    if (options.reference_check) {
+
+      result.passed = true;
+
+      for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) {
+        cutlass::reference::device::GemmPlanarComplex<
+          ElementA, LayoutA,
+          ElementB, LayoutB,
+          ElementC, LayoutC,
+          ElementAccumulator
+        >(
+          problem_size,
+          options.alpha,
+          {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A},
+          cutlass::ComplexTransform::kConjugate,
+          {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B},
+          cutlass::ComplexTransform::kNone,
+          options.beta,
+          {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C},
+          {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D}
+        );
+
+        ElementC epsilon = 0.1_hf;
+        ElementC nonzero_floor = 0.1_hf;
+
+        result.passed = cutlass::reference::device::BlockCompareRelativelyEqual(
+          tensor_D.get() + idx * batch_stride_D,
+          tensor_D_ref.get() + idx * batch_stride_D,
+          batch_stride_D,
+          epsilon,
+          nonzero_floor
+        );
+      }
+
+      if (result.passed) {
+        std::cout << "Reference check passed." << std::endl;
+      }
+      else {
+        std::cerr << "Error - reference check failed." << std::endl;
+      }
+    }
+
+    std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
+    std::cout << " GFLOPs: " << result.gflops << std::endl;
+
+    return result;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char const **args) {
+
+  //
+  // This example uses mma.sync to directly access Tensor Cores to achieve peak performance.
+  //
+  // Volta Tensor Core operations are first available in CUDA 10.1 Toolkit.
+  //
+  // Turing Tensor Core operations are first available in CUDA 10.2 Toolkit.
+  //
+
+  cudaDeviceProp props;
+
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+  if (error != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+    return -1;
+  }
+
+  if (props.major < 7) {
+    std::cerr << "Volta Tensor Core operations must be run on a machine with compute capability at least 70."
+              << std::endl;
+
+    // Returning zero so this test passes on older architectures even though its actions are no-op.
+    return 0;
+  }
+  else if (props.major == 7 && props.minor <= 2) {
+    //
+    // If running on the Volta architecture, at least CUDA 10.1 Toolkit is required to run this example.
+    //
+    if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) {
+      std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl;
+
+      // Returning zero so this test passes on older Toolkits even though its actions are no-op.
+      return 0;
+    }
+  }
+  else if (props.major == 7 && props.minor >= 5) {
+    //
+    // If running on the Turing architecture, at least CUDA 10.2 Toolkit is required to run this example.
+    //
+    if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
+      std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
+    
+      // Returning zero so this test passes on older Toolkits even though its actions are no-op.
+      return 0;
+    }
+  }
+  else {
+    // NVIDIA Ampere Architecture GPUs (SM80 and later) are fully supported on CUDA 11 Toolkit and beyond.
+    //
+    // fall through
+  }
+
+  //
+  // Parse options
+  //
+
+  Options options;
+  
+  options.parse(argc, args);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  // Execute one problem size
+  if (!options.valid()) {
+    std::cerr << "Invalid problem." << std::endl;
+    return -1;
+  }
+
+  TestbedPlanarComplex testbed(options);
+
+  Result result = testbed.profile(options);
+
+  return result.passed ? 0 : -1;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/legacy/11_planar_complex_array.cu b/legacy/11_planar_complex_array.cu
new file mode 100644
index 0000000..ba94b60
--- /dev/null
+++ b/legacy/11_planar_complex_array.cu
@@ -0,0 +1,628 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Planar Complex Array Example
+
+  This example demonstrates the CUTLASS Library's exposure of planar complex GEMM kernels which
+  execute a batch of matrix products, loading problem sizes and matrix base pointers from arrays
+  in global memory.
+
+  These kernels represent complex matrices by storing the real and imaginary parts of the matrix in
+  disjoint regions in memory. These real-valued matrices are stored using existing cuBLAS layouts
+  as either column-major or row-major layouts with a single leading dimension indicating the stride
+  between columns or rows.
+
+  The CUTLASS Library collects multiple template instantiations in a data structure and offers
+  a BLAS-like dispatch API to invoke the appropriate kernel on the Volta or Turing architectures.
+
+  CUTLASS decouples matrix layout from complex transformation, so four possible transformations
+  are possible on the A and B operands:
+
+    n:  column-major
+    c:  column-major complex conjugate
+    t:  row-major
+    h:  row-major complex conjugate
+
+  To build strictly the planar complex kernels needed for general application, execute the following
+  CMake command in an empty build directory.
+
+    $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
+      -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex
+
+  This builds all planar complex GEMM variants for the Volta (SM70), Turing (SM75), and Ampere (SM80) architectures.
+
+  To build strictly the kernels needed for this example, an even narrower filter string may be
+  specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for
+  the 'CN' layout configuration (conjugate A operand with both A and B as column-major).
+
+    $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
+      -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_array_f16*cn
+
+    $ make 11_planar_complex_array
+
+    $ ./examples/11_planar_complex_array/11_planar_complex_array --m=2048 --n=1024 --k=512 --batch=10
+*/
+
+#include <iostream>
+#include <sstream>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/host_tensor_planar_complex.h"
+
+#include "cutlass/util/reference/device/tensor_fill.h"
+
+#include "cutlass/util/reference/device/gemm_planar_complex.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+
+#include "cutlass/library/handle.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Outcome of a profiling run: timing, throughput, and error state
+struct Result {
+
+  double runtime_ms;                  // elapsed time in milliseconds
+  double gflops;                      // throughput in GFLOP/s
+  cutlass::Status status;             // CUTLASS status code
+  cudaError_t error;                  // CUDA runtime error code
+  bool passed = true;                 // every new Result starts out as passed
+
+  // Builds a result from optional timing and error values. Every argument is
+  // defaulted and the constructor is not explicit, so a Result can also be formed
+  // implicitly from a single number, which is then taken as runtime_ms.
+  Result(
+    double runtime_ms = 0,
+    double gflops = 0,
+    cutlass::Status status = cutlass::Status::kSuccess,
+    cudaError_t error = cudaSuccess
+  )
+    : runtime_ms(runtime_ms), gflops(gflops),
+      status(status), error(error) { }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Command-line options: problem shape, epilogue scalars, and profiling controls
+struct Options {
+  bool help;                              // set by parse() when --help is given
+
+  cutlass::gemm::GemmCoord problem_size;  // GEMM extents, initialized to {1024, 1024, 1024}
+  int batch_count;                        // number of GEMMs executed in one batch
+  cutlass::complex<float> alpha;          // epilogue scalar alpha
+  cutlass::complex<float> beta;           // epilogue scalar beta
+
+  bool reference_check;                   // parse() never changes it; initialized to true
+  int iterations;                         // number of profiling iterations
+
+  // Initializers follow declaration order, the order members are constructed in.
+  Options():
+    help(false),
+    problem_size({1024, 1024, 1024}),
+    batch_count(1),
+    alpha(1),
+    beta(),
+    reference_check(true),
+    iterations(20) { }
+
+  /// Rejects non-positive GEMM extents, batch count, or iteration count.
+  bool valid() const {
+    return problem_size.m() > 0 && problem_size.n() > 0 && problem_size.k() > 0 &&
+           batch_count > 0 && iterations > 0;
+  }
+
+  /// Reads --help and the numeric options from the command line into the members.
+  void parse(int argc, char const **args) {
+    cutlass::CommandLine cmd(argc, args);
+
+    if (cmd.check_cmd_line_flag("help")) { help = true; }
+
+    cmd.get_cmd_line_argument("m", problem_size.m());
+    cmd.get_cmd_line_argument("n", problem_size.n());
+    cmd.get_cmd_line_argument("k", problem_size.k());
+    cmd.get_cmd_line_argument("batch", batch_count);
+
+    cmd.get_cmd_line_argument("alpha", alpha.real());
+    cmd.get_cmd_line_argument("alpha_i", alpha.imag());
+    cmd.get_cmd_line_argument("beta", beta.real());
+    cmd.get_cmd_line_argument("beta_i", beta.imag());
+
+    cmd.get_cmd_line_argument("iterations", iterations);
+  }
+
+  /// Writes the usage statement to `out` and returns `out` so calls can be chained.
+  std::ostream & print_usage(std::ostream &out) const {
+
+    out << "11_planar_complex_array example\n\n"
+      << "  This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n"
+      << "Options:\n\n"
+      << "  --help                      If specified, displays this usage statement.\n\n"
+      << "  --m=<int>                   GEMM M dimension\n"
+      << "  --n=<int>                   GEMM N dimension\n"
+      << "  --k=<int>                   GEMM K dimension\n"
+      << "  --batch=<int>               Number of GEMM operations executed in one batch\n"
+      << "  --alpha=<f32>               Epilogue scalar alpha (real part)\n"
+      << "  --alpha_i=<f32>             Epilogue scalar alpha (imaginary part)\n"
+      << "  --beta=<f32>                Epilogue scalar beta (real part)\n"
+      << "  --beta_i=<f32>              Epilogue scalar beta (imaginary part)\n\n"
+      << "  --iterations=<int>          Number of profiling iterations to perform.\n";
+
+    out << "\n\nExamples:\n\n"
+      << "$ ./examples/11_planar_complex_array/11_planar_complex_array\n\n";
+
+    return out;
+  }
+
+  /// Converts a runtime in seconds into GFLOP/s for the configured problem.
+  double gflops(double runtime_s) const {
+
+    // Real-valued multiply-adds: four per complex multiply-add, for every batch entry
+    int64_t fmas = problem_size.product() * batch_count * 4;
+
+    // Two flops per multiply-add
+    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Performance test environment for the batched planar complex array GEMM
+class TestbedPlanarComplex {
+public:
+
+  // Half-precision input and output
+  using Element = cutlass::half_t;
+
+  // Configurations for layouts and internal computation
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementCompute = float;
+  using ElementAccumulator = float;
+
+  //
+  // Data members
+  //
+
+  cutlass::library::Handle handle;                  // dispatches the GEMM in profile()
+
+  cutlass::gemm::GemmCoord problem_size;            // copied from Options
+  int batch_count;                                  // copied from Options
+  cutlass::DeviceAllocation<Element> tensor_A;      // A operand, all batch entries
+  cutlass::DeviceAllocation<Element> tensor_B;      // B operand, all batch entries
+  cutlass::DeviceAllocation<Element> tensor_C;      // C source, all batch entries
+  cutlass::DeviceAllocation<Element> tensor_D;      // output written by the library GEMM
+  cutlass::DeviceAllocation<Element> tensor_D_ref;  // output written by the reference GEMM
+
+  cutlass::DeviceAllocation<void *> ptr_A_real;     // device arrays: one plane pointer per batch entry
+  cutlass::DeviceAllocation<void *> ptr_A_imag;
+  cutlass::DeviceAllocation<void *> ptr_B_real;
+  cutlass::DeviceAllocation<void *> ptr_B_imag;
+  cutlass::DeviceAllocation<void *> ptr_C_real;
+  cutlass::DeviceAllocation<void *> ptr_C_imag;
+  cutlass::DeviceAllocation<void *> ptr_D_real;
+  cutlass::DeviceAllocation<void *> ptr_D_imag;
+
+  // Methods
+
+  /// Copies the problem shape and batch count from `options` and sizes every
+  /// device allocation from them.
+  TestbedPlanarComplex(
+    Options const &options
+  ):
+    problem_size(options.problem_size), batch_count(options.batch_count) {
+
+    // Element counts carry a factor of 2: one real and one imaginary plane per batch entry
+    tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2);
+    tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2);
+    tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
+    tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
+    tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
+
+    // One pointer slot per batch entry for each plane
+    ptr_A_real.reset(batch_count);
+    ptr_A_imag.reset(batch_count);
+    ptr_B_real.reset(batch_count);
+    ptr_B_imag.reset(batch_count);
+    ptr_C_real.reset(batch_count);
+    ptr_C_imag.reset(batch_count);
+    ptr_D_real.reset(batch_count);
+    ptr_D_imag.reset(batch_count);
+  }
+
+  /// Fills tensors A, B, and C on the device with random values; D is not touched.
+  void initialize() {
+    uint64_t seed = 1073;
+
+    // Use small integers to simplify correctness checking
+    int scope_max = 6;
+    int scope_min = -6;
+
+    cutlass::reference::device::BlockFillRandomUniform(
+        tensor_A.get(), tensor_A.size(), seed, Element(scope_max), Element(scope_min), 0);
+
+    cutlass::reference::device::BlockFillRandomUniform(
+        tensor_B.get(), tensor_B.size(), seed * 2019, Element(scope_max), Element(scope_min), 0);
+
+    cutlass::reference::device::BlockFillRandomUniform(
+        tensor_C.get(), tensor_C.size(), seed * 2020, Element(scope_max), Element(scope_min), 0);
+  }
+
+  /// Runs the batched GEMM `options.iterations` times between two CUDA events, then
+  /// optionally checks the output against a device-side reference GEMM.
+  Result profile(Options const &options) {
+    Result result;
+    initialize();
+
+    // Each batch entry stores a real plane immediately followed by an imaginary plane.
+    int64_t batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2;
+    int64_t batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2;
+    int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2;
+    int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2;
+
+    typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
+    typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
+    typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+    typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+
+    // Distance from the start of a real plane to its imaginary plane.
+    int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
+    int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n();
+    int64_t imag_stride_C = int64_t(problem_size.m()) * problem_size.n();
+    int64_t imag_stride_D = int64_t(problem_size.m()) * problem_size.n();
+
+    //
+    // Configure pointers in global memory
+    //
+
+    struct {
+      Element *base;
+      void **ptr_real;
+      void **ptr_imag;
+      int64_t batch_stride;
+      int64_t imag_stride;
+    } tensors[] = {
+      { tensor_A.get(), ptr_A_real.get(), ptr_A_imag.get(), batch_stride_A, imag_stride_A},
+      { tensor_B.get(), ptr_B_real.get(), ptr_B_imag.get(), batch_stride_B, imag_stride_B},
+      { tensor_C.get(), ptr_C_real.get(), ptr_C_imag.get(), batch_stride_C, imag_stride_C},
+      { tensor_D.get(), ptr_D_real.get(), ptr_D_imag.get(), batch_stride_D, imag_stride_D}
+    };
+
+    for (auto const &tensor : tensors) {
+      for (int idx = 0; idx < batch_count; ++idx) {
+
+        void *ptr_real = tensor.base + idx * tensor.batch_stride;
+        void *ptr_imag = tensor.base + idx * tensor.batch_stride + tensor.imag_stride;
+
+        cudaError_t error = cudaMemcpy(
+          tensor.ptr_real + idx,
+          &ptr_real,
+          sizeof(void *),
+          cudaMemcpyHostToDevice);
+
+        if (error != cudaSuccess) {
+          throw std::runtime_error("Failed to copy pointer to device memory");
+        }
+
+        error = cudaMemcpy(
+          tensor.ptr_imag + idx,
+          &ptr_imag,
+          sizeof(void *),
+          cudaMemcpyHostToDevice);
+
+        if (error != cudaSuccess) {
+          throw std::runtime_error("Failed to copy pointer to device memory");
+        }
+      }
+    }
+
+    //
+    // Construct events. TimingEvents owns both handles and its destructor destroys
+    // whichever were created, so every return path below releases them.
+    //
+
+    struct TimingEvents {
+      cudaEvent_t handle[2] = {nullptr, nullptr};
+      ~TimingEvents() {
+        for (cudaEvent_t event : handle) {
+          if (event) { (void)cudaEventDestroy(event); }
+        }
+      }
+    } events;
+
+    for (auto & event : events.handle) {
+      result.error = cudaEventCreate(&event);
+      if (result.error != cudaSuccess) {
+        event = nullptr;  // never hand a failed handle to cudaEventDestroy
+        std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
+        return result;    // report the failure through result.error like the other paths
+      }
+    }
+
+    // Record an event at the start of a series of GEMM operations
+    result.error = cudaEventRecord(events.handle[0]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    //
+    // Run profiling loop
+    //
+
+    for (int iter = 0; iter < options.iterations; ++iter) {
+
+      //
+      // Execute the planar complex array GEMM kernel via the CUTLASS Library's
+      // dispatch routines.
+      //
+      // Note, for planar complex array GEMM kernels, all numeric type arguments 
+      // specify the data type of the base real types. These are understood to
+      // apply to planar complex representations of matrices in memory and to complex<T>
+      // structures for scalars.
+      //
+      // See tools/library/include/cutlass/library/handle.h for more details.
+      //
+
+      result.status = handle.gemm_planar_complex_array(
+
+        problem_size.m(),                                 // expected GEMM M dimension
+        problem_size.n(),                                 // expected GEMM N dimension
+        problem_size.k(),                                 // expected GEMM K dimension
+        batch_count,                                      // Number of batched elements
+
+        nullptr,
+        nullptr,
+        nullptr,
+
+        cutlass::library::NumericTypeID::kF32,            // Base data type of complex-valued accumulation
+        cutlass::library::NumericTypeID::kF32,            // Base data type of complex-valued alpha/beta scalars
+
+        &options.alpha,                                   // Pointer to alpha scalar, of type complex<T>
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex-valued A matrix
+        cutlass::library::LayoutTypeID::kColumnMajor,     // Layout of A matrix
+        cutlass::library::ComplexTransform::kConjugate,   // Complex transformation on A matrix operand
+
+        ptr_A_real.get(),                                 // Pointer to array of pointers to real part of A matrix
+        ptr_A_imag.get(),                                 // Pointer to array of pointers to imaginary part of A matrix
+
+        lda,                                              // Leading dimension of real part of A matrix
+        lda,                                              // Leading dimension of imaginary part of A matrix
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex-valued B matrix
+        cutlass::library::LayoutTypeID::kColumnMajor,     // Layout of B matrix
+        cutlass::library::ComplexTransform::kNone,        // Complex transformation on B matrix operand
+
+        ptr_B_real.get(),                                 // Pointer to array of pointers to real part of B matrix
+        ptr_B_imag.get(),                                 // Pointer to array of pointers to imaginary part of B matrix
+
+        ldb,                                              // Leading dimension of real part of B matrix
+        ldb,                                              // Leading dimension of imaginary part of B matrix
+
+        &options.beta,                                    // Pointer to beta scalar, of type complex<T>
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex valued C and D matrices
+
+        ptr_C_real.get(),                                 // Pointer to array of pointers to real part of C matrix
+        ptr_C_imag.get(),                                 // Pointer to array of pointers to imaginary part of C matrix
+
+        ldc,                                              // Leading dimension of real part of C matrix
+        ldc,                                              // Leading dimension of imaginary part of C matrix
+
+        ptr_D_real.get(),                                 // Pointer to array of pointers to real part of D matrix
+        ptr_D_imag.get(),                                 // Pointer to array of pointers to imaginary part of D matrix
+
+        ldd,                                              // Leading dimension of real part of D matrix
+        ldd                                               // Leading dimension of imaginary part of D matrix
+      );
+
+      if (result.status != cutlass::Status::kSuccess) {
+        std::cerr << "CUTLASS internal error - configuration not supported" << std::endl;
+        return result;
+      }
+    }
+
+    //
+    // Stop profiling loop
+    //
+
+    // Record an event when the GEMM operations have been launched.
+    result.error = cudaEventRecord(events.handle[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Wait for work on the device to complete.
+    result.error = cudaEventSynchronize(events.handle[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Measure elapsed runtime
+    float runtime_ms = 0;
+    result.error = cudaEventElapsedTime(&runtime_ms, events.handle[0], events.handle[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventElapsedTime() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Compute average runtime and GFLOPs.
+    result.runtime_ms = double(runtime_ms) / double(options.iterations);
+    result.gflops = options.gflops(result.runtime_ms / 1000.0);
+
+    if (handle.get_last_operation()) {
+      std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl;
+    }
+
+    //
+    // Compute reference in device code
+    //
+
+    if (options.reference_check) {
+
+      result.passed = true;
+
+      for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) {
+        cutlass::reference::device::GemmPlanarComplex<
+          Element, LayoutA,
+          Element, LayoutB,
+          Element, LayoutC,
+          ElementAccumulator
+        >(
+          problem_size,
+          options.alpha,
+          {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A},
+          cutlass::ComplexTransform::kConjugate,
+          {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B},
+          cutlass::ComplexTransform::kNone,
+          options.beta,
+          {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C},
+          {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D}
+        );
+
+        Element epsilon = 0.1_hf;
+        Element nonzero_floor = 0.1_hf;
+
+        result.passed = cutlass::reference::device::BlockCompareRelativelyEqual(
+          tensor_D.get() + idx * batch_stride_D,
+          tensor_D_ref.get() + idx * batch_stride_D,
+          batch_stride_D,
+          epsilon,
+          nonzero_floor
+        );
+      }
+
+      if (result.passed) {
+        std::cout << "Reference check passed." << std::endl;
+      }
+      else {
+        std::cerr << "Error - reference check failed." << std::endl;
+      }
+    }
+
+    std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
+    std::cout << " GFLOPs: " << result.gflops << std::endl;
+
+    return result;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char const **args) {
+
+  // This example drives Tensor Cores through mma.sync, so both the device and the
+  // CUDA Toolkit used to compile it must be recent enough:
+  //
+  //   Volta  Tensor Core operations require CUDA 10.1 Toolkit or later.
+  //   Turing Tensor Core operations require CUDA 10.2 Toolkit or later.
+  //
+  // Unsupported combinations exit with status 0 so the example counts as a no-op.
+
+  bool const toolkit_has_volta_mma =
+    (__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1));
+  bool const toolkit_has_turing_mma =
+    (__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2));
+
+  // Only device 0 is queried.
+  cudaDeviceProp props;
+  cudaError_t const query_status = cudaGetDeviceProperties(&props, 0);
+
+  if (query_status != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: "
+              << cudaGetErrorString(query_status) << std::endl;
+    return -1;
+  }
+
+  bool const is_volta = (props.major == 7 && props.minor <= 2);
+  bool const is_turing = (props.major == 7 && props.minor >= 5);
+
+  if (props.major < 7) {
+    std::cerr << "Tensor Core operations must be run on a machine with compute capability at least 70."
+              << std::endl;
+
+    // Exit successfully: on older architectures the example is a no-op.
+    return 0;
+  }
+
+  if (is_volta && !toolkit_has_volta_mma) {
+    std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl;
+
+    // Exit successfully: with an older Toolkit the example is a no-op.
+    return 0;
+  }
+
+  if (is_turing && !toolkit_has_turing_mma) {
+    std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
+
+    // Exit successfully: with an older Toolkit the example is a no-op.
+    return 0;
+  }
+
+  // Everything else, including NVIDIA Ampere Architecture GPUs (SM80 and later),
+  // proceeds to the example itself.
+
+  // Read the command line; --help prints the usage text and stops.
+  Options options;
+  options.parse(argc, args);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  // Refuse to run a problem that the options object reports as invalid.
+  if (!options.valid()) {
+    std::cerr << "Invalid problem." << std::endl;
+    return -1;
+  }
+
+  // Profile a single problem size; the exit status reflects result.passed.
+  TestbedPlanarComplex testbed(options);
+  Result const result = testbed.profile(options);
+
+  if (result.passed) {
+    return 0;
+  }
+
+  return -1;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/legacy/11_planar_complex_array.cu~ b/legacy/11_planar_complex_array.cu~
new file mode 100644
index 0000000..23722b0
--- /dev/null
+++ b/legacy/11_planar_complex_array.cu~
@@ -0,0 +1,628 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Planar Complex Array Example
+
+  This example demonstrates the CUTLASS Library's exposure of planar complex GEMM kernels which
+  execute a batch of matrix products, loading problem sizes and matrix base pointers from arrays
+  in global memory.
+
+  These kernels represent complex matrices by storing the real and imaginary parts of the matrix in
+  disjoint regions in memory. These real-valued matrices are stored using existing cuBLAS layouts
+  as either column-major or row-major layouts with a single leading dimension indicating the stride
+  between columns or rows.
+
+  The CUTLASS Library collects multiple template instantiations in a data structure and offers
+  a BLAS-like dispatch API to invoke the appropriate kernel on the Volta or Turing architectures.
+
+  CUTLASS decouples matrix layout from complex transformation, so four possible transformations
+  are possible on the A and B operands:
+
+    n:  column-major
+    c:  column-major complex conjugate
+    t:  row-major
+    h:  row-major complex conjugate
+
+  To build strictly the planar complex kernels needed for general application, execute the following
+  CMake command in an empty build directory.
+
+    $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
+      -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex
+
+  This builds all planar complex GEMM variants for the Volta (SM70), Turing (SM75), and Ampere (SM80) architectures.
+
+  To build strictly the kernels needed for this example, an even narrower filter string may be
+  specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for
+  the 'CN' layout configuration (conjugate A operand with both A and B as column-major).
+
+    $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
+      -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_array_f16*cn
+
+    $ make 11_planar_complex_array
+
+    $ ./examples/11_planar_complex_array/11_planar_complex_array --m=2048 --n=1024 --k=512 --batch=10
+*/
+
+#include <iostream>
+#include <sstream>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/host_tensor_planar_complex.h"
+
+#include "cutlass/util/reference/device/tensor_fill.h"
+
+#include "cutlass/util/reference/device/gemm_planar_complex.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+
+#include "cutlass/library/handle.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Outcome of a profiling run: timing, throughput, and error state
+struct Result {
+
+  double runtime_ms;                  // elapsed time in milliseconds
+  double gflops;                      // throughput in GFLOP/s
+  cutlass::Status status;             // CUTLASS status code
+  cudaError_t error;                  // CUDA runtime error code
+  bool passed = true;                 // every new Result starts out as passed
+
+  // Builds a result from optional timing and error values. Every argument is
+  // defaulted and the constructor is not explicit, so a Result can also be formed
+  // implicitly from a single number, which is then taken as runtime_ms.
+  Result(
+    double runtime_ms = 0,
+    double gflops = 0,
+    cutlass::Status status = cutlass::Status::kSuccess,
+    cudaError_t error = cudaSuccess
+  )
+    : runtime_ms(runtime_ms), gflops(gflops),
+      status(status), error(error) { }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Command-line options: problem shape, epilogue scalars, and profiling controls
+struct Options {
+  bool help;                              // set by parse() when --help is given
+
+  cutlass::gemm::GemmCoord problem_size;  // GEMM extents, initialized to {1024, 1024, 1024}
+  int batch_count;                        // number of GEMMs executed in one batch
+  cutlass::complex<float> alpha;          // epilogue scalar alpha
+  cutlass::complex<float> beta;           // epilogue scalar beta
+
+  bool reference_check;                   // parse() never changes it; initialized to true
+  int iterations;                         // number of profiling iterations
+
+  // Initializers follow declaration order, the order members are constructed in.
+  Options():
+    help(false),
+    problem_size({1024, 1024, 1024}),
+    batch_count(1),
+    alpha(1),
+    beta(),
+    reference_check(true),
+    iterations(20) { }
+
+  /// Rejects non-positive GEMM extents, batch count, or iteration count.
+  bool valid() const {
+    return problem_size.m() > 0 && problem_size.n() > 0 && problem_size.k() > 0 &&
+           batch_count > 0 && iterations > 0;
+  }
+
+  /// Reads --help and the numeric options from the command line into the members.
+  void parse(int argc, char const **args) {
+    cutlass::CommandLine cmd(argc, args);
+
+    if (cmd.check_cmd_line_flag("help")) { help = true; }
+
+    cmd.get_cmd_line_argument("m", problem_size.m());
+    cmd.get_cmd_line_argument("n", problem_size.n());
+    cmd.get_cmd_line_argument("k", problem_size.k());
+    cmd.get_cmd_line_argument("batch", batch_count);
+
+    cmd.get_cmd_line_argument("alpha", alpha.real());
+    cmd.get_cmd_line_argument("alpha_i", alpha.imag());
+    cmd.get_cmd_line_argument("beta", beta.real());
+    cmd.get_cmd_line_argument("beta_i", beta.imag());
+
+    cmd.get_cmd_line_argument("iterations", iterations);
+  }
+
+  /// Writes the usage statement to `out` and returns `out` so calls can be chained.
+  std::ostream & print_usage(std::ostream &out) const {
+
+    out << "11_planar_complex_array example\n\n"
+      << "  This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n"
+      << "Options:\n\n"
+      << "  --help                      If specified, displays this usage statement.\n\n"
+      << "  --m=<int>                   GEMM M dimension\n"
+      << "  --n=<int>                   GEMM N dimension\n"
+      << "  --k=<int>                   GEMM K dimension\n"
+      << "  --batch=<int>               Number of GEMM operations executed in one batch\n"
+      << "  --alpha=<f32>               Epilogue scalar alpha (real part)\n"
+      << "  --alpha_i=<f32>             Epilogue scalar alpha (imaginary part)\n"
+      << "  --beta=<f32>                Epilogue scalar beta (real part)\n"
+      << "  --beta_i=<f32>              Epilogue scalar beta (imaginary part)\n\n"
+      << "  --iterations=<int>          Number of profiling iterations to perform.\n";
+
+    out << "\n\nExamples:\n\n"
+      << "$ ./examples/11_planar_complex_array/11_planar_complex_array\n\n";
+
+    return out;
+  }
+
+  /// Converts a runtime in seconds into GFLOP/s for the configured problem.
+  double gflops(double runtime_s) const {
+
+    // Real-valued multiply-adds: four per complex multiply-add, for every batch entry
+    int64_t fmas = problem_size.product() * batch_count * 4;
+
+    // Two flops per multiply-add
+    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Testbed that allocates batched planar complex operands, times the CUTLASS Library GEMM, and checks the result.
+class TestbedPlanarComplex {
+public:
+
+  // Half-precision input and output
+  using Element = cutlass::half_t;
+
+  // Configurations for layouts and internal computation
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementCompute = float;
+  using ElementAccumulator = float;
+
+  //
+  // Data members
+  //
+
+  cutlass::library::Handle handle;                    // Dispatches the library GEMM in profile()
+
+  cutlass::gemm::GemmCoord problem_size;              // GEMM extents (m, n, k) of one batch entry
+  int batch_count;                                    // Number of GEMMs in the batch
+  cutlass::DeviceAllocation<Element> tensor_A;
+  cutlass::DeviceAllocation<Element> tensor_B;
+  cutlass::DeviceAllocation<Element> tensor_C;
+  cutlass::DeviceAllocation<Element> tensor_D;        // Output written by the library GEMM
+  cutlass::DeviceAllocation<Element> tensor_D_ref;    // Output written by the reference GEMM
+
+  cutlass::DeviceAllocation<void *> ptr_A_real;       // Per-batch pointer arrays, filled in profile()
+  cutlass::DeviceAllocation<void *> ptr_A_imag;
+  cutlass::DeviceAllocation<void *> ptr_B_real;
+  cutlass::DeviceAllocation<void *> ptr_B_imag;
+  cutlass::DeviceAllocation<void *> ptr_C_real;
+  cutlass::DeviceAllocation<void *> ptr_C_imag;
+  cutlass::DeviceAllocation<void *> ptr_D_real;
+  cutlass::DeviceAllocation<void *> ptr_D_imag;
+
+  //
+  // Methods
+  //
+
+  TestbedPlanarComplex(
+    Options const &options
+  ):
+    problem_size(options.problem_size), batch_count(options.batch_count) {
+    // The factor of 2 reserves room for the real and the imaginary plane of every matrix.
+    // Allocate device memory for batched planar complex GEMM
+    tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2);
+    tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2);
+    tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
+    tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
+    tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
+
+    ptr_A_real.reset(batch_count);
+    ptr_A_imag.reset(batch_count);
+    ptr_B_real.reset(batch_count);
+    ptr_B_imag.reset(batch_count);
+    ptr_C_real.reset(batch_count);
+    ptr_C_imag.reset(batch_count);
+    ptr_D_real.reset(batch_count);
+    ptr_D_imag.reset(batch_count);
+
+  }
+
+  void initialize_rand() {
+    // Fills A, B, and C on the device with random values, using a different seed for each tensor.
+    uint64_t seed = 1073;
+
+    // Use small integers to simplify correctness checking
+    int scope_max = 6;
+    int scope_min = -6;
+
+    cutlass::reference::device::BlockFillRandomUniform(
+        tensor_A.get(), tensor_A.size(), seed, Element(scope_max), Element(scope_min), 0);
+
+    cutlass::reference::device::BlockFillRandomUniform(
+        tensor_B.get(), tensor_B.size(), seed * 2019, Element(scope_max), Element(scope_min), 0);
+
+    cutlass::reference::device::BlockFillRandomUniform(
+        tensor_C.get(), tensor_C.size(), seed * 2020, Element(scope_max), Element(scope_min), 0);
+  }
+
+  Result profile(Options const &options) {
+    // Runs options.iterations GEMMs timed by CUDA events; optionally verifies D against the reference.
+    Result result;
+
+    initialize_rand();
+
+    // Storage layout of every tensor: batch entries are stored back to back, and
+    // each entry holds its real plane followed by its imaginary plane. The batch
+    // stride is therefore twice the plane size, and the imaginary stride is one
+    // plane.
+
+    int64_t batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2;
+    int64_t batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2;
+    int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2;
+    int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2;
+
+    typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
+    typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
+    typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+    typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+
+
+    int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
+    int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n();
+    int64_t imag_stride_C = int64_t(problem_size.m()) * problem_size.n();
+    int64_t imag_stride_D = int64_t(problem_size.m()) * problem_size.n();
+
+    //
+    // Configure pointers in global memory
+    //
+
+    struct {
+      Element *base;
+      void **ptr_real;
+      void **ptr_imag;
+      int64_t batch_stride;
+      int64_t imag_stride;
+    } tensors[] = {
+      { tensor_A.get(), ptr_A_real.get(), ptr_A_imag.get(), batch_stride_A, imag_stride_A},
+      { tensor_B.get(), ptr_B_real.get(), ptr_B_imag.get(), batch_stride_B, imag_stride_B},
+      { tensor_C.get(), ptr_C_real.get(), ptr_C_imag.get(), batch_stride_C, imag_stride_C},
+      { tensor_D.get(), ptr_D_real.get(), ptr_D_imag.get(), batch_stride_D, imag_stride_D}
+    };
+
+    for (auto const &tensor : tensors) {
+      for (int idx = 0; idx < batch_count; ++idx) {
+
+        void *ptr_real = tensor.base + idx * tensor.batch_stride;
+        void *ptr_imag = tensor.base + idx * tensor.batch_stride + tensor.imag_stride;
+
+        cudaError_t error = cudaMemcpy(
+          tensor.ptr_real + idx,
+          &ptr_real,
+          sizeof(void *),
+          cudaMemcpyHostToDevice);
+
+        if (error != cudaSuccess) {
+          throw std::runtime_error("Failed to copy pointer to device memory");
+        }
+
+        error = cudaMemcpy(
+          tensor.ptr_imag + idx,
+          &ptr_imag,
+          sizeof(void *),
+          cudaMemcpyHostToDevice);
+
+        if (error != cudaSuccess) {
+          throw std::runtime_error("Failed to copy pointer to device memory");
+        }
+      }
+    }
+
+    //
+    // Construct events
+    //
+
+    cudaEvent_t events[2];
+
+    for (auto & event : events) {
+      result.error = cudaEventCreate(&event);
+      if (result.error != cudaSuccess) {
+        std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
+        return result;  // carries the CUDA error code, like every other failure path below
+      }
+    }
+
+    // Record an event at the start of a series of GEMM operations
+    result.error = cudaEventRecord(events[0]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    //
+    // Run profiling loop
+    //
+
+    for (int iter = 0; iter < options.iterations; ++iter) {
+
+      //
+      // Execute the planar complex array GEMM kernel via the CUTLASS Library's
+      // dispatch routines.
+      //
+      // Note, for planar complex array GEMM kernels, all numeric type arguments
+      // specify the data type of the base real types. These are understood to
+      // apply to planar complex representations of matrices in memory and to complex<T>
+      // structures for scalars.
+      //
+      // See tools/library/include/cutlass/library/handle.h for more details.
+      //
+
+      result.status = handle.gemm_planar_complex_array(
+
+        problem_size.m(),                                 // expected GEMM M dimension
+        problem_size.n(),                                 // expected GEMM N dimension
+        problem_size.k(),                                 // expected GEMM K dimension
+        batch_count,                                      // Number of batched elements
+
+        nullptr,                                          // NOTE(review): presumably optional per-batch M array - confirm in handle.h
+        nullptr,                                          // NOTE(review): presumably optional per-batch N array - confirm in handle.h
+        nullptr,                                          // NOTE(review): presumably optional per-batch K array - confirm in handle.h
+
+        cutlass::library::NumericTypeID::kF32,            // Base data type of complex-valued accumulation
+        cutlass::library::NumericTypeID::kF32,            // Base data type of complex-valued alpha/beta scalars
+
+        &options.alpha,                                   // Pointer to alpha scalar, of type complex<T>
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex-valued A matrix
+        cutlass::library::LayoutTypeID::kColumnMajor,     // Layout of A matrix
+        cutlass::library::ComplexTransform::kConjugate,   // Complex transformation on A matrix operand
+
+        ptr_A_real.get(),                                 // Pointer to array of pointers to real part of A matrix
+        ptr_A_imag.get(),                                 // Pointer to array of pointers to imaginary part of A matrix
+
+        lda,                                              // Leading dimension of real part of A matrix
+        lda,                                              // Leading dimension of imaginary part of A matrix
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex-valued B matrix
+        cutlass::library::LayoutTypeID::kColumnMajor,     // Layout of B matrix
+        cutlass::library::ComplexTransform::kNone,        // Complex transformation on B matrix operand
+
+        ptr_B_real.get(),                                 // Pointer to array of pointers to real part of B matrix
+        ptr_B_imag.get(),                                 // Pointer to array of pointers to imaginary part of B matrix
+
+        ldb,                                              // Leading dimension of real part of B matrix
+        ldb,                                              // Leading dimension of imaginary part of B matrix
+
+        &options.beta,                                    // Pointer to beta scalar, of type complex<T>
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex valued C and D matrices
+
+        ptr_C_real.get(),                                 // Pointer to array of pointers to real part of C matrix
+        ptr_C_imag.get(),                                 // Pointer to array of pointers to imaginary part of C matrix
+
+        ldc,                                              // Leading dimension of real part of C matrix
+        ldc,                                              // Leading dimension of imaginary part of C matrix
+
+        ptr_D_real.get(),                                 // Pointer to array of pointers to real part of D matrix
+        ptr_D_imag.get(),                                 // Pointer to array of pointers to imaginary part of D matrix
+
+        ldd,                                              // Leading dimension of real part of D matrix
+        ldd                                               // Leading dimension of imaginary part of D matrix
+      );
+
+      if (result.status != cutlass::Status::kSuccess) {
+        std::cerr << "CUTLASS internal error - configuration not supported" << std::endl;
+        return result;
+      }
+    }
+
+    //
+    // Stop profiling loop
+    //
+
+    // Record an event when the GEMM operations have been launched.
+    result.error = cudaEventRecord(events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Wait for work on the device to complete.
+    result.error = cudaEventSynchronize(events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Measure elapsed runtime
+    float runtime_ms = 0;
+    result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Compute average runtime and GFLOPs.
+    result.runtime_ms = double(runtime_ms) / double(options.iterations);
+    result.gflops = options.gflops(result.runtime_ms / 1000.0);
+
+    // Cleanup
+    for (auto event : events) {
+      (void)cudaEventDestroy(event);
+    }
+
+    if (handle.get_last_operation()) {
+      std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl;
+    }
+
+    //
+    // Compute reference in device code
+    //
+
+    if (options.reference_check) {
+
+      result.passed = true;
+
+      for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) {
+        cutlass::reference::device::GemmPlanarComplex<
+          Element, LayoutA,
+          Element, LayoutB,
+          Element, LayoutC,
+          ElementAccumulator
+        >(
+          problem_size,
+          options.alpha,
+          {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A},
+          cutlass::ComplexTransform::kConjugate,
+          {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B},
+          cutlass::ComplexTransform::kNone,
+          options.beta,
+          {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C},
+          {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D}
+        );
+
+        Element epsilon = 0.1_hf;
+        Element nonzero_floor = 0.1_hf;
+
+        result.passed = cutlass::reference::device::BlockCompareRelativelyEqual(
+          tensor_D.get() + idx * batch_stride_D,
+          tensor_D_ref.get() + idx * batch_stride_D,
+          batch_stride_D,
+          epsilon,
+          nonzero_floor
+        );
+      }
+
+      if (result.passed) {
+        std::cout << "Reference check passed." << std::endl;
+      }
+      else {
+        std::cerr << "Error - reference check failed." << std::endl;
+      }
+    }
+
+    std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
+    std::cout << " GFLOPs: " << result.gflops << std::endl;
+
+    return result;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char const **args) {
+
+  //
+  // This example uses mma.sync to directly access Tensor Cores, so both the device and
+  // the toolkit that compiled this file have to support them:
+  //
+  //   Volta  Tensor Core operations are first available in CUDA 10.1 Toolkit.
+  //   Turing Tensor Core operations are first available in CUDA 10.2 Toolkit.
+  //
+
+  // Toolkit capabilities are compile-time facts read from the compiler version macros.
+  bool const toolkit_has_volta =
+    (__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1));
+  bool const toolkit_has_turing =
+    (__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2));
+
+  // Query the properties of device 0.
+  cudaDeviceProp device;
+  cudaError_t const query = cudaGetDeviceProperties(&device, 0);
+
+  if (query != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(query) << std::endl;
+    return -1;
+  }
+
+  //
+  // Unsupported device/toolkit combinations return zero so the example passes as a no-op.
+  //
+
+  if (device.major < 7) {
+    std::cerr << "Tensor Core operations must be run on a machine with compute capability at least 70."
+              << std::endl;
+    return 0;
+  }
+
+  // From here on device.major >= 7. Minor versions 0-2 of major 7 take the Volta check and
+  // minor versions 5+ of major 7 take the Turing check; all other devices have no toolkit gate.
+  bool const is_volta = (device.major == 7 && device.minor <= 2);
+  bool const is_turing = (device.major == 7 && device.minor >= 5);
+
+  if (is_volta && !toolkit_has_volta) {
+    std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl;
+    return 0;
+  }
+
+  if (is_turing && !toolkit_has_turing) {
+    std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
+    return 0;
+  }
+
+  //
+  // Parse options
+  //
+
+  Options options;
+  options.parse(argc, args);
+
+  // A help request only prints the usage text.
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  // Stop early when Options::valid() rejects the requested problem.
+  if (!options.valid()) {
+    std::cerr << "Invalid problem." << std::endl;
+    return -1;
+  }
+
+  //
+  // Execute one problem size
+  //
+
+  TestbedPlanarComplex testbed(options);
+  Result const result = testbed.profile(options);
+
+  return result.passed ? 0 : -1;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/legacy/CMakeLists.txt b/legacy/CMakeLists.txt
new file mode 100644
index 0000000..b456550
--- /dev/null
+++ b/legacy/CMakeLists.txt
@@ -0,0 +1,121 @@
+enable_language(CUDA)
+# Header search paths shared by every legacy target defined in this directory.
+include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../include")
+include_directories("${PSRDada_SOURCE_DIR}/src")
+include_directories("${xGPU_SOURCE_DIR}/src")
+# Dependency libraries, referenced by their expected location in the build tree.
+set(PSRDada_LIB "${PSRDada_BINARY_DIR}/src/libpsrdada.so")
+set(XGPU_LIB "${xGPU_BINARY_DIR}/src/libxgpu.a")
+
+# DSA Fast Time Domain functions (one executable per source file)
+#-------------------------------
+add_executable(test_write test_write.c)
+target_link_libraries(test_write PRIVATE ${PSRDada_LIB})
+
+add_executable(test_read test_read.c)
+target_link_libraries(test_read PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_trigger dsaX_trigger.c)
+target_link_libraries(dsaX_trigger PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_filTrigger dsaX_filTrigger.c)
+target_link_libraries(dsaX_filTrigger PRIVATE ${PSRDada_LIB})
+
+# DMH: disabled, these targets have a 'sigproc' dependency (low priority)
+if(0)
+  add_executable(splice_offline_beams splice_offline_beams.c)
+  target_link_libraries(splice_offline_beams PRIVATE ${PSRDada_LIB})
+
+  add_executable(dsaX_writeFil dsaX_writeFil.c)
+  target_link_libraries(dsaX_writeFil PRIVATE ${PSRDada_LIB})
+
+  add_executable(dsaX_splice dsaX_splice.c)
+  target_link_libraries(dsaX_splice PRIVATE ${PSRDada_LIB})
+
+  add_executable(gpu_flagger gpu_flagger.cu)
+  target_link_libraries(gpu_flagger PRIVATE ${PSRDada_LIB})
+endif()
+
+add_executable(dsaX_store dsaX_store.c)
+target_link_libraries(dsaX_store PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_fluff dsaX_fluff.c)
+target_link_libraries(dsaX_fluff PRIVATE ${PSRDada_LIB})
+
+# DMH: disabled, intrinsics compilation error
+#add_executable(dsaX_reorder dsaX_reorder.c)
+#target_link_libraries(dsaX_reorder PRIVATE ${PSRDada_LIB})
+
+# DMH: dsaX_nicdb.c:145 raises -Woverflow when compiled; the 'int' expression in
+#   uint64_t shifty = (bdepth-1)*NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NCHAN_FIL;
+# overflows (reported result: -1073741824) before it is stored in the uint64_t.
+add_executable(dsaX_nicdb dsaX_nicdb.c)
+target_link_libraries(dsaX_nicdb PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_dbnic dsaX_dbnic.c)
+target_link_libraries(dsaX_dbnic PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_capture dsaX_capture.c)
+target_link_libraries(dsaX_capture PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_capture_thread dsaX_capture_thread.c)
+target_link_libraries(dsaX_capture_thread PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_capture_manythread dsaX_capture_manythread.c)
+target_link_libraries(dsaX_capture_manythread PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_split dsaX_split.c)
+target_link_libraries(dsaX_split PRIVATE ${PSRDada_LIB} m)
+
+add_executable(dsaX_merge dsaX_merge.c)
+target_link_libraries(dsaX_merge PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_simplesplit dsaX_simplesplit.c)
+target_link_libraries(dsaX_simplesplit PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_fake dsaX_fake.c)
+target_link_libraries(dsaX_fake PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_splitup dsaX_splitup.c)
+target_link_libraries(dsaX_splitup PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_copydb dsaX_copydb.c)
+target_link_libraries(dsaX_copydb PRIVATE ${PSRDada_LIB})
+
+# DMH: disabled, fitsio dependency
+if(0)
+  add_executable(dsaX_writevis dsaX_writevis.c)
+  target_link_libraries(dsaX_writevis PRIVATE ${PSRDada_LIB})
+endif()
+
+# DMH: XGPU dependencies
+add_executable(dsaX_wrangle dsaX_wrangle.c)
+target_link_libraries(dsaX_wrangle PRIVATE ${PSRDada_LIB} ${CUDA_nvml_LIBRARY} ${XGPU_LIB})
+
+add_executable(dsaX_testdada dsaX_testdada.c)
+target_link_libraries(dsaX_testdada PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_xgpu dsaX_xgpu.cu)
+target_link_libraries(dsaX_xgpu PRIVATE ${PSRDada_LIB} ${XGPU_LIB} ${CUDA_nvml_LIBRARY})
+
+add_executable(dsaX_cuda_correlator dsaX_cuda_correlator.cu)
+target_link_libraries(dsaX_cuda_correlator PRIVATE ${XGPU_LIB} ${CUDA_nvml_LIBRARY} ${CUDA_cublas_LIBRARY} ${PSRDada_LIB})
+
+add_executable(dsaX_reorder_raw dsaX_reorder_raw.c)
+target_link_libraries(dsaX_reorder_raw PRIVATE ${PSRDada_LIB})
+
+add_executable(fil2dada fil2dada.c)
+target_link_libraries(fil2dada PRIVATE ${PSRDada_LIB})
+
+add_executable(dumpfil dumpfil.c)
+target_link_libraries(dumpfil PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_beamformer dsaX_beamformer.cu)
+target_link_libraries(dsaX_beamformer PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_beamformer_passon dsaX_beamformer_passon.cu)
+target_link_libraries(dsaX_beamformer_passon PRIVATE ${PSRDada_LIB})
+
+add_executable(dsaX_beamformer_offline dsaX_beamformer_offline.cu)
+target_link_libraries(dsaX_beamformer_offline PRIVATE ${PSRDada_LIB})
+#------------------------------------------------------
diff --git a/legacy/CMakeLists.txt~ b/legacy/CMakeLists.txt~
new file mode 100644
index 0000000..0783d51
--- /dev/null
+++ b/legacy/CMakeLists.txt~
@@ -0,0 +1,120 @@
+enable_language(CUDA)
+# NOTE(review): this '~' file looks like an editor backup of legacy/CMakeLists.txt - confirm it should be committed.
+include_directories(${PSRDada_SOURCE_DIR}/src)
+include_directories(${xGPU_SOURCE_DIR}/src)
+
+set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so)
+set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a)
+
+# DSA Fast Time Domain functions (one executable per source file)
+#-------------------------------
+add_executable(test_write test_write.c)
+target_link_libraries(test_write ${PSRDada_LIB})
+
+add_executable(test_read test_read.c)
+target_link_libraries(test_read ${PSRDada_LIB})
+
+add_executable(dsaX_trigger dsaX_trigger.c)
+target_link_libraries(dsaX_trigger ${PSRDada_LIB})
+
+add_executable(dsaX_filTrigger dsaX_filTrigger.c)
+target_link_libraries(dsaX_filTrigger ${PSRDada_LIB})
+
+# DMH: disabled, these targets have a 'sigproc' dependency (low priority)
+if(0)
+  add_executable(splice_offline_beams splice_offline_beams.c)
+  target_link_libraries(splice_offline_beams ${PSRDada_LIB})
+
+  add_executable(dsaX_writeFil dsaX_writeFil.c)
+  target_link_libraries(dsaX_writeFil ${PSRDada_LIB})
+  
+  add_executable(dsaX_splice dsaX_splice.c)
+  target_link_libraries(dsaX_splice ${PSRDada_LIB})
+
+  add_executable(gpu_flagger gpu_flagger.cu)
+  target_link_libraries(gpu_flagger ${PSRDada_LIB})
+endif()
+
+add_executable(dsaX_store dsaX_store.c)
+target_link_libraries(dsaX_store ${PSRDada_LIB})
+
+add_executable(dsaX_fluff dsaX_fluff.c)
+target_link_libraries(dsaX_fluff ${PSRDada_LIB})
+
+# DMH: disabled, intrinsics compilation error
+#add_executable(dsaX_reorder dsaX_reorder.c)
+#target_link_libraries(dsaX_reorder ${PSRDada_LIB})
+
+# DMH: dsaX_nicdb.c:145 raises -Woverflow when compiled; the 'int' expression in
+#   uint64_t shifty = (bdepth-1)*NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NCHAN_FIL;
+# overflows (reported result: -1073741824) before it is stored in the uint64_t.
+add_executable(dsaX_nicdb dsaX_nicdb.c)
+target_link_libraries(dsaX_nicdb ${PSRDada_LIB})
+
+add_executable(dsaX_dbnic dsaX_dbnic.c)
+target_link_libraries(dsaX_dbnic ${PSRDada_LIB})
+
+add_executable(dsaX_capture dsaX_capture.c)
+target_link_libraries(dsaX_capture ${PSRDada_LIB})
+
+add_executable(dsaX_capture_thread dsaX_capture_thread.c)
+target_link_libraries(dsaX_capture_thread ${PSRDada_LIB})
+
+add_executable(dsaX_capture_manythread dsaX_capture_manythread.c)
+target_link_libraries(dsaX_capture_manythread ${PSRDada_LIB})
+
+add_executable(dsaX_split dsaX_split.c)
+target_link_libraries(dsaX_split ${PSRDada_LIB} -lm)
+
+add_executable(dsaX_merge dsaX_merge.c)
+target_link_libraries(dsaX_merge ${PSRDada_LIB})
+
+add_executable(dsaX_simplesplit dsaX_simplesplit.c)
+target_link_libraries(dsaX_simplesplit ${PSRDada_LIB})
+
+add_executable(dsaX_fake dsaX_fake.c)
+target_link_libraries(dsaX_fake ${PSRDada_LIB})
+
+add_executable(dsaX_splitup dsaX_splitup.c)
+target_link_libraries(dsaX_splitup ${PSRDada_LIB})
+
+add_executable(dsaX_copydb dsaX_copydb.c)
+target_link_libraries(dsaX_copydb ${PSRDada_LIB})
+
+# DMH: disabled, fitsio dependency
+if(0)
+  add_executable(dsaX_writevis dsaX_writevis.c)
+  target_link_libraries(dsaX_writevis ${PSRDada_LIB})
+endif()
+
+# DMH: XGPU dependencies
+add_executable(dsaX_wrangle dsaX_wrangle.c)
+target_link_libraries(dsaX_wrangle ${PSRDada_LIB} ${CUDA_nvml_LIBRARY} ${XGPU_LIB})
+
+add_executable(dsaX_testdada dsaX_testdada.c)
+target_link_libraries(dsaX_testdada ${PSRDada_LIB})
+
+add_executable(dsaX_xgpu dsaX_xgpu.cu)
+target_link_libraries(dsaX_xgpu ${PSRDada_LIB} ${XGPU_LIB} ${CUDA_nvml_LIBRARY})
+
+add_executable(dsaX_cuda_correlator dsaX_cuda_correlator.cu)
+target_link_libraries(dsaX_cuda_correlator ${XGPU_LIB} ${CUDA_nvml_LIBRARY} ${CUDA_cublas_LIBRARY} ${PSRDada_LIB})
+
+add_executable(dsaX_reorder_raw dsaX_reorder_raw.c)
+target_link_libraries(dsaX_reorder_raw ${PSRDada_LIB})
+
+add_executable(fil2dada fil2dada.c)
+target_link_libraries(fil2dada ${PSRDada_LIB})
+
+add_executable(dumpfil dumpfil.c)
+target_link_libraries(dumpfil ${PSRDada_LIB})
+
+add_executable(dsaX_beamformer dsaX_beamformer.cu)
+target_link_libraries(dsaX_beamformer ${PSRDada_LIB})
+
+add_executable(dsaX_beamformer_passon dsaX_beamformer_passon.cu)
+target_link_libraries(dsaX_beamformer_passon ${PSRDada_LIB})
+
+add_executable(dsaX_beamformer_offline dsaX_beamformer_offline.cu)
+target_link_libraries(dsaX_beamformer_offline ${PSRDada_LIB})
+#------------------------------------------------------
diff --git a/legacy/Makefile b/legacy/Makefile
new file mode 100644
index 0000000..0de1991
--- /dev/null
+++ b/legacy/Makefile
@@ -0,0 +1,208 @@
+# This is set up for the CORR containers
+# (the compiler, include, and library paths below are hard-coded for that environment).
+CC=gcc
+CFLAGS1 = -g -O3 -Wall -pthread -march=native -I/usr/local/include -I/usr/local/include/src -I/usr/local/cfitsio-3.47/include/ -I/home/ubuntu/proj/dsa110-shell/dsa110-sigproc
+CDEPS1=dsaX_def.h dsaX_capture_manythread.h
+CDEPS2=dsaX_def.h dsaX_capture.h
+LIBS = -L/usr/local/lib -lpsrdada -L/usr/lib/gcc/x86_64-linux-gnu/5 -lgfortran -L/usr/local/cuda/lib64 -lcudart -lcublas -lm -L/usr/local/cfitsio-3.47/lib -lcfitsio -lsigproc -lxgpu
+
+#LIBS2 = -L/home/ubuntu/PF_RING/userland/libpcap-1.9.1 -lpcap
+#CDEPS3=dsaX_def.h dsaX_capture_pcap.h
+
+CCU=/usr/local/cuda/bin/nvcc -D CUDA -ccbin=g++
+CFLAGS2 = -I/home/ubuntu/proj/dsa110-shell/dsa110-xengine/src -I/home/ubuntu/proj/dsa110-shell/dsa110-xGPU/src -I/usr/local/include -I/usr/local/include/src -I/usr/local/cfitsio-3.47/include -I/home/ubuntu/proj/dsa110-shell/dsa110-sigproc -arch=sm_75 -O3 -Xcompiler="-pthread" -DMATRIX_ORDER_TRIANGULAR -std=c++14
+
+
+.DEFAULT_GOAL := all
+
+test_write.o: test_write.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+test_write: test_write.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+test_read.o: test_read.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+test_read: test_read.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_trigger.o: dsaX_trigger.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_trigger: dsaX_trigger.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_filTrigger.o: dsaX_filTrigger.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_filTrigger: dsaX_filTrigger.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+splice_offline_beams.o: splice_offline_beams.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+splice_offline_beams: splice_offline_beams.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_store.o: dsaX_store.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_store: dsaX_store.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_fluff.o: dsaX_fluff.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_fluff: dsaX_fluff.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_reorder.o: dsaX_reorder.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_reorder: dsaX_reorder.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_dbnic.o: dsaX_dbnic.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_dbnic: dsaX_dbnic.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_nicdb.o: dsaX_nicdb.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_nicdb: dsaX_nicdb.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_capture.o: dsaX_capture.c $(CDEPS2)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_capture: dsaX_capture.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_capture_thread.o: dsaX_capture_thread.c $(CDEPS2)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_capture_thread: dsaX_capture_thread.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_capture_manythread.o: dsaX_capture_manythread.c $(CDEPS2)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_capture_manythread: dsaX_capture_manythread.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_split.o: dsaX_split.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_split: dsaX_split.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_merge.o: dsaX_merge.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_merge: dsaX_merge.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_simplesplit.o: dsaX_simplesplit.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_simplesplit: dsaX_simplesplit.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_fake.o: dsaX_fake.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_fake: dsaX_fake.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_splitup.o: dsaX_splitup.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_splitup: dsaX_splitup.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_copydb.o: dsaX_copydb.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_copydb: dsaX_copydb.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_writevis.o: dsaX_writevis.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_writevis: dsaX_writevis.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_wrangle.o: dsaX_wrangle.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_wrangle: dsaX_wrangle.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_testdada.o: dsaX_testdada.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_testdada: dsaX_testdada.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_writeFil.o: dsaX_writeFil.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_writeFil: dsaX_writeFil.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_splice.o: dsaX_splice.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_splice: dsaX_splice.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_reorder_raw.o: dsaX_reorder_raw.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dsaX_reorder_raw: dsaX_reorder_raw.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+fil2dada.o: fil2dada.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+fil2dada: fil2dada.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dumpfil.o: dumpfil.c $(CDEPS1)
+	$(CC) -c -o $@ $< $(CFLAGS1)
+
+dumpfil: dumpfil.o
+	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
+
+dsaX_xgpu: dsaX_xgpu.cu
+	$(CCU) -o $@ $^ $(CFLAGS2) $(LIBS)
+
+cuda_correlator: cuda_correlator.cu
+	$(CCU) -o $@ $^ $(CFLAGS2) $(LIBS)
+
+gpu_flagger: gpu_flagger.cu
+	$(CCU) -o $@ $^ $(CFLAGS2) $(LIBS)
+
+dsaX_beamformer: dsaX_beamformer.cu
+	$(CCU) -o $@ $^ $(CFLAGS2) $(LIBS)
+
+dsaX_bfCorr: dsaX_bfCorr.cu
+	$(CCU) -o $@ $^ $(CFLAGS2) $(LIBS)
+
+dsaX_beamformer_passon: dsaX_beamformer_passon.cu
+	$(CCU) -o $@ $^ $(CFLAGS2) $(LIBS)
+
+dsaX_beamformer_offline: dsaX_beamformer_offline.cu
+	$(CCU) -o $@ $^ $(CFLAGS2) $(LIBS)
+
+.PHONY: clean all
+
+clean:
+	rm -f *.o *~ test_write test_read splice_offline_beams dsaX_beamformer dsaX_beamformer_passon dsaX_xgpu dsaX_reorder dsaX_reorder_raw dsaX_writeFil dsaX_writevis dsaX_fake dsaX_fluff dsaX_capture dsaX_capture_thread dsaX_capture_manythread dsaX_dbnic dsaX_nicdb dsaX_split dsaX_splitup dsaX_wrangle dsaX_testdada fil2dada gpu_flagger dumpfil dsaX_simplesplit dsaX_store dsaX_trigger dsaX_beamformer_offline dsaX_splice dsaX_filTrigger cuda_correlator dsaX_copydb dsaX_bfCorr dsaX_merge
+
+all: dsaX_beamformer dsaX_beamformer_passon dsaX_xgpu dsaX_reorder_raw dsaX_writeFil dsaX_writevis dsaX_fake dsaX_capture dsaX_capture_thread dsaX_capture_manythread dsaX_dbnic dsaX_nicdb dsaX_split dsaX_wrangle fil2dada gpu_flagger dumpfil dsaX_simplesplit dsaX_store dsaX_trigger dsaX_filTrigger dsaX_beamformer_offline dsaX_splice dsaX_splitup cuda_correlator dsaX_copydb dsaX_bfCorr dsaX_merge
+
+
diff --git a/legacy/correlator_header_dsaX.txt b/legacy/correlator_header_dsaX.txt
new file mode 100644
index 0000000..c8b86e9
--- /dev/null
+++ b/legacy/correlator_header_dsaX.txt
@@ -0,0 +1,38 @@
+ACC_LEN      1                 
+BANDWIDTH    -250                   
+BW           -250                   
+CFREQ        1405                   
+CHAN_AV      0                      
+DEC          00:00:00.000           
+DSB          0                      
+FILE_SIZE    2415919104             
+FREQ         1405.000000            
+FSCRUNCH     1                      
+HDR_SIZE     4096                   
+HDR_VERSION  1.0                    
+INSTRUMENT   DSAX                   
+MODE         RAW                  
+NBEAM        1                      
+NBIT         4                      
+NCHAN        2048                   
+NDIM         1                      
+NPOL         2                      
+N_PROD       1                      
+OBSERVER     DSA                    
+OBS_OFFSET   0                      
+OBS_UNIT     SECONDS                
+OBS_VAL      0000.0000              
+PID          P000                   
+RA           00:00:00.000           
+RECEIVER     SANDY                  
+RESOLUTION   4096                   
+SOURCE       TEST                   
+TRANSFER_SIZE 126562550000000       
+TELESCOPE    DSA-10                 
+TSAMP        64                     
+TSCRUNCH     1                      
+ANTENNAS     1-2-5-3               
+NANT         2                      
+UTC_START    2015-08-07-17:07:28    
+FILE_NUMBER  0                      
+
diff --git a/legacy/cuda_correlator b/legacy/cuda_correlator
new file mode 100755
index 0000000000000000000000000000000000000000..a8b94c759c2da5b87ab4c1a740138d0ad7d75073
GIT binary patch
literal 34272
zcmeHw4SZC^x%cdDLN)<6A)p39IcUJBWk~{r04f`jz(xZ^Aeu^Xv&n9f)y;0Y9|+dk
zXuujNx9HmsZr|4O>+J`(Z{MqJZ7)(^h{dn^wzp_&FSoTVwY7KS$1P&(N3G`m|8r(`
z&u(_9?Y;N+{(kpmAm^FqKhN{b%yVYuoHMg$&PQ6?*Vt_~p@UsqDaaXfJQ6aGLY(-J
zWEIvZN`*(%i1}g)kYfCKWD2Pn)AUVcT2rTzbAT4%uhDJ-jdqWWXgbG)g{HcMq|7f9
zJSu9^&t7S=nxZBWpdQOQwobC3-7=!-eijJf7z-tXA&)E`^-@YNrSvqdQ03QD*N<lS
zw^^lcwtK*l_mGGddCX9qZO03V0OR{sngaV)dSpb?fC&ptwca(*V>#bzs8E6%Repv1
z;!)+-RG)`M;ZRS*l11Ub!f+^(+`e#oW5dFRCGL3CT`%iR`;=X~zEkF;{b+z=nP=jU
zE@ygK{jyj8H2&n#uXg(Gf9=_O=Z@aA^^Z)u41fB(-ThddO%%6JCAJWMb@*G|`s~Z!
z`SrcGj(ziu%7c4ebv2&aJ?CFO*fr<HAN|f%|L7Z)-}u5Cm6x<W+YZr!p&kXR%#%95
z0G=#_qoDch+*3gQZwtu(Gs=?BPO<<#r+}T?3gF)=P@eGu_CH+!FDhVvpn&}^6tJ_Z
zfc)JB@Wul9_7%XdD`4jf1>`w=82HN#tp)51Lona@dai){PyzgJ3*i4$pq$?<fPb_A
z?kHe?V*z}90Xq*DsPC->?970?fxp}k29VEhPZhAU2=Yd*5%3!e;Qvs-&aZ%1h}ok1
zn(i!XznAzNv3FxOeiZn0T!juOc|G2COaDw2C(AuqcD|zIwS9d(NGbao*GgV#r(Aq(
zZnmAPm7Rxk^4+iSF@<aUFG)K>3|D5`5g4a?lL3D$!Lhln&Rr{fZJXEm0>N0YKNL>{
zW1H7Choh0;W`9pOD15&D!Dz%6Pe8%v6TYjq`8F!K=CD5=55`UPyvodHO)MA`jI8sA
z!_i(CB%^JtNk)1Tp=iXE2oClR?NAb(5p8mfKe0L#@yB#ZM|7wewwj}nzEFQM=9l`}
zX0P?fax>CKS|fo%NwN^_eiUio>R>Dq4AWQJ`-hV7lpt*$2=?BzF&IyV6T;UQii9M(
z(H{v!2b-gVL*d|d;RBqI_=druu3)4;6bVXJI)YZ(B7LU0jlpOPZeQn5#6sJpOq<r-
zbaUvITf)J%$WYQW)(od<YbcdEd!iREy~)=fOe8{sK^8Zj2=rcd8BcA4@Lj)tQ=%~x
zjr)51@!%2~Z>ekDmJe?x&X%Dxal(giE_}TMH^Ey&u}~z@Cz*H*wH1BQp<qNrhY}$$
zC?XmGn+S>m1z(>(6sC{2`+fdIa68<A0zw#ApT$%k3Lg)Ocp{dFCc|RTY#)&G$NGW7
zGcnP3b1al#D~wwdA;L7Dj|EmubI>1(h(5M8UohZL_>tNZkDG~rP|UUMZL6ERx_pb>
zOU$^h-d*ornvQ2sN+8(rk9QsXE0X_+nK1{bL#F7vPmMU_-JuXAN)O#r%7Or~sehSR
z2@?>Q@*Mgs?~@GtB`&!aUAaWOMFR*-KWa%&o);?RooF$Or_zy})<3cEXGkf6mKkiw
zeKu_A`@ET<&{-_|bVw01?Kz}9@`+irZx^$b{WmvsOGCV;tdTMEpvN!XgL6#Qp{k=>
zMp-{y1_Fi!S5=d8H5OcDDe+nhuE$y?G+J=${lCY8Tkn6p7F_$Bb~-G$9y^J5S#UTa
z9l9-eSsE2$z=BV;;6oOCng!o(!EqweVc3GZ(x?!3S@3cTzSn~5bIas?7JRxze!m5;
zwBQFU_zVmFfCcA0x(){|_}Ll+9kbx)Snww;c$Eb|X2EA!@Z%QzJPUrpg3q?#DGRQj
z5t)3_f}d}Zf760rV8MmDj<Su+wct(*ZdmXN3qH?+S6T22Ex2L9FS6h@7QEVm*IMue
z7QE4dUu?lW7W`5R?zP}@cvETKk(Zs`-No;$GK6>6SfV(!8@<vy@~HEuDKN406p|+9
z{~pJ3+>=bqCqKZ%@e>mW^Is&-E<S!t^3RiJ*B&2}{7=cVOOHPw`De(pE06D&{8QxF
zg~#_w{=4Mab;pM#e~3I!$M}%szd@c|b-Y{hUnS2jI^H4qFOX-~9QR26GvwJN$7>~j
zFL`#waYOQVlV=wkuaNv5<k|Jch2(D`&n`E9@?8M)W8~S@#!pE8M)K@p<Hsc5L%x#y
znB=b~KZE=OlHW+4U1@y3<gX&nE;PPZ@~g?S>x>Ueeg*k+$qz}so;<tIc(>#)CC{!i
z-XZx5$+OFhdnA7zd3KfYTFF<EXBQbaBwt3JU1PjL@^<nUkQb8w<CWmq6~<4#!}^~h
zZ;(GB`4`Ev>5m_i{PX15<j2P(|5Ng8>f;Yc{u%OY;^X@z{}lQ8<o8PcyX4u_$A=}~
zbA=)L_JWt8>AI$Cn~t!#HFY*^Zu&oLXq}rz-|>#V*7UH15A*1~{bfEG2E4nA=JRsp
z9o<=71E6l~iI!?P4ud)=`9zWIZsl`#f)V1-!)WB*(O-=IE>(x@N8X$$-`NSdkyP`@
zn~TdoT8;Qb^<#|-Jm?NBFuePUQ53lDnY!0g1{6kN{-r~GZfy_gA)PLe-Zc8eOaHTc
zF7>X_c<N8YAA4!U#FuOQn8Y6)AAs8}_Gany2arBmR6Xv^PJasyljYfq^j%|3qmP$=
zd_F|HyU)4}Nr%{$q?3;P2z5=}g0p;(o)|w|B<u5#*6)z|sFAb$<G+Cv3|v72Y%HdM
zAJD+^w1E%9K>6HZB(r|gA?w{e_0R+JJf?WvSnB6+fjoPuX2e=XCsO|lnRxd+Tzw2m
zDL0tipRN8e;w`%$u6_>m@RQXZFdvfwExUJCp8zOL)Qye)v}N?ImeKa=s#M@@7_OL)
z;#>v8UdWw<-0mgSw9VGzePXAih|2grMiAIC_LD+|cPiu$fKT<n;^<?9r8-JL74Pl`
zrDgBv6JBK8=!JVKQom&-b|<PUyeKurQ`HiA&@{d0)@rZU_Ihd&OnP^JLgn|Mlza$z
z!m5;bCbbn3ExS>nJs+|?{ZecB{oiXDeX^<i{wL)B<K8`&{~Y2XbvDiK8e{HN<vU{-
zUYkb#WGmlUjo1-kctq2l8zxe#D77b1-C?@?B#gK9KCW6kbq%I=%hcAoiPR4vkUoEj
zYH!Q(w_8S^*B)#tZ+$lPFhq8ZjXqYsYZJO&)5sIH*5&UepQfE5Gncy|;oaR*J><Qt
zrMkPX{C8vc`yT#&<_(RBr#`ci_`s>A@=O28>u9Ooe#%>Z>6kjN<@fKDC!N@Fx_&>M
z^T>&!Q{^`lFUKJ3Ex(_o-UXXACjkBmf6pQQebDm_2$bK|fpQ_YXI_JiV~6*lNyu^?
z<Vf*a`Ss77Dxdy=>RQMj!{6DzsSpCl=IWZ%ytgJMPL)?6diWT$5QUu7MFvkveMF9?
z-g*<`LQ%DCcTx3n_KjWthz_v(1J%ya<0C1Y0|(Bich7^>!^qHd$<2SHOLj_xweO%;
zbVJDX&LhtkQKrl5*jarSOi2HX9!Dd1{8ag+Pv&|GjxksV>9O9UaL{+HPD-JY<M{hI
ziv9!8^WVm$to*KSl=06x$$-3c(h=yuNi(Zc@4O*>G_yM6A|sEBegd(kk(X`Bt6R$N
zPgFabM*hu~T;+w6&|!X=`Y1f@MJun^y}5cC0Ct_K)FbfgOXp$WINoT8(MJvq;Uvgm
z2IIz;q{gj~dK*%@MW*gUKC<tmJ^_Kzlo!2}!+gNIc$ysd%XjUD3%$2L%B#VatxX?j
z+S+tOlkezmoH%-vCv4aMMK?L}5{ELE%trQO=CulWjsI_?(pBE4Q?g0DR^Ii@_!s3M
zbCw$=Jyeez%;B(+<IknAA#X({DSZO%B=t6myf<@w*mPugC6tmtZxj!mjh=g>{D<+S
zweEY~(WjaYV(6YYTzkF;L(@_3Xz^3a43T<)6(4!jc*uzhQES~8l^_gXHpv0y@SV8d
zOWpta#6;Ki57jEGkjs+UL7Cl<dH)R$U0p!t7RvNP#@E#KP@sU&E()!NP}BRHu7BwE
z0#Z9EHHT7Lx*oc(fY1nq-uMH`{m^$HB8&XX>s9qs-R_{wk066`A1`3%cFNoj8J7DG
z1%!qvbT@=#xhrv<Vn(@c`Us^0ltQ@|6%e|OLaQJIf43Kq`7mW>Lk8sz6_B}=GH?9e
zEcZv1koBygbca7gp=Tf@%l+xJlsfrl1rBeg)YmArWea-My-N*|dIjv3BL*7N_`MDc
z0egA>f8?W-wByVi>O$-=*(1Z1XmBHsI>%pxN~XQmz0x-N^bKBI6-It@vSV}G6JtZ@
zINm275t2NyA8M&{Ve0pN^8V;xKa6#H7j}bqN8WUKM_<@B`yhIZeB7A6^#~ItPC$4k
zL9z8HpE5G}9cFr`_x6921|E6W0R!IAlio*CtGu?yy-&ZJI6G}^s<wvvsI5oM{D-f&
zXNiI7jp?0GOhwR@wmx2b8z9?jGN&Uauz=?sStXLiFZ~bbrH#Exg^6QZ$8Uj7raW+D
z8??uN^BOOesQJjt1KvID)lOakaXGye_W+~M<L0sg1C9WlNou(d#mFviLaS7|p(t-G
zDbL|cq`m~F<8f>jFQi=<&5Gn`mb?mQWI&c%-fdv`tU4k`1`MV~3uJ4&qls$I=w{q+
z;O?Zh1=sk-2@JseYezA=Q{OqoHu}<+Q!&(_Z|^4W=(~sLdfYMZB^mu!OJU6`5XRrL
zq2+yOQBN?Q@FfNh4MFnAlTzH-H`?J$?Se7y=&$5m(a}Rp(IfXw&^(;F7@oqwe*wi%
zuO<kl9<inrsg#SE@)hQW`y^ieQ2Ci=<*&ss&LVDs18~Q5#n<aG_@0DinN{izSitRi
z;$kFy3P~L7{^ih%>GU3zULpt7mp(u8!->>kWRwcwkZynS`C@c}4;Srw=Kq7+K;zr9
z@G$76w$azyMxSlJ<OTW6{Kz|w)RB{{&MOI{?q|CFwvWEpKKeRtY?~%#KIa`dYV$6C
zDfuEhlI7}S8UJ59nrU0IJ?$xey3UZDW2Sd^McwP;H@<3Zf4s3DzYY+c&u4d7K8m;v
zb9K!tdRGKi^sN|(u874~Y+E6sNz9ZQz0qVOVGJhY38N=y3`OIiL}**EOmG%G6zMmb
zJ35W7g{|gPwFcK(LChpC?oIa8)wK=|C3cvT<89koTer!{<c7pR48|McJFt8(xVU#{
zXz`Fg*2;NimDI>Ne@y)M<R;YB4~6}`!GUNP)4lO}U!7GeH?jV7iHlE{xa4$+OHY^B
zaJs~0r%P-+UE=b>iHm)8wbs<d@!$uod_y3(Ez}!q#W`o_jQjh8D~$PZV+-fwqmlU5
zvJGZrMVVnN>^0ifZ*E=dYi??9-((>01jmg;)Q~G0nPmlI2u%g^{K0;6NyXSwx0ME(
zwSf(t>o?QD`s82_M5BFHX=HDT2aJ~1Yu9(KqofRMNM@0lX)+!(w$w_)ea4!$^-b-*
zP48ddyrBcKn|4Hc4XiAv{QIIY1B)#Fu+fVmYx_|nEpOV-qebzaWMZ<msbM|lXm9g}
zlfk&*j|J(x0-@HZiA0S!N_$CRp*o|65(bt_j6ia5=rp4BN^~HK9tt06Z(*q=-WQCW
zMsu;!yh$!sp`#CCF-9(k#i_M0mRke8?q<lm;HPLWmS_UTU~n)R+hJ_!^M~U>o}I~|
zCYl%s#%>Pb1o9*{ceXScxdvpWabr_!Ym3Ezxj<$_%yttGB0tqwwizF)y|wPv`db%&
z*p$CE80+Eqw8IDow*|vSO(58l>^Gv3@QzD#yBV;UmeEZE(VLCLKq!uC#MvD<`PUqc
z#e!k@0t4Lqq=D5b**T1xLx}+by#xLTmiTb4ae}PPK8!UxDPqp<Z^GIUZS;k<qufJL
zh#FjA;)z1jcFRtP16HKaUX??rfUy@R1MRR6=g?qdW{1lx3YzBSvY%yf&xi#RNvu5v
z4AWq5G!QiA2g*ci*gq5p%!NK<zPq8%7-TO__`~nh>>}Au{M*ohOdqgpn+LGU7)*qa
zW~{!>H|XCE@x^_{I`1v-gVSQsWd8umDZ#ZpLvdBE%;2rYb&NX+EbP_QVmwL)BZ-hd
zYz|Gz12a7`;{E@Wj?FOZzh6=9hv6Jv<9{g~C}2k@yrV#Rt-^V7hIhJ~<Vo^;+;g*-
zFHiNFP&mjzH)3$v77Z*EG327FZzzeeAkLP{CeiUq=OLS@5GV1}z+e|Pn+Nm(#S>#F
z!JWv}W>g-hTjJK}N6g<DE-q1^T^trsi-1$veUi7s2v-+hEj*$TtC$|FS^{*(2e@?X
z??Ft{9fKw>sV_bd32Ybc$ToL~OYV$dNO1ejUmX0$4VVQhaI6Q;HR0Tl_Kn@{tSDP_
zA<~_uj8oAHMXMAw6s=LTR?$XD;R!`eeVil3oBB3k>f3~=Zxg1zO_=(2rEeFezFq0t
zg{f~>`gWynSNe9PpIg7bl;`QqX+qM3M;U5TR1?Zu0~dDk>EEt|{+vD+X5fQK;Yo$m
zDoPpVEu9NmrEs2Slz|UKS>a8^9m_&9`q~Z`x{C3q+k1|<IG_McwI0htz4scddd%~^
z2Cb+2e;1|#8Fc?YU|qZh{-4$Vg|@4%N7nv7?XT<q_$^ApeVRuZJyQHRnfL(i4+DQr
zsy`g<@rUK8(~G<Egg=s~J*|$s)$#Sk{P-o09H_E@|LEX#x_bXVam;(a!I&g@<`L6;
zTK+G8#AJ_nPmR~NX=CeJ;m&HYyw+v`IZap&BLxrZkm3-k^IN1iKj+|xn-h+Lw*l8<
zBF9IDSMWz%|At57T>B-ic}?djTn|4Qzf$4)ensP36s}98@$CxN*CvgBQsMfyJsSUp
z!u1`4#($`AJq~O9WrfR1$~u~{Qe3E5Fa291Ex$nFdhFMDi^8?1HO}`GXa__FjrRj^
zq`j#UK|UpMUHT0Epv1M;GB~fBl@&Ty+?%Q-E_T#d;^c83bt`H4wA+V=YlL&+_Td_*
zjGBT>)7NRGnkM?fT*RWu;+aNtNsAUa#n~#U=pr*t<)TVUe7YsxVE(4AXjzfGUv<o$
z5Jhu}aA$0Ws}&XhP44f!sy=K7^wXK}-@^m;ThOD~{dhljtX!<yq3bp55HFUCwcs(k
zjWFU6W96bMoesJG1YYO%*HVs0dTh1Y@F?b8D~PxrzckG8I-fqrc?SN@-x_lD3q3bD
z)xe_P|DqZG*h`P!`nln0V7WG#UzHl5^|(chAy@Vi6V4SAcPaf^Gd*2LJ>Dr;Y$~9C
zQ0dRH=(CyR>i-THd^jop<4XUS($_@SU(;V8L0^-#4Cw)Nsi;sO&k^(1dKy0fj6TUZ
zP2!-^@3uHBU-^0cp@=FjVrCdtg&I~o^T)VXGz@ADp98}(O=?5xlBOP$(khnXNR0W)
zdlh}7k(M+gtRI7>%!`3`sHf{m%o@DPA(~qCWL;JHtUA`XE_X)HX7fToE=wxv0kz>t
zFMX{8Ki$B=687deW=5G!O!(cHoQz|U!dT|6D|7Rg{=tM1AM`C-TITi#0x`_r`r@Hm
zf=0uVva&L_`3z;a{ozo5#HcgeJ&nesr-4EflbB;(*(_#$Tt6)=5tW}$FGSUgb0+M$
z3ruwbqH<b;m|Nc<E}XlVhb25L<)MLxWjr+Uu)IMS8@N=nOjMSC&3<l=IceW1PTI=c
z+d}aWUY;@Bm?n+wIM;E~Rbch3rmXCevJc(v*pC_7m1Q4BmBrVad)DwASG~_;@h1E|
z+p#hLgyYusA5Tm?0P6hbi3u(vAOHEp!~kg52`oqA{NDvSjP$#HH8F7<bRXzR(5~kv
zCc2S6@au_*yFibF9s^B*ra&1uRx#La*(hw=D{Ql;l{oj>N-BuUF#)zOl@jJ5zq}CD
zN`CA|C<|@7D%QBFt}35;vvXLiI&bBri>oiB1?sop?*Q!TdsgDq{{a5>13yfl#WiG)
zmbsj*Q=_i!Ev{kP)voPLuAz1$?ZjW|*@=nA*uYo2Dn4#s?W+14hu3B7DO&BS`B?E<
zSM8lsnp}+|C0(wQc86<9nX9qMRompMS?w|)3!T+2XA^zPpB(-ee*@1=Ox(tDH@PZC
z?Cq|qwUmT>ZNIDHDykUQxvG%3R>rT-j>DjdZ;{5+ajSegGo_u)f4Q#Jjx~OoeAl^T
zm!`iTL|bwGVq#(e;_z$5RwU8i-v@s3mlG2upk}={xc1xZe^=(JXmUAQ=pK2levkCy
zI4?m08GpB?{q3c{z4Z4wmnW9b*YdnyPXD59lIXzl@!8`|*jLv$opgqt@xU1mobkXJ
z51jG984sNCz!?vm@xU1mobkXJ51jG984oBAoSfs4v7uQeJvrN?#}qCTao{69LoQvY
z0F86c39B6UsIc<fQ-VRy-RS*q^t7hl$7Yr(mfmNLFC;VQ`EdP?|8cyI!Jy}#^*rPs
z-kpdN-)Hxbq}O>*l$gAn+XSAno4lUpyjaac%E=3)>-=)pveMI)9o>-1%qt&I37nT;
z&?}jIiI%}T%_FJCs}!%xT`N;FbKN?Byb31kvwf<mkA)V=7b$tHy-41v(iQw)PV_!@
zxfv~}@JOgZ(KU)*t7xC1+Z4T1(N8J*WktWO=;Mn1MA6?W`nsZ1rknYlqv!%f8x&om
z=(URWDY{M3I~DzuqF+|@+loG}=uZ^=t)j0hI;B$Oujm3r8x&om=(URWDY{M3I~A2q
zvmDkoH?J^iI(zU+M$%a7UgEA@*pQT5-EDOZ?%E~py827Z1f#yTuA#P}cDZ;T44D1Q
zY*r`n{uYCrsFShmLl;`3*^r#JDkx6lVqX$6`4sIo=ckZtx0iegqV}Q^{LWL@oh9EQ
zu}#@0(~4I^+fh`EpFbn5_&k6`zlM0x$PGv+`Zf8Tj6(5xkYeW)=iOv&Q$9}vMMeC<
zh22?X04(C9ZSirMEV>MbDYf6ivE*VLj+ea39r&CQICUKm;e4BkZ*Y>=={(N(n~YaD
z%h5GU-eSDU`3kI;{E>0P`4!5%&3KLTFpQP_iSb(JcZt8lc%yR~@pl>blrE(mVI$)$
zeTfM+$#j%1rG{NHU8O%HQ)Igl0^OzD&aGsMjk~uElzyL#)5g*cmD*@+nq(5C95qT@
zHcpgoFI~nmR@#<;87^Hwu^F~DFn5$b!<3n}ZLl*^dMTMI+nr$UD&=rdGS4;yvbU6D
zZ^<RLJHYHK9c8AMN@joQUNV<S=0NGE$-GZ850u_V#%=pNbPkr5GSfw}Y-6Q=qP05P
z#~|>iC>v$cr%IjxSXJ^_R?^8W+)6&jsBjKa_&!$G>1<~F3#@I0nBrUxY>GTzQ+dK!
z>}elDgeXs?L_e$189=`P<}BGboW*Cu7MjchATDlI=a@^oj-NxM9J@Y&XS9MZUpwYL
zkK=Sv$rDvci!N@yXup@d!-eu!)Y96$Qsav<N)6$<gJNMR#&d0og)?GdQ_Rc!eKP;8
z6x)G=!`I8G%-?6`KgIb~7Jwo%P9kPIiz>N2%l6qvi?O<K5p3GdV%7|258_~}Aj>Xr
zPG3OSrqj?T!JS*n#7e3;?m2N=6>Ic+)zOh6Y%|M?gqU>+;9r29bpayh!u)x&FYqv3
zyo_t$Y+I-OW*j#=UbA)1wq0X)BD1Q0g#)w{Wo_pyrztxutb>JFZy-I1h^XRrkSmov
zHagp^yC}(3A`S$z9z*0i%(PPpm13x2rg3I!tCLwiIoq}orNBj;?mnO-xupq%ZQe69
z*@2{2Vd?x<M9w>`gsP-D(@?2{FZY>2S~{wP^u2>^;e~LPv~)hoWm@`0UQ2upkbxzO
zgI=I3c@Z*P0V}M7^GR_*k5XR*+^29^w^9dRVw7QSR0*ww&;_?zH0~*+F>2K~rI<BP
z|F+U7oe*>1Q20jRX21C{Qe|%%5yso7;*<mIg2o+i*_6MRQ5Re|<*SU&qn2|y&bKIa
zj9t?W3XliqTDH(qhu%AIUNOX-?Iw6YOMVrWN~fCO*EINJNS2<%A*<B!MGeL&S#^&D
zA5@_1<|Al<vbY<>yj~`s7Y9=6@NDb`htj-&L+NZ2JfSo%j06Tn>G@{H`*j}Yql%?+
zFer6ApuvZ!KF^dqsKATDknF)<nP9AX+I?mEHPs<;(e)^52mZ<gWAmFK!9Pdmf=#f+
zzp2mI7Pvt9H>JWh|KpJ3-;`Gta7~qeQ|H<*z6M(SLx;7^cg>jUaLsm2w^f!@oQtug
zT1=gKB_J49QCz2s&cUd(V(QANZ8LDSD~AY-l90#H?{H0>%48c@=UHjlMu?RvwzzEu
z*)nYjMX}l1W>90=Jkd5|5faZ@fC!53wym0F=6CU|wi(9MC9^12UMw6=U5E-W=@L!n
zrKNJ2RGyKo%zAmMKD}C6n3++sqpoK!N!uWcrlIYVJ=c;7`&H-L>Sk=Tb)MTb_5HG*
zvx-IC3<{n%dWQ|3tV0oJ-+c#OUah)kH5BGhf!S+?^QnMNro~yOxnP8;Yh?0UHS2Ob
zW6ZMxc38YHQ@~ZT2wdbA6pgq{S442qsvD3$T1R$@5DT&Zbj6D^0G!7W(=JZ2P1xRc
zorR+f2%o5Ii<0r!qEMtaoD2jPVJl=`!xFVGa(^Vbs3(LS6Qu}tFkIB%+dElgsn#~-
zlqrEd7lRROJsA$h-Mz?*yC>phv&HTKW{XV|L(zIOp4c%284U-5eTbtkOA$oTl96{P
zYDUfd8O^wS?FUhW%X^ZcaAIL7a;2G704k;sny6i0@f>H_itLUP;;JH>WB*6GWIQ3`
zm&>@*?O4zSe&<Ji1O89Kj3<g5j#5T%?qlroBHO#pnw`#?5pau~KS6@W*@gI+ly7(5
z=SUuMRNP_5yFWI*=!zX3LAX_<xf!JouMqp9eSPs@LIk2oxp!&X2H%>t_Er%H#p<w4
z=>}gMFUQHvAmankSc0$p3I5u|m(aUUlB4GYUF?hSr}%h}kBnF}5FHFIGH=xv4aK52
zE(*l`b#=81;{(BPIL(Cmk=DB?9>Z3r$WgsYmB4nDePMrpTtwn{1y3g7%|N*^r@%{j
zF}%Uaom4mYVnKgE7BR%VKjo>mob-&y5KQ)kqkh!D+$s}$w)PwHAgk3I3&gQ^C-(=%
zUYG&Aw=eJlcFfls4JQX9!iU$uh1!jidu8%Oha$f0jW*@poxV5<D)6RYFg{=@37VhU
zlT)aTJd=?)ex-vq$na_<cju%FLO4&`MW$$J8J3aNHLEW0t`<%RGs148{s7Jv&t=0B
zoSBRlxr?drB_nwKG*Hj8FLwcj5qeT?@wvX!Xs)Zvd3rJy4knV3;1&JBNH7-a1?!LX
z4qRbwP;M;jHx_QFHx|lPG&qEa+{knx&SE0in~261qKm<`>}hycR`?=#i$pSVt6CJc
z-YoNClPodGQeif+><at(2CWrhndKRdL<P1v#e2`FAH0XHOlAF5?R&7XYI4YIE9}2W
zm2C+b590OEOq-Anio}yWJ=~<#w+%1Dp%tr6K|jj&Z@MvSD^{GW+}&F(EyB7LEW8l(
z53<dnU&x&04zvjl5HetVmcb@Eb@b9Sv{^^>gKLq%Z7QElvchJ?bt`HU9Xk488rsB1
zL<Vikrhn6D!aT-Q^fpCd=wm9nF9T^qKgd8TmL>~SOqbs#(p9-kCFQTmWW`QxLo3?E
zb?JOmL?^9L{1kEOZQMvr3b}H5Ole;RmGWssy3-<QcW1OV<z}wE_D?FIOr&quC!NqA
zDVcxE$m$#NNwRo0kfBu33Pooqsy(XfuHZD$_zH!rQdIjgEvO<_D|(Hhxem*fe-Au&
z_ieO$CP_MQ1>T$&cZft@yeK{2X*Gsjdeifq+4vNp=R>pc5|M8n(ScvE>iNfP`BI_h
zF|+Zq^n7PFK6&4BraLezTv6e%ipsbHFZb=O%!j+g{=E3){m^O1feDC%*n=lGI5104
zRg({|6x?kqH#o!$d|x9UJ`*=V`S7#T^Vn8f4snjq^Viw<xq{ma<pu|4qw~$TI>aoY
z_gl8=I0R<N(|t3Of}Nd3hk6{!p!Cn2N&DI}-677milyTYaX~t6jX1J)!p)>Ogppw^
zF^BosX+g}WLtHosBj6Ah6~L<};k?5x60c6=CQJih6O;YSbq6~h?mFgpvQ#*SKzwfw
z?pAp2dPxg#8~iz-{LFGNa9p(IIq$asr~Q*T_V1MPliSaifaj~%bHF+4mHT_(e+5eW
z+{%lA^=IHTE6X{g%Bj~OE(A__y)Gnw+Y0$6DWB;Vcn*@`T4jg($T5JG;dX^{?>GjH
ze@@}tFV2epqr$mGoE3jo;X8Bi*A&iu;H>h~@$kZWaql-PzDVKR@6C#@Q}}QWev`tv
zshd@v)2X!2ecY@#zu!Qdd$(C}PJ|KXer;C#Hwxz#ZC3mb3g@P5R-99^vYpKKNL*ju
zIR8QW+>4C?Zjs@A!1J|}<puC``^gy(X)m80&W_md>)d@;ddv*fk#isU`WKV)+2O1v
zCP*iZt3%Sx<Z+Lat$foWcl^9t;rhEwlw$atv@>};{3h^9QzYy71K|8_hn`^4w;4|>
zTu*M{G|BKU1?;?5055^TeEC)Y&-Z<ibAexo{#dK}?_rg1J@9<r7vY3$KKqwq*3ZZ@
z{;w|}f1R|Sxo(1$p-<v-gr51+co>m<`QB3izhBwWb9UNJmGoQY`YwN~kDL!Gc|DJ)
z<&PkeFW+Amz+a{v`)t7t46Wg<0`i#F5|wtB7*_WWmnfy##H|KYd?^OfO1vzm*R!=q
z8{~Pu^mHV$lVL*vJA(!AeFgAu6u^Heaa><i`_WH9F9K(I^jxnl&#MLOxY2$pc^y&L
zDgAU(G$p?u_<mYG|I`$~mleRjDD$0s-?$0#EdQ7~kNWy`J#n1h?ws@6SHS+wzzuaS
ztYmiq`TGjsPZq#`3Y_IUp~};tGTC3i&o30P^Hu?TCj3MDy1v>EHNZKZ<lgV?LjS6O
zeD3#(mP+}I9qo%I>3_{-=rslK-U9et1@O;E+=ba%k2D@sS$wsC{KEzC7YpFUa0vaQ
zmlm}D%YhpzFDtnWINLdQD`n8*{7Q*SDC=-t0eo8l`@0I@pD$qNn+hM!ectJfCE^Ke
z8iolrpYN*XjlTA_O`GxQC2aYiKD>ktIDPnbP9(_B?gXMf^DQ}F0Fza5pFg=>a38pE
z5c|5hmn~h+38_3n^7}<Te=O$Tfw?ohriQQQV7k+Xz3K+}r6mg|KWUUDniw3~-k0#z
zA}sSa*2)N(g$?rSPMBmgH(!g!+}!!GFIeZsu8^Y7pXdoiTIA<>u5YhzFhAOq`O=g5
znWxS~D9n8&+v{sh0ek?eISEBCX0`Z9Dx@soj8zEJm04etY6_c*bxToxf1*Whl(?Dm
zb?@b+!S+czEoL^fY|q`eGW{aqq?GKJ3foV+rzKKPv#BLgPqV8fQctt3B~nkbuO(7X
zv#}*oPqVWnQctt*2vSe8w`F_o=9ZIu-k#px(zmvK!)j~~>BD@cuMHE$ocFwbQ=+}m
z=L=yQK73XxYcT{{ncjq*z>+;^ylXZ#t!wqQu5aNd%Va0gaeQ9V*XmUQ-j<ER*Yf`L
zP3zj4AxE3(X3npKSpdwB#?XvZnKrChv#E8nZ*$Y?_EzSQ`8buwBfm3+Z%gqjQe4?g
zPwb+_>ThagOPQ-L8H``0;$DIBi(&e6R)u$9RTJTundNu1CY#}RzvN7<oICb$_FT=&
zXZId_G|l6|3J-g%<?C={&2P8mOHI!<s|5l|TR)+ePbd8mGz?7fs1F-4%5H7_##g2m
zbffx;SaxyKpM`6U1oGu#F0{zf%kRqNlPO@afN1)AW|`9QRGCIHqgc9N`OIXREEh1i
ziY33OmTl2oP|3#RZmpK@&t-2Wi9%%0;AfPVFe9#$`F!}YpXwCU&ick&z7PA!C6SDw
zOkMRZqWbH7`6|k9v1RL87PQQ7y=5n7eabF7$z0*e#?W?f9bv86FD)N?la4|w)L+XZ
zpZ%phTnuu*i^uQZDf{XZaaeqd=jO=H;p>ejGvD~j`Szc@8s+;wVLlIKeGM;@6&rkJ
z_3`xIW=t1IevFW9onuD+%%rSzmmEto8Jg!c`!kRHBBZ|E$od|n`9Z!M<;ni!cLtS^
z@+Y=i)Vnk4-5S&TH)1g`7#qS?X;^Y|H?3}4nDF;24$I|kE=2_vh62J(ZorRqNOxdI
z1oljdl~9?Y_7=!ud`ROyuvAd7p>RTAM@S5cZbaSvQAD@{ns7@mx?@rK0YU72#hVD+
zcW@6kvr}&h_7CD_Nw|Xp>IP{bfDANenl&#WrZpYEu@_<}f`2g7i}FPi%o+%9h1~d1
zp};m<gQjV7%Pc=O|Lw;Op4vgn+`v*L_4LGo+jI&pd)O|{Ou;2UN3klWqsmg&-R)n`
zaNd-2NBaI*uX8jiU`~!{%38xkU~o&H*4OJHeCM0NkjKnHKfh|E4iT%qUN=c7{c0_!
zXs&(>aD1I0s>{F6?vaxF6u{r*G2k&y2Hd7+h4tX3;!o?R>>hCZ-MJ*<O_d=Y>!hEw
zetKRNI3Cr~`g*<Q1S^H0f3I$GrVSk@y#asxt+>u#uj@EjD1;kLk}kiN)AS~!^S9@k
z*XuvqmHvLE&+@Y>y8H?joYy9<_4PVYm70LARf4(x)Aiq}^j9l6{rh>Z($~MYr#@!)
z()nxszX8TJ#qu%e_YF>#n>=-_L7k2?G>Vv2U$0M9RG1>ex?)O0!FaYd!zVzj`g$E}
zubLFe^;a(cxg7odl^$t$ztYD~zp{ePSA&0th&6vbAN94lri5NL)$+OZ|9hpctEtz|
z4l4cboC4>t;_Es3dY!GR#>`QE=}u=3nydeCj=o-x8(wKD>GP~j<mz*gp7qz|*Xwrs
zlzu=fDw<pWpXKPM*Y_?r75KYiYsk(21teltL-t|H>3Qj~b>#4qH<P4~x%#gng-w-X
z0E3?IKCbk0<#YLWpu_rWeZ8)D;yvoyq0GLc_4WE=O6gaq`s@Cs^VjWgUrzpdo$}2M
zNI=l?CYRwgJ{9>}%b#A)>@YR>V@+$&dYbZcDpq|x57wd9?{zt~4XvknzPUh5*I&=C
zbt`@TgxneoD}(r4m49sxuJxDZ;M$y~HA+86Ong%3pOc{VG|ZO>axtboru09ct|MAL
zSC&8C&&4P^@BOAmSx)=UmA$5b{$*QC{T(?9Ijq>K^mG09#SfVJpU@}9WYcBUcKCZ-
z9akcfG<KU@+n*^vf4iJd|MnhJf4@o6$6VRzNXaRpl<ne$5rONY){w)>ymN{u#cO&^
jtu78O2p8jzr&PCJJ+5$k&zJx32a$kqTq~IDq`Lk)d9f~T

literal 0
HcmV?d00001

diff --git a/legacy/dsaX_beamformer.cu b/legacy/dsaX_beamformer.cu
new file mode 100644
index 0000000..afdda70
--- /dev/null
+++ b/legacy/dsaX_beamformer.cu
@@ -0,0 +1,1128 @@
+// -*- c++ -*-       
+/* will implement the 64-input beamformer 
+
+does N beams of 256
+
+order is (taking time as 8x 8.192e-6) 
+[2048 time, 63 antennas, 768 channels, 2 pol, r/i]
+Load in 16 times at a time, so that we have (in units of what needs to be added)
+[16 time, 63 antennas, 96 channels, 8 chunnels, 2 pol, r/i]
+
+This should be reordered on the cpu to 
+[16 time, 96 channels, 63 antennas, 8 chunnels, 2 pol, r/i]
+
+The first kernel, launched with 1536 blocks of 64 threads, needs to
+ - promote each measurement and store in shared mem, parallelizing over ants. need only 8 kB. 
+ - each thread processes 4 beams, adding everything. for each beam,
+  + for each chunnel and pol, calculate weights using cal weights and ant positions, 
+  + add everything into output array
+Output array has order [beam, 96 frequency, 16 time]
+
+Shared mem requirement: 8 kB for promoted data, 512b for positions, nch*1024b for weights
+
+Initially we start with 4-bit numbers. These are first rotated using 17-bit weights, yielding 22-bit numbers. 
+these are then added: (64 ant)^2 * (2 complex) * (32 chan) * (2 pol) * (16 time). 
+after adding by 64 ants, we have 28-bit numbers. Need to bit shift right by 19 after adding 64 ants. This will yield 29-bit numbers. Need to bit shift right by 21 to pick off lowest 8 bits. 
+
+Do everything in floating point until second kernel. 
+
+Second kernel will simply add times and adjacent channels and pick leading 8 bits
+Then copy back to specific locations in host to form final [beam, time, frequency] array, to be sent to corner turn.
+
+ */
+
+#define THRUST_IGNORE_CUB_VERSION_CHECK
+
+#include <iostream>
+#include <algorithm>
+using std::cout;
+using std::cerr;
+using std::endl;
+#include <stdio.h>
+#include <stdlib.h>
+#include <cmath>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <time.h>
+#include <syslog.h>
+#include <pthread.h>
+
+#include <mma.h>
+#include <cuda.h>
+#include "cuda_fp16.h"
+//#include "dada_cuda.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "multilog.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_def.h"
+#include <thrust/device_ptr.h>
+#include <thrust/fill.h>
+
+#include <cuda_runtime_api.h>
+using namespace nvcuda;
+
+// global variables
+int DEBUG = 0;
+const float sep = 1.0;
+
+// kernel for summing for online bp
+// input array has order [beam, 48 frequency, 2 pol, 16 time]
+// need to output to [beam, 48 frequency]; the block total is ADDED to output[bidx]
+// run with 256*48=12288 blocks and 32 threads (exactly one warp per block)
+// (bidx enumerates [beam, frequency], tidx enumerates [pol, time])
+__global__
+void badder(float *input, float *output) {
+
+  // get block and thread ids
+  int bidx = blockIdx.x; // assume 256*48=12288
+  int tidx = threadIdx.x; // assume 32
+
+  // each lane loads one of the 32 [pol, time] samples of this (beam, frequency)
+  float sum = input[bidx*32+tidx];
+
+  // Warp-level tree reduction with shuffles.
+  // The previous version reduced through volatile shared memory inside
+  // "if (tidx<16)" and relied on the warp executing in lockstep. That is not
+  // guaranteed with independent thread scheduling (Volta and newer, which the
+  // wmma kernels in this file already require), so a lane could read a partial
+  // sum before its neighbour had written it. __shfl_down_sync synchronizes the
+  // participating lanes itself and needs no shared memory.
+  // Lane 0 adds the same operands in the same order as before, so the result
+  // is unchanged whenever the old code behaved as intended.
+  // All 32 lanes must reach every shuffle, hence no "if (tidx<16)" guard here.
+  const unsigned full_mask = 0xffffffffu;
+
+  sum += __shfl_down_sync(full_mask, sum, 16); // over pols
+  sum += __shfl_down_sync(full_mask, sum, 8);  // over times
+  sum += __shfl_down_sync(full_mask, sum, 4);
+  sum += __shfl_down_sync(full_mask, sum, 2);
+  sum += __shfl_down_sync(full_mask, sum, 1);
+  // only lane 0 holds the complete sum over 2 pols x 16 times
+
+  // store
+  if (tidx == 0)
+    output[bidx] += sum;
+
+}
+
+
+// kernel for summing and requantizing
+// input array has order [beam, 48 frequency, 2 pol, 16 time]
+// need to output to [4 time, beam, 48 frequency]
+// (each output sample is the sum of 4 consecutive times over both pols)
+// bp is scale factor for each beam 
+// run with 256*48=12288 blocks and 32 threads (exactly one warp per block)
+__global__
+void adder(float *input, unsigned char *output, float *bp) {
+
+  // get block and thread ids
+  int bidx = blockIdx.x; // assume 256*48=12288
+  int tidx = threadIdx.x; // assume 32
+  // beam that this block belongs to; selects the bp scale factor
+  int beamidx = (int)(bidx / 48);
+
+  // each lane loads one of the 32 [pol, time] samples of this (beam, frequency)
+  float sum = input[bidx*32+tidx];
+
+  // Warp-level partial reduction with shuffles.
+  // The previous version reduced through volatile shared memory inside
+  // "if (tidx<16)" and relied on the warp executing in lockstep. That is not
+  // guaranteed with independent thread scheduling (Volta and newer, which the
+  // wmma kernels in this file already require), so a lane could read a partial
+  // sum before its neighbour had written it. __shfl_down_sync synchronizes the
+  // participating lanes itself and needs no shared memory.
+  // Lanes 0, 4, 8 and 12 add the same operands in the same order as before, so
+  // the results are unchanged whenever the old code behaved as intended.
+  // All 32 lanes must reach every shuffle, hence no "if (tidx<16)" guard here.
+  // full mask: all 32 lanes of the warp take part
+  const unsigned full_mask = 0xffffffffu;
+
+  sum += __shfl_down_sync(full_mask, sum, 16); // over pols
+  sum += __shfl_down_sync(full_mask, sum, 2);  // times t and t+2
+  sum += __shfl_down_sync(full_mask, sum, 1);  // times t .. t+3
+  // now tidx = 0, 4, 8, 12 are what we want! 
+
+  // store: output time slot k = tidx/4 starts at offset k*12288.
+  // The sum is scaled by the beam's bp factor, rounded to the nearest int,
+  // halved with integer division and then narrowed to unsigned char
+  // (same expression as before, evaluated once per output lane).
+  if (tidx < 16 && (tidx % 4) == 0)
+    output[bidx + (tidx/4)*12288] = (unsigned char)(__float2int_rn(sum*bp[beamidx])/2);
+
+}
+
+// kernel for promotion
+/*
+orig input is [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i]
+input is [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i]
+output needs to be [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] 
+promoted to half precision  
+
+launch with 16*48*NANT blocks of 32 threads
+
+ */
+__global__ void promoter(char *input, half *inr, half *ini) {
+
+  int bidx = blockIdx.x; // assume 16*48*NANT
+  int tidx = threadIdx.x; // assume 32
+  int iidx = bidx*32+tidx;
+  int pol = (int)(tidx % 2);
+  int chunnel = (int)(tidx / 2);
+  
+  /*int ant = (int)(bidx % NANT);
+  int time_chan = (int)(bidx / NANT);    
+  int oidx = time_chan*2048+pol*1024+ant*16+chunnel;*/
+
+  int chan = (int)(bidx % 48);
+  int time_ant = (int)(bidx / 48);
+  int tim = (int)(time_ant / NANT);
+  int ant = (int)(time_ant % NANT);
+  int oidx = tim*98304 + chan*2048 + pol*1024 + ant*16 + chunnel;
+
+  //inr[oidx] = __float2half((float)(((char)((input[iidx] & 15) << 4)) >> 4));
+  //ini[oidx] = __float2half((float)(((char)((input[iidx] & 240))) >> 4));
+  inr[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(15)) << 4) >> 4));
+  ini[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4));
+
+}
+
+// 16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels
+// for first time, launch with 3072, 32
+__global__ void printer(half *inr, half *ini) {
+  // debug helper: prints "channel pol ant chunnel re im" for every nonzero sample
+  int idx = blockIdx.x*32+threadIdx.x;
+  float ir = __half2float(inr[idx]);
+  float ii = __half2float(ini[idx]);
+  // decode indices: 64 blocks per channel, 32 per pol, 2 antennas (x16 chunnels) per block
+  int chunnel = (int)(threadIdx.x % 16);
+  int channel = (int)(blockIdx.x/64);
+  int tt = (int)(blockIdx.x % 64);
+  int pol = (int)(tt/32);
+  int ant = 2*((int)(tt % 32)) + (int)(threadIdx.x / 16); // was a product, which gave 0 for threads 0-15
+  
+  if (ir!=0. || ii!=0.) {
+    printf("%d %d %d %d %f %f\n",channel,pol,ant,chunnel,ir,ii);
+  }
+  
+}
+
+// 16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels
+// launch with 4,32; prints the rms over 16 chunnels (real and imag) for each antenna/pol
+__global__ void rms_printer(half *inr, half *ini) {
+  // one thread per (pol, ant): 4 blocks x 32 threads = 2 pol x 64 antennas
+  int idx = blockIdx.x*32+threadIdx.x;
+  int pol = (int)(idx / 64);
+  int ant = (int)(idx % 64);
+
+  float rms = 0., val;
+  for (int i=0;i<16;i++) {
+    // 786432 = 8*98304 and 49152 = 24*2048: with the layout above, presumably time 8, channel 24
+    idx = 786432 + 49152 + pol*64*16 + ant*16 + i;
+    
+    val = __half2float(inr[idx]);
+    rms += val*val;
+    val = __half2float(ini[idx]);
+    rms += val*val;
+
+  }
+  rms = sqrt(rms/32.);
+
+  printf("ANTPOL_RMS %d %d %f\n",ant,pol,rms);
+  
+}
+
+
+
+// kernel for beamforming
+/*
+
+Assumes that up to NANT antennas (nominally 63) are populated. 
+
+Input is [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] (promoted)
+
+Arithmetic... for rotation, d2r = wr*dr-wi*di; d2i = wi*dr+wr*di
+
+Conventions for beamforming. beam 0 is furthest East, beam 127 is at meridian. antpos (D) is easting. 
+for bf weight calculation, where theta = s(127-n), ang = 2*pi*nu*theta*D/c; wr = cos(ang), wi = sin(ang)
+use __float2int_rn, cosf, sinf intrinsics. 
+
+Each warp (==block) has to deal with 256 beams for 64 ants, summing over 16 chunnels and pols. 
+Do it in tiles of 16 beams and 16 ants for 
+
+Output array has order [beam, 48 frequency, 2 pol, 16 time]
+
+inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag
+wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant]
+
+launch with 16time * 48freq * 2pol * 16beam_tile blocks of 32 threads for massive utilization
+ = 24576 blocks
+
+*/
+__global__ void beamformer(half *inr, half *ini, half *wr, half *wi, float *output, int stuffants) {
+
+  // get block and thread ids
+  int bidx = blockIdx.x; // assume 24576
+  int tidx = threadIdx.x; // assume 32
+  int orig_bidx = (int)(bidx / 16);
+  int beam_tile = (int)(bidx % 16);
+  int stuff_tile = (int)(beam_tile % 4);
+  int data_offset = orig_bidx*1024; // offset for first part of data
+  int weight_offset = (int)(orig_bidx % 96); // offset for first part of weight
+  weight_offset *= 16384;
+  int idx1, idx2;
+  int f_idx = (int)(orig_bidx % 96);
+  int tim_idx = (int)(orig_bidx / 96);
+  int oidx = f_idx*16 + tim_idx;
+  
+  // shared memory for convenience
+  __shared__ half summr[16][16]; // beam, chunnel
+  __shared__ float summi[16][16]; // beam, chunnel
+  
+  // accumulate real and imag parts into [16 beam x 16 f] fragments
+  // Declare the fragments.
+  wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;
+  wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wr_inr_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wr_ini_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wi_inr_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wi_ini_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, float> ib_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, float> final_frag;
+  
+  
+  // zero out accumulators
+  wmma::fill_fragment(wr_inr_frag, 0.0f);
+  wmma::fill_fragment(wr_ini_frag, 0.0f);
+  wmma::fill_fragment(wi_inr_frag, 0.0f);
+  wmma::fill_fragment(wi_ini_frag, 0.0f);
+  wmma::fill_fragment(ib_frag, 0.0f);
+
+  // IB
+  if (stuffants==2) {
+
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> c_frag;
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> d_frag;
+    
+    for (int ant_tile=0; ant_tile<4; ant_tile++) {
+
+      wmma::load_matrix_sync(c_frag, inr + data_offset + ant_tile*256, 16);
+      wmma::load_matrix_sync(d_frag, inr + data_offset + ant_tile*256, 16);
+      wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag);
+      wmma::load_matrix_sync(c_frag, ini + data_offset + ant_tile*256, 16);
+      wmma::load_matrix_sync(d_frag, ini + data_offset + ant_tile*256, 16);
+      wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag);
+
+    }
+
+  }
+
+  // one ant per beam
+  if (stuffants==1) {        
+
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> c_frag;
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> d_frag;
+    wmma::load_matrix_sync(c_frag, inr + data_offset + stuff_tile*256, 16);
+    wmma::load_matrix_sync(d_frag, inr + data_offset + stuff_tile*256, 16);
+    wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag);
+    wmma::load_matrix_sync(c_frag, ini + data_offset + stuff_tile*256, 16);
+    wmma::load_matrix_sync(d_frag, ini + data_offset + stuff_tile*256, 16);
+    wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag);
+    
+  }
+  if (stuffants!=1) {
+  
+    // loop over ant tiles
+    for (int ant_tile=0; ant_tile<4; ant_tile++) {
+      
+      // copy weight and data to fragments, and multiply to accumulators
+      
+      wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16);
+      wmma::load_matrix_sync(b_frag, inr + data_offset + ant_tile*256, 16);
+      wmma::mma_sync(wr_inr_frag, a_frag, b_frag, wr_inr_frag);
+      
+      wmma::load_matrix_sync(a_frag, wi + weight_offset + beam_tile*1024 + ant_tile*256, 16);
+      wmma::mma_sync(wi_inr_frag, a_frag, b_frag, wi_inr_frag);
+      
+      wmma::load_matrix_sync(b_frag, ini + data_offset + ant_tile*256, 16);
+      wmma::mma_sync(wi_ini_frag, a_frag, b_frag, wi_ini_frag);
+      
+      wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16);
+      wmma::mma_sync(wr_ini_frag, a_frag, b_frag, wr_ini_frag);
+      
+    }
+
+    // form real and imaginary matrices
+    for(int i=0; i < wr_inr_frag.num_elements; i++) {
+      wr_inr_frag.x[i] = wr_inr_frag.x[i] - wi_ini_frag.x[i]; // output real
+      wi_inr_frag.x[i] = wi_inr_frag.x[i] + wr_ini_frag.x[i]; // output imag
+      wr_inr_frag.x[i] = wr_inr_frag.x[i]*wr_inr_frag.x[i] + wi_inr_frag.x[i]*wi_inr_frag.x[i]; // squared
+    }
+  }
+
+  // at this stage the matrices are [beam, chunnel], and need to be summed over columns
+
+  __syncthreads();
+    
+  // copy back to shared mem
+  half *p1;
+  float *p2, tmp;
+  p1 = &summr[0][0];
+  wmma::store_matrix_sync(p1, wr_inr_frag, 16, wmma::mem_row_major);
+
+  __syncthreads();
+  
+  if (stuffants!=1) {
+
+    // now do thread reduction using multiplication by unity
+    wmma::fill_fragment(final_frag, 0.0f);
+    wmma::fill_fragment(b_frag, 1.0f);
+    wmma::load_matrix_sync(a_frag, p1, 16);
+    wmma::mma_sync(final_frag, a_frag, b_frag, final_frag);
+    p2 = &summi[0][0];
+    wmma::store_matrix_sync(p2, final_frag, 16, wmma::mem_row_major);
+    
+    __syncthreads();
+
+    // store
+    if (tidx<16) {
+      output[(beam_tile*16+tidx)*1536 + oidx] = summi[tidx][tidx];
+    }
+
+
+  }
+
+  if (stuffants==1) {
+    if (tidx<16) {
+      output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][tidx];
+    }
+  }
+  if (stuffants==2) {
+
+    p2 = &summi[0][0];
+    wmma::store_matrix_sync(p2, ib_frag, 16, wmma::mem_row_major);      
+    tmp = 0.;
+    for (int i=0;i<16;i++) tmp += summi[i][i];
+    if (tidx==0 && beam_tile==0) 
+      output[(beam_tile*16+tidx)*1536 + oidx] = tmp;
+
+  }      
+  
+}
+
+// kernel to calculate weights - needed because weights are halfs
+// launch with 256 threads in 6144 blocks
+__global__
+void calc_weights(float *antpos, float *weights, float *freqs, half *wr, half *wi) {
+
+  // assume 256 threads in 6144 blocks
+  int bidx = blockIdx.x; // over 48f, 2pol, 16 beam_tile, 4 ant_tile
+  int tidx = threadIdx.x;
+  int f = (int)(bidx / 128);
+  int cc = (int)(bidx % 128);
+  int pol = (int)(cc / 64);
+  cc = (int)(cc % 64);
+  int beam_tile = (int)(cc / 4);
+  int ant_tile = (int)(cc % 4);
+  int beam_i = (int)(tidx / 16);
+  int ant_i = (int)(tidx % 16);
+
+  int beam = beam_tile*16+beam_i;
+  int ant = ant_tile*16+ant_i;
+  int i = bidx*256+tidx;
+  int widx = ant*NW*2*2 + f*2*2 + pol*2;
+  
+  float theta = sep*(127.-beam*1.)*PI/10800.; // radians
+  float afac = -2.*PI*freqs[f*8+4]*theta/CVAC; // factor for rotate
+  float twr = cos(afac*antpos[ant]);
+  float twi = sin(afac*antpos[ant]);
+
+  wr[i] = __float2half((twr*weights[widx] - twi*weights[widx+1]));
+  wi[i] = __float2half((twi*weights[widx] + twr*weights[widx+1]));
+  
+  
+}  
+ 
+  
+// function prototypes (signatures must match the definitions in this file)
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
+int dada_bind_thread_to_core (int core);
+int init_weights(char *fnam, float *antpos, float *weights, char *flagants);
+void reorder_block(char *block);
+void calc_bp(float *data, float *bp, int pr);
+void calc_allbp(float *data, float *bp);
+void ret_med_bp(float *bp);
+void ret_many_bp(float *many_bp, float *bp, float medbp);
+
+// Accumulates a per-beam total over a beamformer output buffer.
+// data is read front to back as 256 consecutive groups (one per beam) of
+// 48*32 floats; every value of group b is added onto bp[b].
+// bp has 256 entries and is NOT cleared here - the caller zeroes it first.
+// When pr is non-zero every non-zero sample is printed with its
+// (beam, f, a) position, where f counts 0..47 and a counts 0..31.
+void calc_bp(float *data, float *bp, int pr) {
+
+  const float *sample = data;
+
+  for (int beam=0; beam<256; beam++) {
+    for (int k=0; k<48*32; k++, sample++) {
+      bp[beam] += *sample;
+      if (pr && *sample!=0.)
+        printf("%d %d %d %f\n", beam, k/32, k%32, *sample);
+    }
+  }
+
+}
+
+// Folds the per-stream partial sums into the per-beam totals.
+// data is read as [NSTREAMS, 256 beams, 48 values]; each value is added
+// onto bp[beam]. bp (256 entries) is not cleared here.
+void calc_allbp(float *data, float *bp) {
+
+  for (int st=0; st<NSTREAMS; st++) {
+    const float *stream_data = data + st*256*48;
+    for (int beam=0; beam<256; beam++) {
+      const float *row = stream_data + beam*48;
+      for (int f=0; f<48; f++)
+        bp[beam] += row[f];
+    }
+  }
+
+}
+
+
+// qsort comparator used when finding the median of the bandpass.
+// Orders floats ascending: returns -1 when *elem1 < *elem2, 1 when
+// *elem1 > *elem2 and 0 otherwise.
+int cmpfunc(const void* elem1, const void* elem2)
+{
+  const float lhs = *(const float*)elem1;
+  const float rhs = *(const float*)elem2;
+  return (lhs > rhs) - (lhs < rhs);
+}
+
+// Overwrites all 256 entries of bp with their median.
+// bp is sorted in place first; the median is taken as the mean of the two
+// central values (sorted positions 127 and 128).
+void ret_med_bp(float *bp) {
+
+  qsort(bp, 256, sizeof(float), cmpfunc);
+
+  const float median = 0.5*(bp[127]+bp[128]);
+  float *slot = bp;
+  while (slot != bp+256)
+    *slot++ = median;
+
+}
+
+// Builds the per-beam bandpass from the NBP stored bandpasses.
+// many_bp holds NBP rows of 256 beams; bp[beam] becomes the mean over those
+// rows, unless that mean differs from medbp by more than 10% of medbp, in
+// which case bp[beam] is set to medbp.
+void ret_many_bp(float *many_bp, float *bp, float medbp) {
+
+  for (int beam=0; beam<256; beam++) {
+
+    // mean over the stored bandpasses for this beam
+    float mean = 0.;
+    for (int k=0; k<NBP; k++)
+      mean += many_bp[k*256+beam];
+    mean /= 1.*NBP;
+
+    // outliers fall back to the reference level
+    bp[beam] = (fabs(mean-medbp)/medbp>0.1) ? medbp : mean;
+
+  }
+
+}
+
+// CPU-side reorder of one block before it is loaded to the GPU.
+// Swaps the antenna and channel axes in place:
+//   from [16 time, NANT antennas, 48 channels, 32 bytes]
+//   to   [16 time, 48 channels, NANT antennas, 32 bytes]
+// where each 32-byte cell is [16 chunnels, 2 pol, r/i].
+// The block is 24576*NANT bytes in total, 1536*NANT per time sample.
+void reorder_block(char * block) {
+
+  const int cell = 32;              // bytes moved per (time, antenna, channel)
+  const int per_time = 1536*NANT;   // bytes per time sample
+
+  char * scratch = (char *)malloc(sizeof(char)*24576*NANT);
+
+  for (int t=0; t<16; t++) {
+    const char *src_time = block + t*per_time;
+    char *dst_time = scratch + t*per_time;
+    for (int ant=0; ant<NANT; ant++) {
+      for (int ch=0; ch<48; ch++) {
+        memcpy(dst_time + (ch*NANT + ant)*cell, src_time + (ant*48 + ch)*cell, cell);
+      }
+    }
+  }
+
+  // copy the reordered data back over the caller's block
+  memcpy(block, scratch, 24576*NANT);
+  free(scratch);
+
+}
+
+
+// loads in weights
+//
+// Reads antenna positions and complex calibration weights from fnam, rescales
+// the weights and zeroes the weights of every antenna listed in flagants.
+//   fnam     : binary file holding 64 floats (antpos) followed by
+//              64*NW*2*2 floats (weights, consecutive real/imag pairs)
+//   antpos   : output, 64 floats (takes only easting)
+//   weights  : output, 64*NW*2*2 floats
+//   flagants : text file of whitespace-separated antenna numbers to flag
+// Returns 0 on success, 1 if either file cannot be opened or the weights file
+// is shorter than expected.
+int init_weights(char * fnam, float *antpos, float *weights, char *flagants) {
+
+  // assumes 64 antennas
+  int status = 0;
+
+  FILE *fin = fopen(fnam,"rb");
+  if (!fin) {
+    syslog(LOG_ERR,"Couldn't open weights file %s",fnam);
+    return 1;
+  }
+  FILE *fants = fopen(flagants,"r");
+  if (!fants) {
+    syslog(LOG_ERR,"Couldn't open flag ants file %s",flagants);
+    fclose(fin); // this function is called once per block, so a leak here accumulates
+    return 1;
+  }
+
+  // report short reads instead of silently using whatever is in the buffers
+  if (fread(antpos,64*sizeof(float),1,fin) != 1) {
+    syslog(LOG_ERR,"Short read of antenna positions from %s",fnam);
+    status = 1;
+  }
+  if (fread(weights,64*NW*2*2*sizeof(float),1,fin) != 1) {
+    syslog(LOG_ERR,"Short read of weights from %s",fnam);
+    status = 1;
+  }
+
+  // divide every non-zero complex weight by its squared magnitude
+  float wnorm;
+  for (int i=0;i<64*NW*2;i++) {
+    wnorm = sqrt(weights[2*i]*weights[2*i] + weights[2*i+1]*weights[2*i+1]);
+    if (wnorm!=0.0) {
+      weights[2*i] /= wnorm*wnorm;
+      weights[2*i+1] /= wnorm*wnorm;
+    }
+  }
+
+  // zero the weights of flagged antennas; stop at the first token that is not
+  // an integer (looping on feof() alone used an unset value on an empty file
+  // and never terminated on malformed input)
+  int ant;
+  while (fscanf(fants,"%d",&ant) == 1) {
+    if (ant < 0 || ant >= 64) {
+      syslog(LOG_ERR,"Ignoring out-of-range flagged antenna %d",ant);
+      continue;
+    }
+    for (int j=0;j<NW*2*2;j++) {
+      weights[ant*NW*2*2+j] = 0.0;
+    }
+  }
+
+  fclose(fants);
+  fclose(fin);
+  if (DEBUG) syslog(LOG_INFO,"Loaded antenna positions and weights");
+  return status;
+
+}
+
+
+// Releases both DADA header-plus-data units: drops the read lock on `in` and
+// the write lock on `out`, logging any failure to unlock, and destroys each
+// handle regardless of whether the unlock succeeded.
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
+{
+  const int read_status = dada_hdu_unlock_read (in);
+  if (read_status < 0)
+    syslog(LOG_ERR, "could not unlock read on hdu_in");
+  dada_hdu_destroy (in);
+
+  const int write_status = dada_hdu_unlock_write (out);
+  if (write_status < 0)
+    syslog(LOG_ERR, "could not unlock write on hdu_out");
+  dada_hdu_destroy (out);
+}
+
+// Prints the command-line help text to stdout.
+void usage()
+{
+  static const char help_text[] =
+    "dsaX_beamformer [options]\n"
+    " -c core   bind process to CPU core [no default]\n"
+    " -d send debug messages to syslog\n"
+    " -f filename for antenna stuff [no default]\n"
+    " -i input key [default REORDER_BLOCK_KEY2]\n"
+    " -o output key [default BF_BLOCK_KEY]\n"
+    " -z fch1 in MHz [default 1530]\n"
+    " -a flagants file\n"
+    " -s stuffants \n"
+    " -q do incoherent beam \n"
+    " -g skip AGC \n"
+    " -t test pattern \n"
+    " -h print usage\n";
+
+  // the text contains no conversion specifiers, so it is written verbatim
+  fputs (help_text, stdout);
+}
+
+// MAIN
+
+// Entry point of dsaX_beamformer.
+// Parses the command line, connects to the input and output DADA buffers,
+// then for every input block: reloads the weights, runs the promoter /
+// beamformer / badder / adder kernels over NSTREAMS CUDA streams, updates the
+// per-beam bandpass scale factors and writes one output block.
+// Returns EXIT_FAILURE when an option cannot be parsed or a DADA call fails.
+int main (int argc, char *argv[]) {
+
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_beamformer", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  // device properties
+  int nDevices;
+
+  cudaGetDeviceCount(&nDevices);
+  for (int i = 0; i < nDevices; i++) {
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, i);
+    syslog(LOG_INFO,"Device Number: %d", i);
+    syslog(LOG_INFO,"  Device name: %s", prop.name);
+    syslog(LOG_INFO,"  Memory Clock Rate (KHz): %d",prop.memoryClockRate);
+  }
+  // NOTE(review): the device index is hard-coded
+  cudaSetDevice(1);
+  
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+
+  // data block HDU keys
+  key_t in_key = REORDER_BLOCK_KEY2;
+  key_t out_key = BF_BLOCK_KEY;
+  
+  // command line arguments
+  int core = -1;
+  int arg = 0;
+  int stuffants=0;
+  int test_pattern = 0;
+  float fch1 = 1530.0;
+  const size_t fname_len = 100; // capacity of the two file-name buffers
+  char * fnam;
+  fnam=(char *)malloc(sizeof(char)*fname_len);
+  sprintf(fnam,"nofile");  
+  char * flagants;
+  flagants=(char *)malloc(sizeof(char)*fname_len);
+  sprintf(flagants,"nofile");
+  int AGC = 1;
+
+  while ((arg=getopt(argc,argv,"c:f:i:o:z:a:tsqdgh")) != -1)
+    {
+      switch (arg)
+        {
+        case 'c':
+          if (optarg)
+            {
+              core = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-c flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'i':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &in_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-i flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'o':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &out_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-o flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'f':
+          if (optarg)
+            {
+              // bounded copy: the argument may be longer than the buffer
+              snprintf(fnam,fname_len,"%s",optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-f flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'a':
+          if (optarg)
+            {
+              // bounded copy: the argument may be longer than the buffer
+              snprintf(flagants,fname_len,"%s",optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-a flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'z':
+          if (optarg)
+            {
+              fch1 = atof(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-z flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'd':
+          DEBUG=1;
+          syslog (LOG_DEBUG, "Will excrete all debug messages");
+          break;
+        case 'g':
+          AGC=0;
+          break;
+        case 't':
+          test_pattern=1;
+          syslog (LOG_INFO, "Will execute test pattern");
+          break;
+        case 's':
+          stuffants=1;
+          syslog (LOG_INFO, "Will place antennas in output");
+          break;
+        case 'q':
+          stuffants=2;
+          syslog (LOG_INFO, "Will place IB in output");
+          break;
+        case 'h':
+          usage();
+          return EXIT_SUCCESS;
+        }
+    }
+
+  // print stuff
+  syslog(LOG_INFO,"Forming 256 beams with sep %g arcmin, fch1 %g",sep,fch1);
+  syslog(LOG_INFO,"Using calibrations file %s",fnam);
+  syslog(LOG_INFO,"Using flagants file %s",flagants);
+
+  // load in weights and antpos
+  float * antpos = (float *)malloc(sizeof(float)*64); // easting
+  float * weights = (float *)malloc(sizeof(float)*64*NW*2*2); // complex weights [ant, NW, pol, r/i]
+  float * freqs = (float *)malloc(sizeof(float)*384); // freq
+  for (int i=0;i<384;i++) freqs[i] = (fch1 - i*250./8192.)*1e6;  
+  
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      // only claim success when the binding actually succeeded
+      if (dada_bind_thread_to_core(core) < 0)
+        syslog(LOG_ERR,"failed to bind to core %d", core);
+      else
+        syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+  
+  // DADA stuff
+  
+  syslog (LOG_INFO, "creating in and out hdus");
+  
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+  
+  uint64_t header_size = 0;
+
+  // deal with headers: the input header is copied unchanged to the output
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+  
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
+  uint64_t  bytes_read = 0;
+  int nints = NPACKETS / 16;
+  uint64_t nbytes_per_int = block_size / nints;
+  uint64_t nbytes_per_out = block_out / nints;
+  char * block;
+  unsigned char * output_buffer;
+  output_buffer = (unsigned char *)malloc(sizeof(unsigned char)*block_out);
+  memset(output_buffer,0,block_out);
+  uint64_t written, block_id;
+  
+  // allocate host and device memory for calculations
+  //inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag
+  //wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant]        
+  char *d_indata[NSTREAMS];
+  unsigned char *d_outdata[NSTREAMS];
+  float *d_transfer[NSTREAMS], *d_bp, *d_antpos, *d_weights, *d_freqs;
+  half *d_wr, *d_wi, *d_inr[NSTREAMS], *d_ini[NSTREAMS];
+  float *d_added[NSTREAMS], *h_added;
+  h_added = (float *)malloc(sizeof(float)*256*48*NSTREAMS);
+  cudaMalloc((void **)&d_antpos, 64*sizeof(float)); // ant positions
+  cudaMalloc((void **)&d_weights, 64*NW*2*2*sizeof(float)); // weights
+  cudaMalloc((void **)&d_freqs, 384*sizeof(float)); // freqs        
+  cudaMalloc((void **)&d_bp, 256*sizeof(float)); // bandpass
+  cudaMalloc((void **)&d_wr, 48*2*16*4*16*16*sizeof(half)); // real weight
+  cudaMalloc((void **)&d_wi, 48*2*16*4*16*16*sizeof(half)); // imag weight
+  cudaMemcpy(d_freqs, freqs, 384*sizeof(float), cudaMemcpyHostToDevice);
+  
+  float *h_transfer = (float *)malloc(sizeof(float)*256*96*16*NSTREAMS);
+  char *h_indata = (char *)malloc(sizeof(char)*16*NANT*96*8*2);
+  float *bp = (float *)malloc(sizeof(float)*256);
+  float *frozen_bp = (float *)malloc(sizeof(float)*256);
+  float *many_bp = (float *)malloc(sizeof(float)*256*NBP);
+  int bpctr = 0;
+  float medbp;
+  unsigned char *tmp_buf = (unsigned char *)malloc(sizeof(unsigned char)*256*48*4*NSTREAMS);  
+  
+  // streams and device  
+  cudaStream_t stream[NSTREAMS];
+  for (int st=0;st<NSTREAMS;st++) {
+    cudaStreamCreate(&stream[st]);
+    cudaMalloc((void **)&d_added[st], 256*48*sizeof(float)); // added data for each iteration
+    cudaMalloc((void **)&d_indata[st], 16*96*NANT*8*2*sizeof(char)); // data input to bf kernel
+    cudaMalloc((void **)&d_outdata[st], 256*48*4*sizeof(unsigned char)); // data output from adder
+    cudaMalloc((void **)&d_transfer[st], 256*96*16*sizeof(float)); // output from beamformer
+    cudaMalloc((void **)&d_inr[st], 16*48*2*64*16*sizeof(half)); // real data
+    cudaMalloc((void **)&d_ini[st], 16*48*2*64*16*sizeof(half)); // imag data
+    // zero the promoted buffers so unpopulated antenna slots contribute nothing
+    thrust::device_ptr<half> d1(d_inr[st]);
+    thrust::fill(d1, d1+16*48*2*64*16, 0.0);
+    thrust::device_ptr<half> d2(d_ini[st]);
+    thrust::fill(d2, d2+16*48*2*64*16, 0.0);
+  }
+
+  // set up
+
+  int observation_complete=0;
+  int blocks = 0, started = 0;
+  int blockct = 0;
+  int slow_down = 0;
+  int prestart = 0;
+  
+  syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+    
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    blockct ++;
+
+    // DO STUFF
+
+    // calc weights (the files are re-read for every block)
+    init_weights(fnam,antpos,weights,flagants);
+    cudaMemcpy(d_antpos, antpos, 64*sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_weights, weights, 64*NW*2*2*sizeof(float), cudaMemcpyHostToDevice);  
+    calc_weights<<<6144, 256>>>(d_antpos, d_weights, d_freqs, d_wr, d_wi);
+    if (DEBUG) syslog(LOG_INFO,"Finished with weights");
+
+    // zero out d_added
+    for (int st=0;st<NSTREAMS;st++)
+      cudaMemset(d_added[st], 0,  256*48*sizeof(float));
+
+    // loop over ints
+    for (int bst=0;bst<nints/NSTREAMS;bst++) {
+
+      // loop over streams
+      for (int st=0;st<NSTREAMS;st++) {
+
+        // copy to device
+        cudaMemcpyAsync(d_indata[st], block+(bst*NSTREAMS+st)*nbytes_per_int, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
+
+        // do promotion
+        promoter<<<16*48*NANT, 32, 0, stream[st]>>>(d_indata[st], d_inr[st], d_ini[st]);
+
+        // do printing if needed
+        if (bst==0 && slow_down==0) 
+          rms_printer<<<4, 32, 0, stream[st]>>>(d_inr[st], d_ini[st]);
+
+        // run beamformer kernel
+        beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants);
+
+        // run badder kernel
+        badder<<<12288, 32, 0, stream[st]>>>(d_transfer[st], d_added[st]);
+
+        // if sufficient bandpasses...
+        if (started>0) {
+
+          // run adder kernel
+          adder<<<12288, 32, 0, stream[st]>>>(d_transfer[st], d_outdata[st], d_bp);
+
+          // copy to host
+          cudaMemcpyAsync(tmp_buf + 256*48*4*st, d_outdata[st], 256*48*4*sizeof(unsigned char), cudaMemcpyDeviceToHost, stream[st]);
+
+          // copy to output
+          for (int j=0;j<12288*4;j++) {
+            if (test_pattern) 
+              output_buffer[(bst*NSTREAMS+st)*12288*4+j] = (unsigned char)((blockct % 128) + 32);
+            else
+              output_buffer[(bst*NSTREAMS+st)*12288*4+j] = tmp_buf[j+256*48*4*st];
+          }
+          if (DEBUG && bst*NSTREAMS+st==10) {
+            for (int j=0;j<48;j++) syslog(LOG_DEBUG,"%hu",output_buffer[(bst*NSTREAMS+st)*12288+BEAM_OUT*48+j]);
+          }
+
+        }
+
+      }
+    }
+
+    // now deal with bandpass
+
+    // copy to host
+    for (int st=0;st<NSTREAMS;st++)
+      cudaMemcpy(h_added + 256*48*st, d_added[st], 256*48*sizeof(float), cudaMemcpyDeviceToHost);
+
+    // calculate bp
+    for (int i=0;i<256;i++) bp[i] = 0.;
+    calc_allbp(h_added, bp);
+
+    // place in correct location (ring of the last NBP bandpasses)
+    for (int i=0;i<256;i++)
+      many_bp[i + 256*(bpctr % NBP)] = bp[i];
+
+    // deal with bp for data correction
+
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+
+      // do median bp
+      ret_med_bp(bp);
+      medbp = bp[100];
+      for (int i=0;i<256;i++) frozen_bp[i] = medbp;
+      
+      // junk into output
+      memset(output_buffer,0,block_out);
+      
+    }
+
+    if (started>0 && bpctr<NBP) 
+      ret_med_bp(bp);
+
+    if (started>0 && bpctr>=NBP) {
+      
+      //syslog(LOG_INFO,"now using many BPs for requant");      
+      
+      // do average bp
+      ret_many_bp(many_bp,bp,medbp);
+
+      started=2;
+      
+    }
+
+    // finally deal with bp
+
+    // with AGC disabled use the frozen bandpass. This copy must happen once,
+    // before the scaling loop: doing it inside the loop reset the already
+    // scaled entries, leaving only the last beam with a valid scale factor.
+    if (AGC==0)
+      for (int i=0;i<256;i++) bp[i] = frozen_bp[i];
+
+    // convert each summed bandpass into the multiplicative scale used by adder
+    for (int i=0;i<256;i++) {
+      if (bpctr<15) syslog(LOG_INFO,"coeff %d %d %g",bpctr,i,bp[i]);
+      if (bp[i]!=0.) {
+        bp[i] /= 48.*nints; 
+        bp[i] = 2.5*128./bp[i];
+      }
+    }
+    cudaMemcpy(d_bp, bp, sizeof(float)*256, cudaMemcpyHostToDevice);
+
+    bpctr++;
+    slow_down++;
+    if (slow_down>=20) slow_down=0;
+    
+    // write to output
+    written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out);
+    if (written < block_out)
+      {
+        syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+        dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+        return EXIT_FAILURE;
+      }
+
+    if (DEBUG) {
+      syslog(LOG_DEBUG, "written block %d",blocks);      
+    }
+    blocks++;
+
+    // a partial block marks the end of the observation
+    if (bytes_read < block_size)
+      observation_complete = 1;
+
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+
+  }
+
+  for (int st=0;st<NSTREAMS;st++) {
+    cudaStreamDestroy(stream[st]);
+    cudaFree(d_indata[st]);
+    cudaFree(d_outdata[st]);
+    cudaFree(d_transfer[st]);
+    cudaFree(d_inr[st]);
+    cudaFree(d_ini[st]);
+    cudaFree(d_added[st]);
+  }
+  free(fnam);
+  free(flagants);
+  free(h_indata);
+  free(output_buffer);
+  free(antpos);
+  free(weights);
+  free(freqs);
+  free(bp);
+  free(frozen_bp);
+  free(many_bp);
+  free(h_transfer);
+  free(h_added);
+  free(tmp_buf);
+  // each device buffer is released exactly once
+  cudaFree(d_wr);
+  cudaFree(d_wi);
+  cudaFree(d_antpos);
+  cudaFree(d_freqs);
+  cudaFree(d_weights);
+  cudaFree(d_bp);
+  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+
+  return EXIT_SUCCESS;
+  
+}
+
+
diff --git a/legacy/dsaX_beamformer.cu.wrk1 b/legacy/dsaX_beamformer.cu.wrk1
new file mode 100644
index 0000000..5724b60
--- /dev/null
+++ b/legacy/dsaX_beamformer.cu.wrk1
@@ -0,0 +1,1003 @@
+// -*- c++ -*-       
+/* will implement the 64-input beamformer 
+
+does N beams of 256
+
+order is (taking time as 8x 8.192e-6) 
+[2048 time, 63 antennas, 768 channels, 2 pol, r/i]
+Load in 16 times at a time, so that we have (in units of what needs to be added)
+[16 time, 63 antennas, 96 channels, 8 chunnels, 2 pol, r/i]
+
+This should be reordered on the cpu to 
+[16 time, 96 channels, 63 antennas, 8 chunnels, 2 pol, r/i]
+
+The first kernel, launched with 1536 blocks of 64 threads, needs to
+ - promote each measurement and store in shared mem, parallelizing over ants. need only 8 kB. 
+ - each thread processes 4 beams, adding everything. for each beam,
+  + for each chunnel and pol, calculate weights using cal weights and ant positions, 
+  + add everything into output array
+Output array has order [beam, 96 frequency, 16 time]
+
+Shared mem requirement: 8 kB for promoted data, 512b for positions, nch*1024b for weights
+
+Initially we start with 4-bit numbers. These are first rotated using 17-bit weights, yielding 22-bit numbers. 
+these are then added: (64 ant)^2 * (2 complex) * (32 chan) * (2 pol) * (16 time). 
+after adding by 64 ants, we have 28-bit numbers. Need to bit shift right by 19 after adding 64 ants. This will yield 29-bit numbers. Need to bit shift right by 21 to pick off lowest 8 bits. 
+
+Do everything in floating point until second kernel. 
+
+Second kernel will simply add times and adjacent channels and pick leading 8 bits
+Then copy back to specific locations in host to form final [beam, time, frequency] array, to be sent to corner turn.
+
+ */
+#include <iostream>
+#include <algorithm>
+using std::cout;
+using std::cerr;
+using std::endl;
+#include <stdio.h>
+#include <stdlib.h>
+#include <cmath>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <time.h>
+#include <syslog.h>
+#include <pthread.h>
+
+#include <mma.h>
+#include <cuda.h>
+#include "cuda_fp16.h"
+//#include "dada_cuda.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "multilog.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_def.h"
+#include <thrust/device_ptr.h>
+#include <thrust/fill.h>
+
+#include <cuda_runtime_api.h>
+using namespace nvcuda;
+
+// global variables
+// DEBUG: debug-output switch, off (0) by default.
+// NOTE(review): presumably switched on by a command-line option in main - confirm
+int DEBUG = 0;
+
+
+// kernel for summing and requantizing
+// input array has order [beam, 48 frequency, 2 pol, 16 time]
+// need to output to [4 time, beam, 48 frequency]
+// bp is scale factor for each beam 
+// run with 256*48=12288 blocks and 32 threads
+//
+// Each block handles one (beam, frequency) pair. Its 32 inputs (2 pol x 16
+// time) are staged in shared memory, then threads 0, 4, 8 and 12 each sum the
+// eight values (4 consecutive times x 2 pols) of one output sample, scale the
+// sum by bp[beam], round, halve and store it as an unsigned char.
+//
+// The sum is formed per thread from values that are only read after the
+// barrier. An in-place reduction across the warp without synchronisation
+// between its steps is a data race on GPUs with independent thread
+// scheduling. The association order ((s0+s2)+(s1+s3)) matches that of the
+// lock-step reduction, so results are unchanged.
+__global__
+void adder(float *input, unsigned char *output, float *bp) {
+
+  // get block and thread ids
+  int bidx = blockIdx.x; // assume 256*48=12288
+  int tidx = threadIdx.x; // assume 32
+  int beamidx = (int)(bidx / 48);
+  
+  // declare shared mem
+  __shared__ float data[32]; // data block to be summed  
+
+  // transfer from input to shared mem
+  data[tidx] = input[bidx*32+tidx];
+
+  // sync: after this point shared memory is only read
+  __syncthreads();
+
+  // tidx = 0, 4, 8, 12 each produce one of the four output time samples
+  if (tidx<16 && (tidx % 4)==0) {
+
+    // s_k = sum over the two pols of time sample tidx+k
+    float s0 = data[tidx]   + data[tidx+16];
+    float s1 = data[tidx+1] + data[tidx+17];
+    float s2 = data[tidx+2] + data[tidx+18];
+    float s3 = data[tidx+3] + data[tidx+19];
+    float total = (s0 + s2) + (s1 + s3);
+
+    // output time index is tidx/4; consecutive output times are 12288 apart
+    output[bidx + (tidx/4)*12288] = (unsigned char)(__float2int_rn(total*bp[beamidx])/2);
+
+  }
+      
+}
+
+// kernel for promotion
+/*
+orig input is [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i]
+input is [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i]
+output needs to be [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] 
+promoted to half precision  
+
+launch with 16*48*NANT blocks of 32 threads
+
+ */
+// Each thread converts one input byte into two half-precision values: the low
+// 4 bits go to inr and the high 4 bits to ini, both written at the same
+// output index.
+// NOTE(review): the block index is decoded below as [time, ant, channel],
+// which matches the "orig input" order above rather than the reordered one - confirm
+__global__ void promoter(char *input, half *inr, half *ini) {
+
+  int bidx = blockIdx.x; // assume 16*48*NANT
+  int tidx = threadIdx.x; // assume 32
+  int iidx = bidx*32+tidx; // one input byte per thread
+  // within a block the 32 bytes are decoded as [16 chunnels, 2 pol]
+  int pol = (int)(tidx % 2);
+  int chunnel = (int)(tidx / 2);
+  
+  /*int ant = (int)(bidx % NANT);
+  int time_chan = (int)(bidx / NANT);    
+  int oidx = time_chan*2048+pol*1024+ant*16+chunnel;*/
+
+  // decode the block index: channel varies fastest, then antenna, then time
+  int chan = (int)(bidx % 48);
+  int time_ant = (int)(bidx / 48);
+  int tim = (int)(time_ant / NANT);
+  int ant = (int)(time_ant % NANT);
+  // output strides: 98304 per time, 2048 per channel, 1024 per pol, 16 per antenna
+  int oidx = tim*98304 + chan*2048 + pol*1024 + ant*16 + chunnel;
+
+  //inr[oidx] = __float2half((float)(((char)((input[iidx] & 15) << 4)) >> 4));
+  //ini[oidx] = __float2half((float)(((char)((input[iidx] & 240))) >> 4));
+  // low nibble: moved to the top 4 bits, cast to char, then shifted back down
+  // (sign-extends the 4-bit value, assuming char is signed and >> is arithmetic)
+  inr[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(15)) << 4) >> 4));
+  // high nibble: already in the top 4 bits, so only the cast and shift are needed
+  ini[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4));
+
+}
+
+// Debug kernel: prints every non-zero promoted sample with its coordinates.
+// inr/ini are laid out as [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels]
+// for first time, launch with 3072, 32
+// (3072 blocks = 48 channels x 2 pol x 32 antenna pairs; each block of 32
+// threads covers 2 antennas x 16 chunnels)
+__global__ void printer(half *inr, half *ini) {
+
+  int idx = blockIdx.x*32+threadIdx.x;
+  float ir = __half2float(inr[idx]);
+  float ii = __half2float(ini[idx]);
+
+  // decode idx = channel*2048 + pol*1024 + ant*16 + chunnel
+  int chunnel = (int)(threadIdx.x % 16);
+  int channel = (int)(blockIdx.x/64);
+  int tt = (int)(blockIdx.x % 64);
+  int pol = (int)(tt/32);
+  // each block holds two antennas: the block selects the pair and the upper or
+  // lower half of the threads selects the antenna within it (multiplying the
+  // two terms instead reported antenna 0 for half of the samples)
+  int ant = ((int)(tt % 32))*2 + (int)(threadIdx.x / 16);
+  
+  if (ir!=0. || ii!=0.) {
+    printf("%d %d %d %d %f %f\n",channel,pol,ant,chunnel,ir,ii);
+  }
+  
+}
+
+
+// kernel for beamforming
+/*
+
+Assumes that up to NANT antennas (nominally 63) are populated. 
+
+Input is [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] (promoted)
+
+Arithmetic... for rotation, d2r = wr*dr-wi*di; d2i = wi*dr+wr*di
+
+Conventions for beamforming. beam 0 is furthest East, beam 127 is at meridian. antpos (D) is easting. 
+for bf weight calculation, where theta = s(127-n), ang = 2*pi*nu*theta*D/c; wr = cos(ang), wi = sin(ang)
+use __float2int_rn, cosf, sinf intrinsics. 
+
+Each warp (==block) has to deal with 256 beams for 64 ants, summing over 16 chunnels and pols. 
+Do it in tiles of 16 beams and 16 ants for 
+
+Output array has order [beam, 48 frequency, 2 pol, 16 time]
+
+inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag
+wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant]
+
+launch with 16time * 48freq * 2pol * 16beam_tile blocks of 32 threads for massive utilization
+ = 24576 blocks
+
+*/
+// Beamforming kernel built on 16x16x16 WMMA tiles; one 32-thread block handles one
+// (time, freq, pol) slice and one tile of 16 beams.
+//   inr, ini  : promoted real / imaginary data
+//   wr, wi    : real / imaginary weights
+//   output    : float results, 1536 entries per beam
+//   stuffants : 0 = beam powers; 1 = output per-antenna powers instead;
+//               2 = beam powers, plus a data-only power sum written to output row 0
+__global__ void beamformer(half *inr, half *ini, half *wr, half *wi, float *output, int stuffants) {
+
+  // get block and thread ids
+  int bidx = blockIdx.x; // assume 24576
+  int tidx = threadIdx.x; // assume 32
+  // 16 consecutive blocks share one data slice and differ only in beam tile
+  int orig_bidx = (int)(bidx / 16);
+  int beam_tile = (int)(bidx % 16);
+  // antenna tile used when stuffants==1
+  int stuff_tile = (int)(beam_tile % 4);
+  int data_offset = orig_bidx*1024; // offset for first part of data
+  int weight_offset = (int)(orig_bidx % 96); // offset for first part of weight
+  weight_offset *= 16384;
+  int idx1, idx2; // not used below
+  // position of this slice inside a beam's 1536-entry output row
+  int f_idx = (int)(orig_bidx % 96);
+  int tim_idx = (int)(orig_bidx / 96);
+  int oidx = f_idx*16 + tim_idx;
+  
+  // shared memory for convenience
+  __shared__ half summr[16][16]; // beam, chunnel
+  __shared__ float summi[16][16]; // beam, chunnel
+  
+  // accumulate real and imag parts into [16 beam x 16 f] fragments
+  // Declare the fragments.
+  wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;
+  wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wr_inr_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wr_ini_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wi_inr_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wi_ini_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, float> ib_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, float> final_frag;
+  
+  
+  // zero out accumulators
+  wmma::fill_fragment(wr_inr_frag, 0.0f);
+  wmma::fill_fragment(wr_ini_frag, 0.0f);
+  wmma::fill_fragment(wi_inr_frag, 0.0f);
+  wmma::fill_fragment(wi_ini_frag, 0.0f);
+  wmma::fill_fragment(ib_frag, 0.0f);
+
+  // IB
+  if (stuffants==2) {
+
+    // the same data tile is loaded col-major as A and row-major as B, so each
+    // product is (tile^T x tile); real and imaginary parts accumulate into ib_frag
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> c_frag;
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> d_frag;
+    
+    for (int ant_tile=0; ant_tile<4; ant_tile++) {
+
+      wmma::load_matrix_sync(c_frag, inr + data_offset + ant_tile*256, 16);
+      wmma::load_matrix_sync(d_frag, inr + data_offset + ant_tile*256, 16);
+      wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag);
+      wmma::load_matrix_sync(c_frag, ini + data_offset + ant_tile*256, 16);
+      wmma::load_matrix_sync(d_frag, ini + data_offset + ant_tile*256, 16);
+      wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag);
+
+    }
+
+  }
+
+  // one ant per beam
+  if (stuffants==1) {        
+
+    // row-major A times col-major B of the same tile is (tile x tile^T); the
+    // diagonal, read out at the end, holds each antenna's re^2+im^2 summed over chunnels
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> c_frag;
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> d_frag;
+    wmma::load_matrix_sync(c_frag, inr + data_offset + stuff_tile*256, 16);
+    wmma::load_matrix_sync(d_frag, inr + data_offset + stuff_tile*256, 16);
+    wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag);
+    wmma::load_matrix_sync(c_frag, ini + data_offset + stuff_tile*256, 16);
+    wmma::load_matrix_sync(d_frag, ini + data_offset + stuff_tile*256, 16);
+    wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag);
+    
+  }
+  if (stuffants!=1) {
+  
+    // loop over ant tiles
+    for (int ant_tile=0; ant_tile<4; ant_tile++) {
+      
+      // copy weight and data to fragments, and multiply to accumulators
+      // four partial products per tile: wr*inr, wi*inr, wi*ini, wr*ini
+      
+      wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16);
+      wmma::load_matrix_sync(b_frag, inr + data_offset + ant_tile*256, 16);
+      wmma::mma_sync(wr_inr_frag, a_frag, b_frag, wr_inr_frag);
+      
+      wmma::load_matrix_sync(a_frag, wi + weight_offset + beam_tile*1024 + ant_tile*256, 16);
+      wmma::mma_sync(wi_inr_frag, a_frag, b_frag, wi_inr_frag);
+      
+      wmma::load_matrix_sync(b_frag, ini + data_offset + ant_tile*256, 16);
+      wmma::mma_sync(wi_ini_frag, a_frag, b_frag, wi_ini_frag);
+      
+      wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16);
+      wmma::mma_sync(wr_ini_frag, a_frag, b_frag, wr_ini_frag);
+      
+    }
+
+    // form real and imaginary matrices
+    // element-wise on each thread's share of the accumulators; wr_inr_frag ends up
+    // holding real^2 + imag^2
+    for(int i=0; i < wr_inr_frag.num_elements; i++) {
+      wr_inr_frag.x[i] = wr_inr_frag.x[i] - wi_ini_frag.x[i]; // output real
+      wi_inr_frag.x[i] = wi_inr_frag.x[i] + wr_ini_frag.x[i]; // output imag
+      wr_inr_frag.x[i] = wr_inr_frag.x[i]*wr_inr_frag.x[i] + wi_inr_frag.x[i]*wi_inr_frag.x[i]; // squared
+    }
+  }
+
+  // at this stage the matrices are [beam, chunnel], and need to be summed over columns
+
+  __syncthreads();
+    
+  // copy back to shared mem
+  half *p1;
+  float *p2, tmp;
+  p1 = &summr[0][0];
+  wmma::store_matrix_sync(p1, wr_inr_frag, 16, wmma::mem_row_major);
+
+  __syncthreads();
+  
+  if (stuffants!=1) {
+
+    // now do thread reduction using multiplication by unity
+    // summr x (all-ones matrix) puts each row's sum in every column of that row
+    wmma::fill_fragment(final_frag, 0.0f);
+    wmma::fill_fragment(b_frag, 1.0f);
+    wmma::load_matrix_sync(a_frag, p1, 16);
+    wmma::mma_sync(final_frag, a_frag, b_frag, final_frag);
+    p2 = &summi[0][0];
+    wmma::store_matrix_sync(p2, final_frag, 16, wmma::mem_row_major);
+    
+    __syncthreads();
+
+    // store
+    // thread t writes the row sum for beam beam_tile*16+t (taken from the diagonal)
+    if (tidx<16) {
+      output[(beam_tile*16+tidx)*1536 + oidx] = summi[tidx][tidx];
+    }
+
+
+  }
+
+  if (stuffants==1) {
+    // diagonal of (tile x tile^T): one antenna's power per output row
+    if (tidx<16) {
+      output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][tidx];
+    }
+  }
+  if (stuffants==2) {
+
+    // overwrite output row 0 (beam_tile 0, thread 0 only) with the trace of ib_frag
+    // NOTE(review): summi is read right after store_matrix_sync with no barrier in
+    // between, while other threads may still be reading it above - confirm this is safe
+    p2 = &summi[0][0];
+    wmma::store_matrix_sync(p2, ib_frag, 16, wmma::mem_row_major);      
+    tmp = 0.;
+    for (int i=0;i<16;i++) tmp += summi[i][i];
+    if (tidx==0 && beam_tile==0) 
+      output[(beam_tile*16+tidx)*1536 + oidx] = tmp;
+
+  }      
+  
+}
+
+// kernel to calculate weights - needed because weights are halfs
+// launch with 256 threads in 6144 blocks
+//
+// Each thread produces one complex beamforming weight (real part in wr, imaginary part in wi):
+// a geometric phase term for (beam, antenna, frequency) multiplied by the complex
+// calibration weight read from `weights`.
+//   antpos  : per-antenna position, indexed by antenna (0..63 given the tile arithmetic below)
+//   weights : calibration weights, real at [widx] and imaginary at [widx+1]
+//   freqs   : frequency table, read at f*8+4
+//   wr, wi  : outputs, written at i = bidx*256+tidx
+__global__
+void calc_weights(float *antpos, float *weights, float *freqs, half *wr, half *wi) {
+
+  // assume 256 threads in 6144 blocks
+  int bidx = blockIdx.x; // over 48f, 2pol, 16 beam_tile, 4 ant_tile
+  int tidx = threadIdx.x;
+  // unpack the block index: 128 blocks per frequency, 64 per pol, 4 per beam tile
+  int f = (int)(bidx / 128);
+  int cc = (int)(bidx % 128);
+  int pol = (int)(cc / 64);
+  cc = (int)(cc % 64);
+  int beam_tile = (int)(cc / 4);
+  int ant_tile = (int)(cc % 4);
+  // unpack the thread index into a position inside the 16x16 (beam, ant) tile
+  int beam_i = (int)(tidx / 16);
+  int ant_i = (int)(tidx % 16);
+
+  // absolute beam (0..255) and antenna (0..63) numbers
+  int beam = beam_tile*16+beam_i;
+  int ant = ant_tile*16+ant_i;
+  // flat output index, i.e. layout [f, pol, beam_tile, ant_tile, beam_i, ant_i]
+  int i = bidx*256+tidx;
+  // offset of the real part of this (ant, f, pol) calibration weight
+  int widx = ant*NW*2*2 + f*2*2 + pol*2;
+  
+  // PI/10800 converts arcminutes to radians; beam 127 gets theta = 0
+  float theta = sep*(127.-beam*1.)*PI/10800.; // radians
+  // uses entry 4 of each group of 8 entries in freqs
+  float afac = -2.*PI*freqs[f*8+4]*theta/CVAC; // factor for rotate
+  float twr = cos(afac*antpos[ant]);
+  float twi = sin(afac*antpos[ant]);
+
+  // complex product (twr + i*twi) * (weights[widx] + i*weights[widx+1]), stored as halfs
+  wr[i] = __float2half((twr*weights[widx] - twi*weights[widx+1]));
+  wi[i] = __float2half((twi*weights[widx] + twr*weights[widx+1]));
+  
+  
+}  
+ 
+  
+// function prototypes
+// forward declarations of the host-side helpers used in this file
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
+// NOTE(review): presumably also declared by dada_affinity.h - confirm
+int dada_bind_thread_to_core (int core);
+int init_weights(char *fnam, float *antpos, float *weights, char *flagants);
+// in-place swap of the antenna and channel axes of one input block
+void reorder_block(char *block);
+// adds each beam's samples onto bp[beam]; pr!=0 also prints non-zero samples
+void calc_bp(float *data, float *bp, int pr);
+
+
+// Accumulates per-beam totals for the bandpass estimate.
+// data holds 256 beams x 48 x 32 floats, beam-major and contiguous; every
+// sample belonging to beam b is added onto bp[b]. bp has 256 entries and is
+// not zeroed here. A non-zero pr prints each non-zero sample. Returns nothing.
+void calc_bp(float *data, float *bp, int pr) {
+
+  const float *sample = data; // walks the input in storage order
+
+  for (int beam=0; beam<256; beam++) {
+    float *total = &bp[beam];
+    for (int chan=0; chan<48; chan++) {
+      for (int k=0; k<32; k++, sample++) {
+	*total += *sample;
+	if (pr && *sample!=0.) printf("%d %d %d %f\n",beam,chan,k,*sample);
+      }
+    }
+  }
+
+}
+
+// qsort comparator over floats (used for the bandpass median): -1 if *elem1 < *elem2, 1 if greater, else 0
+
+int cmpfunc(const void* elem1, const void* elem2)
+{
+  const float lhs = *(const float*)elem1;
+  const float rhs = *(const float*)elem2;
+  return (lhs < rhs) ? -1 : (lhs > rhs);
+}
+
+void ret_med_bp(float *bp) {
+  // sort the 256 entries in place, ascending
+  qsort(bp, 256, sizeof(float), cmpfunc);
+  // with an even count the median is the mean of the two central entries
+  const float median = 0.5*(bp[127]+bp[128]);
+  // every entry is then overwritten with that median
+  for (float *p=bp; p!=bp+256; ++p) *p = median;
+}
+
+// CPU-side swap of the antenna and channel axes of one block, done in place
+void reorder_block(char * block) {
+
+  // in:  [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i]
+  // out: [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i]
+  // one (time, ant, chan) cell is 32 bytes; 1536*NANT bytes per time
+  
+  char * scratch = (char *)malloc(sizeof(char)*24576*NANT);
+  
+  for (int t=0;t<16;t++) { // over time
+    char * dst_time = scratch + t*1536*NANT;
+    char * src_time = block + t*1536*NANT;
+    for (int a=0;a<NANT;a++) { // over ants
+      for (int c=0;c<48;c++) { // over channels
+	memcpy(dst_time + c*NANT*32 + a*32, src_time + a*1536 + c*32, 32);
+      }
+    }
+  }
+
+  // write the reordered copy back over the caller's buffer
+  memcpy(block,scratch,24576*NANT);
+  free(scratch);
+
+}
+
+
+// loads antenna positions and weights from fnam, then zeroes flagged antennas
+int init_weights(char * fnam, float *antpos, float *weights, char *flagants) {
+
+  // assumes 64 antennas; returns 0 on success, 1 if a file can't be opened or read
+  // antpos: first 64 floats of fnam (takes only easting)
+  // weights: next 64*NW*2*2 floats of fnam; each complex w is replaced by w/|w|^2
+  // flagants: text file of antenna indices whose weights are set to zero
+  FILE *fin;
+  FILE *fants;
+  
+  if (!(fin=fopen(fnam,"rb"))) {
+    syslog(LOG_ERR,"Couldn't open weights file %s",fnam);
+    return 1;
+  }
+  if (!(fants=fopen(flagants,"r"))) {
+    syslog(LOG_ERR,"Couldn't open flag ants file %s",flagants);
+    fclose(fin); // this function runs once per block, so a leaked handle would accumulate
+    return 1;
+  }
+  if (fread(antpos,64*sizeof(float),1,fin)!=1 || fread(weights,64*NW*2*2*sizeof(float),1,fin)!=1) {
+    syslog(LOG_ERR,"Couldn't read antenna positions and weights from %s",fnam);
+    fclose(fants); fclose(fin); // the arrays may be partially overwritten at this point
+    return 1;
+  }
+  for (int i=0;i<64*NW*2;i++) {
+    float wnorm = sqrt(weights[2*i]*weights[2*i] + weights[2*i+1]*weights[2*i+1]);
+    if (wnorm!=0.0) {
+      weights[2*i] /= wnorm*wnorm;
+      weights[2*i+1] /= wnorm*wnorm;
+    }
+  }
+  // stop at the first failed conversion so EOF or a malformed entry can't reuse a stale ant
+  int ant;
+  while (fscanf(fants,"%d",&ant)==1) {
+    if (ant<0 || ant>=64) continue; // index outside the 64-antenna weights array
+    for (int j=0;j<NW*2*2;j++) {
+      weights[ant*NW*2*2+j] = 0.0;
+    }
+  }
+  fclose(fants);
+  fclose(fin);
+  if (DEBUG) syslog(LOG_INFO,"Loaded antenna positions and weights");
+  return 0;
+
+}
+
+
+// Releases the read lock on the input HDU and the write lock on the output
+// HDU, logging (but not aborting on) an unlock failure, and destroys each
+// HDU afterwards.
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
+{
+  // input side: give up the read lock, then destroy the HDU regardless
+  if (dada_hdu_unlock_read (in) < 0)
+    syslog(LOG_ERR, "could not unlock read on hdu_in");
+  dada_hdu_destroy (in);
+
+  // output side: same sequence for the write lock
+  if (dada_hdu_unlock_write (out) < 0)
+    syslog(LOG_ERR, "could not unlock write on hdu_out");
+  dada_hdu_destroy (out);
+
+}
+
+// prints the command-line help text to stdout
+void usage() {
+  fputs (
+	 "dsaX_beamformer [options]\n"
+	 " -c core   bind process to CPU core [no default]\n"
+	 " -d send debug messages to syslog\n"
+	 " -f filename for antenna stuff [no default]\n"
+	 " -i input key [default REORDER_BLOCK_KEY2]\n"
+	 " -o output key [default BF_BLOCK_KEY]\n"
+	 " -z fch1 in MHz [default 1530]\n"
+	 " -a flagants file\n"
+	 " -s stuffants \n"
+	 " -q do incoherent beam \n"
+	 " -t test pattern \n"
+	 " -h print usage\n", stdout);
+}
+
+// MAIN
+// Sets up syslog, the CUDA device, the DADA buffers and device memory, then beamforms block by block.
+int main (int argc, char *argv[]) {
+
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_beamformer", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  // device properties
+  int nDevices;
+
+  cudaGetDeviceCount(&nDevices);
+  for (int i = 0; i < nDevices; i++) {
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, i);
+    syslog(LOG_INFO,"Device Number: %d", i);
+    syslog(LOG_INFO,"  Device name: %s", prop.name);
+    syslog(LOG_INFO,"  Memory Clock Rate (KHz): %d",prop.memoryClockRate);
+  }
+  cudaSetDevice(1); // NOTE(review): device index is hard-coded and the result is not checked
+  
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+
+  // data block HDU keys
+  key_t in_key = REORDER_BLOCK_KEY2;
+  key_t out_key = BF_BLOCK_KEY;
+  
+  // command line arguments
+  int core = -1;
+  int arg = 0;
+  int stuffants=0;
+  int test_pattern = 0;
+  float fch1 = 1530.0;
+  char * fnam;
+  fnam=(char *)malloc(sizeof(char)*100);
+  sprintf(fnam,"nofile");  
+  char * flagants;
+  flagants=(char *)malloc(sizeof(char)*100);
+  sprintf(flagants,"nofile");  
+
+  while ((arg=getopt(argc,argv,"c:f:i:o:z:a:tsqdh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'i':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &in_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-i flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'f':
+	  if (optarg)
+	    {
+	      snprintf(fnam,100,"%s",optarg); // bounded copy: fnam holds 100 bytes
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-f flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }	  
+	case 'a':
+	  if (optarg)
+	    {
+	      snprintf(flagants,100,"%s",optarg); // bounded copy: flagants holds 100 bytes
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-a flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }	  
+	case 'z':
+	  if (optarg)
+	    {
+	      fch1 = atof(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-z flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }	  
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+	case 't':
+	  test_pattern=1;
+	  syslog (LOG_INFO, "Will execute test pattern");
+	  break;
+	case 's':
+	  stuffants=1;
+	  syslog (LOG_INFO, "Will place antennas in output");
+	  break;
+	case 'q':
+	  stuffants=2;
+	  syslog (LOG_INFO, "Will place IB in output");
+	  break;
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // print stuff
+  syslog(LOG_INFO,"Forming 256 beams with sep %g arcmin, fch1 %g",sep,fch1);
+  syslog(LOG_INFO,"Using calibrations file %s",fnam);
+  syslog(LOG_INFO,"Using flagants file %s",flagants);
+
+  // load in weights and antpos (zero-initialised so a failed init_weights never feeds uninitialised memory to the GPU)
+  float * antpos = (float *)calloc(64,sizeof(float)); // easting
+  float * weights = (float *)calloc(64*NW*2*2,sizeof(float)); // complex weights [ant, NW, pol, r/i]
+  float * freqs = (float *)malloc(sizeof(float)*384); // freq
+  for (int i=0;i<384;i++) freqs[i] = (fch1 - i*250./8192.)*1e6;  
+  
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      else syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+  
+  // DADA stuff
+  
+  syslog (LOG_INFO, "creating in and out hdus");
+  
+  hdu_in  = dada_hdu_create ();
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create ();
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+  
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+  
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",(unsigned long long)block_size,(unsigned long long)block_out);
+  uint64_t  bytes_read = 0;
+  int nints = NPACKETS / 16;
+  uint64_t nbytes_per_int = block_size / nints;
+  uint64_t nbytes_per_out = block_out / nints;
+  char * block;
+  unsigned char * output_buffer;
+  output_buffer = (unsigned char *)malloc(sizeof(unsigned char)*block_out);
+  memset(output_buffer,0,block_out);
+  uint64_t written, block_id;
+  
+  // allocate host and device memory for calculations
+  //inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag
+  //wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant]        
+  char *d_indata[NSTREAMS];
+  unsigned char *d_outdata[NSTREAMS];
+  float *d_transfer[NSTREAMS], *d_bp, *d_antpos, *d_weights, *d_freqs;
+  half *d_wr, *d_wi, *d_inr[NSTREAMS], *d_ini[NSTREAMS];
+  cudaMalloc((void **)&d_antpos, 64*sizeof(float)); // ant positions
+  cudaMalloc((void **)&d_weights, 64*NW*2*2*sizeof(float)); // weights
+  cudaMalloc((void **)&d_freqs, 384*sizeof(float)); // freqs        
+  cudaMalloc((void **)&d_bp, 256*sizeof(float)); // bandpass
+  cudaMalloc((void **)&d_wr, 48*2*16*4*16*16*sizeof(half)); // real weight
+  cudaMalloc((void **)&d_wi, 48*2*16*4*16*16*sizeof(half)); // imag weight
+  cudaMemcpy(d_freqs, freqs, 384*sizeof(float), cudaMemcpyHostToDevice);
+  
+  float *h_transfer = (float *)malloc(sizeof(float)*256*96*16*NSTREAMS);
+  char *h_indata = (char *)malloc(sizeof(char)*16*NANT*96*8*2);
+  float *bp = (float *)malloc(sizeof(float)*256);
+  unsigned char *tmp_buf = (unsigned char *)malloc(sizeof(unsigned char)*256*48*4*NSTREAMS);  
+  
+  // streams and device  
+  cudaStream_t stream[NSTREAMS];
+  for (int st=0;st<NSTREAMS;st++) {
+    cudaStreamCreate(&stream[st]);
+    cudaMalloc((void **)&d_indata[st], 16*96*NANT*8*2*sizeof(char)); // data input to bf kernel
+    cudaMalloc((void **)&d_outdata[st], 256*48*4*sizeof(unsigned char)); // data output from adder
+    cudaMalloc((void **)&d_transfer[st], 256*96*16*sizeof(float)); // output from beamformer
+    cudaMalloc((void **)&d_inr[st], 16*48*2*64*16*sizeof(half)); // real data
+    cudaMalloc((void **)&d_ini[st], 16*48*2*64*16*sizeof(half)); // imag data
+    thrust::device_ptr<half> d1(d_inr[st]);
+    thrust::fill(d1, d1+16*48*2*64*16, 0.0);
+    thrust::device_ptr<half> d2(d_ini[st]);
+    thrust::fill(d2, d2+16*48*2*64*16, 0.0);
+  }
+
+  
+  
+  // set up
+
+  int observation_complete=0;
+  int blocks = 0, started = 0;
+  int blockct = 0;
+  
+  syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    blockct ++;
+
+    // DO STUFF
+
+    // calc weights (files are re-read for every block; on failure init_weights logs and the previous values are kept)
+    init_weights(fnam,antpos,weights,flagants);
+    cudaMemcpy(d_antpos, antpos, 64*sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_weights, weights, 64*NW*2*2*sizeof(float), cudaMemcpyHostToDevice);  
+    calc_weights<<<6144, 256>>>(d_antpos, d_weights, d_freqs, d_wr, d_wi);
+    if (DEBUG) syslog(LOG_INFO,"Finished with weights");
+    
+    if (started==1) {
+
+      // loop over ints
+      for (int bst=0;bst<nints/NSTREAMS;bst++) {
+
+	for (int st=0;st<NSTREAMS;st++) {
+
+
+	  
+	  // copy to h_indata
+	  //memcpy(h_indata,block+(bst*NSTREAMS+st)*nbytes_per_int,nbytes_per_int);
+
+	  // rotate h_indata in place
+	  //reorder_block(h_indata);
+	  
+	  // copy to device
+	  //cudaMemcpyAsync(d_indata, h_indata, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
+	  cudaMemcpyAsync(d_indata[st], block+(bst*NSTREAMS+st)*nbytes_per_int, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
+
+	  // do promotion
+	  promoter<<<16*48*NANT, 32, 0, stream[st]>>>(d_indata[st], d_inr[st], d_ini[st]);
+	  
+	  // run beamformer kernel
+	  beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants);
+	  	  
+	  // run adder kernel
+	  adder<<<12288, 32, 0, stream[st]>>>(d_transfer[st], d_outdata[st], d_bp);
+	  
+	  // copy to host
+	  cudaMemcpyAsync(tmp_buf + 256*48*4*st, d_outdata[st], 256*48*4*sizeof(unsigned char), cudaMemcpyDeviceToHost, stream[st]);
+	  cudaStreamSynchronize(stream[st]); // tmp_buf must be complete before the host reads it below
+	  // copy to output
+	  for (int j=0;j<12288*4;j++) {
+	    if (test_pattern) 
+	      output_buffer[(bst*NSTREAMS+st)*12288*4+j] = (unsigned char)((blockct % 128) + 32);
+	    else
+	      output_buffer[(bst*NSTREAMS+st)*12288*4+j] = tmp_buf[j+256*48*4*st];
+	  }
+	  if (DEBUG && bst*NSTREAMS+st==10) { // same 12288*4 stride per int as the copy above
+	    for (int j=0;j<48;j++) syslog(LOG_DEBUG,"%hu",output_buffer[(bst*NSTREAMS+st)*12288*4+BEAM_OUT*48+j]);
+	  }        
+	  
+	}
+      }
+
+
+    }
+    
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+
+      // calculate bandpass
+
+      for (int i=0;i<256;i++) bp[i] = 0.;
+      
+      // do standard bf but calculate bandpass
+
+      // loop over ints
+      for (int bst=0;bst<nints/NSTREAMS;bst++) {
+
+	for (int st=0;st<NSTREAMS;st++) {
+	  
+	  // copy to h_indata
+	  //memcpy(h_indata,block+(bst*NSTREAMS+st)*nbytes_per_int,nbytes_per_int);
+
+	  // rotate h_indata in place - this is current
+	  //reorder_block(h_indata);
+
+	  // copy to device
+	  //cudaMemcpyAsync(d_indata, h_indata, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
+	  cudaMemcpyAsync(d_indata[st], block+(bst*NSTREAMS+st)*nbytes_per_int, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
+
+	  // do promotion
+	  promoter<<<16*48*NANT, 32, 0, stream[st]>>>(d_indata[st], d_inr[st], d_ini[st]);
+
+	  //if (bst==0 && st==0) 
+	  //  printer<<<3072, 32>>>(d_inr,d_ini);	  
+	  
+	  // run beamformer kernel
+	  beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants);
+	  
+	  // copy back to host
+	  cudaMemcpyAsync(h_transfer + st*256*96*16, d_transfer[st], sizeof(float)*393216, cudaMemcpyDeviceToHost, stream[st]);	
+	  cudaStreamSynchronize(stream[st]); // h_transfer must be complete before calc_bp reads it
+	  // calculate bandpass
+	  //if (st==0 && bst==0) 
+	  //calc_bp(h_transfer,bp,1);
+	  calc_bp(h_transfer + st*256*96*16,bp,0);
+	  ret_med_bp(bp);
+
+	}
+      }
+
+      // adjust bandpass
+      syslog(LOG_INFO,"Final BP...");
+      for (int i=0;i<256;i++) {
+	syslog(LOG_INFO,"coeff %d %g",i,bp[i]);
+	if (bp[i]!=0.) {
+	  bp[i] /= 48.*nints; 
+	  bp[i] = 2.5*128./bp[i];
+	}
+      }
+      cudaMemcpy(d_bp, bp, sizeof(float)*256, cudaMemcpyHostToDevice);
+      
+      // junk into output
+      memset(output_buffer,0,block_out);
+      
+    }
+
+    // write output for debug
+    
+    // write to output
+    written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out);
+    if (written < block_out)
+      {
+	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+	return EXIT_FAILURE;
+      }
+
+    if (DEBUG) {
+      syslog(LOG_DEBUG, "written block %d",blocks);      
+    }
+    blocks++;
+    
+
+    if (bytes_read < block_size)
+      observation_complete = 1;
+
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+
+  }
+
+  for (int st=0;st<NSTREAMS;st++) {
+    cudaStreamDestroy(stream[st]);
+    cudaFree(d_indata[st]);
+    cudaFree(d_outdata[st]);
+    cudaFree(d_transfer[st]);
+    cudaFree(d_inr[st]);
+    cudaFree(d_ini[st]);
+  }
+  free(fnam);
+  free(flagants);
+  free(h_indata);
+  free(output_buffer);
+  free(antpos);
+  free(weights);
+  free(freqs);
+  free(bp);
+  free(h_transfer);
+  free(tmp_buf);
+  // each device buffer is released exactly once
+  cudaFree(d_wr);
+  cudaFree(d_wi);
+  cudaFree(d_antpos);
+  cudaFree(d_freqs);
+  cudaFree(d_weights);
+  cudaFree(d_bp);
+  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+  
+  return EXIT_SUCCESS;
+}
+
+
diff --git a/legacy/dsaX_beamformer_offline.cu b/legacy/dsaX_beamformer_offline.cu
new file mode 100644
index 0000000..c122d46
--- /dev/null
+++ b/legacy/dsaX_beamformer_offline.cu
@@ -0,0 +1,933 @@
+// -*- c++ -*-       
+/* will implement the 64-input beamformer 
+
+does N beams of 256
+
+order is (taking time as 8x 8.192e-6) 
+[2048 time, 63 antennas, 768 channels, 2 pol, r/i]
+Load in 16 times at a time, so that we have (in units of what needs to be added)
+[16 time, 63 antennas, 96 channels, 8 chunnels, 2 pol, r/i]
+
+This should be reordered on the cpu to 
+[16 time, 96 channels, 63 antennas, 8 chunnels, 2 pol, r/i]
+
+The first kernel, launched with 1536 blocks of 64 threads, needs to
+ - promote each measurement and store in shared mem, parallelizing over ants. need only 8 kB. 
+ - each thread processes 4 beams, adding everything. for each beam,
+  + for each chunnel and pol, calculate weights using cal weights and ant positions, 
+  + add everything into output array
+Output array has order [beam, 96 frequency, 16 time]
+
+Shared mem requirement: 8 kB for promoted data, 512b for positions, nch*1024b for weights
+
+Initially we start with 4-bit numbers. These are first rotated using 17-bit weights, yielding 22-bit numbers. 
+these are then added: (64 ant)^2 * (2 complex) * (32 chan) * (2 pol) * (16 time). 
+after adding by 64 ants, we have 28-bit numbers. Need to bit shift right by 19 after adding 64 ants. This will yield 29-bit numbers. Need to bit shift right by 21 to pick off lowest 8 bits. 
+
+Do everything in floating point until second kernel. 
+
+Second kernel will simply add times and adjacent channels and pick leading 8 bits
+Then copy back to specific locations in host to form final [beam, time, frequency] array, to be sent to corner turn.
+
+ */
+#define THRUST_IGNORE_CUB_VERSION_CHECK
+
+#include <iostream>
+#include <algorithm>
+using std::cout;
+using std::cerr;
+using std::endl;
+#include <stdio.h>
+#include <stdlib.h>
+#include <cmath>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <time.h>
+#include <syslog.h>
+#include <pthread.h>
+
+#include <mma.h>
+#include <cuda.h>
+#include "cuda_fp16.h"
+//#include "dada_cuda.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "multilog.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_def.h"
+#include <thrust/device_ptr.h>
+#include <thrust/fill.h>
+
+#include <cuda_runtime_api.h>
+using namespace nvcuda;
+
+#define sep 1.0
+
+// global variables
+int DEBUG = 0;
+
+
+// kernel for summing and requantizing
+// input array has order [beam, 48 frequency, 2 pol, 16 time]
+// need to output to [4 time, beam, 48 frequency]
+// bp is scale factor for each beam 
+// run with 256*48=12288 blocks and 32 threads
+__global__
+void adder(float *input, unsigned char *output, float *bp) {
+
+  // get block and thread ids
+  int bidx = blockIdx.x; // assume 256*48=12288
+  int tidx = threadIdx.x; // assume 32
+  // 48 consecutive blocks (one per frequency) share a beam
+  int beamidx = (int)(bidx / 48);
+  
+  // declare shared mem
+  volatile __shared__ float data[32]; // data block to be summed  
+
+  // transfer from input to shared mem
+  data[tidx] = input[bidx*32+tidx];
+  
+  // sync
+  __syncthreads();
+
+  // complete sum: each writer thread adds its own four times (both pols) into a private value,
+  float acc = 0.f; // so no thread reads shared memory that a neighbour is updating
+  if (tidx<16 && tidx%4==0) {
+    acc = ((data[tidx]+data[tidx+16]) + (data[tidx+2]+data[tidx+18]))
+        + ((data[tidx+1]+data[tidx+17]) + (data[tidx+3]+data[tidx+19])); // same pairing as the former in-place reduction
+  }
+  // now tidx = 0, 4, 8, 12 are what we want! 
+
+  // no second barrier is needed: acc is private to each thread
+  
+  // store
+  if (tidx == 0) 
+    output[bidx] = (unsigned char)(__float2int_rn(acc*bp[beamidx])/2);
+  if (tidx == 4) 
+    output[bidx + 12288] = (unsigned char)(__float2int_rn(acc*bp[beamidx])/2);
+  if (tidx == 8) 
+    output[bidx + 2*12288] = (unsigned char)(__float2int_rn(acc*bp[beamidx])/2);
+  if (tidx == 12) 
+    output[bidx + 3*12288] = (unsigned char)(__float2int_rn(acc*bp[beamidx])/2);
+  
+  /*if (tidx == 0)
+    output[bidx] = (unsigned char)(__float2int_rn(acc));
+  if (tidx == 4)
+    output[bidx + 12288] = (unsigned char)(__float2int_rn(acc));
+  if (tidx == 8)
+    output[bidx + 2*12288] = (unsigned char)(__float2int_rn(acc));
+  if (tidx == 12)
+  output[bidx + 3*12288] = (unsigned char)(__float2int_rn(acc));*/
+  
+}
+
+// kernel for promotion
+/*
+orig input is [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i]
+input is [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i]
+output needs to be [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] 
+promoted to half precision  
+
+launch with 16*48*NANT blocks of 32 threads
+
+ */
+__global__ void promoter(char *input, half *inr, half *ini) {
+  // each thread unpacks one input byte into one real and one imaginary half value
+  int bidx = blockIdx.x; // assume 16*48*NANT
+  int tidx = threadIdx.x; // assume 32
+  int iidx = bidx*32+tidx; // input byte handled by this thread
+  int pol = (int)(tidx % 2); // pol varies fastest within the 32 bytes of a block
+  int chunnel = (int)(tidx / 2);
+  
+  /*int ant = (int)(bidx % NANT);
+  int time_chan = (int)(bidx / NANT);    
+  int oidx = time_chan*2048+pol*1024+ant*16+chunnel;*/
+
+  int chan = (int)(bidx % 48); // block index is decoded as [time, ant, chan], chan fastest
+  int time_ant = (int)(bidx / 48);
+  int tim = (int)(time_ant / NANT);
+  int ant = (int)(time_ant % NANT);
+  int oidx = tim*98304 + chan*2048 + pol*1024 + ant*16 + chunnel; // output order [time, chan, pol, 64 ant, chunnel]
+
+  //inr[oidx] = __float2half((float)(((char)((input[iidx] & 15) << 4)) >> 4));
+  //ini[oidx] = __float2half((float)(((char)((input[iidx] & 240))) >> 4));
+  inr[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(15)) << 4) >> 4)); // low nibble; NOTE(review): sign extension assumes char is signed and >> is arithmetic
+  ini[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4)); // high nibble, same assumption
+
+}
+
+// debug kernel: prints every non-zero promoted sample, input order [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels]
+// for first time, launch with 3072, 32
+__global__ void printer(half *inr, half *ini) {
+
+  int idx = blockIdx.x*32+threadIdx.x;
+  float ir = __half2float(inr[idx]);
+  float ii = __half2float(ini[idx]);
+
+  int chunnel = (int)(threadIdx.x % 16);
+  int channel = (int)(blockIdx.x/64); // 64 blocks of 32 = 2048 samples per channel
+  int tt = (int)(blockIdx.x % 64);
+  int pol = (int)(tt/32); // 32 blocks = 1024 samples per pol
+  int ant = ((int)(tt % 32))*2 + (int)(threadIdx.x / 16); // each block of 32 threads spans two antennas of 16 chunnels
+  
+  if (ir!=0. || ii!=0.) {
+    printf("%d %d %d %d %f %f\n",channel,pol,ant,chunnel,ir,ii);
+  }
+  
+}
+
+
+// kernel for beamforming
+/*
+
+Assumes that up to NANT antennas (nominally 63) are populated. 
+
+Input is [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] (promoted)
+
+Arithmetic... for rotation, d2r = wr*dr-wi*di; d2i = wi*dr+wr*di
+
+Conventions for beamforming. beam 0 is furthest East, beam 127 is at meridian. antpos (D) is easting. 
+for bf weight calculation, where theta = s(127-n), ang = 2*pi*nu*theta*D/c; wr = cos(ang), wi = sin(ang)
+use __float2int_rn, cosf, sinf intrinsics. 
+
+Each warp (==block) has to deal with 256 beams for 64 ants, summing over 16 chunnels and pols. 
+Do it in tiles of 16 beams and 16 ants for 
+
+Output array has order [beam, 48 frequency, 2 pol, 16 time]
+
+inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag
+wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant]
+
+launch with 16time * 48freq * 2pol * 16beam_tile blocks of 32 threads for massive utilization
+ = 24576 blocks
+
+*/
+__global__ void beamformer(half *inr, half *ini, half *wr, half *wi, float *output, int stuffants) {
+  // stuffants: 1 writes per-antenna power instead of beams; 2 beamforms and then overwrites beam 0 with the incoherent sum
+  // get block and thread ids
+  int bidx = blockIdx.x; // assume 24576
+  int tidx = threadIdx.x; // assume 32
+  int orig_bidx = (int)(bidx / 16);
+  int beam_tile = (int)(bidx % 16);
+  int stuff_tile = (int)(beam_tile % 4);
+  int data_offset = orig_bidx*1024; // offset for first part of data
+  int weight_offset = (int)(orig_bidx % 96); // offset for first part of weight
+  weight_offset *= 16384;
+  // 16384 = 16 beam tiles x 4 ant tiles x 256 weights per (freq, pol)
+  int f_idx = (int)(orig_bidx % 96);
+  int tim_idx = (int)(orig_bidx / 96);
+  int oidx = f_idx*16 + tim_idx;
+  
+  // shared memory for convenience
+  __shared__ half summr[16][16]; // beam, chunnel
+  __shared__ float summi[16][16]; // beam, chunnel
+  
+  // accumulate real and imag parts into [16 beam x 16 f] fragments
+  // Declare the fragments.
+  wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;
+  wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wr_inr_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wr_ini_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wi_inr_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wi_ini_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, float> ib_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, float> final_frag;
+  
+  
+  // zero out accumulators
+  wmma::fill_fragment(wr_inr_frag, 0.0f);
+  wmma::fill_fragment(wr_ini_frag, 0.0f);
+  wmma::fill_fragment(wi_inr_frag, 0.0f);
+  wmma::fill_fragment(wi_ini_frag, 0.0f);
+  wmma::fill_fragment(ib_frag, 0.0f);
+
+  // IB
+  if (stuffants==2) {
+
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> c_frag;
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> d_frag;
+    
+    for (int ant_tile=0; ant_tile<4; ant_tile++) {
+
+      wmma::load_matrix_sync(c_frag, inr + data_offset + ant_tile*256, 16);
+      wmma::load_matrix_sync(d_frag, inr + data_offset + ant_tile*256, 16);
+      wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag);
+      wmma::load_matrix_sync(c_frag, ini + data_offset + ant_tile*256, 16);
+      wmma::load_matrix_sync(d_frag, ini + data_offset + ant_tile*256, 16);
+      wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag);
+
+    }
+
+  }
+
+  // one ant per beam
+  if (stuffants==1) {        
+
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> c_frag;
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> d_frag;
+    wmma::load_matrix_sync(c_frag, inr + data_offset + stuff_tile*256, 16);
+    wmma::load_matrix_sync(d_frag, inr + data_offset + stuff_tile*256, 16);
+    wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag);
+    wmma::load_matrix_sync(c_frag, ini + data_offset + stuff_tile*256, 16);
+    wmma::load_matrix_sync(d_frag, ini + data_offset + stuff_tile*256, 16);
+    wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag);
+    
+  }
+  if (stuffants!=1) {
+  
+    // loop over ant tiles
+    for (int ant_tile=0; ant_tile<4; ant_tile++) {
+      
+      // copy weight and data to fragments, and multiply to accumulators
+      
+      wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16);
+      wmma::load_matrix_sync(b_frag, inr + data_offset + ant_tile*256, 16);
+      wmma::mma_sync(wr_inr_frag, a_frag, b_frag, wr_inr_frag);
+      
+      wmma::load_matrix_sync(a_frag, wi + weight_offset + beam_tile*1024 + ant_tile*256, 16);
+      wmma::mma_sync(wi_inr_frag, a_frag, b_frag, wi_inr_frag);
+      
+      wmma::load_matrix_sync(b_frag, ini + data_offset + ant_tile*256, 16);
+      wmma::mma_sync(wi_ini_frag, a_frag, b_frag, wi_ini_frag);
+      
+      wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16);
+      wmma::mma_sync(wr_ini_frag, a_frag, b_frag, wr_ini_frag);
+      
+    }
+
+    // form real and imaginary matrices
+    for(int i=0; i < wr_inr_frag.num_elements; i++) {
+      wr_inr_frag.x[i] = wr_inr_frag.x[i] - wi_ini_frag.x[i]; // output real
+      wi_inr_frag.x[i] = wi_inr_frag.x[i] + wr_ini_frag.x[i]; // output imag
+      wr_inr_frag.x[i] = wr_inr_frag.x[i]*wr_inr_frag.x[i] + wi_inr_frag.x[i]*wi_inr_frag.x[i]; // squared
+    }
+  }
+
+  // at this stage the matrices are [beam, chunnel], and need to be summed over columns
+
+  __syncthreads();
+    
+  // copy back to shared mem
+  half *p1;
+  float *p2, tmp;
+  p1 = &summr[0][0];
+  wmma::store_matrix_sync(p1, wr_inr_frag, 16, wmma::mem_row_major);
+
+  __syncthreads();
+  
+  if (stuffants!=1) {
+
+      // now do thread reduction using multiplication by unity
+    wmma::fill_fragment(final_frag, 0.0f);
+    wmma::fill_fragment(b_frag, 1.0f);
+    wmma::load_matrix_sync(a_frag, p1, 16);
+    wmma::mma_sync(final_frag, a_frag, b_frag, final_frag);
+    p2 = &summi[0][0];
+    wmma::store_matrix_sync(p2, final_frag, 16, wmma::mem_row_major);
+    
+    __syncthreads();
+
+    // store
+    if (tidx<16) {
+      output[(beam_tile*16+tidx)*1536 + oidx] = summi[tidx][tidx];
+    }
+
+    
+    // do thread reduction for each beam    
+    /*    if (tidx<8) {
+      for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+8];
+      for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+4];
+      for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+2];
+      for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+1];
+    }
+    if (tidx>=8 && tidx<16) {
+      for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+8-8];
+      for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+4-8];
+      for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+2-8];
+      for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+1-8];
+    }
+    if (tidx>=16 && tidx<24) {
+      for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+8-16];
+      for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+4-16];
+      for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+2-16];
+      for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+1-16];
+    }
+    if (tidx>=24) {
+      for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+8-24];
+      for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+4-24];
+      for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+2-24];
+      for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+1-24];
+      }*/
+
+    /*if (tidx<16) 
+      for (int j=1;j<16;j++) summr[tidx][0] += summr[tidx][j];
+
+      __syncthreads();*/
+    
+    // now summr[beam][0] can go into output
+    /*if (tidx<16) {
+      output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][0];
+      }*/
+
+  }
+
+  if (stuffants==1) {
+    if (tidx<16) {
+      output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][tidx];
+    }
+  }
+  if (stuffants==2) {
+    __syncthreads(); // the reads of summi above must finish before it is overwritten
+    p2 = &summi[0][0];
+    wmma::store_matrix_sync(p2, ib_frag, 16, wmma::mem_row_major);
+    __syncthreads(); // every lane's part of the stored tile must be visible before the diagonal is summed
+    tmp = 0.;
+    for (int i=0;i<16;i++) tmp += summi[i][i];
+    if (tidx==0 && beam_tile==0) 
+      output[(beam_tile*16+tidx)*1536 + oidx] = tmp;
+  }      
+  
+}
+
+// kernel to calculate weights - needed because weights are halfs
+// launch with 256 threads in 6144 blocks
+__global__
+void calc_weights(float *antpos, float *weights, float *freqs, half *wr, half *wi) {
+  // each thread rotates the weight of one (freq, pol, ant) by the phase of one beam and writes one element of wr/wi
+  // assume 256 threads in 6144 blocks
+  int bidx = blockIdx.x; // over 48f, 2pol, 16 beam_tile, 4 ant_tile
+  int tidx = threadIdx.x;
+  int f = (int)(bidx / 128); // 128 blocks per frequency
+  int cc = (int)(bidx % 128);
+  int pol = (int)(cc / 64); // 64 blocks per pol
+  cc = (int)(cc % 64);
+  int beam_tile = (int)(cc / 4);
+  int ant_tile = (int)(cc % 4);
+  int beam_i = (int)(tidx / 16); // beam within the 16x16 tile
+  int ant_i = (int)(tidx % 16); // antenna within the 16x16 tile
+
+  int beam = beam_tile*16+beam_i;
+  int ant = ant_tile*16+ant_i;
+  int i = bidx*256+tidx; // output index into wr and wi
+  int widx = ant*NW*2*2 + f*2*2 + pol*2; // real part of the input weight; imaginary part is at widx+1
+  
+  //float theta = sep*(127.-beam*1.)*PI/10800.; // radians
+  float theta = sep*(127.-beam*1.)*PI/10800.; // radians
+  float afac = -2.*PI*freqs[f*8+4]*sinf(theta)/CVAC; // factor for rotate; uses sinf(theta) rather than theta itself
+  float twr = cos(afac*antpos[ant]); // real part of the beam phase factor
+  float twi = sin(afac*antpos[ant]); // imaginary part of the beam phase factor
+
+  wr[i] = __float2half((twr*weights[widx] - twi*weights[widx+1])); // real part of (twr + j*twi)*(w_re + j*w_im)
+  wi[i] = __float2half((twi*weights[widx] + twr*weights[widx+1])); // imaginary part of the same product
+  
+  
+}  
+ 
+  
+// function prototypes
+int dada_bind_thread_to_core (int core);
+int init_weights(char *fnam, float *antpos, float *weights, char *flagants);
+void reorder_block(char *block);
+void calc_bp(float *data, float *bp, int pr);
+
+
+// Accumulates beamformer output into a per-beam total.
+// data holds 256 beams, each a contiguous run of 48*32 (= 1536) floats;
+// every value belonging to beam b is added to bp[b].
+// bp must hold 256 floats and is NOT cleared here, so repeated calls keep accumulating.
+// If pr is non-zero, each non-zero sample is printed as "beam f a value".
+void calc_bp(float *data, float *bp, int pr) {
+
+  const float *sample = data;
+
+  for (int b=0;b<256;b++) {
+    for (int k=0;k<48*32;k++) {
+      bp[b] += *sample;
+      if (pr && *sample!=0.) printf("%d %d %d %f\n",b,k/32,k%32,*sample);
+      sample++;
+    }
+  }
+
+}
+
+// qsort comparator for floats (ascending); used when finding the bandpass median.
+// Returns -1, 0 or 1 for less-than, equal/unordered, greater-than.
+
+int cmpfunc(const void* elem1, const void* elem2)
+{
+  const float lhs = *(const float*)elem1;
+  const float rhs = *(const float*)elem2;
+  return (lhs > rhs) - (lhs < rhs);
+}
+
+// Overwrites all 256 entries of bp with the median of bp.
+// bp is sorted in place (ascending, via cmpfunc) and the median is taken as
+// the mean of the two middle elements.
+void ret_med_bp(float *bp) {
+
+  const int nbeams = 256;
+
+  qsort(bp, nbeams, sizeof(float), cmpfunc);
+  float medval = 0.5*(bp[127]+bp[128]);
+  for (float *p=bp; p!=bp+nbeams; p++)
+    *p = medval;
+
+}
+
+// CPU-side reorder of one block before it is loaded onto the GPU.
+// Swaps the antenna and channel axes, moving 32-byte units
+// (16 chunnels x 2 pol x r/i):
+//   from [16 time, NANT antennas, 48 channels, 32 bytes]
+//   to   [16 time, 48 channels, NANT antennas, 32 bytes]
+// The block is 24576*NANT bytes in total (1536*NANT per time) and is rewritten in place
+// through a temporary buffer.
+void reorder_block(char * block) {
+
+  const int unit = 32;              // bytes moved per copy
+  const int per_time = 1536*NANT;   // bytes per time sample
+
+  char * scratch = (char *)malloc(sizeof(char)*24576*NANT);
+
+  for (int t=0;t<16;t++) { // over time
+    const char * src = block + t*per_time;
+    char * dst = scratch + t*per_time;
+    for (int a=0;a<NANT;a++) // over ants
+      for (int c=0;c<48;c++) // over channels
+        memcpy(dst + c*NANT*unit + a*unit, src + a*1536 + c*unit, unit);
+  }
+
+  memcpy(block,scratch,24576*NANT);
+  free(scratch);
+
+}
+
+
+// Loads antenna positions and complex weights, then zeroes the flagged antennas.
+//
+// fnam     : binary calibration file holding 64 floats of antenna position
+//            (only easting is taken) followed by 64*NW*2*2 floats of weights,
+//            ordered [ant, NW, pol, r/i]
+// antpos   : output, 64 floats
+// weights  : output, 64*NW*2*2 floats; every non-zero complex weight w is
+//            replaced by w/|w|^2
+// flagants : text file of whitespace-separated antenna indices (0..63) whose
+//            weights are set to zero
+//
+// Returns 0 on success, 1 if either file cannot be opened or the calibration
+// file is too short.
+int init_weights(char * fnam, float *antpos, float *weights, char *flagants) {
+
+  // assumes 64 antennas
+
+  FILE *fin;
+  FILE *fants;
+
+  if (!(fin=fopen(fnam,"rb"))) {
+    syslog(LOG_ERR,"Couldn't open weights file %s",fnam);
+    return 1;
+  }
+  if (!(fants=fopen(flagants,"r"))) {
+    syslog(LOG_ERR,"Couldn't open flag ants file %s",flagants);
+    fclose(fin); // do not leak the already-open weights file
+    return 1;
+  }
+
+  // a short read would leave antpos/weights partly uninitialised, so treat it as an error
+  if (fread(antpos,64*sizeof(float),1,fin) != 1 ||
+      fread(weights,64*NW*2*2*sizeof(float),1,fin) != 1) {
+    syslog(LOG_ERR,"Couldn't read antenna positions and weights from %s",fnam);
+    fclose(fants);
+    fclose(fin);
+    return 1;
+  }
+
+  // scale each complex weight by 1/|w|^2; zero weights are left untouched
+  float wnorm;
+  for (int i=0;i<64*NW*2;i++) {
+    wnorm = sqrt(weights[2*i]*weights[2*i] + weights[2*i+1]*weights[2*i+1]);
+    if (wnorm!=0.0) {
+      weights[2*i] /= wnorm*wnorm;
+      weights[2*i+1] /= wnorm*wnorm;
+    }
+  }
+
+  // Zero the weights of flagged antennas. Looping on the fscanf result (rather
+  // than feof) stops cleanly on an empty file or on a non-numeric token, and
+  // guarantees ant has been assigned before it is used.
+  int ant;
+  while (fscanf(fants,"%d",&ant) == 1) {
+    if (ant < 0 || ant >= 64) {
+      // an out-of-range index would write outside the weights array
+      syslog(LOG_WARNING,"Ignoring out-of-range flagged antenna %d",ant);
+      continue;
+    }
+    for (int j=0;j<NW*2*2;j++) {
+      weights[ant*NW*2*2+j] = 0.0;
+    }
+  }
+
+  fclose(fants);
+  fclose(fin);
+  if (DEBUG) syslog(LOG_INFO,"Loaded antenna positions and weights");
+  return 0;
+
+}
+
+
+// Prints the command-line help text to stdout.
+void usage()
+{
+  static const char help_text[] =
+    "dsaX_beamformer [options]\n"
+    " -c core   bind process to CPU core [no default]\n"
+    " -d send debug messages to syslog\n"
+    " -f filename for antenna stuff [no default]\n"
+    " -i input data set [no default]\n"
+    " -z fch1 in MHz [default 1530]\n"
+    " -a flagants file\n"
+    " -s stuffants \n"
+    " -o out beam [default 1]\n"
+    " -q do incoherent beam \n"
+    " -t test pattern \n"
+    " -p output total power time series \n"
+    " -h print usage\n";
+
+  fputs (help_text, stdout);
+}
+
+// MAIN
+//
+// Offline beamformer driver.
+//   1. Parses options, loads weights / antenna positions and computes the
+//      half-precision beamformer weights on the GPU.
+//   2. Pass 1: reads the first block of the input file (-i) and runs
+//      promoter -> beamformer to derive the bandpass scaling (d_bp).
+//   3. Pass 2: re-reads the file from the start and pushes 15 blocks through
+//      promoter -> beamformer -> adder, collecting the 8-bit output on the host.
+//   4. Writes /home/ubuntu/data/tmp/output.dat: the 48 channels of beam -o for
+//      every output time sample, or (with -p) a text total-power series for all beams.
+//
+// Returns EXIT_SUCCESS, or EXIT_FAILURE on bad options or when a required
+// file cannot be opened / loaded.
+int main (int argc, char *argv[]) {
+
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_beamformer_offline", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  cudaSetDevice(0);
+
+  // command line arguments
+  int core = -1;
+  int arg = 0;
+  int stuffants=0;
+  int test_pattern = 0; // set by -t; not consulted anywhere below
+  float fch1 = 1530.0;
+  const size_t name_len = 100; // capacity of each file-name buffer, including the terminator
+  char * fnam;
+  fnam=(char *)malloc(sizeof(char)*name_len);
+  sprintf(fnam,"nofile");
+  char * finnam;
+  finnam=(char *)malloc(sizeof(char)*name_len);
+  sprintf(finnam,"nofile");
+  char * flagants;
+  flagants=(char *)malloc(sizeof(char)*name_len);
+  sprintf(flagants,"nofile");
+  int outbm = 1;
+  int outpwr = 0;
+
+  while ((arg=getopt(argc,argv,"c:f:i:z:a:o:ptsqdh")) != -1)
+    {
+      switch (arg)
+        {
+        case 'c':
+          if (optarg)
+            {
+              core = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-c flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'i':
+          if (!optarg)
+            {
+              syslog(LOG_ERR,"-i flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+          // the destination buffer is fixed-size: refuse names that do not fit
+          if (strlen(optarg) >= name_len)
+            {
+              syslog(LOG_ERR,"-i argument is too long (max %d characters)",(int)name_len-1);
+              return EXIT_FAILURE;
+            }
+          strcpy(finnam,optarg);
+          break;
+        case 'f':
+          if (!optarg)
+            {
+              syslog(LOG_ERR,"-f flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+          if (strlen(optarg) >= name_len)
+            {
+              syslog(LOG_ERR,"-f argument is too long (max %d characters)",(int)name_len-1);
+              return EXIT_FAILURE;
+            }
+          strcpy(fnam,optarg);
+          break;
+        case 'p':
+          outpwr=1;
+          break;
+        case 'o':
+          if (optarg)
+            {
+              outbm = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-o flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'a':
+          if (!optarg)
+            {
+              syslog(LOG_ERR,"-a flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+          if (strlen(optarg) >= name_len)
+            {
+              syslog(LOG_ERR,"-a argument is too long (max %d characters)",(int)name_len-1);
+              return EXIT_FAILURE;
+            }
+          strcpy(flagants,optarg);
+          break;
+        case 'z':
+          if (optarg)
+            {
+              fch1 = atof(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-z flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'd':
+          DEBUG=1;
+          syslog (LOG_DEBUG, "Will excrete all debug messages");
+          break;
+        case 't':
+          test_pattern=1;
+          syslog (LOG_INFO, "Will execute test pattern");
+          break;
+        case 's':
+          stuffants=1;
+          syslog (LOG_INFO, "Will place antennas in output");
+          break;
+        case 'q':
+          stuffants=2;
+          syslog (LOG_INFO, "Will place IB in output");
+          break;
+        case 'h':
+          usage();
+          return EXIT_SUCCESS;
+        }
+    }
+
+  // the single-beam writer indexes the output buffer with outbm, so it must be one of the 256 beams
+  if (!outpwr && (outbm < 0 || outbm >= 256)) {
+    syslog(LOG_ERR,"-o beam %d is outside the valid range 0-255",outbm);
+    usage();
+    return EXIT_FAILURE;
+  }
+
+  // print stuff
+  syslog(LOG_INFO,"Forming 256 beams with sep %g arcmin, fch1 %g",sep,fch1);
+  syslog(LOG_INFO,"Using calibrations file %s",fnam);
+  syslog(LOG_INFO,"Using flagants file %s",flagants);
+  syslog(LOG_INFO,"Input file %s",finnam);
+
+
+  // load in weights and antpos
+  float * antpos = (float *)malloc(sizeof(float)*64); // easting
+  float * weights = (float *)malloc(sizeof(float)*64*NW*2*2); // complex weights [ant, NW, pol, r/i]
+  float * freqs = (float *)malloc(sizeof(float)*384); // freq
+  for (int i=0;i<384;i++) freqs[i] = (fch1 - i*250./8192.)*1e6;
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+        syslog(LOG_ERR,"failed to bind to core %d", core);
+      else
+        syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+
+  // get block sizes and allocate memory
+  uint64_t block_size = 198180864;
+  uint64_t block_out = 15*48*512*256;
+  char * block;
+  block = (char *)malloc(sizeof(char)*block_size);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
+  int nints = NPACKETS / 16;
+  uint64_t nbytes_per_int = block_size / nints;
+  unsigned char * output_buffer;
+  output_buffer = (unsigned char *)malloc(sizeof(unsigned char)*block_out);
+  memset(output_buffer,0,block_out);
+
+  // allocate host and device memory for calculations
+  //inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag
+  //wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant]
+  char *d_indata[NSTREAMS];
+  unsigned char *d_outdata[NSTREAMS];
+  float *d_transfer[NSTREAMS], *d_bp, *d_antpos, *d_weights, *d_freqs;
+  half *d_wr, *d_wi, *d_inr[NSTREAMS], *d_ini[NSTREAMS];
+  cudaMalloc((void **)&d_antpos, 64*sizeof(float)); // ant positions
+  cudaMalloc((void **)&d_weights, 64*NW*2*2*sizeof(float)); // weights
+  cudaMalloc((void **)&d_freqs, 384*sizeof(float)); // freqs
+  cudaMalloc((void **)&d_bp, 256*sizeof(float)); // bandpass
+  cudaMalloc((void **)&d_wr, 48*2*16*4*16*16*sizeof(half)); // real weight
+  cudaMalloc((void **)&d_wi, 48*2*16*4*16*16*sizeof(half)); // imag weight
+  cudaMemcpy(d_freqs, freqs, 384*sizeof(float), cudaMemcpyHostToDevice);
+
+  float *h_transfer = (float *)malloc(sizeof(float)*256*96*16*NSTREAMS);
+  float *bp = (float *)malloc(sizeof(float)*256);
+  unsigned char *tmp_buf = (unsigned char *)malloc(sizeof(unsigned char)*256*48*4*NSTREAMS);
+
+  // streams and device
+  cudaStream_t stream[NSTREAMS];
+  for (int st=0;st<NSTREAMS;st++) {
+    cudaStreamCreate(&stream[st]);
+    cudaMalloc((void **)&d_indata[st], 16*96*NANT*8*2*sizeof(char)); // data input to bf kernel
+    cudaMalloc((void **)&d_outdata[st], 256*48*4*sizeof(unsigned char)); // data output from adder
+    cudaMalloc((void **)&d_transfer[st], 256*96*16*sizeof(float)); // output from beamformer
+    cudaMalloc((void **)&d_inr[st], 16*48*2*64*16*sizeof(half)); // real data
+    cudaMalloc((void **)&d_ini[st], 16*48*2*64*16*sizeof(half)); // imag data
+    thrust::device_ptr<half> d1(d_inr[st]);
+    thrust::fill(d1, d1+16*48*2*64*16, 0.0);
+    thrust::device_ptr<half> d2(d_ini[st]);
+    thrust::fill(d2, d2+16*48*2*64*16, 0.0);
+  }
+
+
+  // set up
+
+  int blocks = 0;
+
+  syslog(LOG_INFO, "starting observation");
+
+  // init weights; without them antpos/weights are uninitialised, so give up
+  if (init_weights(fnam,antpos,weights,flagants) != 0) {
+    syslog(LOG_ERR,"Could not load weights, exiting");
+    return EXIT_FAILURE;
+  }
+  cudaMemcpy(d_antpos, antpos, 64*sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_weights, weights, 64*NW*2*2*sizeof(float), cudaMemcpyHostToDevice);
+  calc_weights<<<6144, 256>>>(d_antpos, d_weights, d_freqs, d_wr, d_wi);
+  if (DEBUG) syslog(LOG_INFO,"Finished with weights");
+
+  // open data file and read first block (used only for the bandpass estimate)
+  FILE *fin;
+  if (!(fin=fopen(finnam,"rb"))) {
+    syslog(LOG_ERR,"Couldn't open input file %s",finnam);
+    return EXIT_FAILURE;
+  }
+  if (fread(block,sizeof(char),block_size,fin) != block_size)
+    syslog(LOG_WARNING,"Short read of first block from %s",finnam);
+  fclose(fin);
+
+  // calculate bp
+  for (int i=0;i<256;i++) bp[i] = 0.;
+
+  // loop over ints
+  for (int bst=0;bst<nints/NSTREAMS;bst++) {
+
+    for (int st=0;st<NSTREAMS;st++) {
+
+      cudaMemcpyAsync(d_indata[st], block+(bst*NSTREAMS+st)*nbytes_per_int, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
+
+      // do promotion
+      promoter<<<16*48*NANT, 32, 0, stream[st]>>>(d_indata[st], d_inr[st], d_ini[st]);
+
+      // run beamformer kernel
+      beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants);
+
+      // copy back to host
+      cudaMemcpyAsync(h_transfer + st*256*96*16, d_transfer[st], sizeof(float)*393216, cudaMemcpyDeviceToHost, stream[st]);
+
+      // make sure the copy has landed before the host reads h_transfer
+      cudaStreamSynchronize(stream[st]);
+
+      // add this integration to bp, then replace every entry by the median over beams
+      calc_bp(h_transfer + st*256*96*16,bp,0);
+      ret_med_bp(bp);
+
+    }
+  }
+
+
+  // adjust bandpass
+  syslog(LOG_INFO,"Final BP...");
+  for (int i=0;i<256;i++) {
+    //syslog(LOG_INFO,"coeff %d %g",i,bp[i]);
+    if (bp[i]!=0.) {
+      bp[i] /= 48.*nints;
+      bp[i] = 2.5*128./bp[i];
+    }
+  }
+  cudaMemcpy(d_bp, bp, sizeof(float)*256, cudaMemcpyHostToDevice);
+
+  // re-open file and loop over blocks
+  if (!(fin=fopen(finnam,"rb"))) {
+    syslog(LOG_ERR,"Couldn't re-open input file %s",finnam);
+    return EXIT_FAILURE;
+  }
+
+  while (blocks<15) {
+
+    syslog(LOG_INFO,"read blocks %d",blocks);
+    if (fread(block,sizeof(char),block_size,fin) != block_size)
+      syslog(LOG_WARNING,"Short read of block %d from %s",blocks,finnam);
+
+    // loop over ints
+    for (int bst=0;bst<nints/NSTREAMS;bst++) {
+
+      for (int st=0;st<NSTREAMS;st++) {
+
+        // copy to device
+        cudaMemcpyAsync(d_indata[st], block+(bst*NSTREAMS+st)*nbytes_per_int, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
+
+        // do promotion
+        promoter<<<16*48*NANT, 32, 0, stream[st]>>>(d_indata[st], d_inr[st], d_ini[st]);
+
+        // run beamformer kernel
+        beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants);
+
+        // run adder kernel
+        adder<<<12288, 32, 0, stream[st]>>>(d_transfer[st], d_outdata[st], d_bp);
+
+        // copy to host
+        cudaMemcpyAsync(tmp_buf + 256*48*4*st, d_outdata[st], 256*48*4*sizeof(unsigned char), cudaMemcpyDeviceToHost, stream[st]);
+
+        // make sure the copy has landed before the host reads tmp_buf
+        cudaStreamSynchronize(stream[st]);
+
+        // copy to output
+        for (int jj=0;jj<4;jj++) {
+          for (int bmn=0;bmn<256;bmn++) {
+            for (int j=0;j<48;j++) {
+              output_buffer[blocks*512*48*256 + (bst*NSTREAMS+st)*48*4*256+ jj*48*256 + bmn*48 + j] = tmp_buf[256*48*4*st + jj*256*48 + bmn*48 + j];
+            }
+          }
+        }
+
+      }
+    }
+
+    blocks++;
+
+  }
+
+  syslog(LOG_INFO,"blocks %d",blocks);
+
+  fclose(fin);
+
+  // Number of [256 beam, 48 channel] time samples that output_buffer really holds
+  // (15*512). Writing more than this would read past the end of the buffer.
+  const int nsamp_out = (int)(block_out / (48*256));
+  int status = EXIT_SUCCESS;
+
+  float pwrs = 0;
+  FILE *fout;
+  if (!outpwr) {
+    // binary: the 48 channels of beam outbm for every time sample
+    fout=fopen("/home/ubuntu/data/tmp/output.dat","wb");
+    if (!fout) {
+      syslog(LOG_ERR,"Couldn't open output file /home/ubuntu/data/tmp/output.dat");
+      status = EXIT_FAILURE;
+    }
+    else {
+      for (int i=0;i<nsamp_out;i++)
+        fwrite(output_buffer + i*48*256 + outbm*48,sizeof(unsigned char),48,fout);
+      fclose(fout);
+    }
+  }
+  else {
+    // text: channel-summed power, one line per (time sample, beam)
+    fout=fopen("/home/ubuntu/data/tmp/output.dat","w");
+    if (!fout) {
+      syslog(LOG_ERR,"Couldn't open output file /home/ubuntu/data/tmp/output.dat");
+      status = EXIT_FAILURE;
+    }
+    else {
+      for (int i=0;i<nsamp_out;i++) {
+        for (int j=0;j<256;j++) {
+          pwrs = 0.;
+          for (int k=0;k<48;k++) pwrs += (float)(output_buffer[i*256*48 + j*48 + k]);
+          fprintf(fout,"%f\n",pwrs);
+        }
+      }
+      fclose(fout);
+    }
+  }
+
+
+  // release streams and per-stream device buffers
+  for (int st=0;st<NSTREAMS;st++) {
+    cudaStreamDestroy(stream[st]);
+    cudaFree(d_indata[st]);
+    cudaFree(d_outdata[st]);
+    cudaFree(d_transfer[st]);
+    cudaFree(d_inr[st]);
+    cudaFree(d_ini[st]);
+  }
+
+  // release host buffers
+  free(block);
+  free(fnam);
+  free(flagants);
+  free(output_buffer);
+  free(antpos);
+  free(weights);
+  free(freqs);
+  free(bp);
+  free(h_transfer);
+  free(tmp_buf);
+  free(finnam);
+
+  // release shared device buffers
+  cudaFree(d_wr);
+  cudaFree(d_wi);
+  cudaFree(d_antpos);
+  cudaFree(d_freqs);
+  cudaFree(d_weights);
+  cudaFree(d_bp);
+
+  return status;
+
+}
+
+
diff --git a/legacy/dsaX_beamformer_passon b/legacy/dsaX_beamformer_passon
new file mode 100755
index 0000000000000000000000000000000000000000..b08ed99873c198055c7e078c5d1cf0100e9af070
GIT binary patch
literal 178600
zcmd3Pd0<r4_5TDCh=@*5w5V93MjKoc6(v>bL^Aq<K_iQ@nm`DPvczNp!GZ=SQKn<s
zXl;uH3s!6WTCGd14G|%%iF@2ZP@+<O&rpfr5){qv^SR5LJK>F6`^PVuym!vM=iGD7
zJ@?#mml<vf7F^OJDXFJV{q*qt!k2{i>X89~k$-)@nUC}kF#iO6Lw$$f?_l34zFvS+
z1db&v3&cG;$qYaGnJPga8%)OU&>j*ov`0X^=;uI*_UWf$BV=QHv7dl?mCq!_P(OXX
zbWu)v%KE@C+28}irV6}%+KTB+Euf-fZPK+Wx>iM}pXo||{nX^tjlW}5`IsI7K!gK)
z`0~Wx*pqenQ~2-DG`_bUEeTtXzF6S(bCQx%KQ-Nzprf4M_a{wN9HrVzWS4PDe*M(_
zaMtWuQ_nm1tl87fm_2K5Wz`u~L(e<oymQa2n0MycLT~bC-AgYYD_SBO%9oB`%G@8n
z^Z}MvrUfTt&g(ocby3#bZ7-&5dgk5QJ6JXczohGp-&HBT^t7bDy&mkB?5pnKOJZ3$
zek<{tdHihyH@#ndT+5O^udFC}dh<Vaob!6qKfXBP`ExJp$k`hmc=o0*j(GgfUmQ`D
zf8CRVgRd8WB;n5ou<sEbr9}|L3%?OcdEs{^!0$`I|CI#zH3|4^O@MELalPdE96EX7
zm%+GR`1J|!_oGw2>Mu*c|EK6kuloN;z<+51y=Euie|ZA_1qt|PC8&R9g8uzy0(rs-
z`0q^6k9Gn+gA?@E>ID3Y6YyD*06#AQpMDAC{AmI{7bmFSkiZ`fB&eU9p#Izh?H-??
zU!F_g54{uUJ1hbJ)CBlF3FMrgfd9P-<asy&|KB9Q`xCVL>je0pB=F~>67WAff&5ts
z@XsaS^UnnB-kQJ;rzha^%LID;0({c(+x_RO1b+1dDtg74-U<AtC4s(wNFe9Q3Gg!#
z*x_#p+Fg-=&ru2Fc`t!p?<8o~pMZa50{n~w?M_XApOOIoRRVbyB#3W|63Fvv0)H5p
z0AHK{|6&4pu1wJG-URrQ66l+fzz<JNQ2$8uV>*7j|Ex~HXJ7(8S0|`{W&-?Q65z)t
z(D$MQ_Ph~-dyTKl67VTY;Q#j|(CeZE_0K}P>D`S6cyIzaM<u8~IzhV=65#y_@JvSg
z`HuIk8a`D>rSttfVTZ$g#YwpWe*xs<Pc8I10rekH^~XuH?_-7c<w-soe<a2W>z66~
z09F5X)ox}`)<-q*Gfa)I^pgV$BOgIkzgY3nawb7e%70-uKEF`)0}8L}Z-bod_m0eX
z{_7P!>(qF>59@-G%4t)|LtG1FXP=qnE6N`;vS?aqdFhN<6`|7dF(Zf1o;SC2%#^9K
zOMOK}Gv>^jTT~GOK~a&fXvF-YQL5hX*;6ViN-HF}Co$U@9V#!KGAFOJB2+$ap^unL
z%1cYd%cZ5Ef+-cDV0rnxa)B5*W%lfOCE`_*4F_$ALXb|kdr9TolF+Pqb49(p()qJW
zO2H{N6e^!JwGy>ix3IMIy4-HuQAyU8ZI#X`DO;#mkDaUKzhp}2;#qU2lq*rul*zk7
z$`;4SWjlOc<=l{BSU9h2IFuPaZ|?M2Gb+ocxbmX5&=>m6nma?Va~Twt&nqh}56z-Y
z<21f%N_jVoD`s%+G~Hxn#ms}-jA!JMiH4;$MlYOOGP8W%+*vocy&@#Me99b&D3}5(
z%)G3$d~WG%!*sb7Afz{yg=V6grqP|?OGTyS<#Xp1p&zBvvn$XQtX2`4R&v^DtaYBR
zXu_DQF3-+_a8*@Prp}t5oqayu=}$$Wg=M8hv*u2p7o=WQQ;JGp$D+!)rB!96C81Ji
zJK^%tp`o+pRTNE~Qc-$t{@huiyzJn7)SgjQRRjZ8%$qx9_N>stqWKV~dqZcV(TZ6f
zZB3XqyO2!S<WzY0Tr?ainF$V~LnMNd^A;4%E}c6gG?O~cD=)mPMAa8=3r8&~ttu%k
z6MZeD%s!_aE-`Bk3P>>g?BJ-&AUt)gs>;sJJ{ua$D#75QFXWeu4rOPT%$!nQ6e^!G
zD^!tR5*$4Re}b?Cb?(M3?`)wo<t?2%&6W1@%MPk}?t+<9LacnS<%SE(O&F7Z5Rp=C
zM&*?9X+`CwvrA!Y$>E2JFMEe=|4>SJ$VjwZHhW4*X>iUQ7<|r;6CnqDrj(VzPC>~;
zbhGGEFof<oZU@hsfgb7BztRX(O0KV*RW70tY(5=Ptb2d|Q2i)Yi3oAQQFG>Skj#<e
zqvHDV&~(BxO5h$MQ2UC;&RsAI9X+bFq7ow$g(Ye*!GX#U%Zf^7UgxWrIcG*Gqfi-q
zDl}c<<hyS=!pmIWJUA$V1_lblF6vKV^;s43N^)|FDk@5*%$@G5KyVDr1{!g{<T^4b
znm%RLY}O{e0`5=}A~Y@JgZv_D!;7a?mVt(|0wxPd5GoJNo907VXx?n!^wN3LeH_mS
z@<3rQp*kHHW2b7CA;x?q2L+xzZ-#G9>6{8^1seR%M@dB)X)6|1pjapgt)Usqpdair
zzhbUZyQpZoX^hacd6gk@WcZ(6GJ78UdX6g-2EC8;Gv@fF-cVXTj{=I2G?gKZ#+y<;
z1Fc}XEcc<!%30HV)1}d-FDRdd=?9K61vu6Coa=-%v}C#NBLId#U`bvmmpxKc#IB{~
zVZ=F8X3b@9%q#Oj*Ah%8gxJIMKi{PV`4<l_I{VDC&m7{rbolV1a}a?AZrnJ$pX(bw
zZd~rg`9;}ho@1iVJUiQ^JlB0wRlx0h7nf}mgB^GKdSZ8w|Md`mQ?O@9s2Be0t={-d
z6?j=k;#n&dzr9h?Q<ReC5Jl5NU?iVZUmsu(_4O58`l5t8lB_LTP5}?r=q30ON10Nw
z%Pnh3N!T`9_W2IOU(y_|%4L0Wm7FD<gc`lUB}wo}!jlm9NRxcak)w0JG#$T{$Id#$
z2VO5AAL9?p|7MgIp0e;zwz<tHXR4{`+s@dIl}|5F$)6-hKOeao%LDig4ZZ`-ruyCm
zJ#nu-aviw$_Weh}*LVH_wNreZ2LCG*eLCt(`Fa}i_42hS`B3cZKWo-uKG?Q5S|R@<
z1$?NlpMrlP;6A?N6ny9HzlWI)@tv$-$v;`}Cw=G`+M|ar3+4RjxM-?)(zYS-CjWFA
z_!$OX=f4m!uD;ISiRWIj{?U0l@$)p2PnAa#UuNKSo=*Jr27W+5%1JZozF={>frnY$
zpG*Uf4s?IA47|?ySvb_dvu*toFz~dI{;>?a>RM4$XyBEJ1%8}?M+dk+#Rk5Qi^chn
zfro>-KV=5KuZ#8hsti0F#{H=_@Q1rtpKqyw_Z#?n15ba^Kg$e!KaIq5m4P?cvkw?}
zuGRI=qXwR57W${bz#pZNcs3dMqYZqEf&YntZ#D3Gt;gc62L2eMew%@>Fz~j4Ki0r^
z82IB1e5ZleXKbQ4H|Dp;8}(BS{7(&hnt>l=;0GA^bOWDm;7>5{nFjtu1D|E!PcraB
z4Sa@y4;c894ZLOGPciU?2L5LTew=~NH1Nd+ez1X`Y2Z&a@MQ-6Gy`8{;7>R3)dv0y
z1HaV3pK0Lh4LtWs^v^N_pQVv_t}^i12L1s9f3|^t)WDx(;2R7)XKww|WZ;KrB%Un>
z{yYQUYT(Z|@LLW1Py^p);BySTZQy@y;5!Wb1qQy;zz;L<zKdi2&pl}UlWO2E(nvhh
z41B=A4>0h#20q=uUu@tr4g7EepJm|l4E#_7A2jd*1AmEuw+#HH2ENe1TLyldfzLPa
z#Rh(afuCvMFEj9E2Hre)tupW<jr!FF{&EAq)WBb1;Oh;1p@Cm!;IB0Bs|-B%3iZze
z27a_g;`yk7A7kJf47@zymW53Q{wkw>i-BKY;9Cv+)dqg6f!BM3EN(OK<Bj^ZfuCsL
zI}H3J1K(-jCmVR*@R<K!W8hN_e35}qGw{U*et?0WV&KyaylhDpW*YbsqkfiwpJw2P
z8u(HJA29IK4ZLOGXBzlI1Anc7A7|jNGw{U*o@dhfXQqL#(nvh5@a|MAno>9LM4wgL
z6iTsqpk{?PrEZj!x`y10qOOx`@Yg>mfH%TILZ__@KPTTnm?@;wD&TU$JqR}m_*%j}
z2|p^}sf3v-I;#XcfiP1<r(VFL2tx>;Q!U^TgqadLWdgpKa4O+q0sovZQ$VLsz-JR?
z>gNOmd@5n4d`^~tPbAD#&q){Xv4ol8IcWkuf-qA%$0y)EgqhMg9p3<Pau32x<(xJF
ze|a2Wrf^QHfcFw+>gF^F_#?th*_=lO{2pPZYR)PFzeSiSno}>}R|qpTbE*aWEMcZ(
zPMLr=5N0ao6brbKFjFw6P{5B9W{Twm1pFXjrdCdtfd5RGDV38h;6D;(D&?dJ_zuEM
zp&Xxpe?^$7lhg4v^}m^LI^i|}-$0nDlG7^Sa>6GPZW8ddgij*;sDP&uW=iC&67U4V
zOog0!0gobl3gK!2k08v{$0-x=#e_2n7Yq33gqi9%g#tdCFjE{SAmCF8GqrKD1biZ4
zrZi5vfR822RK`gY@DYTW!Z<zw_aV&G#pyU8`kydU7N<?XUmgpXsfyDo;Jt*IqBu<g
z{)jMB6X#I@zekuUiL*+;ZxLoH;?xWH6~atGoN56-OPHyLQzqaIgqd<U#R6_5%v8fE
z6!7DOnPNBr0Y6BXsfCjz;6D?-kZ`(y|45iAhLa}XI|wtiaC`#(6=9|nPRCcG{|N^O
zw+Z+L!b~BYRsok2zL;>6fUhMyobaOpo=TXhg0o7%69_X!aOwp-itr_bs|7rQFjE4j
zOu!ctwg?vs_~(R~0yu>NKAZ3e!T|xFN|;l=lO^C233IA<(gl1hVNUT*nt+cW%&Fb+
z3AhhoPU%j^m!kg(b1HY*1pMVOfH{RbtpeUlm{YgYB;bz-bINue74Un6IaNEW1pF3Z
zPSH-ifL|fZsoAL(@Uw(D^*Ut&-awdBu2U@FM#7wGok9UWPIv<0fPfz)%qiE&67Zi1
zbE<XH1^h?CoMN3c0pCHGQ>)_>@UIASN_9H+i~c8EOt?+JHxT9&>a+^DoG_<Or%Axq
z5-uV9sDP&u=2Ypd67U4VoFbii0goa)op7~)M-ZMtxJ<wo6Xpcu6btz0gl7>h6!6)E
zuO%E1@Z(3F=$l>-Sp1A$*MJ*~_GjqYYkjNV?V0PlR{Up0nuFQTSdnLQ8=z#@+EqmX
zpB33?MN;Sw_Ujl5m=)7kxAw{Pp(2S;Sw40un&RSs&;FaP*>Gs$nr%~1bBV5L^-Wwe
zP>7mW>zZZXQnPv#YM!BM9)M;E<-7tld+M4;Bqy%fG#WLx!B#@|)4ru<*_EjIh_1Oz
z$l0W<WJPx7u4aa?_95(2*$X?ny2edd{c~_*g<y6QEE%h_igm`K&iHFqk4{kMLDtDe
zoub@vtIJd+lM9%x{XbUfi%PlUb0@5>OHk_p*81QJ)S5VM^@9m2{gsv0qtdw5&8XrE
zJ3(p4=I&>myHN*n?@GYuFRZf=btrdIiZnoXx$k48F_egM|1?3Zds*vD)`Hv@B&hUf
zRyqWgVDE7W>fFOR|NdOaJ>RXPdQ5iC+P|^Rdenj3R&-d`(I@)s6Jgnj>#8pg_?!i=
z`;Z(~bAaVuoqJUy-*U(1j>(UFI(GD0=3bGn*ZmQ^v$tDIc2^;}n`T9mPv#_UMQSpb
z?PWLFw}YYmJQ`_y5LF}V*V1LOo7OQmv_D0mRktR?CU7ePQ(p8=0R(Gbxp6K_>U*M@
zckExHVq{x(Q{!eZ5iHh$y)V2m$$nJuxJ}eVg*W3W*ms~bdRs;ZnnR%zUs#F4oE4(M
zo9<Xg!GhWUd5V$3{s_XXV?1lF$=Hp@TaDtigReWp>!<3~_9GzWHsD%;uh}<u-t<{p
z0=<0G^CE#{L2pI!GScjJw6xEkcE;!rV8hj{yU*Y6^b8gbkW`?sl1bW}m%(|=*VwyP
zuCE~SL1d5p!lzwb;m)p*KWtA4cOJ7~S69a7wtl(3>Ha;hG*%s!>)Y#3OYwnq%rXr<
zz#=a)JHy)NpU^*Jx<B*Tz5ae<GE(hoIF1!9$Vh`da(kQBv`badfW}F>06j%R+apl4
zj(f^>I#gNbYhowoWfa=|1c*8W$JspqT9FM_BsnAeo*#0dNo)N_J*+R@3cuIWU(*vg
zE2W+AZy&_Zd=@Hng}+SN>mT2;*FRxDtTNPJ^AZwWRk%I;Wskl7Nr#<{`l`}4R2rA;
zuXzXQy{ffq^<s#|dPm55>%puI^@>v);6T2$)hLd<nY)@Z-rzSPd)F=^A}=Eq48EN1
zpF2AvwIDhWX6oNHJ<mTc<xnAB+1qSj$l(A+)+Ilc-3923G*R@;@mqI^=1(TP&;P;3
z>HhIsQ2r9=GhYffC*_Bm(tz7M-9M=X`1RBMWzT&)J=g!uR$w=Q;u$<&obI2{gy##O
zn)D1xGl_j3*k}Alug~=l4s_(@yi~q2sblabIdAxD8X!MR29nH=V8)YAgAi8q!VJ70
z|1-XqwC?qv+7#Xoi>4j++54jQ=P=wR{BDGSY*22-I}OwU1D*Say27N;H1f1c_M&ia
zxT%Nel>M;#sT&N^4wP<Dq@5tW@L!?RC7o!~iY7x0(a=^j^eW!jyRV?3mr=WaeeL$D
z3nDx1@MhdkYFaoTvaPW~d)Yd65`x7pI6ClkE<Q!#p9jA2PcD`oiP}#BYgfP|(I><h
z&5vxsgm3QxrXcc#J#jS~YJ33W!@qPr)>=gV{%L`1708Exw9l3dII{C2&17(|V9;eB
zD;W5fo;QHBUG`UNN$X!)MzXK|CE8^a$d7Cy*=#}fq5VFdtkxx({LtQ{$Z`PZM{i8C
z>Ym8pNU{)G_~eiM*^NNdHX%7_{1^1}*Sv?tB5T{ft`-sp4yPK=3+zp4_6cR8zQ3jx
zTrEUR!7=g<1zEedt1CZxQ<??45N)q7ml9neBv^!a^xiy0e#yb)Hwkjoa_N^SdOs>R
zwm^_roIy97wU5o+m}>tzN|A3$6&Y4d#wzM{gTO695QuDMgqUg1Bqm~8=;zMvtahPk
zE3(y!>=7+k>+RmOQmV3&RrgQ^uhNrIrd78hqY!{S=o2}Kq@&ha>YE?gY=5~&A~OZD
z3Ds<c$r2d2ko|_j3=o)PU^XetP=WDf`RvCPCg5UL5feQ(!-}rSATN;vHRABoXQx1t
zbv|D)yaS4?<3TM&EwgtE5Qm}!Q|-3_7S!FDQKodMDyX|HqZJXmpspsP15XQZ8VU;{
zi!!nbBCR7MUlc^9Wel~#P3Z*)>Vc7w=gBI4WMp%G<i&!>n+1_E8O3&g2v89DaIJ6E
zPki~|O{w<2kBQi`)&~pZhc~C%?+L^gaLEBcY)Q3W27>nXL0gM8ngn8}LTpU6A0Z-I
z*(t_4rsO6Fh}m#}&mMvWki80R+Lz(a+E?JXaKA1ro2+PsNBtDmFJt}VjQTI3enAxC
z=Z80>7UYb{NcY#Cia8kbQwKQ4d~Iapn}R5;P!Oe6qA)}Df9%IPC>{I(^JW%2PS`9^
z5V<d782}j0THn@#`N&d$!aHI3H1r#XAsx%Ua5>u#WgrKdlu?Mm3~pJRAqQA!D6Jr}
z*NVIoZc3uP?H=h=ZcPTAP4zr^Aw@|Qez7Kl9;Zs0D9z{g$1otgconB)6yo(hyyA_y
zJfM0OAjaVJPPd?mgPJ64jWuMhO)AuTK-I(RGWBYo51ZCDRi050t;wLfTF86o$YexE
z<#48OT73JIQxL+2`3^hLSKAdjx-liyG&TnIRnRE>b&|g(4CfYU0J_A|we0hS!Wi1w
zO{=RZR$j)?bUgQ2e`AM~St-5&17Mn(jA~%5=nC;Hi2OU4{rQOKnse#!mq%;P<tR`F
zgWFn>kL{<S$p}>f4SlGX4i-pT0VT&&?|?Qw1av`-qE7)6CE_8f$shvzv0{E?J%>y&
z>ZQh5bSe^iKeR#Au00J1w$0EoB6{b!S$NV_*cBt9w+X35VhR|n*fVSrue$H>s{4e>
zKZ;@t<9ZdJkvlUU5YO8r3}0*$<+o+n0uPR&L3A--uwuJ-<E!=>yz2gE1IrYFXbA8s
z!1f5CPGvDWoif4)DAYQx{pc8En+HcBBP;SmMm3%gZm9ro%P3<pcva!q^@Na3Js7G)
z3j^A_x~?hu-rZFEzv-q?7)$J?TVNBmod#vuP3$hSn`F1ikpp{ucQ<VXLwK#lE^5L%
zeMpR+$ZZ)7grPUP$mkN-=K#G8oy!_V50OcWViMpJRdv6Td4e7kBp_`iiFOUuqijXs
z0Y9L6oH7d+Q?QT*<;;0)>pvnK9OMDqQSkDBCg}k!$^#x%9?+231CDmOx+aVt5q(1R
zf*#6N-BUjM!>-k!Q`5x9ki<fOO||MU_uE+UEx})|hkB{?rBzp7jq20=gKF?#K0OYT
zQB$&&(^|RD%4w;5$EsV(5Q#*xmlfWEg6Aq5IDc6^XEPO8#!!<V{svRWJ=m@&=y{)r
z_u<*-Lw{{GP>q9h;8Kw+d%Iov5d}l;Xh$Z2s^*r#Z4Lf+jz`!du-rCpF2p#RKG
zH}nYo9A!wvnc=3L+_;tNk84sM6we7wECbwzvXuJ;j+ya<Rwyv$Zs@i#$y)Li2X<vx
zng&s706-N8nT-4kHnK1;&dmNCQ@GvmVON*)G>%NLR6o$02RmJ%3$s7BSD_k3Q!Ac~
z3}*wvXX7VWJ;tU^MDwNSEv_x8m9vsn-gK+LUzkvrqKXwg?3<JmeT_Gxo}{-9$@SS2
zc6W6NgD{SBX+O<6XP{W?SB|lkY~t#F;^f>(xs!9R$t~L0cxdnV)^nd`vvx8Xa86>|
zgAb}ef3e?(GENUvun$4i)i;8Q4(M}!j+6=Ov`HDrf2L&=V+sjo?<$C_$!J9>mR1|_
zXM4(>ZxWbi`v)WLih@~qe<-5~@0Zo(4T0@uXH*5VkrREz)Bt0p9z~q4it{2NBv`;y
z1CtkN3r0T8#eyQbW=AU|z@)mP3}zDj!=TDqjm)!bKLVkpiZQT#sGW|&|G{j9kq$=S
znF^jXAYME1<g0DLKzhPr^CAL9@7y6HpPaBT&V}?pXu=u?_KY?bpNK|<;(42dbyQN(
zP*_k!AJ&6_oLTV3*YB9HL_;vRm`?6O&m(&&{=O4dI+}rSnRr+T(P?;c!Ww{Qf(dKJ
zF7AeX@183DArscO_SBy+a`l|W=F?~}^pxl^m0GZ;WY7J8Ijb6WhU<zsYbJZin6rur
zd(K&%=p6PhYZ#rxt}atd0(^3CsIDb*RDA%(#hWP7S%)A2X%$Jdcc@NfD*_K#&so)q
z^`nZlkOt*q&bsphZzo7JX9XY-+P3hZ6J+5@Cm4!n0w<XI9=A=uw-aRj&z#_{e@iDg
zgZje>Iw&)oAWhgorGMHPxbcZ{n4Axq9IpxT%X-OB%pUB+rG!;Sfa-e;M?ruE5}&Z1
z?sUNzu<q$)s$2}$k1&`pT&H#)t~@dD$}Wnb2ZWnRt<h73c(SL);hCVP8rra}Iw5x=
zS`fz45g3bAm0L6+3f=*;=0yuK3apYv8AF4$&-iN#z_TD}PoodzTunUkE*RA%|1{j$
zr!p-sn&S7J=nHZ@c45*IGclHVnM<&G5fn*lGOB$jShAU5?vhOcyF|_u;ywITvcL9y
z?D}Ggz?w8SyeT<<$rgUI1SdzXGp(da8O6Cv4v>+*_GVNq!2Hgsic?L13(c7Yk&mrt
z74}8mvM>1+3UW63Z{3P{KbkVP>jZ2dg^q<<S&iqrISHnIPSQ<KhwY@IJGh<|Gb<OM
zC;XaRR%EO~J+zXJ#ZH$0mP0U^iiPuiVnNKca{wFm(KQ3h;Cc2DJIQU$z*ZQ|PC=&<
z=pfMfz69u?cDn+y2)v_!Lb`4kI|MHf5&kN@@&U18w(k}qF3K2S{}yjdUZM(9XfIK3
z<Lm|E?ZOOuu0o~S)6`p;Jwd%y*@fb5jY!7r64|s^-`ZCxARU&l^A(Uq;6ept5;#i%
zLkXOyfB=D`6fl55Du9B>dj;5Jz&_!Z{b+3GaBO?$*A)!&Szl}mzw7tcJj5P?afRQ*
z(BZb5!cFPeZ(Z8UUsH|3+`2ItN#XrHtddu9m*yRXoxF((+atWcXAFCu!uAaBPmW>n
zJtKlca(I7A47(eKCf}6s{$4R`xx)4e@9!PMj#Sv*;r*#GY^K7dhW8&5!}e6zL&E#}
z#IWz6_t{#X@cu(%SdKYj4-N0{8^bat5!*Ms|F9T#p28j$-hX%uyBk(!T^H5(2{y|b
zcnL@BPvu5ha#wd9iVgY~S9hSKAgLL-+!~R{VLT(B0{BEm6YAroITu?pR=62^>{D0|
zd+bxN|2_*l>Qk`SzCC5x3#>N-L<dq<;N3s?(vDzGOZmp6mch^Fyy&m7u`0k2OTW18
z+;pqt^YFVjh1aL!EFyPl!4O|v8TLK4phjNO2RMR2xkC5w*YH48pwXQgjiZfa8jb$c
zXdF{4(P(t2MyL2|uGMJts7AwuMr$;>RHJ+QYcAAi^r=Rt`fE<sXmqMZAL6e$T%*ye
z8r{cV^92vss1CYSqYw4hv}rW@Ripd*Yo5_)bgV`n=C65Np_>j5e~F%yCH|V_0^MdU
zgPBr%&TRl2_p*D%ovVSkbJRTuHD>ws8Ub#|$o9bwa|Bj_vH>pRb0XMaZcN8sG0tO&
z5T^xp18U3iSuEej#&9C!a~{D3pT<uK_9eKA;AaHkT0ZA?f}a!YL-1yTUl2TmU<JWW
zf~f?j5!_FZ!Pgl}@JoWd2wp;P8RT+?V7sz$Cf;K5IcE^piE{=gh_}@@L2@bXJWN^8
zgo#jg*J?g;R~Lg{BVU~vC~f?TZSd^Fc@GErjRy!KGWeWV34TqGHgh%+{DvS#oX`0u
zLGD1yhVBQ59Xrlh7(8-}t;VNhe5;9aR-ib)uC{tt5>B^j3-RFW&lN|m|4D4>H69l?
z^%S5G3t69i(RNN}62Ku>Qo#k1s<OrTJ5T?j!zS-Pfy+~W&E?RDxkv}-Ai)qUIPF#2
zc$kUNb)1O4hh?EM8XFq77`5MeOZRRR6POhpa){vhc_>FrF^$tG$x*1}yoQXfaUAR{
zZ7*lZ=9H5=Pw=62pIy&NkFe6cXu>&EusiJ7FHSgUjXVEs+RHgs)EKr547kQVg$nSP
z$N4$>5!&6P3Vy-@o((%Eu&X2F{xEi<8^`kPUa>DdY?P{eP9xn7%k>jry~sxArw|BQ
zog<1r5B-!{4Uk%01OqwWVAg9qj*KRd<<opC<l8*HU4pmi_0!MP`z5^Zx$a>e=l1~n
z#;V~{eQOD>;|Jw=@6$OJOk?%4|C1ki`5G(oo)zBJQ8*^QxhV}dORVPgt9tO!{OB10
zpZ&8PU0t7IOC}Hg^apSpYn@S#hZXKT+=~2r{&5XZh-X%zev{X+psUT@znQ#|w@PE}
zt!4RGYstrg!TN7{f`Jw3!1>NaR?-&h<!?eqx!exb-28)llX0rb_N#|o`S}UhE9o~D
zg!WvRd-9f)UjUV~PqehI1z(V6g)j0|rnLVSbS|?`NYK?f*|`BUF?rC7{Xp&X-`dqh
zNpr`H#jS?XR^)3~GR?TgkjLF^tM>ECqpaFjLTT%YF(m94-{4*Z<ti8q3x4UhBClGJ
z{uv8!KJ!(Q6<zELO}A>dR|c8vrDCHbSzU^loVq1>kgswAdL)gzC6K*_M#R9|ll_Vn
z`M^F0&Wa;ED-uMen2h@Y!89a@xCLN4Xw<?HHQBe{ukU5}pG?kJ+S|qd+J5CtdIs)e
zFb$1Uq>49kE(A-l5eLDnIv#OgLdJ<?2qzwOi;&iEbA{XFGqF{a&hdk1hI%raBFq$v
z&vVstD4+IL*b>e#kV;p%N>`4=?hs`W{lu-V^yrw3%yq?Az%iNN=Ak;aT|ya!IKdic
zMG7(kd1&A23S|u7Zvmt%v;X!Qmu>C$+Iwkx>c=iN^phb7D%Mdm9Gl@!2%3TE05VOd
zqFD@Z)gWXoXkCDop?V=z$B_{a6;XlGvcdqU7*Hzasi(!KSR|=R>8Mnw3dgADI9+L(
zswCL10$~3G-2)>H!U|AC+L)aRVJGV3+9?1#S?hRHjCLxdonW3&Mj2G9qDsZI(@fZC
zD1W&x&%G?W`c+}7yY1%J>D<-8VKK9tvc%boY9_E341+yY9K_KR%9OD}Shv7VS=}9g
zb*i+zge!Eno3=Xe!hZI{=tNlI1X@A*sTv}cqhJL}s>VJ2B@1>Kx=yYu=^U_wMLQJ3
z4l`+oLfT;*Y>>%c+Mya8?5bCU9hTec;5%Zp8smS-?iSREvj>fzz#c!+fAT<=>}3r0
z6D>yP5f;HS&WUI-nt_#gu<3f#DuqmDjRyeOQ_#z><?*y7Rfu`i<*+5S+=%f4J4%0&
zJ1IZJpR!&Sc3f#MfMfmt=}(i<!#~iU<T=<6@u$pI<xlhASl#@I_v7*-pYwKHZshf`
zqp>%QE%|3KnIQNu7Gc4kn*AS48ZzDoYyVR@(W?7xHM(bCF3v%4>4Ec*aqa<hzh0Fe
zhc_o-b_su-Rry?QcoQxa;p_uj6{)@fNYj=aV2V?D59X5r#_s#{dhBqwABK4_U))3L
zN)KHnS_wuv_g(BCm;97qj@Pv0r~GHG$63Z3Jg<m2d6D<lK7vd@+=Xjk7$XOqS-_F)
zH(x-$8172P83@42hrwT|03jhE<n-5`h<Vl-g+C~bjkmF!TGw&6oSBOKHe9Np?ry>b
zbxYfLrV-hgANeT1ZZ&V0jf}KvKdbwf?}j8CqT@nYC)<nmu~J0+@CF2@rL1j5N7#7H
zl&>9l&62O3cpb{uoGq1Guu@FVU(&>D8u>Z<EAP)=vVolQm#pVwWX`LVH)9gUHj3QS
z|K&@d3vWurozRJsofpvcar)$5m5YSobEFMx*T7uVE<f^`6?qQor5jNNlL*!|*MMBy
z?Qt5{p|-!K5|&-t+8e1>WG64H4AT`h`)jU588#paB2FXiFD|qcMBX2XtwCn(c0bzA
zEd&PlJ?sNuz>}!<b7fxQKXU^*^;q=Oq5J&Tw4ej=OuH;u0hc9n+om9ApTFi-w1J|Y
zs(3sOuEswDHw4X<>-}do{qhN1UE`ruMqm3$G>v+AZU2*Ohm%Bq-X3`ydfBI7U?4j8
zuS*WBZ|h|(%dbnWvFfnJ)r>~a$IaE9T&D0f{9&gxxD#i3SjJ>Gt#^9z(g5~}N}vcv
z@?ZJ>uNSH59{UkIqbcKei%l`uv++@PU4q`k{bZCQa3jPGK=3<P9!i~o-b>eG0vEow
zHP(YFZUMOfbi&8mH`>ebZsgM<A3T6={R~cRABIp|+f+4`mVW3CI_n;L-CY7Vo$t#8
zZm0b>i913#P3|P88hB|Zv0`mZ$3dD{q`)V>#9-3(#<}$xx4NpSHSsH`*M3W5XLo!f
z@i*&uf<wFfj}BVUIDdgPMWp)n`Cr5+{89OmNDP0({GW=6>U+-ru=Wp(*9Xu4ZocT|
zLEYy6XZ8I5{6X>h|1*2sjn4dm_BeR{`YwCSd`8*hk>|YZA@eZz{x)(tj47E%<E|U>
zkOb=moSbse@Wa;&51?D1^p9RI1d#tB5j65YW4-Xp7NP9D_6B%?STB4_zL!Zo@ds-Y
zcfEksLAUiny2p9}^MkQo_|AMU4I|_`^F0ouZ^`$rNAJOGKR(}c^X4+<%~c1j9|Fz7
zbSv#!;V^n0{?_%@|H^(){5#iMG+|t-Y+6gF%%WRXac&rn?uWJhoAuUr*(+^}u-D!8
z)$loOugI5?j*YSW$BOpGWb$fP;Q)LZDQ9$Q=<rC(M58>X=S2UV>mw~#{8dA2u0t-k
zF*TIBPpyM;j`rW#WYlk%hWXZnXLo}u;$iS{_liWG`5iFQn3;5UjCl?UEc>vMeh;Qj
zV@`r+e?H?9opHFFhjd4kdu|=r|Ad9m8^>^qU|ejWG6Hk;On2RzhV|Znb>fp+SofxL
zy(h+|JKnSCw_Nk4@|XJq#ddU)=!E<2SI`@Y_dn|7DV~gLAUt^cgW-(94!U^ip!Y_y
zl%nx_Ag^KUK*98GdmvZF_CVBr50VwN4^kWu>-+Dy4|4BDH8?-o3_aaGb*_s8>z#?+
z;LGA+IOA7VbMKD+NI|1Fp@WBKSvk%4ocKwo$F#(`HjW@+{e7@I>f0A+y7zz(_UBkM
z97KjmPAPH=3YLer(8pA!0Y?jn{e|iNH#pf~PAL;x3+M??V*~FQE$l1c0_cR}xFVLb
zn%-R~(7Ow<eFc7IkNYd%e_tVegXn^L>={s1jn^gL_<!c}jgIWQadP#rciue#-}=-)
z^3J*z?#Q+wNyS(fdDyxpVy=+_z9S&3IpuK>Jb<pSPX=qbc}vQ5)kOiU@6qG!|4{8z
z54&D4_eZuiHlT{j_fhigCHcO;ju7~cK-~dHf$t0JHD3~~D<j`S$hSf99XmSmI$Z+m
zRin?)4cSfA!+t87JU#N!+O6b^-u@cNDUHUx&Aw`Hb8F>rxeUcX_N{BBJoDic=!h)>
zf;?}vo!~2`p2J*u$Z=gg<yi>b+n3u%K&<KYTu;lz^lMx?eieC>5h+eI#2foF(Bwrv
z#77Hq*Hj_m+4q65Rr{X;fAHT}aQSOK#V9SPo6YFej8BJ*tXquh-2D8*p-re5?!?Zm
z_*6ow|JHU?1qSC5{6`na1S`(kd-;wF_XD;hXZq-}0p1kHCnJEOcj9t>hm~Z9jvk3U
z;(l^__!00{cD2G^qlwcq{I{Me*z+=Vnq)8b9R0Ot6ORUlS`An<^XH~5$!Gbj)3_^2
zac()E^||rZh!=d=l6pSBmPataQ4nnXRY0wCATUKaa%62+<>6A43?WJYicyZ)8PGG=
z6=WsfsAq@2_8&rw>r%1Ap{8~_8n9MNfx2$8PP;4>w=n&;t`OBWU_Y8}a0x33DQFlO
zK;ENaflI@u4blp7_8_LbBv80t@2<|c+fjR+5M<L#y-35gK@1WVAE~e05kCPAA$G)J
zY%7|K{cE-r`G^<mUth<@@G&|H;rtVajLPn^&k-R~(_piw`6qYnEwEgET@bz?Hh(#H
zaNFHp2QN}BpkqbsLGV{il#3={IhLe}FH7J)1fo?c<=bg5(bPQ^?Qk>z>AtbgkCFCt
zRzkQ~yv6}rucc@NUFO`4Q$)hTA%F}xyMdAM0{Grbhof^loNK@UyCm(!{Bx<+GxA0I
z?T`nbPI4|oZ>{?Sy<_gvth%|qp9G5lwm)Mx2yrgR&-n&@ash!zd-vYU5Ov~uub-&7
z(_V<7<@OK9=C{U-FL#`Pjj1EnwJ{FN1u5^_9E~2wa`~(^XuGxvAGGk-{1!c9)m;Y-
zF^W&KLoC9u#HNT9NzPbsksOgYz4wj8mf2-iWO(Y9JRFQ36n*@)rP80&Mi`<{9`Yz2
zs<4-~(+9~Ey+j{kC+#U<KYd{ThUOwRdRlB><Ck!SGbTd|D{?_W^yJI%RuE}Z<7<~5
zUz;E=^QqW4|05L=ADZBT1H;Q-`)9sHuEW{SSw3fYmxyovC;MkC9i8T!29W;LFm~>B
zbLTCXn=UMdSoy~G6MSj-R&0D!eJ5U<Q{Kj3ewXH!l(+C!jw(;#$ph~#DbL^ycdnn(
zQQil_b^LB_`y}Y*ET!+seSpYLIjWu$y^3=t!ydvIH?s%Ckh;LRl(Gss_+fGe9ML%!
zDB)i*U#9+bw~WU5BIdtv=jHxex8NuppHQ8N^L^!~)x);E1rhx<>rvG3)hRgaX5^o!
z^WExUt7Z9QoRT(t=}~@{EPtBi`#s7R8uVXyl#e&)J3Y#CWcl0V|G7u`F^V2{2pT@~
zDBr(L=yyEJKk+EvCd<EO`96>G)w29<w!g=t{4QC(nDn1|lrNO!LjDeq^6|3#Y0~fY
zD9=&!EdSV}{1`=lB+CEeQNDkx(Eo1S=4p7|qkNkzUyM%>HoWaozFL+~hC?^3@hHDb
zmJd4$<^T35Unt9u$F;47O&;arW%<|GHg9;{qdZ5JzfJw0@F+h<mOoAX9gp(;Z^ra*
z@hIOW%Z2{iJjz$ga-n~NNBLc{T<HI<NBKfoF7$8mC?7A&h5oG`<vFT-`ro4-<;SS@
zk3f05NBRCYr2q5M&a)on+hqCI?7!_E<*Q}6uz#aR`CYPH*#CWx@`bWo_}>PP^6|1<
z*#A|J@*G({nfgEOQGSdpA4dH@^eEr|dQAT<9_8C)xzPVjkMh;BT<HI#NBLc{T<E{k
zqkN$(7y7UDC?7A&h5j#kl;_Cu<5QvE-#yBYk>y`=eDC%s-~XD>U&#NQNBK5cF67_g
zQNCK1-_7~qDUb5IWcgx_?_D0{3uU<&KN~&D$IEgteqQq^&ynRq|9^OtA0x|!{<cT?
z{#RrAH+z(CljTDHtsdp8Wx3G*UmoRm$#S88n@9OVSuXTn=TSahmJ9t~@+i+y?e~Ix
zAMq$ZMzzoJ{gFrc{#S(lV*EVgQNB%<i}CZ8NBL@5F6_VBqx>#eF6{rFNBKfoF8pu3
zNBMYJF6{q`M|qAczq=Rod(5N!7+JoU`hVb2zW?Qz{+m6@x5;v${~I3Vt7W;+|DPV^
zcgb?0|2rP#3uU>`|7nl%@v>a#|AI$(4$DKO4G)`TxOdf0($yl*#GV7UB_?i^tNUgD
z6j^=E8J}a_!Sy~;(#}@3!k_q<&P8J@{69R7T(s*mRofmX=tFgyc8#VT<Uu=;v_(ih
z<qpa~)%GlnLEE!IFLqIOKC0V}?V=n<`U_ZF>`d%<Ldt%-mIm|}>h@Q<^fB4DLnfz0
z*S1%9&>wh7>0abPzdW9PlA_Pn9MS$7O+Uzkp7wNl>e}`|5Bf!#e&-`v|Cs#8#nbQj
zht$8H=1BcD{Ynq|?U2cN|5I(h6^i~!gZ{vaT7M7v<)lAJ*S04q`j<6F>aXbsdC*TJ
z{k^)jJ<vn`MVfx+-#zq?mwyM`f%`+*nj`hs^ea8+w?ihUr><?UQ1sVp4CoKMp!N5l
zUrzery0$$@(SNc}S(N%~`avG_6G^{O*R}_G$iGO_?|j%p|9JU#z#XW6KP?gU*Yqnr
z=(j^A=Y6;R6^j04d^|#r-viHU{XOWHlm1>^+n%K8&GENJ(+~2XpGf-H_#3F`uiPJN
zf03r&`H+YHq(4d5ws*iC4$@!Kuk@hb4w+2-6@8rk&uRTV=$Dh;&|lHV>96SrdC*TJ
zy`jINkJDe%@BEvG{-ihbhdWUJ+xN!occG?V=|R68GC3u>w!K2pUunet1J7#xJ?NK{
z{v=)7o}}na|68N!2YJv>Bz?^P1}b{f{);sI&Idj8Cw<KRJKzon>96TmdeCo&Os4*d
zK2HB<wEiCS%SmtOuju3S*Ytxt=qHli&|lHV>96T`{-1~bq&M`3J5c{CJB<F<^ea8+
zw?ii9BwgEHq3BKd5433gJ?NK{J|_PpMgQ_AvG&(!`avG_6G?xsu5Axg^yc_kr0I7)
z;GsY1W8-HB+~FYoHT_Bt`t6X()L+rZ>EEpN_n==+dP9FjAE&>jALKzlk@SZCiat(%
zO~3Q69{Q8s&>!wV{rTlTI(}cM=~sHtZ--3Iy}Gu&LeZP?<G>cJzX$zt(#PV*Bt>uf
z-x^In$b)_&>0|ykP|=(AU!>`G-tVD5>0|ca0e3h^e@(yAgMK??GWA#Var$r8`g_nX
zC%vJ+qL0&G(+~2XpGbN`e?=dszoy^$7Z3eOZ|Dzqp#IsrjsDm4D?RA9Lnf!Eu5GVS
z^q=gB(I42P_4lA(PWp|ywmnJFKlYy({TfX_$b)_&>BG9VJy6kKuOkBTmqnU>=Y1af
zlYY3aZSR0P9HhUdU+F==9Wt5vEBZM7H){Po=$Dh;&|lHV>96SrdC*TJy`jINkJDe%
z@4VMTf6^QJ!yTyqV;{xrf1##d=|R68GC5&g+g_pQuh$WQ`R@j;zX$zt(ht|Q?MaH>
z^uINlevk+KMAFCnZ=j+#?Y~IV@BFie{-lrDe+S&*ApJG{N)P(&kjd0v(Z}h(UhD5c
zznt`j{)#?Me@#EggMK3E4gD2;oc@}A=RF?!lituD?m+#ow~hYS^ea8+w?igpxUOxl
zQ1qt!2b#409`wseACrHQqW{E+wZBHw5AvX&NcxSswmneMo8xDZrr){BLx0l8#?KD8
z!$JCM`jsB^+aZ&wzoL)Rf1TFfgMK;b4gD2;oc@}AkO%!l(i{3K`Z)bH{mzvh`jg(!
zAMQZ?KlvbL{|hz!N)P(&kjdGoYuhUny%|3atkwE^&@U%_EPhN<^rru<(e#5n=qHjs
z=6?efy=nhNnttb>JoG1h%>Fyz4hQM4=~sHtZ--2#{)#?M|EIP79`wseZ|JY+<Mh|`
zgFNUblHSl?(Z}hp>381kp+D)VzdX;a9yS=YeWBhB4cNHbSdZH^sp8g<y#bDbJ3E8*
z9im>SC+-kMT&^UMcZfDaQo(^Al`yUp)~ovgU-1Hcc=yn_i#wUPxHAyq-~*C_cx58f
zy$I;9eV9k(;Y~y3osf;|0)XwCR^W3dkxllcU3_llEu1tf{46i}@RC62DBdc28Rz+X
z|87NEtS?$|bvFg4{U2MnuYcke9{hj6tNZ7pQ7ek;XM%ya?shH|$Oo9!SdDvNYg>=x
zH!gpe-<+Jmt8uxJNf~J)acyQmKCX<U=i@#}CIs^3ZUKeQz7#%XPX!0w$VlOzBPDx&
zqB7@A+_HEbHe<_ALj&GI7y=4;o-XcB;1+r(*csNT5#L1(eQNQgV*PLx#b5I=YFKq^
zGOF>!XH?Z!*uQH3A6tJ74e;Jbnf>e^@v-8~_C_Fh=io!!6~aAAe0EP#uEn{I&;A|c
zK&3yaN~epPMd7X_T<NUI;LRJn@%1)5_>Nb8RE=0GdZ$3+gXH+s?jiD<-&eB)H=wIU
z1=W(`u@pG<dZ&7|e+Og1x$^DZkih;Be<B<0G)lAGUrSTiJpu6Eo(;h8iz6xNl1;3l
zTtnSwK;M<&E7ouot@~9%h}DGZ;^q{4d;q^U40Z8QrYZ{^i|bo887-*sgYOsap}@G#
zavuEgM_wdsMZ5N!ckv>jv*#PwVmo_gqn(TgY*DixxPrR-ga&dD+<?lsz+zu~Knwyp
zZCc}9DfmFiE%+m@&q#&stc`R!+<Mt#2f@yY;u~AhTz+^H?qhujue@kEuyw&Kt2yYS
zn@1+&`pujH05ee!Uqiqvk&$5fmSC0~5@a<2O2<oNFQY^o?MJ^dBpP)XBud2}Q=&g@
zP!jF5|G~BolIUSLuf;p4%{_%cybp981R8D%G(3IFa95z6_A}7U-UY&VsYWTOjy0w7
zQL2`{@lvJ6q&gl_Ii>7Q+?v8j-rRol!9u1|NDSf*(ZU{kCXK5{zDND7qQ~~wO``rz
zdnoF+|Ec{R-k0I+DS6*A%NYF#FBo0nn=*Z&Gp*>vbY5~|baby+x&bnaqreEb_TPkX
zxX-h>pspm{KK&1nEjl_gKZ4Nc@cRQlv^!BHIywy>W53sm4$tHV_cN_VD9xXntic=b
zN%o}q<72lBNmTNCL4>dK#d!J9iuUR%#4WrrybgD61}j{Q3T9CC*IW#C_I4=k4CPy{
zblmp7@-o^hs88!M47+jiVPPU0m+<US<OeIiCCE0~cR*@qABF>O=!G{V!GHGvWph06
zv;CM}vaHBrJ5ssJij3VF8QEq<$8NPEt>SBclUl75ao;i$?6ks*ZC~XX_-C=^K89;u
zQTX`!#zAoSwIaUN4Z9Veg|o}y3JtV~^Aw7#XwbImtmNoeu&wNj26sWk&*q^3pGag`
zq%d7Pqk(i7wDMGNPm6Is-r$~3?)^1)T;mHb-s-EwEi-T-bu@rm?!m1tTvsmymyHaS
z=t4fXT1mR?0MX58L`m8lu=w7<KhJEWc}8>kg?(Bgm75}Co0^08w+m3YIcW3Q9Bh-1
zU3@^I7HOc7kGcl>DGIcK?!N^pTnXz-15FmUfP*bU{0P+u+EHJ0?54FnKm<cwbti%v
zWo?NR!eN3f(LgG``F(_a_iy1<tdW5l7QbW>lrTlgYmbPAqa!!vM?S&#0f{gAvg(Eu
zpk+9KP|V%~=Y}HeBtclsX#tKdzUR5wuBPwHX3;$~FeOZ5_YAV?4jX(sLfj&%TQ}r*
zvaK67h<ZH(QJRCo5RSOSr|6<%cUkM->tWRm6nzCSnpY2p*t8bc5UE2Ml;Jfx*^I89
z)EdQS1{YpoKX!-Gbtvj-UBP5beKHh&|L@%tCR0c@5eKfW!rx#_D22npUHF0tSW75<
zL2#G<*3rri!h_bg;j1;8?L}e~3Dd&{nMyd0d0~T-F{?Lnl1EF&u%)_TXW?OQrikEf
zvdWHQm8Do^X;yFj2CRN9S-pr90<1p8!@eP&6~2MH+5X^nX3yZ;wS~zBXjZ==E9M)J
zDlAz&MOJ5lmGV^Q6&6dnOb*KR@AXtY!%iL`QRu9`qO+`MnHD&pi`X&fJ&1G0Z18e>
zE=#i$ql?@!qX;JJ-|cB70Yc3l-5W-90UkEJz<#b?_426OspEQdZR*3aYawObke<{>
zJMfvXgX+)r0pVa<5}Vd$&)ML=`B-#mG`K@%!eSg_7^h<xcSb5dMVLd}8QGQ>33fym
zcUY7%I@XS&BpU3*H)!D}7gxcdA;+PN1E4Pi{nkh&KBYXi1@ujzZ_SI0T!UcHfqD!I
z97qTXyPxch>~Z97ux*KgfH4Q$>b_*&vyc(=^oQI4aRNwmfCz@sU0HA#86d7p7M+Ed
z7Dy*A<%$|^0?h6aaCB^Iq@L|-M00g2#HTV^!1kMT8JKloGDx?YlbK1mGzy=Y0FV44
zuQk<!U}(1j5!7+7AG1d_CWkFPe9>621Ai8G^!DE}A2LN#R%7HQ1*LJIUv%tiku+_>
z_7kEhcg2@Ow9e?r*CGWN_TgYBd3E-lztz=ksMJkFc(%L+J&-asjz>VK$5@oKAsT!Q
z{wRcs20N?^f-U81UF$Rl0&z+<2ZnZo3r$$f8m{nHxQrk(%-pA&bwp>f;OImJ8O?e1
z(#|ne9~4}h1DS5qx?Oreb08*Cb6}jV(;O(K2f#UmI|jE#UZf|UxZm}}KmVWWiRqxy
zp7?$>Jn?=64(W;Sq8HgdCLsKq9tbm>^6@VmhyXrx7C*`@{CZ4`+gYr^YKUl0SA*L)
zm)ma@H~O^<#Qt>%1u^!54syOT)8+i-1FoEB8l3qJ5cW5iBBY!-<gCV_lNM_xjqW3+
zA%-uh0u8KHsFV$zj-h$GYKR1NLrTEkjXO^wNXd3y2CeGlgsiDMbU0SB9t_s!#j&<b
z))O`B_J!ZVmn_#JD3I@l8NxQ?`_5mbZKTLqCf}Ye-xI{Ji3VHY?4ABw-)Bk)lQ)Y+
zW&?P}A{`?o6T;@;fhW;qbnJmBvO@&e$jkV5D}+JpCfr3qr!!$IQq3bBJQ1b3(>rcL
z*OVZf3zM}MkpcWPlN_QW4-jmmb4r)|&F3yZe~jM=2EP{aJJRLnCxe92YbQXMP)Y0R
z`&|dhFu4AaT%Xofa;89EP1}CC5SfXk^d8J)a8u4$TwkbZKE4PYlLlt6OkenYDVf*b
z21|=^EJ@NiHL?>Hta6!vBm78l1dkPUqIX<-sWN5_ZbhW+&2~ksVipanDl#W<D-K}A
z5wVH`jEYR_+=}>2N3(rGtYVr`k?A=<mnmd~gEZTjzj5{a%Z<?Ufv78Ux+ZMqix5gh
zA`eH?=57x1OX8SCs~+q97q`buKN!cx4n^Z~DCT08qgWz0uEOLhUWK2k(Jk7vXQ3NZ
zw+tnR{lAjpHV0Dag{t-T1xm%V=-5y3)the2!1q@SOHg!hTVKNzs;%c2qpdpC7J5FO
zZ?l-7!@*BQQYzx1n8c&OPm#ymS_Z+idD?%eninh2V_m@TznHbbX+T`agZSqulK28s
zc>DE;EB9%Xo3N+r`fr^_(X_`JK7>VBv;DT#H#S|_LL{T2@X9A*6~Empqp^yvS4IQL
zPz!^TJ)Rt=NZ*(ui?LR*2@7eP1L?xaK+_yZX2-{SB!<A^PSos-`taew5OcNaKYO}i
zF8S8LsiYes_of?miWw3jiqNJU&^~&qh}8_1XUDj4!0KRm9J!sRxwYq!m2igu#$9D!
zg3bKrkbLcZ<U7`mqCHl^Z(+Etw_LP!A}cIRpcMP&3vj8CEA9L5aWy;Hkfr&TkmW(R
zungSzvTs8z#7B45yl9H6*2EaAvkg}FkyVk)iZL5{kcTOuVe+2%oc?5;vBz~r&k)VG
zZ<Q<ZAfufS+xY@MB1Km1&|YRc_-aD4dy@sndtw}i7#uGq$G>#vh*%TH5%VNeqHiN4
zB9CM)85KDwapSltgq;*_o`@0|_vf<<MR_}iv8KQi<&py53hG)Rz;%ZesRSP^0hsLR
z*3>S9ju2|~%_{WX9{j3L`I;=|tF2;-;IWm`f^|dwa06UOuc_X;ReSPlGN4kNp_ryx
zE6$ZdT0>#~3Pp{K64}vWlfyEX!yS?Xgl@K9XPyBr^Dhvh1G7<VMsbCAN*uG9CbP?2
zX2F;^Szy%SijxHnyZ$7V;o_)n$TcR1LtG9$!9m#u+eXrW2*bO~@GetA#sfWOr(PqR
z`!uw2qilsFbwdW5jof#Ow%VWI)M|5~BdTKfuAB`Eej|-%3{aS*uuzI2o*h=*2n=Bu
zG}2%7uZzzF9?L#~XNQPr&|m2QCSyCc)byz4c#31lrCl_cj%T;)-UBNnT&Rh!;{F7#
zM3;ms31`LUo?HD-reS+<DNcrhHfJN*+yy~<0T)Jq3ImeDLc4&&>)A$&>*6*`oB4fI
zN|r!67AOR@?PrRqJv12=ZI$ptrL^cs^u#mRD8Ra`MJ4BY2kh-hZiLytLUuK>Os0@)
zh0=Ayo~^<-{aSZJ`>|eqaH|;v)R4vYS@1x3@qr$pzWAFQ>V>~OD|<JPZF`C!a7SPk
zM<AB<C}MKhA)eBYIP9e*;n~=r?m}qgq;XkH+A~mHOItu`ub{N#;arK<Xf_Vl<jO1g
zDh7Ad61JATCG+Dw*pEj_LEd|oyOuB)S?T1xuU7N^3=M!Ggu^gqk`Nt{X>@;wQ;!Nt
zDUqDBX$*_q35ErG5;|Ts(e05LF*!^fTNXf$>$Dv0W#Qm%`ctPK6}vGi>ZPjgYMsva
z5uHz1z(8ZF^4DGkySl32pW77g5mAdmbCBSxaIupm(sOsiMs%Cx7|`Ka#H3HZcKhkx
zKf1Owb+~UnbogtH9^C7sEinj$E%A)AW#^V)yRW?zf0ML9nQXZR6<ZEicQ*C~pS9AE
z!U62pmatbVlgU8YsY7o~?q2Xz_2PD4<ze{BxRiCAj?T)%H~9h;`kkv))ZGDs6-Hu-
z8{2MsgW-4I&B`CTS%uSy_T<>U6~vQ+7nCm@aE)~1U9OQ%r3Px~`Dvt^!`evasHiXI
zo=SWnvvy_#4n!72y@2%kK6-l4^B8K15fC~8L^Lu3eXIYLfoMn@bgF4kBQG}sH^qYB
z|2Ix}C`ZcXWv<}nrok<h5d0Zhh5_HX0fescRp_RZ*2yMSyp!>daSwhqrX^Jtx{-P)
zoDc!mEeI4E(OJjSLQ8sYhnfPYYb<kfCx&JpEF?PuCeqtH;l-^!kqHVPo`QIwtc6$s
z|L^9KN<4^#5MV@iY2bh%NI;voNYW$-0<^9U%a>g327#%6a7{ee@Qu+Sn0Pw8TP9=(
z%OVK4Q=<q1!X1=jF4?_`I!B(Ki7qkD^n#uEUZh|(%3zRM7SdH@X8ytN-L}(FuKlt%
z+y1Cpckd^{-6UzCnEwnkUE{$RZMH$XuL87xcWM7=(qik&P3)Dg4ZF(qwec~AKQkD<
zMTV1IhSR{XIaq@V`r$l-_h_&Ndx4uDr9Ucd9h|dhmEk7XAgyL(=5}IqOlvABqM)#H
zHdIXCQhO5!!Eq@Ww;vM?E{%?<t&aw)b2e1=mZeGI;L;=%To9}&-&y^%AgIKKmoQ9%
z1|nlNKZ<tIRs`(A!D={itiBp5XhR;Ngo8D4<?4u~#((SahFZ4@b*rsq(Y)IFNbt58
zM@&Aqx$EuK{DM}wh+-O#asBcbVe1@wWtDV-Lqk7DN4|Tf>jdT)|5Z8c{_h*L6L9V(
zdntaTu-H&0D!if+U!>=)+vSj6m8q&+iOh<%dm)%>=ewBg&e831JHcgYMSmy!cT}<L
z$P65Wr6T%cwIb>?`%=KD{V*5o1@K$waSDv>c)jqG&FHlsCj<B78)v>>-O$ygMq0N7
zC`BYdy+i`kJ3#`Jrk72eQvL;Vs4S=Sx329o3>(~dJ!~**v9|ru7%U7!Fdo+H_*yCP
z8sX_$r*J&sC^;L-IWCWgu{6`X(PVkF%W|Wz49Ud-C0Sd`)QL4yNRCqxu$1roZj>9a
z9=pSp+033!WCQ<Pqytu|lu`|O!AcGJ5iwTNW30^B*PE=aby;zZrxv~9O>W7Xu`j$k
zpx1bjvCku2hK1cz8@;*%tqA=44yP)iK7xUB0NI2nk#i{wB5NIiMHW)0nqNxWW8=O!
z65JIHKA$-8#^x^D)iHkRm2POe@pji_ry1RRMH%$G`Ij&mkUr;nB?{wniof>Xs*BV@
zi>Do?yHZRsd1B2V{amICZgp`65-SmPd1#VuMf<nXkZRo!Uf;)<cHL@7hmtWL<B0D&
zbQXO}1>o9q3tgRmRqyI-CWg^@(D~OGOj75?Vt#-uay2P}GsYy^wVivwx!aN)|CFXW
zl!#FzHcP6s;8r*}k_k9_OnDm5lKt+Mxf+YQ?s6?);U5A`!ZhK(<xbTl7<5bkm_JBH
zDids<FIkG*rPKeUTw2yNg}5?_NsI}II>HIL!4QYX#3WgWqzMuuS&HT%S;9QTO)PZw
zArPYd6y~&h=sy#4WN?VBFCj36y`E!$0!D?V@N6Xd;%u3(__nMK6(_QJMJUS$i~Sav
zWEnOBOZ153hWGe#P+G8E+~U~AQnl6Ruc?+6+lm&^1!GN{jAG+vzki$VC(E|oz8We`
zcv=vdAmboBHI27=L_o~4;_#AGta5C3m6HZ6k2&46UWVvMWd|ZZeuJIn;_>%L7Qd|~
z!lfmbnX_XxIZs3?+rSH!3vdAiOYz;+0z)yG$4PDFy~S{7zu2?F;2@cR7<s+_5dcDE
zFmjy)=1*2T3GSj>DnvRA%x~Z|@V7&(<|f7{v8f>n26u?b$Sn}Fix5{108|p|pi@EB
z%>s(`z1>(BQ&z^XUd)b!*w7P)&wARH`Ijh?ykA7DZO1vBBulNt*K?9gxvi}8AnV9&
z1y>B16ed=yFVNT`NPikT0O@s`8-vo}h&shM|2mADJ~%~}%a_eE#M?q<k=wo)b?Htp
z=zd1JS1L7KouZ5HTxS1q2On~Qg}Qp>YFn3yDA`9ge@To@w;|%M`3h9RE|R6rU8$?l
z`&l|f_-mh)1UT30t==e|k|?|(b}mk1Yo;=S;+mP*7%<S00uhe7*VGvkgpgAZg!wkL
zsXKh|$`L~CF;+BX&aYepn>obCvtZ!sL(0JIXLVPdQM0?0n7kmR7(T9JWQlA*?e40Z
zJK%+Y)@+Y~yJ9GEh;P+B*KL;RT`6`5WjZP6sa%~VfWzmz#gO3%rrZ%;8Iju54VerD
z(1@V1TM&U|7xSOFqb_JqY(V}*0YiB<p%RguoC%f4&f~C{lDmDXk`#W`_U&6s9hN)$
z_EjQbv(H>iy*(2KPOH#zU57S#{fFY{-KDnJ!d+|-*=q6IZFfnc#qQE*PBdY;>1iXH
zb(FuUWOp8vzD+}Wi{A-ay<wak6My7V*FQ303O#Tyq4-ypYw;iYUfh-Uy?Uu@oISUC
zavwnM``k{u{JXfPQZoF~w<}}ZZ;x>|4~0IO0q(zZxp#o>TRT>8SF-;k#{C>`?oW{W
z`7ZZ6BzFv4;WFK>(8SQO3p5^XfUoLA&=s!Q90-7;^x@cLo=$yhMsHB#;|GZI!Ch)k
z9V_NtN#Vu2de8sRYkbBHK(RNh&XD!K8b{1`wk!zw!3VayJJ#}Uw7eTFD`w6n*CZW!
z8=x8SM61OeAhA0TGfAv_M3LTvM0`Z1j~QR@=E%`LVvg8fJ`%SUu9%L_c<VYH#%6ps
zBg6-Ikz|w<W7La`Zg&~|R9&}n$7%<64t|Cq6mMRSd65UbSQm5Jz`?eNIw0$El%v@0
z`R}1Z61y)N#cnyZd8`yd5Eikly+lvAI$7y<P%hG@wTjK=Yh5;XkPTtUW}xS>_=1Q2
zcbago7$fuIm<Z=n1j0W=6UFAMS*|AOWJCCWtx0dmaI|aTvsI6!BN@UZFM1EDp2s$I
z!E6zOajj3*0tFIY-jP`oZ&8^Q;_jbM+!fz-vl|M3N{?p1;k}z_P|*fYDbY#bDpI=M
zp+oiSA@N%s^=k4pj-q?TDknM^ish(LapjRSWTCNcGO|$~z>F%y5^ktS{o9Qq?2j{4
z$V73;r*5LEWC2ea5XqI8feX2oI$Qy?Q=nZXoUGQZ8I?`PaS8;yuwt^L{DLLgm~Uji
zIQ5XrMO6XorATY+FsZd&SlA_&Qh<?pO4538W{-Rq+p(1he!)&L6EVR#4qezTG7P4}
zTpV>u%V4tCSJa>nFY-!*i1bjeybo@KJaI;$uCymmAinpNBAH%x3S=GJ#9GE8wquAW
zKUTr}!r{fQ`KovdyA^zqM!aTbLGnn5bUJ>np8;OWoQLVj%+REbGF*Cfie!@fT9C9a
zRvY;kf1IA(du)1t{}Pu64*B$SaIgRY!A&jqI++m7PX#3$G}xiG@u`Wrxp_fw_kzE&
z*v($uLGfRk#hg|ioc_JcVv;NnB>#<BOm2e(xBnl^!oJns@>_iB>U(&LNm49G{u{HH
z+>Q|3{(mrwOg3c_vS`}(GK)!)FG&6yvzXkzDxm;B)+|Q6mZ!l#D&}zdO&oLN&bTz!
zNJ#7a4c$r#6>&~<sa3==2?l~7&QxOJjXy|k|CV@T@@k#h-BcAnF5ckkos4E?N0fuR
zXLK**1}06UrUWr_V$R|u{ZUf*z22Csxii5vyL;teTswC$nbwcvAW+ofViSc(IAWlx
zT%{{-sjk+q;DH63<gulCN$mAz{fd9@Z7JXJBl%R<OGtpt)%V3gs|*N~%M@7ex{ge%
znO|Y1It-rx(B3Jhb~{o}hG+>%wjig0I5)d6=z2~89W9_fC^edJgtIi**0WKaNg0>z
zy77a-C4cbyN3r4;lbGxiDHmf-ZT_22^Kg%)J3T(lvm+0m8#?$?LBYs+^^u@G&xu5u
zCxA1m+~o5_D6W&wmI+8cuY=pk<g=eJ7gV)z>!g9b5<gQSk>QpVl$L2uRjb=TuUG>a
zMgzZL1APu|;8yPjbo+0|7(cwgmE640aSa(ih92!E`I+C&*pFh}$Vsob!%aCrq+@CD
z)9S^ajtcz(f6GHd@pnP!Z2Y}Cbh>{nRZlry$x}DvP>2Bk6n3&NDN>?1=U_;)iTLqT
zMQe(d)mRd&`;fK2_8wgZ(+_%0*p%mskI#2iH*NaVc&Pp)oKdLmYy>fxRr1a^GM{Gq
zsn=Z=^I|McMm^o{tH`3lWl^o#_dhxI)1~zqBCFZBStAr|FG3rmDshS}+t~Mx+x3UW
z+BnH*V;bA=>o(d4k@DaM4%H2~o)$Oue}<24VC-`XQN)6?p&DQeOZ;MkzQc=56x;qM
zs~Mhf{HwP8B1%i@b^AW@k+C2uLKl`13wldcOGInYdWJChx|G39v?-4s4wDw<$U3aj
z?W8PJ(iZ;pIB4HbJ4JgM`D_sa#a+skAN7N3>hMCU+CdRd$LdO{N?oeK{K5sHbI3@6
z{<W<T;I@zypl--0v<XU~g#FJ+dTfjZNn0bxahtLd4x%HV(#a^2kx{EH#TCn3>C0EM
zZHgAbW|cHNKXaVPEnp6v*HFGyiwd_MBS*qklB~mE<00?Kz=<W8z*?dkc;<rLGiCrX
zt}Mh(0KsVWDoBkjl$T^-v;AwZQCAD^tZ)r~3TkP?SK_{py=<}`9miwwA&8@eRYnWT
zx4C=@VtjsP@EJ=!g)X1h4PAbPAfklu$B0iihyx@((j{J|h~?Z#i*B`|<CHwuH`$Ax
z&e^CgOvUEft<vlZs<A55PCQbv(q>-Zul*MqmMjD5SnP;oVN4t%j9ap?Lq$}=r{<%h
z@EmD}-$nAklg}~FrXq_FSKnf^y56V&jvs~!Jo@rV8J!uT{N-*ZWE%RH;jWQ=r1n5(
znUYY2MB$)LSRu70qbdt@1KF|Q^7rfA2F%{Mhz&eGQM)+9lCqW@UhSC>t~^F-emeg&
z($3ek_~32kL^ttp7lc}>tHs6p%owL*3^lvPg45}mQ@fZx-OYiw5!u}6<nonn3L~zK
z6{}k=s>!9nuIj~Iqe4gEZ+WOM{w@gh!r!aocY&nH`ZNC+Ik+XsOZVzL00wE%eeUYw
zuQ^2x2aQ_buMyw30`n=93@YcBNwe1t83G;Mpi(>zo+{5Zosaa0Fg&?xkMLxFt-6Iu
zrb}Qj-i%e@7oHb468pU&1``gTQZ~$U``R4Q+i{u7-ZDPca+z)!X8Z=pA;u!f!!h#H
zj857_@>>n^Y)x*AG-S9{pb_IJZj=UQatrrb?U}zv{E-n97B8GB&{0{3BKTq=m>>DL
zo+2b0BK$qBDbcDa0R!e4_yRrMQZ!KI-d%4)n{C+XXc0Rd?Bh0^!nXNB`~)D4tA(n<
zC*ymlzzWycVntrE>&JkptX3uXh)W7oMS(coC;P#09)3fFzxG&!dZ2L0fZx=lKkdRf
zx)Uzf0hdF{%?TGGA_r$o_-c;ZjpmTOZ8VI~b+yVToXN5mNJ$e5%JnfyGn(8$%FQn2
zNs3bYaCzuLilD(!g4O&OD>E<`k<|?@tF5vhph~lS$w%}gL7WpKz5q4!)G~_1!(8I!
znz*j8OxQq9UO(BXI}U6`Z)hDWIeBY2<v4q3wmZ(i+#P3~=qC{++r_mNir3n%*<uP#
zlQtLaq7Nh(b79e7x7Z`DTf{nw-a1M(-tA)w?YK2&Uxkb8dZeEp-&K{QJvv4^*f7yf
z(jIHj>SOg-?~r=yb*_t;9sM$?cZ`j-{IoW4G*HIfJqBayV8P%izg0|lo>VtvGiY7=
zHQ=CBnWzd++&*E_!~P%$2KouM*rMiaDSY5?^c|uJwv$ta#6&b5=zfZr6BDr=|2Qvb
zW1)oaB9w^!b31_}1|6r;JY%%HaIL2)=CYlawHr9=WEWtp$<cBFTE!QGtb%<Io;d1i
zV|>gGxr%%uE}sR8kM54Eg}>*=Xw5YiuMgTIUD^v2ZM2@vN&&bG$8AV)dJ<KO@{})U
zX{R0v4sNHMG)j;CPZT}<I`0MgHB2ZDO3ou&C7c`~54+4xGMT+`rDjGENM%LEnm%S2
z@ij5{poQvs#6rVlfo~;gwW1tT=rA#Wk-0BHE3%o`FjE~Mw_LC%u}W8AH2IKfvFSE7
zK3!Kx6TuT+6YB&nBsR@-9l`8`&4uWM7MiXLKWa}04QzznBunN11H9ZzooUh6Q>H8K
zbwdW?vZUK5zbT9jwgXyxV?sMCCX_j+rh$ij4usMnqDj%lRN^rIWQM0oJxKi-I#5${
zohJK3%|mqqMQ_nRPNpGqHqH;q{%E%Q3(Jb{5t8FL<yWd1d^<`${)x`IAuq#bu8I#`
z;WpHcd((6j)r#;8_HmUMtC-^O4+@%7oEorVZGv}kJL^}R5LL#}FQ#7(RM7LpgY2@S
z4E-LAjXL@fv!Oc_N(rO6_e8HM=1)v_jd!-eypGHpWVb<-E=R2hz@gdh=y1s^W8~)>
z<hPJ~p(MvLrpq~82_l1uyT7&yG&GhQW0+#YbaFSW8KtVrVv$dc5_vAFMv3zv>fyT?
zSiSuh1`B*qgFIVO15~r-w=LikcH#9&=I4WYgo6hJUm1I%!2?*WKPK5F81?wf5F1r~
zCB^>EK^-5nv)n<2ZyR$@s+c~h`-R%u%l$Pk#k#l}tzJ0FU$Z72CBxY}({yY+_X@*F
zdyjOT)VbGn4VhRn`LwaIi#d-GNqFJo$eP`iyo^z5Y7nz?m1+euySXLfT#n9PgwF`*
zfj3k)rGpy3sLWrp#I>7hl=IbK_4(iqh|z(`c&kW2X$ySNhzTf0yck?F<r5?_0VM%~
zp!hZy{|GcLCS;&wbz_2tc4BCDAOYQK6vH{>M9r)2dV08UR3-C1;rCa)>CW4WOWa{&
z?sy$d)tAJq+#V3LhWF}uyB!N+SRuqKu7dd)ZDj7RnTg@no>UKXCvLrNb>n%ua1$IT
zFhG+?yTz!G-vrAmkI*a!iS7$_wu|rP(Dkp5ufITjV;siyoqF4gljOql4X)CwmQIyQ
z^S%Q1fkqmh^)M_h+`3%|qv8!eXXPs3`gvsR4lQ9ib%C3Q7w<@_JXT}-2&|md@Bv1y
zJ=nK7!On^a;_Wo%Q7pIJb?_rt=ptbHBM(WhYJ>4)rIeSa=)v%2fg$pj`Jhm%OtnkE
z6;Efy9MJ}k?6otRP6RnNREYgCB#4~z$`yNVF4YZLVK(uHSQBD53Ld5=1#lMqE<n-$
zN1@;2Y(>|E-(w#Po}r8&;1UUgnJm9a!W;m$yeJLL={~$#?6Q>pBN}bC(@hdIaiN4I
z8EzIGI^DmgX`kgkwFy@#A{(riKMLbg;UOC|dc!{dxF)pG0h5c$&#-dKL6rsiSyX;z
zpZ}Wm-m*NnQ~H1qKN=WEv14cFg>d_ZZl8X`a<jpz3pS}Jg$;6F_fPj6f2m>3@4@4}
zUI+8oc8S4*-V5tj@<XDXXb{tlEg!r2*b@8@m=1~gln<!0wK>o~_L>%Z?HhYdjlCwj
zuY4f`_<pnfXG_{gY-QQW5DMS86UBnK<XIR3L+n}qlSf$owXax7_`lWvFQ5NkO(*-n
z80*ab{3ey^#UJ`AlX=$+VkWiYv$Y@k@Wo@jDvzQ&EJM&|9>l@A1@q_m;l-aG9xLL6
zsEY=MaVeBEC9MkUe^TTbIP#V&A;{ZfIKIIsxVt?$JV#vF^HpWrmj{*9_$>AJka{z2
z`uXsEL27gkPvPeQcMC}kae%^>BX%75{WxgE$@?Lup+UsQ`nQ_4#FsvAz$pI4g8d^e
ziHK%j557=Zv=y(`0^Q;dQA?c@%td#952gC+CcamH$CN^+CX1eGkv-J{tulpHYUh4A
zy2)Fsdr%@=>_=#|tmpUA>KD3+AFfqucdbTU%Dz%sZN4~>R?ng2$7;0!`<dUPuV(5d
zez;bByKA-R5<{y!xrwy;3?)BStKCS;zK2$K=q7%+R%zX}x<6=Wb&O!l>lJD0qgVKH
zRd1I(T+8qyTrwr0ng|_?QZ_CYBY<28Tbzbr-r4h}Z2RuO4~iv?7-U;9<oZ)#1XTIK
z3x3UK-H<2q44qFFd=XF)D4&t<m?Z|dkv}jn_L`2M%BTth^&d97x6gNDe+AXm--$Gk
zK%`LP!kx$8#G?V$t}Dch=@zoB$-gk>GMNfQ#4b@AORpFPZs8_;Pj$2XTS_Gk2*rYw
z3*FA)h6W`UqP#ApUErVyqbCzWCId42&Snh(HrxNcNW6;-XFNJlr2tEI*F#Tyb$V?T
zbnG_0HwSO4#`l2WOV~x}UjOM$kzhSjkG=j=gZ04Ik!r91|FQQjfK^pz+VI-@+;cmZ
z1V|9c4kQ{RgmZEuL@oh|LIp`hYO8e)NzMtRh9vHDf(OJJR9aJtP`~#7>r7|rICjQf
zzOPI>?I_<^v09-WzYg~6ZF)sMW}2#PQ*Ar8b(;Tq-?jGToFrgJ`~Uw;-AeXeYrX4T
z?|N_RU6;MjJ5Tk8>z@FYt_Q5r_rDZ=N7n<C^)TS*4^_U$D!vDqTafA0JVs#jr?d$@
zLJ9CiN*nRqA8tIg2ZIE?BYxDUeY6(eNBQ8Pt`Fj)z=!YA@n-1p$NV?196$AM&p+{-
z@c5}GUa4O4#qrbH2@U@acY6?vZv5gCujqKyaQur;JYTJye)g4dd}Z^4+UdI{>VRat
z>%)h-K79C|a`feR=GgAJ5>>!PHE`l{82FYvF@DO0ZX9i(GBpB$jdy+I(3TI+SbN#m
zv4=F?_4PwrK63n9V@tnD*bjS)o_Hk)6c)9IH|obf`H4?lMupG*#?Gcrpb<3vm2vt|
z<}4V-zq#b;fv1~bU*lh0^7QeY{r&yplf({vd{VK4df)ZTp)L0u|I%k4{D4hEv5SWB
zu2YA)P7R!x{Q1<BhS&4~JAThS_t>z$JlLlWbv<?SiKhs#>z?th6WE9X|LE)UCrEoo
zFURwE*CWtB{JWjr_+C|44_(6>-yvwZ9fHPRhK_u%K=bepX4m%+$u<C&oOD<L?+Fxb
z`QFP=9B3R@eK~wX*Y`+Qiq?_dyyJ<NR6R_hyRbrZ;!(0>-Vo<W3hh<led@V54+P?f
zFt0_}2n@z1Eb@4q087Xu?FCEv%=CBG16>$pu*kQeO`Ph!0k?_X4mL4b5k}01e2Ujj
zc3R;kJzM^To(&7;3*Y$;oAm@jU#_G`P^W-^g<g#4uE&@6)N!xz?gr=ZR0ILBT23YE
zHWPK15T?Vek95*)^*`V?@zhF19nPX*vXGUh#UQEu9@#2M?QuxWI>>b3#khu&@yEW6
z3DL=eXYdzT#mR%`@I=?igA?j$<H>`Qsu+*2IFABgR0+93=P}|uRyvR5Wfg3_^B8v?
z+tuUAgYb-eZ9M!TEN#j8?xS<4o$+tIyz!GK4`LCUuagJgZasA!qo<8`ad!lND|;QJ
zcD#kE9d13mmTs#*fZN3VXbB%TpnhW|g1JRY?y+rh`ifPST2!)@|8Ax#bD<@VPy;_+
ziV)@TT_~UU8tKD0=9dx^f3Vy_N*}jm8XoHe4fK@;&=8U|tR;kV@@R<Q0}UjAjkNdk
zA!0k0PJ(zOw&Rg<iS2l-#3QjCj|Luz?Rc!mBe5NiaXePa$96mt+mYLeM`Al3H{y}l
zj>m31f)ILqJ06Mec--;$!_>c*NTb~S13^~*h-uuv@1rxv(MRgvk~PfZJ-AJ@u0UlM
zIDP`)@IeyhaVbaaUq1Wt#*ZnGL*3Y;v2aJ^n%MIn=S85aT?M4Vueum#8=qA*ND$x2
zgJ4sA1PxEUkg;0>yX-2mwR7K|v9){ia4A!_t5R!&c@VDFW_smTEuCUT3LhFJYyJXw
zb$q8}-}6utsOV2|oA^~Lu@5nh)K7ln#O${9b1`aI`nh;D;P*fmWhZa3tAV&-*{cBB
zB@U<}c$AY9n<RT@D!QL#hsJIGDnd~0cBhQ{inUwEqvm@mo%;43*|##;w;1(pzU<p8
z?!NtBTW9RsRRsU1&_mfbC@}>>o!o&=<Y8@Hz4}A1xiiKuU|R`(;^f3jauHFL7f7SP
zson}69{(#zY9IE9A%>l}O$@g%(`-`7O}uc8O{&{XJVo$7Yq#9QJ%mh=MASU_eK)nV
z@<mAbW4TtZDhcl5Hr1kgFi|~z1hEi4i2!HdaAD~eZ3>wyUjmH7l^J`D+F!tdCT0?x
zS>d5jIy4Cd#cNKaptnN;$G<B%v#EnQvjw+_?ag+(><ehi>)9<fXX;VIQqZpv{5POC
zQi$u^<cm+b$-P7#%9MhXcG-ZX)eH2;ruF9#!SP>8TK^n7IFRCB;x_T|rs=fSIkdWs
z>6-+<#HMwtTZGl$BT}-~>r?D&Ui_|`Jj30!-+aPdG@~SJ%C9HN=g@F`xb^?EGvz<R
zZQ^H%>6G8R+2-;h7neUmln+CDB$pj~Fo_}1tg~N#?ezQzB0IiB(z8v{b3M_MoPnP2
zUTxFk=41=e^Fx@rq{m_nbM5orIS(muS^zAF7wz2|>o}BJ6y&9S)>Bf-<5x<G{}MZ7
zkp0JSoA~b-=^&MFD`Uep4(mth#RPwqgug|?|26gSb_YJP@i1m)QVpYz0gc=mnWV#e
z<V}D48LCbT<`@fisYcic!+&oZDmBb`cYeVpgwub0l6a84#9i-OpXYkCt!dZmJAsi)
zHMq`cFnsshq(ip>|MAOA;u%nlVS^@#&u%<C=PdWF^@nt`)^2Js+o2Dc(S8d$NY33|
z<<qjtr`%PpfTMsJDmoq|9?CkrN?!y-6F_u^-kxJrl)XnmA7W_g5x=R(HL$(FvtFVe
zevdc7Ofu7PTCd0YFU2{w)L6ndLp;atk`jBol_eYli4*_WNQ_n3I4E*qrZ1An$#QBr
zR1uVb@ci#n-3W=>Ls%Tjhn|drE!ZQp24J~A$ZH8(Pgz6)M0#S_mXiz8E&p79UaU|8
z@^koy`oD%U$gaDwI=R!aj^F+&_vV5d+^ZT0HTI?u{h%={u3{;9RT1w^*~xeENpQ4%
za^h1APg0=8Xo3x8*CWM+N2V1TS?Ouy%mqLQ?7lT|a;a>TvCxNH@1iHgj{`3nQPRf8
zUg0xwpPS^{hbH)Tco)=q^Z3CLzVZ44hCCy@q-!j^WXsX;lI;(Km)vwLyd-@hykz(3
z@RHFp;U%|Ey8Ok#ji`zr<2rdTg5!%U@skHD)#J&7B~~VaYijY(K}iOy;4zM|4;p~M
zOoDDAOOYvO9BzG;)DAp)_%iAqMgCksTW8$Z3s7r`?r`h>R<QrTuv`q3OcIUYGGm+(
z(3X{=0_Vh;b%dMgV%*fq$%FOyTZ#UhJlKFIn!Ij2+yyl{7)NrWjINPt=hR_>Mo7<G
zJ$$eesV~sC$%y;`G%?PMxa%=y$KJVcmrFtL#GgRG(z*RB;kO9J=BXn1<iu^usq@Uy
zjfW2tVl<;z$?Q1Xs>{B!f=>L|5_a|tK*mxa&6Bl3O5-vjef5tpOPe?gvgBZ~+lED0
z@gb^(DvVotibonbhYNt@#~_-%%OgQfJ^}fYc%-$Df6Mu8N&4=TC9?jLqDUSt-gxM9
zq%v42Xt5Rwt^!6|TO;Ii#vcnZ*3Zegmzs^ek=jZVHPY6T2N$R(^qpHf#FJ~~B});>
zgZ=eH&0UYt%^r_9);=`n>{T=%O9;uYS1x6SSea6%Q1j$L_&%jmh;-dL)sCd2Q=OcW
zv&KV*c_s=*Ql@mSf6p34bRS~sHy%2ySSo)24~s&-fF0yHMEEpp^3kVD+2OPO#eYjE
zF_zE-DN9i@7I|8q$AKBr+XQt(9nXSq0^lSdCaLIO*)frSL*Pn>;P83|D)H;!G@LN@
z0|cZz?&m-bE658aQs;bpvne!qAaI=m_yWlWO-ZGG8jK(B`UQ6I#AxltL%wgWmq8GX
zeurB-CBYK(w`l2wN-XYxOxmR+XDn0CDthcjJI8N-#^uTA1Eh$2z74icJ-!fCC;o{A
zDOb$++iJjhH9Hu^vr&9B9L3Be-#SIq;np7|T(rLn6@L6I(Bb=Z$8obi#qu1v9v(@z
zOrQ+11^Uerz2r1LxRs^u{%D=mU9{Eugd2Mab@y!>53AvmgOKkMz~R<!yWu_xJ*FUp
zzH<Iyg$^q(2!!p4!`}ja;v+<qVnQLoaVD5TkCPMUAQAK(gLeM#z90EEgX813zeuC|
zIK(u*<=f5{^Dl@G9J*{dLt8c6^_oj;A@AKZ@mICbZu&09Wp!u?>luf9XF8b;hei=W
zq2BfvH;w;#9Uob1b9eREAc1w^$7G>s1>$FQ0y45-(g;%AsnP8dtF2a^!o#L<j03)X
zvK6edo;fu7BE7MPpZvwU9&eScZ6+4C{yWA4TRZO|r1;TLso)hHCF2Wb;0~=cM(|*b
z#du)}EEzx5KR!CqKfe83KZ3Xd4GjN%7mB(jai`*!a6g0lFE$?Df-fAt$RE3u{OSS8
zg>$vSwc($Dr(bsc{N+*nnC9<}-wnRtCv}g~ZTv?r0`9J*i8Ow8IK3EMdUSbjI+IOi
zcOM@??&GHzJlKz=vWYIXjOR!$<4+!^5}tmd=GCC;3opjk0rL+h?nU#oI=YTjCW?ez
zl2s*_QpvxOi1D+z)Hue43LY0m--h3@B@%CZGF%Pcvihx8)QIDgz|2ied=VtjH-0Zg
zQ5%Y2yyO3T{H=bg{6ST&RwO=p*FQ~7tU;?2|BTl9!_~wDA+|q;;YQF;j6P12^NlNi
zw`u%0;rINUo)3E)Hy`%WDQdCtSvZ=KJKu8Ht7FAHgnt67<r?xngL@EZjkZ;akq~kv
zJ#@|Ccj~Xcye=Fmrjjx|MLj4EaTXy6H$JhagldQDPjM-ZUmo3rw}{?yc*C8qzPvsh
zskkrP_?1237oP8LwCLS5ergkbH1Dnn?<V+spy~7i&#e=iR(|Ey2<k+=_W^{^MPK;r
z#+Cmpn?*8YmLj?7@W}7N@0|+|x9RXcZS(jyREOUiMpl;k;r&yb;zZX!gT8$NY1OwU
z_WU0v9{sa`<nJ?IzwT}8#;53KtT!L7`&CDc*uL@bGXLc@V%w(iUu_!y#^xpT%cH@K
zPyD-Q;t0kt{1$cANPO4o&*t&xH;?~n129Zg;Ai}v#P=&-7=0er6qG+-yzWodC7(R%
z^#5g-GSlzFZanOJD;X@53_FIA#IJsQ160?6PkaqoS0RgpNOS=6lO$1fY4`&7&h${)
z9Ox67y_sx!x9A&9^@!1-n}>#P9Wv6UIc#!P7d>s4*`l{6GtigJWX*x0ogE!LBO~dd
zl;|)-UCL1Z_ZJDXS6neXI+z-|G;0h4!nid(uya>7WAqOUrj5D`po>KEhHIMpMtc*9
zuH7Tqy;rBrq4eOUJzZUU_Mm9R2pY(y&F;2ja#zn_|F(D%x$Pt7@a|#C?z0OLRM6U!
zN+Gwud)GEfUelJ{WsYXD?bM?r8fhNsF}qv^iKdakp1$<1;Xzc(G$j+~D{MMnVe|P4
zTh3S5dcMN8^A)Z*Ut#<C3Rliv*qltn-KEW$^etTieCrt;Ob;gehfSlu2jWYKRHo<c
z$=-C&ZshGwo1)ts-f8yiHZs{BGn-Bsy?c%AnY0Og5F65a2Kv&*HKV(G(`E<d7>3Fl
z>e-z}P6jz!(z}Pvy~Y)T!+kd!*Y;%7M*Y<r_bp-gzK;HmfsWyhogMo+dOEV1TT;8k
zvOc4KuxF=Xrf(S?Fw+^M$J{x(J3W*Ybve;Tif0TBXARJvNgFq(_Zt0B8#GlUmJQ5S
zbJ_5$B|B#=>7TWvXV#K^vz1&wFgR$WbLqZOFd&QZ2OY>l*M|g`u)-QWFp8m}o{TXt
zWDJjHM@F;m(oO4K`P(xvQKPSCaG=-h$qoz;Ip!yeh=;)EWJ~!P*<qu1U?@fRzF`yg
zni9Rkqxl(qW*RtwA9VDf2(%cxQll9c`vCztq;n|16Z8beKbRcqP-;ZzGkb^ncA3LN
z1N+i~wTCbc_d<F0Fva4^fuVuSF6h*)1KC}c7ZG*4MyaG{&{*Hy*w~0R)BXJ*)2Q1i
zZZ!wMv>~IHl?kN7nGuoKZoh^UG@G{kU%n$tZb|lmLs@gQFKY~qLK37wd%(Pus*vx{
zSpV{vw|N+yOj!alMusy3lIg@A<^Kqkqo1QgOe6DYc!=1S92lBjPCZH+3IKdo^i0E3
z2#2`=T?N)^+&DbKblp^Bz>ZkJhLks)YhOB(A;DbHy<HaFI5cde()~T7gITJOGBWV@
z$mt#3Y3v4Y&rX;hOENP!yb};G2{;KdZi8Hg(pRv46e23XHUPmugV{uHRIM1-c5S_O
zL)W#*^_#a|adq<Qu6Hz1!^2A3^I_Lt>3|Uwl!tACp_&StgXQ(@N{}*a*|^WJ8%(q|
z$EmV3SzD&a$w;y}3&~rI)G%uI4ev^ua5e_HKm+;NJZq(#T?P+d#whWpNXlV`QK^Nv
zpJ-0r%wt~?utV8R*I=kiv?P;m&zorLR(|-(VRM}WmPjVEX3tQD9C2dB=unTj*U3os
zZ|c5U4WHL+gLhA{Gfs|V%}v`@B)ht<+184dhK4bkr-%9m29w#IoybhK52QfIKz6SK
z)|BWOytQX<2G~c_$q_T18t8*9b#-+Oc6DvLDv7^)u8SwH<H2~^==*y6iB6F3SkXT^
z)R!GL!J_=$s7`iur6fJ;^65!5!3p*yl?`_HZ%b}Vy2A%cK8d(9sM+Q5B;<kryZZ+$
zIIDkcGHhx~rL!0q6LAcfqr|wLK{<9x_?_u&a(7QInc39?wk3D-SeG10n@Q=SIvlS{
z(aKfntQ;U^ZGjl=HP;8Ifu;=E6zr3SRWv2Np~R?$88~ow6^!i##w?&=yU7hVwYIT_
zXQc1Vq(|UhG1B&-v_V=D+^yT6!GR8?;XdIGQa!02qZcNRw&Bz0a{Lv0y_tdX;9U5W
zSu2#Y;8Q@dSyy|zd(4~F7^H@UzQHsJqW}-F&oy2t*c&C<n;1BczUg>LOECuIRY|3L
z2D6ff%oXVgmtlP2acT3WZP#_Vho;><0~jNB_3TM2@2G}Xa+FeIG8is1M&01xDF0FZ
zMb>kaNe%}r;GS%c6iT*7VsI09G}FmlJ36fHx2XGWbvM<0NZm)&eNf&bs(wV(kEr?)
zRX?KYM^ycYsvlAHyH)*eRli%+?^gA@RsC*NzgyMsR`pM-^3$sPv?@Psl`DLw6~5D|
zozpi<`bSjzBPxGH!H+2Thplqe&WLJfM71-b+L=`4ld61Dl}}pb3h$)CJE_{6RCL5u
zJ8_jCSMYHKA6NL}3V&STk1PDgRQWMgeoU1gv&t2|V+!9f)y^^1{!w+0sC&1%pMIC(
zlS)sjbX=v6sq|5G@3!DAICYPx`{aPc=ieglwaexGN1NrnTivbvxJv(}y5FJh#|z*r
z_}T8?yE?x;ZB2gq%xanLRrlHO6y~p1@S@-<7$p)dJfTj~u(=bn9h{q%*CSW-^23SP
zIxCCs1@+Y^)6}Hq?8!oOkqr`$D3^F93h?D2M`9NI2|1`uLoY1+J7&ROfG^SHMugs4
z^^d9gBNn}dgsF4^K6@%}4Yw<KZePd+ryWr-Gaurr{iDfFZmGLR-aA$L%%8~gq`Dg_
z|HMw2&z11!sKPU*%6BL{-70-r>3JcZqkZ{!PFr|X{bTAb4(G$QtNx5A{38l(ht*!M
zgfDE*NGUv>k8cq6HMy1^$L?;FOzQk7r$y(d)m=FUVaT+2q*GSiq3)+SUqdD>9(wT4
zTksD_AvzTv>u%$dc&+y7eh0wa|E5I#e@4MIFw#fU$0SDW4vdsMRWp!@;Xj$ZtVraS
zkMt!m6b|>vWi1*q73gf$^U5>n-GHl)=jw6|m}fQ^^hE0uSW=L98vDdRYS|h94y2MP
z8gOe?8*$E4VUV59?Ao2ga+T>oAZH*YlLd6yWJ@GB){Wl8jXL!6L>vn#uI@U01enzi
z7aVbjVjZ>}6+KLY<3gJ4m}y#Bp@?u(_Z926UzxoA+D+GWS^Z90cxQ*dwrkx6w4(TZ
zVfbxZH>|tjYMwpP)R-2M21kF7_PSnYeWDlh&nvcGbImj&T>O;5?(B@W`s;$5k?{&H
zHIPy9?sRQt8O;C(y-8*3rD`l8S<^Ku99avXta2<cnZtW$YOD|fO5K+ooK_L-Vx}=;
z5j9m%MnQj)W6};@T2Y-%V<A{xs#Q0f)TvZj>0p5Z=GT*pDOL--(nA)-%6cueq44L!
z6=>~DEvR}}(&!ybC)2|zNk4T3Mq%qE<i`LR3xHA-b@%P+8QPgP6uex6a?zvk+SQmv
zrUY9}%jEg(GaN6KCBgsIunc33aVtG1Y3f25ilG;*w~87uF}WNVa#cG|dwa8K)gYS7
z>{FFyXs1wa(xPvyOWg$8W`(zXjdA7n&6`y(Y<M^<?x7`NYq4q|!|Ej2=HhE1{Iq5*
z_ooU9;S2k4#pbScZat#OAFoQJp%AkY<LF%;d;xNDuF`8c{FwoMW(Fzx*>1>R&SG8`
z-(7H6I>!nvR#<7t0&~igF$x_T!4y878r*9MU^@N+>ak)+4*s>lDaY#c(?yTO>#P&>
z(;{@w9;~gfEh%bK=^T@$+Eex{dv2S(^x<weS39!bw7%==yVYH*A8vmv33J%*8X@d{
z=g~1!1*-u?x2<oI@<N_ECU8D@ha_oY)P{m*{ZZ>2q!NbAud&FO4K9mam7WxPHtAqz
z#8=Qkw*o78Y|FR<GnS0opQ-YKeiI^PY590&fOERcgkBKdwy?Z@<<Y?jj&&U5GvRgM
z9Ha^kt9+@XLMjJNqh1A0B+~|CTi11OTeo?M#edSv4mDO9^)zq}AimtTaZ8fczn$^A
z0FENYJg^Jn+kTD0nAiVV@mk=jUp(;55N2ELLZ5{w`qtuA|Ld7&R?8Ly9V+9?O23hJ
zLi|SHL;nVb?J!pLRz|7T5W&qTxPoxbTBBtBWZRN(Zug;ZI$1N|rCo9-7Bf?~nptDD
zacN!OrIx`;yb?~CpPcNlXXEkD)&2}{t_Xzy&ySCsfNWX9_3cU{exuEPIOQZqA+*$J
zsJ=N-j76@(&mo4>F^Mc|)YtVbQIHD%;BY40alPbB0amCUV9x{R8Y|f!lROLYQJ$+0
zJE;#$!FlkyW5t4gN&MI5)9V@o^67BTPeDl5D$QhvNAki+xMxWZ%kwqxO~K_UL>@cx
z@!4>*vLhcJ#8{!F47@D*NWbI;2AmxQ9e`Z6rWYY%h9|P;hwB?PfmQ)3BbNQ139qqi
zjUg=p0sW2UwG>5K`Y;nbQ4F0iXs@aP!&!FJ9HN1Rw{@}mB}F+)drtTPk0!SR<eE8C
zu)40#QUqC__2B%B$m`!M@QNX;v1ag^@Z0RT-U3&0n~wkbb=O|A>6)u*a4ds_b0cO5
zHxcJqV6xo|c0(zfaZT%2N4}EhYu5~NuvE8DA@ca(lA9ZT2E~{rCkH-HF)U8Z2H)2+
zL?ItFY+zpub2CiLXla2qmE<}OjAA-nuT>#ryQ3pm{7EZ?a45A*!{)v%W+k^->rK;)
z3$!y$GiHas)x=RM$qT6)%#keZZX37ah)HGyN2F*8q=wbQSW4Ww3G+?dEc<rro6}t|
za(ai<xerTWC0#Srw|x}ZaiO+Z`gXp0Gy68#o^n>sav^{*99hPVLCFo5M+(ia+1be+
z%nDaPvV_Z~2M6twFUX?4Q6AWz!mJtxF-A=~RDenKfE#}%okn<=rW0};n1lBN%95El
zi(Uyw+PsFhfO~uTZoU<vDD4x<6@j{@l&EXLzl^c{x+_H8JC@b$URIY{R=2UEZc9hq
zHlywaWubJNT)Xkp|L~_TeQO`yt#z#vQ^Ix0=<bb)!(3?j@r{b_Xwx-`vo%<u#a|wC
z7{8TI=YDA0x&xcxq#b*4@P#vIX$m!#t&zu<)OI~D<8n5w^13rs2W-4fyf@>N+=g>S
z4Ko4Sa88_;%wUo}6D*Nn!?P>fGmyS@W=I=On$nDbuI?K&HVG(`*tp@^<hD&Wbh(-1
zuvKxvMTd-~9lBC-<<SOI4-P$&{j-)ke9Z%~>ocE7l6l2W`!0UFA@b|9FJnGiFm|~+
zDzpl5x65t5y9(Xq4!;>7uO4!YP?1<%BsQ$OZXG*<o=o4sKvL--&#9$X$_()tBb>Ia
zTZYXxqi$JKJcB(FMv6bP*0ZW9N1O*os}gj)z;;zwz~m7VUIPPxdu`x?^?1JpgCRT+
zoe4$oF}*)CwMU|J>d3R3bfyd)9ZWgecfs}<kEN^<jpU;3U*(n%&%P<xtH)8%)wIJy
z9DpOn1e=ceA!~`X=>opOdAsWMpc-%AU~Pk;T*W>8IK4?z`wJj161M{Hm2@<kd9BWH
z;GCM&!jaLz3?eu1pD6GC$#%R75fPKL>4r-en%0T?F_oq~x&lHISIbBK8+ARqAMfbe
ze5I~w&G;M@UpijX8Fm2#72Z>{*@x?D-_@d1v}4b&Q~1z^Xw2-wRsxQe;K{o0-7V96
z$I+*uR8BMw?P(ks>L2C@Gd<YY(?<sp-v9)Xuvz{#A>YHhllcHPZ*)BvDO$b=r9lZ1
zRQHIwSE{?A?)B;(SNC?lgA?j5>qAB=E$eGS*4KoruL)US6S6+`Q5jwrvc9hB>q6Go
zRefF6*HwL8)z9zU@1G*^7N`&-=~SrJsk?O}c=uI|#7y-MD4+|~F9#fWFzq^}a^mVv
zFw{0P3-?NuPjUtgBPwa%UQth{?Hu`F)wk|`Wc%>X()R*z@vzFU?p8gbh3dW5%UzG!
ze65#N&$9oc&&isW{eSx}@-e+p_8(Vx;~$jx-S%JqAG7}~BI(BSy#8T%iVMNWbB-5+
z+BrDf+cU^sM?S&}4P-fuNDm8CL)rLwaaiOQzEtvhkF_TL|9E`Y&HF!^4m*nUN7Lmz
z)&Jw@^&iTPU)S;UKcv0AZo>X>cJaDN`-AH5>!VKT*@a#^a7q5PfZDQnEx;CcnlP|k
z5T0W3--BigHw~H>#BJfd(5n@0X1o?yha0a2I*;TFR!}t0^a`qr%-M=<%4P&`@p`uM
z9!nTA0$>v5?#+xassppUE$iyTc_8d=oCn5D$ZTMWmb@2qT@*~qx7(do;%+U!kY7hj
z_TwGYv}hy_7iRS2=YWT=YyNq@&{g$!)>oTdeLpXZ&4=?sso2(ex9mdiBsq}wf$BHk
zjh63#|K9ILlPKLYidrmxh$LP&Ptmcufu`Y+fn@({pJybuUE6h)>m7AlK>s!88J3~^
zam1c#?ThA_e>5F-T-YB?m)&F3ABUj%J>}hh2)Ffy1mSPE5MCcue?T|zx(WNknd|E&
z?GI{JuZz0hV+wSEx6oaI;RR<^P$OGfuLaoR-)or;CX>T`JLShPMC}E;S1aS4WM+3$
zJeeBUlk9J6N#auwx8mC+yOMjFWIV3M`rn7t*D|{Vb+2_YcVR%sG+z^}!~YAB02U>0
zxFDrnt(no?yUk>>xv33&f9+KUs7m{^F!O?)nE4w^7n*z01|EW#8K<3Yt(*y9kKB3A
zQ;C`2Xma+tC&9CREXwNPY!g;T56%N&OaDADmeG+W&jMEP;VO%?{MoHT(6oG;jJy)5
zJ?^Zo7Cx+!Y>iLT6E~R1pC<qQGwJ*tsQlCxBt2q5ekM}%?Ld#Ho0k71ax~=A{u*7@
z{U`ifM|*vIS^0iE5WKY)XXP(e`PTlKmEWoItvyRCpMGGT@RStb->34eJv$5j|5o{-
z0{FjE`6}C{|5=r<j$hdMKUMkk9xYu+@>PrtPol@#pSSXtsC;YB)XHCp43DT)wVm50
zhOf2vOKHEAu4`1jOga0hNycMAS*1ek0mD4vVkbjAZ3JUHqGDS9H$b>YES#4AQ83IS
z=1j{^f-xRZJuUx-oR44IDY!0zA}j*@WK0$uRSNJv<a@-t0%#$=&-uo*{3Zz8BQBYi
z|1{@YyUb2;X(_?iI2r2cGn`*FE&penzi3+ib!CKS{<Qq}asJ|I`G3#(mrl$7Dd%4_
zEx$NS_${q+iZADUR4TY=A3YXv>~u_(-wS<+MO<=MhSLT4#v+b<$5j4m$SW3c<T$2S
zcpUtXMGS=%XA4{8<G+E=v53RZF|<e50QjavD>KyJZ&H55#ZT4lZRk=g;_z)ub)C)=
z)~mYKxkd10KRt&_4vdY}i{R9OvHBoCN<b>J&K=}6j)Zm4z)SeK4T;3!G}V1Q?OFJ(
zb|)1-rnUF5gm}H}ov|8H_YuX<Y3-52#`SvJ>$V)0L?_NDA<On6$Yp*S-bW-ysl9du
zOV4zX{O*C|3dI+s%(>(E6Y^PpoT^eC%aTv1_AS3Lru+uMydJsLD>*4r#fZ}HX?zSq
zffY@!r@apswwI?T)O*Fp{oLH^mRsTZm|6Amfj6eToYF&y4R)stY#Kl5^Xa&}%Y%Y!
zmOPtxv3%zInCm&}dItP*0FcHJJU!Fp1-$(Xc>5Xf%TP{wMDX-X7s1oBLC=7<p8?;1
zyelam@bpX<!Q0P(x1RyO6y@ZB3EmnR2;P1My!{Lqf~SjM=-HrWz}U~gc^UFbC?D|j
zOc%l1&w#g|0gqP?@!@(2PtSA_y!{M#`x)@0!z3EQZ|O6^+s}ZvpWD2MX#l&z%V!Ne
z7flK80<V-oL7BHG$Wg)$%@Cd{LwM&=FDNF01SV_jG-pos2#KpiOrn8K*upXV^O6@p
zn%d-c7fL1a&d?S`3_G++v{h;2rt_B5+gfM0b(D~z9kPXKZOzaIjnX@`dbCk*x8W5=
zUK^f*{^2}eUb#jm;HLGzM$zPDf9gF?KWW6?p*4Vp2Ac*SNoX=^r{5@=G$^8!OGI=h
zAQSwHAkc`zH=d71a0E0)Kx4$F>Ad|8dqpGacozi4B<^vN$9Z_Li1Fvk`uFAYQEFob
z8PklWP>+5;+(B^yzONgqYV`pqoin}vMb53BkmfA#Gfo`mJ)qL<Dm|$tTvqvMmA1+a
z@Sm;@{3F?0aRtHCnddm!CN6@fbVR{ZIr(VP5x&%&;N1Bn1G=a<j(^>*G>r*oR6V*;
zy%ATs0qIItn*8yH6+M(q>3Ub1{O=?9M;Dcof89t4T$Hx_Cuh9*=S>d0$$>XH@CI?<
z`Nj>`{kA65DeqM$fc7o9AN=hTRm&*<*YtZSZ(eV5;7ty^$$>XH@FoY|<iLLs4(PP8
zMGHM){5u^MTlQKV8}o?g!{REWCuvc1>MoBs5f+tpIpE%abnPB(irSwN_Xeo_s=aE$
zL=))xGNjeQYz@-AW%!8~U8l(q!+*6}MD}@g@h3<>_i{j-3X75)mZ4BS62Tj?=fWuW
zdqfiD|1u>wZ3>+*PMrb#x#gn76n>NlQ4aX0!=l#o@COY%{(FTu8y0~aRvHyaR6k(q
z5}xWHZMWAg+pEHYB~9r(I{s2Q+GG6NP!4>xIVNAy|4s0BT!>DpASAydIc#AfJ#}U(
z0C?~A4A(|E)_+hR_<Xc_HL`>81-l@R2ax9aD6h)-rmB(hR@m+Hy4=W)mzW=@KgAsF
zV|s&DI?{*ouY;Zt=pl59o=5kgwYMWlI)eTj*z41I_3U33mWT(-1ef2S%E7<)?3MiW
z{!64pytGcNv-L&xZ%Y+m`I70G@q^)vf8i|nb;0=0Zq3)%$*_b!FTGemn1NnD(L1pf
z{R^Ofs{Mz{xIY1;xxBEwH^4qFtp5-QT{ZYOqyGDK@y<N`_Y?V!{tJ(|N$Ee?hlyvx
zNBWL*Am?Xz%98r6%2gWrE%A|lyX%uae|x(A0)L2>XjngCUsbsv>!Y9aQP!Wr$|Zgf
zLe`JmDRk-~+avThatIrWB%dHwTNagxNG=qz?Sbs`PG6pVM&Bp(b1HZz+kf<Z(C&BO
zo!pWm?9bkFm+%53^oi^zO8s}}n}MD!irtp}N6e6-@8_^b_?z2+-=YuxsTTG`{Pg&-
z;{f`j`v4C8n+lFediHrjM2{$egN1*4Pe$5@q^CMxf1yu#_!l*z{{g4}us??$@?T+Z
z#0==vzo|2zkNuZL5Bsl(Ll5`^`f5#o0li?4MKAD2ZF<2!r03>>zQ`W+zV*3k?_BsN
zO7q3=z5w;lPyGWwsedeA;`aon*<Hx~{=YxITxh(SZhvZluMRzg{ss4fyOm$+<P9I#
zUu`zX=ELI;>1$5Y&d~5E{WZ!%Qv`377y9qotbYoYApJk;lm9*gd$sg!X*LM+nOw_X
z0qAXn{Rp-n57`gKyQOB3i<rmIk0|V!D4uFR!ul4&_%cTHoWOVreOPMx{RGR)rst|L
z{&+pqzFs8D?eVq-nY9>i-Qx%388u6IyqSty_<bcOEc{=g@vPKK_<V#9>8hNU=`B_E
z9yQCnr6`|@pg#1)!mm^L(@akpDTv~iH)@7`#EvOL!pCxDW%vo#F!u-bV;GM)9#Q?P
z&6Sl>`(@NV*`FD~E&%GE<@(-OE*!Su(VtjG?lXx62FtfL7p9R~gjM||S&a85lASz=
z<6J}iz97(z0Rk7LO)m*ZUW*or;+zOIXKwU}FJk=sD)^l%@<RS6SU!MXY8Lrv=p>p%
zDuVv|F|gyJ{==Wt=0q7RMTB)pKk<j~mryzQxi}YKeJZ9p@Tavo9~?gTP4JK}+7E2s
z4xh9h{*>%H=M_7wa?;OzKHf56{UrYY{xN?k-U7We-gg6@`w0J2YX;aq1gRYL^;}Tw
zAO(YYOaD>A<t#7Aqc&UW=sWz|2I%uqf~DOH;42*eqZn^kK9v%GZ8k8)X=)GhsoR6`
z2;*6R`UiXmGG&t8KgN#LgJh3kfpw{MY>zZP;IlHsUlED(_*k2bkbS;ocKeG?QGKhN
z>~D+|fZ`qESEfYbt2BTQ{x=iky<|b@a~sg#GRJ<O*(#Q1BXInaYx!#*LDrX`(8que
z*T#Dx|44-7OaBCwn<YWD#ewohDrflv31TePr&3#=o?Z@ke+k#6a?+=LP(SY&@8xnD
z_-*@+Erh=Kxqr0RA-+;g`r)_rgT`yfdyLBHuS?z~Zh1qU9C?RW-lbH|@-A`stK@I<
z(~>_Kypn&d8R7axR(&(d>0(L)zED4=>jSuj@yXGLQprB050Q*t@|NTe{V4ReB;N@2
zN6FWKKV<SmC&sI-!T^8n`t_o<g3^e$iC<$^ef27~r>}&CFGF>4B@FhD{)`F`Dkx3-
z4u-xH{_^9jq^GJmy{qQI&$9hPpBLwfLSJh6;@@zZ_+?6d3Y#B6hd<Cinh(&PAJh9&
zwr`*wyF+{p{)OasKl)>pll*ef2k5V*&v)jyvxGp|tBeOUY7hAD%t>={%B6g)`mtO&
zkvcZ@s)Vo27Ek?#{Jm9wzgK`+bOqTTygSJA36zIfKi(Z?|HbLAQGd(0f9y}#o=aHO
z1<Mb8$rTrq`LTZje<HacIb5Xwjr9%jS}`fy$hGJ{$2a)JUZlhRdo_Xatj;Vcp;L-X
z-&eOHK8muvOr7~9`;*B2NGanxcSnf)%X=YDwv8aH>Ku0VbJ91<@w-}FRRv9;{?dFG
z{HO7k&II`9g35E@NL8-Pofh*^pJfXe>i>L<SGEMWKg3L26dycA{y6GGg)ytYgrDIh
zeu_`a?Eb3yB(bUIP#<Z~AB|KZgXsTP4f2=tN0q<4Yp<e)$}xYrs|wEe6|jub6G-pH
zNP+>C(~Ce)Wl5Rg!e13D4U&LZh)-=5Rk83C8C;m@I})slm2kRP(yLX)%8G4xl22u@
zgydOD^nl*l%HlAk%P0+ds>+4wNUI3afnC^JE#%MgA^k%As$7)g2W}7c1b#$qen5VX
zKKz2?=~zrm(Qq@>tn@V|^{tr3bI7aqK9yGbi~jP^&in^GwTt9<CWt=RM~vn>B(f8d
zo?7&u+PC~8+J~-@KK_d4)81Gmr2xO`Ztm}b4^tB5lfTdyAI1|__yuu}HxRD{N&h$=
zdIs$`N&6Hk{;1zqQUVV|D6k>EkjEo-c;w&UZ-QnJ0AdWKK8hb`ea1inS*)*y8D!D1
zKO_FD`ALZSKZC!G<iZX=(Er+O$<+V+#AT~3doT2-wOKK>ZQYhft@2n-%9r^Cc^YO!
ze!xolQ}A;tO!A&O@gC+c`Qy0DAG_q~A^ULBuy>}Xn5<`w)m|`%gG6F%El~PMzk<2a
zl5+=xtC%I64i`WCiOaqqv@!O7i_KsZn$I$0|4|F``qV$JV0`4CYenQIfzKaj8ByAl
z!wHWOt~7@<oR-~VA42JNN{>CxS^#=^J%<!p$%DodJi~uG^JTaHuF4fT{^Tt5HHUZy
zreXEB(4SZ3Vs7|p{(OF}2q&i0Mf_9pxBP!BH;20F_;<6M_1kJMk}EB;{T1^+bOHZg
z<oJKUSLG@=T}u9e>c2Z$Fdn$#0Ui%H{1MK05DKBDYdk1IKuu}*bI2F^fe9AbANa}R
zfs7}j{xUTll$Ft`a_$e02c=~^KG1p}<Q3UpF6Rg5und7TjR!32*~SA5_wMl^mW#!z
z2|xJ<&=c7g&hzIOPb%F08uZ5U=vDsOiuciexZLvRcnRRHy(+%2{C5c2o9Dj~?YjMU
z&=)g^9vUB>BL9W>ndD3Kp?_$f*DFV^{ROZ8Rq17+7{L?Nxvjw}J;L&eSpHr2ak|Lz
z?|NyG4Nv?IhA^Q}{v7?U4HiWx&HkSKd$fq`J!JcLB|q{%?B8ST-=l<I(x><Xdk4O#
z8Its=`6B2qQvAbw5$!FK@l%NWCE6qZ&;Hc*|4P4X|L@W_=wqakmsh2Jk^e`33BT?C
zE15I!=UBf1J&r#gVSip5lltbP{=k37GMIx<ee&<{&-MGt%JThpne_j*|1JUm`){x1
zzq!8czl$9IJ)^%a<M}p^U+}kedoW+cdKAaU;Ah<)pm(JYWH|hh_Jw%hKqktrj_vJi
z8I5<+-W+{V<D2xqh?k3aex>|5<e|#deA7K1Kt91-K>1_vgvPgu@_c_=A^hZT$tKA^
z1Tp?ge~mQi!~c?I3Z5f#c>v9mzaR6GJTRVw<ah%4pxlC&^J7Z;18jeU1pKhd86V`0
z<Eglmzb5_7VSfOss9W(0=A(?C<jefj{j|Se@z*e;?D~0pJ#_^5OG^nQjY<OZL2ss1
z`4fZZ3$z}_{)N}Ws=@!T!+(|sFRw`XKps5*-QjL87?%FLljkcK&v^9Yyl0M}y^xGQ
zgLZp<_8`38IemRGmMbE1I@NfM@ty*h$)x4~W4S6+AG^aYH|OS!4_FT-b?8+452zn=
z=EIN&;stUCxS-WYTk*ezKaiUTZ#P*D0O&uhe`(^VHC`eO{D{EE)-ZnXzut_L*l+z{
zz8}bio%*nkV5VHg^W;Nee@nAPXn1mMjHuxCmkQQj)?uu_)S5-yp8(01*27H17Xan>
z2<<I3D`FK$lg|^UX#QPRBn5^vruy|}tc=R}V*F(hg2m{g+KW{XJl9A0Wy=5ad~)GJ
zjt@?df|0#HkN6h-otnC1%}A%RCpBNP_=Wz@Wn5GR>vOLDvweY{m`&eCqOVL1sDS77
zhL}@dt~c-mjDrNk*XVBv;pcRuk?4(}f1OM(;?JNNE$7&W+6TXaW|`9-><94;>;WvN
z^)HsUjF+RtU)tY-G#{}03w?mTV21m4tAD6ZrVCxL`tPa_e6d`l>~|Cal7B*bfm}Jr
z<=?Ml{i!u$dHPA~g(M1|zrx<J-opJO^2wg-%_1JpI1PC&HDlQLQT><vD6`WjC;8>`
zBkb@O@R3}M=Qnac5%YmuK&B<V-bgNj4&kEsh4~}Luc-fCwhzcZK=h8N{X?wxV}QYR
z?heE=&~Gyqw&nj6rOTr#Lg4vk5bHmVe5gDcP!5XAmzrgcd|AJXcz!J9&HXEbZMftu
z^_ls@@(xGYS5pS)r}@TdE|l{PTfa|3ehbT3zaOy5q2IwwSo%Y<PuOpsKfw5a`aIti
z>!ClekIOztOIwpNUMBs!FQ66-P@m|zucXxOFU13Dekbjj(ysX$@aNfYuVDJ2t++t{
zq%0?S^L4GjdK5{6F0xP3R}MF2JPCPN{-ccj$6DKeggG4+-Frpwfgh0opY(_qCBG&q
zIm`Ze;r{!RdzJL+Kmz!s@53E*^LiZQ2YnG_P8|P29{1<GYsr~LD>THj)#3xD*Gm}I
z8b`vU@Av0?YstN0KR8JGbe~#3UwcBX*W91;`^Y!0^<x-8xxQb=#0nQwiR69M!uK4e
z*}!)x_TNAGiBDWMMdKy>?-w%xA_o_|5d72qxggUMm}}7!4A5A$_AKZD{`+$QF2{Z#
zxA#!a7kmKR!FqO>^=n+n^=9J#uVjDOK4|?1^!Vh@zrnzf%V{|}I%(w%?Q$huN`qeV
zM`svB%C~B-+V7z*fZrDV%rD^A%@rK)SnHvW?gO{p4p?FumFLQ3{6|WO^arp%L;PMF
zk^H|8>sb`8Grz&Vqh`<xipSQTk^Dq^JfUR%bGh8FS$pQ^%#T>E&BOaU<WI0)@&Pjx
zq*!Kc8Nvdzhw}y;uXxma3-~DVw&i($jw5jq(j~v^fR8xVM*52SM@@f_<xlN#`+*QS
z<hAGSVEIJ%d)qjkirmTN(fz(QYJWw=5pMs{eJG{<P(p(Gbvb`I*_W5_!#)pWDtL6~
zc$4FuxpD)B*8e8FD4)aeo}B-cF+G&Q{pb4g<N`0jpY@_V5aRyZ^hFBji#qhd|JUVW
zOi#q7XMr?-!dFl9=@Ay7oxWJQTRZKE+2xB!|HzTyvh{0zLE0!tFLKi0&nnYfLGeQ+
zQ8BU=>B<V4uNs`@^0^f6csYGUNBLY@f0XcCzJT8ilkgstFQB(_UFA3+W7RK4{fJKb
z!{xoj?Ej9D!yx<m4)n)5Z$nw+Z_LsHeTTMK`u-HH9~ZIzA#k$ikC{Q<-wG=G)pPUm
z?BV`gG3#SM>EEOKV2>6)=wB>X!t}}Uz1r4io%EUISHk+r>u+Dh{BXZsLi#&p>!YWH
z#=}z5N7(m)%rfbZNPZ6sq#O9P8#&(*&<F24Vx6w^|IvNGX6bKXf6)Gc%w>-LF7lv#
zztcYJ=Tb+Xxjf*|gZ|y0D{|}+^dHD9arC|3gLJ*4->82y2XjOZEc?fKC#KiW`dbbC
z53P0E=TWna_mdoZxzy?ZBE;5aI79(I+cVR9iNg<+!@hZZmHb?ew6BC*8S@kLKe`Xj
z(&7)(U+3_H>95V}KkT3BW&1~c_`4vvTc(fwODIJ69eNf!{9yjM<O_dyAhXEDAEMVK
zZ^BRf0!56U>8o-2kC$r>WQ@H2=F5Ni4E&5Z{A7A#4*$^JflRg2f29wkPs~527cZPq
z2I-&D2XZKmepWgCW%?@f`j;o~*Tlbar@t)EV1a(d^7<d$AIj6uqgXGb8Mnhfp2<r3
znZDwn(;s|^B3Hrkwf2XPn#BeB7%ZX@*Cp>_TmP7UUO9bY{w#((=NHghRN=Ho_I8m&
zFWa|YI$ZY8QOF}S-+?Fk%9y_S%D$n0Jb!c3ldpfe3>cUm(ys~^J<EX~>TAoB%I6f&
z<8k`K_JHY=4UaFI<mNIx6+{oyACS{orl0KJH`fiXn<aVv*O!yz$?>Lont%4?Wb?9o
zImRb7TUO&!hq+)G#38SVt)j!6?@EV}zNo<}7WMZcz1&IfDnq*QGP`^ttaX?dFLk9+
zzN{eKu*5EhyjPhe^J%=|Nyp$tfM4L?t1ZL$R55=!$}5Oxsii2t>{7e_1kpF&!8Z}s
zL2tcX&h#yJ(hn28OE0m@W%~;nk-_cvSEGChMb`3S{Pnd~lHwuI!wh4328#i|z@g_{
z81$CRCk8RT)N+(x?6g-;?O(ds#)t7FzkgSw{E|g>x$K{t9_pV%5BIOG#)en@AqG1A
zsWt5K=Oz6l5EuP!dYFD<sEdAw9;SaWM+b~k(N7GP@*#fU(2I3R`W^bDd<?5tF#R=%
zxYecTCx$xo8!DherlOx1>Z0FG57SQ!bJ6e6!}M3%(^ZO3i2uY;hyI23behhmLBAaO
zB>fIOOn;^H_rM|bqhtXw)S<tMB42qa`iY?~`rY&}{lqXyKk2)QHwgpNUqi#Yyjb5V
z<{L;k^p`vIF#U!--A8|*Kl9yb;HNL-sH?Y(>~R4x)S*A-(9i8hF&d~#*#j|{+wUiS
z6;rq+FK(ZY6S>mP_LJfl)6eaVEJyz<=2QP2{>*pUllE0f43qpJeY?nMkLAJV6J&dm
zKb0g<$sh7BSkJXCrVrzvb=8-FKa~aLc)8rV9%TM710{dRzG(wSUEDq&x0D%@Kl6#f
z4u9nOECaAUl*CA2vj5f8e`dJT{}@eY<fZyg43YiciuTKWR<URx`YI}kAzXiCImwed
zDDhte{6TzmL6yI=xG=ugr_W~rZN=kHQGAa%fYl)E2l2a)<2SWFK=D4MIo{{;A}=YB
zwSI7au7v0J#cF<!_@IdRV~xKbFiT{7K=W6wALjEPPW=d{OK83T{$l>ClfbO`0nfkx
z1R2yQ%B}gYGhcAcZ{>WUjrif3FSHT=ocSwCao&vafu5sgDF{>aa=IkU{X6q=Sk7nb
zBztK5eh}*}`zzWgzG3{>k3szi_m}n;L0@!#WgEpqQ<1xAK9BNPgxcr$7v%>swN89?
z7W0x^3x@-|-hlR_`>PoLii&Uu^(%P(WX0o*e@+|eztx|QnI$Zr5;Y&I%PnXj1?KtA
zDVh(@<uu15Tz{dW?@p%gK*sOHGnY{ORjt~W^ZmNqe6C-q;;RFh8XDeJyy!!J_;iXZ
ze)Lv4@r?$3DtG2XXy2EMb9HAv-ISNc{7CwD6`$Roi#hYjr)Yix?ppc_e3<Xy(1fA~
z@o-&k5!1Vn=TB5_EU?S#iC%X+M)QTJE54+3`64FBj@M#~oca{M=}x>?UyiiLiRU<e
z_T|N=h4I?a9JZ&SQEVT~4_Ew#`NUDPVlBzn(wD;Z2dv-ldbLwulJD?Svd>A==!%Qt
zL(B)G`#YTV6X@rCIX@P)(B7-}^7@l6N70jl2Ypz7sj%aDwvRb{zL?wRw4c`l){?!j
z{$B2^$8i0*j{ZY_z*k@|oo&2-PxE`6uZiw&SM=S@`svF_aajF{?qAnN9Ao`I7p{oz
zU#Z$-dx3tyn_Bk5^p){}Qj1@8xizdGok~Bz?+S9r4!<v>-b;Rg{<_>MhHqE;kND7u
z-|DY{zpkZlLteY?MY_fvA0d9OueXY2{L$*f+gK0lFb%^lUqtas+)oA;C!HRt4#NNQ
z*V!s4jrAOi4^Di6@!OY^=BW6C@eVuW)`j(1tQYx-K)D`7^99vB>BCpIqI^Mw%6XOS
zUBxI@-I4l&@uk7vit+|(fAR?6YyH%rT1q2chX3fqleGVbbil`TVgS>BE*yltt+C5L
z<wyMrFO_qDCaJ%bx`8y)HyI9r9xtVtp67`kO5w8Usnv-A-2No-$A}O;2J?&XHR!|u
zruRjnPw%wtx7Gvt3@_DZ__KsxiHGn*-mvF2Hhg^m>GBSM%l<tm^yvQ1HX09|^^<jN
z<Um;;dA_&46G;ir@~n>#|LSdiMcVE1dXEnOv&u;m{q<g=-+}jaIP_?M_qID}@<;7X
z{}wHWzIL#_c96bee;`laPaOe#tJ9weIv=pokq^@O`bGL3ThH{-pWZT!^pp6<@@4*4
zL@3Sj1V1}WpA)}aM)<)$>O1+<S|9PB_fHMVn_Le0)%vNwjy}a4{<FL){e)2J2g!Q@
zAMa)TVEyy0vhmkz(EoPHU(})X&7--Ba(lf9{;$JqC2`2>>=F2*IgY=T{-T1$Z}t}$
zKaZMo=h*%a`T_Z{0Hu5`3U-*0l@>a2PO!ZY0jz(pC-P^GzJ#GaD;@uZ^L+Q^%EC+^
z`Co2tURls;U->UP9^rJQ<4-v~cbY$?^<&mg_NN`H{k!2`x5B@!YpX;#>+4fA9$c>I
zf&T=5^7Q5GaxdhA1#=~zbEH2mdw-DDSEYZH`bPHSviFB|q+R+wO}}V;!R4<YFY*s0
zAjdx}V0&eGPllm?1}~Mf{;+*V9DP0)1pi(3^WsHl-{;s1?3esK#oN|+K>mu_Q~nC@
zUPoWg21!5W*yF=jq`xA4XMJS-D|hT!%71qMGa+EVF8^{?2md?Se@K5)jr2Mm53Tsl
zk_V4JUtJFRVx4GS+7sLF<&OQ=hqVKl4#%G1f9i7U*&kTr9oxHQkC&0WTBUt+yhG{s
z^^QG}y}0bT_8Q3NJofnFA=G!-<HI!n${(-t<h_gh&E?LwMkdOj4;@mT)JM>NG>7^S
z7|Uy#{gS@8{E-?j9e+gj8*}U%`hL_bDbKg>(pbKIpT|FqNc$&(+5RVwz#m0)B3Syv
zF!pB}{8Y~N-%I>k>G+3w>aWW`^%MQItbcMmllny!l)poHgKqniNuEzbL6v>H>uTsr
zhZE1ABL8x^JAOxehvj|ZpNj9uen{Whe*U!>{E+L%%wP5=MJ2YsfxpWeFCHv|{e%kq
zPjP|&S?|oh*uSi5BYZp`Vt;gnGauvlZFL*XpLo8*{-=}mb-f+G`yKr}Oa4a2Z>#Kb
zJ#YMk|LFuemwmwAtTgnA`~&fm<;VV`-LdyYVf4>mp6?$Pcu5}YfBK1E0iM7+`x#!B
zz0mmVbL<iAMfb0F;%$t_(fwEC#m9N$(K)iOILBKNve6e0A$}@%@IS=sr>rmRA5+D!
z_lPt8P0;>Vo;~M}hgc8F^Vht85p~Afvs5nQ2j)M<qXU^1hrjG^RyzKc(=n%g)F=Ny
z{FVAk<NE^EU*=ySzb6X)1H~Jz@p%{dgO(Pclh=dylKeRSllE7Qw88vfeLlMs<FRWz
z!Fo)G>2d6j*Pm;dzie+QDrfQ}{z0O@f#VytU$*y(2oWI1qpcLLcxiot>yK0yL0(=e
zSN4zk3pl<};}y{tA$X>b@ip)Q8so=$at+2K6>-z~E{tE59RD%@V1G3jf9g=5FRX7+
z{GlVk{l|Qu#+*C%5>-s|uNu>c8`$jSEKb)X5O2HE^X6U3h4{ne3m2*xAEyn|!~4}j
zbP76OY?%6-ISh|9)~jmF1q<v64A;N-;$>WpKb)?vruBWkIK8N8k*j^f4Df!pg}=sJ
z+`QOTUSrOmj~UpQRUhvc%%gaQ;el_l=`{?bh^N%vV$)Yc^9kY!r46$jr;6z6Wcq8&
z%1T7|W0WK5tEyU(m%iwtrLHvk&-?9Gd+7hgwBPQeE2`xJbtmJ;`O`({pFF?H^aK{8
zKRkp=`eO6vBTZwHOh+m!@#xSWt*S!WqJObje$hop)0~3#kLY|3FYwCqIeOJzE|K2{
zK)h6yYilE1Q{=RsqVp+psOR$h5cDVS{0`D|o(MA?`F<7XjY7ZLke;IVGm*Y0Bk*-g
zT;#ORdeFZ5nmDx&|NO-azmAGRM70kKf|mz62!6uD^&<wJRsE=`^M=`!LFYrN89wGJ
z*YW)fZjUzUK%aeHh|d#k#rc<jpX0AD%JVfy;{|k#X9nWsXNW)ZB=3nn#Ao>420q>(
z#5_8$MC)IBklq1)tM`5AeY(>$;Lv%9pE5sq{y#$J(Gc$zzW)M#RS;<->wiP%k-YRi
z%Ond6^+7M*Pp-gmee1lUm*7dhBfYQ4?{DDyBoxo9^Y~QG@hiVi1p2DJ?UnBjko<s;
z&MS>s_(9*@dwo*iZ%6qArLkUT;F;2hC+^-Gs9=HH<=Bthp~`nrd9;GH-!8}T#s^gS
z!&F{gAx@|?<DXk0&RA(GuaxUu+#a<*zk**Nw$lqL#8CxbPvr|M_`o2QFCu^B^HJCM
zg8c8^Yg9l|F8UW$@JBBhz6|hz01bV7;eFV<_tsX3dyt}X?$0F^;*?5bJgdR^Y<t1u
z>8*gTuV95Dc<$e26=H*ee~{qI%hie-*29hfzM(?gt;#We5xtd2^7V9ByL)eAg?I`n
zrjPm)2#`JV1$&|P)e4OAFXiRZ{{Vu^eLh1%BVEn)7uji9zt&1WwH)Q;<(F7#RlnX&
zZ{_;%kMdIO+x{5yd)Q+I&6k;{XEnxW`*-w*$~i*X2$%_t;iF>;UZu-Vs5IV}EPP+L
z&_2i>9<cP4^p~8gmB#qX`g;cWh<_6#e_C&b2cmSn(qBWRS^uEFohr@phW_qwmqUNY
zRGQ@r{e8e)4*fl0rIr3ps`gnwp}%oe&S~f`UiFal5WUdfV=6tH{!Uo<2tV`}>%LMy
zX3^hWs+{`+{e4)aXVc$v3P1M``dh2uXVc%5DxXb%A5!JqpZ^5<dj|a{{hUpI4fMyY
zzdQ2N<)jZ({vXlbl{4tCtxrmS&nSM@EB&R>oAsOY2l{KMG|L<M+vzTc{_ap|mM`>o
z%v}!seZWd9{XK29NBE(?6aljRP#XH%?M_2~kE--+`g_X4r}Q^y!Ox<<8&x^?2l{)D
zO3$XhPbvJ|Kj<$-yhQJ8`ny4u&!)e3tMdOW`b+w9KK&Kwk6V8`^V8*|5C2j9{qhX@
zYwMHJ-xG?T^-6!wsI=vODLN&2vAm(bhDvie^tV%`S-#NU9qw}I@0gWV`g_c3PwDTZ
zI}QDfD|q(D(BBc2o=tzTt|;|i>F<O}FPcSvaZbzW5A=7JO3$Xh4=H?>{+?6m+4Q$o
z;hRl=Q||u!C-A?dFXz)=(igY>l76_-<)ja@=<mh1L9eWf;)8!y@55Ch6I;LU`QPe$
zSwBV;i0^mN`_*F%52XUUGk?MRC-^=_;rF|oc%90Zi|Cy`p3n09<23YtT&-ua|HSv+
z?#geE@S}I~eTy0J$M#QEcu_v}>KOJXc|L>nH{|gAhvO-{aGjTK3mSRpIYm|)<69y9
zckiX+fPg1{u>E<b#0HgSee_L<dsKRY^dGPLKBLlXKlpwS?T|1(sT|+0y3R^3L>lk6
zeAr4;IbOH@o;!`tZy1Q!?fUrsOG>5LzVQ7Y>*Dwv^ggW0*`DP0pBUunt-ufZUsUC6
zKc#%WM3x^xIr<aFjE>r4`zcf3YeAahZ}cy#%Gn>3253KpFU&_V|CthxxXa7bFYN$=
z=Nst%t5!Mrhfv6_e~RMoqN0#ONAtxRv$&W<$(Kx*@N|(fWZM2MI(lEQ2Jg!_<rJU$
zltL5y5v2XRf2HtgG=Bg$<i+>`{C)>#KtJjIQBaHZOVa;0{`+auzke2#wAA4S`POFT
z`z6-<dU@YF!+fKPz9&Lg4gO(&8T0L+fyZK=Z)3l4k5{Mtcb;#fe?0$wyOYNL3X~Ps
zzhIUGea8MQnVU>Q{`h{U`hFGO-$(!B_?{RP?g8mfwjy0Z-`AAsXO4(s>wPkM9}Vpx
z%^xfhUmp?dAK*XaCH+SP?{BmGD-G^1egBE}1NeJS=zlEd305P?=a10-o%p_%eIZTX
zZ=f9h9txMkcC3rbL;OB{P`;lNsoewpgn~`^?`3&b;roare^pNJmleTV3Vy!|_3?cT
z`Uy9?{n~eXt?y;=`HcDaJ|dzEc>%um-SYi?$cxMIeF3cms>27$UX0TB4sftar4bM7
z*7;#-1oEWyBEBe%^`<K@pOe=KeqZ&C{yyw<{ZrOQ`UL*xeIFF|ZNINY`pNp{_ZWB#
zk-p)3S5^348p(^lHwpR=><#kw#>8{-d!?7=d>;G#HSjBn?@3ULQ$g$dc3$zjskM^d
zq~AH6@%hyE3Zk%QQqZaA0Z;rk^}P2_@%;^c|BD9=_`6GULC-su?)dNgemUSHrtanT
z)c1U<av?ftfCWDMWqjT|e17ZuW)+6C$DGG^8p}&+{mNUFD`xsE{kb&f_1$Fz@AMKs
zz^|xT93%^!`h=pF*2AE%%rD|6+9!T_sekA%t<Q1)-j4d54oP~yqv~h9&inBwhrJim
z{M&wi9{Tu%e4mf`<INQFP$+_+ALBWF-+}Sd`XAQk;15FNADCXqi~JpX6zhHYz4Ci;
zmi*vPLUdk_;lVHZeiGTA{J!W`{vH!sC^drhKz{~zx`=<!7kWRRFQNzUi=#dMKH+TN
z<NW{h`^V6K`rah<hy5?}+pB|enWpm`^!*3;Q=G?C?-wY4O8d7DmO=mg&ik5dKl%O|
z@eTVIUhl;BJ@I`9^}Q|lPw*G(L;QUv8P9DM)8e)J)$j)zcs=rdPqFC4Go|YhPuw3U
zw%7M|Q5vroQirG<^wr#_*1nOX_=(PQvi*`if?g`;@sG<9znLCB;LG->%E_MiI=7Yh
z;UWECen9^B{5M4($D`qC^dGoOx(Dn1L*fUK$KM;n`flF$jlqw?^{%_eJyYCvCH|s6
zwEktVL1TOYJmDjM%j>;Q$ux~Gyxxm)<4(Uvj{lwf{RrP(ep;@!zc&}X)59Nhk^Tw%
zxhKH>g5URqK3M$X?`e|#jLUez`y5_tM?85|l@Mcuk0xP=m$4s8-IM#F4^IES^4%0~
z(s)7rd8iEgCl>!ZC4H7XB@DKws41zp>4X2J?(_H3h#q_oI)GnJVtQWd`U~naz2Fb8
zA6n_xO22^$Zz%mcW~G12N+u3}EdHu+PuX9d{y<-uzwvzfWW>-poxiCv=)0T0%OP(!
ze-R%te~(J~z^~tTd|r3-idFZ2sQ0@kzTfi8XRp8X)g^N_Tr+gnuOF;S8P5OxMqR%h
z1Y7<6kKIW(D84DYI#tFK1?{jYxq!)g%~W4gZtX1;x45pt`n2yxLqp-!trqb=EUklK
zE(MT^hFP7c&YhsI5GUnNs6~EkdF5S&uuEj=NFERprx(ebhg1a%jUvYe-zJ}?)z8+-
z0&DMq>eF*!{eP19r`12I>JLgZ%wyW;EUdp)D(kfRud4bfSAAkhVf~fTnNO>~gW@?{
zcgq{M&vA)LTemBvK&I6{r|LiCs&C-Au>GIRQvaSyWW^ow#_iY4Q2%e`U^A`#2~{6M
z4`0N85@})k%~HRo)!#tMhHIC+ar=w$Tv-1zs1H@2R{xl)e_GYIZkGO9_b0%Ad%U!O
za38CeiB6SA^b`NBdJ&bs43%f9e?rxdyXq7F3hUGQMq!_*%H##=lRKyD2>L%$`^FMk
zepKF=f7DK4`;Q=xOrX#Q98>joNc&(cGuh8)!I+uqN0xFG^(|rfJd^$4bG0+pKdLtE
za<2AgvY!t?elxWnY?N&p^2UE>vY(&KQoml+zsJ@7O!XfCLuYFL*mBwM`(5>kmO@z%
zL3lIOUzL#cY3t3pYVc54|KDb*|EVTf%gw);>@ThSccJ`iTc+2a$^NJrB2kXYvJU^H
zLld~h_^%wJpweC|t+K^)c$lgFn5xf;3mCFLsASO??(Xa3V91R1RU9%#6>-t+GR*qN
zDB-%IO8(R6UoC;gl%Y|5sPuv{S6zC%Sjk_$wC)s+ZI{p1@3~WbsPvU%4hTOzYu7I*
zc%Mv<DZx;EoFl3nD*(Gs$-j{PvG*%Ls)&njP|3&`C0y4+CI4ykACW*~O5vzJd|BC;
zt1dl0jDN&`)eL3H>x1&9+<=ukO@IF!C1Am{`b$-PH9NKIQ(NxKMCF<4H>mnts45t^
z7uwI&=>PfZkEyzJTa15&^@;Bk0>u9Ue`dHIR(aN)+M<j4LG>&@OL^|A`&}|+-QD$A
z)>i-AmGG?GruYX{!KtTd8&Mu#FDop?y%UcbW|LRTEdCocTnRk>=5zV!@*++A^4i8d
zX)`l0JY=kBOcXU{d(55btdZHBT+v$8*po_`=}ab>8Q7OL+FFW=iW<|oY}y<$8v93w
z`i%O<ksh;Wx6#<!++<Kr(oARf^bE$A7<)#{fuU@FQGHRZsBLSRE27DAu{IfRxL5?c
z6GcmkE`JCgHg4QGINaMaXf*Z=4(uE<62R1MByYH;ExXGc&1Bm$do$Ve?zU7q+cPkj
zh-cEbjHZXO13iPyc<j0{ant2jh|(Xn_l>4{l6|AS%^;@<A0^cjO(TOned%4pgDLc`
zDVb;!i>jOGPc!{#p+Bwkr;Yxspg-;OXJwl(wl*gd@f9K#&gmsnMYWl%d1GSbO~z{D
zw&oQLMj{?>Fq)ehj8gngIjqPlv~~E!=PSjQ1|xp^3wo@j(Gy9Y)t@!9$$_Tfk%46Y
zjZH0gh)G{tD{8m2G#G6_(*`Vw*5;U)46bZ2R<<`7O$j_CRswTVQ=-9W4T(ul3#v55
zk>8Fg&A2zs6Gls-Q5*vX)LDUli6g>DG~>m*mJOn_t?Bm5Z}$GOY#=kxJD4^a!7Otx
z7}^R+rp@mD?p@oG^`+j+N84H!h_Zw?T#MOkVmseo4mLq7w-q&->77R7h?&MTX=GK~
zn#*J2L;a9%UH_`41U?wl8xvQWtD07Tc28T2QD>%BC7Rb<j=hD(%&wk|!t;>Vvq!7C
z;f#1m>uO2$WP5H*G_~IZfWg!%pY}1a4^_~g8zY+c7h>5>moF35t;=9RKk=5_AgbH&
z_<<&X?;hUMIQj$evIcI{^+5qrjk(K<GP?$UDHgRhED(RydU;Xbo@`HJcKE2z`>2St
z*NEp*D^PKCu<_P`RJt+KEsWI4D)AYxvE}k2@Sri%q=_os*CwJX|A)~3+~>JfTL=6%
zUMfO=F5U^|*Jb}qMB3gW#0T}@Wn%e?hMeZ{926RIn^%g+3ZK~41}d_TY1#oX8M(2}
zY`zJ#`V*HI_4b&JnY}QJH~`Ywk;a|bbs`?B8)?uY=BVc-9RzpQjU<d-v&UG!KAvp)
zj;5FC+FM1UuD|JPBG!CT!xukd&GW_jrEM)YeN^ZkEz&ZE#R^f=-(+Z^79oDCb)w0o
zUAnkl)0+`6YdF-quoX`?>Y`Z`_2YMzDXV&gPxG&92iLzPsyqH&^J*cjI<){_{QP^Z
z$xO8^(2ZRKYqU7zw@;il!Gjfv*7l|q?d{DYdb#;tU2D^?g?cq7<_dj_uD9u8p3r|K
zbet&rIezk7%wPFwt>pcBH1Tn<pyjYG(wg_LMNJ}cRu|Z_epp082X*a)C{JM3^MtNt
zMKrZlG^Y|xReE&jmwEzxNeH8<p-JxoQ<^IE>K6Q<bads1J>kV(p<kvK&Gkg%OZ7<V
zN?m*pO>5dWMWnw;YwAzLzktoAp?*!=rZ<J^%o-uSBm6yDH1UvtRf;}MT%qe*kjC$)
zV9>7>p(dgILI?w5)#hYzEV2E1-AJ`8_ZVj4ETRE@QA<aI$5*at*NbS=K8;9fF4wBt
z%HiBUBEl7#XYJBN+dNIv*LzClY2ujBHjAwgdvm3xi*I@({eLF{m6|B=X!AvJr50U*
zuXHwHi765H`aY@05?|KEodIpV$MYG@TdDOfO*A#Nw=dMZn>-8J7HZ;?;;kmM4>@yM
zkW7e@-+Cg6S3Ti{nkUlyjIXp+7r*h9q3ATV423%_!jOr6E#%_$y(ns$;zC=n5&*vp
z=-lUEw-#;MrK8gT)O(gTwY084Q<0`NpID_YKvUYNCU*IvP1V|*)@n`sOc!$lp^JRt
zs1{zMdm}B&eWivb?)2cV_Po{Ml}JRI{<nTn|4)VfW$of-L(`uW=<7<esm0KuZ7rx0
z0s{To=C*b?2u&>YVBPTXM7*O2!^7o8w-;>*Y5(qTA2Ek_4`Vdx?(1$#CU^A=_HRpE
z9|~V0Jl@^GlGUMbqbMr9nQy_-&{%Jx4a%BXwI-y$H>7{fi*G^dcLdk0YF?QQMVhi9
zyf&#_RpuQJijH9TE>RMTZ}5gz2=UQCcm;l<>TTFd+UKQ5@j*|xCFIeT>9N$GdBfZA
zBUCN=@>J6r;nP;?^{K?K@Gg`;{221_LkY)=gBXb4uX{F!{J-;Q_X%yesBHe2uDwr`
zenW>75Lbze3DfQwN-ZrC@4#o`+e%A1SGF|A7X*ya-m3%Nmjhap@Vw2dXZ>Qp3s#9|
zL~v11|9T+2IOy{{<QHwF;zqCU8lk=9gYbgi72;99?_%M3msmVFe7}e$eoY}ueM|e_
z3eE6|j|Gv^vjN|UU^F4L)3DxUiRSjUrg&TPiXc9&S)~00l|q{55)r&c)3$rHgkSXe
z#ch5apXA(Gs)H||4ER5!X@|U?n|w<X?eD>>#6JDF`1rD>M54KQWm{YPy~2OMCoUEG
zh|pgs)}HWchA6rR_Q6(Fv`J`w-E%Sg)q90+9dxZ!>u!RlZR^l|FKMwRpQil+xhn$R
zYfECdiC=rQ$2FWd)yj0Sx=f4|`wo;uoADAeJ}tbdL`!Spon@YNdVSN%R!zJm?8O&I
zn_9I?dQ$%bZ5+j~;G8L|X^#I)ENO<Q@M^ktsVHy0E)<9rY2Ogud+@?)k@i)%8SJY1
zOZ{p8j?s1~O!MCb#AL}^F`)GJUs@#QlKilI6pepN==fcO&xOPfL^R|tjRW)*UrXpV
zV<86Awnaj;`^3*Yad`HCKN1QTmEKxf)7&zzRJ&T&|3>KBJkiuYYD-cpO@B?I?XTea
z;zRz%OyWyIUtd!4C87OBh|fjzw-<Z=M);7q)hoUd4C4W74~zPnz9fPl7vlTEyVw{0
zxbVc_Mwd5TUKR*NR~)FQOPj@ki{r)64<W+AfG@lhvE9QJJ25&B4);A&5o`NcMKn2A
z97v|pBg5II8$TA(f9j9)en?xotvf6#Hb;c_gORx{ulmD@fNyT=V)zDKzb~Tq!-e*X
z`u^1R0KVhvzapUhE{p^FS8way-0*3kJ*&lfzAe0UBHC~nG))g}2<W#3!nIl;oC^fP
zEudmF5MC4TXn*bz)dB5sq5VpT79Ah2720Jmt$@f;!UHCUZw>gh`CjeTK=F?w+I61N
zTRf4rCq154p??TJrck2)qbA@hR(eHg%wu>W{g0LkY@+qQPn2iQdIA@Fv@vfqg@uAW
z;lOUMSRsHztcg~y*e-PK{QxpO+Ov`PP<r&3$Mdy_*dv<b{c%nBzFX!6+uAk!CRK2;
zs2w&t%f$zURvC?qe5!bE&GV&^ra94DFbFq}4h>*iFtI(d4k2Pk%%}f1zt*UUPiPuG
ze0>r5QSIkqJ*=zke|bI2%Hco6y?Rv~Ltv+0gGcZ5*JK;_d%!4Bu0>b2c*VPer5M4N
z?kv_m=J)3C8nN(vzRdqzQ1pmJW&*E6-RkpQ@7HhgX)QwkjJJ5dC|`<09>;`lc}PDQ
z6hA15wjS}7v_PGjK8sf=f+j+o#{RaJPifKqcSn}>x6BjTUuxoDz>85}4?<dPryg0k
zE{wSHwo7Szy}i>Traa=`eLW2Z1+Ux1oSsjTor<IfBcXVq!rMMaJWzZE0_%|`UmVfi
zLZN@7LVwPyE%xf~*J!!<-QJqejm2V7eCcj~B=rMPipXPLu&hN?BwED0_GZzHfG{(^
z2N77$;9$RAj?Y89bxDI!v?t*2?Z>^o=r*ya!3c>2V*T|V4J<=Um57S65_&0g>F2$j
z2pDg_9kTG2hxeBTKc0y9rh9hx51YG9xD~OgyMN&ObHi~_>Uo<MSagy2tsZU{b4&lU
zG&~|I7Bq=6BrB?$+ePiXmUgkcxoJeqt8RT_o|u~`4tTbdUDDG2jd|kJy4YHFNqc)D
zp#MORuWDP-T~;0n%=PYC;CpIb*rSym>uSx6?%s_UKi<?kC%j24n=`OU_&%Z|(h_%s
zv>!&oM?|T&Ll3`DhNZkncpY5GSipa$(E0=617iN-)|)naOA)_Z<1Jq~=GCy_TP^}8
z0^X|9;4MA?>v-wmQ(g@#Za*y6E)M7$gy<466Ens=-jd3)SmIEnf6OO7>(}sk$z{Rv
z!2c`}NueW<{%MK-x24*KfEXwXKQ<Sk7Pi~2D=T_Osi&k&uP+XKM(98AN83ikg4XN&
z+E%~VTozgRr+(jG2>tVTqt2)OKkU7CbX3(IKl<JKlsPk#%$bs2$RvbN1DTdgs0k%>
zr3)yAkYpf|gph=yq9Q0*Kv6-kqbOp%*u7S4=(SwAR&4ij?Ojw<Y{~Dl&zu>Oklc6Q
zAMdTV-nwV4b3S{2zu(<^JxLC|HyS!rq&2TOlnIIa8lq*YtEiYLJ%NTY`XYtR!a+Ln
zE?bF>Pp7H_O2S`SSdo@jyBs~HFL10RmA=L%p7d~_GVqnfgLf)aLqk?Rkhwx*h`AZx
zEb#$M5?#G3i2uTzp-jg@^pRIJG;Ft(SJe!zx*?sXd5^H#n!(k}>IXOYiW_~!OMS)9
zsOLt|rk~lTIW(EmM_Il;&BgDd15rb+;@|PU&B&2G1H|I9##xyUWd~O-Q^uke8r8gP
zRSV{+W@V#-cV`QO=a8%9C5Nj-VDuea5^N!Q-^QPpk?*xuwEv7`M%QLo|HLn8h@D`=
zK<<deS#pTHkrIjHvrK%><j%+^tf)_<<T^0I*HKv{>^rMK(~DTP{vGAG&wK~}Xrknd
zvkEvpfwoJg0?v|{p2V}6li@dg$Gy4D;k#IV*ncBBhFh%=EO3PnGiUfx=53CBoXt~n
z5R&kZlAw=jE-k|Pz5W}^bcPpmS7a5c#^YaqBwYF%!ykpR^f%G4{Dv?xeG5;f-<72L
zyDWT+2Y2+_wr4^ivO(yH49iTJS9s)^a!zR>&v%t#%R)jf+%4VDoTbm9Uk!!Jc(?Q3
zrQt`|5Pwl2A1DM_PM)ae&n5PCu28Hex9=G@8ou)>-&gdgoNxx|L7wR<waKVAU6JEo
zf26b|$k`yQ{S6{Lo4NG&sX+e+&$eltl`}RXk&3vk@qAeN3B%`3^nz`@>E=WVFjSzg
zCpvvz4Q}(BS(5LcirfFNTOT18xP3v3+m~-~`Z5hJR*D(g)Ni=Qmz|R2e^^d9lXMr)
zqH1{SQts~N9gSf{D3$AjSibYjw}`)n5>HwB<M11*_kjvuB-2ZFTNbgGm^4<Vy;=Hl
zHP7ukJ=N`-oa*$2#NjM9Tz2|COo58(G0gJ$|B#Xbs_ODznWqou{df_Cd*Rh!psx_)
z4a^<5km$8!N+#ub*$#gW=b4yhm_zP}9X>6Qy1ru~=%J0|`e4?Z0N<c5IFi%ZO#hN+
z+WH|Us#1-t_v3wOZeN;^z~{)cpf7fUv7J2E7rdCW2m1<_AH`7PHzg<7st(Na1^<Qq
zz7_e_){ov!;1`lBG+J@`zBeQ<q~y@Q46pS=6r!(sXCD0n-uqbY`UWCC{~b2?ET8%r
zKGWageQX0beNIxN><oRGuHCCqK$4$J9O4f(%6(j+YBY7B6DZFYdWTU(BK*l0%G{Oc
z_PG=L_=<<9`tJ#Y+`f}Qn2UE_VxF%U$(sgsI^GNjm3(1%t^%HWJc}me)3u1VI;HC?
zJoih~a~08O103PG@E?1V$xX68fEVy9P$O@3IDO0ULjqzi%Xz+%3hpj>obi6bsccnu
zCBi=qaJv!jvP75f!Gzpq-$UZ_u=xBnA=K;(e`WY-0E$BpTO}jG1>&<*d`=gijpDOS
ze0JfF3NC4fYNRK;GKr9+xGkRR2jRC(E`DQQXJA!2d)cMj*q5E8y28^F)fG<lC$grH
zjA7>79CeUe8p0AVPMJpz^l-115-}%O&mDtM<X@3c-Yh5d=P#iznWT-(rGz0APSfu#
zV6-Y-xjvWU8og_=d+V7#>Oa$2HA6+Q&{teiQsN))up}qZ!z7iE$A1NG)c06^>ysUx
z>*2R2*{9fep9Bfbnv$>m%fYIMEl;-}a<FqqI@d;5V}QXp?z<ar)6+aQTdJF%5MWEP
zoM&E>OJ7TTjg4QG%g?a!mAQO^l?QFkz;-K5%5?jjHro!>p61an_E`A}8~y2}kGu?j
zj@mD~0xcSU#o`Js*POu*Eq2uJN0Ajg&fwiHJsm%DXd~ODWJ&k1t5sLzD!e=JZpV8i
z-n|}I<U0u!e3nJ`u^jy{<#^6~mvjqd+0s-tPgQa~IsVbOvpk3Cp=8AsyoB@RHfQiA
z$=TfILme4RQ27We?+;pqh275hTAQAYx;*qN<8?N^#739<Ar?9Z^^=AFLvx1KICbnw
zOvH~2Kf-@dx#|kOrXtM`iO;ucUL@#Cb_Hz~=+Z3ES-C5ChlW9a4$V&1KcGx~C+eSU
ziT-VM%ur~ZNL@N;7vDA}z8A{kdxKD``xKR7DzGxdylA9@L!a$W7fJfAjMN<e$Cwp)
zP0~kGswWWfOEYBB?Ai=jrA&T(8Xd6d3oyGgI#a(PUE+S*sG8znE-CzCWk^l&d^EiO
zVmL>}i!(G<z#ml-=cn_R6=}3e?-f{<C-4syc{^55$g)-CgPe|*Wd4GJ03<;1fg;zF
zt*9^mQBms2o=pr_M$e+e^IY7k%Cjk><SCMLM%VGgY)mt$d^RO4P2{KO(rii!_Sg8g
zjJL}Wg!L?iv7aq2->}r2<`XmFPsQsCH|8m8GhM#LsjlXQCF~cBZ86fy%xkVWRVnp_
z!>Gp_74&^E*YjR3#+lOf#LibxJJ|+me1{_EacA)CfHFvP`WB?=g<65jmyv>Y=DTE;
z9v0<)1tyVtrJzvfh>ur%a#QFQ#T~rM-rpBqn`*}u5VtA%CKa{iWrmS28Rl&NCKomP
ziykq)cgQ}dFCw=;Io0LgkVI=Ro?V@+t|2{7<+iiPlCQDng;}aatvbvTdvhbcH&K&C
z33S+w4oGh{6xqqDe~SniuF<oyk$Z=0{56ZbUdHJw{hK7!7G(DMHukG05ALSA@iN>r
za-N+($++Dv^e+Fe9=eBlf<yW!g<QXfW#eyA?#L()ocst4&M5ey^GhFC_4}zeM61-T
zOdltux&3K2w|^q=)YM$RZ-wIY56={V7=e}=beF#t<1Xe2TvDKKWqsI{GAv&JSLP;(
z7X$0iemE7?353)D1S}tx_k|nw>vHsdl1*Ce;NPaW{aI;F|C@;-O+HaP{+)^Z3Mt?3
zZ?GrMRv?zePPDrtxv;tOn-{_CHqT9hM}+rjDCz5XZ@ARA&$4Epg}uXB*pjeU<0})C
zfW>;Vs=vh2`I%O?e<|KMm{`iSiZJ!IBTSiAm*1xboBe_35^_UD;Y9UjmA&HC-;>gL
z4#dlJxBrOd^8X=2_(jd_-+{M5^ZJ9kGueAmW(X-ho@cQVn>XUiwj;{fPj>!Ivhbu1
zbVs5u{x}v(YOlpe>Pa4U9n=4A$+BK)Wp~TUF1udH`|=9i<xf{~oAGfq4=$%sI`>-m
zQeunB;c0oDD<||1Z4m#PgMSChkG*hE*WJEs9due1kC_{IZ*~u(hb7^Xwn;%kr1Qm2
z;RXU!=XCprrkk2c1ww(SDBk6E2OdsSpOPS*#WGXe5%FrN;)OfcrSPa)=xII^QI8<?
zMuM;hxZx0zX10>fZ|m#wM`Q#hf+;lrXF57W2n{nxk9mYsUW-WVtt^Xuoan}Mfh#gB
zkvzlc8e}6Gl<13$Rhp39A;IBd%qhgqG7p12Yjl3TN6xk6gr+C;artK?!`*X*<8wX2
z2jbHzKI_EiTshS2FUhs=l_}~%Nzb+Pq9G}?p6Enj;>9$49u=RX>G;%1_#|b(L^#)?
zZ%G)$l5}?@Nk`@{C~!wUgg?c5z#-mX#9!2aQ7(FN6SxsA?@O#s(qFY?*>WxHOHD3!
zh?I7RKaki438^s3<@7)0q+8O+&|ZW@IisMDzZ5|UzNQV}6P@nJL?@ODPVMWCWEqac
z97E0TOG-g8$VVw2$WAbv(KagxP&)tA<?_GA;Xqt*6Da)(@|puTx%vc(5c+i?k;<Eu
zOw{u3$RMXT;=gwUQi+zLP)EZfun5lF+^b_!j}tECYDni?OPV{-h8)x8mfsqVRLN})
zJOEb%=eyi^4R;cZG|r|e=)f6{49m{z(<LZy!~b}W^Gu|{R1ZSbi~erpEH+UVac9l7
zI74Tm+*=YRr0{_@1u1b+u2oNTTHS$^Uar75!p(G#sTiia1CPT4kHZ5#m^s%5k7m0A
z%<T$9`an`B-o$y9*axIrocRI2i1f=62J%CQk3%BT*Y$EovJvT1k>t@AksheCB8Bs5
zT|9;VG<+#ic)3UTtq}dr$8+3)UT|lmaOc?PTL&Dw*qI!7HYY1|X{!D!3fX=*x8I3L
z#9vbQR=6bIb)c<t!ZqXqGl>TvD)OS~i&#&|bOmxzGY39%nF<-f-i2WAawP{o@#X{<
zB@g8Pbh#t{bRlKaGDOObwYnp<usQm|_Rs?w3Sd8xRh?%(Vqr&;;YemQY14T-Qnuag
z4FrQ40%5o|Qg;RZA;Pc$uDt3sYioDtQ(`A4BE~9E7CypzoGZwt)K4-Td)4CQ`F(g<
z4$pEauPSs;7S;AbtIqr`Wti*<T{K=s@1Z}F@~Nd5i*J#Js-$sjzLx@uQbp1d>R;?T
z8*RC+RH&Zd{9bYuW`MJldu7^|Ku?c?Ve)V#p9+fuXA`fGq$-k8=f65x{e#%$PUUfR
zKrr+N@p7CmLC=ZXAoV_eRq16GN<5orwH9^8S-hZ^1$t+)62#1MrgjG72N^p@rgOLz
z9;kL0j+CBFNJNefbG3^?WQEka1o;k&CwOg+w1BZsbtP4GhTbSu&tU8iU6?8H!^wx3
z=`mZ5@K~u+$-{q<CR(6A(WzVlBaR$6&$o++p#2l=D>?19MZM-JxpbKPFXB(Y`cyhu
z5kV+sY-1{YEZN|J?Tp<nCUFrQo+R<}l4ZnNNhv(=JpLhLWy85A^psb+hS&#AT4Uje
zvIqJLU;OA4{tF|1X^|dvt9RJ{Sns}}p>w>hz+En7#2nhymtWJ*6`0T5t+VDNygHJ`
zOZwE_@W&_><CJ97_kqip=$-19jkNH0lS^xg{f~i>K?8FwuOcOsA(ivjk8()vGU+es
z87-~LnrjEw{AHL;Nw7RWjn19J4yDqM8T@x9og+({EL|o^8IrV%OM^JSwtyD&XKa)_
zf3hpER+V2JtNbw271*N6y@)+GMZ?r4?l1XEo;&yeSL9KyKwT35iRtPn?hYg-vVK_j
z-K8iOjdBMjr($tW{wfbM*L*!wyd-@$j{ke0EAXA7T#l^(A={giS+UArrQW_!i`5xg
zSZ2R%ywp!@GncYeY1>@(6}fB9!*twPD$6TWz8y*D$-EIWdlwCopPS_kRIVFkwMv+8
zln!&t6nHwD_abZK6ha>Qj<Ne0`@3pgJeO@Jh9&Hh)#M4%On;y<oHN=I=DB?$Pb7`T
zl63I^$(}`StnqsX210!;Ud0_=pt2v6Xbh$oi17)t+{Jt5CR{no=|4%wyqb4l=^3P*
zO#Dw7>xEeJ$ti}Ir*t^hdrwY0g{ZtYJB5b&ii;LtVZ+07i53u_G~b3r)s!SYf}Ev4
zkz)bTv(xDyR(1WS%9cln)i@!Ey(;<$xcSah3<~X65xvPc+_(y>{=zHMrVcM27=Dwu
zpRs8k)I5#?#{;DJSw`>y;>!)Upm=_&)y<rNA$>68i2k)}w)z9{EiU1c5!5Sq2C+L`
zYy>3?Bf6DyZ0{RJ`e%&TW;Yv#W%~=6wu)FGV{mym<%FS;3)ui)U?Jy!cMDq_lv4z6
z%7fVef6-FLONHNIcaDDq5uNX150br(u`69s%U!`ICR*ySf<dkYS(M{DiMYIqXBJOn
z><|Vw#vzc!TH;)6`xO(gIlf(tQKp;Drhx-}pGkbM;V5D@W2^(;^CjW4U(|#%xHHf<
zpD!T$7ZNWueMW0kjL@(+juo&-AgHkCWZPJc{@~IuMcwA3X$q!3oQ3yeU#p`0)lZM0
zUs5tj6|2~xY-NbAsFy<QZ8w%{__tExzc7#Oh8HtKOE60&BG^nK3Qxi)k$#w`dl3;w
z*ua4Q0?sB23CxBD0z(~Fle|>+2127GhnM7?%D_PQcFr;kZ!REx6L!NCyV)<~3jR$-
zc92I)uHe{tmd7NvNXSIp!SaHeEg&pN|3d6C!|Vvs@50pV3_Fr=U_dO*eQRSz^y@Z>
z(OMU>SV9DGq`?-EC?yZUhb?keaJ<CFBnnXr=`FD0pDuO=Wg*Sj_h#nMvq0{jAZ)0S
z_;YsC-Xp~5kQ44@1_~|wM>DncU6?fw*7D3?qJ`(hY`9rrSb2iQF8}iuOCN06FgTM*
z&%}|O%Q1CKNFOe*<Awl|>LxqQL2wgC1%mzyu@ZW@#QI_oo0#}7PHU{}5y}euEU}<n
z1P2lKsltT_vcPnwh_|1mtl-a*I~26AbM2#$<X<U<I}1p8tR$=G@e(RS|6ZYcl?tSB
zG0u74p6IxoluuDX7)di(&J13o@v9979HdW19F*DF6gDK}+aq&&-VQ$}JgU*<!lPKn
zr+?cMF=sQzE(&wW4T?zLbxKz8dm2B`%Z%Tpuw-@~lFn7U$4K51mR0<e#vgE*0^5;R
z*9n2l;OEuuz*K|YM*77tcwsT(Ix93^;t$8f1$l^(v_mDn)u1A21E^EzYEhW8iZdiy
zr6av5H4!tqq8gEXEISCoy<9{cCY-;An?E~5qK7btKhVW+$0hQ@5hTZ8IV*%@e8y>d
z?;Iqf&q&7N9Lcg&=7UW)_0Lf_N94gtSMXMgHbN8=GBTR}tt9iG2O|l>B7vu4(E|cd
zPG|guL}#GFo%mV=f1H!{$T{IZVE;{tB5^}V+!eXPhY0<XiX^>IMT1GVbE1245kFbQ
zKTGsMqKJAZPIl{$Ak0P{ahASir4ri!Y?-prD3Zc9_K==bzk9JX+$MH{ts6E5DPFQ?
zF&c}Jb~>xr@}ON5&RGtA2eHqjM2oCUcVv}ZuDMGSL?FzvEw9sEB`+k1%=ID5D!CWz
z4;f#TH6&1)!?9Jx0dKp)p9=wcbUMyZW`(zD>~13r$e`!p&riX54!(e}4<KOyB?sp?
zGQ$sOuHe;P5lm#Uy%I#Y*T^12?dm(2%6dK^cvq8tvrH+$xej{^>Vu<(0hIWkISeG_
zBIz>2GbJ~y7wH7XDd=(pk51$x$R3b#gT8RGnShIBI6TjhU5t#dJjIm$ia{D%RL-CQ
zCBam?J6IO82Sr@Tg*OW@V|ZfBRa^*=9Apkx7+K&B#-`*5FZLrN3(WDdhmg7VSuDt5
zrzav^=><#}1_Qcok(s;XDS1fHk0=nSj8agt&&&!UuhfvfA5FJWiuH36v2p8&h>OfH
z%5m^9ql!X29yH3aDCp<QiNhsI>y0S2|BPtqZ+HZ;-{x@4)}`JNxLaN2w!JH-5B+Ps
z<q6DwS+oI)a-g2h#4sNNpXs9PhX*Kh{z7dW_LQhHzL$jgzGPNwMZdxJSTg%dhWc@m
zzF%Ur+d;piIE!yeRVUQZb7^!#kt;aG6%6`IJ|yhfcvRsJ&6Ad-(GH7lOUq;fRprBE
zHcaB<NqT*jTA!SUU5Tn~ITjFYIhD3FhqU=5_6}JebFu;%LVQ+AA8gQM`ToF{ST6ih
zb%wup%4ae<fs}oWrpr`bo*a4#t2SF3k#+r&c5@1SYf;9N<!MF{75=q9&3G?P0}|<;
za{8285c>~GyqK)-rtlkV3X<fcEXN<5Z&SU2k}Riw6y5KyQ}_!66?Wey=o?vo$`PKC
zjxzcF2Fpe?<-Uy!E0|Ilv!qKj)2?S_q~z3PWfU+?(f3vKr4!QKMQ;}Hze@C(9a9+2
zKwBzHlay)?HK=qGOA5T+*vA(bYh_;McoQv=UDsHuekXqX5S-_x7nbU?79{e;nYncX
zhboB|5?kZeFQindGR1*7`9jjalpOMAv>Sa?5XkP-#}nq*EkZNS22GkP;52mgQC)Db
zCQP0!=~*nr75v@cE%O+Xp<e|LvkpR+Q@q=WBDPPk(%G}t#7l_2@#xR-bXN&}lsq0?
zj`5h@ec1-3z8@o}@CZGoEZuH5l>>c6Rwt&<lPF2cDBU~8@~hPud{tHY6TLoK%O=Vc
zV-~u(mrx&8p_X$_qf0b89la5p&wD!w8=V3}(G^RQ^RXSLrk8{X$}$TC(*xdocd$Xw
zWqq2s$#=OhpB^~1w>KEtVujihn3E$Ie$CMITPntJd1%9OXfL<DWT;=vnG^{IeNo0M
zIVcsYlP#kded?hd#4(eL)xY2u9=Jlg`shcoSiLcay=EKY51dhL|4H^^gF<!Xs@l56
zOIll|MP^pctUOZZXm8SoV6WaiR>$gUYzcEaRz+cD6jm7E4i2xj-gBDt%_42@X}Q6Z
zo0IsY(^%_ReeY>{ivPfJ{ln#IYRS3PY`s=I+!y-%G-sqvY^`~vnyo=Ew3@J!ZBqii
z?5oAL`-sgQN52hbzb)qG_L>c{^t9dBpLjBxujo#m#P?{{a$>rUT?WaNFhPAXUs{K;
zrlPMSk2E9I2F(vDm>&9SLISoRY>P@ZI`vm4NIbhJT!XH$oxPu?Bo(M~KD$R|*g&&d
zQTLw6E|A&hlRf?{g}3!)>t%+`%lYValazt<l|_~0QIvz#v@s34!?2(1$}IKS6WJ<R
z#-8%&y0VSf4w?P5(6$X57t&B%sY<6$Wqu~nV+Haw>R0UBMjSstghdsJ5T`|HT<mE0
zd7`=%GkH3{nlOBVnszF1&5kWOzo3>)liyM$5Sta@qs>*;voN&8LIySYi^IV{D0p{<
zii>ORNTfP!du+ZfQGGLkdDEn?5>$*OzDz*R*LD^0$qASQ!d#|`K7LUepMn+UNBgVT
zbFx#|MT0B~7$^2K_E;Ruu-s1UcakQN^+E^BNaG)o_8aQ+-;8yJNy3vX!6mOHn%Q4@
z-9eL)HVk`+l)}CYwZpk+DSu${seReoscbWM`U{wniV=;4>0BCN;k%hD{3EeteVolr
ziPjej===WcG~#oI`<la{7Kv$va^WQ2zO;99xIvP93bn4Ot_c=}D#kq7fc;^77?bjd
zjnmi{laeCOHHrkfjhL5y)QTWhRO<Itj1C1+b&nfYKFn5AxAGiZ7ESaxF`P6SNM&O}
zzA)B7a5?Dh<O-~+cUoT_B#vs|)@Z>F*0(qf)1={63F>#T2Vvw&Hj(s_CAi?!LlIdj
z9jyI0g<a@jV@bZB(O*cZorgl|3SxA8>jXMW^+eXVy*2(_F8ksJ_5>o`%?1pn8Q58N
z>$yCGPA;k$Q{)@O>}5n-u&paBTZD>>U<_`xoyz#I6e>OURfNSi+BbGwNjO+u$t<i2
z!{YDquw%;VV$R}0*aFDyF2)u!`Kwd0KB<mr;;HBo1!<+n=^xuf7;z!eujZ0ylUAtd
zM0e>JX{{}a1S<UgiqbbZzn@z#I-5PEI3xGUV#;DIqd)sQtuuSmUD)=OGDMj@hjR1o
zD2j}(7*kqQF}D1=LDFxz40*RdvA>*456S#4B5&TEkO-7xUw&~IL)K@-Ze}z!Xxgmk
zETc1Wae2uYe;_y}gnr3^K{Dz_5a}+}yQl@T7YMh92>;H!!IM*e>T;?HmW&w}EO}hw
zPb9f~cP3D8rd@zrq!e0CY)i8IcTZ8U(sxM`yT#?jfd`UStVKt6bWx<F;xddGNPf}n
z4OZrHb=`7j@UrDj{FN-vSM-isDrWYniMR=n?_sQc7F~kjGo$$f)fp#P9w)7a*dDRM
zB;B3K*EUdM7SCl$6LyJbv2;oc?{H%hj528}Hh>08g5f~%!6D8S7fN)?H1>{_f&I23
zr4pNHVHq~bt4P>n{BMi+;e1~$A#EdH%j7GWv*<31G>{!t14C^%LQu81p~{=rxNJpZ
z^RoK-ypa;_uXdKcZbiR{=FP~fs&So<;_Iqu<(L4M_Alj22cZh#ZxG!=V%lVtLwR92
zA@^*>SZK@$XU!@L9U<w7LD<ZVqxUxS(m%owpG4xn!uWfr>Nx$dG2urK^;3Bs@$p>#
z9;?G2EVW$Sm)bach*@vK_Sn_P)v^|$gV^GwNIM+-8--O4WVs~WWF?6))Knj5p}_Iq
zRMxb?@~U0EWJXSr7gvM^0t5Yn(`Yi&Heu_dEMJV>Q-0-1PN!tkL`|t|ph?*xTO#gN
zNt>^*fXoWWR?ldYSiYq&vA1S8>u(y&{z~j!a{3<WL(icx%Sz0`GOmm6Y48p#jM(LW
zNv@WEN!YtgANA&!GI>YZfMDfS>2%g87UJrQSlm}xny3#8gGwznVAt}MREkYJZ5lsV
zE4MD9xpswH+0!IW>h4yt6{TomZ|lcs!?W4PnC>b3j3{8^i5Nvi&;fQ}k@`cDaK{i7
zJ5!aB#I-)I;P~}o_U$zVGg`DzRqHjrriQ;l*6Y)F`-uuJlP}_2xeVK@vaz7!@`p%!
zme_dChT|}~q-79mV=Ncj!HHcfi4FV(5-JjBQELKasLJXLY&WKhdhvfUPw3QCWa)1+
z0*%XB^L6g4#m0HRE7(}O_*7yR4;$8G3vdL{6`Dh3Ew%Mc=ZOOZ%}uo~--<QfK20|y
zSl(y+JjT{(O$ch~Fx%>(Q~7hqW&POqlI<(WGDG#0_Ep>^(^cswCWXm1N8vwWL^szL
ztSp^>0Y(ba&8UDL=}%LbJ2*gt@A=y}R8bOK6sdcADw|cJw9KJ@_9y&;&Mxr{8a-kq
zr;QD@(^D!AUhspGgqxub6tZlZQ&bZeUGbLHIrf`@@_16-wy|bh`%l<hDUHNbAoiEd
zC)>qL3UXea&NeY;1f1y>KAG5@0nVnIC2KZEg-X57br&tck5&|3xY*U4mPn|t7okJF
z!Qv{;LO&tP-4xEGi!7-_zhtz$7gaKM@s0$gHIdzEXJ4HoKdC9$s6Ig!ZE`D`8Iq78
zUb4{hI1Faxt2LU-sh6e=#dX)%ZAhMyst~5($falv^Q>o<2m_c;Ta3Q$Pj)5ChJA?3
zQ4^L1#|A@IF}YM)D)sgSzRQv#sO1tbkBlxKU0hT$ZtTTIWxtg*MFJ&bMwgEXSgmVu
zECUrrGIs5;jj(W9e|93v3Y^!Uha^WX^9F;r(z$`L{_^tj(W4V+;h3UuI5e(aayD(V
zT9GMVktB3CTPQu~=Tv8%4u6cRV8j!UM&v9nB>hCfd;phbi!6^#&I7Dy7gp<pvtDo#
zXK&Xiii`70>zJF<L5KBf&Z@YmBOaEOWxbI5Mtd#TWE-4gOCEid4IM6$kSo^XHW9|J
zwe<@PJ-HP^cO@triLK-O`Yio78!5MBfjGr&-EO0~3Qwnup?f%g4|mne?0nKf#Lzto
zVQ1tanZIJA;ST8)oBX-T&nqWXS-rB=M626Z<%L1J!|L%ZS5b~+r&S*3bosXTW?|BL
z+3A)<DVwZUC@O{*Id*<B(TxdeuAO#d=wa;Dy~hpU)CWT)NuFfS@rCV)sU>0i5MQ{|
z?qOfnp}BV!4@gcxRvjo~X+ZuH8GZK%c9^8$cJ{Ch8@K5XC2_djTkId7ha-g2PK#7R
ztV+WG%XYx(EVgmfwsJaQc7f(fLOnonH6C0~mvMSmLKQBhN$Ov$>=(Z}%FYMa=pAN#
zn#onk(n(f+3ldzWN11*Q(>TE+Hc#L-#&09q)`#BF6qk*cEk*b#4^@dp`@lx`%BcF~
zpNONgyOApwQcmz08%|)P(4#fd^@Y+?#8K-iGJU|X73|+Oi-oa!RVtC#Osg#-vw<4D
zh7G_t)AEVJw)Up)b?G#XUP;C75cL}@SoA?31v_x1Le94kzbA!nm@AGGXOq$*(Tx%m
z6jIoE=UZO2vZI+w8DWVe;Uzm=?3Ta61v+-=Obc?Dz7hv6alqmVLR+Filc}g><SFj0
z(2hLDCFxC)uHtm_=8U4(uw{_gYh<(7SyL9OO?nVNmSb$WAA9g{+*(~`m(G$gEsgno
z`~g~8AyXelr*ida;$bevN9f}AvGPjJu;lO+ODV>b$LSV}&CV<IQeViIVrAvo&gNSz
z_W8t;8BUjAJ32kCIak~+;S#Vpvfb|V{i>?C-S1rtq8nKzb}4>LmIOQdIGa5oSs#|H
zLsa?J1dsm@r*sRKhmm!+q@soTMJ@CPF`#(HLRS()7X}wX{=iuALLTJw_f!j5|FFo&
zEBV-7Y1gD;MS60Z?6TAIqQ3=_8*>C+oHi(=>k@dSl~tgwTM@8dj$OkXbLa9E80=Xj
zeKm$b68ch20mY6!6t0Ooo7s}>IHEv{IgVMX$#&M7B%dYv3C^Z}*m)i6TRij|jc;J+
z^E!QFNo%*!DKg5lZ7<p?g_meHW@lKM<aYJ|Ntp9K;IT?phW6y*E%FSqe{ExzCRx9>
zCHV*TLm9qLvNT}D=Ep?+8ylyBOn?Pz@^DW^7qBjq_hnh(Z*6?8LOyhJ1~BDGMz5#p
z-$JZLu^vT!cFSiF`c%|wj5dvUsZIaRCMQnAo=}f8hootg@X}bC>zDfz4js~VxB7#P
zzwg2s8v0STZdRp$DExevoZ?@YO#hHA8yHQo>OaE%%VcqnQh(xy8LMaBK+yq3xj>~~
zRX#qOc9VL8%mzDfT*h6JBC~!T>76ubkCisyAeTjYPqKan8!)YMuI&>GJ5y3hDK}X9
zfr^Tfzkw40S@Z!`4)5(x-W1WjE93lmJKYyie^vP7^JpoSYq%X%UY^XqX|<yUIzy44
zCU>aLZMzU%hyr>q3H$CXP@f{p3m8q)m0QQr1(`BRxF=1YyIIRC4s6{dUu;GFEuF-u
zw~UjE($sMp&M&^(i|u6cVhbN>rR@?r<-9?mwd2*F^ZYA(#V?|{-)Xa*y;-VcsiiAf
z!rpXxJeys?Y&OR7v!yE--8J5d;}@HI$$fG34$bO#u6vk77qd|~GT~u1@hiQlUg<X`
z{e+5~)9A_k56qM~l=li1)G!-Q&|ln~Ab)wIa_fyhBm}Fmtcd+{OL4%tw!X4zF;34;
zJYi;0r7!qY@B8ZOLBsI`U-2|tg;lYvxx6p^=)%FZNL_t>i8%a>RY{2B@NiA-%DU>>
z%BI%li6gchmB)>%88@ze+_;(LmH4ilRa7}^T0UE~F3%hOa<g*jP5Ld=%oJAEP+4^H
zv+I<F7t+&QKY^{05QSqey=-tp)oPr1uJO93PQ_6a->w@_+-#@jr-WYF$P{~@;l(bJ
zUe+vb>_pFJ&)>iv9b`SBH-EpM-oVe0au_u#%A>udgBbBVk+0uP>zo6RCMYN+E79G*
zLf=ap*m9?WUxOZ&l?U@_zREsS_|ta2WR=HXR>QXE@vmiCrOMY`N?)iL^Pu|llonF3
zp{}tIgM5+$=+pnGNewpImBQz3q}{n}SB>REhkCy+K|xclNMrp{DA5_JUZ!52;0ax@
zF%YT@*;4qmm`t`w<;W;`Y>kF9rIaFi?eY_OXb@P~$_7cOBj}e@I|dzn$=ULyN{jQ@
zO<VPe7Ku-=Ico;n9S)1McWru0ejd}EH1lTKKh>@>euGu|$}Od3^S|lxH(uMnag^+%
z1m2tM|HiouqHi0qK1eSn>i@=gdlUf@-Nj>ZoZ?csLX*C;<oS!gvt$I(P`#)7LzO<&
zA)Rq0-Hx?&lDFHui)$}q(jPhUGpNRH0>K^Hn@yOh-^y}pd=F>p-!X}Q<3=lCb1;^f
zu6&1+s$Si}47VICf!&!*(+Ofsnz@qx)Lgbl2v>a;tqAxhNJ-WqIdXvw>zuTXQD2zj
zjGe&<*CRY*S1>N##>KANt@)NqcJk91d&njYVyM@ox3=4ca(;I*%Og7|D@hv4nVO0n
z?Hnh90Ev8<m3^F^IFzUOu-SVM&bRg|#9kM5<j;!FS6SKGT-DejPT=@TRy0;MpYEcW
zm63_lCXccv%&{JJ%`B-LH|~U)h>l?6vPvAxZ>+AXuWYSa3}R)tuI7O|3R+L^ONacv
zs`^z`r?<eq6}6R~?ThNFt6FQvZMEi%-)gPW|D<!}e9d|C?_T|D>r|bw3-qzJ63c8G
zcgVTJeR-E#Ru09<E8ZT|tEJcE2-n}Y&O=_lz<Rk|`p`x!9WnBr4ZRsf)3MONrrGGw
zRg7u0JDrb_s70e+^BF44VP18ZJ;^tOEW_=Xo>xNXg7(u#+L;_CeYH)JUe|I;yrfEf
zHSKIW-EWch&#^<1K^e{8y66S$u(Sz#{%xoE`RE?={Z^5r*ldY@@ikaS)-0Peez7K(
zS@|bc**PGi_)EJ7uM@8t^w2UF-)5%)ZX7>$G~0Q&FD<txhKcSjLC+_5V8~yJ#dt9(
zMu*YQvbl<H#&J#357-ns*=oJn%BC@CIOAWDESLvuiPsTnI$cNlRTxCn+6jHPa&ixS
zlG#tP$xkx<B%6cZnxW6NiPqOX58XDFh{o@`zOF!+>3_u%8pdQZ7`11JDWA`+_NM~4
zHNoc_*U;2@`s7s4xf55Sf?C<n_HF0@E_*Of>)O!!63;zj7NN(Di%UW%vK=0lDGfVC
zArucW3*Xkh2kNz+w)VL=Fm))Ut^JS=eQ{g+1y1mRtHIw}($?N&A=(GL0JtUG)*jYy
z`BQ0IyZGta&Y^AXWx&0|AO}1=ysdpFkVe8@^!dG`;12LG@Br{Yd0RWiN3?cyTl+NN
zPT&&Y-Z5?MJAnJfwza<?=;PYjj{*->w6*tx|C|$G9{%wHKL<_&VzV|i0lmPrz%XzN
za2jwYa0zfPa4m2ja0l=Ja4&Gn#J2W>z_pWLj|dmg3)~3|1NQ=_0S^K90iBZ(KR}v-
za3lQtfJ=bWrna>|3_Lsy{uAMz2|m(cZ&O=4j#-j(Il>DpYDWAaev4Kjeh|MaSHXSY
zw$%tHaKRe5?;^SmSO(m9Zd?08;J9`0Kk)E+m;<ic0RI3_+6eyue+7!(mwo}v1FL{#
z!25s;fyo!bJTS5e=7HOR`+z500`tI{OJN?fUVmN&^T5U1U>>*wxDfd3l`sz+b`8t}
zYk~WKmjVv~6Rw5%B$xwwf!_klfR-Cz9(e0bFb`C3hI!xt;6C8hx4=9wZ#T>*WAg>j
z3)~AV13GVodEk8DT3{1!C-76?KHz{oFb|vz#MwPs2lN7W0Ly@P0~Z2c0j>o;cN@$D
zKL+jtzJEK+1Ahl%OFjK~2h0QAcfvf-b{EV8^MGrC<-nc5(A_W(TnIb_d<rO*1`Ytd
zzy*JSdEnoH3xNao!aQ&(a3}CP;6C8Adte@T8&C`}Uj=%B^X`Lr;1=LQ;Dq~O9%y|4
z=7DcLgz^C#{4mM^a4HZ}Aao(HAMnpdkZ*yvKZf!M9P&h4`xfBPeaQE~&A<b|Lr)>!
z0Y7>M`4bJgZ9noo@Fn0};4jak`~xSwfN}$T3-~aQ{SEmRI1YFecp)$it=#MbFb{kO
zI2SnYb(jbK0Nepw@dnHTuLd3j9swQ&Rvm<SjO50?1M|S|fOCPLybJR{`8}8iJ^*|e
zxa@tH2et!`0x$Xi=CPn)IRx{-1;Dw$`+%!~PXTuT8~zRRz|4<f9@zK^%mbaD!aQ~%
zoevBHKm82mfxmtZ^T4%Vz&vovmq<6@fv?)y4+G1-hP)T)2%H8K53;lIurp>5TkT|-
z2^MV&v*70|Vq}Q2?`vzv05<+~FE3VmqVi;co2aA6;jD0^P15bFw6!!UW7vQ|J|+vJ
z?u`Y8QHKh#*dyFJ1#c1P{RMHX!?~VMw>h-Q_A&UAuW)w_-ZJn}1mcgNZ^b*Ii+zHA
zCte}n(H%j57H?Pg1^s=z!hH^mJ%WA&Z@+Hn323mqUFaen@<C4n{Rv^oSVt3IhG2}d
zFLSIO>sZSsJ64xFng)S21#){K_o<K@@7Til76?v+;0`v{u@jOzSUJdoE&Q<tswT8K
zXiDObpl=0zNjLO6L0{N~F5>xF(C30)AZ#fY@x0LZSF+Fw`#uIgT-4T%CLsO<q51>#
zBGCUXsG}XuO}yNZwm}-}@UEA~JBoO%&EW+FHjWm-Fw6?~Fr23}xHni(4EL7A-4pRJ
z3;cfIe<Aqe5pcc(@lcL<*n@c3%T556gDlu0e$Rva=aB!aAdDCOpAY%*b|brtkGd=D
zy%TcN;NO2n<q*G{%!D{Teuez|LLP1TB|;wl+#tfYUYh7AI>%-NtK6ZDjaqM%4I#$P
zm^Q^IITq<~zcSoiW0W6+2|VG~EwB%xPm+h1j23ZrDIe`fyF?o8@NSkzJNj)>#yW~N
zsO66EddoxvQF@En9N}_DQMscZWFQMOh|Ni+v&Ow$=fS?Y2v>g*BjX+W7=ObI*9(ZR
z1BkB|5MTR@_!8l|U&x~zv#31c_jV(`x{F^S|1spl{<ikgsQdxOKZX1lq{l(ne+c#;
zg#8EN_G8|LCItRF`-j8+v5-$2k8+0i7ICs!#K|V9+~M7TBMJT1D-#`Me6!8b4-}+8
zGzKc7G4M59JrCwTpU~ERJL=MC`Y(t7CfMix@B9CY7?_1njJ40kx7;r3Biw95durO+
zXG4uU3Y@&mp^1DX%E4jKGm$6cv!ms!o4m9Txua87Do0kD0`o7!-km5H;vS|BWpi9N
zWplD)8ctw#S2D*$^Q9>3S?Fswq1-$Jb%*+6io?tAaHOF@6ZHpJxO4%x{Is=W%a7si
zB_jWCmf-Ftk^eU+6^^3y>Qu)x=_1qyqAEZ^h*t%|_o!3xZh}2~gBYs_0!m58`uNOf
zjfJXXGLGxTD&@&Wd_D_v{X&R0Va|Mv`VaIX(Er<dA^~FvC&pa=tvt>Y(qYK|xAk#7
z<ac5W_utk-*Ft`8_`j?d5%(`c-mA2=&qF?nwo6kSY2!syqwXzQ>~KyJm5+Cvp*`vU
zdF@F?GL1JJo!?oF)Q!?lHtNH;8m&}Z{zTDI2oF#Hzi+mqJss6>cdF4=i}vh@C@+iJ
z+W$a&$=iD<Z)m@{G#L4|Q;|dCyw|aZqc!Jf&C#yz6m>tIoAJ)V*z>5`*1i>eRFRL(
zHWDduTpKytu{Tlr!0vcB5$#^$3BYne`I>d5Sb4XlwcIY9&%4Tgcbw_1G0KUM9Ss+Z
ze$gNB=ixqPzX`>80b)j!Ls1ScsBLQ(6BzOv(ReY-A_}hP8%<l;v1>FLC0O)}rU0*u
zw6$Xz)vPbZB&t#dLl?s6JJf@}Z8649X8zoO-RSDL^^)|2Bt(oy*s~RKp`{pu35wBg
zf!vu<5y8J7{Mq1tiFPy2mp+k&gy6pceom9wUyYgPe>E%+{3GB$4*ngT{JGokr{E_g
zp#8^O!Q~=NW9FsJJZ66o{6|_*-+>+H^8<1FXMw+LMO*t#o%q+q`B>7Y@4+|v{W1He
z8*Ib>TfzSf{286hPmP<uAN+L8Vf4g*1N_P0FY07|LEQWi@Hc}$y<<8^Q;b9~!jCzh
zCr)o`_jNR1f6RYD@LvW0p-%C4Z#@2Hfgd>&;p@bA#rdt^pR%T{9ZP=4;!oNdkKe7}
zzj=0B`)Qs0R~z@={ov=U>)C&AfL{cDRVVjPiMxLU{N?N0+DCWFKO^G#Ckb<1S8O!L
ziLvyl?GS$Oulo=DS>WFa{#hO4kFSo$e=GR%1#RsocjC{9^S6RO3jB?o`0L{Q`@!E0
zetM_)ONz(e8{pq?5#}m7#osmY_&WlA&LwT_m7U^eems7}kF8I|oYae*+<z|aeh_?Z
zOV9jS;NJ`W1)a>VkDG4=KjX5V^XFFZPXT{WNArsf<3{<oAN(cYFYL(YC&%r71N=SU
zPwK=U7v~=V|26P?Dt}3s3w#0mm7UBlkDCvI-|O<WcG35Vm;ZB)`EM5ZqruPZ$Uo&6
zzZLv3;FomdHy`6~1%E#HJ^Al`@TdI;{u|&|fG?V#xc}xKv;PSA<G?@Kk<WiL1dQ^J
zInC@VdNv;fKMVYx%+CV<$bT^33jUA(!TeV6U)tKU`}c$YBKSSI{|5LsZNr>W$MDVU
z5PtBl2EV8HNy41$Rp1}#Xr6x;kDnm;4Oh0c5A4W~9J7BG_^ZLM>d5EO3?SmS75u+~
zk8F4>eI^|<zZLvF;BW58=NH7?zaRYM?QQMLJMp6xw6OmT@FU<a=)|8JH-7~DUEr5>
z<acOalQ5_J4EX1D<nw3-7xrVdhWhU4IefFgUjqKTPUdID-ERf|M(|_vHAX+6szdm}
zzZLw8I@M3-$Lpv2!FOHVbNP7#{C?m++{yj>;_e>-e;@cyb>cr}@bf_vKL)MB-1;jW
z>sQfFdB&(>g(;&R8V))2ddyV{Lad$O(TpbI=@jrsg5Og<TLb<G@P~IazbYCJ!XMXy
zKM(xYuKqZ!vp=4N+$}rX+VAQVUwh*5^)dMVo7&pX@5Eml=gXMG-@dD@9UG&Lg+o%~
z?MXiP>uzalpV`r${DgRXjRpS!!S86lh_8u;6T*}cU-ghXw7U)etOFqyUs7A#ADh5`
zW=~st-%k9zxIeHEOfO=spu2Pv@%=LROR<J9sG~omzH$4%27eXS7P<?EFrR=5_jd3%
zK^^xW-x$v~gTPP08bnX!ehT=x;P=!n)Pq0nKkzq!UkAPz$H(25o>zqjg#YdYKl%Qi
z%jL`9pAP=oPVskUJpR50|D6ZX@9Gr3t?}?BU@c|FLv8KrJMqts^9O<d0Qh}6@)sIx
z!+%r2e+2yb9r=8A+<ZOwN5F6B)Gt{Q@0V->zx>g*_K!N5|7YC%o#4w@Q@NuPe|Mb!
zGWeH*|8pn)_i_H$;MYFhb9qcagHZ$iJ)O+o88<%&{GY&oq!a&vIDZQGmp*~Do=*IW
z<NSK?+rhuM6MsXTzX|-)o<zT~Q~Kd%{aE^g|2)===5#V29XJ^IAN)W61OIFA15dZL
zqx*L(e@!%;H_|U59r+jhJ3I3Et#SVi0{<@X@9yNk+v5J40{*b)y3_;G3Gwn*4}RnR
zp2tO-z&{NBican~#ofOX{Bh6s+^>Hb{0%Rd^I!4$1zSkW`1>0C)3H{zrjz~A3Rl#(
z31UqSe2ZCstmh|+$@^Wl=uDSg7!!TcAmm<nv8{bryq>|*5URUG%o2z>v2uI0fkIx?
z1vQXA_)=SYtsoqm4>&e2H$lvqPY`qFQypc^-Ob7w<MlgX550W+dd{<;ABFwKI+Jk2
zeA?UD$O4r4$C$ERr5m{N50#$h%C#!pA}LR+bdfCRU*l*r-L44o<EkLPtO~))Eg-{&
z<Q(9IO!*OKdYJMp4g|0ggApG6F3RzHvhs=rN1K!vEOZZ!R?|0%a-D^aD#{KEJ*|SU
z-lF_wp-U{vQ42j|3BhmeQh%1!G70<L|HiQ0NBWg1P~XOt`?&HL$D{<}^{jJZq4*#E
zF9!Y>1OJPG|HZ)nmt$bs31x=&rVTK0Y7~o1y1~RpG@0V*iYJObMF%?e+ji1lgO)@*
zF>`lx(Z68o6QW|Yuy>iE!c?E}Op(eBL>It#_Ka9$U~J6@T_fWWO{{qKnS9Z&5|24O
zO+U4@FBAG5yi5R@&eAkTR4=;j;yKF81Q1#?;}L5`;xRfAz?gd&l5~1R*VuT<qW&|Z
zfi%-14CRamOIyZ+F0Jv*O)oPrM&Fwj)yMq3$KW3OXJX9#w)SXv)}}_~@yl)_e6esG
zHualQqI}bu|JUGvnGRhYywXhH-6lS0;(im~GVxOre>PFgkA@?~!~zqGO{_3+j)@Ty
zSDLuN#4AnQZQ_F_?l<u*6F)WaXA{*vru!xqm{@FLg^6=ajF`C6#0@51Y2t1ZA2e~l
ziEo+ssfj<Es1}&+n^<6Cv56HX&M`4!;z|=Yn0Td$yG?x1#Qi3|W#XqM{%oS!*L2^+
z0uzf(tT1toi4hZ5nz+HlD^1*O;)5pcH}NeKKQ-}Z6V-mE`z98cSZrd2iE~Vhn7Go!
z4JKY`;%*ZkG;zO)Z<+Y1i9egD_BY)(vB1P)6Dv%dV`9X_l_qX5@k$eSoA{uK`%Qey
z#7|B9*+g}K>Ar~tCKj7W$34uT935P4;P^3PhI;$WUbLdIb%i%HI5@bduy}=`^PTN0
z9$XY0?DO}JGH~-4?m#OpDm4a8(TYqnESW}8%Z!26aSlcYSvVzZiB-D`v2$BI#(=9E
z`MHpk4Z6ngUo1+%2>=qK$nG5COuG_noYK1mq8xXx37uwnKmb#B7(8V(OiQwY-|~T{
zWPm7t1aW!&sbI(-3H^CO34`+iil%B82`y7^5EjTX{wxHh>;(~jAA!U{VWm6}AGPQK
zd@X(PIcWJFM>I(@2tS+z3TeL!<}c?7rFKxLj|#O@)6q6qeiiC8?Hkx``Aw+3+O0z7
zccJd5Jps!se+YGv_Ozh?Db!&tLD1WTy3869_7D@A3D&O!gBhA>){rp44b5EZt3o5Q
z(;%?WDz=j04>e7ICDxaPMq|RaO*qvDO@g6mwTjls;$XFa)z%W>M>h*Xv({Q5#5~wZ
zkEulKYl4%+L>s@}IzVXB*am1Wu!?5MlFOO^TdeO3PJgxznjO{)g`)!u%}(nUp&4js
z_E@(I%^*Xw*LuCs3}!dL%){b#K(Y)m{I<_3cKca;Y!d{YB^%D_ljTaw^B|^Kt`&jQ
z)(E552_<O_Lil<STumDz)HjLHI!V<^K~s(NvWo;4-p2nD#G`5=DGlniF{%gDo1jTF
zGKZ$%z@f-8djXD(_((azUg-mf=#`Z%&<myWVSJ>V5Ak)8TO^cQ*yvb+55HXKB?tWP
zEE2X|ZkV{)Pzn>|SSQ5l4Kb0{QL*}CV)aq63BvtK!~F$9>~wsj%4(rB+{cd@h5M>@
zyYPSz@f?PVB?=>5OBTEKSp_ray<sOy6i$mLxd9*M6k1UPQX;rk!6hu(23=~AV7i4d
z>9Rv-<B}w?%di~8=!2_(Wvp}e2U%>eN$&+kDy+{q(-itz#a3cAo3Fxmmh>Z=oyn$i
z4UVOK29L+2%3zkI;Kxb$l_6}H3LDaY0e=M)l=c8r*O>N+OvlnM5RwN#78ODIb5MwL
zrx_2JLRM5Y!qFDtDDxRkzL3dgz!#Y07SXMll@%sljECjECajzW*7vX}dmI!Qy-cAr
z!@8(NR_Sn2xoOC-skbQ<>!Y&5JVcgZQ#Sk+wP|FxHceNfHVL1#z|LN=K=y`%Lt%>u
zgZ2XTdfE)v5YQ`4x)C<3^mWWPIj@>R!ywe_{SGEB>&e6y9Zk$Iy;uR`51J;d?UZwp
zNuL3FwA{P_u2GuSllON7QQaeopm!ajOufxeIuM2G?LwI=OlhS^Z?Z~d;}!uy6d!H8
z$e~tgc#Nn(T$>Ul_KOj3hfP*{lsG6xd;^l!6j5barTiGNMM$PyW)OoW(J1EYkpm6y
zo=CaXf}J6L>ur_x#Y}!8h?!C1LX()+Skpk(?5M-um}EABXf-MWt5g&tJ|>LkMkT{0
zv3ETr7vZ&$P~|6FZyU0xwxzYQwV{c6&xcp1;kDsPY20&z6J8`<X?B5F^boK8HC7-d
zfW@miS)WTGCtmfNf+Ya)+H-hcu{XrnA0mPv6OQ>JV;gPjR&Z8;*tJnU9VYp8U^4ZM
zZrJQvL6w$*{T>t(VWHG$YDB9gS2Z_7`yqI2x>83>{Me5~Nmc8>Fm!B|a+;|ugG2aB
zLX`d&p=qc??P7@iDzsHNaV}h(hQepLS$NCPwn9r<w0YK|jlNYAa;TYDABV>-IWir%
z7t!KOMR4-TZXXE}{Aw!Efss>C{SLJcvrqKE9UUU@qX3~t8z(vJcEM)QYKd`Kas7bR
z)G8A_LTiiJgg9qxqDPoW$fb#%Az&sJK!J1`%tob0-Rqk^(c`rT(}kF>kfg;t;e;mc
zBbR7H-6e)$PiMm-TxDt7M;kUI9Wx{@VMz|g?GRd&ftdY5o7#a3`_r<S&ohI~PMvF?
zXM{5yC%rvFFyq2?3^D107c(zfhdU?IE*lL4S;ByDK4u_W7(i_j6Bl91frg@Hy~o@e
zl|CGU_S}vHoOq<XV;&rZUuNO21A|G3HaenF(TGILFNXUl2Aw%X1)T_}bNU`5^pF$&
zLqtl)l0w7)rqkmw2gj9T{uErbgBD!Nv0w?Vb}Yl;a<FKQGFmF!)FV8`es?6|%TUzB
zOb8pYqNRCAU1N3qikjLXtHZ^W#lgb*y2cf&3l}%87_z9YwS^3k`np9!7FSmvFA|EG
zj>?%<Vf$KBZKF7>UE4Ca8g3E(jaS#WI52pLaJHqjrf!)(s%|~K2{JKqO>G2flvzUr
zN|ZaeUAHW%#NV=)HMK_7O)KhKYN5n4e9?-!`qsj_#*tB{YG5QPgc8GJJq;|c$Xq%^
zlVm3CJZG+<ZZ*`y4Yf8{DwqrXdFOl#{T~!nx5|=a70Ok(txK{Fmsy+E?>w#Fdgz8|
zZ-G&!&4qfOAwNaCURv>p<Xp#b2QaQ)K^1@>g>@70gnM;rt6QUyBYYXF4b$>xc`tm5
z_x4PjeA(Tf9K1{tr<G+^p?xAi9j<DV=Pe#K&o^}5JV6*DKEuRkvb6eoY3cAseNxt3
zp_i(LYm?7|2E7)>5N&mhHAoim&vA-~<e^IZu)A_uB!b^3Q_Zp!_=h!BPCT))V&ar>
zR8!aNGjuIgD{CujVYFq*vgTH=7|)0wlvlP=1Aa!htlH3tL?n^gm5o&mwb;AZDl{}?
z$+CvpA<@y@5d47lv>`PuRX$%)Vat-*`ue!0ZZUY(Lt2`v4Hs86*R>kw`KdBeU$wY}
z8e4EE-(W3T5vg2M)dGJuR5jz~fEI(=Tw7IRc(JarX+^7%>pJAWV<NaBfFkwFst|%!
z{Ghv;>X$Vx_8K1}Sk=unEla5qfx;L6`fO<>{O7#6693U#(SYb`TGm35=%3cXH}YIv
zV<mdsQCem7lBHC;x(=pV;8DU~)oWXpbnpoh+Z3fpb^S5~V?|>N{={Be<E>s&)eJjD
z6x1P|R(J4PTt@g&q)#;Ha6}}CaHz7fsj3F4CDO7uh?Egh(O8T|){4dzEwwd%k@_Oo
zf-PcFr0Vz+W_!o@d|d|I-ol2~6^*qca4AD=b6qvGRn65)MxcTy#6N}$7kdj&^m_}9
zT-4A6MN4Z{_0mG5Vk`3WvgSgRc0{cx^wo&%%0~Rh-q19i9`&q03V|pDqYyH)Smyv&
zMjAQ>Nch=E;l^czY|@Og?ntC1Rdq-o5q>lDi{PIXP0_p}suFOGyab0EmeoY_xsh#*
z2x@Fuv1k$gC`^_3!#RpUtWb!EGV*^k8Y|7jHxe3&B?2wfm658tdN|)9N3=FoH=Q1r
zgG&t}&!JoxkuP!>JQ}Y8q6!lK2o)8W@t`>~9$W$)Qx1reX2q1T2@shud=X>TPR}e;
zx6njf7!p(7A1BQeQwQRS*~N@*UX*5jn@v31L|C-RRPH!Nidpo^F_Ni@LxHJ^`JYid
zC<9Ga_dz+{;Ov+kF@uawio0VfVyp^NuhI{{ql10i(^ak;t736kYa0F!A`9ap@#sEg
zYGzl*W3l#klVPLyFz&bnechD#_LyvJpmv;W%pKiSAS1<;xL(tEkkT=wvDF&HXza&y
z#z7NMAsAD}rqH@irl`H>U78A{Z%o<I;H=|(8_Q0Y#tqJnDb=`daa&51oe(D(TUlbR
z3OwGTSuu-tn{xPxUrc#loHQe*d?HSo8&e*0ZidM`4q~C?<hvoJ7IQV?VKgTep+k?6
zqP%0@*dRTYtu<T-w-R?v_PHTO@teXZeKF;aCh3^)F$<%ruJOS!vC;J@HVHK*GFnt;
zDpwsNtu>V~8yQu_O8N(Jsp^4d2*dG^R>hRD9Kh*VHD?+$XJ63~G6_x-Jth{I7%Qo!
zHV!fRP?Ivs#EQ5?XT@YwJKe;t#k;fg|Ah`6EDH0o<D?|?{$_Pcm#DQ{x*T5@>}Z*U
zRl=j)%B%5p!_M*++~8K$kz}YPtU|`t={hncM0;$#uQMGN?WLD>BpGVly%=8~?Z}jH
zS;pGl-N`3XSvTtg5+)I1>%E=r(J8jx+L`X8g;IG(lA)F`%b4ctj_#(1Q$}|q@1X<T
z(vzqtt2_B*633OhJ`$x+Y#q2GPomWL`fq1?8oe-LQAd)YmMERNTc?z8JI>)z-N~b}
zFxPiGS|(u^UelQF<g;mKxAa~ejmOpa+k9NzQ6Z7Hqc*M<e?I8$k#K`gcaMa-*SdR*
zU#54%ki><{T^<I#Oy9S6A&cu;P#TYqXR&V2k>B=CE$d`NRJ($HxC?!-N$<M<U@Yhi
z*A`wdHX6^uAL5ybmvxiwXMis3kL_;|vf|l*mvxhWZUMc!aD5~UiF_U^H&n5Y=n4q?
z*Pc*jkVN>!qr+pupF2!{Mi<U-u^s3_ev2s|>vkB+R^8AwZu;LWzsA{6*xwg)M#st*
z<|2$|xIxdRJG;csVw1kJ3;kS^etQ@C4JQ4zF7(Gu`kpTI_e}b&UFg4>^xa+PshC+1
z@pDTTx|o3%^qafTXPWd~UFgeA`b}Nvmzwk&yU_18>34LYKMA^#Pt1H6Yw&Pupy8jp
zy2yV7dUyHca1V4br`etSM@Fh0Uw=xN=j={C0d)KWg!be{^P$~L_dW(ai`I@=WKf_r
zo<I-w3^VQ7XWFyJl&|PP{=^>WF?)6njQS^L&-|GEg;9E}L(yW=514e9>Fy?zez*($
z7L)EY*RPXI`6o<zkx7qrRNgk}(@c7@DSz0cH<|R<J|e3Ujn6G6J+|s2R%%2%@9kp$
zFq3}3q{sX-%cOs9(jBI|xZ|<gbUdpEI&RK}yIFLBnU1k~V|x$ss7|`I|3<@}Bno$F
zH~(VLkFTGFce~4Pk3t>`hsU+oe>dzozTJG=kWV?T{T26(VOILM_WF0jp5xn7mx@Vn
zVWi75(4af0aDSP>S#I)6dXS$D`i-bh4w~gR7N6&t_MG-r)E==3Ks?((cR@b3u!C%4
zJXe|WCHIvXdNH>sp4(0NSo<Ec{|Q4rD%0`%dJlBmHi_^$XxQ^*hWL4=Au(RgMLs-~
z6fH;nOnNV@o(cKbyk)F&-^ZYvLLFcz=ptOPMKM&%#xvTqXZ<f_hCY^WPd4c<Jr$)N
zHMp1<HtCl=7p2DrTjztGC;U?xrNi6CBkp$SE*<}A+OyNNXR|5)MGx|Sn)1cF$_z=L
zDK9Pq=+1upEz*de>z<6t$NXPv(hvO+rLQpUIT3Ux%H0;T+?iyW1G)&;PBUDwG3j!X
ze(2PwJ+biq#k9ZIEm3-G<hkFZ|LuB{ZaQ$#q|bXbN;l4mf%ma#|K6%7eXuF7VT2~q
zF}4gE%bz;vB0pU7K~#Q_s2K2MfG+a?-Y)rnf+_#jBV~r%i^dl#_$K|0@1yj4OnQSs
zH#HsLX;c0Qvs@Twn!$e;U0ZKA{bBs@tK0PYr3ZSVEgG)3A1E_i-HCQoJQ*haw$I88
z{aQ0a=Noi`)aipCj3M5RjbCE%Hr}9TQ)~q}MxSchfBpWb{R>RnPVT|}wWj=*lcVzg
zH0AH>LH-}6eA-J<`KL^I+zH-o{QPdp+wYCa4>0AO=m3axA8F?2n@oBx=px@XnfW$W
z-cK>)O`#63tOxpf)Bb0kE;FQKBNN>9Y2?G$T6Zkot_NMXJI&0usNRj|b(6lOy3EjD
z5+zeII?Uab-=PNGL9-r@$`3N-$C>oKg;Bb3M+bP5da!4iDZlUPsJw9}1mw@|LH=^k
zMR_^UrMx_2+VjZ`QG1NFb=dPJ=%PHv_7%k9;eAtn?Tn~=Z2nGMVkXjS@4+bDI7<n8
z@XtRZ-qxD&RsfoK@YC;Z<0oR!9rW(|WrkcV|19r8ep?Up+dy~WQv20i>b(a|``4S{
zU1Yd|<4Y!e;fqmsA2#V9gD%p2OPBnXjs~T>^bHzx2hIL7YEKo5S~L}O(cXp4_Abq|
zf1@d1aY<C(xT6HTD?k_NMdtPjuPOhDA#Z9rz^|YSe=b~DW>7yhdH+U3Cj9B`6Q!GE
z8i<Z-cm5x3&=LR7MCD`toYRB+${y$!_dwrm+W)&*|HOQCUk~#5b20qspvAkQk`I~j
zi9Z+{^dBCJ(qrlPc@OsdVao5D6P16{v?s*{d$@zVQaOpIzwyN;CP9b0zVFHm%10*M
z2f9edX=XZ3FzK}hJ)8EJ`E9OA-)`7rYC6DEJ<#9mf&Oz3^xl1iKaZc++K#U0|7q=7
zfaEBT^B_UMPI5p%E}KVu1PcSPHamMSoiXvA?pD&FlXSj^EM2fiv%7P5D{dd&$LWp)
zM=3)k_z@sZQX*A0Bvdd|pe*umAdW$R6(Ch$Ikt;9#-IuzmDp7&Oenxq3g_#`zdQ5q
z%t??;-R;cu^#AU^yZ`>Dd*a6T0|3&zHm~iNd5r+a+}^9-$Z{>&ml9W30Zw%Ipv5;9
z6u)+@;2S=0gyEb;<mVa1zxZ2%{}FxqoWk#WRN#gW|ChpDou^ZZPw!*kQP;5na2l6e
z&y#U9bi1ALq33)(%W3{mQ|?jxqrW8hmni%<6Y{?!0sa)=as1&e#_w_d>R+VX52~P|
zS5b+s&i$sqU)6FiW4PXgPQwZCoq$uhy~>x&`tCNy?{QvGea!URT?#*-^X2!njISo-
z|G4IVyp&~3jsb|Pv$61u>;DfkyazFL1)pQ&0Dp(VC-(`Q>|%7xCgf8EoW{}7aZC%k
zb5{cX6M$1cWVIiRzKSJTY&|zHyvMmh`TUUPzbOHKo8rIuaF#I*ofH!AZ&mz9UKae<
zH2*(WxOr>D?1~;q$ma)&zftwT8#SMo0H^tD-XS#emwf1nuD+)A98-LHdxzlP`fsVv
z=M?@Pg`aUu;J>QyL4}{M_5TZnk1PDGX9VBO(^-Xo^Tz@&ua^Y#fYZ3tRPGyn=}#5^
zu^$QkX9V4OK;c=f|FXiLWVqghPR}X+p}8zWPdiN>o{x!7?X4-j8TeIz6J4b({!mr?
z+g_J^?vres-vONJKWWj`LyWITq0{rfEae_jJDbsCzpMFw`Vq<B^xI1b`TSJNwe{P6
z7<Q;WhkjS`nbopC4mh=Izqaew6#iL-AGg|jKj73q-d{>SHz@u?4A+~`>7NqdKhk`@
z_Lk&xp7#H%3HTRWK;?EHzn%b}1DxBd^kekK7tf@8y7!4b&H3P?9!iHBwWwPZKA`P2
z<8lw+G>@{XU!A4+U)6G*FJu`)vF7M_mhhoZ+Uw{SIe)<k|GtamZ;!KhUzWkX=5vw4
zd!H2eK25X<aBA<-1%a<qc~JwrS8>9p8#SMU4+%cU%mTt6Gdzqb*Y_sCzmov}p9FZ%
zMN-cbN{1I}yRv{&f9}`*oYX4p0G#OUmB+ITG4wyc{7R8Rr@Z2?YX6vVuPOZ7FGxOy
zzpX0#IUV2M(7Zm!@KDBjd57X>CnX=+x1i(E1pN13EcO4{ce9Mgv1o8L$Z!Tle)a->
z3HrfNeQB>g{Xzo%y^MbjTogtab5QYLWPI!&&1M<$xWZpoc>mo3zg_9%-O7KC&j{Si
z#|sqxQRV-)YCfA7t~a66NCJFO^LgY6$=}yB%L@PCmj(W33jZwNOb?dc_+`a^>VCoJ
zSa(Q6@z<Gee@x(~>9{<h@FAT?uI5j1+R3hE?@PYK_=*%doejdH{NMeo<Ug&?HYz-=
za?rHv#WN`Xi=A_}=0myQolxM@Jnbz?J`{7Bj+;0ixeotQe0VDNZk3yc9_~uO|69es
zL+PJmO(Da_6n^L%S<Zex|2obO0cSqx30%p+Ik#8J{hOZ%9B6!O>?L^j@nMGdIRE-s
zmLI;szj)32l6d^7URf8Mu*UZez^T1ck7PLqOmjYt^a}j@Y8T=dR(SHL!vE-K7I%2p
znt$=(>R!RWb8nU*_bB{Lg_pl5aE`T!XTO+G&tAZ39Bsb+A;t%Pw(5CxLO%Hf_}3HQ
zPiy{HsGf7X*8lqo-*-al&oPzo?1aMaTsOj)f#Sc+aJ>ng-v2Aw&tJ+i^n3cuRrrxl
z2;A_uWx$C}j^_pbkm5h3_!qn&@Xss!Ifm;^=(O%qDfjyC2>wl{$;0<4{NslO{;;OG
zoZ)&CI_**Xn_dz8ztd++fRjFRMD-R!54R}%tPRpX+ZF$*g#2GhfWMgl-|zvU=O-<C
z9tE7}`NwZcJ*Tz3GYm&upvwiW)p4F#FMRJ-&F6G|(9(QPd_(ZBQTS~NfBM@3zhCqJ
zio!Q3oy$|ixl;HO&kJN)^Z5tBX`O!5s{d<>|J2U}|F;yMUVVt;1Lrfm$N7ZelUkom
zfD=DIYRUJZgnV`bPV{!dmvaA8%Y8xN`xSnM!q0=CkE_pR4DWG*TO=Qj(Tg-L;JunQ
zeA=0iPoVky_0cS&8~%0!;Box(frNaHCBWauc9<ULob@A||2nPI6%5y#&}lpYey!#+
z_iC20Z_tbmB;bD<aH{{ypA`Hnw2UV-pZ&W9{=C9}n2^s~fYZGD54DFtBjMxH%Z2`*
zeJ#uPdj#wZGhA;%r`ZJfqULkA@)@(<{Ott%KLDKSbJ()89ZSIfw*>f^=n(23=jJTe
zl45GpaWmk(d=>pU40s$JKFaVO=R%biLz?W_gnWLIkk1)cN`G!Rk>wo5G@o-Bt~a66
zM#bOsdX}N4|F2T`l@ANt$k~5?Z+!iyfKPg9P5A@R_-JXl=_@541v`JD`ER&g;2e`0
z5ARd>(P@F7ru@Mv)SHcFYknTy2%dM%=#)1xK0V`k4m=`)g%UnASBEE{2mgdh0QU;7
zSoIc`s(F9OE8@-mhUd3doI<r+TMC*%F*P(WOmFYUAmJtbO38yyp1%r5gJylznXkh)
z#4EPSW%2?D-Fd)lMo{63xH8}L(m48q!2!OYn@xirJV^F>dAv|hpUy*FYks3qt)vRA
zqCX$FsY0#h%=^uJsWRp_eaD-d$l$Ay^mMjj+P9@d-`ksQmX?+oAuR}7{pM&33Ae)&
zfLvkbc1<^j@MvYlb=?eH07?bC^G|L}<Av#_>ypm~eZ>z>Qsaf(w2z;hoAGwSU!(%O
zR-rjQ3DEM4hi~~!H#2F}_@G}edg8ZGMPh#;Rch4zRA2i4r??$j-sY2(hmVI-&-9ZM
zLkS)%+(A=1{3A=!FYW+F$#0JF_cdq8g#%qO2(Ok_qdC}E1*Mb+i$N2<4{o{<e5!>}
zF8NCneYnnrP&ZTOqk#NGU(K(FpSYRYl3xfG;f4fPo{5a-uK6^x=F`44pZ2f$bYRV=
zgKIt=TJ!1VHJ=Wz@w5-0HVr>E{km;p_m+_f3|uSkjeBFcEwkIu+0)I5&7N1P(oC{E
zj*Oof-1@FdVdm$ZQmp{@AD;h{Q@NR$ecsmDU86JOyLWkBsz1GX*tsrP#iT6MgE9t{
z@Zo@0aK<M`;Se*vYkbCOmRp!UfE9fV?qac(lw1Gs5GH#}F#~DPVC(~zT$$ohxPu7~
zGMGNGDKf;I;<D-^52V-!NoDjWmC>J6#(*om<IfW-Y_6h<{6?Wv@)qH*7SwTja1pGa
zQNtXdKGq)fysc9sJ9A!c*BH36uxN7)Z!IskT@kjAO*!7!zFi|b$47w|VWMzl3IpJ(
zRLAsH+S$Ez>vV3$n;F?Mk)ski?rYiXwu$jAqh2P}m+F^s_r%N3^WZs#50S!k$#0gb
zl@4YmMa;zJ4DO)%j(NZj7OM5t+;VOZ4sop|I2$U_u#a86y6TWeyHKUqglC{fu373Z
z6)lK7${IDRbO7PqSPr77Otl_p80C8ht9AJF&CTqw7?_(8Mn<ixf;p)tmb&R#8LGP(
z*B)}B=#hsXTo0R7u>3lYA-UDTN!4pqi{u<u;)ZC;2wr4nO5AZrl#{l(q+zMU%r?1N
z?Jn#m@5&s67TC6NFzeK@Gi1yK&@1@sJf?Bdnv@#i9$0t8p*9wYs#}?FVKk%_VcL~8
z!VT2%f}+UTUEz<L@;9C3^QJLvb^D{u%JjkQ*9T$9D3JUtmWar5B~#<;gL;+S2&Li%
ztPO!kCP=BOPrKw)bePA|=tK2|lw^${AoAF@r;CiyDw1B^G^}08&Z2F4X2(A9ylm70
zC|K`AhZu$m;n(1xTg*-FD3o<@_U`pCz^ROD+D>kuP}2gx9uys_<+ea&bv6tCXM)t5
zsYW&Bx~VjIUvj6%4mGAFT!705ikVik(1fdEP{!y-QP~%gJSjc$B~>p~>AhlLcmx${
z)QCMkh_AaM-W<M1t@8k63@PT#4KW7_>XT5_7X77pd<8so$&K5|MZ{&OBPbjW+L{mT
zZ~@iCm=zYY+0ngwN4AW6ZmQ3891}Lw8R0S!vMvVj<Fskul&0ob1~i3!tJb33hJIfP
zEPkb=L-h!lPr$?3IH1~vD?7+!3_FcZ9VX>&ZemioVeZzZ;-PZTXyAkETLxIS#Vn(7
zn+ApW<uQ0YgCi$&G_g0ZHn@JXzb$n(j|;KbV5Ja%4U=f-?sIacCPt4ow@Srm7sbxI
zwZw<^Ur=FRU8;||T6=SDy0bJ0kq`)qXlkDM3D^XCUwiq^bnPB>i)FaPcA;ivuvkE8
z)q@^}b)mT^EDQ5g+a%Lexe>`S?%N3;U#wr;!E7<EYs*0yGtlZ5EOn}w4!jr%kqp}F
z;cM_13N59u7(mnzzW^WP;prk_Lk~$*M8pfBlB5Vk<_M83M*u)DCP>tAOQN3AY{^m8
z8>^K92@>SeB@Rzg;Vuf!xo)d+U8Q=kLIVOv@)!ozo?7iH_LJCl`#eMvs02$MrbMMV
zz6+G%Vr7KI3TPW(EZf|{Nxj=zt&eojVm{;E`K~*E&0RSfzS-`kMfiINLoj}_ue;M)
zs=3-hi+<(329__MdC%pRa=Gbk9)6c+(%y^<eeRveoH5XMgn~)XEH^4b5mCYn(IS@A
zq{@<=fOx?|i<Y-_tT+&(A;Jd0halDGqZxS~*6u+a5gV|Ej6OmcG)<8`{QC>a`;BfI
z(qxDjDY`&ZEdW<_%GY(Pgy0VuR+A3~rG>?2W3n(wvS!+IPz(ibXbbXUiRxQHGy|e>
zTWdTCb<(m8X<V`^tgQI?(z5H4Az=y4^(a&ZRP*_2F4957TioF`Z>cW8*ubp5i?uWw
z1;4u|sr%>aK@uB|uJB>$5F1FY0$$uLqb1-*WY&{{5ToTMsz=^jKNh4)@hav|#++jr
zHi@R4K^fr`;%X{%0yU5WnuI7*6mTQ*F`R8#0YTKPc8Q=aK4!)^UW76OBWc*Gum*?$
zpSBpGt>xy7l9r>T#x7{`aSfv8j`1R4ThJV7!rYW^HDfW~Hj^h2CjDivN@Nz3-d#~;
z9|DPc+)O#QpLlAaX-(5^&BJcBn}%hmwCqD#Hgi)un7O7^he05Yj)t$&oTjc0V-5V+
zsWEPpIdqr0_F%2GzDSrDaYLikVx@t5(3=}*ksc)rvbsk6=+u=KWHE$cDw`mTg-%2p
z8)XI<SDEBKNXXW#9ciL0RTnz7RBj4(%aWb+XwolA<yKkQx-iGC6lSeUeF54*KvyLm
z>C=wAwasqg8q#LDGLu9p??jI>lSbymWF1fH%~H^a%L+4F7cd&_D8vssmFzHi(E^uC
z!9hcE)B}tOP2R5|zD^(w>!j(7zDy;F<I`?LM`&2wpsjQ0=(}<2d)DSVdGX+ft0pT;
zSRMj|Lz*itO-5xlV0~RocQc~$lo6Mw4q<1OSFRViLAp#=x&glk$v!zhxi}5QnSP0u
zZEf$Qv>Xv4u!E7#XxlzaCh@G+5H1W_jcQ?d*s}%TP`HcOVxi%cq7;jA-+{@66eFr~
zG^uOLJ?a}<0Ic0CRub&)j4nFbLWb87m{qJh<jDrtX&S{)f_8-|P2q+*a5mo=;~lAl
zZeJZ!6lOBfD`>&0*t%vR(F36~;WwHrqT;3)S)RgCkHY13_fo{T#w+ADaYLVg+PxYX
zI<qDxQBC$4id5E3Ezv&eKe*^OdG#{0b5w1n+7NC8_$XFIaZOR?5N10vr6amu60Ih1
z*{+3&<6X1t8I#B;X@J;O)1t9|MA|Z`Rvb#J)0u2jjcm$n#R5_m*WKz`M`xJEk_wB{
z3M_MRO(q4=Dx^)>Z3gA`%lfv;ODPydnNx9RLKk&HG1jyhhG}9mj@Tcy4#LE&_=zbK
z)tN@KeU}*-nM-0c3O56Nzu*}aeSRAHd5CinMRXDKXohu*x4m7Li_8EuuA%ZQe<J1q
zUf<9{1R7tk0^1&TKP?5`Qu)bJ&?ucm>o+`YxtpoQaFq5KttCW=v0$zT)jIgM*f=IJ
zY`o{gvEwLw+7e87u)A?j=WAF6d8Lkw<JPlSln{SWk|AoFguQfRZB8LnVC10|uaixO
zpM&s4Ggmw!DPGpGFjnJzSR$GBK=UV*pbjobhG=3>w9^3@(?ul8Dk4L!#M&`0^PrpD
ziAh1`u*@fWOE)t*r4i|73ecr}%iJUV)j)hpveh=LmZmL_X=F_0sE<S;$gjfuwlYth
z(4NlG4cbh+Xl3EnX35-485#|AmpC?JwlwM<+s&>03?uwb!Y6Yj%W@MY0CYL?N=!)V
z7qQxfn~%7qFn^=f?q+^1pqO8%wSX4*l~!$XvuT-F|4hR0ju054Z!ZPCJXYyKMxn}3
zU9PA8oo;R}RT~^xuzwuedZV`GM3CP0ivwv`{p$g5i08`XSi}E(P_-;59L>^eECm5m
zO$N4m&m#fh6&fwx03;QjCTy#*I127H3L{Kup@p!CY^d#)CEDZ=Enp>&##k%)wRtr0
zlP;!h5y&acoV8YNG9$1YC}3&n(MDBttuRU}k~k?S3Bx8}qJlPg7_*P`TeWsd-DXHl
z?E&q5s4k^2s=cVCD5_kZQqLeIizHZmV|F^wAd1ZzoxgaSBgw!UW7!$dt%I1YuZT_$
zHiW7ytgOHSUB{a^{t^tA-f}u3977XHSe7gonP<sxgd}ccG;jk8py2RMIqdH9#NcE0
z>ei?-GVpkJ#vxr~!ECBwqr0>GZPg$_g2=8wC#0?PHfu(amUTF8d)WT3R%NWxNJ0<I
zlgd}E!yp~C9qTGCmn(|(cf_*(Mybg#;sgmmTcLRpe^FZL3>X5I&g{hWH!n{?&8Anw
zW>`u0W-AA2BYX<``n>->I*MJ&w3$mR#+gUAMTNDHI!Wz#Q;mbjMytns`0cV-+E*3k
zpPMQh3g~RBh8QQkuc7lxOxk_{Myxt?=g`hp<5Tsf#g5xtfaNsAc(G+|2cqH@A6(O5
zR#J=E09(OGz1EF!On{h8Q)YnEYi$F^EaB}6Aog?n^^Cj+D*#p~v}hqQvXvJQ-BJVk
zOfgoUAqr?~P*77G#Ow}2t5ja-BpcY{iI#XD3ptk3&}mE71}jxnY{nzB$rL99V&*Mg
zM+<HF@IFu<-q6dD&GK!|$>k|FbPfh90ln6PrBtoDR4S~F&v?0<*{AUq%6K4c@T7Y&
z<F|#(zOh!W&eRlBm70TSmd4J}`I!9zT9naNKdsB=nIlK%SDOMB!xRuIImE?5iZkO~
z%C_dGrUjm_QN5{a$H_2_$nrbhdgVB&#%dX%nsJ7%%;!aOg)rGcz2>AUFnXo1^}h*Y
zsCS5IpQ0em#ZCCsLyGPeeMIO^6;~_BPtJ%D&QF#h%`kA08xPO0yX8{?y{h4LrWEZd
zIVoJG7OJ?wUJ`zpiKOb-;KzHYsme0FONqCJ=u!-#(~v_@U7S>~sBgnA7Eyq?L&*i~
z0r9Rn-V`O92p1$T%1Zv``al5*Q7OM%Dj;38NtFSFi-tbyr0~kF6jluKtp%j>D+`#^
zDY>QnD0A)S^L3cC^$Fe@M7e^69yXUyF3hFoWZp-9r{@cikX}(X@8fS?m&F}EJ*9w)
z_^$I~US~FtOL1`~{!RLsby?gwh`-9Gu6M{!7jXO6#Z@?cb6u97H8nllGa`qbp570C
z4c|I6<sV*`W&FbmxR&qmIR`zTrDt>6g^QoZze)ebx-9PKJCJ;G_R>QfKZk#6yGd`}
z52yFd>6Lkt-h5AT60g+LVZI+Jcan#>rt>H9Pv34d=}r9D&+7~8uPCyqzri!->+qbu
z1!?Y0oY_P0|E7aJHAM&2pZ-n#2}Z~D_@_QK=}kP^Big`w2nmNN-=sI~zY)(U{T4#P
zVdB>Ag`YDW=KGhFo@mWZ|NDSZ6{&7??7uXtXFeekYxJP9!pF_H38z1#=?@WM;y7wD
zmRopC$YIWZh}&>_6E8Q5_xR~B^_9Es`!C@^IK7FhJE`e^`p+h%oK3j~dY6^n#NVA*
zPq}siGxH68?DpRa7-5?6+rNH<QTJ>52RHF;`=_^kB^M9i6i#pA`M%ec^6z{pn!xUl
zzqitxxW5m+M`~o~#N=Y9f7(iK;sYPo^w%1~(+uqP|G-Lb;s~cNlJb9nh%R*4<x}&*
z?Kkm;KQbigkA>3GrCt8(fYDSZ9zchQOMK7Or`CQNrn8aGq&MHMJ*w&L`r7wrBR$pM
zq&IPnpFG9%6o{Uth)HkaAs^TDz0!>K{50hoIzEiF;qpz~<d;7x2@M|5_%P|s`C_a7
z?fA-BNnbM+OGtm2mEQO}*G46Muck96Q?9xH5YiLO6l3B!-m2;8i)G=XH++L@nkM*j
z(iYsLAF$wrONTjAWO{QgPvrNQmcKs?#I=n(o{)aym{dGxHP}u+n~?s%grqnA^QIbh
z`a(kb^}8kg=~n;Q>8qOF?!PlXCJFB~1EUo%>CO3iO>gRF&PP-MdEJ_@CY`|}|6ChJ
z&%Uxx%D>Yf>e<FU7jUbJjQPF$CH)tz1lBE=ZFLc+-##bMCoh+R>2v`9W;hLBeHUPH
S<)8BzNq<b`kmS<c*8c@<49|W5

literal 0
HcmV?d00001

diff --git a/legacy/dsaX_beamformer_passon.cu b/legacy/dsaX_beamformer_passon.cu
new file mode 100644
index 0000000..818c28a
--- /dev/null
+++ b/legacy/dsaX_beamformer_passon.cu
@@ -0,0 +1,1057 @@
+// -*- c++ -*-       
+/* will implement the 64-input beamformer 
+
+does N beams of 256
+
+order is (taking time as 8x 8.192e-6) 
+[2048 time, 63 antennas, 768 channels, 2 pol, r/i]
+Load in 16 times at a time, so that we have (in units of what needs to be added)
+[16 time, 63 antennas, 96 channels, 8 chunnels, 2 pol, r/i]
+
+This should be reordered on the cpu to 
+[16 time, 96 channels, 63 antennas, 8 chunnels, 2 pol, r/i]
+
+The first kernel, launched with 1536 blocks of 64 threads, needs to
+ - promote each measurement and store in shared mem, parallelizing over ants. need only 8 kB. 
+ - each thread processes 4 beams, adding everything. for each beam,
+  + for each chunnel and pol, calculate weights using cal weights and ant positions, 
+  + add everything into output array
+Output array has order [beam, 96 frequency, 16 time]
+
+Shared mem requirement: 8 kB for promoted data, 512b for positions, nch*1024b for weights
+
+Initially we start with 4-bit numbers. These are first rotated using 17-bit weights, yielding 22-bit numbers. 
+these are then added: (64 ant)^2 * (2 complex) * (32 chan) * (2 pol) * (16 time). 
+after adding by 64 ants, we have 28-bit numbers. Need to bit shift right by 19 after adding 64 ants. This will yield 29-bit numbers. Need to bit shift right by 21 to pick off lowest 8 bits. 
+
+Do everything in floating point until second kernel. 
+
+Second kernel will simply add times and adjacent channels and pick leading 8 bits
+Then copy back to specific locations in host to form final [beam, time, frequency] array, to be sent to corner turn.
+
+ */
+#define THRUST_IGNORE_CUB_VERSION_CHECK
+
+#include <iostream>
+#include <algorithm>
+using std::cout;
+using std::cerr;
+using std::endl;
+#include <stdio.h>
+#include <stdlib.h>
+#include <cmath>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <time.h>
+#include <syslog.h>
+#include <pthread.h>
+
+#include <mma.h>
+#include <cuda.h>
+#include "cuda_fp16.h"
+//#include "dada_cuda.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "multilog.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_def.h"
+#include <thrust/device_ptr.h>
+#include <thrust/fill.h>
+
+#include <cuda_runtime_api.h>
+using namespace nvcuda;
+
+#define sep 1.0
+
+// global variables
+int DEBUG = 0;
+
+
+// kernel for summing and requantizing
+// input array has order [beam, 48 frequency, 2 pol, 16 time]
+// need to output to [4 time, beam, 48 frequency]
+// bp is scale factor for each beam 
+// run with 256*48=12288 blocks and 32 threads
+__global__
+void adder(float *input, unsigned char *output, float *bp) {
+
+  // Sums and requantizes one (beam, frequency) cell per block.
+  // input : [beam, 48 frequency, 2 pol, 16 time] floats (32 values per block)
+  // output: [4 time, beam, 48 frequency] bytes (time planes are 12288 apart)
+  // bp    : scale factor, indexed by beam
+  // Launch with 256*48=12288 blocks of 32 threads.
+  int bidx = blockIdx.x; // assume 256*48=12288
+  int tidx = threadIdx.x; // assume 32
+  int beamidx = (int)(bidx / 48);
+
+  // declare shared mem
+  __shared__ float data[32]; // data block to be summed
+
+  // transfer from input to shared mem: every thread stages its OWN sample
+  // (previously all 32 threads read input[bidx*32], so only one sample was used)
+  data[tidx] = input[bidx*32 + tidx];
+
+  // make all 32 staged samples visible before any thread sums them
+  __syncthreads();
+
+  // Threads 0..3 each produce one output time sample: 4 adjacent times, both pols.
+  // Shared memory is only read from here on, so there is no intra-warp
+  // read/write race (the old in-place tree sum relied on lockstep execution).
+  if (tidx < 4) {
+
+    int t0 = 4*tidx; // first of the 4 input times folded into this output
+
+    // same association order as the original in-place reduction
+    float even = (data[t0]   + data[t0+16]) + (data[t0+2] + data[t0+18]);
+    float odd  = (data[t0+1] + data[t0+17]) + (data[t0+3] + data[t0+19]);
+    float total = even + odd;
+
+    // scale by the beam's factor, round to nearest int, halve, truncate to a byte
+    output[bidx + tidx*12288] = (unsigned char)(__float2int_rn(total*bp[beamidx])/2);
+
+  }
+
+}
+
+// kernel for promotion
+/*
+orig input is [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i]
+input is [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i]
+output needs to be [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] 
+promoted to half precision  
+
+launch with 16*48*NANT blocks of 32 threads
+
+ */
+__global__ void promoter(char *input, half *inr, half *ini) {
+
+  // One thread per packed input byte: low nibble -> inr, high nibble -> ini.
+  // Launch with 16*48*NANT blocks of 32 threads.
+  const int block = blockIdx.x;
+  const int lane = threadIdx.x;
+  const char packed = input[block*32 + lane];
+
+  // the block index is decoded as (time, antenna, channel), channel fastest
+  const int chan = block % 48;
+  const int tim = (block / 48) / NANT;
+  const int ant = (block / 48) % NANT;
+  const int chunnel = lane / 2;
+  const int pol = lane % 2;
+
+  // destination index in [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels]
+  const int dst = tim*98304 + chan*2048 + pol*1024 + ant*16 + chunnel;
+
+  // sign-extend each 4-bit field with a char shift (assumes char is signed), then promote
+  inr[dst] = __float2half((float)(((char)((packed & 15) << 4)) >> 4));
+  ini[dst] = __float2half((float)(((char)((packed & 240))) >> 4));
+}
+
+// 16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels
+// for first time, launch with 3072, 32
+__global__ void printer(half *inr, half *ini) {
+
+  // Debug helper: prints every non-zero promoted sample with its coordinates.
+  int idx = blockIdx.x*32+threadIdx.x;
+  float ir = __half2float(inr[idx]);
+  float ii = __half2float(ini[idx]);
+
+  // each block covers 2 antennas x 16 chunnels; 32 blocks per pol, 64 per channel
+  int chunnel = (int)(threadIdx.x % 16);
+  int channel = (int)(blockIdx.x/64);
+  int tt = (int)(blockIdx.x % 64);
+  int pol = (int)(tt/32);
+  // antenna = 2*(block within pol) + which half of the block this thread is in
+  // (this was a product, which reported antenna 0 for every thread below 16)
+  int ant = 2*((int)(tt % 32)) + (int)(threadIdx.x / 16);
+  if (ir!=0. || ii!=0.) printf("%d %d %d %d %f %f\n",channel,pol,ant,chunnel,ir,ii);
+}
+
+
+// kernel for beamforming
+/*
+
+Assumes that up to NANT antennas (nominally 63) are populated. 
+
+Input is [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] (promoted)
+
+Arithmetic... for rotation, d2r = wr*dr-wi*di; d2i = wi*dr+wr*di
+
+Conventions for beamforming. beam 0 is furthest East, beam 127 is at meridian. antpos (D) is easting. 
+for bf weight calculation, where theta = s(127-n), ang = 2*pi*nu*theta*D/c; wr = cos(ang), wi = sin(ang)
+use __float2int_rn, cosf, sinf intrinsics. 
+
+Each warp (==block) has to deal with 256 beams for 64 ants, summing over 16 chunnels and pols. 
+Do it in tiles of 16 beams and 16 ants for 
+
+Output array has order [beam, 48 frequency, 2 pol, 16 time]
+
+inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag
+wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant]
+
+launch with 16time * 48freq * 2pol * 16beam_tile blocks of 32 threads for massive utilization
+ = 24576 blocks
+
+*/
+__global__ void beamformer(half *inr, half *ini, half *wr, half *wi, float *output, int stuffants) {
+  // One 32-thread block handles one (time, freq, pol) data tile and one group of 16 beams.
+  // get block and thread ids
+  int bidx = blockIdx.x; // assume 24576
+  int tidx = threadIdx.x; // assume 32
+  int orig_bidx = (int)(bidx / 16); // data-tile index; 16 consecutive blocks share it
+  int beam_tile = (int)(bidx % 16); // which group of 16 beams this block computes
+  int stuff_tile = (int)(beam_tile % 4); // antenna tile used by the stuffants==1 mode
+  int data_offset = orig_bidx*1024; // offset for first part of data
+  int weight_offset = (int)(orig_bidx % 96); // offset for first part of weight
+  weight_offset *= 16384; // 16 beam_tile * 4 ant_tile * 256 weights per (freq, pol)
+  int idx1, idx2; // NOTE(review): never used in this kernel
+  int f_idx = (int)(orig_bidx % 96); // combined (freq, pol) index
+  int tim_idx = (int)(orig_bidx / 96);
+  int oidx = f_idx*16 + tim_idx; // position inside one beam's 1536-float output row
+  
+  // shared memory for convenience
+  __shared__ float summr[16][16]; // beam, chunnel
+  __shared__ float summi[16][16]; // beam, chunnel
+  
+  // accumulate real and imag parts into [16 beam x 16 f] fragments
+  // Declare the fragments.
+  wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;
+  wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, float> wr_inr_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, float> wr_ini_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, float> wi_inr_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, float> wi_ini_frag;
+  wmma::fragment<wmma::accumulator, 16, 16, 16, float> ib_frag;
+  
+  // zero out accumulators
+  wmma::fill_fragment(wr_inr_frag, 0.0f);
+  wmma::fill_fragment(wr_ini_frag, 0.0f);
+  wmma::fill_fragment(wi_inr_frag, 0.0f);
+  wmma::fill_fragment(wi_ini_frag, 0.0f);
+  wmma::fill_fragment(ib_frag, 0.0f);
+
+  // IB: the same data tile is loaded as both operands (col-major A, row-major B), i.e. D^T * D
+  if (stuffants==2) {
+
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> c_frag;
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> d_frag;
+    
+    for (int ant_tile=0; ant_tile<4; ant_tile++) {
+      // real part, then imaginary part, both accumulated into ib_frag
+      wmma::load_matrix_sync(c_frag, inr + data_offset + ant_tile*256, 16);
+      wmma::load_matrix_sync(d_frag, inr + data_offset + ant_tile*256, 16);
+      wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag);
+      wmma::load_matrix_sync(c_frag, ini + data_offset + ant_tile*256, 16);
+      wmma::load_matrix_sync(d_frag, ini + data_offset + ant_tile*256, 16);
+      wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag);
+
+    }
+
+  }
+
+  // one ant per beam
+  if (stuffants==1) {        
+    // same tile as row-major A and col-major B, i.e. D * D^T; only the diagonal is output below
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> c_frag;
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> d_frag;
+    wmma::load_matrix_sync(c_frag, inr + data_offset + stuff_tile*256, 16);
+    wmma::load_matrix_sync(d_frag, inr + data_offset + stuff_tile*256, 16);
+    wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag);
+    wmma::load_matrix_sync(c_frag, ini + data_offset + stuff_tile*256, 16);
+    wmma::load_matrix_sync(d_frag, ini + data_offset + stuff_tile*256, 16);
+    wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag);
+    
+  }
+  if (stuffants!=1) {
+  
+    // loop over ant tiles
+    for (int ant_tile=0; ant_tile<4; ant_tile++) {
+      
+      // copy weight and data to fragments, and multiply to accumulators
+      
+      wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16);
+      wmma::load_matrix_sync(b_frag, inr + data_offset + ant_tile*256, 16);
+      wmma::mma_sync(wr_inr_frag, a_frag, b_frag, wr_inr_frag);
+      
+      wmma::load_matrix_sync(a_frag, wi + weight_offset + beam_tile*1024 + ant_tile*256, 16);
+      wmma::mma_sync(wi_inr_frag, a_frag, b_frag, wi_inr_frag);
+      
+      wmma::load_matrix_sync(b_frag, ini + data_offset + ant_tile*256, 16);
+      wmma::mma_sync(wi_ini_frag, a_frag, b_frag, wi_ini_frag);
+      // a_frag currently holds wi, so wr is loaded again for the last product
+      wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16);
+      wmma::mma_sync(wr_ini_frag, a_frag, b_frag, wr_ini_frag);
+      
+    }
+
+    // form real and imaginary matrices
+    for(int i=0; i < wr_inr_frag.num_elements; i++) {
+      wr_inr_frag.x[i] = wr_inr_frag.x[i] - wi_ini_frag.x[i]; // output real
+      wi_inr_frag.x[i] = wi_inr_frag.x[i] + wr_ini_frag.x[i]; // output imag
+      wr_inr_frag.x[i] = wr_inr_frag.x[i]*wr_inr_frag.x[i] + wi_inr_frag.x[i]*wi_inr_frag.x[i]; // squared
+    }
+  }
+
+  // at this stage the matrices are [beam, chunnel], and need to be summed over columns
+    
+  // copy back to shared mem
+  float *p1, *p2, tmp;
+  p1 = &summr[0][0];
+  wmma::store_matrix_sync(p1, wr_inr_frag, 16, wmma::mem_row_major);
+
+  if (stuffants!=1) {
+    // NOTE(review): no barrier between the store above or the steps below; this appears to rely on warp lockstep
+    // do thread reduction for each beam
+    if (tidx<8) {
+      for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+8];
+      for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+4];
+      for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+2];
+      for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+1];
+    }
+    if (tidx>=8 && tidx<16) {
+      for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+8-8];
+      for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+4-8];
+      for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+2-8];
+      for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+1-8];  
+    }
+    if (tidx>=16 && tidx<24) {
+      for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+8-16];
+      for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+4-16];
+      for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+2-16];
+      for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+1-16];  
+    }
+    if (tidx>=24) {
+      for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+8-24];
+      for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+4-24];
+      for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+2-24];
+      for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+1-24];  
+    }
+
+    __syncthreads();
+    
+    // now summr[beam][0] can go into output
+    if (tidx<16) {
+      output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][0];
+    }
+
+  }
+  // stuffants==1: write only the diagonal of the stored matrix, one value per thread below 16
+  if (stuffants==1) {
+    if (tidx<16) {
+      output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][tidx];
+    }
+  }
+  if (stuffants==2) {
+    // the trace of ib_frag overwrites the beam-0 value written above (beam_tile 0, thread 0 only)
+    p2 = &summi[0][0];
+    wmma::store_matrix_sync(p2, ib_frag, 16, wmma::mem_row_major);      
+    tmp = 0.;
+    for (int i=0;i<16;i++) tmp += summi[i][i];
+    if (tidx==0 && beam_tile==0) 
+      output[(beam_tile*16+tidx)*1536 + oidx] = tmp;
+
+  }      
+  
+}
+
+// kernel to calculate weights - needed because weights are halfs
+// launch with 256 threads in 6144 blocks
+__global__
+void calc_weights(float *antpos, float *weights, float *freqs, half *wr, half *wi) {
+
+  // Launch with 6144 blocks (48 f x 2 pol x 16 beam_tile x 4 ant_tile) of 256 threads;
+  // each thread fills one (beam, ant) entry of one 16x16 weight tile.
+  const int block = blockIdx.x;
+  const int lane = threadIdx.x;
+  const int out = block*256 + lane; // flat index into wr / wi
+
+  // unpack the block index
+  const int f = block / 128;
+  const int pol = (block % 128) / 64;
+  const int tile = block % 64; // beam_tile*4 + ant_tile
+
+  // unpack the thread index inside the tile
+  const int beam = (tile / 4)*16 + lane / 16;
+  const int ant = (tile % 4)*16 + lane % 16;
+
+  // phase angle for this beam, frequency and antenna
+  float theta = sep*(127.-beam*1.)*PI/10800.; // radians
+  float afac = -2.*PI*freqs[f*8+4]*theta/CVAC; // factor for rotate
+  float phase = afac*antpos[ant];
+  float c = cos(phase);
+  float s = sin(phase);
+
+  // complex product (c + i s) * (w[0] + i w[1]), stored as halfs
+  const float *w = weights + ant*NW*2*2 + f*2*2 + pol*2;
+  wr[out] = __float2half((c*w[0] - s*w[1]));
+  wi[out] = __float2half((s*w[0] + c*w[1]));
+}
+ 
+  
+// function prototypes
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out, dada_hdu_t * out2);
+int dada_bind_thread_to_core (int core);
+int init_weights(char *fnam, float *antpos, float *weights, char *flagants);
+void reorder_block(char *block);
+void calc_bp(float *data, float *bp, int pr);
+
+
+// Accumulates the sum of every beam's samples into bp.
+// data holds 256 beams x 48 frequencies x 32 samples, beam-major and contiguous.
+// bp is indexed by beam (256 entries); sums are ADDED to whatever bp already holds.
+// When pr is non-zero every non-zero sample is printed as "beam freq sample value".
+void calc_bp(float *data, float *bp, int pr) {
+
+  const int nbeam = 256, nfreq = 48, nsamp = 32;
+
+  for (int idx=0; idx<nbeam*nfreq*nsamp; idx++) {
+
+    // recover the (beam, frequency, sample) coordinates of this flat index
+    const int b = idx / (nfreq*nsamp);
+    const int f = (idx / nsamp) % nfreq;
+    const int a = idx % nsamp;
+
+    bp[b] += data[idx];
+    if (pr && data[idx]!=0.) printf("%d %d %d %f\n",b,f,a,data[idx]);
+  }
+}
+
+// performs cpu reorder of block to be loaded to GPU (in place, via a scratch copy)
+void reorder_block(char * block) {
+  // from [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i]
+  // to [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i]
+  // 24576*NANT in total. 1536*NANT per time
+  char * output = (char *)malloc(sizeof(char)*24576*NANT);
+
+  // malloc was previously unchecked; a NULL scratch buffer would be dereferenced below
+  if (output == NULL) {
+    syslog(LOG_ERR,"reorder_block: could not allocate scratch buffer");
+    exit(EXIT_FAILURE);
+  }
+
+  for (int i=0;i<16;i++) { // over time
+    for (int j=0;j<NANT;j++) { // over ants
+      for (int k=0;k<48;k++) { // over channels
+	// copy 32 bytes
+	memcpy(output + i*1536*NANT + k*NANT*32 + j*32, block + i*1536*NANT + j*1536 + k*32, 32); 
+      }
+    }
+  }
+  memcpy(block,output,24576*NANT);
+  free(output);
+}
+
+
+// loads in antenna positions and weights, then zeroes the weights of flagged antennas
+int init_weights(char * fnam, float *antpos, float *weights, char *flagants) {
+  // assumes 64 antennas
+  // antpos: takes only easting
+  // weights: takes [ant, NW==48] 
+  // returns 0 on success, 1 if either file cannot be opened
+  FILE *fin;
+  FILE *fants;
+  if (!(fin=fopen(fnam,"rb"))) {
+    syslog(LOG_ERR,"Couldn't open weights file %s",fnam);
+    return 1;
+  }
+  if (!(fants=fopen(flagants,"r"))) {
+    syslog(LOG_ERR,"Couldn't open flag ants file %s",flagants);
+    fclose(fin); // do not leak the weights file on this error path
+    return 1;
+  }
+
+  // a short weights file used to go unnoticed; report it (behaviour is otherwise unchanged)
+  size_t nread = fread(antpos,64*sizeof(float),1,fin);
+  nread += fread(weights,64*NW*2*2*sizeof(float),1,fin);
+  if (nread != 2) syslog(LOG_WARNING,"Short read from weights file %s",fnam);
+
+  // normalise each complex weight to unit amplitude; exact zeros are left alone
+  float wnorm;
+  for (int i=0;i<64*NW*2;i++) {
+    wnorm = sqrt(weights[2*i]*weights[2*i] + weights[2*i+1]*weights[2*i+1]);
+    if (wnorm!=0.0) {
+      weights[2*i] /= wnorm;
+      weights[2*i+1] /= wnorm;
+    }
+  }
+
+  // Loop on successful conversions instead of feof(): an empty or malformed file
+  // can no longer zero weights at an uninitialised or out-of-range antenna index.
+  int ant;
+  while (fscanf(fants,"%d",&ant)==1) {
+    if (ant<0 || ant>=64) { syslog(LOG_WARNING,"Ignoring flagged antenna %d",ant); continue; }
+    for (int j=0;j<NW*2*2;j++) weights[ant*NW*2*2+j] = 0.0;
+  }
+  fclose(fants);
+  fclose(fin);
+  if (DEBUG) syslog(LOG_INFO,"Loaded antenna positions and weights");
+  return 0;
+}
+
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out, dada_hdu_t * out2)
+{
+  // Tear down all three HDUs. A failed unlock is logged but never stops the
+  // teardown: every HDU is destroyed regardless of its unlock result.
+
+  // input HDU: unlock read, then destroy
+  if (dada_hdu_unlock_read (in) < 0)
+    syslog(LOG_ERR, "could not unlock read on hdu_in");
+  dada_hdu_destroy (in);
+
+  // first output HDU: unlock write, then destroy
+  if (dada_hdu_unlock_write (out) < 0)
+    syslog(LOG_ERR, "could not unlock write on hdu_out");
+  dada_hdu_destroy (out);
+
+  // second output HDU: unlock write, then destroy
+  if (dada_hdu_unlock_write (out2) < 0)
+    syslog(LOG_ERR, "could not unlock write on hdu_out2");
+  dada_hdu_destroy (out2);
+
+  // nothing else is released by this function
+}
+
+void usage() {
+  fputs( // help text has no conversion specifiers, so it is written verbatim to stdout
+    "dsaX_beamformer [options]\n"
+    " -c core   bind process to CPU core [no default]\n"
+    " -d send debug messages to syslog\n"
+    " -f filename for antenna stuff [no default]\n"
+    " -i input key [default REORDER_BLOCK_KEY2]\n"
+    " -o output key [default BF_BLOCK_KEY]\n"
+    " -g output key 2 [no default]\n"
+    " -z fch1 in MHz [default 1530]\n"
+    " -a flagants file\n"
+    " -s stuffants \n"
+    " -q do incoherent beam \n"
+    " -t test pattern \n"
+    " -h print usage\n"
+    , stdout);
+}
+
+// MAIN: parse options, attach to one input and two output DADA buffers, then
+// beamform block by block (the first block is used only to derive the bandpass).
+int main (int argc, char *argv[]) {
+
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_beamformer", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  // device properties
+  int nDevices;
+
+  cudaGetDeviceCount(&nDevices);
+  for (int i = 0; i < nDevices; i++) {
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, i);
+    syslog(LOG_INFO,"Device Number: %d", i);
+    syslog(LOG_INFO,"  Device name: %s", prop.name);
+    syslog(LOG_INFO,"  Memory Clock Rate (KHz): %d",prop.memoryClockRate);
+  }
+  cudaSetDevice(1); // NOTE(review): GPU index 1 is hard-coded; result is not checked
+  
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+  dada_hdu_t* hdu_out2 = 0;
+
+  // data block HDU keys
+  key_t in_key = REORDER_BLOCK_KEY2;
+  key_t out_key = BF_BLOCK_KEY, out_key2 = BF_BLOCK_KEY;
+  
+  // command line arguments
+  int core = -1;
+  int arg = 0;
+  int stuffants=0;
+  int test_pattern = 0;
+  float fch1 = 1530.0;
+  char * fnam;
+  fnam=(char *)malloc(sizeof(char)*100);
+  sprintf(fnam,"nofile");  
+  char * flagants;
+  flagants=(char *)malloc(sizeof(char)*100);
+  sprintf(flagants,"nofile");  
+
+  while ((arg=getopt(argc,argv,"c:f:i:o:g:z:a:tsqdh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'i':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &in_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-i flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'g':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key2) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-g flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'f':
+	  if (optarg)
+	    {
+	      snprintf(fnam,100,"%s",optarg); // bounded: fnam holds 100 bytes
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-f flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }	  
+	case 'a':
+	  if (optarg)
+	    {
+	      snprintf(flagants,100,"%s",optarg); // bounded: flagants holds 100 bytes
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-a flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }	  
+	case 'z':
+	  if (optarg)
+	    {
+	      fch1 = atof(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-z flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }	  
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+	case 't':
+	  test_pattern=1;
+	  syslog (LOG_INFO, "Will execute test pattern");
+	  break;
+	case 's':
+	  stuffants=1;
+	  syslog (LOG_INFO, "Will place antennas in output");
+	  break;
+	case 'q':
+	  stuffants=2;
+	  syslog (LOG_INFO, "Will place IB in output");
+	  break;
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // print stuff
+  syslog(LOG_INFO,"Forming 256 beams with sep %g arcmin, fch1 %g",sep,fch1);
+  syslog(LOG_INFO,"Using calibrations file %s",fnam);
+  syslog(LOG_INFO,"Using flagants file %s",flagants);
+
+  // load in weights and antpos (zero-initialised so a failed load never uses indeterminate values)
+  float * antpos = (float *)calloc(64, sizeof(float)); // easting
+  float * weights = (float *)calloc(64*NW*2*2, sizeof(float)); // complex weights [ant, NW, pol, r/i]
+  float * freqs = (float *)malloc(sizeof(float)*384); // freq
+  for (int i=0;i<384;i++) freqs[i] = (fch1 - i*250./8192.)*1e6;  
+  
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      else syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+  
+  // DADA stuff
+  
+  syslog (LOG_INFO, "creating in and out hdus");
+  
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out2  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out2, out_key2);
+  if (dada_hdu_connect (hdu_out2) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out2) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+
+  
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+      return EXIT_FAILURE;
+    }
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+      return EXIT_FAILURE;
+    }
+
+  header_out = ipcbuf_get_next_write (hdu_out2->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+      return EXIT_FAILURE;
+    }
+  
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+  
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  uint64_t block_out2 = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out2->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
+  uint64_t  bytes_read = 0;
+  int nints = NPACKETS / 16;
+  uint64_t nbytes_per_int = block_size / nints;
+  uint64_t nbytes_per_out = block_out / nints;
+  char * block;
+  unsigned char * output_buffer;
+  output_buffer = (unsigned char *)malloc(sizeof(unsigned char)*block_out);
+  memset(output_buffer,0,block_out);
+  uint64_t written, block_id;
+  
+  // allocate host and device memory for calculations
+  //inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag
+  //wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant]        
+  char *d_indata[NSTREAMS];
+  unsigned char *d_outdata[NSTREAMS];
+  float *d_transfer[NSTREAMS], *d_bp, *d_antpos, *d_weights, *d_freqs;
+  half *d_wr, *d_wi, *d_inr[NSTREAMS], *d_ini[NSTREAMS];
+  cudaMalloc((void **)&d_antpos, 64*sizeof(float)); // ant positions
+  cudaMalloc((void **)&d_weights, 64*NW*2*2*sizeof(float)); // weights
+  cudaMalloc((void **)&d_freqs, 384*sizeof(float)); // freqs        
+  cudaMalloc((void **)&d_bp, 256*sizeof(float)); // bandpass
+  cudaMalloc((void **)&d_wr, 48*2*16*4*16*16*sizeof(half)); // real weight
+  cudaMalloc((void **)&d_wi, 48*2*16*4*16*16*sizeof(half)); // imag weight
+  cudaMemcpy(d_freqs, freqs, 384*sizeof(float), cudaMemcpyHostToDevice);
+  
+  float *h_transfer = (float *)malloc(sizeof(float)*256*96*16*NSTREAMS);
+  char *h_indata = (char *)malloc(sizeof(char)*16*NANT*96*8*2);
+  float *bp = (float *)malloc(sizeof(float)*256);
+  unsigned char *tmp_buf = (unsigned char *)malloc(sizeof(unsigned char)*256*48*4*NSTREAMS);  
+  
+  // streams and device  
+  cudaStream_t stream[NSTREAMS];
+  for (int st=0;st<NSTREAMS;st++) {
+    cudaStreamCreate(&stream[st]);
+    cudaMalloc((void **)&d_indata[st], 16*96*NANT*8*2*sizeof(char)); // data input to bf kernel
+    cudaMalloc((void **)&d_outdata[st], 256*48*4*sizeof(unsigned char)); // data output from adder
+    cudaMalloc((void **)&d_transfer[st], 256*96*16*sizeof(float)); // output from beamformer
+    cudaMalloc((void **)&d_inr[st], 16*48*2*64*16*sizeof(half)); // real data
+    cudaMalloc((void **)&d_ini[st], 16*48*2*64*16*sizeof(half)); // imag data
+    thrust::device_ptr<half> d1(d_inr[st]);
+    thrust::fill(d1, d1+16*48*2*64*16, 0.0);
+    thrust::device_ptr<half> d2(d_ini[st]);
+    thrust::fill(d2, d2+16*48*2*64*16, 0.0);
+  }
+
+  
+  
+  // set up
+
+  int observation_complete=0;
+  int blocks = 0, started = 0;
+  int blockct = 0;
+  
+  syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    blockct ++;
+    if (!block)
+      {
+	syslog(LOG_ERR, "main: could not open input block");
+	dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+	return EXIT_FAILURE;
+      }
+
+    // NOTE: nothing is written to hdu_out2's data block in this loop; only its
+    // header is filled in above (a pass-through write used to sit here, commented out).
+    
+    // DO STUFF
+
+    // calc weights (both files are re-read for every block); a failed load is reported, not ignored
+    if (init_weights(fnam,antpos,weights,flagants) != 0) syslog(LOG_ERR,"main: could not load weights from %s / %s",fnam,flagants);
+    cudaMemcpy(d_antpos, antpos, 64*sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_weights, weights, 64*NW*2*2*sizeof(float), cudaMemcpyHostToDevice);  
+    calc_weights<<<6144, 256>>>(d_antpos, d_weights, d_freqs, d_wr, d_wi);
+    if (DEBUG) syslog(LOG_INFO,"Finished with weights");
+    
+    if (started==1) {
+
+      // loop over ints
+      for (int bst=0;bst<nints/NSTREAMS;bst++) {
+
+	for (int st=0;st<NSTREAMS;st++) {
+
+
+	  
+	  // copy to h_indata
+	  //memcpy(h_indata,block+(bst*NSTREAMS+st)*nbytes_per_int,nbytes_per_int);
+
+	  // rotate h_indata in place
+	  //reorder_block(h_indata);
+	  
+	  // copy to device
+	  //cudaMemcpyAsync(d_indata, h_indata, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
+	  cudaMemcpyAsync(d_indata[st], block+(bst*NSTREAMS+st)*nbytes_per_int, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
+
+	  // do promotion
+	  promoter<<<16*48*NANT, 32, 0, stream[st]>>>(d_indata[st], d_inr[st], d_ini[st]);
+	  
+	  // run beamformer kernel
+	  beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants);
+	  	  
+	  // run adder kernel
+	  adder<<<12288, 32, 0, stream[st]>>>(d_transfer[st], d_outdata[st], d_bp);
+	  
+	  // copy to host (tmp_buf is pageable, so this call returns only after the copy is done)
+	  cudaMemcpyAsync(tmp_buf + 256*48*4*st, d_outdata[st], 256*48*4*sizeof(unsigned char), cudaMemcpyDeviceToHost, stream[st]);
+
+	  // copy to output
+	  for (int j=0;j<12288*4;j++) {
+	    if (test_pattern) 
+	      output_buffer[(bst*NSTREAMS+st)*12288*4+j] = (unsigned char)((blockct % 128) + 32);
+	    else
+	      output_buffer[(bst*NSTREAMS+st)*12288*4+j] = tmp_buf[j+256*48*4*st];
+	  }
+	  if (DEBUG && bst*NSTREAMS+st==10) {
+	    for (int j=0;j<48;j++) syslog(LOG_DEBUG,"%hu",output_buffer[(bst*NSTREAMS+st)*12288+BEAM_OUT*48+j]);
+	  }        
+	  
+	}
+      }
+
+
+    }
+    
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+
+      // calculate bandpass
+
+      for (int i=0;i<256;i++) bp[i] = 0.;
+      
+      // do standard bf but calculate bandpass
+
+      // loop over ints
+      for (int bst=0;bst<nints/NSTREAMS;bst++) {
+
+	for (int st=0;st<NSTREAMS;st++) {
+	  
+	  // copy to h_indata
+	  //memcpy(h_indata,block+(bst*NSTREAMS+st)*nbytes_per_int,nbytes_per_int);
+
+	  // rotate h_indata in place - this is current
+	  //reorder_block(h_indata);
+
+	  // copy to device
+	  //cudaMemcpyAsync(d_indata, h_indata, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
+	  cudaMemcpyAsync(d_indata[st], block+(bst*NSTREAMS+st)*nbytes_per_int, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
+
+	  // do promotion
+	  promoter<<<16*48*NANT, 32, 0, stream[st]>>>(d_indata[st], d_inr[st], d_ini[st]);
+
+	  //if (bst==0 && st==0) 
+	  //  printer<<<3072, 32>>>(d_inr,d_ini);	  
+	  
+	  // run beamformer kernel
+	  beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants);
+	  
+	  // copy back to host
+	  cudaMemcpyAsync(h_transfer + st*256*96*16, d_transfer[st], sizeof(float)*393216, cudaMemcpyDeviceToHost, stream[st]);	
+
+	  // calculate bandpass
+	  //if (st==0 && bst==0) 
+	  //calc_bp(h_transfer,bp,1);
+	  calc_bp(h_transfer + st*256*96*16,bp,0);
+
+	}
+      }
+
+      // adjust bandpass
+      syslog(LOG_INFO,"Final BP...");
+      for (int i=0;i<256;i++) {
+	syslog(LOG_INFO,"coeff %d %g",i,bp[i]);
+	if (bp[i]!=0.) {
+	  bp[i] /= 48.*nints; 
+	  bp[i] = 128./bp[i]/4.;
+	}
+      }
+      cudaMemcpy(d_bp, bp, sizeof(float)*256, cudaMemcpyHostToDevice);
+      
+      // junk into output
+      memset(output_buffer,0,block_out);
+      
+    }
+
+    // write output for debug
+    
+    // write to output; `written` is unsigned, so an error return of -1 must be caught with != rather than <
+    written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out);
+    if (written != block_out)
+      {
+	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+	dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+	return EXIT_FAILURE;
+      }
+
+    if (DEBUG) {
+      syslog(LOG_DEBUG, "written block %d",blocks);      
+    }
+    blocks++;
+    
+
+    if (bytes_read < block_size)
+      observation_complete = 1;
+
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+
+  }
+
+  for (int st=0;st<NSTREAMS;st++) {
+    cudaStreamDestroy(stream[st]);
+    cudaFree(d_indata[st]);
+    cudaFree(d_outdata[st]);
+    cudaFree(d_transfer[st]);
+    cudaFree(d_inr[st]);
+    cudaFree(d_ini[st]);
+  }
+  free(fnam);
+  free(flagants);
+  free(h_indata);
+  free(output_buffer);
+  free(antpos);
+  free(weights);
+  free(freqs);
+  free(bp);
+  free(h_transfer);
+  free(tmp_buf);
+  cudaFree(d_wr);
+  cudaFree(d_wi);
+  cudaFree(d_antpos);
+  cudaFree(d_freqs);
+  cudaFree(d_weights);
+  cudaFree(d_bp);
+  // d_wr and d_wi are released exactly once above; freeing them a second time is an error
+  dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+
+  return EXIT_SUCCESS;
+}
+
+
diff --git a/legacy/dsaX_bfCorr.cu b/legacy/dsaX_bfCorr.cu
new file mode 100644
index 0000000..25b9262
--- /dev/null
+++ b/legacy/dsaX_bfCorr.cu
@@ -0,0 +1,1286 @@
+// -*- c++ -*-
+/* assumes input and output block size is appropriate - will seg fault otherwise*/
+/*
+Workflow is similar for BF and corr applications
+ - copy data to GPU, convert to half-precision and calibrate while reordering
+ - do matrix operations to populate large output vector
+ */
+#include <iostream>
+#include <algorithm>
+using std::cout;
+using std::cerr;
+using std::endl;
+#include <stdio.h>
+#include <stdlib.h>
+#include <cmath>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <time.h>
+#include <syslog.h>
+#include <pthread.h>
+
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "multilog.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_def.h"
+
+#include <cuda.h>
+#include "cuda_fp16.h"
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
+// required to prevent overflow in corr matrix multiply
+#define halfFac 4
+
+// beam sep
+#define sep 1.0 // arcmin
+
+/* global variables */
+int DEBUG = 1;
+
+// Bundle of host and device buffers shared by the correlator and beamformer paths
+typedef struct dmem {
+
+  // initial data and streams
+  char * h_input; // host input pointer
+  char * d_input, * d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
+  
+  // correlator pointers
+  // giant array for r and i: [NCHAN_PER_PACKET, 2 pol, NANTS_PROCESS, NPACKETS_PER_BLOCK * 2 times]
+  half * d_r, * d_i;
+  // arrays for matrix multiply output: input [NANTS_PROCESS, NANTS_PROCESS]
+  half * d_outr, *d_outi, *d_tx_outr, *d_tx_outi;
+  // giant output array: [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex]
+  float * d_output;
+  
+  // beamformer pointers
+  char * d_big_input; // device buffer sized for all NANTS (d_input holds NANTS/2 in beamformer mode)
+  half * d_br, * d_bi; // presumably real/imag planes of the beamformer input - confirm against the kernels
+  half * weights_r, * weights_i; //weights: [arm, tactp, b]
+  half * d_bigbeam_r, * d_bigbeam_i; //output: [tc, b]
+  unsigned char * d_bigpower; //output: [b, tc]
+  float * d_scf; // scale factor per beam
+  float * d_chscf; // device array of (NBEAMS/2)*(NCHAN_PER_PACKET/8) floats
+  float * h_winp; // host input weights: [NANTS, E/N] followed by [NANTS, 48, 2pol, R/I]
+  int * flagants, nflags; // host array of NANTS ints; nflags is presumably how many are in use
+  float * h_freqs, * d_freqs; // host and device arrays of NCHAN_PER_PACKET/8 floats
+
+  // timing accumulators, zeroed by initialize() in beamformer mode
+  float cp, prep, cubl, outp;
+  
+} dmem;
+
+
+// Allocates the working buffers held in *d. bf==0 sets up the correlator device
+// buffers; bf==1 sets up the beamformer device and host buffers and zeroes the
+// timing accumulators. Any other bf value allocates nothing.
+void initialize(dmem * d, int bf) {
+  if (bf==0) {
+    // correlator: byte counts shared by more than one buffer
+    const size_t raw_bytes = sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2;
+    const size_t planar_bytes = sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2;
+    const size_t gemm_bytes = sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac;
+    cudaMalloc((void **)(&d->d_input), raw_bytes);
+    cudaMalloc((void **)(&d->d_r), planar_bytes);
+    cudaMalloc((void **)(&d->d_i), planar_bytes);
+    cudaMalloc((void **)(&d->d_tx), raw_bytes);
+    cudaMalloc((void **)(&d->d_output), sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2);
+    cudaMalloc((void **)(&d->d_outr), gemm_bytes);
+    cudaMalloc((void **)(&d->d_outi), gemm_bytes);
+    cudaMalloc((void **)(&d->d_tx_outr), gemm_bytes);
+    cudaMalloc((void **)(&d->d_tx_outi), gemm_bytes);
+  }
+
+  if (bf==1) {
+    // beamformer: input-sized buffers cover half of the antennas each
+    const size_t half_raw_bytes = sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2;
+    const size_t half_planar_bytes = sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2;
+    const size_t weight_bytes = sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8);
+    const size_t beam_bytes = sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2);
+    cudaMalloc((void **)(&d->d_input), half_raw_bytes);
+    cudaMalloc((void **)(&d->d_big_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS)*NCHAN_PER_PACKET*2*2);
+    cudaMalloc((void **)(&d->d_tx), half_raw_bytes);
+    cudaMalloc((void **)(&d->d_br), half_planar_bytes);
+    cudaMalloc((void **)(&d->d_bi), half_planar_bytes);
+    cudaMalloc((void **)(&d->weights_r), weight_bytes);
+    cudaMalloc((void **)(&d->weights_i), weight_bytes);
+    cudaMalloc((void **)(&d->d_bigbeam_r), beam_bytes);
+    cudaMalloc((void **)(&d->d_bigbeam_i), beam_bytes);
+    cudaMalloc((void **)(&d->d_bigpower), sizeof(unsigned char)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS));
+    cudaMalloc((void **)(&d->d_scf), sizeof(float)*(NBEAMS/2)); // beam scale factor
+    cudaMalloc((void **)(&d->d_chscf), sizeof(float)*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); // per-channel beam scale factor
+    // host input weights: first is [NANTS, E/N], then [NANTS, 48, 2pol, R/I]
+    d->h_winp = (float *)malloc(sizeof(float)*(NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2));
+    d->flagants = (int *)malloc(sizeof(int)*NANTS);
+    d->h_freqs = (float *)malloc(sizeof(float)*(NCHAN_PER_PACKET/8));
+    cudaMalloc((void **)(&d->d_freqs), sizeof(float)*(NCHAN_PER_PACKET/8));
+    d->cp = d->prep = d->outp = d->cubl = 0.; // reset all four timers
+  }
+}
+
+// Frees the buffers that initialize() allocated in *d for the same mode:
+// bf==0 releases the correlator buffers, bf==1 the beamformer ones.
+void deallocate(dmem * d, int bf) {
+  cudaFree(d->d_input); // allocated in both modes
+  if (bf==0) {
+    cudaFree(d->d_r);
+    cudaFree(d->d_i);
+    cudaFree(d->d_tx);
+    cudaFree(d->d_output);
+    cudaFree(d->d_outr);
+    cudaFree(d->d_outi);
+    cudaFree(d->d_tx_outr);
+    cudaFree(d->d_tx_outi);
+  }
+  if (bf==1) {
+    cudaFree(d->d_tx);
+    cudaFree(d->d_big_input); // allocated by initialize(bf==1) but previously never released
+    cudaFree(d->d_br);
+    cudaFree(d->d_bi);
+    cudaFree(d->weights_r);
+    cudaFree(d->weights_i);
+    cudaFree(d->d_bigbeam_r);
+    cudaFree(d->d_bigbeam_i);
+    cudaFree(d->d_bigpower);
+    cudaFree(d->d_scf);
+    cudaFree(d->d_chscf);
+    // the host-side arrays come from malloc(), hence free() rather than cudaFree()
+    free(d->h_winp);
+    free(d->flagants);
+    cudaFree(d->d_freqs);
+    free(d->h_freqs);
+  }
+}
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
+int dada_bind_thread_to_core (int core);
+
+// Tears down both DADA HDUs: drops the read lock on `in` and the write lock on
+// `out`, destroying each HDU right after its unlock attempt. A failed unlock is
+// only logged; the HDU is destroyed regardless.
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
+{
+  // input HDU
+  if (dada_hdu_unlock_read(in) < 0)
+    syslog(LOG_ERR, "could not unlock read on hdu_in");
+  dada_hdu_destroy(in);
+
+  // output HDU
+  if (dada_hdu_unlock_write(out) < 0)
+    syslog(LOG_ERR, "could not unlock write on hdu_out");
+  dada_hdu_destroy(out);
+
+}
+
+
+// Prints the dsaX_bfCorr command-line help text to stdout.
+void usage() {
+  fputs(
+	"dsaX_bfCorr [options]\n"
+	" -c core   bind process to CPU core [no default]\n"
+	" -d send debug messages to syslog\n"
+	" -i in_key [default REORDER_BLOCK_KEY]\n"
+	" -o out_key [default XGPU_BLOCK_KEY]\n"
+	" -b run beamformer [default is to run correlator]\n"
+	" -h print usage\n"
+	" -t binary file for test mode\n"
+	" -f flagants file\n"
+	" -a calib file\n"
+	" -s start frequency (assumes -0.244140625MHz BW)\n", stdout);
+}
+
+// Unpacks 4-bit samples to half precision, one thread per input byte:
+// the low nibble goes to inr and the high nibble to ini.
+// run with 128 threads and NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 blocks
+__global__ void corr_input_copy(char *input, half *inr, half *ini) {
+
+  const int sample = blockIdx.x*128 + threadIdx.x; // flat byte index
+  const unsigned char packed = (unsigned char)(input[sample]);
+
+  // moving a nibble to the top of a char and shifting back down sign-extends it (where char is signed)
+  inr[sample] = __float2half((float)((char)((packed & (unsigned char)(15)) << 4) >> 4));
+  ini[sample] = __float2half((float)((char)(packed & (unsigned char)(240)) >> 4));
+}
+
+
+// Tiled transpose of a byte matrix through shared memory.
+// Each block handles one 32x32 tile with 32x8 threads, so every thread moves
+// four elements; launch with dim3 dimBlock(32, 8) and
+// dim3 dimGrid(Width/32, Height/32), where Width is the fastest-varying dimension.
+__global__ void transpose_matrix_char(char * idata, char * odata) {
+  // 33 columns pad each tile row by one element (presumably to avoid bank conflicts)
+  __shared__ char tile[32][33];
+
+  const int tx = threadIdx.x, ty = threadIdx.y;
+  const int in_width = gridDim.x * 32;
+  const int in_col = blockIdx.x * 32 + tx;
+  const int in_row = blockIdx.y * 32 + ty;
+
+  // stage the tile, four rows per thread
+  for (int r = 0; r < 32; r += 8)
+    tile[ty+r][tx] = idata[(in_row+r)*in_width + in_col];
+  __syncthreads();
+
+  // write it back with the block coordinates swapped
+  const int out_width = gridDim.y * 32;
+  const int out_col = blockIdx.y * 32 + tx;
+  const int out_row = blockIdx.x * 32 + ty;
+  for (int r = 0; r < 32; r += 8)
+    odata[(out_row+r)*out_width + out_col] = tile[tx][ty + r];
+}
+
+// Tiled transpose of a half-precision matrix (despite the "float" in the name).
+// One 32x32 tile per block, 32x8 threads per block, four elements per thread.
+// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32)
+// here, width is the dimension of the fastest index
+__global__ void transpose_matrix_float(half * idata, half * odata) {
+  // rows are 33 wide, one more than the 32 elements stored in each
+  __shared__ half tile[32][33];
+
+  const int lane = threadIdx.x;
+  const int row0 = threadIdx.y;
+
+  // load phase: this block's tile sits at (blockIdx.x, blockIdx.y) in the input
+  const int src_width = gridDim.x * 32;
+  const int src_x = blockIdx.x * 32 + lane;
+  const int src_y = blockIdx.y * 32 + row0;
+  for (int k = 0; k < 32; k += 8)
+    tile[row0+k][lane] = idata[(src_y+k)*src_width + src_x];
+  __syncthreads();
+
+  // store phase: same tile, block coordinates exchanged
+  const int dst_width = gridDim.y * 32;
+  const int dst_x = blockIdx.y * 32 + lane;
+  const int dst_y = blockIdx.x * 32 + row0;
+  for (int k = 0; k < 32; k += 8) odata[(dst_y+k)*dst_width + dst_x] = tile[lane][row0 + k];
+}
+
+// Generic tiled transpose: reads in_prec elements and writes them as out_prec
+// (the conversion is whatever plain assignment between the two types does).
+// One 32x32 tile per block; launch with dim3 dimBlock(32, 8) and
+// dim3 dimGrid(Width/32, Height/32), where Width is the fastest-varying dimension.
+template <typename in_prec, typename out_prec> __global__ void transpose_matrix_template(in_prec * idata, out_prec * odata) {
+  const int kTile = 32; // tile edge length
+  const int kRows = 8;  // rows covered by the thread block per pass
+  __shared__ in_prec tile[kTile][kTile+1];
+
+  int col = blockIdx.x * kTile + threadIdx.x;
+  int row = blockIdx.y * kTile + threadIdx.y;
+  int pitch = gridDim.x * kTile;
+  for (int pass = 0; pass < kTile; pass += kRows)
+    tile[threadIdx.y+pass][threadIdx.x] = idata[(row+pass)*pitch + col];
+
+  __syncthreads();
+
+  // swap the roles of the block indices for the write-out
+  col = blockIdx.y * kTile + threadIdx.x;
+  row = blockIdx.x * kTile + threadIdx.y;
+  pitch = gridDim.y * kTile;
+  for (int pass = 0; pass < kTile; pass += kRows)
+    odata[(row+pass)*pitch + col] = tile[threadIdx.x][threadIdx.y + pass];
+
+}
+
+
+// Reorders the packed device input and expands it into the half-precision d_r / d_i arrays.
+// input is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
+// output is [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS]
+// Step 1 transposes the [NPACKETS_PER_BLOCK * NANTS, NCHAN_PER_PACKET * 2 * 2] byte matrix into tx;
+// step 2 unpacks every byte of tx into inr / ini.
+void reorder_input(char *input, char * tx, half *inr, half *ini) {
+
+  // step 1: byte transpose, one 32x32 tile per block
+  const dim3 threads(32, 8);
+  const dim3 tiles((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32);
+  transpose_matrix_char<<<tiles,threads>>>(input,tx);
+  /* disabled alternative: the same transpose expressed as a cuBLAS geam
+  // set up for geam
+  cublasHandle_t cublasH = NULL;
+  cudaStream_t stream = NULL;
+  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+  cublasSetStream(cublasH, stream);
+
+  // transpose input matrix into tx
+  cublasOperation_t transa = CUBLAS_OP_T;
+  cublasOperation_t transb = CUBLAS_OP_N;
+  const int m = NPACKETS_PER_BLOCK * NANTS;
+  const int n = NCHAN_PER_PACKET*2*2/8; // columns in output
+  const double alpha = 1.0;
+  const double beta = 0.0;
+  const int lda = n;
+  const int ldb = m;
+  const int ldc = ldb;
+  cublasDgeam(cublasH,transa,transb,m,n,
+	      &alpha,(double *)(input),
+	      lda,&beta,(double *)(tx),
+	      ldb,(double *)(tx),ldc);
+  */
+  // step 2: one thread per byte, 128 threads per block
+  const dim3 unpack_grid(NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128);
+  corr_input_copy<<<unpack_grid,128>>>(tx,inr,ini);
+
+  // disabled debug dump of the unpacked data:
+  /*char * odata = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4*2);
+  cudaMemcpy(odata,inr,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4*2,cudaMemcpyDeviceToHost);
+  FILE *fout;
+  fout=fopen("test.test","wb");
+  fwrite(odata,1,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4*2,fout);
+  fclose(fout);*/
+  // (the geam variant would also need) destroy stream
+  //cudaStreamDestroy(stream);
+}
+
+// Collapses the transposed GEMM results into the final visibility array.
+// outr and outi are [NANTS, NANTS, NCHAN_PER_PACKET, 2time, 2pol, halfFac]
+// run with NCHAN_PER_PACKET*2*NBASE/128 blocks of 128 threads
+// Each thread writes one (baseline, channel, pol) complex value to output.
+__global__ void corr_output_copy(half *outr, half *outi, float *output, int *indices_lookup) {
+
+  // flat output index: baseline is the slowest axis, then channel, then pol
+  const int out_idx = blockIdx.x*128 + threadIdx.x;
+  const int baseline = (int)(out_idx / (NCHAN_PER_PACKET * 2));
+  const int chpol = (int)(out_idx % (NCHAN_PER_PACKET * 2));
+  const int ch = (int)(chpol / 2);
+  const int pol = (int)(chpol % 2);
+
+  // indices_lookup maps the baseline number to its position in the NANTS x NANTS matrix
+  const int src = indices_lookup[baseline] * NCHAN_PER_PACKET + ch;
+
+  // the two time samples of this pol sit 2 entries apart, each with halfFac partial sums
+  const int first = (4*src+pol)*halfFac;
+  const int second = (4*src+2+pol)*halfFac;
+  float re=0., im=0.;
+  for (int part=0;part<halfFac;part++) {
+    re += __half2float(outr[first+part])+__half2float(outr[second+part]);
+    im += __half2float(outi[first+part])+__half2float(outi[second+part]);
+  }
+  output[2*out_idx] = re;
+  output[2*out_idx+1] = im;
+}
+
+
+// Converts the GEMM outputs d_outr / d_outi into the packed d_output array.
+// inputs are [NCHAN_PER_PACKET, 2 time, 2 pol, NANTS, NANTS]
+// the corr matrices are column major order
+// output needs to be [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex]
+// start with transpose to get [NANTS*NANTS, NCHAN_PER_PACKET*2*2], then sum into output using kernel
+void reorder_output(dmem * d) {
+
+  // transpose both GEMM outputs into their d_tx_* scratch buffers
+  const dim3 threads(32, 8);
+  const dim3 tiles((NANTS*NANTS)/32,(NCHAN_PER_PACKET*2*2*halfFac)/32);
+  transpose_matrix_float<<<tiles,threads>>>(d->d_outr,d->d_tx_outr);
+  transpose_matrix_float<<<tiles,threads>>>(d->d_outi,d->d_tx_outi);
+
+  // disabled debug dump of the transposed real part:
+  /*char * odata = (char *)malloc(sizeof(char)*384*4*NANTS*NANTS*2*halfFac);
+  cudaMemcpy(odata,d->d_tx_outr,384*4*NANTS*NANTS*2*halfFac,cudaMemcpyDeviceToHost);
+  FILE *fout;
+  fout=fopen("test2.test","wb");
+  fwrite(odata,sizeof(char),384*4*NANTS*NANTS*2*halfFac,fout);
+  fclose(fout);*/
+
+  /* disabled alternative: the same transposes expressed as cuBLAS geam calls
+  // set up for geam
+  cublasHandle_t cublasH = NULL;
+  cudaStream_t stream = NULL;
+  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+  cublasSetStream(cublasH, stream);
+
+  // transpose output matrices into tx_outr and tx_outi
+  cublasOperation_t transa = CUBLAS_OP_T;
+  cublasOperation_t transb = CUBLAS_OP_N;
+  const int m = NCHAN_PER_PACKET*2*2;
+  const int n = NANTS*NANTS/16; // columns in output
+  const double alpha = 1.0;
+  const double beta = 0.0;
+  const int lda = n;
+  const int ldb = m;
+  const int ldc = ldb;
+  cublasDgeam(cublasH,transa,transb,m,n,
+	      &alpha,(double *)(d->d_outr),
+	      lda,&beta,(double *)(d->d_tx_outr),
+	      ldb,(double *)(d->d_tx_outr),ldc);
+  cublasDgeam(cublasH,transa,transb,m,n,
+	      &alpha,(double *)(d->d_outi),
+	      lda,&beta,(double *)(d->d_tx_outi),
+	      ldb,(double *)(d->d_tx_outi),ldc);
+  */
+  // build the baseline -> matrix-position lookup table on the host
+  const size_t lut_bytes = sizeof(int)*NBASE;
+  int * host_lut = (int *)malloc(lut_bytes);
+  int * dev_lut;
+  cudaMalloc((void **)(&dev_lut), lut_bytes);
+  // upper triangular order (column major) to match xGPU (not the same as CASA!)
+  int next = 0;
+  for (int a=0;a<NANTS;a++) {
+    for (int b=0;b<=a;b++)
+      host_lut[next++] = a*NANTS + b;
+  }
+  cudaMemcpy(dev_lut,host_lut,lut_bytes,cudaMemcpyHostToDevice);
+
+  // sum the time samples and halfFac partial products into d_output
+  const dim3 sum_grid(NCHAN_PER_PACKET*2*NBASE/128);
+  corr_output_copy<<<sum_grid,128>>>(d->d_tx_outr,d->d_tx_outi,d->d_output,dev_lut);
+
+  // disabled debug dump of the final output:
+  /*char * odata = (char *)malloc(sizeof(char)*384*4*NBASE*4);
+  cudaMemcpy(odata,d->d_output,384*4*NBASE*4,cudaMemcpyDeviceToHost);
+  FILE *fout;
+  fout=fopen("test3.test","wb");
+  fwrite(odata,sizeof(char),384*4*NBASE*4,fout);
+  fclose(fout);*/
+
+  // the lookup table is rebuilt on every call, so release it again here
+  cudaFree(dev_lut);
+  free(host_lut);
+  //cudaStreamDestroy(stream);  
+
+}
+
+
+
+// correlator function
+// workflow: copy to device, reorder, stridedBatchedGemm, reorder
+// Reads d->h_input (host) and leaves the visibilities in d->d_output (device).
+void dcorrelator(dmem * d) {
+  // zero out output arrays; every size mirrors the matching cudaMalloc in initialize()
+  cudaMemset(d->d_outr,0,sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
+  cudaMemset(d->d_outi,0,sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
+  cudaMemset(d->d_output,0,sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2); // was NANTS*NANTS-based, unlike the allocation
+  
+  // copy to device
+  cudaMemcpy(d->d_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,cudaMemcpyHostToDevice);
+
+  // reorder input
+  reorder_input(d->d_input,d->d_tx,d->d_r,d->d_i);
+
+  // not sure if essential
+  cudaDeviceSynchronize();
+  
+  // set up for gemm (a fresh handle and stream are created and destroyed on every call)
+  cublasHandle_t cublasH = NULL;
+  cudaStream_t stream = NULL;
+  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+  cublasCreate(&cublasH);
+  cublasSetStream(cublasH, stream);
+
+  // gemm settings
+  // input: [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS]
+  // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS] 
+  cublasOperation_t transa = CUBLAS_OP_N;
+  cublasOperation_t transb = CUBLAS_OP_T;
+  const int m = NANTS;
+  const int n = NANTS;
+  const int k = NPACKETS_PER_BLOCK/halfFac;
+  const half alpha = 1.;
+  const half malpha = -1.;
+  const int lda = m;
+  const int ldb = n;
+  const half beta0 = 0.;
+  const half beta1 = 1.;
+  const int ldc = m;
+  const long long int strideA = NPACKETS_PER_BLOCK*NANTS/halfFac;
+  const long long int strideB = NPACKETS_PER_BLOCK*NANTS/halfFac;
+  const long long int strideC = NANTS*NANTS;
+  const int batchCount = NCHAN_PER_PACKET*2*2*halfFac;
+
+  // run strided batched gemm: d_outr = ac + bd, d_outi = -bc + ad
+  // (beta0 overwrites the output, beta1 accumulates onto the previous product)
+  // ac
+  cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
+			    &alpha,d->d_r,lda,strideA,
+			    d->d_r,ldb,strideB,&beta0,
+			    d->d_outr,ldc,strideC,
+			    batchCount);
+  // bd
+  cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
+			    &alpha,d->d_i,lda,strideA,
+			    d->d_i,ldb,strideB,&beta1,
+			    d->d_outr,ldc,strideC,
+			    batchCount);
+  // -bc
+  cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
+			    &malpha,d->d_i,lda,strideA,
+			    d->d_r,ldb,strideB,&beta0,
+			    d->d_outi,ldc,strideC,
+			    batchCount);
+  // ad
+  cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
+			    &alpha,d->d_r,lda,strideA,
+			    d->d_i,ldb,strideB,&beta1,
+			    d->d_outi,ldc,strideC,
+			    batchCount);
+
+  // shown to be essential
+  cudaDeviceSynchronize();
+
+  // destroy stream
+  cudaStreamDestroy(stream);
+  cublasDestroy(cublasH);
+  
+  // reorder output data
+  reorder_output(d);
+  
+}
+
+// kernel to reorder input data for the beamformer: a tiled 2-D transpose of 32-byte words (4 doubles each)
+// initial data is [NPACKETS_PER_BLOCK, (NANTS/2), NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
+// want [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, (NANTS/2), 8chan, 2 times, 2 pol, 4-bit complex]
+// each block moves one 16x16 tile of words with 16x8 threads, so every thread handles two tile rows
+// the input width (fastest index, in words) is gridDim.x*16 and the output width is gridDim.y*16
+// launch: dim3 dimBlock1(16, 8), dimGrid1(NCHAN_PER_PACKET/8/16, (NPACKETS_PER_BLOCK)*(NANTS/2)/16);
+__global__ void transpose_input_bf(double * idata, double * odata) {
+
+  // the middle dimension is declared as 17 although only 16 columns are used
+  __shared__ double tile[16][17][4];
+
+  const int tx = threadIdx.x;
+  const int ty = threadIdx.y;
+
+  // load phase: copy two rows of the input tile into shared memory, one 4-double word at a time
+  const int inCol = blockIdx.x * 16 + tx;
+  const int inRow = blockIdx.y * 16 + ty;
+  const int inWidth = gridDim.x * 16;
+  for (int rowOff = 0; rowOff < 16; rowOff += 8) {
+    const int src = 4*((inRow+rowOff)*inWidth + inCol);
+    for (int w = 0; w < 4; w++) tile[ty+rowOff][tx][w] = idata[src+w];
+  }
+
+  // the store phase reads entries written by other threads, so wait for the whole tile
+  __syncthreads();
+
+  // store phase: the block indices swap roles and the tile is read with its two indices exchanged
+  const int outCol = blockIdx.y * 16 + tx;
+  const int outRow = blockIdx.x * 16 + ty;
+  const int outWidth = gridDim.y * 16;
+  for (int rowOff = 0; rowOff < 16; rowOff += 8) {
+    const int dst = 4*((outRow+rowOff)*outWidth + outCol);
+    for (int w = 0; w < 4; w++) odata[dst+w] = tile[tx][ty+rowOff][w];
+  }
+}
+
+// kernel to fluff input bf data: split every input byte into its low and high 4-bit fields as half values
+// run with NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 blocks of 128 threads (one thread per byte)
+__global__ void fluff_input_bf(char * input, half * dr, half * di) {
+
+  const int idx = blockIdx.x*128 + threadIdx.x; // assumes blockDim.x == 128
+  const unsigned char packed = (unsigned char)(input[idx]);
+  // each nibble is placed in the top 4 bits of a char; the later >> 4 sign-extends it where char is signed
+  const char lowTop = (char)((packed & (unsigned char)(15)) << 4);
+  const char highTop = (char)(packed & (unsigned char)(240));
+  dr[idx] = __float2half(0.015625*((float)(lowTop >> 4)));  // low nibble, scaled by 1/64
+  di[idx] = __float2half(0.015625*((float)(highTop >> 4))); // high nibble, scaled by 1/64
+}
+
+// transpose and detect kernel for bf: forms re^2 + im^2 per sample and writes it transposed as unsigned char
+// works on 16x16 tiles with 16x8 threads per block, so every thread handles two tile rows
+// launch with dim3 dimBlock(16, 8) and dim3 dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16)
+// the power is divided by a fixed 128 before the cast to unsigned char
+__global__ void transpose_scale_bf(half * ir, half * ii, unsigned char * odata) {
+
+  __shared__ float tile[16][17];
+
+  const int tx = threadIdx.x;
+  const int ty = threadIdx.y;
+
+  // load phase: compute the power of two input samples and park them in the tile
+  const int inWidth = gridDim.x * 16;
+  const int inBase = (blockIdx.y * 16 + ty)*inWidth + blockIdx.x * 16 + tx;
+  for (int rowOff = 0; rowOff < 16; rowOff += 8) {
+    const float re = (float)(ir[inBase + rowOff*inWidth]);
+    const float im = (float)(ii[inBase + rowOff*inWidth]);
+    tile[ty+rowOff][tx] = (re*re+im*im);
+  }
+
+  __syncthreads();
+
+  // store phase: swap the block indices and read the tile with its two indices exchanged
+  const int outWidth = gridDim.y * 16;
+  const int outBase = (blockIdx.x * 16 + ty)*outWidth + blockIdx.y * 16 + tx;
+  for (int rowOff = 0; rowOff < 16; rowOff += 8)
+    odata[outBase + rowOff*outWidth] = (unsigned char)(tile[tx][ty + rowOff]/128.);
+
+}
+
+// sum over all times in output beam array
+// input is laid out [beam, time, NCHAN_PER_PACKET/8], as written by transpose_scale_bf
+// run with one block per (beam, channel) pair and (NPACKETS_PER_BLOCK/4) threads, one per time sample;
+// blockDim.x must be a power of two no larger than 512 (the size of the shared scratch array)
+__global__ void sum_beam(unsigned char * input, float * output) {
+
+  __shared__ float summ[512];
+  const int nchan = NCHAN_PER_PACKET/8;
+  const int ntime = blockDim.x;
+  int bidx = blockIdx.x;
+  int tidx = threadIdx.x;
+  int bm = (int)(bidx/nchan);
+  int ch = (int)(bidx % nchan);
+
+  // beam stride is ntime*nchan and time stride is nchan, matching the [beam, time, chan] layout
+  summ[tidx] = (float)(input[(bm*ntime + tidx)*nchan + ch]);
+
+  __syncthreads();
+
+  // tree reduction: halve the active range on each pass, with a barrier between passes so that
+  // every partial sum is complete before another thread reads it
+  for (int stride = ntime/2; stride > 0; stride >>= 1) {
+    if (tidx < stride) summ[tidx] += summ[tidx+stride];
+    __syncthreads();
+  }
+
+  // summ[0] now holds the total over all time samples of this (beam, channel)
+  if (tidx==0) output[bidx] = summ[0];
+
+}
+
+/*
+Beamformer:
+ - initial data is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] 
+ - split into EW and NS antennas via cudaMemcpy: [NPACKETS_PER_BLOCK, NANTS/2, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
+ - want [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, NANTS/2, 8chan, 2 times, 2 pol, 4-bit complex]
+(single transpose operation)
+ - weights are [NCHAN_PER_PACKET/8, NBEAMS, 4tim, NANTS/2, 8chan, 2 times, 2 pol] x 2
+ - then fluff and run beamformer: output is [NCHAN_PER_PACKET/8, NBEAMS, NPACKETS_PER_BLOCK/4] (w column-major)
+ - transpose and done! 
+
+*/
+// beamformer function: forms 8-bit power beams for both arms from the block at d->h_input into d->d_bigpower
+void dbeamformer(dmem * d) {
+
+  // gemm settings - recall column major order assumed
+  // stride over 48 chans
+  cublasHandle_t cublasH = NULL;
+  cublasCreate(&cublasH);
+  cublasOperation_t transa = CUBLAS_OP_T;
+  cublasOperation_t transb = CUBLAS_OP_N;
+  const int m = NPACKETS_PER_BLOCK/4;
+  const int n = NBEAMS/2;
+  const int k = 4*(NANTS/2)*8*2*2;
+  const half alpha = 1.;
+  const half malpha = -1.;
+  const int lda = k;
+  const int ldb = k;
+  const half beta0 = 0.;
+  const half beta1 = 1.;
+  const int ldc = m;
+  const long long int strideA = (NPACKETS_PER_BLOCK)*(NANTS/2)*8*2*2;
+  const long long int strideB = (NBEAMS/2)*4*(NANTS/2)*8*2*2;
+  const long long int strideC = (NPACKETS_PER_BLOCK/4)*NBEAMS/2;
+  const int batchCount = NCHAN_PER_PACKET/8;
+  long long int i2;
+  
+  // create streams
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+
+  // timing: CPU clock() intervals accumulated into d->cp (copy), d->prep, d->cubl and d->outp
+  // note: the prep and outp intervals contain only kernel launches with no device sync inside the interval
+  clock_t begin, end;
+
+  // do big memcpy
+  begin = clock();
+  cudaMemcpy(d->d_big_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4,cudaMemcpyHostToDevice);
+  end = clock();
+  d->cp += (float)(end - begin) / CLOCKS_PER_SEC;
+  
+  // loop over halves of the array
+  for (int iArm=0;iArm<2;iArm++) {
+  
+    // zero out output arrays
+    cudaMemset(d->d_bigbeam_r,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half));
+    cudaMemset(d->d_bigbeam_i,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half));
+    cudaDeviceSynchronize();
+    // select this arm's antennas with one strided 2-D device copy: one row per packet
+    // initial data: [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
+    // final data: [NPACKETS_PER_BLOCK, NANTS/2, ...]; source pitch is a whole packet, row width and dest pitch half of it
+    begin = clock();
+    cudaMemcpy2D(d->d_input,(NANTS/2)*NCHAN_PER_PACKET*4,
+		 d->d_big_input+iArm*(NANTS/2)*NCHAN_PER_PACKET*4,(NANTS)*NCHAN_PER_PACKET*4,
+		 (NANTS/2)*NCHAN_PER_PACKET*4,NPACKETS_PER_BLOCK,cudaMemcpyDeviceToDevice);
+    end = clock();
+    d->cp += (float)(end - begin) / CLOCKS_PER_SEC;
+    
+    // do reorder and fluff of data to real and imag
+    begin = clock();
+    dim3 dimBlock1(16, 8), dimGrid1(NCHAN_PER_PACKET/8/16, (NPACKETS_PER_BLOCK)*(NANTS/2)/16);
+    transpose_input_bf<<<dimGrid1,dimBlock1>>>((double *)(d->d_input),(double *)(d->d_tx));
+    fluff_input_bf<<<NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128,128>>>(d->d_tx,d->d_br,d->d_bi);
+    end = clock();
+    d->prep += (float)(end - begin) / CLOCKS_PER_SEC;
+
+    // large matrix multiply to get real and imag outputs
+    // set up for gemm
+    cublasSetStream(cublasH, stream);
+    i2 = iArm*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8); // weights offset
+          
+    // run strided batched gemm
+    begin = clock();
+    // ac
+    cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
+			      &alpha,d->d_br,lda,strideA,
+			      d->weights_r+i2,ldb,strideB,&beta0,
+			      d->d_bigbeam_r,ldc,strideC,
+			      batchCount);
+    // -bd
+    cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
+			      &malpha,d->d_bi,lda,strideA,
+			      d->weights_i+i2,ldb,strideB,&beta1,
+			      d->d_bigbeam_r,ldc,strideC,
+			      batchCount);
+    // bc
+    cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
+			      &alpha,d->d_bi,lda,strideA,
+			      d->weights_r+i2,ldb,strideB,&beta0,
+			      d->d_bigbeam_i,ldc,strideC,
+			      batchCount);
+    // ad
+    cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
+			      &alpha,d->d_br,lda,strideA,
+			      d->weights_i+i2,ldb,strideB,&beta1,
+			      d->d_bigbeam_i,ldc,strideC,
+			      batchCount);
+      
+    cudaDeviceSynchronize();
+    end = clock();
+    d->cubl += (float)(end - begin) / CLOCKS_PER_SEC;
+      
+        
+    // simple formation of total power and scaling to 8-bit in transpose kernel
+    begin = clock();
+    dim3 dimBlock(16, 8), dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16);
+    transpose_scale_bf<<<dimGrid,dimBlock>>>(d->d_bigbeam_r,d->d_bigbeam_i,d->d_bigpower+iArm*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2));
+    end = clock();
+    d->outp += (float)(end - begin) / CLOCKS_PER_SEC;
+      
+
+  }
+
+  cudaStreamDestroy(stream);
+
+
+  cublasDestroy(cublasH);
+
+  // form sum over times
+  //sum_beam<<<24576,512>>>(d->d_bigpower,d->d_chscf);
+  
+}
+
+// kernel to populate an instance of weights matrix [2, (NCHAN_PER_PACKET/8), NBEAMS/2, 4times*(NANTS/2)*8chan*2tim*2pol]
+// run with 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128 blocks of 128 threads
+// antpos_e / antpos_n: per-antenna positions, indexed by global antenna; arm 0 uses antpos_e, arm 1 antpos_n
+// calibs: [NANTS, NCHAN_PER_PACKET/8, 2 pol, R/I] calibration values
+// fqs: one frequency per group of 8 channels
+// wr / wi: real and imaginary weight outputs, one element written per thread
+__global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs) {
+
+  int bidx = blockIdx.x;
+  int tidx = threadIdx.x;
+  int inidx = bidx*128+tidx;
+
+  // antennas per arm; arm 1 antennas follow the arm 0 antennas in every per-antenna array
+  const int nhalf = NANTS/2;
+
+  // decompose the flat index, slowest to fastest: arm, frequency, beam, 4 times, antenna, 8 chans, 2 times, pol
+  int iArm = (int)(inidx / ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*nhalf));
+  int iidx = (int)(inidx % ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*nhalf));
+  int fq = (int)(iidx / (128*nhalf*(NBEAMS/2)));
+  int idx = (int)(iidx % (128*nhalf*(NBEAMS/2)));
+  int bm = (int)(idx / (128*nhalf));
+  int tactp = (int)(idx % (128*nhalf));
+  int actp = (int)(tactp % (32*nhalf));
+  int a = (int)(actp / 32);
+  int pol = (int)(actp % 2); // pol is the fastest index; time and channel offsets do not affect the weight
+
+  // only two arms exist; any thread beyond them writes nothing
+  if (iArm != 0 && iArm != 1) return;
+
+  // global antenna index and the matching position array for this arm
+  int ant = a + nhalf*iArm;
+  const float * antpos = (iArm==0) ? antpos_e : antpos_n;
+
+  // offset of this antenna / frequency / pol in calibs (real part; imaginary part follows at widx+1)
+  int widx = ant*(NCHAN_PER_PACKET/8)*2*2 + fq*2*2 + pol*2;
+
+  // calculate weights
+  // sep, PI and CVAC are defined outside this kernel
+  // NOTE(review): theta reads as (127 - beam) steps of sep arcminutes converted to radians - confirm the units of sep
+  float theta, afac, twr, twi;
+  theta = sep*(127.-bm*1.)*PI/10800.; // radians
+  afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate
+  twr = cos(afac*antpos[ant]);
+  twi = sin(afac*antpos[ant]);
+
+  // complex product of the phase (twr, twi) with the calibration (calibs[widx], calibs[widx+1])
+  wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1]));
+  wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1]));
+
+}
+
+
+// GPU-powered function to populate weights matrix for beamformer
+// d->h_winp holds sequential pairs of eastings and northings, then [NANTS, NCHAN_PER_PACKET/8, 2, R/I] calibs
+// every complex calib is scaled to unit magnitude (exact zeros are left as they are), then positions and
+// calibs are uploaded and populate_weights_matrix fills d->weights_r / d->weights_i
+void calc_weights(dmem * d) {
+
+  const int ncalib = NANTS*(NCHAN_PER_PACKET/8)*2; // number of complex calibs
+  const size_t pos_bytes = sizeof(float)*NANTS;
+  const size_t cal_bytes = sizeof(float)*NANTS*(NCHAN_PER_PACKET/8)*2*2;
+
+  // host staging buffers
+  float *antpos_e = (float *)malloc(pos_bytes);
+  float *antpos_n = (float *)malloc(pos_bytes);
+  float *calibs = (float *)malloc(cal_bytes);
+
+  // matching device buffers
+  float *d_antpos_e, *d_antpos_n, *d_calibs;
+  cudaMalloc((void **)(&d_antpos_e), pos_bytes);
+  cudaMalloc((void **)(&d_antpos_n), pos_bytes);
+  cudaMalloc((void **)(&d_calibs), cal_bytes);
+
+  // de-interleave the antenna positions
+  for (int ant=0;ant<NANTS;ant++) {
+    antpos_e[ant] = d->h_winp[2*ant];
+    antpos_n[ant] = d->h_winp[2*ant+1];
+  }
+
+  // copy the calibs (stored after the 2*NANTS position values) and normalise each one
+  for (int k=0;k<ncalib;k++) {
+    // look the owning antenna up in the flag list; the result is not acted on at present
+    const int iant = (int)(k/((NCHAN_PER_PACKET/8)*2));
+    int found = 0;
+    for (int j=0;j<d->nflags;j++)
+      if (d->flagants[j]==iant) found = 1;
+    (void)found;
+
+    float *re = calibs + 2*k;
+    float *im = re + 1;
+    *re = d->h_winp[2*NANTS+2*k];
+    *im = d->h_winp[2*NANTS+2*k+1];
+
+    const float wnorm = sqrt((*re)*(*re) + (*im)*(*im));
+    if (wnorm!=0.0) {
+      *re /= wnorm;
+      *im /= wnorm;
+    }
+  }
+
+  // upload positions and calibs
+  cudaMemcpy(d_antpos_e,antpos_e,pos_bytes,cudaMemcpyHostToDevice);
+  cudaMemcpy(d_antpos_n,antpos_n,pos_bytes,cudaMemcpyHostToDevice);
+  cudaMemcpy(d_calibs,calibs,cal_bytes,cudaMemcpyHostToDevice);
+
+  // run kernel to populate weights matrix
+  populate_weights_matrix<<<2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128,128>>>(d_antpos_e,d_antpos_n,d_calibs,d->weights_r,d->weights_i,d->d_freqs);  
+
+  // release the device buffers, then the host buffers
+  cudaFree(d_antpos_e);
+  cudaFree(d_antpos_n);
+  cudaFree(d_calibs);
+  free(antpos_e);
+  free(antpos_n);
+  free(calibs);
+}
+
+// MAIN: parse options, optionally build the beamformer weights, then either process a single
+// test block read from file (-t) or run the DADA read -> correlate/beamform -> write loop
+int main (int argc, char *argv[]) {
+
+  cudaSetDevice(1);
+  
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_bfCorr", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+  
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+
+  // data block HDU keys
+  key_t in_key = REORDER_BLOCK_KEY;
+  key_t out_key = XGPU_BLOCK_KEY;
+  
+  // command line arguments
+  int core = -1;
+  int arg = 0;
+  int bf = 0;
+  int test = 0;
+  char ftest[200] = "", fflagants[200] = "", fcalib[200] = ""; // empty until set, so a missing option fails cleanly at fopen
+  float sfreq = 1498.75;
+
+  
+  while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:bdh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'i':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &in_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-i flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 't':
+	  if (optarg)
+            {
+	      test = 1;
+	      syslog(LOG_INFO, "test mode");
+	      if (sscanf (optarg, "%199s", ftest) != 1) { // width keeps the name inside ftest[200]
+		syslog(LOG_ERR, "could not read test file name from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-t flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'a':
+	  if (optarg)
+            {
+	      syslog(LOG_INFO, "read calib file %s",optarg);
+	      if (sscanf (optarg, "%199s", fcalib) != 1) { // width keeps the name inside fcalib[200]
+		syslog(LOG_ERR, "could not read calib file name from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-a flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'f':
+	  if (optarg)
+            {
+	      syslog(LOG_INFO, "reading flag ants file %s",optarg);
+	      if (sscanf (optarg, "%199s", fflagants) != 1) { // width keeps the name inside fflagants[200]
+		syslog(LOG_ERR, "could not read flagants file name from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-f flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 's':
+	  if (optarg)
+            {
+	      sfreq = atof(optarg);
+	      syslog(LOG_INFO, "start freq %g",sfreq);
+ 	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-s flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+	case 'b':
+	  bf=1;
+	  syslog (LOG_NOTICE, "Running beamformer, NOT correlator");
+	  break;
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      else syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+  // allocate device memory
+  dmem d;
+  initialize(&d,bf);
+
+  // set up for beamformer
+  FILE *ff;
+  int iii;
+  if (bf) {
+
+    if (!(ff=fopen(fflagants,"r"))) {
+      syslog(LOG_ERR,"could not open flagants file\n");
+      exit(1);
+    }
+    d.nflags=0;
+    // one antenna index per entry; d.nflags is both the running count and the next free slot
+    while (fscanf(ff,"%d\n",&d.flagants[d.nflags]) == 1)
+      d.nflags++;
+    // NOTE(review): nothing bounds d.nflags by the capacity of d.flagants, which is declared outside this function
+    fclose(ff);
+
+    if (!(ff=fopen(fcalib,"rb"))) {
+      syslog(LOG_ERR,"could not open calibss file\n");
+      exit(1);
+    }
+    fread(d.h_winp,NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2,4,ff);
+    fclose(ff);
+
+    for (iii=0;iii<(NCHAN_PER_PACKET/8);iii++)
+      d.h_freqs[iii] = 1e6*(sfreq-iii*250./1024.);
+    cudaMemcpy(d.d_freqs,d.h_freqs,sizeof(float)*(NCHAN_PER_PACKET/8),cudaMemcpyHostToDevice);
+
+    // calculate weights
+    calc_weights(&d);
+    
+  }
+
+  // test mode
+  FILE *fin, *fout;
+  uint64_t output_size;
+  char * output_data = NULL, * o1 = NULL; // o1 is only used by the commented-out weights dump; NULL keeps free(o1) safe
+  if (test) {
+    // read one block of input data: the first 4*NANTS*NCHAN_PER_PACKET*2*2 bytes of the test file, replicated 512 times
+    d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
+    for (int i=0;i<512;i++) {
+      if (!(fin = fopen(ftest,"rb"))) {
+	syslog(LOG_ERR,"could not open test file %s",ftest);
+	exit(1);
+      }
+      fread(d.h_input+i*4*NANTS*NCHAN_PER_PACKET*2*2,4*NANTS*NCHAN_PER_PACKET*2*2,1,fin);
+      fclose(fin);
+    }
+    // run correlator or beamformer, and output data
+    if (bf==0) {
+      if (DEBUG) syslog(LOG_INFO,"run correlator");
+      dcorrelator(&d);
+      if (DEBUG) syslog(LOG_INFO,"copy to host");
+      output_size = NBASE*NCHAN_PER_PACKET*2*2*4;
+      output_data = (char *)malloc(output_size);
+      cudaMemcpy(output_data,d.d_output,output_size,cudaMemcpyDeviceToHost);
+
+      fout = fopen("output.dat","wb");
+      fwrite((float *)output_data,sizeof(float),NBASE*NCHAN_PER_PACKET*2*2,fout);
+      fclose(fout);
+    }
+    else {
+      if (DEBUG) syslog(LOG_INFO,"run beamformer");
+      dbeamformer(&d);
+      if (DEBUG) syslog(LOG_INFO,"copy to host");
+      output_size = (NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS;
+      output_data = (char *)malloc(output_size);
+      cudaMemcpy(output_data,d.d_bigpower,output_size,cudaMemcpyDeviceToHost);
+
+      /*output_size = 2*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8);
+      o1 = (char *)malloc(output_size);
+      cudaMemcpy(o1,d.weights_r,output_size,cudaMemcpyDeviceToHost);*/
+
+
+      fout = fopen("output.dat","wb");
+      fwrite((unsigned char *)output_data,sizeof(unsigned char),output_size,fout);
+      //fwrite(o1,1,output_size,fout);
+      fclose(fout);
+    }
+
+	
+    // free
+    free(d.h_input);
+    free(output_data);
+    free(o1);
+    deallocate(&d,bf);
+
+    exit(1);
+  }
+  
+
+
+  
+  // DADA stuff
+  
+  syslog (LOG_INFO, "creating in and out hdus");
+  
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");  
+  
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
+  if (bf==0) 
+    syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,NBASE*NCHAN_PER_PACKET*2*2*4);
+  else
+    syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS);
+  uint64_t  bytes_read = 0;
+  char * block;
+  char * output_buffer;
+  output_buffer = (char *)malloc(block_out);
+  uint64_t written, block_id;
+  
+  // get things started
+  bool observation_complete=0;
+  bool started = 0;
+  syslog(LOG_INFO, "starting observation");
+  int blocks = 0;
+  clock_t begin, end;
+  double time_spent;
+  
+  while (!observation_complete) {
+
+    if (DEBUG) syslog(LOG_INFO,"reading block");    
+    
+    // open block
+    d.h_input = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+
+    // do stuff
+    //begin = clock();
+    // loop
+    if (bf==0) {
+      if (DEBUG) syslog(LOG_INFO,"run correlator");
+      dcorrelator(&d);
+      if (DEBUG) syslog(LOG_INFO,"copy to host");
+      cudaMemcpy(output_buffer,d.d_output,block_out,cudaMemcpyDeviceToHost);
+    }
+    else {
+      if (DEBUG) syslog(LOG_INFO,"run beamformer");
+      dbeamformer(&d);
+      if (DEBUG) syslog(LOG_INFO,"copy to host");
+      cudaMemcpy(output_buffer,d.d_bigpower,block_out,cudaMemcpyDeviceToHost);
+    }
+    //end = clock();
+    //time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
+    cout << "spent time " << d.cp << " " << d.prep << " " << d.cubl << " " << d.outp << " s" << endl;
+    
+    // write to output
+
+    // write to host
+    written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out);
+    if (written < block_out)
+      {
+	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+	return EXIT_FAILURE;
+      }
+    
+    if (DEBUG) syslog(LOG_INFO, "written block %d",blocks);	    
+    blocks++;
+    // loop end
+    
+      
+    // finish up
+    if (bytes_read < block_size)
+      observation_complete = 1;
+    
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+    
+  }
+
+  // finish up
+  free(output_buffer);
+  deallocate(&d,bf);
+  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+  
+}
+
+
diff --git a/legacy/dsaX_bigfake.c b/legacy/dsaX_bigfake.c
new file mode 100644
index 0000000..f5e1354
--- /dev/null
+++ b/legacy/dsaX_bigfake.c
@@ -0,0 +1,320 @@
+/* will reorder raw data for input to xgpu */
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+// global variables
+int DEBUG = 0;
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
+int dada_bind_thread_to_core (int core);
+
+/* Release both HDUs. A failed unlock is logged; the handle is destroyed either way. */
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
+{
+  /* input side: drop the read lock, then destroy */
+  const int read_rc = dada_hdu_unlock_read (in);
+  if (read_rc < 0)
+    syslog(LOG_ERR, "could not unlock read on hdu_in");
+  dada_hdu_destroy (in);
+
+  /* output side: drop the write lock, then destroy */
+  const int write_rc = dada_hdu_unlock_write (out);
+  if (write_rc < 0)
+    syslog(LOG_ERR, "could not unlock write on hdu_out");
+  dada_hdu_destroy (out);
+
+}
+
+void usage()
+{
+  /* the help text is a single literal, written verbatim to stdout */
+  fputs ("dsaX_fake [options]\n"
+	 " -c core   bind process to CPU core [no default]\n"
+	 " -d send debug messages to syslog\n"
+	 " -f file to read packet from [default none]\n"
+	 " -i in_key [default TEST_BLOCK_KEY]\n"
+	 " -o out_key [default REORDER_BLOCK_KEY2]\n"
+	 " -h print usage\n", stdout);
+}
+
+// MAIN: for every block read from the input HDU, write one pre-built block to the output HDU;
+// the pre-built block is all zeros, or a 4194304-byte packet from the -f file repeated to fill it
+int main (int argc, char *argv[]) {
+
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_fake", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+  
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+
+  // data block HDU keys
+  key_t in_key = TEST_BLOCK_KEY;
+  key_t out_key = REORDER_BLOCK_KEY2;
+  
+  // command line arguments
+  int core = -1;
+  int useZ = 1;
+  char fnam[100];
+  int arg = 0;
+  
+  while ((arg=getopt(argc,argv,"c:f:i:o:dh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'i':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &in_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-i flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'f':
+	  if (optarg)
+	    {
+	      useZ = 0;
+	      snprintf(fnam,sizeof(fnam),"%s",optarg); // bounded copy: an over-long name is truncated, not written past fnam
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-f flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      else syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+  
+  // DADA stuff
+  
+  syslog (LOG_INFO, "creating in and out hdus");
+  
+  hdu_in  = dada_hdu_create ();
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create ();
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+  
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+  
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",(unsigned long long)block_size,(unsigned long long)block_out);
+  uint64_t  bytes_read = 0;
+  uint64_t npackets = block_out / 4194304;
+  char * block;
+  // both buffers start zeroed: output_buffer so "no file" means zeros, packet so a short read is zero-padded
+  char * packet = (char *)calloc(4194304,sizeof(char));
+  char * output_buffer = (char *)calloc(block_out,sizeof(char));
+  if (!packet || !output_buffer) {
+    syslog(LOG_ERR, "could not allocate packet/output buffers");
+    dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+    return EXIT_FAILURE;
+  }
+  uint64_t written, block_id;
+
+  // fill output buffer if file exists
+  FILE *fin;
+  if (!useZ) {
+
+    if (!(fin=fopen(fnam,"rb"))) {
+      syslog(LOG_ERR, "cannot open file - will write zeros");
+    }
+    else {
+      // a file shorter than one packet is still used; the remainder of the packet stays zero
+      if (fread(packet,4194304,1,fin) != 1)
+	syslog(LOG_WARNING, "file is shorter than one 4194304-byte packet - padding with zeros");
+      fclose(fin);
+      syslog(LOG_INFO,"Read packet, npackets %llu",(unsigned long long)npackets);
+      
+      for (uint64_t i=0;i<npackets;i++) // 64-bit index: i*4194304 overflows int once block_out reaches 2 GiB
+	memcpy(output_buffer+i*4194304,packet,4194304);
+
+      syslog(LOG_INFO, "Using input packet");
+      
+    }
+  }
+
+  // set up
+
+  int observation_complete=0;
+  int blocks = 0, started = 0;
+  
+  syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+    }
+
+    // DO STUFF
+    // no need to do anything here - output_buffer is ready to go
+
+    // write to output
+    written = ipcio_write (hdu_out->data_block, output_buffer, block_out);
+    if (written < block_out)
+      {
+	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+	return EXIT_FAILURE;
+      }
+
+    if (DEBUG) {
+      syslog(LOG_DEBUG, "written block %d",blocks);      
+    }
+    blocks++;
+
+    if (bytes_read < block_size)
+      observation_complete = 1;
+
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+
+  }
+
+  free(packet);
+  free(output_buffer);
+  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+  
+}
+
+
diff --git a/legacy/dsaX_capture.c b/legacy/dsaX_capture.c
new file mode 100644
index 0000000..054e45d
--- /dev/null
+++ b/legacy/dsaX_capture.c
@@ -0,0 +1,1080 @@
+/* dsaX_capture.c: Code to capture packets over a socket and write to a dada buffer.
+
+main: runs capture loop, and interfaces dada buffer
+control_thread: deals with control commands
+
+*/
+
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+//#include "multilog.h"
+
+#define unhappies 3000   // main skips ahead once more than this many blocks each lost >1000 packets
+#define skips 6          // number of data blocks skipped in one go when that happens
+#define sleeps 1.5       // passed to sleep() after skipping; NOTE(review): sleep() takes an unsigned int, so this presumably truncates to 1 s
+
+/* global variables */
+int quit_threads = 0;             // loop condition of stats_thread and control_thread
+char STATE[20];                   // textual program state ("NOBUFFER", "LISTEN") written by main
+uint64_t UTC_START = 10000;       // start sequence number; the capture loop treats 10000 as "not yet set"
+uint64_t UTC_STOP = 40000000000;  // updated by control_thread; not read elsewhere in this file
+int MONITOR = 0;                  // not referenced elsewhere in this file
+char iP[100];                     // address the control socket is bound to (-i)
+int DEBUG = 0;                    // set by -d; enables extra syslog output
+int HISTOGRAM[16];                // not referenced elsewhere in this file
+int cPort = CAPTURE_CONTROL_PORT; // control port (-q)
+int dPort = CAPTURE_PORT;         // data port (-p)
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * out);
+int dada_bind_thread_to_core (int core);
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * out)
+{
+  // Drop the write lock on the output HDU, then destroy the HDU handle.
+  if (dada_hdu_unlock_write (out) < 0)
+    {
+      // the failing call is the *write* unlock, so the message says so
+      syslog(LOG_ERR, "could not unlock write on hdu_out");
+    }
+  // the HDU is destroyed whether or not the unlock succeeded
+  dada_hdu_destroy (out);
+
+}
+
+void usage()
+{
+  /* Write the option summary to stdout as one concatenated string literal. */
+  fputs ("dsaX_capture [options]\n"
+         " -c core   bind process to CPU core [no default]\n"
+         " -j IP to listen on for data packets [no default]\n"
+         " -p PORT to listen to for data packets [default 4011]\n"
+         " -q PORT to listen to for control commands [default CAPTURE_CONTROL_PORT]\n"
+         " -i IP to listen on for control commands [no default]\n"
+         " -f filename of template dada header [no default]\n"
+         " -o out_key [default CAPTURE_BLOCK_KEY]\n"
+         " -d send debug messages to syslog\n"
+         " -h print usage\n", stdout);
+}
+
+/*
+ * Allocate a dsaX_sock_t together with one UDP_PAYLOAD-byte receive buffer.
+ * No socket is opened here: fd is left at 0 for the caller to fill in.
+ */
+dsaX_sock_t * dsaX_init_sock ()
+{
+  dsaX_sock_t * b = (dsaX_sock_t *) malloc(sizeof(dsaX_sock_t));
+  assert(b != NULL);
+  b->bufsz = sizeof(char) * UDP_PAYLOAD;
+  b->buf = (char *) malloc (b->bufsz);
+  assert(b->buf != NULL);
+
+  b->have_packet = 0;
+  b->fd = 0;
+  b->got = 0;   // malloc leaves this field indeterminate otherwise
+
+  return b;
+}
+
+void dsaX_free_sock(dsaX_sock_t* b)
+{
+  /* Releases only the payload buffer; the struct itself is not freed here. */
+  if (!b) return;  /* nothing to release */
+  free (b->buf);   /* free(NULL) is a no-op, so no separate guard is needed */
+  b->buf = 0;
+  b->bufsz = 0;
+  b->fd = b->have_packet = 0;
+}
+
+/*
+ *  Initialise UDP receiver state: socket wrapper, counters and statistics
+ */
+int dsaX_udpdb_init_receiver (udpdb_t * ctx)
+{
+  syslog(LOG_INFO,"dsax_udpdb_init_receiver()");
+
+  // socket wrapper holding the single UDP_PAYLOAD-byte receive buffer
+  ctx->sock = dsaX_init_sock();
+
+  // capture progress markers
+  ctx->capture_started = 0;
+  ctx->last_seq = 0;
+  ctx->last_byte = 0;
+  ctx->block_start_byte = 0;
+  // data block bookkeeping
+  ctx->block_open = 0;
+  ctx->block_count = 0;
+  // rate and diagnostic counters
+  ctx->mb_rcv_ps = 0;
+  ctx->mb_drp_ps = 0;
+  ctx->n_sleeps = 0;
+  ctx->ooo_packets = 0;
+  ctx->recv_core = -1;
+  // packet and byte statistics
+  ctx->packets = init_stats_t();
+  ctx->bytes   = init_stats_t();
+  syslog(LOG_INFO,"receiver inited");
+  return 0;
+}
+
+/*
+ * Open and configure the UDP data socket described by ctx (interface, port).
+ * The socket is switched to non-blocking mode and packets already queued on
+ * it are cleared. Returns 0 on success, -1 if the socket cannot be created.
+ */
+int dsaX_udpdb_prepare (udpdb_t * ctx)
+{
+  syslog(LOG_INFO, "dsaX_udpdb_prepare()");
+
+  // open socket
+  syslog(LOG_INFO, "prepare: creating udp socket on %s:%d", ctx->interface, ctx->port);
+  ctx->sock->fd = dada_udp_sock_in(ctx->log, ctx->interface, ctx->port, ctx->verbose);
+  if (ctx->sock->fd < 0) {
+    syslog (LOG_ERR, "Error, Failed to create udp socket");
+    return -1;
+  }
+
+  // ask for a 4 MiB socket buffer (4*1024*1024 bytes); the outcome of the
+  // request is not checked
+  int sock_buf_size = 4*1024*1024;
+  syslog(LOG_INFO, "prepare: setting buffer size to %d", sock_buf_size);
+  dada_udp_sock_set_buffer_size (ctx->log, ctx->sock->fd, ctx->verbose, sock_buf_size);
+
+  // set the socket to non-blocking
+  syslog(LOG_INFO, "prepare: setting non_block");
+  sock_nonblock(ctx->sock->fd);
+
+  // clear any packets buffered by the kernel; the returned count is not needed
+  syslog(LOG_INFO, "prepare: clearing packets at socket");
+  (void) dada_sock_clear_buffered_packets(ctx->sock->fd, UDP_PAYLOAD);
+
+  // The sequence/byte markers are not touched here; dsaX_udpdb_reset_receiver()
+  // clears them. main also calls this function again after skipping blocks,
+  // when the socket is reopened in the middle of a run.
+
+  return 0;
+}
+
+/*
+ *  Put the receiver back into its pre-observation state: clears the capture
+ *  flag, the sequence/byte markers, the sleep counter and both statistics.
+ */
+void dsaX_udpdb_reset_receiver (udpdb_t * ctx) 
+{
+  syslog (LOG_INFO, "dsaX_udpdb_reset_receiver()");
+
+  // scalar markers first, then the two statistics objects
+  ctx->n_sleeps = ctx->last_byte = ctx->last_seq = 0;
+  ctx->capture_started = 0;
+
+  reset_stats_t(ctx->packets);
+  reset_stats_t(ctx->bytes);
+}
+
+/*
+ *  Acquire the next writable data block from the ring buffer and store its
+ *  address in ctx->block. Returns 0 on success, -1 if a block is already
+ *  open or ipcio_open_block_write yields no block.
+ */
+int dsaX_udpdb_open_buffer (udpdb_t * ctx)
+{
+  uint64_t opened_id = 0;   // filled in by ipcio_open_block_write; not used afterwards
+
+  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_open_buffer()");
+
+  if (ctx->block_open != 0)
+  {
+    syslog (LOG_ERR, "open_buffer: buffer already opened");
+    return -1;
+  }
+
+  if (DEBUG) syslog (LOG_DEBUG, "open_buffer: ipcio_open_block_write");
+
+  ctx->block = ipcio_open_block_write (ctx->hdu->data_block, &opened_id);
+  if (ctx->block == 0)
+  {
+    syslog (LOG_ERR, "open_buffer: ipcio_open_block_write failed");
+    return -1;
+  }
+
+  ctx->block_count = 0;     // no packets stored in the fresh block yet
+  ctx->block_open = 1;
+  return 0;
+}
+
+/*
+ *  Close the currently open data block, recording bytes_written bytes in it.
+ *  eod != 0 selects ipcio_update_block_write, otherwise ipcio_close_block_write.
+ *  Returns 0 on success, -1 if no block is open or the ipcio call fails.
+ */
+int dsaX_udpdb_close_buffer (udpdb_t * ctx, uint64_t bytes_written, unsigned eod)
+{
+  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_close_buffer(%"PRIu64", %d)", bytes_written, eod);
+
+  if (!ctx->block_open)
+  {
+    syslog (LOG_ERR, "close_buffer: buffer already closed");
+    return -1;
+  }
+
+  // report buffers that are neither exactly full nor the 1 byte "EOD" buffer;
+  // at end of data this is informational, otherwise it is a warning
+  const int partial = (bytes_written != 1) && (bytes_written != ctx->hdu_bufsz);
+  if (partial)
+  {
+    const int level = eod ? LOG_INFO : LOG_WARNING;
+    syslog (level, "close_buffer: "
+              "bytes_written[%"PRIu64"] != hdu_bufsz[%"PRIu64"]", 
+              bytes_written, ctx->hdu_bufsz);
+  }
+
+  if (eod && ipcio_update_block_write (ctx->hdu->data_block, bytes_written) < 0)
+  {
+    syslog (LOG_ERR, "close_buffer: ipcio_update_block_write failed");
+    return -1;
+  }
+  if (!eod && ipcio_close_block_write (ctx->hdu->data_block, bytes_written) < 0)
+  {
+    syslog (LOG_ERR, "close_buffer: ipcio_close_block_write failed");
+    return -1;
+  }
+
+  // forget the block pointer and mark the block as closed
+  ctx->block = 0;
+  ctx->block_open = 0;
+  return 0;
+}
+
+/*
+ *  Move to the next ring buffer element: close the current block as full and
+ *  open the next one. Returns 0 on success, -1 if either step fails.
+ */
+int dsaX_udpdb_new_buffer (udpdb_t * ctx)
+{
+  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_new_buffer()");
+  // the outgoing block is always reported as completely filled (hdu_bufsz bytes, eod = 0)
+  if (dsaX_udpdb_close_buffer (ctx, ctx->hdu_bufsz, 0) < 0)
+  {
+    syslog (LOG_ERR, "new_buffer: dsaX_udpdb_close_buffer failed");
+    return -1;
+  }
+
+  if (dsaX_udpdb_open_buffer (ctx) < 0) 
+  {
+    syslog (LOG_ERR, "new_buffer: dsaX_udpdb_open_buffer failed");
+    return -1;
+  }
+
+  // advance the byte markers: the new block starts one packet after the old
+  // block's last packet and spans packets_per_buffer packets of UDP_DATA bytes
+  ctx->block_start_byte = ctx->block_end_byte + UDP_DATA;
+  ctx->block_end_byte = ctx->block_start_byte + ( ctx->packets_per_buffer - 1) * UDP_DATA;
+
+  // the new block is not zeroed; the memset below is disabled
+  //memset(ctx->block,0,ctx->block_end_byte-ctx->block_start_byte);
+  
+  if (DEBUG) syslog(LOG_DEBUG, "new_buffer: buffer_bytes [%"PRIu64" - %"PRIu64"]", 
+             ctx->block_start_byte, ctx->block_end_byte);
+
+  return 0;
+}
+
+/*
+ *  Destroy UDP receiver resources: releases the socket wrapper; always returns 0.
+ */
+int dsaX_udpdb_destroy_receiver (udpdb_t * ctx)
+{
+  if (ctx->sock) { dsaX_free_sock(ctx->sock); free(ctx->sock); } // dsaX_free_sock only releases ->buf
+  ctx->sock = 0;
+  return 0;  // the function is declared int, so it must return a value
+}
+
+/*
+ * Release the write lock on the output HDU, close the UDP socket and, if any
+ * packets were dropped, log the dropped-byte percentage.
+ * Returns 0 on success, -1 if the HDU could not be unlocked.
+ */
+int udpdb_stop_function (udpdb_t* ctx)
+{
+  syslog(LOG_INFO, "stop: dada_hdu_unlock_write()");
+  if (dada_hdu_unlock_write (ctx->hdu) < 0)
+  {
+    syslog (LOG_ERR, "stop: could not unlock write on");
+    return -1;
+  }
+
+  // close the UDP socket
+  close(ctx->sock->fd);
+
+  if (ctx->packets->dropped)
+  {
+    double percent = (double) ctx->bytes->dropped / (double) ctx->last_byte;
+    percent *= 100;
+
+    // "%%" prints a literal percent sign; a lone trailing '%' is an invalid conversion
+    syslog(LOG_INFO, "bytes dropped %"PRIu64" / %"PRIu64 " = %8.6f %%",
+             ctx->bytes->dropped, ctx->last_byte, percent);
+  }
+  return 0;
+}
+
+
+
+
+/* --------- THREADS -------- */
+
+// STATS THREAD
+
+/*
+ *  Statistics thread: after each one-second sleep logs the receive rate, the
+ *  drop rate, the dropped-packet total and the most recent sequence number.
+ *  arg is the udpdb_t shared with main; the loop ends once quit_threads is set.
+ */
+void stats_thread(void * arg) {
+
+  /*  // set affinity
+  const pthread_t pid = pthread_self();
+  const int core_id = 4;
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(core_id, &cpuset);
+  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (set_result != 0)
+    syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id);
+  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (get_affinity != 0) 
+    syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id);
+  if (CPU_ISSET(core_id, &cpuset))
+    syslog(LOG_INFO,"thread %d: successfully set thread",core_id);
+  */
+
+  udpdb_t * ctx = (udpdb_t *) arg;
+
+  // byte totals as of the previous pass
+  uint64_t prev_rcv_bytes = 0;
+  uint64_t prev_drp_bytes = 0;
+
+  // byte totals sampled in the current pass
+  uint64_t now_rcv_bytes = 0;
+  uint64_t now_drp_bytes = 0;
+
+  // bytes accumulated between the two samples
+  uint64_t rcv_delta = 0;
+  uint64_t drp_delta = 0;
+
+  // rates reported in the log line (single precision)
+  float gbit_rcv_rate = 0;
+  float mbyte_drp_rate = 0;
+
+  syslog(LOG_INFO,"stats_thread: starting loop");
+
+  // sleep(1) in the loop header runs after every pass
+  for (; !quit_threads; sleep(1))
+  {
+
+    /* sample the shared counters back to back */
+    now_rcv_bytes = ctx->bytes->received;
+    now_drp_bytes = ctx->bytes->dropped;
+
+    /* growth since the previous pass */
+    rcv_delta = now_rcv_bytes - prev_rcv_bytes;
+    drp_delta = now_drp_bytes - prev_drp_bytes;
+
+    /* remember the samples for the next pass */
+    prev_rcv_bytes = now_rcv_bytes;
+    prev_drp_bytes = now_drp_bytes;
+
+    /* dropped megabytes and received gigabits over the interval */
+    mbyte_drp_rate = (double) drp_delta / 1000000;
+    gbit_rcv_rate = rcv_delta * 8;
+    gbit_rcv_rate /= 1000000000;
+
+    /* the value printed before "skipped" is the last sequence number seen;
+       the trailing 0 is literal text in the message */
+    syslog (LOG_NOTICE,"CAPSTATS %6.3f [Gb/s], D %4.1f [MB/s], D %"PRIu64" pkts, %"PRIu64" skipped 0", gbit_rcv_rate, mbyte_drp_rate, ctx->packets->dropped, ctx->last_seq);
+
+  }
+
+}
+
+
+
+
+
+
+
+// CONTROL THREAD
+
+void control_thread (void * arg) {
+  /* Control listener. Waits for UDP datagrams of the form "COMMAND-VALUE" on
+     iP:cPort and stores VALUE in UTC_START or UTC_STOP when COMMAND matches.
+     arg is the udpdb_t shared with main; it is not used by this thread. */
+  (void) arg;
+  syslog(LOG_INFO, "control_thread: starting");
+
+  // port on which to listen for control commands
+  int port = cPort;
+  char sport[16];
+  snprintf(sport,sizeof(sport),"%d",port);
+
+  // buffer for incoming command strings; the last byte is reserved for the NUL
+  const size_t bufsize = 1024;
+  char* buffer = (char *) malloc (bufsize);
+
+  // resolve the local address the control socket is bound to
+  struct addrinfo hints;
+  struct addrinfo* res=0;
+  memset(&hints,0,sizeof(hints));
+  hints.ai_family=AF_INET;
+  hints.ai_socktype=SOCK_DGRAM;
+  struct sockaddr_storage src_addr;
+  socklen_t src_addr_len;
+  int fd;
+  ssize_t ct;
+  char *endptr, *rest, *cmd, *val;
+  // static: pthread_exit() publishes this address, so it must outlive the thread
+  static int thread_result = 0;
+
+  if (!buffer || getaddrinfo(iP,sport,&hints,&res) != 0 || !res) {
+    syslog(LOG_ERR, "control_thread: cannot allocate buffer or resolve %s:%s", iP, sport);
+    free (buffer);
+    pthread_exit((void *) &thread_result);
+  }
+
+  syslog(LOG_INFO, "control_thread: created socket on port %d", port);
+
+  while (!quit_threads) {
+
+    // a fresh socket is opened and bound for every command
+    fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol);
+    if (fd < 0 || bind(fd,res->ai_addr,res->ai_addrlen) < 0) {
+      syslog(LOG_ERR, "control_thread: socket/bind failed: %s", strerror(errno));
+      if (fd >= 0) close(fd);
+      sleep(1);   // do not spin while the address is unavailable
+      continue;
+    }
+    // clear the whole buffer (sizeof(buffer) would only be the pointer size)
+    memset(buffer,'\0',bufsize);
+    src_addr_len = sizeof(src_addr);
+    syslog(LOG_INFO, "control_thread: waiting for packet");
+    ct = recvfrom(fd,buffer,bufsize-1,0,(struct sockaddr*)&src_addr,&src_addr_len);
+    close(fd);
+    if (ct <= 0) continue;   // error or empty datagram: nothing to parse
+
+    syslog(LOG_INFO, "control_thread: received buffer string %s",buffer);
+    // split "COMMAND-VALUE"; either token may be missing in a malformed packet
+    rest = buffer;
+    cmd = strtok_r(rest, "-", &rest);
+    val = strtok_r(rest, "-", &rest);
+    if (!cmd || !val) {
+      syslog(LOG_ERR, "control_thread: ignoring malformed command");
+      continue;
+    }
+    syslog(LOG_INFO, "control_thread: split into COMMAND %s, VALUE %s",cmd,val);
+
+    if (strcmp(cmd,"UTC_START")==0) UTC_START = strtoull(val,&endptr,0);
+    if (strcmp(cmd,"UTC_STOP")==0) UTC_STOP = strtoull(val,&endptr,0);
+  }
+
+  freeaddrinfo(res); free (buffer);
+  syslog(LOG_INFO, "control_thread: exiting");
+  pthread_exit((void *) &thread_result);
+}
+	    
+// MAIN of program
+	
+int main (int argc, char *argv[]) {
+  /* Capture UDP packets from iface:dPort into the DADA ring buffer out_key; */
+  /* control_thread and stats_thread run alongside the receive loop below.   */
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_capture", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+  
+  /* DADA Header plus Data Unit for writing */
+  dada_hdu_t* hdu_out = 0;
+
+  /* actual struct with info; zero-filled so log, verbose and every pointer start out defined */
+  udpdb_t udpdb = {0};
+  
+  // input data block HDU key
+  key_t out_key = CAPTURE_BLOCK_KEY;
+
+  // command line arguments
+  int core = -1;
+  int arg=0;
+  char dada_fnam[200] = ""; // filename for dada header (empty unless -f is given)
+  char iface[100] = ""; // IP for data packets (empty unless -j is given)
+  
+  while ((arg=getopt(argc,argv,"c:j:i:f:o:g:p:q:dh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'i':
+	  if (optarg)
+	    {	      
+	      snprintf(iP,sizeof(iP),"%s",optarg); // bounded copy: optarg length is user controlled
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-i flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'p':
+	  if (optarg)
+	    {	      
+	      dPort = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-p flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'q':
+	  if (optarg)
+	    {	      
+	      cPort = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-q flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'j':
+	  if (optarg)
+	    {	      
+	      snprintf(iface,sizeof(iface),"%s",optarg); // bounded copy
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-j flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }      	
+	case 'f':
+	  if (optarg)
+	    {	      
+	      snprintf(dada_fnam,sizeof(dada_fnam),"%s",optarg); // bounded copy
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-f flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }	 
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // record STATE info
+  sprintf(STATE,"NOBUFFER");
+
+  // initialize the data structure before any thread can see it:
+  // stats_thread dereferences udpdb.bytes and udpdb.packets straight away
+  syslog (LOG_INFO, "main: dsaX_udpdb_init_receiver()");
+  if (dsaX_udpdb_init_receiver (&udpdb) < 0)
+  {
+    syslog (LOG_ERR, "could not initialize receiver");
+    return EXIT_FAILURE;
+  }
+
+  // START THREADS
+  // start control thread
+  int rval = 0;
+  pthread_t control_thread_id, stats_thread_id;
+  if (DEBUG)
+    syslog (LOG_DEBUG, "Creating threads");
+  rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb);
+  if (rval != 0) {
+    syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval));
+    return -1;
+  }
+  syslog(LOG_NOTICE, "Created control thread, listening on %s:%d",iP,cPort);
+
+  // start the stats thread
+  rval = pthread_create (&stats_thread_id, 0, (void *) stats_thread, (void *) &udpdb);
+  if (rval != 0) {
+    syslog(LOG_INFO, "Error creating stats_thread: %s", strerror(rval));
+    return -1;
+  }
+  syslog(LOG_NOTICE, "started stats_thread()");
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      else syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+
+
+  // OPEN CONNECTION TO DADA DB FOR WRITING
+
+  if (DEBUG) syslog(LOG_INFO,"Creating HDU");
+  
+  hdu_out  = dada_hdu_create (0);
+  if (DEBUG) syslog(LOG_INFO,"Created hdu");
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog(LOG_ERR,"could not connect to output dada buffer");
+    return EXIT_FAILURE;
+  }
+  if (DEBUG) syslog(LOG_INFO,"Connected HDU");
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    dsaX_dbgpu_cleanup (hdu_out);
+    syslog(LOG_ERR,"could not lock to output dada buffer");
+    return EXIT_FAILURE;
+  }
+
+  syslog(LOG_INFO,"opened connection to output DB");
+
+  // DEAL WITH DADA HEADER
+  char *hout;
+  hout = (char *)malloc(sizeof(char)*4096);
+  if (DEBUG) syslog(LOG_INFO,"read header2");
+
+  if (fileread (dada_fnam, hout, 4096) < 0)
+    {
+      free (hout);
+      syslog (LOG_ERR, "could not read ASCII header from %s", dada_fnam);
+      return (EXIT_FAILURE);
+    }
+
+  
+  if (DEBUG) syslog(LOG_INFO,"read header3");
+
+  
+  
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_out);
+      return EXIT_FAILURE;
+    }
+
+
+  
+  // copy the in header to the out header
+  memcpy (header_out, hout, 4096);
+  free (hout);   // the template header is no longer needed once copied
+  // mark the output header buffer as filled
+  if (ipcbuf_mark_filled (hdu_out->header_block, 4096) < 0)
+    {
+      syslog(LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  // record STATE info
+  sprintf(STATE,"LISTEN");
+  syslog(LOG_INFO,"marked output header block as filled - now in LISTEN state");
+
+
+  /* time to start up receiver. 
+     data are captured on iface:CAPTURE_PORT 
+  */
+
+  
+  // put information in udpdb struct
+  udpdb.hdu = hdu_out;
+  udpdb.port = dPort;
+  udpdb.interface = strdup(iface);
+  udpdb.hdu_bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  // determine number of packets per block, must 
+  if (udpdb.hdu_bufsz % UDP_DATA != 0)
+  {
+    syslog(LOG_ERR, "data block size for [%"PRIu64"] was not a multiple of the UDP_DATA size [%d]\n", udpdb.hdu_bufsz, UDP_DATA);
+    return EXIT_FAILURE;
+  }
+  udpdb.packets_per_buffer = udpdb.hdu_bufsz / UDP_DATA;  
+  udpdb.bytes_to_acquire = 0;
+  udpdb.num_inputs = NSNAPS;
+
+  // prepare the socket
+  syslog(LOG_INFO, "main: dsaX_udpdb_prepare()");
+  if (dsaX_udpdb_prepare (&udpdb) < 0)
+  {
+    syslog(LOG_ERR, "could allocate required resources (prepare)");
+    return EXIT_FAILURE;
+  }
+  
+  // reset the receiver
+  syslog(LOG_INFO, "main: dsaX_udpdb_reset_receiver()");
+  dsaX_udpdb_reset_receiver (&udpdb);
+
+  // open a block of the data block, ready for writing
+  if (dsaX_udpdb_open_buffer (&udpdb) < 0)
+  {
+    syslog (LOG_ERR, "start: dsaX_udpdb_open_buffer failed");
+    return -1;
+  }
+  
+  /* START WHAT WAS in RECV THREAD */
+
+  // DEFINITIONS
+
+  // lookup table for ant order
+  uint64_t ant_lookup[100], vv;
+  for (int i=0;i<100;i++) ant_lookup[i] = 0;
+  for (int i=0;i<NSNAPS/2;i++) {
+    for (int j=0;j<2;j++) {
+      vv = (i*2+j)*3;
+      if (vv < 100) ant_lookup[vv] = (uint64_t)(i); // never write past the table
+    }
+  }
+  
+  int unhappies_ct = 0;
+  int unhappy = 0;
+  uint64_t act_seq_no = 0;
+  uint64_t block_seq_no = 0;
+  uint64_t seq_no = 0;
+  uint64_t ch_id = 0;
+  uint64_t ant_id = 0, aid;
+  unsigned char * b = (unsigned char *) udpdb.sock->buf;
+  ssize_t got = 0; // data received from a recv_from call (signed: recvfrom returns -1 on error)
+  int errsv; // determine the sequence number boundaries for curr and next buffers
+  int64_t byte_offset = 0; // offset of current packet in bytes from start of block
+  uint64_t seq_byte = 0; // offset of current packet in bytes from start of obs
+  // for "saving" out of order packets near edges of blocks
+  unsigned int temp_idx = 0;
+  unsigned int temp_max = 1000;
+  char ** temp_buffers; //[temp_max][UDP_DATA];
+  uint64_t * temp_seq_byte;
+  temp_buffers = (char **)malloc(sizeof(char *)*temp_max);
+  for (int i=0;i<temp_max;i++) temp_buffers[i] = (char *)malloc(sizeof(char)*UDP_DATA);
+  temp_seq_byte = (uint64_t *)malloc(sizeof(uint64_t)*temp_max);
+  unsigned i = 0;
+  uint64_t timeouts = 0;
+  uint64_t timeout_max = 1000000000;
+  int canWrite = 0;
+  int ct_snaps=0;
+
+  // infinite loop to receive packets
+  // use stats thread to monitor STATE at this stage, to save resources here
+
+  while (1)
+    {
+
+      udpdb.sock->have_packet = 0; 
+
+      // incredibly tight loop to try and get a packet
+      while (!udpdb.sock->have_packet)
+	{
+	 
+	  // receive 1 packet into the socket buffer
+	  got = recvfrom ( udpdb.sock->fd, udpdb.sock->buf, UDP_PAYLOAD, 0, NULL, NULL );
+
+	  if (got == UDP_PAYLOAD) 
+	    {
+	      udpdb.sock->have_packet = 1;
+	    } 
+	  else if (got == -1) 
+	    {
+	      errsv = errno;
+	      if (errsv == EAGAIN) 
+		{
+		  udpdb.n_sleeps++;
+		  if (udpdb.capture_started)
+		    timeouts++;
+		  if (timeouts > timeout_max)
+		    syslog(LOG_INFO, "timeouts[%"PRIu64"] > timeout_max[%"PRIu64"]\n",timeouts, timeout_max);		  
+		}
+	      else 
+		{
+		  syslog (LOG_ERR, "receive_obs: recvfrom failed %s", strerror(errsv));
+		  return EXIT_FAILURE;
+		}
+	    } 
+	  else // we received a packet of the WRONG size, ignore it
+	    {
+	      syslog (LOG_NOTICE, "receive_obs: received %zd bytes, expected %d", got, UDP_PAYLOAD);
+	    }
+	}
+      timeouts = 0;
+
+      // we have a valid packet within the timeout
+      if (udpdb.sock->have_packet) 
+	{
+
+	  // decode packet header (64 bits)
+	  // 35 bits seq_no (for first spectrum in packet); 13 bits ch_id (for first channel in packet); 16 bits ant ID (for first antenna in packet)
+	  seq_no = 0;
+	  seq_no |=  (((uint64_t)(udpdb.sock->buf[4]) & 224) >> 5) & 7;
+	  //seq_no &= 7;
+	  seq_no |=  (((uint64_t)(udpdb.sock->buf[3])) << 3) & 2040;
+	  //seq_no &= 2047;
+	  seq_no |=  (((uint64_t)(udpdb.sock->buf[2])) << 11) & 522240;
+	  //seq_no &= 524287;
+	  seq_no |=  (((uint64_t)(udpdb.sock->buf[1])) << 19) & 133693440;
+	  //seq_no &= 134217727;
+	  seq_no |=  (((uint64_t)(udpdb.sock->buf[0])) << 27) & 34225520640;
+	  //seq_no &= 34359738367;
+	  /*seq_no = 0;
+	  seq_no |= 224 >> 5;
+	  seq_no |= 255 << 3;
+	  seq_no |= 255 << 11;
+	  seq_no |= 255 << 19;*/
+	  
+	  /*ch_id = 0;
+	  ch_id |= ((unsigned char) (udpdb.sock->buf[4]) & 31) << 8;
+	  ch_id |= (unsigned char) (udpdb.sock->buf[5]);*/
+	  // 16-bit antenna ID from header bytes 6-7; it indexes ant_lookup, so packets with an out-of-range ID are ignored
+	  ant_id = 0;
+	  ant_id |= (unsigned char) (udpdb.sock->buf[6]) << 8;
+	  ant_id |= (unsigned char) (udpdb.sock->buf[7]);
+	  if (ant_id >= sizeof(ant_lookup)/sizeof(ant_lookup[0])) continue;
+	  aid = ant_lookup[ant_id];
+	  if (UTC_START==0) UTC_START = seq_no + 10000;
+	  
+	  //act_seq_no = seq_no*NCHANG*NSNAPS/2 + ant_id*NCHANG/3 + (ch_id-CHOFF)/384; // actual seq no
+	  act_seq_no = seq_no*NSNAPS/4 + aid; // actual seq no
+	  block_seq_no = UTC_START*NSNAPS/4; // seq no corresponding to ant 0 and start of block
+
+	  // check for starting or stopping condition, using continue
+	  //if (DEBUG) printf("%"PRIu64" %"PRIu64" %d\n",seq_no,act_seq_no,ch_id);//syslog(LOG_DEBUG, "seq_byte=%"PRIu64", num_inputs=%d, seq_no=%"PRIu64", ant_id =%"PRIu64", ch_id =%"PRIu64"",seq_byte,udpdb.num_inputs,seq_no,ant_id, ch_id);
+	  //if (seq_no == UTC_START && UTC_START != 10000 && ant_id == 0) canWrite=1;
+	  if (canWrite==0) {
+	    if (seq_no >= UTC_START-5 && UTC_START != 10000) ct_snaps++;
+	    if (ct_snaps >= 32) canWrite=1;
+	  }
+	  //if (seq_no > UTC_START && UTC_START != 10000) canWrite=1;	  
+	  udpdb.last_seq = seq_no;
+	  //syslog(LOG_INFO,"SEQ_NO_DBG %"PRIu64"",seq_no);
+	  if (canWrite == 0) continue;
+	  //if (seq_no == UTC_STOP) canWrite=0;
+	  //if (udpdb.packets->received<100) syslog(LOG_INFO, "seq_byte=%"PRIu64", num_inputs=%d, seq_no=%"PRIu64", ant_id =%"PRIu64", ch_id =%"PRIu64"",seq_byte,udpdb.num_inputs,seq_no,ant_id, ch_id);
+	  
+	  // if first packet
+	  if (!udpdb.capture_started)
+	    {
+	      //udpdb.block_start_byte = act_seq_no * UDP_DATA;
+	      udpdb.block_start_byte = block_seq_no * UDP_DATA;
+	      udpdb.block_end_byte   = (udpdb.block_start_byte + udpdb.hdu_bufsz) - UDP_DATA;
+	      udpdb.capture_started = 1;
+
+	      syslog (LOG_INFO, "receive_obs: START [%"PRIu64" - %"PRIu64"]", udpdb.block_start_byte, udpdb.block_end_byte);
+	    }
+
+	  // if capture running
+	  if (udpdb.capture_started)
+	    {
+	      seq_byte = (act_seq_no * UDP_DATA);	      
+
+	      udpdb.last_byte = seq_byte;
+	      
+	      // if packet arrived too late, ignore
+	      if (seq_byte < udpdb.block_start_byte)
+		{
+		  //syslog (LOG_INFO, "receive_obs: seq_byte < block_start_byte: %"PRIu64", %"PRIu64"", seq_no, ant_id);
+		  udpdb.packets->dropped++;
+		  udpdb.bytes->dropped += UDP_DATA;
+		}
+	      else
+		{
+		  // packet belongs in this block
+		  if (seq_byte <= udpdb.block_end_byte)
+		    {
+		      byte_offset = seq_byte - udpdb.block_start_byte;
+		      memcpy (udpdb.block + byte_offset, udpdb.sock->buf + UDP_HEADER, UDP_DATA);
+		      udpdb.packets->received++;
+		      udpdb.bytes->received += UDP_DATA;
+		      udpdb.block_count++;
+		    }
+		  // packet belongs in subsequent block
+		  else
+		    {
+		      //syslog (LOG_INFO, "receive_obs: received packet for subsequent buffer: temp_idx=%d, ant_id=%d, seq_no=%"PRIu64"",temp_idx,ant_id,seq_no);
+		      
+		      if (temp_idx < temp_max)
+			{
+			  // save packet to temp buffer
+			  memcpy (temp_buffers[temp_idx], udpdb.sock->buf + UDP_HEADER, UDP_DATA);
+			  temp_seq_byte[temp_idx] = seq_byte;
+			  temp_idx++;
+			}
+		      else
+			{
+			  udpdb.packets->dropped++;
+			  udpdb.bytes->dropped += UDP_DATA;
+			}
+		    }
+		}
+	    }
+
+	  // now check for a full buffer or full temp queue
+	  if ((udpdb.block_count >= udpdb.packets_per_buffer) || (temp_idx >= temp_max))
+	    {
+	      syslog (LOG_INFO, "BLOCK COMPLETE seq_no=%"PRIu64", "
+		      "ant_id=%"PRIu64", block_count=%"PRIu64", "
+		      "temp_idx=%u\n", seq_no, ant_id,  udpdb.block_count, 
+		      temp_idx);
+	      
+	      uint64_t dropped = udpdb.packets_per_buffer - udpdb.block_count;
+	      if (dropped)
+		{
+		  udpdb.packets->dropped += dropped;
+		  udpdb.bytes->dropped += (dropped * UDP_DATA);
+		}
+
+	      if (dropped>1000) unhappies_ct++;
+
+	      // get a new buffer and write any temp packets saved 
+	      if (dsaX_udpdb_new_buffer (&udpdb) < 0)
+		{
+		  syslog(LOG_ERR, "receive_obs: dsaX_udpdb_new_buffer failed");
+		  return EXIT_FAILURE;
+		}
+
+	      if (DEBUG) syslog(LOG_INFO, "block bytes: %"PRIu64" - %"PRIu64"\n", udpdb.block_start_byte, udpdb.block_end_byte);
+  
+	      // include any futuristic packets we saved
+	      for (i=0; i < temp_idx; i++)
+		{
+		  seq_byte = temp_seq_byte[i];
+		  byte_offset = seq_byte - udpdb.block_start_byte;
+		  if (byte_offset < udpdb.hdu_bufsz)
+		    {
+		      memcpy (udpdb.block + byte_offset, temp_buffers[i], UDP_DATA);
+		      udpdb.block_count++;
+		      udpdb.packets->received++;
+		      udpdb.bytes->received += UDP_DATA;
+		    }
+		  else
+		    {
+		      udpdb.packets->dropped++;
+		      udpdb.bytes->dropped += UDP_DATA;
+		    }
+		}
+	      temp_idx = 0;
+	    }
+	}
+
+      // packet has been inserted or saved by this point
+      udpdb.sock->have_packet = 0;
+
+      // deal with unhappy receiver
+      if (unhappies_ct > unhappies) {
+
+	syslog(LOG_INFO, "Skipping some blocks...");
+
+	close(udpdb.sock->fd);
+
+	for (int i=0;i<skips;i++) {
+
+	  udpdb.packets->dropped += udpdb.packets_per_buffer;
+	  udpdb.bytes->dropped += (udpdb.packets_per_buffer * UDP_DATA);
+
+	  if (dsaX_udpdb_new_buffer (&udpdb) < 0)
+	    {
+	      syslog(LOG_ERR, "receive_obs: dsaX_udpdb_new_buffer failed");
+	      return EXIT_FAILURE;
+	    }
+
+	}
+
+	sleep(sleeps);
+
+	// prepare the socket
+	syslog(LOG_INFO, "re-preparing the socket dsaX_udpdb_prepare()");
+	if (dsaX_udpdb_prepare (&udpdb) < 0)
+	  {
+	    syslog(LOG_ERR, "could allocate required resources (prepare)");
+	    return EXIT_FAILURE;
+	  }	
+	
+	unhappies_ct = 0;
+
+      }
+      
+    }
+
+  /* END WHAT WAS IN RECV THREAD */
+  
+
+  // close threads
+  syslog(LOG_INFO, "joining control_thread and stats_thread");
+  quit_threads = 1;
+  void* result=0;
+  pthread_join (control_thread_id, &result);
+  pthread_join (stats_thread_id, &result);
+
+  for (i=0; i < temp_max; i++) free(temp_buffers[i]); // release each saved-packet slot first
+  free(temp_buffers); free(temp_seq_byte);
+  
+  dsaX_dbgpu_cleanup (hdu_out);
+  return EXIT_SUCCESS;
+}
diff --git a/legacy/dsaX_capture.h b/legacy/dsaX_capture.h
new file mode 100644
index 0000000..58355f8
--- /dev/null
+++ b/legacy/dsaX_capture.h
@@ -0,0 +1,131 @@
+/***************************************************************************
+ *  
+ *    Copyright (C) 2009 by Andrew Jameson
+ *    Licensed under the Academic Free License version 2.1
+ * 
+ ****************************************************************************/
+
+#ifndef __DSAX_UDPDB_THREAD_H
+#define __DSAX_UDPDB_THREAD_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <time.h>
+#include <errno.h>
+#include <assert.h>
+#include <netinet/in.h>
+#include <signal.h>
+#include <inttypes.h>
+#include <sys/types.h>
+
+#include "futils.h"
+#include "dada_hdu.h"
+#include "dada_pwc_main.h"
+#include "multilog.h"
+#include "ipcio.h"
+#include "ascii_header.h"
+#include "dada_udp.h"
+
+#include "dsaX_def.h"
+
+#define DSAX_UDPDB_BUF_CLEAR 0   /* no '=' in an object-like macro: it would become part of the expansion */
+#define DSAX_UDPDB_BUF_FULL 1
+
+/* socket buffer for receiving udp data */
+typedef struct {
+
+  int           fd;            // FD of the socket
+  size_t        bufsz;         // size of socket buffer
+  char *        buf;          // the socket buffer
+  int           have_packet;   // set to 1 by the capture loop once buf holds a UDP_PAYLOAD-sized packet
+  size_t        got;           // amount of data received
+
+} dsaX_sock_t;
+
+dsaX_sock_t * dsaX_init_sock ();   /* allocator; name matches the definition in dsaX_capture.c */
+
+void dsaX_free_sock(dsaX_sock_t* b);
+
+/* Number of UDP packets to be received for a call to buffer_function */
+#define NOTRECORDING 0
+#define RECORDING 1
+
+typedef struct {
+
+  dada_hdu_t *      hdu;                // DADA Header + Data Unit
+  multilog_t *      log;                // DADA logging interface
+  int               verbose;            // verbosity flag 
+
+  dsaX_sock_t *     sock;               // UDP socket for data capture
+  int               port;               // port to receive UDP data 
+  int               control_port;       // port to receive control commands
+  char *            interface;          // IP Address to accept packets on 
+
+  // configuration for number of inputs
+  unsigned int      num_inputs;         // number of antennas / inputs
+
+  // datablock management
+  uint64_t          hdu_bufsz;         // size of one data block in bytes (from ipcbuf_get_bufsz in dsaX_capture.c)
+  unsigned          block_open;        // if the current data block element is open
+  char            * block;             // pointer to current datablock buffer
+  uint64_t          block_start_byte;  // seq_byte of first byte for the block
+  uint64_t          block_end_byte;    // seq_byte of first byte of final packet of the block
+  uint64_t          block_count;       // number of packets in this block
+  char            * tblock;            // area of memory to write to
+  
+  // packets
+  unsigned          capture_started;      // flag for start of UDP data
+  uint64_t          packets_per_buffer;   // number of UDP packets per datablock buffer
+
+  /* Packet and byte statistics */
+  stats_t * packets;                     // packet counters (received / dropped)
+  stats_t * bytes;                       // byte counters (received / dropped)
+
+  uint64_t bytes_to_acquire;             // set to 0 by dsaX_capture.c and not read there
+  double mb_rcv_ps;                      // presumably MB/s received -- TODO confirm; only zeroed in dsaX_capture.c
+  double mb_drp_ps;                      // presumably MB/s dropped -- TODO confirm; only zeroed in dsaX_capture.c
+  double mb_free;                        // not referenced in dsaX_capture.c
+  double mb_total;                       // not referenced in dsaX_capture.c
+  uint64_t rcv_sleeps;                   // not referenced in dsaX_capture.c
+
+  uint64_t last_seq;                     // most recently received seq number
+  uint64_t last_byte;                    // most recently received byte
+  struct   timeval timeout; 
+
+  uint64_t n_sleeps;                     // incremented each time recvfrom reports EAGAIN
+  uint64_t ooo_packets;                  // presumably out-of-order packet count -- TODO confirm; only zeroed in dsaX_capture.c
+
+  int      recv_core;                    // initialised to -1 by dsaX_udpdb_init_receiver
+
+} udpdb_t;
+
+
+int dsaX_udpdb_init_receiver (udpdb_t * ctx);     // zero counters, allocate socket wrapper and stats
+void dsaX_udpdb_reset_receiver (udpdb_t * ctx);   // clear per-observation markers and stats
+int dsaX_udpdb_destroy_receiver (udpdb_t * ctx);  // release the socket wrapper
+int dsaX_udpdb_open_buffer (udpdb_t * ctx);       // open the next data block for writing
+int dsaX_udpdb_close_buffer (udpdb_t * ctx, uint64_t bytes_written, unsigned eod);
+int dsaX_udpdb_new_buffer (udpdb_t * ctx);        // close the current block as full, open the next
+int dsaX_udpdb_increment (udpdb_t * ctx);         // NOTE(review): no definition in dsaX_capture.c
+
+// allocate required resources for data capture
+int dsaX_udpdb_prepare (udpdb_t * ctx);
+
+// move to a state where data acquisition can begin
+time_t dsaX_dpdb_start (udpdb_t * ctx, char * header);  // NOTE(review): no definition in dsaX_capture.c; name may be a typo for dsaX_udpdb_start
+
+// main workhorse function to receive data for a single observation
+void * dsaX_udpdb_receive_obs (void * ctx);       // NOTE(review): no definition in dsaX_capture.c
+
+// unlock the output HDU for writing and close the UDP socket
+int udpdb_stop_function (udpdb_t* ctx);
+
+void usage();
+void signal_handler (int signalValue);            // NOTE(review): no definition in dsaX_capture.c
+void stats_thread(void * arg);
+void control_thread(void * arg);
+
+#endif
diff --git a/legacy/dsaX_capture_manythread.c b/legacy/dsaX_capture_manythread.c
new file mode 100644
index 0000000..b9f14bd
--- /dev/null
+++ b/legacy/dsaX_capture_manythread.c
@@ -0,0 +1,1115 @@
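+/* dsaX_capture_manythread.c: Code to capture packets over a socket and write to a dada buffer.
+
+main: parses options, connects to the dada buffer and starts all threads
+control_thread: deals with control commands
+recv_thread / write_thread / stats_thread: capture packets, copy blocks to the dada buffer, log statistics
+*/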
+
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture_manythread.h"
+#include "dsaX_def.h"
+
+/* global variables (shared by main and every thread in this file) */
+int dPort, cPort;                          // data port (-p) and control port (-q)
+int quit_threads = 0;                      // loop-exit flag polled by every thread
+char STATE[20];                            // set to "LISTEN" by main; not read in this file
+uint64_t UTC_START = 10000;                // seq_no at which capture starts; 10000 means "not set yet" (see recv_thread)
+uint64_t UTC_STOP = 40000000000;           // written by control_thread; not read in this file
+int MONITOR = 0;                           // not referenced in this file
+char iP[100];                              // address for control commands (-i)
+int DEBUG = 0;                             // -d: enable debug syslog messages
+int HISTOGRAM[16];                         // not referenced in this file
+int writeBlock = 0;                        // blocks completed by the receivers; % 64 selects the tblock slot
+const int nth = 4;                         // number of receive threads
+const int nwth = 2;                        // number of write threads
+int cores[16] = {10,12,11,13,30,31,32,33}; // recv cores: entries [0,nth) when dPort==4011, [nth,2*nth) otherwise
+int write_cores[8] = {14,15,34,35};        // writer cores, selected the same way using nwth
+pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; // guards the shared counters below
+volatile int blockStatus[64];              // per-slot count: ++ by a receiver on block completion, -- by writer 0
+volatile int skipBlock = 0;                // not referenced in this file
+volatile int skipping = 0;                 // not referenced in this file
+volatile int lWriteBlock = 0;              // blocks handed to the dada buffer by the writers
+volatile int write_ct = 0;                 // writers that have copied their share of the current block
+volatile uint64_t last_seq = 0;            // seq_no of the most recently decoded packet (any receiver)
+volatile int skipct = 0;                   // sum of blockStatus[], computed by writer 0 and logged by stats_thread
+volatile uint64_t block_count = 0;         // packets stored in the current block
+volatile uint64_t block_start_byte=0, block_end_byte=0; // byte range covered by the current block
+volatile  unsigned capture_started = 0;    // set once the first block range has been established
+volatile char * wblock;                    // open dada block: set by open_buffer, zeroed by close_buffer
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * out);
+int dada_bind_thread_to_core (int core);
+void usage();
+
+/* Release the write lock on the output HDU and destroy the handle.
+ * A failed unlock is only logged; the HDU is destroyed regardless. */
+void dsaX_dbgpu_cleanup (dada_hdu_t * out)
+{
+  if (dada_hdu_unlock_write (out) < 0)
+    {
+      // it is the write lock that is released; the message used to say "read"
+      syslog(LOG_ERR, "could not unlock write on hdu_out");
+    }
+
+  dada_hdu_destroy (out);
+}
+
+/* Print the command-line option summary to stdout. */
+void usage()
+{
+  fputs ("dsaX_capture [options]\n"
+	 " -c core   bind process to CPU core [no default]\n"
+	 " -j IP to listen on for data packets [no default]\n"
+	 " -i IP to listen on for control commands [no default]\n"
+	 " -p PORT for data\n"
+	 " -q PORT for control\n"
+	 " -f filename of template dada header [no default]\n"
+	 " -o out_key [default CAPTURE_BLOCK_KEY]\n"
+	 " -d send debug messages to syslog\n"
+	 " -g chgroup [default 0]\n"
+	 " -h print usage\n", stdout);
+}
+
+// open a socket
+// Create the UDP data socket for one receive thread: bind to ctx->interface on
+// the global dPort, request a 64 MB buffer, go non-blocking, drop queued packets.
+// Returns a malloc'd dsaX_sock_t, or NULL if bind() fails. Zeroes blockStatus[].
+dsaX_sock_t * dsaX_make_sock (udpdb_t * ctx);
+dsaX_sock_t * dsaX_make_sock (udpdb_t * ctx)
+{
+  // prepare structure
+  syslog(LOG_INFO, "dsaX_make_sock(): preparing sock structure");
+  dsaX_sock_t * b = (dsaX_sock_t *) malloc(sizeof(dsaX_sock_t));
+  assert(b != NULL);
+  b->bufsz = sizeof(char) * UDP_PAYLOAD;
+  b->buf = (char *) malloc (b->bufsz);
+  assert(b->buf != NULL);
+  b->have_packet = 0;
+  b->fd = 0;
+
+  // open socket
+  syslog(LOG_INFO, "dsaX_make_sock(): connecting to socket %s:%d", ctx->interface, dPort);
+  syslog(LOG_INFO, "prepare: creating udp socket on %s:%d", ctx->interface, dPort);
+  b->fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP);
+  assert(b->fd>=0);
+
+  // for multiple connections; the two options are names, not bit flags, so set each separately
+  int one = 1;
+  setsockopt(b->fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
+  setsockopt(b->fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
+
+  struct sockaddr_in udp_sock;
+  bzero(&(udp_sock.sin_zero), 8);                     // clear the padding field
+  udp_sock.sin_family = AF_INET;                      // internet/IP
+  udp_sock.sin_port = htons(dPort);                    // set the port number
+  udp_sock.sin_addr.s_addr = inet_addr(ctx->interface);  // from a specific IP address
+
+  if (bind(b->fd, (struct sockaddr *)&udp_sock, sizeof(udp_sock)) == -1) {
+    syslog(LOG_ERR, "prepare: failed to bind to socket");
+    close(b->fd);              // undo everything acquired above before failing
+    free(b->buf); free(b);
+    return NULL;               // was `return -1`, which is not a valid pointer result
+  }
+
+  // set the socket size to 64 MB
+  int sock_buf_size = 64*1024*1024;
+  syslog(LOG_INFO, "prepare: setting buffer size to %d", sock_buf_size);
+  dada_udp_sock_set_buffer_size (ctx->log, b->fd, ctx->verbose, sock_buf_size);
+
+  // set the socket to non-blocking, then clear any packets buffered by the kernel
+  syslog(LOG_INFO, "prepare: setting non_block");
+  sock_nonblock(b->fd);
+  syslog(LOG_INFO, "prepare: clearing packets at socket");
+  dada_sock_clear_buffered_packets(b->fd, UDP_PAYLOAD);
+
+  // clear blockStatus
+  for (int i=0;i<64;i++) blockStatus[i] = 0;
+  return b;
+}
+
+
+
+// close a socket
+// Close the descriptor and free the packet buffer of `b`, then reset its
+// fields. The struct itself is not freed. Calling with NULL is a no-op.
+void dsaX_free_sock(dsaX_sock_t* b);
+void dsaX_free_sock(dsaX_sock_t* b)
+{
+  if (!b) return;
+  if (b->fd > 0) close (b->fd);   // previously only zeroed, leaking the socket
+  b->fd = 0; b->bufsz = 0; b->have_packet = 0;
+  free (b->buf); b->buf = 0;      // free(NULL) is a no-op
+}
+
+/* 
+ *  open a data block buffer ready for direct access
+ */
+// Returns 0 on success; -1 if a block is already open or the ipcio call fails.
+// On success the block address is published in the global wblock.
+int dsaX_udpdb_open_buffer (dsaX_write_t * ctx);
+int dsaX_udpdb_open_buffer (dsaX_write_t * ctx)
+{
+  uint64_t blk_idx = 0;   // out-argument of ipcio_open_block_write; not read afterwards
+  if (DEBUG)
+    syslog (LOG_DEBUG, "dsaX_udpdb_open_buffer()");
+  // refuse to open a second block while one is still held
+  if (ctx->block_open != 0)
+  {
+    syslog (LOG_ERR, "open_buffer: buffer already opened");
+    return -1;
+  }
+
+  if (DEBUG)
+    syslog (LOG_DEBUG, "open_buffer: ipcio_open_block_write");
+  wblock = ipcio_open_block_write (ctx->hdu->data_block, &blk_idx);
+  if (wblock == NULL)
+  {
+    syslog (LOG_ERR, "open_buffer: ipcio_open_block_write failed");
+    return -1;
+  }
+
+  ctx->block_open = 1;
+  return 0;
+}
+
+/*
+ *  close a data buffer, assuming a full block has been written
+ */
+// Logs a short block (INFO when eod, WARNING otherwise), then finalises it with
+// ipcio_update_block_write (eod) or ipcio_close_block_write. Returns 0, or -1 on error.
+int dsaX_udpdb_close_buffer (dsaX_write_t * ctx, uint64_t bytes_written, unsigned eod);
+int dsaX_udpdb_close_buffer (dsaX_write_t * ctx, uint64_t bytes_written, unsigned eod)
+{
+  if (DEBUG)
+    syslog (LOG_DEBUG, "dsaX_udpdb_close_buffer(%"PRIu64", %d)", bytes_written, eod);
+  if (ctx->block_open == 0)
+  {
+    syslog (LOG_ERR, "close_buffer: buffer already closed");
+    return -1;
+  }
+
+  // log any buffers that are not full, except for the 1 byte "EOD" buffer
+  const int is_partial = !((bytes_written == 1) || (bytes_written == ctx->hdu_bufsz));
+  if (is_partial)
+  {
+    const int level = eod ? LOG_INFO : LOG_WARNING;
+    syslog (level, "close_buffer: "
+              "bytes_written[%"PRIu64"] != hdu_bufsz[%"PRIu64"]",
+              bytes_written, ctx->hdu_bufsz);
+  }
+  if (!eod)
+  {
+    if (ipcio_close_block_write (ctx->hdu->data_block, bytes_written) < 0)
+    {
+      syslog (LOG_ERR, "close_buffer: ipcio_close_block_write failed");
+      return -1;
+    }
+  }
+  else if (ipcio_update_block_write (ctx->hdu->data_block, bytes_written) < 0)
+  {
+    syslog (LOG_ERR, "close_buffer: ipcio_update_block_write failed");
+    return -1;
+  }
+
+  wblock = 0;
+  ctx->block_open = 0;
+  return 0;
+}
+
+/* 
+ *  move to the next ring buffer element. return pointer to base address of new buffer
+ */
+// Close the current block as a full block and open the next; 0 on success, -1 on failure.
+int dsaX_udpdb_new_buffer (dsaX_write_t * ctx);
+int dsaX_udpdb_new_buffer (dsaX_write_t * ctx)
+{
+  if (DEBUG)
+    syslog (LOG_DEBUG, "dsaX_udpdb_new_buffer()");
+  // the outgoing block is always reported as full-sized and never as end-of-data
+  const int closed = dsaX_udpdb_close_buffer (ctx, ctx->hdu_bufsz, 0);
+  if (closed < 0)
+  {
+    syslog (LOG_ERR, "new_buffer: dsaX_udpdb_close_buffer failed");
+    return -1;
+  }
+  const int opened = dsaX_udpdb_open_buffer (ctx);
+  if (opened < 0)
+  {
+    syslog (LOG_ERR, "new_buffer: dsaX_udpdb_open_buffer failed");
+    return -1;
+  }
+  return 0;
+}
+
+// increment counters when block is full
+void dsaX_udpdb_increment (udpdb_t * ctx);
+void dsaX_udpdb_increment (udpdb_t * ctx)
+{
+  // one more block has been completed by the receivers
+  writeBlock += 1;
+  // the next block starts one packet after the old end marker and spans packets_per_buffer packets
+  const uint64_t next_start = block_end_byte + UDP_DATA;
+  block_start_byte = next_start;
+  block_end_byte = next_start + ( ctx->packets_per_buffer - 1) * UDP_DATA;
+  block_count = 0;
+}
+
+
+
+/* --------- THREADS -------- */
+
+// STATS THREAD
+
+/* 
+ *  Thread to print simple capture statistics. `arg` is a dsaX_stats_t* whose
+ *  bytes/packets counters are updated by the receive threads. After a 2 s
+ *  start-up delay it logs one CAPSTATS line per pass (each pass ends with a
+ *  1 s sleep) until quit_threads is set: receive rate in Gb/s, drop rate in
+ *  MB/s, cumulative dropped packets, the shared last_seq and skipct values.
+ *  Counters are read without taking the mutex, so figures are approximate.
+ */
+void stats_thread(void * arg) {
+
+  dsaX_stats_t * ctx = (dsaX_stats_t *) arg;
+
+  // byte totals seen on the previous pass, the current snapshot and the delta
+  uint64_t b_rcv_total = 0;
+  uint64_t b_rcv_1sec = 0;
+  uint64_t b_rcv_curr = 0;
+
+  uint64_t b_drp_total = 0;
+  uint64_t b_drp_1sec = 0;
+  uint64_t b_drp_curr = 0;
+
+  float gb_rcv_ps = 0;
+  float mb_drp_ps = 0;
+
+  syslog(LOG_INFO,"starting stats thread...");
+  sleep(2);
+  syslog(LOG_INFO,"started stats thread...");
+
+  while (!quit_threads)
+  {
+
+    /* get a snapshot of the data as quickly as possible */
+    b_rcv_curr = ctx->bytes->received;
+    b_drp_curr = ctx->bytes->dropped;
+
+    /* calc the values for the last second */
+    b_rcv_1sec = b_rcv_curr - b_rcv_total;
+    b_drp_1sec = b_drp_curr - b_drp_total;
+
+    /* update the totals */
+    b_rcv_total = b_rcv_curr;
+    b_drp_total = b_drp_curr;
+
+    /* bytes -> MB/s for drops, bytes -> bits -> Gb/s for the receive rate */
+    mb_drp_ps = (double) b_drp_1sec / 1000000;
+    gb_rcv_ps = b_rcv_1sec * 8;
+    gb_rcv_ps /= 1000000000;    
+
+    /* the value printed before "skipped" is last_seq; the one after it is skipct */
+    syslog (LOG_NOTICE,"CAPSTATS %6.3f [Gb/s], D %4.1f [MB/s], D %"PRIu64" pkts, %"PRIu64" skipped %d", gb_rcv_ps, mb_drp_ps, ctx->packets->dropped, last_seq, skipct);
+
+    sleep(1);
+  }
+}
+
+// CONTROL THREAD (`arg` is unused). Receives UDP datagrams on iP:cPort of the
+// form "<COMMAND>-<VALUE>"; COMMAND "UTC_START" / "UTC_STOP" stores VALUE (parsed
+// by strtoull, base auto-detected) in the matching global. A fresh socket is
+// opened and bound for every datagram.
+void control_thread (void * arg) {
+  (void) arg;
+  syslog(LOG_INFO, "control_thread: starting");
+
+  // port on which to listen for control commands
+  int port = cPort;
+  char sport[10];
+  snprintf(sport,sizeof(sport),"%d",port);
+
+  // buffer for incoming command strings, and setup of socket
+  const int bufsize = 1024;
+  char* buffer = (char *) malloc (sizeof(char) * bufsize);
+  if (!buffer) { syslog(LOG_ERR, "control_thread: could not allocate command buffer"); pthread_exit(NULL); }
+
+  struct addrinfo hints;
+  struct addrinfo* res=0;
+  memset(&hints,0,sizeof(hints));
+  struct sockaddr_storage src_addr;
+  socklen_t src_addr_len;
+  hints.ai_family=AF_INET;
+  hints.ai_socktype=SOCK_DGRAM;
+  if (getaddrinfo(iP,sport,&hints,&res) != 0 || res == NULL) {
+    syslog(LOG_ERR, "control_thread: could not resolve %s:%s", iP, sport);
+    free (buffer);
+    pthread_exit(NULL);
+  }
+  int fd;
+  ssize_t ct;
+  char *endptr;
+  char *cmd, *val, *save;
+
+  syslog(LOG_INFO, "control_thread: created socket on port %d", port);
+
+  while (!quit_threads) {
+
+    fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol);
+    if (fd < 0 || bind(fd,res->ai_addr,res->ai_addrlen) < 0) {
+      syslog(LOG_ERR, "control_thread: could not open or bind control socket");
+      if (fd >= 0) close(fd);
+      sleep(1);                          // avoid spinning while the address is unavailable
+      continue;
+    }
+    memset(buffer,'\0',bufsize);         // sizeof(buffer) was only the size of the pointer
+    src_addr_len = sizeof(src_addr);     // value-result argument: reset before every call
+    syslog(LOG_INFO, "control_thread: waiting for packet");
+    // leave room for the terminator so the payload is always a valid C string
+    ct = recvfrom(fd,buffer,bufsize-1,0,(struct sockaddr*)&src_addr,&src_addr_len);
+    close(fd);
+    if (ct <= 0) continue;               // error or empty datagram: nothing to parse
+    buffer[ct] = '\0';
+    syslog(LOG_INFO, "control_thread: received buffer string %s",buffer);
+
+    // INTERPRET BUFFER STRING: either UTC_START-<value> or UTC_STOP-<value>
+    cmd = strtok_r(buffer, "-", &save);
+    val = strtok_r(NULL, "-", &save);
+    if (cmd == NULL || val == NULL) {    // strtok_r yields NULL when a field is missing
+      syslog(LOG_ERR, "control_thread: ignoring malformed command");
+      continue;
+    }
+    syslog(LOG_INFO, "control_thread: split into COMMAND %s, VALUE %s",cmd,val);
+
+    if (strcmp(cmd,"UTC_START")==0)
+      UTC_START = strtoull(val,&endptr,0);
+    if (strcmp(cmd,"UTC_STOP")==0)
+      UTC_STOP = strtoull(val,&endptr,0);    
+  }
+
+  free (buffer);
+  freeaddrinfo (res);
+  syslog(LOG_INFO, "control_thread: exiting");
+  pthread_exit(NULL);   // previously passed the address of a local variable
+}
+
+
+/* 
+ *  Thread to capture data. `arg` is this thread's udpdb_t*. Returns 0 when
+ *  quit_threads ends the loop, EXIT_FAILURE if the socket cannot be created
+ *  or recvfrom() fails with anything other than EAGAIN.
+ *  NOTE: declared `int (*)(void *)` but started through pthread_create, which
+ *  expects `void *(*)(void *)`; main() casts the pointer.
+ *
+ *  Each thread pins itself to a core, opens its own non-blocking socket and
+ *  polls it. Packets inside the current block are copied into the shared
+ *  staging area (tblock); packets ahead of it are parked in a per-thread
+ *  queue of temp_max entries and replayed once the block advances.
+ */
+int recv_thread(void * arg) {
+
+  udpdb_t * udpdb = (udpdb_t *) arg;
+  int thread_id = udpdb->thread_id;
+
+  // set affinity
+  const pthread_t pid = pthread_self();
+  int core_id;
+  if (dPort==4011)
+    core_id = cores[thread_id];
+  else
+    core_id = cores[thread_id+nth];
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(core_id, &cpuset);
+  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (set_result != 0)
+    syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id);
+  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (get_affinity != 0) 
+    syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id);
+  if (CPU_ISSET(core_id, &cpuset))
+    syslog(LOG_INFO,"thread %d: successfully set thread",core_id);
+
+  // set up socket; failure may be reported as NULL or as -1 cast to a pointer
+  dsaX_sock_t * sock = dsaX_make_sock(udpdb);
+  if (sock == NULL || sock == (dsaX_sock_t *) -1) {
+    syslog(LOG_ERR,"thread %d: could not create data socket",core_id);
+    return EXIT_FAILURE;
+  }
+
+  // lookup table for ant order
+  uint64_t ant_lookup[100], vv;
+  for (int i=0;i<100;i++) ant_lookup[i] = 0;
+  for (int i=0;i<NSNAPS/2;i++) {
+    for (int j=0;j<2;j++) {
+      vv = (i*2+j)*3;
+      if (vv < 100) ant_lookup[vv] = (uint64_t)(i);   // never write past the table
+    }
+  }
+
+  // DEFINITIONS
+  uint64_t tpack = 0;
+  uint64_t act_seq_no = 0;
+  uint64_t block_seq_no = 0;
+  uint64_t seq_no = 0;
+  uint64_t ant_id = 0, aid;
+  unsigned char * b = (unsigned char *) sock->buf;
+  size_t got = 0; // data received from a recv_from call
+  int errsv; // errno saved right after a failed recvfrom
+  int64_t byte_offset = 0; // offset of current packet in bytes from start of block
+  uint64_t seq_byte = 0; // offset of current packet in bytes from start of obs
+  // for "saving" out of order packets near edges of blocks
+  unsigned int temp_idx = 0;
+  unsigned int temp_max = 500;
+  char ** temp_buffers;
+  uint64_t * temp_seq_byte;
+  temp_buffers = (char **)malloc(sizeof(char *)*temp_max);
+  for (int i=0;i<temp_max;i++) temp_buffers[i] = (char *)malloc(sizeof(char)*UDP_DATA);
+  temp_seq_byte = (uint64_t *)malloc(sizeof(uint64_t)*temp_max);
+  unsigned i = 0;
+  uint64_t timeouts = 0;
+  uint64_t timeout_max = 1000000000;
+  int canWrite = 0;
+  int ct_snaps=0;
+  int mod_WB;
+  int ctAnts = 0;
+
+  // infinite loop to receive packets
+
+  while (!quit_threads)
+    {
+
+      sock->have_packet = 0; 
+
+      // incredibly tight loop to try and get a packet
+      while (!sock->have_packet)
+	{
+	 
+	  // receive 1 packet into the socket buffer
+	  got = recvfrom ( sock->fd, sock->buf, UDP_PAYLOAD, 0, NULL, NULL );
+
+	  if (got == UDP_PAYLOAD) 
+	    {
+	      sock->have_packet = 1;
+	    } 
+	  else if (got == -1) 
+	    {
+	      errsv = errno;
+	      if (errsv == EAGAIN) 
+		{
+		  if (capture_started)
+		    timeouts++;
+		}
+	      else 
+		{
+		  //syslog (LOG_ERR, "receive_obs: recvfrom failed %s", strerror(errsv));
+		  return EXIT_FAILURE;
+		}
+	    } 
+	  else // we received a packet of the WRONG size, ignore it
+	    {
+	      syslog (LOG_NOTICE, "receive_obs: received %lu bytes, expected %d", got, UDP_PAYLOAD);
+	    }
+	}
+      timeouts = 0;
+
+      // we have a valid packet within the timeout
+      if (sock->have_packet) 
+	{
+
+	  // decode packet header (64 bits)
+	  // 35 bits seq_no (for first spectrum in packet); 13 bits ch_id (for first channel in packet); 16 bits ant ID (for first antenna in packet)
+	  seq_no = 0;
+	  seq_no |=  (((uint64_t)(sock->buf[4]) & 224) >> 5) & 7;
+	  seq_no |=  (((uint64_t)(sock->buf[3])) << 3) & 2040;
+	  seq_no |=  (((uint64_t)(sock->buf[2])) << 11) & 522240;
+	  seq_no |=  (((uint64_t)(sock->buf[1])) << 19) & 133693440;
+	  seq_no |=  (((uint64_t)(sock->buf[0])) << 27) & 34225520640;
+	  ant_id = 0;
+	  ant_id |= (unsigned char) (sock->buf[6]) << 8;
+	  ant_id |= (unsigned char) (sock->buf[7]);
+	  // ant_id comes straight off the wire: drop packets that would index
+	  // outside the 100-entry lookup table instead of reading past it
+	  if (ant_id >= 100) continue;
+	  aid = ant_lookup[ant_id];
+	  
+	  if (UTC_START==0) UTC_START = seq_no+30000;
+	  
+	  act_seq_no = seq_no*NSNAPS/4 + aid; // actual seq no
+	  block_seq_no = UTC_START*NSNAPS/4; // seq no corresponding to ant 0 and start of block
+
+	  // set shared last_seq
+	  pthread_mutex_lock(&mutex);
+	  last_seq = seq_no;
+	  pthread_mutex_unlock(&mutex);
+	  
+	  // check for starting or stopping condition, using continue
+	  if (canWrite==0) {
+	    if (seq_no >= UTC_START-50 && UTC_START != 10000) {
+	      canWrite=1;	      
+	    }
+	  }
+	  if (canWrite == 0) continue;
+
+	  // threadsafe start of capture
+	  pthread_mutex_lock(&mutex);
+	  if (!(capture_started))
+	    {
+	      block_start_byte = block_seq_no * UDP_DATA;
+	      block_end_byte   = (block_start_byte + udpdb->hdu_bufsz) - UDP_DATA;
+	      capture_started = 1;
+
+	      syslog (LOG_INFO, "receive_obs: START [%"PRIu64" - %"PRIu64"]", block_start_byte, block_end_byte);
+	    }
+	  pthread_mutex_unlock(&mutex);
+
+	  // if capture running
+	  if (capture_started)
+	    {
+	      seq_byte = (act_seq_no * UDP_DATA);
+	      tpack++;
+	      
+	      // packet belongs in this block
+	      if ((seq_byte <= block_end_byte) && (seq_byte >= block_start_byte))
+		{
+		  byte_offset = seq_byte - (block_start_byte);
+		  mod_WB = writeBlock % 64;
+		  memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, sock->buf + UDP_HEADER, UDP_DATA);		  
+		  pthread_mutex_lock(&mutex);		  
+		  block_count++;
+		  pthread_mutex_unlock(&mutex);
+		  
+		}
+	      // packet belongs in subsequent block
+	      else if (seq_byte > block_end_byte)
+		{
+		      
+		  if (temp_idx < temp_max)
+		    {
+		      // save packet to temp buffer
+		      memcpy (temp_buffers[temp_idx], sock->buf + UDP_HEADER, UDP_DATA);
+		      temp_seq_byte[temp_idx] = seq_byte;
+		      temp_idx++;
+		    }
+		}
+	      // a packet older than block_start_byte is too late and is silently dropped
+	    }
+	  
+	  // threadsafe end of block
+	  pthread_mutex_lock(&mutex);
+	  if ((block_count >= udpdb->packets_per_buffer) || (temp_idx >= temp_max))
+	    {
+	      syslog (LOG_INFO, "BLOCK COMPLETE thread_id=%d, seq_no=%"PRIu64", "
+		      "ant_id=%"PRIu64", block_count=%"PRIu64", "
+		      "temp_idx=%d, writeBlock=%d", thread_id, seq_no, ant_id,  block_count, 
+		      temp_idx,writeBlock);
+
+	      // write block
+	      // check whether doWrite has been released. If not, skip this block
+	      if (blockStatus[writeBlock % 64] > 0)
+		blockStatus[writeBlock % 64] += 1;
+	      else
+		blockStatus[writeBlock % 64] = 1;
+	      
+	      uint64_t dropped = udpdb->packets_per_buffer - (block_count);
+	      udpdb->packets->received += (block_count);
+	      udpdb->bytes->received += (block_count) * UDP_DATA;	      
+	      if (dropped)
+		{
+		  udpdb->packets->dropped += dropped;
+		  udpdb->bytes->dropped += (dropped * UDP_DATA);
+		}
+
+	      // increment counters
+	      dsaX_udpdb_increment(udpdb);
+	      ctAnts = 0;
+
+	      // write temp queue for this thread
+	      tpack = 0;
+	
+	      for (i=0; i < temp_idx; i++)
+		{
+		  seq_byte = temp_seq_byte[i];
+		  byte_offset = seq_byte - (block_start_byte);
+		  if (byte_offset < udpdb->hdu_bufsz && byte_offset >= 0)
+		    {
+		      mod_WB = writeBlock % 64;
+		      memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, temp_buffers[i], UDP_DATA);
+		      // the mutex is already held in this section
+		      block_count++;		      
+		    }
+		}
+	      temp_idx = 0;
+       
+	    }
+	  pthread_mutex_unlock(&mutex);
+
+	  // at this stage, can try and write temp queue safely for other threads
+	  // (temp_idx is tested first so an empty queue's first slot is never read)
+	  if (temp_idx > 0 && temp_seq_byte[0] >= block_start_byte && temp_seq_byte[0] <= block_end_byte)
+	    {
+	      tpack = 0;
+	
+	      for (i=0; i < temp_idx; i++)
+		{
+		  seq_byte = temp_seq_byte[i];
+		  byte_offset = seq_byte - (block_start_byte);
+		  if (byte_offset < udpdb->hdu_bufsz && byte_offset >= 0)
+		    {
+		      mod_WB = writeBlock % 64;
+		      memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, temp_buffers[i], UDP_DATA);
+		      pthread_mutex_lock(&mutex);
+		      block_count++;		      
+		      pthread_mutex_unlock(&mutex);
+		    }
+		}
+	      temp_idx = 0;
+
+	    }
+
+	}
+
+      // packet has been inserted or saved by this point
+      sock->have_packet = 0;
+    }
+  dsaX_free_sock(sock);
+  free(sock);                                           // struct allocated by dsaX_make_sock
+  for (i=0; i < temp_max; i++) free(temp_buffers[i]);   // each slot was malloc'd separately
+  free(temp_buffers);
+  free(temp_seq_byte);
+  return 0;
+}
+
+/* 
+ *  Thread to write data. `arg` is a dsaX_write_t*. Each of the nwth writers
+ *  spins until blockStatus[] marks the next staged block as complete, copies
+ *  its 1/nwth share of that block from tblock into the open DADA block
+ *  (wblock), then waits on write_ct. Writer 0 alone calls
+ *  dsaX_udpdb_new_buffer and releases the others by zeroing write_ct.
+ */
+void write_thread(void * arg) {
+  dsaX_write_t * udpdb = (dsaX_write_t *) arg;
+  int thread_id = udpdb->thread_id;
+
+  // set affinity
+  const pthread_t pid = pthread_self();
+  int core_id;
+  if (dPort==4011)
+    core_id = write_cores[thread_id];
+  else
+    core_id = write_cores[thread_id+nwth];
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(core_id, &cpuset);
+  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (set_result != 0)
+    syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id);
+  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (get_affinity != 0) 
+    syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id);
+  if (CPU_ISSET(core_id, &cpuset))
+    syslog(LOG_INFO,"thread %d: successfully set thread",core_id);
+
+  int mod_WB = 0;
+  int a;   // dummy target so the spin loops have a body
+
+  while (!quit_threads)
+  {
+    mod_WB = lWriteBlock % 64;
+
+    // spin until a receive thread reports this slot as complete
+    while (blockStatus[mod_WB]==0) {
+      a=1;
+    }    
+
+    // assume everything is set up
+    // wblock is assigned, write_ct=0
+    // (the cast drops the volatile qualifier, which memcpy's prototype does not accept)
+    memcpy((char *) wblock + thread_id*udpdb->hdu_bufsz/nwth, udpdb->tblock + mod_WB*udpdb->hdu_bufsz  + thread_id*udpdb->hdu_bufsz/nwth, udpdb->hdu_bufsz/nwth);
+
+    pthread_mutex_lock(&mutex);
+    write_ct++;
+    pthread_mutex_unlock(&mutex);
+
+    // now wait until thread 0 has finished getting a new block before moving on
+    if (thread_id>0) {
+      while (write_ct!=0) a=1;
+    }
+    else {
+      // wait for all sub-blocks to be written
+      while (write_ct<nwth) a=1;
+
+      // get new block
+      if (dsaX_udpdb_new_buffer (udpdb) < 0)
+	{
+	  syslog(LOG_ERR, "receive_obs: dsaX_udpdb_new_buffer failed");
+	  return;   // function returns void; `return EXIT_FAILURE` was invalid here
+	}
+
+      syslog(LOG_INFO,"write thread %d: written block... %d",thread_id,lWriteBlock);
+      lWriteBlock++;
+
+      // update skipct and release the slot; the receivers modify blockStatus[]
+      // under this mutex, so the read-modify-write here must hold it as well
+      pthread_mutex_lock(&mutex);
+      skipct = 0;
+      for (int i=0;i<64;i++) skipct += blockStatus[i];
+      blockStatus[mod_WB] -= 1;
+      pthread_mutex_unlock(&mutex);
+      write_ct = 0;
+    }
+  }
+}
+
+
+	    
+// MAIN of program: parse options, start the control thread, connect to the
+// output DADA buffer and publish the header read from the -f file, then start
+// the stats, receive and write threads and wait until quit_threads is set.
+// Returns EXIT_SUCCESS/EXIT_FAILURE (-1 if a thread cannot be created).
+int main (int argc, char *argv[]) {
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_capture_manythread", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  /* DADA Header plus Data Unit for writing */
+  dada_hdu_t* hdu_out = 0;
+
+  // input data block HDU key
+  key_t out_key = CAPTURE_BLOCK_KEY;
+
+  // command line arguments
+  int core = -1;
+  int chgroup = 0;
+  int arg=0;
+  char dada_fnam[200] = ""; // filename for dada header (empty until -f is given)
+  char iface[100] = ""; // IP for data packets (empty until -j is given)
+
+  while ((arg=getopt(argc,argv,"c:j:i:f:o:g:p:q:dh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'i':
+	  if (optarg)
+	    {	      
+	      snprintf(iP,sizeof(iP),"%s",optarg);   // bounded copy: iP is a fixed 100-byte buffer
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-i flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'g':
+	  if (optarg)
+	    {	      
+	      chgroup = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-g flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'j':
+	  if (optarg)
+	    {	      
+	      snprintf(iface,sizeof(iface),"%s",optarg);   // bounded copy
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-j flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }      	
+	case 'p':
+	  if (optarg)
+	    {
+	      dPort = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-p flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }      	
+	case 'q':
+	  if (optarg)
+	    {
+	      cPort = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-q flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }      	
+	case 'f':
+	  if (optarg)
+	    {	      
+	      snprintf(dada_fnam,sizeof(dada_fnam),"%s",optarg);   // bounded copy
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-f flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }	 
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // START THREADS
+  // start control thread
+  int rval = 0;
+  pthread_t control_thread_id;
+  udpdb_t temp_str;
+  rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &temp_str);
+  if (rval != 0) {
+    syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval));
+    return -1;
+  }
+  syslog(LOG_NOTICE, "Created control thread, listening on %s:%d",iP,cPort);
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      else   // success used to be logged even after a failure
+	syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+  // OPEN CONNECTION TO DADA DB FOR WRITING
+
+  if (DEBUG) syslog(LOG_DEBUG,"Creating HDU");
+  hdu_out  = dada_hdu_create (0);
+  // syslog's first argument is a priority: passing the DEBUG flag (1) logged at LOG_ALERT
+  if (DEBUG) syslog(LOG_DEBUG,"Created hdu");
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog(LOG_ERR,"could not connect to output dada buffer");
+    return EXIT_FAILURE;
+  }
+  if (DEBUG) syslog(LOG_DEBUG,"Connected HDU");
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    dsaX_dbgpu_cleanup (hdu_out);
+    syslog(LOG_ERR,"could not lock to output dada buffer");
+    return EXIT_FAILURE;
+  }
+
+  syslog(LOG_INFO,"opened connection to output DB");
+
+  // DEAL WITH DADA HEADER
+  char *hout;
+  hout = (char *)malloc(sizeof(char)*4096);
+  if (DEBUG) syslog(LOG_DEBUG,"read header2");
+  if (fileread (dada_fnam, hout, 4096) < 0)
+    {
+      free (hout);
+      syslog (LOG_ERR, "could not read ASCII header from %s", dada_fnam);
+      dsaX_dbgpu_cleanup (hdu_out);   // release the write lock like the other error paths
+      return (EXIT_FAILURE);
+    }
+
+  if (DEBUG) syslog(LOG_DEBUG,"read header3");
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  // copy the in header to the out header
+  memcpy (header_out, hout, 4096);
+  free (hout);   // the template copy is not needed once it is in the header block
+  // mark the output header buffer as filled
+  if (ipcbuf_mark_filled (hdu_out->header_block, 4096) < 0)
+    {
+      syslog(LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  // record STATE info
+  sprintf(STATE,"LISTEN");
+  syslog(LOG_INFO,"marked output header block as filled - now in LISTEN state");
+
+  /* time to start up receiver. 
+     data are captured on iface:CAPTURE_PORT 
+  */
+
+  // make recv, write, and stats structs  
+  udpdb_t udpdb[nth];
+  dsaX_stats_t stats;
+  dsaX_write_t writey[nwth];
+
+  // shared variables and memory: tblock stages 64 data blocks
+  uint64_t bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  char * tblock = (char *)malloc(sizeof(char)*bufsz*64);
+  if (!tblock) {
+    syslog(LOG_ERR, "could not allocate staging memory for 64 blocks");
+    dsaX_dbgpu_cleanup (hdu_out);
+    return EXIT_FAILURE;
+  }
+  stats_t * packets = init_stats_t();
+  stats_t * bytes = init_stats_t();
+  reset_stats_t(packets);
+  reset_stats_t(bytes);
+
+  // initialise stats struct
+  stats.packets = packets;
+  stats.bytes = bytes;
+
+  // initialise writey struct and open buffer
+  for (int i=0;i<nwth;i++) {
+    writey[i].hdu = hdu_out;
+    writey[i].hdu_bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+    writey[i].block_open = 0;
+    writey[i].tblock = tblock;
+    writey[i].thread_id = i;    
+  }
+  if (dsaX_udpdb_open_buffer (&writey[0]) < 0) {   // the callee logs the reason
+    free (tblock);
+    dsaX_dbgpu_cleanup (hdu_out);
+    return EXIT_FAILURE;
+  }
+
+  // initialise all udpdb structs
+  for (int i=0;i<nth;i++) {
+
+    // shared stuff
+    udpdb[i].packets = packets;
+    udpdb[i].bytes = bytes;
+    udpdb[i].tblock = tblock;
+
+    // the rest
+    udpdb[i].port = dPort;
+    udpdb[i].interface = strdup(iface);
+    udpdb[i].hdu_bufsz = bufsz;
+    udpdb[i].packets_per_buffer = udpdb[i].hdu_bufsz / UDP_DATA;
+    udpdb[i].num_inputs = NSNAPS;
+    udpdb[i].verbose = 0;
+    udpdb[i].rcv_sleeps = 0;
+    udpdb[i].thread_id = i;    
+  }
+
+  /* start threads */
+  // start the stats thread
+  pthread_t stats_thread_id;
+  rval = pthread_create (&stats_thread_id, 0, (void *) stats_thread, (void *) &stats);
+  if (rval != 0) {
+    syslog(LOG_INFO, "Error creating stats_thread: %s", strerror(rval));
+    return -1;
+  }
+  syslog(LOG_NOTICE, "started stats_thread()");
+
+  // start the receive threads
+  pthread_t recv_thread_id[nth];  
+  rval = 0;
+  for (int i=0;i<nth;i++) {
+    rval = pthread_create (&recv_thread_id[i], 0, (void *) recv_thread, (void *) (&udpdb[i]));
+    if (rval != 0) {
+      syslog(LOG_ERR, "Error creating recv_thread %d: %s", i,strerror(rval));
+      return -1;
+    }
+  }
+  syslog(LOG_NOTICE, "Created recv threads");
+
+  // start the write thread
+  pthread_t write_thread_id[nwth];
+  rval = 0;
+  for (int i=0;i<nwth;i++) {
+    rval = pthread_create (&write_thread_id[i], 0, (void *) write_thread, (void *) (&writey[i]));
+    if (rval != 0) {
+      syslog(LOG_INFO, "Error creating write_thread: %s", strerror(rval));
+      return -1;
+    }
+  }
+  syslog(LOG_NOTICE, "started write threads");  
+
+  while (!quit_threads) {
+    sleep(1);
+  }
+
+  // close threads
+  syslog(LOG_INFO, "joining all threads");
+  quit_threads = 1;
+  void* result=0;
+  pthread_join (control_thread_id, &result);
+  pthread_join (stats_thread_id, &result);
+  for (int i=0;i<nth;i++) pthread_join(recv_thread_id[i], &result);
+  for (int i=0;i<nwth;i++) pthread_join(write_thread_id[i], &result);
+
+  free(tblock);
+  dsaX_dbgpu_cleanup (hdu_out);
+  return EXIT_SUCCESS;
+}
diff --git a/legacy/dsaX_capture_manythread.c.bak b/legacy/dsaX_capture_manythread.c.bak
new file mode 100644
index 0000000..e3fd2b6
--- /dev/null
+++ b/legacy/dsaX_capture_manythread.c.bak
@@ -0,0 +1,1053 @@
+/* dsaX_capture_manythread.c: Code to capture packets over a socket and write to a dada buffer.
+
+main: parses options, connects to the dada buffer, then starts the stats, receive and write threads
+control_thread: deals with control commands (UTC_START, UTC_STOP)
+
+*/
+
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture_manythread.h"
+#include "dsaX_def.h"
+
+/* global variables */
+int quit_threads = 0;                 // every thread loop polls this; non-zero asks them to stop
+char STATE[20];                       // main stores "LISTEN" here once the header block is filled
+uint64_t UTC_START = 10000;           // start seq_no set by the control thread; 10000 means "not set yet" (tested in recv_thread)
+uint64_t UTC_STOP = 40000000000;      // overwritten by the UTC_STOP control command
+int MONITOR = 0;                      // NOTE(review): not referenced in the visible part of this file
+char iP[100];                         // -i: IP the control thread listens on
+int DEBUG = 0;                        // -d: non-zero enables the debug syslog messages
+int HISTOGRAM[16];                    // NOTE(review): not referenced in the visible part of this file
+int writeBlock = 0;                   // blocks completed by the recv threads; (writeBlock % 64) picks the tblock slot
+const int nth = 8;                    // number of receive threads
+const int nwth = 4;                   // number of write threads
+int cores[8] = {30,31,32,33,34,35,36,37};   // CPU core for each receive thread, indexed by thread_id
+int write_cores[4] = {17,18,19,39};   // CPU core for each write thread, indexed by thread_id
+pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;   // shared by all threads; taken around most updates of the counters below
+volatile int doWrite = 0;             // raised by a recv thread when a block completes, cleared by write thread 0
+volatile int skipBlock = 0;           // raised when a block completes while doWrite is still raised
+volatile int skipping = 0;            // NOTE(review): not referenced in the visible part of this file
+volatile int lWriteBlock = 0;         // blocks handed to the dada buffer by the write threads
+volatile int write_ct = 0;            // write threads that have copied their slice of the current block
+volatile uint64_t last_seq = 0;       // seq_no of the most recently decoded packet (any recv thread)
+volatile int skipct = 0;              // number of times write thread 0 saw skipBlock raised
+volatile uint64_t block_count = 0;    // packets stored in the current block
+volatile uint64_t block_start_byte=0, block_end_byte=0;   // byte offset of the first packet and of the final packet of the current block
+volatile  unsigned capture_started = 0;   // set once the first block window has been initialised
+volatile char * wblock;               // dada data block currently open for writing
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * out);
+int dada_bind_thread_to_core (int core);
+void usage();
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * out)
+{
+  // Release the output HDU: drop its write lock, then destroy the handle.
+  // A failed unlock is only logged, so the handle is destroyed in every case.
+  // out: the HDU created and connected by main.
+  if (dada_hdu_unlock_write (out) < 0)
+    {
+      // this path releases the *write* lock, so the log has to say so
+      syslog(LOG_ERR, "could not unlock write on hdu_out");
+    }
+  dada_hdu_destroy (out);
+}
+
+void usage() {
+  static const char help_text[] =
+    "dsaX_capture [options]\n"
+    " -c core   bind process to CPU core [no default]\n"
+    " -j IP to listen on for data packets [no default]\n"
+    " -i IP to listen on for control commands [no default]\n"
+    " -f filename of template dada header [no default]\n"
+    " -o out_key [default CAPTURE_BLOCK_KEY]\n"
+    " -d send debug messages to syslog\n"
+    " -g chgroup [default 0]\n"
+    " -h print usage\n";
+  fputs (help_text, stdout);   // write the option summary to stdout
+}
+
+// open a socket: allocate, bind and configure the UDP socket used by one receive thread.
+// returns the new dsaX_sock_t, or NULL if the bind fails
+dsaX_sock_t * dsaX_make_sock (udpdb_t * ctx);
+dsaX_sock_t * dsaX_make_sock (udpdb_t * ctx)
+{
+  // prepare structure
+  syslog(LOG_INFO, "dsaX_make_sock(): preparing sock structure");
+  dsaX_sock_t * b = (dsaX_sock_t *) malloc(sizeof(dsaX_sock_t));
+  assert(b != NULL);
+  b->bufsz = sizeof(char) * UDP_PAYLOAD;
+  b->buf = (char *) malloc (b->bufsz);
+  assert(b->buf != NULL);
+  b->have_packet = 0;
+  b->fd = 0;
+
+  // announce, then open the UDP socket
+  syslog(LOG_INFO, "dsaX_make_sock(): connecting to socket %s:%d", ctx->interface, ctx->port);
+  syslog(LOG_INFO, "prepare: creating udp socket on %s:%d", ctx->interface, ctx->port);
+  b->fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP);
+  assert(b->fd>=0);
+
+  // for multiple connections. NOTE(review): SO_* option names are not bit flags, so OR-ing two of them passes a single option value - confirm which option is intended
+  int one = 1;
+  setsockopt(b->fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &one, sizeof(one));
+
+  struct sockaddr_in udp_sock;
+  memset(&udp_sock, 0, sizeof(udp_sock));                // clear the whole struct
+  udp_sock.sin_family = AF_INET;                         // internet/IP
+  udp_sock.sin_port = htons(ctx->port);                  // set the port number
+  udp_sock.sin_addr.s_addr = inet_addr(ctx->interface);  // from a specific IP address
+
+  if (bind(b->fd, (struct sockaddr *)&udp_sock, sizeof(udp_sock)) == -1) {
+    syslog(LOG_ERR, "prepare: failed to bind to socket");
+    close(b->fd);    // release the descriptor and both allocations before bailing out
+    free(b->buf);
+    free(b);
+    return NULL;     // pointer-returning function: failure is NULL (the old code returned -1)
+  }
+
+  // set the socket size to 256 MB
+  int sock_buf_size = 256*1024*1024;
+  syslog(LOG_INFO, "prepare: setting buffer size to %d", sock_buf_size);
+  dada_udp_sock_set_buffer_size (ctx->log, b->fd, ctx->verbose, sock_buf_size);
+
+  // set the socket to non-blocking
+  syslog(LOG_INFO, "prepare: setting non_block");
+  sock_nonblock(b->fd);
+
+  // clear any packets buffered by the kernel
+  syslog(LOG_INFO, "prepare: clearing packets at socket");
+  dada_sock_clear_buffered_packets(b->fd, UDP_PAYLOAD);
+  return b;
+}
+
+
+
+// close a socket and release its packet buffer (the struct itself is not freed here)
+void dsaX_free_sock(dsaX_sock_t* b);
+void dsaX_free_sock(dsaX_sock_t* b)
+{
+  if (!b) return;                      // tolerate a socket that was never created
+  if (b->fd > 0) close (b->fd);        // fd 0 is the "not open" marker, so only close real descriptors
+  b->fd = 0;
+  b->bufsz = 0; b->have_packet = 0;
+  free (b->buf);                       // free(NULL) is a no-op, no guard needed
+  b->buf = 0;
+}
+
+/* 
+ *  open a data block buffer ready for direct access
+ */
+int dsaX_udpdb_open_buffer (dsaX_write_t * ctx);
+int dsaX_udpdb_open_buffer (dsaX_write_t * ctx)
+{
+  // Open the next dada data block for direct writing and publish it through the
+  // shared wblock pointer. Returns 0 on success, -1 on any failure.
+  uint64_t opened_id = 0;
+
+  if (DEBUG)
+    syslog (LOG_DEBUG, "dsaX_udpdb_open_buffer()");
+
+  if (ctx->block_open != 0) {
+    syslog (LOG_ERR, "open_buffer: buffer already opened");
+    return -1;
+  }
+
+  if (DEBUG)
+    syslog (LOG_DEBUG, "open_buffer: ipcio_open_block_write");
+
+  wblock = ipcio_open_block_write (ctx->hdu->data_block, &opened_id);
+  if (wblock == 0) {
+    syslog (LOG_ERR, "open_buffer: ipcio_open_block_write failed");
+    return -1;
+  }
+
+  ctx->block_open = 1;
+  return 0;
+}
+
+/*
+ *  close a data buffer, assuming a full block has been written
+ */
+int dsaX_udpdb_close_buffer (dsaX_write_t * ctx, uint64_t bytes_written, unsigned eod);
+int dsaX_udpdb_close_buffer (dsaX_write_t * ctx, uint64_t bytes_written, unsigned eod)
+{
+  // Hand the currently open data block back to the ring buffer.
+  //   bytes_written: number of bytes reported for the block
+  //   eod:           non-zero -> ipcio_update_block_write, zero -> ipcio_close_block_write
+  // Returns 0 on success, -1 when no block is open or the ipcio call fails.
+  if (DEBUG)
+    syslog (LOG_DEBUG, "dsaX_udpdb_close_buffer(%"PRIu64", %d)", bytes_written, eod);
+
+  if (ctx->block_open == 0) {
+    syslog (LOG_ERR, "close_buffer: buffer already closed");
+    return -1;
+  }
+
+  // a short block is worth a log line, unless it is the 1 byte "EOD" buffer
+  const int is_partial = !((bytes_written == 1) || (bytes_written == ctx->hdu_bufsz));
+  if (is_partial) {
+    const int level = eod ? LOG_INFO : LOG_WARNING;
+    syslog (level, "close_buffer: "
+              "bytes_written[%"PRIu64"] != hdu_bufsz[%"PRIu64"]",
+              bytes_written, ctx->hdu_bufsz);
+  }
+
+  if (eod) {
+    if (ipcio_update_block_write (ctx->hdu->data_block, bytes_written) < 0) {
+      syslog (LOG_ERR, "close_buffer: ipcio_update_block_write failed");
+      return -1;
+    }
+  }
+  else if (ipcio_close_block_write (ctx->hdu->data_block, bytes_written) < 0) {
+    syslog (LOG_ERR, "close_buffer: ipcio_close_block_write failed");
+    return -1;
+  }
+
+  // the block is gone: clear the shared pointer and the open flag
+  wblock = 0;
+  ctx->block_open = 0;
+  return 0;
+}
+
+/*
+ *  move to the next ring buffer element: close the current block as a full
+ *  block, then open the next one. returns 0 on success, -1 on failure
+ */
+int dsaX_udpdb_new_buffer (dsaX_write_t * ctx);
+int dsaX_udpdb_new_buffer (dsaX_write_t * ctx)
+{
+  if (DEBUG)
+    syslog (LOG_DEBUG, "dsaX_udpdb_new_buffer()");
+
+  // the block being retired is reported as completely filled (hdu_bufsz bytes, eod = 0)
+  const int closed = dsaX_udpdb_close_buffer (ctx, ctx->hdu_bufsz, 0);
+  if (closed < 0) {
+    syslog (LOG_ERR, "new_buffer: dsaX_udpdb_close_buffer failed");
+    return -1;
+  }
+
+  const int opened = dsaX_udpdb_open_buffer (ctx);
+  if (opened < 0) {
+    syslog (LOG_ERR, "new_buffer: dsaX_udpdb_open_buffer failed");
+    return -1;
+  }
+  return 0;
+}
+
+// increment counters when block is full
+void dsaX_udpdb_increment (udpdb_t * ctx);
+void dsaX_udpdb_increment (udpdb_t * ctx)
+{
+  // Slide the shared block window forward by one block and restart the packet count.
+  // block_end_byte ends up on the first byte of the final packet of the new block.
+  const uint64_t next_start = block_end_byte + UDP_DATA;
+  block_start_byte = next_start;
+  block_end_byte = next_start + (ctx->packets_per_buffer - 1) * UDP_DATA;
+  block_count = 0;
+  writeBlock++;
+}
+
+
+
+/* --------- THREADS -------- */
+
+// STATS THREAD
+
+/* 
+ *  Thread to print simple capture statistics
+ */
+void stats_thread(void * arg) {
+
+  // Roughly once per second, report the receive rate and the drop counters
+  // to syslog. arg is the dsaX_stats_t whose byte/packet counters are shared
+  // with the receive threads. Runs until quit_threads becomes non-zero.
+  dsaX_stats_t * stats = (dsaX_stats_t *) arg;
+
+  // byte totals as they stood at the previous report
+  uint64_t prev_rcv_bytes = 0;
+  uint64_t prev_drp_bytes = 0;
+
+  syslog(LOG_INFO,"starting stats thread...");
+  sleep(2);
+  syslog(LOG_INFO,"started stats thread...");
+
+  for (;;)
+  {
+    if (quit_threads)
+      break;
+
+    /* snapshot the shared totals as quickly as possible */
+    const uint64_t now_rcv_bytes = stats->bytes->received;
+    const uint64_t now_drp_bytes = stats->bytes->dropped;
+
+    /* bytes accumulated since the previous report */
+    const uint64_t delta_rcv = now_rcv_bytes - prev_rcv_bytes;
+    const uint64_t delta_drp = now_drp_bytes - prev_drp_bytes;
+
+    /* remember the totals for the next pass */
+    prev_rcv_bytes = now_rcv_bytes;
+    prev_drp_bytes = now_drp_bytes;
+
+    /* drop rate, reported as MB/s */
+    const float drop_mb_per_s = (double) delta_drp / 1000000;
+
+    /* receive rate, reported as Gb/s: bytes to bits, then scale down */
+    float recv_gb_per_s = delta_rcv * 8;
+    recv_gb_per_s /= 1000000000;
+
+    /* one CAPSTATS line: rate, drop rate, dropped packets, last seq_no, skip count */
+    syslog (LOG_NOTICE,
+            "CAPSTATS %6.3f [Gb/s], D %4.1f [MB/s], D %"PRIu64" pkts, %"PRIu64" skipped %d",
+            recv_gb_per_s,
+            drop_mb_per_s,
+            stats->packets->dropped,
+            last_seq,
+            skipct);
+
+    sleep(1);
+  }
+}
+
+// CONTROL THREAD
+
+void control_thread (void * arg) {
+
+  // Waits for UDP control datagrams of the form "<COMMAND>-<VALUE>" on
+  // iP:CAPTURE_CONTROL_PORT; UTC_START and UTC_STOP set the globals of that name. arg is unused.
+  syslog(LOG_INFO, "control_thread: starting");
+
+  // port on which to listen for control commands
+  int port = CAPTURE_CONTROL_PORT;
+  char sport[10];
+  sprintf(sport,"%d",port);
+
+  // buffer for incoming command strings, and setup of socket
+  int bufsize = 1024;
+  char* buffer = (char *) malloc (sizeof(char) * bufsize);
+  assert(buffer != NULL);
+
+  struct addrinfo hints;
+  struct addrinfo* res=0;
+  memset(&hints,0,sizeof(hints));
+  struct sockaddr_storage src_addr;
+  socklen_t src_addr_len;
+  hints.ai_family=AF_INET;
+  hints.ai_socktype=SOCK_DGRAM;
+  if (getaddrinfo(iP,sport,&hints,&res) != 0 || !res) {
+    // res is dereferenced in the loop below, so a failed lookup ends the thread
+    syslog(LOG_ERR, "control_thread: getaddrinfo failed for %s:%s", iP, sport);
+    free (buffer);
+    pthread_exit(NULL);
+  }
+  int fd;
+  ssize_t ct;
+  char *endptr;
+
+  syslog(LOG_INFO, "control_thread: created socket on port %d", port);
+
+  while (!quit_threads) {
+
+    fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol);
+    bind(fd,res->ai_addr,res->ai_addrlen);
+    memset(buffer,'\0',bufsize);        // whole buffer; sizeof(buffer) is only the pointer size
+    src_addr_len = sizeof(src_addr);    // value-result argument, reset before each call
+    syslog(LOG_INFO, "control_thread: waiting for packet");
+    // receive at most bufsize-1 bytes so the string stays NUL-terminated
+    ct = recvfrom(fd,buffer,bufsize-1,0,(struct sockaddr*)&src_addr,&src_addr_len);
+    close(fd);
+    syslog(LOG_INFO, "control_thread: received buffer string %s",buffer);
+
+    // INTERPRET BUFFER STRING
+    // recognised commands: UTC_START, UTC_STOP
+    char * rest = buffer;
+    char *cmd, *val;
+    cmd = strtok_r(rest, "-", &rest);
+    val = strtok_r(rest, "-", &rest);
+    if (!cmd || !val) {
+      // empty or malformed datagram: strcmp/strtoull must not be given NULL
+      syslog(LOG_ERR, "control_thread: ignoring malformed command");
+      continue;
+    }
+    syslog(LOG_INFO, "control_thread: split into COMMAND %s, VALUE %s",cmd,val);
+
+    if (strcmp(cmd,"UTC_START")==0)
+      UTC_START = strtoull(val,&endptr,0);
+
+    if (strcmp(cmd,"UTC_STOP")==0)
+      UTC_STOP = strtoull(val,&endptr,0);
+  }
+
+  freeaddrinfo (res);
+  free (buffer);
+
+  syslog(LOG_INFO, "control_thread: exiting");
+
+  pthread_exit(NULL);   // was a pointer to a stack local, which dangles once the thread ends
+}
+
+
+/* 
+ *  Thread to capture data
+ */
+void recv_thread(void * arg) {
+
+  // Receive loop for one capture thread; arg is this thread's udpdb_t.
+  // The thread pins itself to cores[thread_id], opens its own UDP socket and
+  // copies each packet payload into the shared staging area (tblock) at the
+  // offset derived from the packet's sequence number and antenna id. Packets
+  // that belong to a later block are parked in a per-thread queue and replayed
+  // once the block window has moved on.
+  udpdb_t * udpdb = (udpdb_t *) arg;
+  int thread_id = udpdb->thread_id;
+
+  // set affinity
+  const pthread_t pid = pthread_self();
+  const int core_id = cores[thread_id];
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(core_id, &cpuset);
+  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (set_result != 0)
+    syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id);
+  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (get_affinity != 0)
+    syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id);
+  if (CPU_ISSET(core_id, &cpuset))
+    syslog(LOG_INFO,"thread %d: successfully set thread",core_id);
+
+  // set up socket; treat both a null and a -1 pointer as "could not bind"
+  dsaX_sock_t * sock = dsaX_make_sock(udpdb);
+  if (sock == NULL || sock == (dsaX_sock_t *) -1) {
+    syslog(LOG_ERR, "recv thread %d: could not set up socket", thread_id);
+    return;
+  }
+
+  // DEFINITIONS
+  uint64_t tpack = 0;
+  uint64_t act_seq_no = 0;
+  uint64_t block_seq_no = 0;
+  uint64_t seq_no = 0;
+  uint64_t ant_id = 0;
+  ssize_t got = 0; // data received from a recv_from call (signed: -1 on error)
+  int errsv; // errno saved right after a failed recvfrom
+  int64_t byte_offset = 0; // offset of current packet in bytes from start of block
+  uint64_t seq_byte = 0; // offset of current packet in bytes from start of obs
+  // for "saving" out of order packets near edges of blocks
+  unsigned int temp_idx = 0;
+  unsigned int temp_max = 500;
+  char ** temp_buffers;
+  uint64_t * temp_seq_byte;
+  temp_buffers = (char **)malloc(sizeof(char *)*temp_max);
+  for (unsigned k=0;k<temp_max;k++) temp_buffers[k] = (char *)malloc(sizeof(char)*UDP_DATA);
+  temp_seq_byte = (uint64_t *)malloc(sizeof(uint64_t)*temp_max);
+  unsigned i = 0;
+  uint64_t timeouts = 0;
+  int canWrite = 0;
+  int mod_WB;
+
+  // loop to receive packets until asked to quit
+  while (!quit_threads)
+    {
+
+      sock->have_packet = 0;
+
+      // incredibly tight loop to try and get a packet (also ends when asked to quit)
+      while (!sock->have_packet && !quit_threads)
+        {
+
+          // receive 1 packet into the socket buffer
+          got = recvfrom ( sock->fd, sock->buf, UDP_PAYLOAD, 0, NULL, NULL );
+
+          if (got == UDP_PAYLOAD)
+            {
+              sock->have_packet = 1;
+            }
+          else if (got == -1)
+            {
+              errsv = errno;
+              if (errsv == EAGAIN)
+                {
+                  // non-blocking socket with nothing queued: poll again
+                  if (capture_started)
+                    timeouts++;
+                }
+              else
+                {
+                  // real socket error: log it and leave through the cleanup
+                  // path (this function returns void, so no status value)
+                  syslog (LOG_ERR, "receive_obs: recvfrom failed %s", strerror(errsv));
+                  goto cleanup;
+                }
+            }
+          else // we received a packet of the WRONG size, ignore it
+            {
+              syslog (LOG_NOTICE, "receive_obs: received %zd bytes, expected %d", got, UDP_PAYLOAD);
+            }
+        }
+      timeouts = 0;
+
+      // we have a valid packet within the timeout
+      if (sock->have_packet)
+        {
+
+          // decode packet header (64 bits)
+          // 35 bits seq_no (for first spectrum in packet); 13 bits ch_id (for first channel in packet); 16 bits ant ID (for first antenna in packet)
+          seq_no = 0;
+          seq_no |=  (((uint64_t)(sock->buf[4]) & 224) >> 5) & 7;
+          seq_no |=  (((uint64_t)(sock->buf[3])) << 3) & 2040;
+          seq_no |=  (((uint64_t)(sock->buf[2])) << 11) & 522240;
+          seq_no |=  (((uint64_t)(sock->buf[1])) << 19) & 133693440;
+          seq_no |=  (((uint64_t)(sock->buf[0])) << 27) & 34225520640;
+          ant_id = 0;
+          ant_id |= (unsigned char) (sock->buf[6]) << 8;
+          ant_id |= (unsigned char) (sock->buf[7]);
+
+          act_seq_no = seq_no*NCHANG*NSNAPS/2 + ant_id*NCHANG/3; // actual seq no
+          block_seq_no = UTC_START*NCHANG*NSNAPS/2; // seq no corresponding to ant 0 and start of block
+
+          // set shared last_seq
+          pthread_mutex_lock(&mutex);
+          last_seq = seq_no;
+          pthread_mutex_unlock(&mutex);
+
+          // check for starting or stopping condition, using continue
+          if (canWrite==0) {
+            if (seq_no >= UTC_START-50 && UTC_START != 10000) {
+              canWrite=1;
+            }
+          }
+          if (canWrite == 0) continue;
+
+          // threadsafe start of capture
+          pthread_mutex_lock(&mutex);
+          if (!(capture_started))
+            {
+              block_start_byte = block_seq_no * UDP_DATA;
+              block_end_byte   = (block_start_byte + udpdb->hdu_bufsz) - UDP_DATA;
+              capture_started = 1;
+
+              syslog (LOG_INFO, "receive_obs: START [%"PRIu64" - %"PRIu64"]", block_start_byte, block_end_byte);
+            }
+          pthread_mutex_unlock(&mutex);
+
+          // if capture running
+          if (capture_started)
+            {
+              seq_byte = (act_seq_no * UDP_DATA);
+              tpack++;
+
+              // packet belongs in this block
+              if ((seq_byte <= block_end_byte) && (seq_byte >= block_start_byte))
+                {
+                  byte_offset = seq_byte - (block_start_byte);
+                  mod_WB = writeBlock % 64;
+                  memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, sock->buf + UDP_HEADER, UDP_DATA);
+                  pthread_mutex_lock(&mutex);
+                  block_count++;
+                  pthread_mutex_unlock(&mutex);
+                }
+              // packet belongs in subsequent block
+              else if (seq_byte > block_end_byte)
+                {
+                  if (temp_idx < temp_max)
+                    {
+                      // save packet to temp buffer
+                      memcpy (temp_buffers[temp_idx], sock->buf + UDP_HEADER, UDP_DATA);
+                      temp_seq_byte[temp_idx] = seq_byte;
+                      temp_idx++;
+                    }
+                }
+            }
+
+          // threadsafe end of block
+          pthread_mutex_lock(&mutex);
+          if ((block_count >= udpdb->packets_per_buffer) || (temp_idx >= temp_max))
+            {
+              syslog (LOG_INFO, "BLOCK COMPLETE thread_id=%d, seq_no=%"PRIu64", "
+                      "ant_id=%"PRIu64", block_count=%"PRIu64", "
+                      "temp_idx=%u, writeBlock=%d", thread_id, seq_no, ant_id,  block_count,
+                      temp_idx,writeBlock);
+
+              // write block
+              // check whether doWrite has been released. If not, skip this block
+              if (doWrite==1) skipBlock=1;
+              else doWrite=1;
+
+              uint64_t dropped = udpdb->packets_per_buffer - (block_count);
+              udpdb->packets->received += (block_count);
+              udpdb->bytes->received += (block_count) * UDP_DATA;
+              if (dropped)
+                {
+                  udpdb->packets->dropped += dropped;
+                  udpdb->bytes->dropped += (dropped * UDP_DATA);
+                }
+
+              // increment counters
+              dsaX_udpdb_increment(udpdb);
+
+              // write temp queue for this thread
+              tpack = 0;
+
+              for (i=0; i < temp_idx; i++)
+                {
+                  seq_byte = temp_seq_byte[i];
+                  byte_offset = seq_byte - (block_start_byte);
+                  if (byte_offset < udpdb->hdu_bufsz && byte_offset >= 0)
+                    {
+                      mod_WB = writeBlock % 64;
+                      memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, temp_buffers[i], UDP_DATA);
+                      block_count++;   // the mutex is already held here
+                    }
+                }
+              temp_idx = 0;
+            }
+          pthread_mutex_unlock(&mutex);
+
+          // at this stage, can try and write temp queue safely for other threads
+          // (temp_idx is tested first so an empty queue never reads temp_seq_byte[0])
+          if (temp_idx > 0 && temp_seq_byte[0] >= block_start_byte && temp_seq_byte[0] <= block_end_byte)
+            {
+              tpack = 0;
+
+              for (i=0; i < temp_idx; i++)
+                {
+                  seq_byte = temp_seq_byte[i];
+                  byte_offset = seq_byte - (block_start_byte);
+                  if (byte_offset < udpdb->hdu_bufsz && byte_offset >= 0)
+                    {
+                      mod_WB = writeBlock % 64;
+                      memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, temp_buffers[i], UDP_DATA);
+                      pthread_mutex_lock(&mutex);
+                      block_count++;
+                      pthread_mutex_unlock(&mutex);
+                    }
+                }
+              temp_idx = 0;
+            }
+        }
+
+      // packet has been inserted or saved by this point
+      sock->have_packet = 0;
+
+    }
+
+ cleanup:
+  // release the socket, its struct and every saved-packet slot
+  dsaX_free_sock(sock);
+  free(sock);
+  for (i=0; i < temp_max; i++) free(temp_buffers[i]);
+  free(temp_buffers);
+  free(temp_seq_byte);
+}
+
+/* 
+ *  Thread to write data
+ */
+void write_thread(void * arg) {
+
+  // Each of the nwth write threads copies its own 1/nwth slice of a completed
+  // block from the staging area (tblock) into the open dada block (wblock).
+  // Thread 0 then waits for every slice, moves on to a new dada block and
+  // clears the handshake flags. arg is this thread's dsaX_write_t.
+  dsaX_write_t * udpdb = (dsaX_write_t *) arg;
+  int thread_id = udpdb->thread_id;
+
+  // set affinity
+  const pthread_t pid = pthread_self();
+  const int core_id = write_cores[thread_id];
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(core_id, &cpuset);
+  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (set_result != 0)
+    syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id);
+  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (get_affinity != 0)
+    syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id);
+  if (CPU_ISSET(core_id, &cpuset))
+    syslog(LOG_INFO,"thread %d: successfully set thread",core_id);
+
+  int mod_WB = 0;
+
+  while (!quit_threads)
+  {
+
+    // spin until a receive thread flags a completed block; quit_threads is
+    // checked too, so shutdown cannot leave this thread spinning forever
+    while (!doWrite && !quit_threads) { }
+    if (quit_threads) break;
+
+    // assume everything is set up
+    // wblock is assigned, write_ct=0
+
+    mod_WB = lWriteBlock % 64;
+    memcpy((char *) wblock + thread_id*udpdb->hdu_bufsz/nwth, udpdb->tblock + mod_WB*udpdb->hdu_bufsz  + thread_id*udpdb->hdu_bufsz/nwth, udpdb->hdu_bufsz/nwth);
+
+    pthread_mutex_lock(&mutex);
+    write_ct++;
+    pthread_mutex_unlock(&mutex);
+
+    // now wait until thread 0 has finished getting a new block before moving on
+    if (thread_id>0) {
+      while (write_ct!=0 && !quit_threads) { }
+    }
+    else {
+
+      // wait for all sub-blocks to be written
+      while (write_ct<nwth && !quit_threads) { }
+      if (quit_threads) break;
+
+      // get new block
+      if (dsaX_udpdb_new_buffer (udpdb) < 0)
+        {
+          syslog(LOG_ERR, "receive_obs: dsaX_udpdb_new_buffer failed");
+          return;  // the function returns void, so no EXIT_FAILURE value here
+        }
+
+      syslog(LOG_INFO,"write thread %d: written block... %d",thread_id,lWriteBlock);
+      lWriteBlock++;
+
+      // check for skipBlock - only log existence
+      if (skipBlock) {
+        skipct++;
+      }
+
+      // update doWrite and skipBlock
+      doWrite=0;
+      skipBlock=0;
+      write_ct = 0;
+    }
+  }
+}
+
+
+	    
+// MAIN of program
+	
+int main (int argc, char *argv[]) {
+
+  // Entry point: parse the options, connect to the output dada buffer and write
+  // its header, then start the control, stats, receive and write threads.
+  openlog ("dsaX_capture_manythread", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  /* DADA Header plus Data Unit for writing */
+  dada_hdu_t* hdu_out = 0;
+
+  // output data block HDU key (-o overrides the default)
+  key_t out_key = CAPTURE_BLOCK_KEY;
+
+  // command line arguments
+  int core = -1;
+  int chgroup = 0;
+  int arg=0;
+  char dada_fnam[200] = ""; // filename for dada header
+  char iface[100] = ""; // IP for data packets
+
+  while ((arg=getopt(argc,argv,"c:j:i:f:o:g:dh")) != -1)
+    {
+      switch (arg)
+        {
+        case 'o':
+          if (optarg)
+            {
+              unsigned int parsed_key = 0; // %x stores into an unsigned int, not a key_t
+              if (sscanf (optarg, "%x", &parsed_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              out_key = (key_t) parsed_key;
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-o flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'i':
+          if (optarg)
+            {
+              snprintf(iP, sizeof(iP), "%s", optarg); // bounded copy into the fixed-size buffer
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-i flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'g':
+          if (optarg)
+            {
+              chgroup = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-g flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'j':
+          if (optarg)
+            {
+              snprintf(iface, sizeof(iface), "%s", optarg); // bounded copy into the fixed-size buffer
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-j flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'c':
+          if (optarg)
+            {
+              core = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-c flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'f':
+          if (optarg)
+            {
+              snprintf(dada_fnam, sizeof(dada_fnam), "%s", optarg); // bounded copy into the fixed-size buffer
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-f flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'd':
+          DEBUG=1;
+          syslog (LOG_DEBUG, "Will excrete all debug messages");
+          break;
+        case 'h':
+          usage();
+          return EXIT_SUCCESS;
+        }
+    }
+
+  // START THREADS
+
+  // start control thread
+  int rval = 0;
+  pthread_t control_thread_id;
+  udpdb_t temp_str;
+  rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &temp_str);
+  if (rval != 0) {
+    syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval));
+    return -1;
+  }
+  syslog(LOG_NOTICE, "Created control thread, listening on %s:%d",iP,CAPTURE_CONTROL_PORT);
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+        syslog(LOG_ERR,"failed to bind to core %d", core);
+      else // only claim success when the bind actually worked
+        syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+  // OPEN CONNECTION TO DADA DB FOR WRITING
+
+  if (DEBUG) syslog(LOG_DEBUG,"Creating HDU");
+
+  hdu_out  = dada_hdu_create ();
+  if (DEBUG) syslog(LOG_DEBUG,"Created hdu");
+  dada_hdu_set_key (hdu_out, out_key); // honour -o; out_key defaults to CAPTURE_BLOCK_KEY
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog(LOG_ERR,"could not connect to output dada buffer");
+    return EXIT_FAILURE;
+  }
+  if (DEBUG) syslog(LOG_DEBUG,"Connected HDU");
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    dsaX_dbgpu_cleanup (hdu_out);
+    syslog(LOG_ERR,"could not lock to output dada buffer");
+    return EXIT_FAILURE;
+  }
+
+  syslog(LOG_INFO,"opened connection to output DB");
+
+  // DEAL WITH DADA HEADER
+  char *hout;
+  hout = (char *)malloc(sizeof(char)*4096);
+  if (DEBUG) syslog(LOG_DEBUG,"read header2");
+
+  if (fileread (dada_fnam, hout, 4096) < 0)
+    {
+      free (hout);
+      syslog (LOG_ERR, "could not read ASCII header from %s", dada_fnam);
+      return (EXIT_FAILURE);
+    }
+
+  if (DEBUG) syslog(LOG_DEBUG,"read header3");
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      free (hout);
+      dsaX_dbgpu_cleanup (hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  // copy the in header to the out header; the scratch copy is not needed afterwards
+  memcpy (header_out, hout, 4096);
+  free (hout);
+
+  // mark the output header buffer as filled
+  if (ipcbuf_mark_filled (hdu_out->header_block, 4096) < 0)
+    {
+      syslog(LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  // record STATE info
+  sprintf(STATE,"LISTEN");
+  syslog(LOG_INFO,"marked output header block as filled - now in LISTEN state");
+
+  /* time to start up receiver.
+     data are captured on iface:CAPTURE_PORT
+  */
+
+  // make recv, write, and stats structs
+  udpdb_t udpdb[nth];
+  dsaX_stats_t stats;
+  dsaX_write_t writey[nwth];
+
+  // shared variables and memory
+  uint64_t bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  char * tblock = (char *)malloc(sizeof(char)*bufsz*64); // 64 block-sized slots, indexed with (block number % 64)
+  if (!tblock) {
+    syslog(LOG_ERR,"could not allocate %"PRIu64" byte staging buffer", bufsz*64);
+    dsaX_dbgpu_cleanup (hdu_out);
+    return EXIT_FAILURE;
+  }
+  stats_t * packets = init_stats_t();
+  stats_t * bytes = init_stats_t();
+  reset_stats_t(packets);
+  reset_stats_t(bytes);
+
+  // initialise stats struct
+  stats.packets = packets;
+  stats.bytes = bytes;
+
+  // initialise writey struct and open buffer
+  for (int i=0;i<nwth;i++) {
+    writey[i].hdu = hdu_out;
+    writey[i].hdu_bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+    writey[i].block_open = 0;
+    writey[i].tblock = tblock;
+    writey[i].thread_id = i;
+  }
+  dsaX_udpdb_open_buffer (&writey[0]);
+
+  // initialise all udpdb structs
+  for (int i=0;i<nth;i++) {
+    memset(&udpdb[i], 0, sizeof(udpdb[i])); // fields not assigned below (e.g. log) start as zero, not stack garbage
+    // shared stuff
+    udpdb[i].packets = packets;
+    udpdb[i].bytes = bytes;
+    udpdb[i].tblock = tblock;
+    // the rest
+    udpdb[i].port = CAPTURE_PORT;
+    udpdb[i].interface = strdup(iface);
+    udpdb[i].hdu_bufsz = bufsz;
+    udpdb[i].packets_per_buffer = udpdb[i].hdu_bufsz / UDP_DATA;
+    udpdb[i].num_inputs = NSNAPS;
+    udpdb[i].verbose = 0;
+    udpdb[i].rcv_sleeps = 0;
+    udpdb[i].thread_id = i;
+  }
+
+  /* start threads */
+  // start the stats thread
+  pthread_t stats_thread_id;
+  rval = pthread_create (&stats_thread_id, 0, (void *) stats_thread, (void *) &stats);
+  if (rval != 0) {
+    syslog(LOG_INFO, "Error creating stats_thread: %s", strerror(rval));
+    return -1;
+  }
+  syslog(LOG_NOTICE, "started stats_thread()");
+
+  // start the receive threads
+  pthread_t recv_thread_id[nth];
+  for (int i=0;i<nth;i++) {
+    rval = pthread_create (&recv_thread_id[i], 0, (void *) recv_thread, (void *) (&udpdb[i]));
+    if (rval != 0) {
+      syslog(LOG_ERR, "Error creating recv_thread %d: %s", i,strerror(rval));
+      return -1;
+    }
+  }
+  syslog(LOG_NOTICE, "Created recv threads");
+
+  // start the write thread
+  pthread_t write_thread_id[nwth];
+  for (int i=0;i<nwth;i++) {
+    rval = pthread_create (&write_thread_id[i], 0, (void *) write_thread, (void *) (&writey[i]));
+    if (rval != 0) {
+      syslog(LOG_INFO, "Error creating write_thread: %s", strerror(rval));
+      return -1;
+    }
+  }
+  syslog(LOG_NOTICE, "started write threads");
+
+  while (!quit_threads) {
+    sleep(1);
+  }
+
+  // close threads
+  syslog(LOG_INFO, "joining all threads");
+  quit_threads = 1;
+  void* result=0;
+  pthread_join (control_thread_id, &result);
+  pthread_join (stats_thread_id, &result);
+  for (int i=0;i<nth;i++) pthread_join(recv_thread_id[i], &result);
+  for (int i=0;i<nwth;i++) pthread_join(write_thread_id[i], &result);
+
+  free(tblock);
+  dsaX_dbgpu_cleanup (hdu_out);
+  return EXIT_SUCCESS;
+}
diff --git a/legacy/dsaX_capture_manythread.h b/legacy/dsaX_capture_manythread.h
new file mode 100644
index 0000000..3c96648
--- /dev/null
+++ b/legacy/dsaX_capture_manythread.h
@@ -0,0 +1,119 @@
+/***************************************************************************
+ *  
+ *    Copyright (C) 2009 by Andrew Jameson
+ *    Licensed under the Academic Free License version 2.1
+ * 
+ ****************************************************************************/
+
+#ifndef __DSAX_UDPDB_THREAD_H
+#define __DSAX_UDPDB_THREAD_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <time.h>
+#include <errno.h>
+#include <assert.h>
+#include <netinet/in.h>
+#include <signal.h>
+#include <inttypes.h>
+#include <sys/types.h>
+
+#include "futils.h"
+#include "dada_hdu.h"
+#include "dada_pwc_main.h"
+#include "multilog.h"
+#include "ipcio.h"
+#include "ascii_header.h"
+#include "dada_udp.h"
+
+#include "dsaX_def.h"
+
+/* socket buffer for receiving udp data */
+// this is initialised in each recv thread
+typedef struct {
+
+  int           fd;            // FD of the socket
+  size_t        bufsz;         // size of socket buffer
+  char *        buf;          // the socket buffer
+  int           have_packet;   // non-zero once buf holds a complete packet (set in the recv loop)
+  size_t        got;           // amount of data received
+
+} dsaX_sock_t;
+
+dsaX_sock_t * dsaX_init_sock ();
+void dsaX_free_sock(dsaX_sock_t* b);
+
+/* Number of UDP packets to be received for a call to buffer_function */
+#define NOTRECORDING 0
+#define RECORDING 1
+
+// structure for write thread
+// tblock must be shared
+typedef struct {
+
+  dada_hdu_t *      hdu;                // DADA Header + Data Unit
+  uint64_t          hdu_bufsz;          // size in bytes of one data block of hdu
+  unsigned          block_open;        // if the current data block element is open
+  char            * block;             // pointer to current datablock buffer
+  char            * tblock;            // shared staging memory: filled by the recv threads, copied out by the write threads
+  int               thread_id;          // index of the write thread that owns this struct
+
+} dsaX_write_t;
+
+// structure for stats thread
+// both are shared between all recv structures and this one
+// last_seq is also shared
+typedef struct {
+
+  /* Packet and byte statistics */
+  stats_t * packets;                       // packet counters (received / dropped)
+  stats_t * bytes;                         // byte counters (received / dropped)
+  uint64_t * last_seq;                     // most recently received seq number
+
+} dsaX_stats_t;
+
+
+// structure for receive thread
+// tblock, packets, bytes, last_seq, block_start_byte, block_end_byte, block_count, capture_started
+typedef struct {
+
+  multilog_t *      log;                // DADA logging interface
+  int               verbose;            // verbosity flag 
+
+  int               port;               // port to receive UDP data 
+  int               control_port;       // port to receive control commands
+  char *            interface;          // IP Address to accept packets on 
+
+  // configuration for number of inputs
+  unsigned int      num_inputs;         // number of antennas / inputs
+
+  // datablock management
+  uint64_t        * block_start_byte;  // seq_byte of first byte for the block
+  uint64_t        * block_end_byte;    // seq_byte of first byte of final packet of the block
+  uint64_t        * block_count;       // number of packets in this block  
+  uint64_t          hdu_bufsz;          // size in bytes of one data block
+  char            * tblock;            // area of memory to write to
+  
+  // packets
+  unsigned        * capture_started;      // flag for start of UDP data
+  uint64_t          packets_per_buffer;   // number of UDP packets per datablock buffer
+
+  /* Packet and byte statistics */
+  stats_t * packets;                       // packet counters (received / dropped)
+  stats_t * bytes;                         // byte counters (received / dropped)
+  uint64_t rcv_sleeps;
+
+  uint64_t * last_seq;                     // most recently received seq number
+  struct   timeval timeout;
+  int thread_id;                           // index of the receive thread that owns this struct
+
+} udpdb_t;
+
+void signal_handler (int signalValue); 
+void stats_thread(void * arg);
+void control_thread(void * arg);
+
+#endif
diff --git a/legacy/dsaX_capture_pcap.c b/legacy/dsaX_capture_pcap.c
new file mode 100644
index 0000000..4921c68
--- /dev/null
+++ b/legacy/dsaX_capture_pcap.c
@@ -0,0 +1,852 @@
+/* dsaX_capture_pcap.c: Code to capture packets using pf_ring aware pcap and write to a dada buffer.
+
+control and stats threads: standard threads
+recv thread: simply runs pcap_loop, passing packets to callback function
+packet_callback: places packets directly into dada buffer, or temp buffer. gets new buffer if needed
+
+everything is in the dsaX_t structure
+
+
+*/
+
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture_pcap.h"
+#include "dsaX_def.h"
+#include "pcap.h"
+
+/* global variables */
+int quit_threads = 0;                    // loop-exit flag polled by every thread and by main
+char STATE[20];                          // state name; main sets it to "LISTEN"
+uint64_t UTC_START = 10000;              // set by control_thread; packet_callback treats 10000 as "not yet set"
+uint64_t UTC_STOP = 40000000000;         // set by control_thread
+int MONITOR = 0;
+char iP[100];                            // address from -i; control_thread resolves and binds it
+int DEBUG = 0;                           // set by -d; enables the LOG_DEBUG traces
+int HISTOGRAM[16];
+int cores[2] = {17,19};                  // cores[0]: pcap_thread affinity, cores[1]: write_thread affinity
+pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+volatile int canWrite = 0;               // set to 1 by packet_callback once seq_no reaches UTC_START-500
+volatile  unsigned capture_started = 0;  // set to 1 by packet_callback when the first block window is defined
+volatile char * wblock;                  // currently open DADA block; set by open_buffer, zeroed by close_buffer
+volatile uint64_t last_seq;              // seq_no of the most recently parsed packet
+const int nth = 1;                       // number of receive (pcap) threads started by main
+const int nwth = 1;                      // number of write threads started by main
+const int TEMP_MAXY = 1000;              // capacity, in packets, of the temp buffer for packets past the current block
+volatile int skipped = 0;                // incremented when a staging slot is completed while still flagged in writeBlock
+const int NBLOCKS = 8;                   // number of staging slots in tblock
+volatile uint64_t writeBlock[8] = {0, 0, 0, 0, 0, 0, 0, 0};  // per-slot flag: 1 = filled, waiting for write_thread (one entry per slot, i.e. NBLOCKS)
+volatile int delayBlock = 0;             // incremented by write_thread for each slot it has copied out
+volatile int behindBlock = 0;            // nblocks_written - delayBlock, refreshed by packet_callback
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * out);
+int dada_bind_thread_to_core (int core);
+void usage();
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * out)
+{
+  // Release the write lock on the output HDU, then destroy the HDU.
+  if (dada_hdu_unlock_write (out) < 0)
+    {
+      syslog(LOG_ERR, "could not unlock write on hdu_out");
+    }
+  // a failed unlock is only logged; the HDU is destroyed either way
+  dada_hdu_destroy (out);
+}
+
+void usage()
+{ /* write the command-line synopsis verbatim to stdout */
+  static const char synopsis[] = "dsaX_capture [options]\n"
+    " -c core   bind process to CPU core [no default]\n"
+    " -i IP to listen on for control commands [no default]\n"
+    " -f filename of template dada header [no default]\n"
+    " -o out_key [default CAPTURE_BLOCK_KEY]\n"
+    " -d send debug messages to syslog\n"
+    " -h print usage\n";
+  fputs (synopsis, stdout);
+}
+
+/* 
+ *  open a data block buffer ready for direct access
+ */
+int dsaX_udpdb_open_buffer (dsaX_t * ctx);
+int dsaX_udpdb_open_buffer (dsaX_t * ctx)
+{
+  // Opens the next data block of ctx->hdu for writing and stores its address in
+  // the global wblock. Returns 0 on success, -1 if a block is already open or the open fails.
+  uint64_t opened_id = 0;   // receives the block id; not used afterwards
+  int status = -1;
+  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_open_buffer()");
+
+  if (ctx->block_open)
+  {
+    syslog (LOG_ERR, "open_buffer: buffer already opened");
+  }
+  else
+  {
+    if (DEBUG) syslog (LOG_DEBUG, "open_buffer: ipcio_open_block_write");
+    wblock = ipcio_open_block_write (ctx->hdu->data_block, &opened_id);
+    if (wblock)
+    {
+      ctx->block_open = 1;
+      status = 0;
+    }
+    else
+      syslog (LOG_ERR, "open_buffer: ipcio_open_block_write failed");
+  }
+  return status;
+}
+
+/*
+ *  close a data buffer, assuming a full block has been written
+ */
+int dsaX_udpdb_close_buffer (dsaX_t * ctx, uint64_t bytes_written, unsigned eod);
+int dsaX_udpdb_close_buffer (dsaX_t * ctx, uint64_t bytes_written, unsigned eod)
+{
+  // Hands the currently open block back to the DADA ring buffer.
+  //   bytes_written : byte count reported to ipcio for this block
+  //   eod           : non-zero -> ipcio_update_block_write, zero -> ipcio_close_block_write
+  // Returns 0 on success, -1 if no block is open or the ipcio call fails.
+  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_close_buffer(%"PRIu64", %d)", bytes_written, eod);
+
+  if (ctx->block_open == 0)
+  {
+    syslog (LOG_ERR, "close_buffer: buffer already closed");
+    return -1;
+  }
+
+  // report blocks that are not full (the 1 byte "EOD" buffer excepted); INFO at end of data, WARNING otherwise
+  const int is_partial = (bytes_written != 1) && (bytes_written != ctx->hdu_bufsz);
+  if (is_partial)
+    syslog ((eod ? LOG_INFO : LOG_WARNING), "close_buffer: "
+              "bytes_written[%"PRIu64"] != hdu_bufsz[%"PRIu64"]",
+              bytes_written, ctx->hdu_bufsz);
+
+  if (!eod)
+  {
+    if (ipcio_close_block_write (ctx->hdu->data_block, bytes_written) < 0)
+    {
+      syslog (LOG_ERR, "close_buffer: ipcio_close_block_write failed");
+      return -1;
+    }
+  }
+  else if (ipcio_update_block_write (ctx->hdu->data_block, bytes_written) < 0)
+  {
+    syslog (LOG_ERR, "close_buffer: ipcio_update_block_write failed");
+    return -1;
+  }
+
+  wblock = 0;
+  ctx->block_open = 0;
+  return 0;
+}
+
+/* 
+ *  move to the next ring buffer element. return pointer to base address of new buffer
+ */
+int dsaX_udpdb_new_buffer (dsaX_t * ctx);
+int dsaX_udpdb_new_buffer (dsaX_t * ctx)
+{
+  // Rotates to the next ring-buffer element: closes the open block as a full
+  // block (hdu_bufsz bytes, eod = 0) and then opens a fresh one.
+  // Returns 0 on success, -1 when either step fails.
+  int status = 0;
+  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_new_buffer()");
+
+  if (dsaX_udpdb_close_buffer (ctx, ctx->hdu_bufsz, 0) < 0)
+  {
+    syslog (LOG_ERR, "new_buffer: dsaX_udpdb_close_buffer failed");
+    status = -1;
+  }
+  else if (dsaX_udpdb_open_buffer (ctx) < 0)
+  {
+    syslog (LOG_ERR, "new_buffer: dsaX_udpdb_open_buffer failed");
+    status = -1;
+  }
+  return status;
+}
+
+// increment counters when block is full
+void dsaX_udpdb_increment (dsaX_t * ctx);
+void dsaX_udpdb_increment (dsaX_t * ctx)
+{
+  // Slide the block window forward: the new block starts UDP_DATA bytes after the
+  // old block_end_byte, spans packets_per_buffer packets, and its packet count restarts at 0.
+  const uint64_t next_start = ctx->block_end_byte + UDP_DATA;
+  ctx->block_count = 0;
+  ctx->block_start_byte = next_start;
+  ctx->block_end_byte = next_start + ( ctx->packets_per_buffer - 1) * UDP_DATA;
+}
+
+
+
+/* --------- THREADS -------- */
+
+// STATS THREAD
+
+/* 
+ *  Thread to print simple capture statistics
+ */
+void stats_thread(void * arg) {
+
+  // Periodic reporter: after a 2 s start-up pause it logs one CAPSTATS line per
+  // pass, sleeping 1 s between passes, until quit_threads becomes non-zero.
+  // arg must point at a dsaX_stats_t.
+  //
+  // Values in the CAPSTATS line, in order:
+  //   1. bits received since the previous pass, scaled by 1e-9   ("[Gb/s]")
+  //   2. bytes dropped since the previous pass, scaled by 1e-6   ("[MB/s]")
+  //   3. total dropped packets                                    ("pkts")
+  //   4. global last_seq (printed just before the word "skipped")
+  //   5. global behindBlock
+  //   6. global skipped
+  dsaX_stats_t * stats = (dsaX_stats_t *) arg;
+
+  // byte counters as seen on the previous pass
+  uint64_t prev_rcv_bytes = 0;
+  uint64_t prev_drp_bytes = 0;
+
+  syslog(LOG_INFO,"starting stats thread...");
+  sleep(2);
+  syslog(LOG_INFO,"started stats thread...");
+
+  for (; !quit_threads; sleep(1))
+  {
+
+    // snapshot both counters back to back
+    const uint64_t now_rcv_bytes = stats->bytes->received;
+    const uint64_t now_drp_bytes = stats->bytes->dropped;
+
+    // growth since the previous pass
+    const uint64_t rcv_delta = now_rcv_bytes - prev_rcv_bytes;
+    const uint64_t drp_delta = now_drp_bytes - prev_drp_bytes;
+
+    // remember the snapshot for the next pass
+    prev_rcv_bytes = now_rcv_bytes;
+    prev_drp_bytes = now_drp_bytes;
+
+    // drop rate: double division narrowed to float;
+    // receive rate: integer multiply by 8, then float division
+    const float drp_mbytes = (double) drp_delta / 1000000;
+    float rcv_gbits = rcv_delta * 8;
+    rcv_gbits /= 1000000000;
+
+    syslog (LOG_NOTICE,
+            "CAPSTATS %6.3f [Gb/s], D %4.1f [MB/s], D %"PRIu64" pkts, %"PRIu64" skipped %d %d",
+            rcv_gbits, drp_mbytes, stats->packets->dropped, last_seq, behindBlock, skipped);
+
+  }
+
+}
+
+// CONTROL THREAD
+
+void control_thread (void * arg) {
+  // Listens for UDP datagrams of the form "COMMAND-VALUE" on iP:CAPTURE_CONTROL_PORT
+  // and stores VALUE into the global UTC_START or UTC_STOP. arg is not used.
+  syslog(LOG_INFO, "control_thread: starting");
+
+  // port on which to listen for control commands
+  int port = CAPTURE_CONTROL_PORT;
+  char sport[10];
+  snprintf(sport,sizeof(sport),"%d",port);
+
+  // buffer for incoming command strings
+  const int bufsize = 1024;
+  char* buffer = (char *) malloc (sizeof(char) * bufsize);
+  if (!buffer) {
+    syslog(LOG_ERR, "control_thread: could not allocate command buffer");
+    pthread_exit(0);
+  }
+
+  // resolve the local address the control socket is bound to
+  struct addrinfo hints;
+  struct addrinfo* res=0;
+  memset(&hints,0,sizeof(hints));
+  hints.ai_family=AF_INET;
+  hints.ai_socktype=SOCK_DGRAM;
+  if (getaddrinfo(iP,sport,&hints,&res) != 0 || !res) {
+    syslog(LOG_ERR, "control_thread: getaddrinfo failed for %s:%s",iP,sport);
+    free (buffer);
+    pthread_exit(0);
+  }
+  struct sockaddr_storage src_addr;
+  socklen_t src_addr_len;
+  int fd;
+  ssize_t ct;
+  char *endptr;
+
+  syslog(LOG_INFO, "control_thread: created socket on port %d", port);
+
+  while (!quit_threads) {
+    fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol);
+    if (fd < 0 || bind(fd,res->ai_addr,res->ai_addrlen) < 0) {
+      syslog(LOG_ERR, "control_thread: could not open/bind control socket");
+      if (fd >= 0) close(fd);
+      sleep(1);   // do not spin while the address is unavailable
+      continue;
+    }
+    memset(buffer,'\0',bufsize);      // whole buffer; sizeof(buffer) would only be the pointer size
+    src_addr_len = sizeof(src_addr);  // value-result argument, reset before every call
+    syslog(LOG_INFO, "control_thread: waiting for packet");
+    // read at most bufsize-1 bytes so the zeroed last byte always terminates the string
+    ct = recvfrom(fd,buffer,bufsize-1,0,(struct sockaddr*)&src_addr,&src_addr_len);
+    close(fd);
+    if (ct <= 0) continue;   // error or empty datagram: nothing to interpret
+
+    syslog(LOG_INFO, "control_thread: received buffer string %s",buffer);
+
+    // interpret buffer string: UTC_START-<n> or UTC_STOP-<n>
+    char *save = 0;
+    char *cmd = strtok_r(buffer, "-", &save);
+    char *val = strtok_r(NULL, "-", &save);
+    if (!cmd || !val) {
+      syslog(LOG_WARNING, "control_thread: ignoring malformed command");
+      continue;
+    }
+    syslog(LOG_INFO, "control_thread: split into COMMAND %s, VALUE %s",cmd,val);
+
+    if (strcmp(cmd,"UTC_START")==0)
+      UTC_START = strtoull(val,&endptr,0);
+    else if (strcmp(cmd,"UTC_STOP")==0)
+      UTC_STOP = strtoull(val,&endptr,0);
+  }
+  freeaddrinfo (res);
+  free (buffer);
+  syslog(LOG_INFO, "control_thread: exiting");
+  pthread_exit(0);   // no result value: a pointer to a local would dangle once the thread is gone
+}
+
+/*
+This is important - packet callback function to place packets in buffer
+called upon single packet being received
+*/
+void packet_callback(u_char *args, const struct pcap_pkthdr* header, const u_char* packet) {
+  // pcap callback: args is the dsaX_t handed to pcap_loop, packet the captured frame.
+  dsaX_t * udpdb = (dsaX_t *) args;
+
+  // make sure packet has right length, was captured in full, and get payload
+  if (header->len != UDP_PAYLOAD + 42 || header->caplen < header->len) {
+    syslog(LOG_INFO,"received packet with length %d, total available %d",header->len,header->caplen);
+    return;
+  }
+  char *buf = (char *)(packet + 42);   // skip 42 bytes; presumably Ethernet + IPv4 + UDP headers — confirm
+
+  // process packet header (the masks also discard any sign extension of the plain char bytes)
+  uint64_t seq_no=0, ant_id=0;
+  seq_no |=  (((uint64_t)(buf[4]) & 224) >> 5) & 7;
+  seq_no |=  (((uint64_t)(buf[3])) << 3) & 2040;
+  seq_no |=  (((uint64_t)(buf[2])) << 11) & 522240;
+  seq_no |=  (((uint64_t)(buf[1])) << 19) & 133693440;
+  seq_no |=  (((uint64_t)(buf[0])) << 27) & 34225520640;
+  ant_id |= (unsigned char) (buf[6]) << 8;
+  ant_id |= (unsigned char) (buf[7]);
+  uint64_t act_seq_no = seq_no*NCHANG*NSNAPS/2 + ant_id*NCHANG/3; // actual seq no
+  uint64_t block_seq_no = UTC_START*NCHANG*NSNAPS/2; // seq no corresponding to ant 0 and start of block
+  last_seq = seq_no;
+
+  // check for starting condition (written as seq_no + 500 so a UTC_START below 500 cannot wrap)
+  if (canWrite==0) {
+    if (seq_no + 500 >= UTC_START && UTC_START != 10000) {
+      canWrite=1;
+    }
+  }
+  if (canWrite == 0) return;
+
+  // deal with start of capture
+  if (!(capture_started))
+    {
+      udpdb->block_start_byte = block_seq_no * UDP_DATA;
+      udpdb->block_end_byte   = (udpdb->block_start_byte + udpdb->hdu_bufsz) - UDP_DATA;
+      capture_started = 1;
+      syslog (LOG_INFO, "receive_obs: START [%"PRIu64" - %"PRIu64"]", udpdb->block_start_byte, udpdb->block_end_byte);
+    }
+
+  // if capture has started, do good stuff
+  uint64_t byte_offset, seq_byte;
+  if (capture_started) {
+
+    seq_byte = (act_seq_no * UDP_DATA);
+
+    // packet belongs in this block
+    if ((seq_byte <= udpdb->block_end_byte) && (seq_byte >= udpdb->block_start_byte))
+      {
+        byte_offset = seq_byte - (udpdb->block_start_byte);
+        memcpy(udpdb->tblock + udpdb->tblock_idx*NPACKETS_PER_BLOCK*NSNAPS*UDP_DATA + byte_offset, buf + UDP_HEADER, UDP_DATA);
+        // NOTE(review): assumes one staging slot (NPACKETS_PER_BLOCK*NSNAPS*UDP_DATA bytes) equals hdu_bufsz — confirm
+        udpdb->block_count++;
+      }
+    // packet belongs in subsequent block
+    else if (seq_byte > udpdb->block_end_byte)
+      {
+        if (udpdb->temp_idx < TEMP_MAXY)
+          {
+            // save packet to temp buffer
+            memcpy (udpdb->temp_buffers + udpdb->temp_idx*UDP_DATA, buf + UDP_HEADER, UDP_DATA);
+            udpdb->temp_seq_byte[udpdb->temp_idx] = seq_byte;
+            udpdb->temp_idx++;
+          }
+      }
+  }
+
+  // end of block: either the block is full or the temp buffer is
+  if ((udpdb->block_count >= udpdb->packets_per_buffer) || (udpdb->temp_idx >= TEMP_MAXY))
+    {
+      syslog (LOG_INFO, "BLOCK COMPLETE seq_no=%"PRIu64", "
+              "ant_id=%"PRIu64", block_count=%"PRIu64", "
+              "temp_idx=%d", seq_no, ant_id,
+              udpdb->block_count, udpdb->temp_idx);
+
+      // set write block on this block; a slot still flagged means write_thread has not drained it yet
+      if (writeBlock[udpdb->tblock_idx]==1)
+        skipped++;
+      writeBlock[udpdb->tblock_idx] = 1;
+
+      // increment tblock_idx
+      udpdb->tblock_idx+=1;
+      if (udpdb->tblock_idx==NBLOCKS)
+        udpdb->tblock_idx = 0;
+
+      // get delay_block
+      udpdb->nblocks_written++;
+      behindBlock = udpdb->nblocks_written - delayBlock;
+
+      // deal with counters (clamped so an over-full block cannot wrap the unsigned subtraction)
+      uint64_t dropped = (udpdb->block_count < udpdb->packets_per_buffer) ? udpdb->packets_per_buffer - udpdb->block_count : 0;
+      udpdb->packets->received += (udpdb->block_count);
+      udpdb->bytes->received += (udpdb->block_count) * UDP_DATA;
+      if (dropped)
+        {
+          udpdb->packets->dropped += dropped;
+          udpdb->bytes->dropped += (dropped * UDP_DATA);
+        }
+      dsaX_udpdb_increment(udpdb);
+
+      // write temp queue into the new block; entries outside the new window are discarded
+      for (int i=0; i < udpdb->temp_idx; i++) {
+        seq_byte = udpdb->temp_seq_byte[i];
+        byte_offset = seq_byte - udpdb->block_start_byte;   // unsigned: wraps to a huge value if below the window
+        if (byte_offset < udpdb->hdu_bufsz) {
+          memcpy(udpdb->tblock + udpdb->tblock_idx*NPACKETS_PER_BLOCK*NSNAPS*UDP_DATA + byte_offset, udpdb->temp_buffers + i*UDP_DATA, UDP_DATA);
+          udpdb->block_count++;
+        }
+      }
+      udpdb->temp_idx = 0;
+
+    }
+
+}
+
+// Thread to do writing
+
+void write_thread(void * arg) {
+  // Drains the staging area: waits for the next slot to be flagged in writeBlock,
+  // copies it into the open DADA block (global wblock), rotates to a new block and
+  // clears the flag. arg must point at a dsaX_t. Runs until quit_threads is set.
+  dsaX_t * udpdb = (dsaX_t *) arg;
+  const size_t slot_bytes = (size_t) UDP_DATA*NSNAPS*NPACKETS_PER_BLOCK;   // size_t so slot offsets cannot overflow int
+
+  // set affinity
+  const pthread_t pid = pthread_self();
+  const int core_id = cores[1];
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(core_id, &cpuset);
+  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (set_result != 0)
+    syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id);
+  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (get_affinity != 0)
+    syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id);
+  if (CPU_ISSET(core_id, &cpuset))
+    syslog(LOG_INFO,"thread %d: successfully set thread",core_id);
+
+  int lWriteBlock=0;
+  while (!quit_threads) {
+
+    // busywait for the slot, re-reading quit_threads each spin so shutdown is noticed
+    while (writeBlock[lWriteBlock]==0 && !*(volatile int *)&quit_threads)
+      ;
+    if (writeBlock[lWriteBlock]==0) break;   // woken by shutdown, nothing to write
+
+    // write block
+    memcpy((char *) wblock, udpdb->tblock + lWriteBlock*slot_bytes, slot_bytes);
+
+    // get new block
+    if (dsaX_udpdb_new_buffer (udpdb) < 0)
+      {
+        syslog(LOG_ERR, "receive_obs: dsaX_udpdb_new_buffer failed");
+        return;   // void thread function: no value to return
+      }
+
+    // increment counters
+    writeBlock[lWriteBlock] = 0;
+    lWriteBlock = (lWriteBlock + 1) % NBLOCKS;
+    delayBlock++;
+  }
+}
+
+/*
+Thread to run pcap, passing to callback function
+*/
+
+void pcap_thread(void * arg) {
+  // Pins itself to cores[0], opens a live capture and feeds every packet to packet_callback. arg must point at a dsaX_t.
+  dsaX_t * udpdb = (dsaX_t *) arg;
+  int thread_id = 1;//udpdb->thread_id;
+
+  // set affinity
+  const pthread_t pid = pthread_self();
+  const int core_id = cores[0];
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(core_id, &cpuset);
+  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (set_result != 0)
+    syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id);
+  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (get_affinity != 0)
+    syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id);
+  if (CPU_ISSET(core_id, &cpuset))
+    syslog(LOG_INFO,"thread %d: successfully set thread",core_id);
+
+  // set up pcap: device and BPF filter are hard-coded here (eth0, "port 4011")
+  char dev[] = "eth0";
+  pcap_t *handle;
+  char error_buffer[PCAP_ERRBUF_SIZE];
+  struct bpf_program filter;
+  char filter_exp[] = "port 4011";
+  bpf_u_int32 subnet_mask, ip;
+
+  if (pcap_lookupnet(dev, &ip, &subnet_mask, error_buffer) == -1) {
+    syslog(LOG_ERR,"Could not get information for device: %s", dev);
+    ip = 0;
+    subnet_mask = 0;
+  }
+  handle = pcap_open_live(dev, 4659, 0, 1, error_buffer);
+  if (handle == NULL) {
+    syslog(LOG_ERR,"Could not open %s - %s", dev, error_buffer);
+    return;
+  }
+
+  if (pcap_compile(handle, &filter, filter_exp, 1, subnet_mask) == -1) {   // last argument is the netmask
+    syslog(LOG_ERR,"Bad filter - %s", pcap_geterr(handle));
+    pcap_close(handle);
+    return;
+  }
+  if (pcap_setfilter(handle, &filter) == -1) {
+    syslog(LOG_ERR,"Error setting filter - %s\n", pcap_geterr(handle));
+    pcap_freecode(&filter);
+    pcap_close(handle);
+    return;
+  }
+  pcap_freecode(&filter);   // the compiled program is not needed once it has been installed
+
+  syslog(LOG_INFO,"thread %d: successfully set up pcap",thread_id);
+
+  // start up RX! cnt 0 means "no packet limit", so pcap_loop only comes back on error or break
+  while (!quit_threads) {
+    if (pcap_loop(handle, 0, packet_callback, (u_char*)udpdb) < 0) {
+      syslog(LOG_ERR,"thread %d: pcap_loop stopped - %s", thread_id, pcap_geterr(handle));
+      break;   // retrying a failed handle would only spin
+    }
+  }
+
+  // finish
+  pcap_close(handle);
+}
+
+
+	    
+// MAIN of program
+	
+int main (int argc, char *argv[]) {
+  // Parses the command line, starts the control thread, connects to the output DADA
+  // buffer and writes its header, then starts the stats, pcap and write threads and
+  // waits for quit_threads before joining them and releasing resources.
+  // startup syslog message, using LOG_LOCAL0
+  openlog ("dsaX_capture_pcap", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  /* DADA Header plus Data Unit for writing */
+  dada_hdu_t* hdu_out = 0;
+
+  // output data block HDU key (overridden by -o)
+  key_t out_key = CAPTURE_BLOCK_KEY;
+
+  // command line arguments
+  int core = -1;
+  int arg=0;
+  char dada_fnam[200] = ""; // filename for dada header; stays empty unless -f is given
+
+  while ((arg=getopt(argc,argv,"c:i:f:o:dh")) != -1)
+    {
+      switch (arg)
+        {
+        case 'o':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &out_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-o flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'i':
+          if (optarg)
+            {
+              snprintf(iP,sizeof(iP),"%s",optarg);   // bounded copy into the fixed-size global
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-i flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'c':
+          if (optarg)
+            {
+              core = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-c flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'f':
+          if (optarg)
+            {
+              snprintf(dada_fnam,sizeof(dada_fnam),"%s",optarg);   // bounded copy
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-f flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'd':
+          DEBUG=1;
+          syslog (LOG_DEBUG, "Will excrete all debug messages");
+          break;
+        case 'h':
+          usage();
+          return EXIT_SUCCESS;
+        }
+    }
+
+  // START THREADS
+  // start control thread
+  int rval = 0;
+  pthread_t control_thread_id;
+  dsaX_t temp_str;   // placeholder argument; control_thread does not read it
+  rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &temp_str);
+  if (rval != 0) {
+    syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval));
+    return -1;
+  }
+  syslog(LOG_NOTICE, "Created control thread, listening on %s:%d",iP,CAPTURE_CONTROL_PORT);
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+        syslog(LOG_ERR,"failed to bind to core %d", core);
+      else syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+  // OPEN CONNECTION TO DADA DB FOR WRITING
+  if (DEBUG) syslog(LOG_DEBUG,"Creating HDU");
+  hdu_out  = dada_hdu_create ();
+  if (DEBUG) syslog(LOG_DEBUG,"Created hdu");
+  dada_hdu_set_key (hdu_out, out_key);   // honours -o; defaults to CAPTURE_BLOCK_KEY
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog(LOG_ERR,"could not connect to output dada buffer");
+    return EXIT_FAILURE;
+  }
+  if (DEBUG) syslog(LOG_DEBUG,"Connected HDU");
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    dsaX_dbgpu_cleanup (hdu_out);
+    syslog(LOG_ERR,"could not lock to output dada buffer");
+    return EXIT_FAILURE;
+  }
+
+  syslog(LOG_INFO,"opened connection to output DB");
+
+  // DEAL WITH DADA HEADER
+  char *hout = (char *)calloc(4096, sizeof(char));   // zero-filled: all 4096 bytes are copied into the header block below
+  if (!hout) {
+    syslog (LOG_ERR, "could not allocate memory for header");
+    return EXIT_FAILURE;
+  }
+  if (DEBUG) syslog(LOG_DEBUG,"read header2");
+
+  if (fileread (dada_fnam, hout, 4096) < 0)
+    {
+      free (hout);
+      syslog (LOG_ERR, "could not read ASCII header from %s", dada_fnam);
+      return (EXIT_FAILURE);
+    }
+  if (DEBUG) syslog(LOG_DEBUG,"read header3");
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  // copy the in header to the out header; the staging copy is not needed afterwards
+  memcpy (header_out, hout, 4096);
+  free (hout);
+
+  // mark the output header buffer as filled
+  if (ipcbuf_mark_filled (hdu_out->header_block, 4096) < 0)
+    {
+      syslog(LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  // record STATE info
+  sprintf(STATE,"LISTEN");
+  syslog(LOG_INFO,"marked output header block as filled - now in LISTEN state");
+
+  /* time to start up receiver. */
+
+  // make recv, write, and stats structs
+  dsaX_t udpdb[nth];
+  dsaX_stats_t stats;
+
+  // shared variables and memory
+  uint64_t bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  stats_t * packets = init_stats_t();
+  stats_t * bytes = init_stats_t();
+  reset_stats_t(packets);
+  reset_stats_t(bytes);
+  char * tblock = (char *)malloc(sizeof(char)*NBLOCKS*bufsz);
+  char * temp_buffers = (char *)malloc(sizeof(char)*TEMP_MAXY*UDP_DATA);
+  uint64_t * temp_seq_byte = (uint64_t *)malloc(sizeof(uint64_t)*TEMP_MAXY);
+  if (!tblock || !temp_buffers || !temp_seq_byte) {
+    syslog(LOG_ERR, "could not allocate capture buffers");
+    dsaX_dbgpu_cleanup (hdu_out);
+    return EXIT_FAILURE;
+  }
+
+  // initialise stats struct
+  stats.packets = packets;
+  stats.bytes = bytes;
+  stats.last_seq = 0;   // stats_thread reads the global last_seq, so this member stays unset
+
+  for (int i=0;i<nth;i++) {
+    udpdb[i].hdu = hdu_out;
+    udpdb[i].hdu_bufsz = bufsz;
+    udpdb[i].block_open = 0;
+    udpdb[i].block_count = 0;
+    udpdb[i].tblock = tblock;
+    udpdb[i].tblock_idx = 0;
+    udpdb[i].temp_buffers = temp_buffers;
+    udpdb[i].temp_seq_byte = temp_seq_byte;
+    udpdb[i].temp_idx = 0;
+    udpdb[i].thread_id = 1;
+    udpdb[i].verbose = 0;
+    udpdb[i].packets_per_buffer = udpdb[i].hdu_bufsz / UDP_DATA;
+    udpdb[i].packets = packets;
+    udpdb[i].bytes = bytes;
+    udpdb[i].nblocks_written = 0;
+  }
+  if (dsaX_udpdb_open_buffer (&udpdb[0]) < 0) {
+    syslog(LOG_ERR, "could not open first output block");
+    dsaX_dbgpu_cleanup (hdu_out);
+    return EXIT_FAILURE;
+  }
+
+  /* start threads */
+  // start the stats thread
+  pthread_t stats_thread_id;
+  rval = pthread_create (&stats_thread_id, 0, (void *) stats_thread, (void *) &stats);
+  if (rval != 0) {
+    syslog(LOG_INFO, "Error creating stats_thread: %s", strerror(rval));
+    return -1;
+  }
+  syslog(LOG_NOTICE, "started stats_thread()");
+
+  // start the receive threads
+  pthread_t recv_thread_id[nth];
+  for (int i=0;i<nth;i++) {
+    rval = pthread_create (&recv_thread_id[i], 0, (void *) pcap_thread, (void *) (&udpdb[i]));
+    if (rval != 0) {
+      syslog(LOG_ERR, "Error creating recv_thread %d: %s", i,strerror(rval));
+      return -1;
+    }
+  }
+  syslog(LOG_NOTICE, "Created recv threads");
+
+  // start the write threads
+  pthread_t write_thread_id[nwth];
+  for (int i=0;i<nwth;i++) {
+    rval = pthread_create (&write_thread_id[i], 0, (void *) write_thread, (void *) (&udpdb[i]));
+    if (rval != 0) {
+      syslog(LOG_ERR, "Error creating write_thread %d: %s", i,strerror(rval));
+      return -1;
+    }
+  }
+  syslog(LOG_NOTICE, "Created write threads");
+
+  while (!quit_threads) {
+    sleep(1);
+  }
+
+  // close threads
+  syslog(LOG_INFO, "joining all threads");
+  quit_threads = 1;
+  void* result=0;
+  pthread_join (control_thread_id, &result);
+  pthread_join (stats_thread_id, &result);
+  for (int i=0;i<nth;i++) pthread_join(recv_thread_id[i], &result);
+  for (int i=0;i<nwth;i++) pthread_join(write_thread_id[i], &result);
+  free(tblock);
+  free(temp_buffers);
+  free(temp_seq_byte);
+  dsaX_dbgpu_cleanup (hdu_out);
+  return EXIT_SUCCESS;
+}
diff --git a/legacy/dsaX_capture_pcap.h b/legacy/dsaX_capture_pcap.h
new file mode 100644
index 0000000..f037f75
--- /dev/null
+++ b/legacy/dsaX_capture_pcap.h
@@ -0,0 +1,83 @@
+/***************************************************************************
+ *  
+ *    Copyright (C) 2009 by Andrew Jameson
+ *    Licensed under the Academic Free License version 2.1
+ * 
+ ****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <time.h>
+#include <errno.h>
+#include <assert.h>
+#include <netinet/in.h>
+#include <signal.h>
+#include <inttypes.h>
+#include <sys/types.h>
+
+#include "futils.h"
+#include "dada_hdu.h"
+#include "dada_pwc_main.h"
+#include "multilog.h"
+#include "ipcio.h"
+#include "ascii_header.h"
+#include "dada_udp.h"
+
+#include "dsaX_def.h"
+
+/* Number of UDP packets to be received for a call to buffer_function */
+#define NOTRECORDING 0
+#define RECORDING 1
+
+// structure for all threads
+typedef struct {
+  // Context handed to the pcap and write threads of dsaX_capture_pcap.c.
+  dada_hdu_t *      hdu;                // DADA Header + Data Unit
+  uint64_t          hdu_bufsz;          // data-block buffer size as returned by ipcbuf_get_bufsz
+  unsigned          block_open;        // if the current data block element is open
+  char            * tblock;             // staging area: NBLOCKS slots, allocated as NBLOCKS*hdu_bufsz bytes
+  uint64_t          tblock_idx;         // staging slot currently being filled by packet_callback
+  char            * temp_buffers;       // TEMP_MAXY payloads (UDP_DATA bytes each) for packets past the current block
+  uint64_t        * temp_seq_byte;      // seq_byte of each payload held in temp_buffers
+  int               temp_idx;           // number of payloads currently held in temp_buffers
+  int               thread_id;
+  uint64_t          block_start_byte;   // seq_byte of the first packet of the current block
+  uint64_t          block_end_byte;     // seq_byte of the last packet of the current block
+  uint64_t          block_count;        // packets copied into the current block so far
+  int               nblocks_written;    // blocks completed by packet_callback
+  
+  int               verbose;            // verbosity flag 
+
+  // configuration for number of inputs
+  unsigned int      num_inputs;         // number of antennas / inputs
+  
+  // packets
+  uint64_t          packets_per_buffer;   // number of UDP packets per datablock buffer
+
+  /* Packet and byte statistics */
+  stats_t * packets;
+  stats_t * bytes;
+
+  uint64_t last_seq;                     // most recently received seq number
+
+} dsaX_t;
+
+// structure for stats thread
+// both are shared between all recv structures and this one
+// last_seq is also shared
+typedef struct {
+  // argument block for stats_thread; every member is a pointer to data owned elsewhere
+  /* Packet and byte statistics */
+  stats_t * packets;
+  stats_t * bytes;
+  uint64_t * last_seq;                     // most recently received seq number
+
+} dsaX_stats_t;
+
+
+void signal_handler (int signalValue); 
+void stats_thread(void * arg);
+void control_thread(void * arg);
diff --git a/legacy/dsaX_capture_thread.c b/legacy/dsaX_capture_thread.c
new file mode 100644
index 0000000..49019be
--- /dev/null
+++ b/legacy/dsaX_capture_thread.c
@@ -0,0 +1,1107 @@
+/* dsaX_capture_thread.c: Code to capture packets over a socket and write to a dada buffer.
+
+main: runs capture loop, and interfaces dada buffer
+control_thread: deals with control commands
+
+*/
+
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+/* global variables */
+int quit_threads = 0;
+char STATE[20];
+uint64_t UTC_START = 10000;
+uint64_t UTC_STOP = 40000000000;
+int MONITOR = 0;
+char iP[100];
+int DEBUG = 0;                 // when non-zero, the buffer helpers emit LOG_DEBUG traces
+int HISTOGRAM[16];
+int writeBlock = 0;            // flipped between 0 and 1 by dsaX_udpdb_increment
+volatile int doWrite = 0;      // NOTE(review): presumably a hand-off flag for a writer thread — confirm
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * out);
+int dada_bind_thread_to_core (int core);
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * out)
+{
+  // Release the write lock on the output HDU, then destroy the HDU.
+  // A failed unlock is only logged; the HDU is destroyed either way.
+  if (dada_hdu_unlock_write (out) < 0)
+    {
+      syslog(LOG_ERR, "could not unlock write on hdu_out");
+    }
+  dada_hdu_destroy (out);
+
+
+}
+
+void usage()
+{ /* write the command-line synopsis verbatim to stdout */
+  static const char synopsis[] = "dsaX_capture [options]\n"
+    " -c core   bind process to CPU core [no default]\n"
+    " -j IP to listen on for data packets [no default]\n"
+    " -i IP to listen on for control commands [no default]\n"
+    " -f filename of template dada header [no default]\n"
+    " -o out_key [default CAPTURE_BLOCK_KEY]\n"
+    " -d send debug messages to syslog\n"
+    " -g chgroup [default 0]\n"
+    " -h print usage\n";
+  fputs (synopsis, stdout);
+}
+
+/*
+ * create a socket with the specified number of buffers
+ */
+dsaX_sock_t * dsaX_init_sock ()
+{
+  // Allocates a socket descriptor together with a packet buffer of UDP_PAYLOAD bytes.
+  // fd and have_packet start at 0. Allocation failures are caught by assert() only.
+  dsaX_sock_t * sock = (dsaX_sock_t *) malloc(sizeof(dsaX_sock_t));
+  assert(sock != NULL);
+
+  sock->fd = 0;
+  sock->have_packet = 0;
+  sock->bufsz = sizeof(char) * UDP_PAYLOAD;
+
+  sock->buf = (char *) malloc (sock->bufsz);
+  assert(sock->buf != NULL);
+  return sock;
+}
+
+void dsaX_free_sock(dsaX_sock_t* b)
+{
+  // Frees the packet buffer and zeroes the bookkeeping fields; b itself is not freed here.
+  if (b->buf != 0) free (b->buf);
+  b->buf = 0;
+  b->have_packet = 0;
+  b->bufsz = 0;
+  b->fd = 0;
+}
+
+/* 
+ *  intialize UDP receiver resources
+ */
+int dsaX_udpdb_init_receiver (udpdb_t * ctx)
+{
+  // Puts ctx into its initial state: new socket wrapper, zeroed bookkeeping, fresh stats. Always returns 0.
+  syslog(LOG_INFO,"dsax_udpdb_init_receiver()");
+  // socket wrapper created by dsaX_init_sock (descriptor not opened yet)
+  ctx->sock = dsaX_init_sock();
+  // block bookkeeping
+  ctx->block_open = 0;
+  ctx->block_count = 0;
+  ctx->block_start_byte = 0;
+  ctx->capture_started = 0;
+  // sequence tracking and rate figures; recv_core starts at -1
+  ctx->last_seq = 0;
+  ctx->last_byte = 0;
+  ctx->ooo_packets = 0;
+  ctx->n_sleeps = 0;
+  ctx->mb_rcv_ps = 0;
+  ctx->mb_drp_ps = 0;
+  ctx->recv_core = -1;
+  // statistics accumulators
+  ctx->packets = init_stats_t();
+  ctx->bytes   = init_stats_t();
+  return 0;
+}
+
+/* 
+prepare socket and writer
+*/
+
+int dsaX_udpdb_prepare (udpdb_t * ctx)
+{
+  // Creates the UDP receive socket on ctx->interface:ctx->port, requests a 256 MB
+  // socket buffer, switches the socket to non-blocking mode and clears packets
+  // already buffered. Returns 0 on success, -1 if the socket cannot be created.
+  const int sock_buf_size = 256*1024*1024;
+
+  syslog(LOG_INFO, "dsaX_udpdb_prepare()");
+
+  // step 1: open socket
+  syslog(LOG_INFO, "prepare: creating udp socket on %s:%d", ctx->interface, ctx->port);
+  ctx->sock->fd = dada_udp_sock_in(ctx->log, ctx->interface, ctx->port, ctx->verbose);
+  if (ctx->sock->fd < 0)
+  {
+    syslog (LOG_ERR, "Error, Failed to create udp socket");
+    return -1;
+  }
+
+  // step 2: set the socket size to 256 MB
+  syslog(LOG_INFO, "prepare: setting buffer size to %d", sock_buf_size);
+  dada_udp_sock_set_buffer_size (ctx->log, ctx->sock->fd, ctx->verbose, sock_buf_size);
+
+  // step 3: set the socket to non-blocking
+  syslog(LOG_INFO, "prepare: setting non_block");
+  sock_nonblock(ctx->sock->fd);
+
+  // step 4: clear any packets buffered by the kernel
+  // (the value returned by the clearing call is not needed)
+  syslog(LOG_INFO, "prepare: clearing packets at socket");
+  (void) dada_sock_clear_buffered_packets(ctx->sock->fd, UDP_PAYLOAD);
+
+  return 0;
+}
+
+/*
+ *  reset receiver before an observation commences
+ */
+void dsaX_udpdb_reset_receiver (udpdb_t * ctx) 
+{
+  // Returns the receiver to its pre-observation state: stats reset, sequence tracking and start flag cleared.
+  syslog (LOG_INFO, "dsaX_udpdb_reset_receiver()");
+
+  reset_stats_t(ctx->packets);
+  reset_stats_t(ctx->bytes);
+  ctx->n_sleeps = 0;
+  ctx->last_byte = 0;
+  ctx->last_seq = 0;
+  ctx->capture_started = 0;
+}
+
+/* 
+ *  open a data block buffer ready for direct access
+ */
+int dsaX_udpdb_open_buffer (udpdb_t * ctx)
+{
+  // Opens the next data block of ctx->hdu for writing and stores its address in ctx->block.
+  // Returns 0 on success, -1 if a block is already open or the open call fails.
+  uint64_t opened_id = 0;   // receives the block id; not used afterwards
+
+  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_open_buffer()");
+
+  if (ctx->block_open != 0)
+  {
+    syslog (LOG_ERR, "open_buffer: buffer already opened");
+    return -1;
+  }
+
+  if (DEBUG) syslog (LOG_DEBUG, "open_buffer: ipcio_open_block_write");
+
+  ctx->block = ipcio_open_block_write (ctx->hdu->data_block, &opened_id);
+  if (ctx->block == 0)
+  {
+    syslog (LOG_ERR, "open_buffer: ipcio_open_block_write failed");
+    return -1;
+  }
+
+  ctx->block_open = 1;
+  return 0;
+}
+
+/*
+ *  close a data buffer, assuming a full block has been written
+ */
+int dsaX_udpdb_close_buffer (udpdb_t * ctx, uint64_t bytes_written, unsigned eod)
+{
+  // Hands the currently open block back to the DADA ring buffer.
+  //   bytes_written : byte count reported to ipcio for this block
+  //   eod           : non-zero -> ipcio_update_block_write, zero -> ipcio_close_block_write
+  // Returns 0 on success, -1 if no block is open or the ipcio call fails.
+  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_close_buffer(%"PRIu64", %d)", bytes_written, eod);
+
+  if (ctx->block_open == 0)
+  {
+    syslog (LOG_ERR, "close_buffer: buffer already closed");
+    return -1;
+  }
+
+  // report blocks that are not full (the 1 byte "EOD" buffer excepted); INFO at end of data, WARNING otherwise
+  const int is_partial = (bytes_written != 1) && (bytes_written != ctx->hdu_bufsz);
+  if (is_partial)
+    syslog ((eod ? LOG_INFO : LOG_WARNING), "close_buffer: "
+              "bytes_written[%"PRIu64"] != hdu_bufsz[%"PRIu64"]",
+              bytes_written, ctx->hdu_bufsz);
+
+  if (!eod)
+  {
+    if (ipcio_close_block_write (ctx->hdu->data_block, bytes_written) < 0)
+    {
+      syslog (LOG_ERR, "close_buffer: ipcio_close_block_write failed");
+      return -1;
+    }
+  }
+  else if (ipcio_update_block_write (ctx->hdu->data_block, bytes_written) < 0)
+  {
+    syslog (LOG_ERR, "close_buffer: ipcio_update_block_write failed");
+    return -1;
+  }
+
+  ctx->block = 0;
+  ctx->block_open = 0;
+  return 0;
+}
+
+// Advance the byte window to the next block, clear the per-block packet
+// count and flip the global writeBlock between 0 and 1. Always returns 0.
+int dsaX_udpdb_increment (udpdb_t * ctx)
+{
+  // the new window starts one packet after the last slot of the old one
+  ctx->block_start_byte = ctx->block_end_byte + UDP_DATA;
+  ctx->block_end_byte = ctx->block_start_byte + ( ctx->packets_per_buffer - 1) * UDP_DATA;
+  ctx->block_count = 0;
+  if (writeBlock==0) writeBlock=1;
+  else writeBlock=0;
+  return 0;  // declared int: falling off the end left the result undefined
+}
+
+/*
+ *  Move to the next ring buffer element: close the current block as a full
+ *  block (hdu_bufsz bytes, not end of data) and open the following one.
+ *  Returns 0 on success, -1 if either step fails.
+ */
+int dsaX_udpdb_new_buffer (udpdb_t * ctx)
+{
+  int status = 0;
+  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_new_buffer()");
+
+  status = dsaX_udpdb_close_buffer (ctx, ctx->hdu_bufsz, 0);
+  if (status < 0)
+  {
+    syslog (LOG_ERR, "new_buffer: dsaX_udpdb_close_buffer failed");
+    return -1;
+  }
+
+  status = dsaX_udpdb_open_buffer (ctx);
+  if (status < 0)
+  {
+    syslog (LOG_ERR, "new_buffer: dsaX_udpdb_open_buffer failed");
+    return -1;
+  }
+
+  // the new block is handed over as-is; its contents are not cleared here
+  if (DEBUG) syslog(LOG_DEBUG, "new_buffer: buffer_bytes [%"PRIu64" - %"PRIu64"]",
+             ctx->block_start_byte, ctx->block_end_byte);
+
+  return 0;
+}
+
+/*
+ *  destroy UDP receiver resources: free the socket if present; returns 0
+ */
+int dsaX_udpdb_destroy_receiver (udpdb_t * ctx)
+{
+  if (ctx->sock) dsaX_free_sock(ctx->sock);
+  ctx->sock = 0;
+  return 0;  // declared int: previously fell off the end without a value
+}
+
+/*
+ * Release the write lock on the HDU, close the UDP socket and, if any
+ * packets were dropped, log the dropped-byte percentage.
+ * Returns 0 on success, -1 if the write lock could not be released.
+ */
+int udpdb_stop_function (udpdb_t* ctx)
+{
+  syslog(LOG_INFO, "stop: dada_hdu_unlock_write()");
+  if (dada_hdu_unlock_write (ctx->hdu) < 0)
+  {
+    syslog (LOG_ERR, "stop: could not unlock write on");
+    return -1;
+  }
+
+  // close the UDP socket
+  close(ctx->sock->fd);
+
+  if (ctx->packets->dropped)
+  {
+    double percent = (double) ctx->bytes->dropped / (double) ctx->last_byte;
+    percent *= 100;
+    // a literal percent sign must be written "%%"; a lone trailing '%' is an invalid conversion
+    syslog(LOG_INFO, "bytes dropped %"PRIu64" / %"PRIu64 " = %8.6f %%",
+             ctx->bytes->dropped, ctx->last_byte, percent);
+  }
+
+  return 0;
+}
+
+
+
+
+/* --------- THREADS -------- */
+
+// STATS THREAD
+
+/*
+ *  Statistics thread: roughly once per second logs a CAPSTATS line with the
+ *  receive rate, the drop rate, the dropped packet total and the last sequence
+ *  number. arg is the udpdb_t context; the loop runs until quit_threads is set.
+ *  The received MB/s figure and the sleep-count delta are computed but are not
+ *  part of the log line.
+ *  NOTE(review): returns void although main hands it to pthread_create through
+ *  a cast - confirm this is acceptable on the target platform.
+ */
+void stats_thread(void * arg) {
+
+  udpdb_t * ctx = (udpdb_t *) arg;
+
+  // byte/sleep counters as seen on the previous pass
+  uint64_t prev_rcv_bytes = 0;
+  uint64_t prev_drp_bytes = 0;
+  uint64_t prev_sleeps = 0;
+
+  // counters sampled on the current pass
+  uint64_t now_rcv_bytes = 0;
+  uint64_t now_drp_bytes = 0;
+  uint64_t now_sleeps = 0;
+
+  // growth of each counter between the two passes
+  uint64_t delta_rcv_bytes = 0;
+  uint64_t delta_drp_bytes = 0;
+  uint64_t delta_sleeps = 0;
+
+  // derived rates for the log line
+  float gbit_rcv_rate = 0;
+  float mbyte_rcv_rate = 0;
+  float mbyte_drp_rate = 0;
+
+  syslog(LOG_INFO,"starting stats thread...");
+  // fixed two second pause before the first sample
+  sleep(2);
+  syslog(LOG_INFO,"started stats thread...");
+
+  for (;;)
+  {
+    if (quit_threads) break;
+
+    /* sample the shared counters back to back */
+    now_rcv_bytes = ctx->bytes->received;
+    now_drp_bytes = ctx->bytes->dropped;
+    now_sleeps = ctx->n_sleeps;
+
+    /* difference against the previous pass, roughly one second ago */
+    delta_rcv_bytes = now_rcv_bytes - prev_rcv_bytes;
+    delta_drp_bytes = now_drp_bytes - prev_drp_bytes;
+    delta_sleeps = now_sleeps - prev_sleeps;
+
+    /* remember this sample for the next pass */
+    prev_rcv_bytes = now_rcv_bytes;
+    prev_drp_bytes = now_drp_bytes;
+    prev_sleeps = now_sleeps;
+
+    /* bytes -> MB/s for received and dropped data, bytes -> Gb/s for received data */
+    mbyte_rcv_rate = (double) delta_rcv_bytes / 1000000;
+    mbyte_drp_rate = (double) delta_drp_bytes / 1000000;
+    gbit_rcv_rate = (float) (delta_rcv_bytes * 8) / 1000000000;
+
+    /* report: receive rate, drop rate, total dropped packets, last sequence number */
+    syslog (LOG_NOTICE,
+            "CAPSTATS %6.3f [Gb/s], D %4.1f [MB/s], D %"PRIu64" pkts, %"PRIu64"",
+            gbit_rcv_rate, mbyte_drp_rate,
+            ctx->packets->dropped, ctx->last_seq);
+
+    // one pass per second
+    sleep(1);
+  }
+
+}
+
+
+
+
+
+
+
+// CONTROL THREAD
+// Receives UDP datagrams of the form "COMMAND-VALUE" on iP:CAPTURE_CONTROL_PORT.
+// UTC_START and UTC_STOP store VALUE (strtoull, base auto-detected) in the global
+// of the same name; other commands are only logged. Runs until quit_threads is set.
+void control_thread (void * arg) {
+
+  (void) arg;  // the capture context is not needed here
+  static int thread_result = 0;  // static so the pointer given to pthread_exit stays valid
+  const int bufsize = 1024;
+  char sport[10];
+  struct addrinfo hints;
+  struct addrinfo * res = 0;
+  struct sockaddr_storage src_addr;
+
+  syslog(LOG_INFO, "control_thread: starting");
+
+  // resolve the local address to bind to and allocate the receive buffer
+  snprintf(sport, sizeof(sport), "%d", CAPTURE_CONTROL_PORT);
+  memset(&hints,0,sizeof(hints));
+  hints.ai_family=AF_INET;
+  hints.ai_socktype=SOCK_DGRAM;
+  const int gai = getaddrinfo(iP,sport,&hints,&res);
+  char * buffer = (char *) malloc (sizeof(char) * bufsize);
+  if (gai != 0 || res == 0 || buffer == 0) {
+    syslog(LOG_ERR, "control_thread: setup failed: %s", gai != 0 ? gai_strerror(gai) : "no address or out of memory");
+    if (res) freeaddrinfo(res);
+    free (buffer);
+    pthread_exit((void *) &thread_result);
+  }
+
+  syslog(LOG_INFO, "control_thread: created socket on port %d", CAPTURE_CONTROL_PORT);
+
+  while (!quit_threads) {
+    // a fresh socket is created and bound for every command, as before
+    int fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol);
+    if (fd < 0 || bind(fd,res->ai_addr,res->ai_addrlen) < 0) {
+      syslog(LOG_ERR, "control_thread: could not create/bind socket: %s", strerror(errno));
+      if (fd >= 0) close(fd);
+      sleep(1);  // avoid spinning while the port is unavailable
+      continue;
+    }
+
+    // clear the whole buffer (sizeof(buffer) was only the pointer size); keep one byte for the NUL
+    memset(buffer,'\0',bufsize);
+    socklen_t src_addr_len = sizeof(src_addr);
+    syslog(LOG_INFO, "control_thread: waiting for packet");
+    ssize_t ct = recvfrom(fd,buffer,bufsize-1,0,(struct sockaddr*)&src_addr,&src_addr_len);
+    close(fd);
+    if (ct < 0) {
+      syslog(LOG_ERR, "control_thread: recvfrom failed: %s", strerror(errno));
+      continue;
+    }
+
+    syslog(LOG_INFO, "control_thread: received buffer string %s",buffer);
+
+    // split "COMMAND-VALUE"; either part may be missing in a malformed packet
+    char * rest = buffer;
+    char * cmd = strtok_r(rest, "-", &rest);
+    char * val = strtok_r(rest, "-", &rest);
+    if (cmd == 0 || val == 0) {
+      syslog(LOG_WARNING, "control_thread: ignoring malformed command");
+      continue;
+    }
+    syslog(LOG_INFO, "control_thread: split into COMMAND %s, VALUE %s",cmd,val);
+
+    if (strcmp(cmd,"UTC_START")==0)
+      UTC_START = strtoull(val,0,0);
+    if (strcmp(cmd,"UTC_STOP")==0)
+      UTC_STOP = strtoull(val,0,0);
+  }
+
+  freeaddrinfo(res);
+  free (buffer);
+
+  syslog(LOG_INFO, "control_thread: exiting");
+  pthread_exit((void *) &thread_result);
+}
+
+
+/*  Capture thread: pins itself to a core, then receives UDP packets and copies
+ *  each payload into the half of tblock selected by the global writeBlock.
+ *  arg is the udpdb_t context. Returns EXIT_FAILURE if recvfrom fails, else EXIT_SUCCESS. */
+int recv_thread(void * arg) {
+
+  // pin this thread to a fixed core and log the outcome
+  const pthread_t pid = pthread_self();
+  const int core_id = 34;
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(core_id, &cpuset);
+  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (set_result != 0)
+    syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id);
+  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (get_affinity != 0) 
+    syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id);
+  if (CPU_ISSET(core_id, &cpuset))
+    syslog(LOG_INFO,"thread %d: successfully set thread",core_id);
+
+
+  udpdb_t * udpdb = (udpdb_t *) arg;
+
+  // Working state for the receive loop. Positions are tracked in bytes from
+  // the start of the observation: a packet's position is its actual sequence
+  // number multiplied by UDP_DATA.
+
+  uint64_t act_seq_no = 0;
+  uint64_t block_seq_no = 0;
+  uint64_t seq_no = 0;
+  uint64_t ch_id = 0;
+  uint64_t ant_id = 0;
+  unsigned char * b = (unsigned char *) udpdb->sock->buf;
+  size_t got = 0; // data received from a recv_from call
+  int errsv; // errno captured right after a failed recvfrom
+  int64_t byte_offset = 0; // offset of current packet in bytes from start of block
+  uint64_t seq_byte = 0; // offset of current packet in bytes from start of obs
+  // for "saving" out of order packets near edges of blocks
+  unsigned int temp_idx = 0;
+  unsigned int temp_max = 1000;
+  char ** temp_buffers; //[temp_max][UDP_DATA];
+  uint64_t * temp_seq_byte;
+  temp_buffers = (char **)malloc(sizeof(char *)*temp_max);
+  for (int i=0;i<temp_max;i++) temp_buffers[i] = (char *)malloc(sizeof(char)*UDP_DATA);
+  temp_seq_byte = (uint64_t *)malloc(sizeof(uint64_t)*temp_max);
+  unsigned i = 0;
+  uint64_t timeouts = 0;
+  uint64_t timeout_max = 1000000000;
+  int canWrite = 0;
+  int ct_snaps=0;
+
+  // infinite loop to receive packets
+  // use stats thread to monitor STATE at this stage, to save resources here
+
+  while (!quit_threads)
+    {
+
+      udpdb->sock->have_packet = 0;
+
+      // incredibly tight loop to try and get a packet
+      while (!udpdb->sock->have_packet)
+	{
+
+	  // receive 1 packet into the socket buffer
+	  got = recvfrom ( udpdb->sock->fd, udpdb->sock->buf, UDP_PAYLOAD, 0, NULL, NULL );
+
+	  if (got == UDP_PAYLOAD) 
+	    {
+	      udpdb->sock->have_packet = 1;
+	    } 
+	  else if (got == -1) 
+	    {
+	      errsv = errno;
+	      if (errsv == EAGAIN) 
+		{
+		  udpdb->n_sleeps++;
+		  if (udpdb->capture_started)
+		    timeouts++;
+		  if (timeouts > timeout_max)
+		    syslog(LOG_INFO, "timeouts[%"PRIu64"] > timeout_max[%"PRIu64"]\n",timeouts, timeout_max);
+		}
+	      else 
+		{
+		  syslog (LOG_ERR, "receive_obs: recvfrom failed %s", strerror(errsv));
+		  return EXIT_FAILURE;
+		}
+	    } 
+	  else // we received a packet of the WRONG size, ignore it
+	    {
+	      syslog (LOG_NOTICE, "receive_obs: received %lu bytes, expected %d", got, UDP_PAYLOAD);
+	    }
+	}
+      timeouts = 0;
+
+      // we have a valid packet within the timeout
+      if (udpdb->sock->have_packet) 
+	{
+
+	  // decode packet header (64 bits)
+	  // 35 bits seq_no (for first spectrum in packet); 13 bits ch_id (for first channel in packet); 16 bits ant ID (for first antenna in packet)
+	  seq_no = 0;
+	  seq_no |=  (((uint64_t)(udpdb->sock->buf[4]) & 224) >> 5) & 7;
+	  seq_no |=  (((uint64_t)(udpdb->sock->buf[3])) << 3) & 2040;
+	  seq_no |=  (((uint64_t)(udpdb->sock->buf[2])) << 11) & 522240;
+	  seq_no |=  (((uint64_t)(udpdb->sock->buf[1])) << 19) & 133693440;
+	  seq_no |=  (((uint64_t)(udpdb->sock->buf[0])) << 27) & 34225520640;
+	  ant_id = 0;
+	  ant_id |= (unsigned char) (udpdb->sock->buf[6]) << 8;
+	  ant_id |= (unsigned char) (udpdb->sock->buf[7]);
+
+	  act_seq_no = seq_no*NCHANG*NSNAPS/2 + ant_id*NCHANG/3; // actual seq no
+	  block_seq_no = UTC_START*NCHANG*NSNAPS/2; // seq no corresponding to ant 0 and start of block
+
+	  // writing is enabled once 10 packets at or past UTC_START-50 were seen (and UTC_START is not 10000)
+	  if (canWrite==0) {
+	    if (seq_no >= UTC_START-50 && UTC_START != 10000) ct_snaps++;
+	    if (ct_snaps >= 10) canWrite=1;
+	  }
+	  udpdb->last_seq = seq_no;
+	  if (canWrite == 0) continue;
+
+	  // if first packet
+	  if (!udpdb->capture_started)
+	    {
+	      udpdb->block_start_byte = block_seq_no * UDP_DATA;
+	      udpdb->block_end_byte   = (udpdb->block_start_byte + udpdb->hdu_bufsz) - UDP_DATA;
+	      udpdb->capture_started = 1;
+
+	      syslog (LOG_INFO, "receive_obs: START [%"PRIu64" - %"PRIu64"]", udpdb->block_start_byte, udpdb->block_end_byte);
+	    }
+
+	  // if capture running
+	  if (udpdb->capture_started)
+	    {
+	      seq_byte = (act_seq_no * UDP_DATA);
+
+	      udpdb->last_byte = seq_byte;
+
+	      // if packet arrived too late, ignore
+	      if (seq_byte < udpdb->block_start_byte)
+		{
+		  udpdb->packets->dropped++;
+		  udpdb->bytes->dropped += UDP_DATA;
+		}
+	      else
+		{
+		  // packet belongs in this block
+		  if (seq_byte <= udpdb->block_end_byte)
+		    {
+		      byte_offset = seq_byte - udpdb->block_start_byte;
+		      memcpy (udpdb->tblock + byte_offset + writeBlock*udpdb->hdu_bufsz, udpdb->sock->buf + UDP_HEADER, UDP_DATA);
+		      udpdb->packets->received++;
+		      udpdb->bytes->received += UDP_DATA;
+		      udpdb->block_count++;
+		    }
+		  // packet belongs in subsequent block
+		  else
+		    {
+
+		      if (temp_idx < temp_max)
+			{
+			  // save packet to temp buffer
+			  memcpy (temp_buffers[temp_idx], udpdb->sock->buf + UDP_HEADER, UDP_DATA);
+			  temp_seq_byte[temp_idx] = seq_byte;
+			  temp_idx++;
+			}
+		      else
+			{
+			  udpdb->packets->dropped++;
+			  udpdb->bytes->dropped += UDP_DATA;
+			}
+		    }
+		}
+	    }
+
+	  // now check for a full buffer or full temp queue
+	  if ((udpdb->block_count >= udpdb->packets_per_buffer) || (temp_idx >= temp_max))
+	    {
+	      syslog (LOG_INFO, "BLOCK COMPLETE seq_no=%"PRIu64", "
+		      "ant_id=%"PRIu64", block_count=%"PRIu64", "
+		      "temp_idx=%d\n", seq_no, ant_id,  udpdb->block_count, 
+		      temp_idx);
+
+	      // signal the writer thread that a block is ready
+	      doWrite=1;
+
+	      uint64_t dropped = udpdb->packets_per_buffer - udpdb->block_count;
+	      if (dropped)
+		{
+		  udpdb->packets->dropped += dropped;
+		  udpdb->bytes->dropped += (dropped * UDP_DATA);
+		}
+
+	      // increment counters
+	      dsaX_udpdb_increment(udpdb);
+
+	      // write any temp packets saved
+
+	      if (DEBUG) syslog(LOG_INFO, "block bytes: %"PRIu64" - %"PRIu64"\n", udpdb->block_start_byte, udpdb->block_end_byte);
+
+	      // include any futuristic packets we saved
+	      for (i=0; i < temp_idx; i++)
+		{
+		  seq_byte = temp_seq_byte[i];
+		  byte_offset = seq_byte - udpdb->block_start_byte;
+		  if (byte_offset < udpdb->hdu_bufsz)
+		    {
+		      memcpy (udpdb->tblock + byte_offset + writeBlock*udpdb->hdu_bufsz, temp_buffers[i], UDP_DATA);
+		      udpdb->block_count++;
+		      udpdb->packets->received++;
+		      udpdb->bytes->received += UDP_DATA;
+		    }
+		  else
+		    {
+		      udpdb->packets->dropped++;
+		      udpdb->bytes->dropped += UDP_DATA;
+		    }
+		}
+	      temp_idx = 0;
+	    }
+
+	}
+
+      // packet has been inserted or saved by this point
+      udpdb->sock->have_packet = 0;
+
+    }
+
+  // release the spill buffers: each packet slot first, then the table itself
+  for (i=0; i < temp_max; i++) free(temp_buffers[i]);
+  free(temp_buffers);
+  free(temp_seq_byte);
+  return EXIT_SUCCESS;
+}
+
+/*
+ *  Writer thread: waits for the capture thread to raise doWrite, copies the
+ *  completed half of tblock into the open data block, then moves on to the next
+ *  ring buffer element. Returns EXIT_FAILURE if that step fails, else EXIT_SUCCESS.
+ */
+int write_thread(void * arg) {
+
+  // pin this thread to a fixed core and log the outcome
+  const pthread_t pid = pthread_self();
+  const int core_id = 36;
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(core_id, &cpuset);
+  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (set_result != 0)
+    syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id);
+  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (get_affinity != 0) 
+    syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id);
+  if (CPU_ISSET(core_id, &cpuset))
+    syslog(LOG_INFO,"thread %d: successfully set thread",core_id);
+
+  udpdb_t * udpdb = (udpdb_t *) arg;
+  int lWriteBlock = 0;  // half of tblock to flush next; alternates between 0 and 1
+
+  while (!quit_threads)
+  {
+    // Busy-wait for a completed block. The wait now also ends on quit_threads (it used to spin
+    // forever at shutdown); assumes both flags are declared volatile elsewhere - TODO confirm.
+    while (!doWrite && !quit_threads) { }
+    if (!doWrite) break;
+
+    syslog(LOG_INFO,"writing block...");
+
+    memcpy(udpdb->block, udpdb->tblock + lWriteBlock*udpdb->hdu_bufsz, udpdb->hdu_bufsz);
+
+    if (dsaX_udpdb_new_buffer (udpdb) < 0)
+      {
+	syslog(LOG_ERR, "receive_obs: dsaX_udpdb_new_buffer failed");
+	return EXIT_FAILURE;
+      }
+
+    doWrite=0;
+    if (lWriteBlock==0) lWriteBlock=1;
+    else lWriteBlock=0;
+  }
+
+  return EXIT_SUCCESS;
+}
+
+
+	    
+// MAIN of program
+	
+int main (int argc, char *argv[]) {
+  // Entry point: parse options, start the control and stats threads, connect to the
+  // output DADA buffer, write its header, then run the recv/write threads until quit.
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_capture_thread", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  /* DADA Header plus Data Unit for writing */
+  dada_hdu_t* hdu_out = 0;
+
+  /* actual struct with info */
+  udpdb_t udpdb;
+
+  // output data block HDU key (overridable with -o)
+  key_t out_key = CAPTURE_BLOCK_KEY;
+
+  // command line arguments
+  int core = -1;
+  int chgroup = 0;
+  int arg=0;
+  char dada_fnam[200] = ""; // filename for dada header
+  char iface[100] = ""; // IP for data packets
+
+  while ((arg=getopt(argc,argv,"c:j:i:f:o:g:dh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'i':
+	  if (optarg)
+	    {
+	      strcpy(iP,optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-i flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'g':
+	  if (optarg)
+	    {
+	      chgroup = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-g flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'j':
+	  if (optarg)
+	    {
+	      snprintf(iface, sizeof(iface), "%s", optarg); // bounded copy into the fixed buffer
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-j flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'f':
+	  if (optarg)
+	    {
+	      snprintf(dada_fnam, sizeof(dada_fnam), "%s", optarg); // bounded copy into the fixed buffer
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-f flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // record STATE info
+  sprintf(STATE,"NOBUFFER");
+
+  // START THREADS
+
+  // start control thread
+  int rval = 0;
+  pthread_t control_thread_id, stats_thread_id;
+  if (DEBUG)
+    syslog (LOG_DEBUG, "Creating threads");
+  rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb);
+  if (rval != 0) {
+    syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval));
+    return -1;
+  }
+  syslog(LOG_NOTICE, "Created control thread, listening on %s:%d",iP,CAPTURE_CONTROL_PORT);
+
+  // start the stats thread
+  rval = pthread_create (&stats_thread_id, 0, (void *) stats_thread, (void *) &udpdb);
+  if (rval != 0) {
+    syslog(LOG_INFO, "Error creating stats_thread: %s", strerror(rval));
+    return -1;
+  }
+  syslog(LOG_NOTICE, "started stats_thread()");
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      else syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+  // NOTE(review): the control and stats threads above were handed &udpdb before this initialisation runs
+  // initialize the data structure
+  syslog (LOG_INFO, "main: dsaX_udpdb_init_receiver()");
+  if (dsaX_udpdb_init_receiver (&udpdb) < 0)
+  {
+    syslog (LOG_ERR, "could not initialize receiver");
+    return EXIT_FAILURE;
+  }
+
+  // OPEN CONNECTION TO DADA DB FOR WRITING
+
+  if (DEBUG) syslog(LOG_DEBUG,"Creating HDU");
+
+  hdu_out  = dada_hdu_create (0);
+  if (DEBUG) syslog(LOG_DEBUG,"Created hdu");
+  dada_hdu_set_key (hdu_out, out_key);  // honour -o; defaults to CAPTURE_BLOCK_KEY
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog(LOG_ERR,"could not connect to output dada buffer");
+    return EXIT_FAILURE;
+  }
+  if (DEBUG) syslog(LOG_DEBUG,"Connected HDU");
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    dsaX_dbgpu_cleanup (hdu_out);
+    syslog(LOG_ERR,"could not lock to output dada buffer");
+    return EXIT_FAILURE;
+  }
+
+  syslog(LOG_INFO,"opened connection to output DB");
+
+  // DEAL WITH DADA HEADER
+  char *hout = (char *)malloc(sizeof(char)*4096);
+  if (!hout) {
+    syslog(LOG_ERR, "could not allocate header buffer");
+    dsaX_dbgpu_cleanup (hdu_out);
+    return EXIT_FAILURE;
+  }
+  if (DEBUG) syslog(LOG_DEBUG,"read header2");
+  if (fileread (dada_fnam, hout, 4096) < 0)
+    {
+      free (hout);
+      syslog (LOG_ERR, "could not read ASCII header from %s", dada_fnam);
+      return (EXIT_FAILURE);
+    }
+
+  if (DEBUG) syslog(LOG_DEBUG,"read header3");
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  // copy the in header to the out header
+  memcpy (header_out, hout, 4096);
+  free (hout);  // the local copy is no longer needed
+
+  // mark the output header buffer as filled
+  if (ipcbuf_mark_filled (hdu_out->header_block, 4096) < 0)
+    {
+      syslog(LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  // record STATE info
+  sprintf(STATE,"LISTEN");
+  syslog(LOG_INFO,"marked output header block as filled - now in LISTEN state");
+
+  /* time to start up receiver. 
+     data are captured on iface:CAPTURE_PORT 
+  */
+
+  // put information in udpdb struct
+  udpdb.hdu = hdu_out;
+  udpdb.port = CAPTURE_PORT;
+  udpdb.interface = strdup(iface);
+  udpdb.hdu_bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  // tblock is used as two halves of hdu_bufsz bytes each (the threads index it with
+  // writeBlock*hdu_bufsz), so it must hold two blocks; one block overflowed the heap
+  char * tblock = (char *)malloc(sizeof(char)*2*udpdb.hdu_bufsz);
+  if (!tblock) {
+    syslog(LOG_ERR, "could not allocate temporary block buffer");
+    return EXIT_FAILURE;
+  }
+  udpdb.tblock = tblock;
+  // the data block must hold a whole number of packets
+  if (udpdb.hdu_bufsz % UDP_DATA != 0)
+  {
+    syslog(LOG_ERR, "data block size for [%"PRIu64"] was not a multiple of the UDP_DATA size [%d]\n", udpdb.hdu_bufsz, UDP_DATA);
+    return EXIT_FAILURE;
+  }
+  udpdb.packets_per_buffer = udpdb.hdu_bufsz / UDP_DATA;
+  udpdb.bytes_to_acquire = 0;
+  udpdb.num_inputs = NSNAPS;
+
+  // prepare the socket
+  syslog(LOG_INFO, "main: dsaX_udpdb_prepare()");
+  if (dsaX_udpdb_prepare (&udpdb) < 0)
+  {
+    syslog(LOG_ERR, "could not allocate required resources (prepare)");
+    return EXIT_FAILURE;
+  }
+
+  // reset the receiver
+  syslog(LOG_INFO, "main: dsaX_udpdb_reset_receiver()");
+  dsaX_udpdb_reset_receiver (&udpdb);
+
+  // open a block of the data block, ready for writing
+  if (dsaX_udpdb_open_buffer (&udpdb) < 0)
+  {
+    syslog (LOG_ERR, "start: dsaX_udpdb_open_buffer failed");
+    return -1;
+  }
+
+
+  // start threads
+
+  // start recv thread
+  rval = 0;
+  pthread_t recv_thread_id, write_thread_id;
+  rval = pthread_create (&recv_thread_id, 0, (void *) recv_thread, (void *) &udpdb);
+  if (rval != 0) {
+    syslog(LOG_ERR, "Error creating recv_thread: %s", strerror(rval));
+    return -1;
+  }
+  syslog(LOG_NOTICE, "Created recv thread");
+
+  // start the write thread
+  rval = pthread_create (&write_thread_id, 0, (void *) write_thread, (void *) &udpdb);
+  if (rval != 0) {
+    syslog(LOG_INFO, "Error creating write_thread: %s", strerror(rval));
+    return -1;
+  }
+  syslog(LOG_NOTICE, "started write_thread()");
+
+  while (!quit_threads) {
+    sleep(1);
+  }
+
+  // close threads
+  syslog(LOG_INFO, "joining all threads");
+  quit_threads = 1;
+  void* result=0;
+  pthread_join (control_thread_id, &result);
+  pthread_join (stats_thread_id, &result);
+  pthread_join (recv_thread_id, &result);
+  pthread_join (write_thread_id, &result);
+
+  free(tblock);
+
+  dsaX_dbgpu_cleanup (hdu_out);
+  return EXIT_SUCCESS;
+}
diff --git a/legacy/dsaX_copydb.c b/legacy/dsaX_copydb.c
new file mode 100644
index 0000000..7714038
--- /dev/null
+++ b/legacy/dsaX_copydb.c
@@ -0,0 +1,273 @@
+/* copies each data block from an input DADA buffer into an output DADA buffer */
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+// global variables
+int DEBUG = 0;
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
+int dada_bind_thread_to_core (int core);
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
+{
+  // Release both HDUs: drop the read lock on `in` and the write lock on `out`,
+  // then destroy each HDU. A failed unlock is logged but does not stop the
+  // teardown.
+
+  const int read_unlocked = dada_hdu_unlock_read (in);
+  if (read_unlocked < 0)
+    syslog(LOG_ERR, "could not unlock read on hdu_in");
+  dada_hdu_destroy (in);
+
+  const int write_unlocked = dada_hdu_unlock_write (out);
+  if (write_unlocked < 0)
+    syslog(LOG_ERR, "could not unlock write on hdu_out");
+  dada_hdu_destroy (out);
+}
+
+void usage()
+{
+  // print the command-line help on stdout; the banner now names this program (was "dsaX_fake")
+  fprintf (stdout, "dsaX_copydb [options]\n"
+	   " -c core   bind process to CPU core [no default]\n"
+	   " -d send debug messages to syslog\n"
+	   " -i in_key [default TEST_BLOCK_KEY]\n"
+	   " -o out_key [default REORDER_BLOCK_KEY2]\n"
+	   " -h print usage\n");
+}
+
+// MAIN
+
+int main (int argc, char *argv[]) {
+  // Entry point: copy every data block from the input DADA buffer (-i) to the
+  // output DADA buffer (-o) until a short input block marks the end of data.
+
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_copydb", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+
+  // data block HDU keys
+  key_t in_key = TEST_BLOCK_KEY;
+  key_t out_key = REORDER_BLOCK_KEY2;
+
+  // command line arguments
+  int core = -1;
+  int arg = 0;
+
+  while ((arg=getopt(argc,argv,"c:f:i:o:dh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'i':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &in_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-i flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      else syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+  // DADA stuff
+  syslog (LOG_INFO, "creating in and out hdus");
+
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
+  // block_out bytes are written from every input block, so a larger output block would read past it
+  if (block_out > block_size) {
+    syslog(LOG_ERR, "main: output block size exceeds input block size");
+    dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+    return EXIT_FAILURE;
+  }
+  uint64_t  bytes_read = 0;
+  char * block;
+  uint64_t block_id;
+  ssize_t written;  // signed so that an error return from ipcio_write is detectable
+
+  // set up
+  int observation_complete=0;
+  int blocks = 0, started = 0;
+
+  syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    if (!block) {
+      syslog(LOG_ERR, "main: could not open input data block");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+    }
+
+    // copy the block to the output; a negative result or a short write is fatal
+    written = ipcio_write (hdu_out->data_block, block, block_out);
+    if (written < 0 || (uint64_t) written < block_out)
+      {
+	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+	return EXIT_FAILURE;
+      }
+    if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks);
+    blocks++;
+
+    if (bytes_read < block_size)
+      observation_complete = 1;
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+  }
+
+  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+  return EXIT_SUCCESS;
+}
+
+
diff --git a/legacy/dsaX_cuda_correlator.cu b/legacy/dsaX_cuda_correlator.cu
new file mode 100644
index 0000000..3bebd09
--- /dev/null
+++ b/legacy/dsaX_cuda_correlator.cu
@@ -0,0 +1,309 @@
+// -*- c++ -*-
+/* will run xgpu */
+/* assumes input block size is appropriate */
+#define THRUST_IGNORE_CUB_VERSION_CHECK
+
+#include <iostream>
+#include <algorithm>
+using std::cout;
+using std::cerr;
+using std::endl;
+#include <stdio.h>
+#include <stdlib.h>
+#include <cmath>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <time.h>
+#include <syslog.h>
+#include <pthread.h>
+
+#include <thrust/fill.h>
+#include <thrust/device_vector.h>
+#include <thrust/sequence.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+#include <thrust/scatter.h>
+
+//#include "dada_cuda.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "multilog.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_def.h"
+//#include "cube/cube.h"
+#include "xgpu.h"
+ 
+
+#ifdef __MACH__
+#include <mach/mach_time.h>
+#define CLOCK_REALTIME 0
+#define CLOCK_MONOTONIC 0
+int clock_gettime(int clk_id, struct timespec *t){
+    // macOS shim: clk_id is ignored; the time comes from mach_absolute_time().
+    mach_timebase_info_data_t timebase;
+    mach_timebase_info(&timebase);
+    // Convert ticks to nanoseconds with the numer/denom ratio of the timebase.
+    uint64_t ticks = mach_absolute_time();
+    uint64_t nseconds = (uint64_t)(((double)ticks * (double)timebase.numer)/((double)timebase.denom));
+    t->tv_sec = (time_t)(nseconds / 1000000000ULL);
+    t->tv_nsec = (long)(nseconds % 1000000000ULL); // sub-second remainder only, not the total
+    return 0;
+}
+#else
+#include <time.h>
+#endif
+
+/*
+  Data ordering for input vectors is (running from slowest to fastest)
+  [time][channel][station][polarization][complexity]
+
+  Output matrix has ordering
+  [channel][station][station][polarization][polarization][complexity]
+*/
+
+int main(int argc, char** argv) {
+  // Benchmark driver for the xGPU X-engine: fills the input with random complex
+  // data, times `count` x `outer_count` calls to xgpuCudaXengine() and, in
+  // CUBE_DEFAULT builds with a single dumped call, checks it against the CPU engine.
+  int opt;
+  int i, j;
+  int device = 0;
+  unsigned int seed = 1;
+  int outer_count = 1;
+  int count = 1;
+  int syncOp = SYNCOP_SYNC_TRANSFER;
+  int finalSyncOp = SYNCOP_DUMP;
+  int verbose = 0;
+  int hostAlloc = 0; // set by -r; not otherwise used in this function
+  XGPUInfo xgpu_info;
+  unsigned int npol, nstation, nfrequency;
+  int xgpu_error = 0;
+  Complex *omp_matrix_h = NULL;
+  struct timespec outer_start, start, stop, outer_stop;
+  double total, per_call, max_bw, gbps;
+#ifdef RUNTIME_STATS
+  struct timespec tic, toc;
+#endif
+
+  while ((opt = getopt(argc, argv, "C:c:d:f:ho:rs:v:")) != -1) {
+    switch (opt) {
+      case 'c':
+        // Set number of times to call xgpuCudaXengine per integration
+        count = strtoul(optarg, NULL, 0);
+        if(count < 1) {
+          fprintf(stderr, "count must be positive\n");
+          return 1;
+        }
+        break;
+      case 'C':
+        // Set number of integrations (outer loop iterations)
+        outer_count = strtoul(optarg, NULL, 0);
+        if(outer_count < 1) {
+          fprintf(stderr, "outer count must be positive\n");
+          return 1;
+        }
+        break;
+      case 'd':
+        // Set CUDA device number
+        device = strtoul(optarg, NULL, 0);
+        break;
+      case 'f':
+        // Set syncOp for final call
+        finalSyncOp = strtoul(optarg, NULL, 0);
+        break;
+      case 'o':
+        // Set syncOp
+        syncOp = strtoul(optarg, NULL, 0);
+        break;
+      case 'r':
+        // Register host allocated memory
+        hostAlloc = 1;
+        break;
+      case 's':
+        // Set seed for random data
+        seed = strtoul(optarg, NULL, 0);
+        break;
+      case 'v':
+        // Set verbosity level
+        verbose = strtoul(optarg, NULL, 0);
+        break;
+      default: /* '?' */
+        fprintf(stderr,
+            "Usage: %s [options]\n"
+            "Options:\n"
+            "  -c INTEG_CALLS    Calls to xgpuCudaXengine per integration [1]\n"
+            "  -C INTEG_COUNT    Number of integrations [1]\n"
+            "  -d DEVNUM         GPU device to use [0]\n"
+            "  -f FINAL_SYNCOP   Sync operation for final call [1]\n"
+            "  -o SYNCOP         Sync operation for all but final call [1]\n"
+            "                    Sync operation values are:\n"
+            "                         0 (no sync)\n"
+            "                         1 (sync and dump)\n"
+            "                         2 (sync host to device transfer)\n"
+            "                         3 (sync kernel computations)\n"
+            "  -r                Register host allocated memory [false]\n"
+            "                    (otherwise use CUDA allocated memory)\n"
+            "  -s SEED           Random number seed [1]\n"
+            "  -v {0|1|2|3}      Verbosity level (debug only) [0]\n"
+            "  -h                Show this message\n",
+            argv[0]);
+        exit(EXIT_FAILURE);
+    }
+  }
+
+  srand(seed);
+
+  // Get sizing info from library
+  xgpuInfo(&xgpu_info);
+  npol = xgpu_info.npol;
+  nstation = xgpu_info.nstation;
+  nfrequency = xgpu_info.nfrequency;
+
+  printf("Correlating %u stations with %u channels and integration length %u\n",
+         xgpu_info.nstation, xgpu_info.nfrequency, xgpu_info.ntime);
+#ifndef FIXED_POINT
+  printf("Sending floating point data to GPU.\n");
+#else
+  printf("Sending fixed point data to GPU.\n");
+#endif
+
+  // allocate the GPU X-engine memory
+  XGPUContext context;
+  context.array_len = xgpu_info.vecLength;
+  context.matrix_len = xgpu_info.matLength;
+  context.array_h = NULL;
+  context.matrix_h = NULL;
+
+  xgpu_error = xgpuInit(&context, device);
+  if(xgpu_error) {
+    fprintf(stderr, "xgpuInit returned error code %d\n", xgpu_error);
+    return xgpu_error; // bail out before touching the context's host buffers
+  }
+
+  ComplexInput *array_h = context.array_h; // this is pinned memory
+  Complex *cuda_matrix_h = context.matrix_h;
+
+  // create an array of complex noise
+  xgpuRandomComplex(array_h, xgpu_info.vecLength);
+
+  xgpuSwizzleInput(context.array_h, array_h);
+
+  // try copying to GPU (NULL-initialised so cudaFree() is safe if cudaMalloc fails)
+  ComplexInput *array_hd = NULL;
+  cudaMalloc((void **)&array_hd, context.array_len*sizeof(ComplexInput));
+  cudaMemcpy(array_hd,context.array_h,context.array_len*sizeof(ComplexInput),cudaMemcpyHostToDevice);
+
+  // ompXengine always uses TRIANGULAR_ORDER
+  unsigned int ompMatLength = nfrequency * ((nstation+1)*(nstation/2)*npol*npol);
+  omp_matrix_h = (Complex *) malloc(ompMatLength*sizeof(Complex));
+  if(!omp_matrix_h) {
+    fprintf(stderr, "error allocating output buffer for xgpuOmpXengine\n");
+    xgpu_error = EXIT_FAILURE; // otherwise the allocation failure would exit with status 0
+    goto cleanup;
+  }
+
+#if (CUBE_MODE == CUBE_DEFAULT && !defined(POWER_LOOP) )
+  // Only call CPU X engine if dumping GPU X engine exactly once
+  if(finalSyncOp == SYNCOP_DUMP && count*outer_count == 1) {
+    printf("Calling CPU X-Engine\n");
+    xgpuOmpXengine(omp_matrix_h, array_h);
+  }
+#endif
+
+#define ELAPSED_MS(start,stop) \
+  ((((int64_t)stop.tv_sec-start.tv_sec)*1000*1000*1000+(stop.tv_nsec-start.tv_nsec))/1e6)
+
+  printf("Calling GPU X-Engine\n");
+  clock_gettime(CLOCK_MONOTONIC, &outer_start);
+  for(j=0; j<outer_count; j++) {
+    clock_gettime(CLOCK_MONOTONIC, &start);
+    for(i=0; i<count; i++) {
+#ifdef RUNTIME_STATS
+      clock_gettime(CLOCK_MONOTONIC, &tic);
+#endif
+      xgpu_error = xgpuCudaXengine(&context, i==count-1 ? finalSyncOp : syncOp);
+#ifdef RUNTIME_STATS
+      clock_gettime(CLOCK_MONOTONIC, &toc);
+#endif
+      if(xgpu_error) {
+        fprintf(stderr, "xgpuCudaXengine returned error code %d\n", xgpu_error);
+        goto cleanup;
+      }
+#ifdef RUNTIME_STATS
+      fprintf(stderr, "%11.6f  %11.6f ms%s\n",
+          ELAPSED_MS(start,tic), ELAPSED_MS(tic,toc),
+          i==count-1 ? " final" : "");
+#endif
+    }
+    clock_gettime(CLOCK_MONOTONIC, &stop);
+    total = ELAPSED_MS(start,stop);
+    per_call = total/count;
+    // per_spectrum = per_call / NTIME
+    // per_channel = per_spectrum / NFREQUENCY
+    //             = per_call / (NTIME * NFREQUENCY)
+    // max_bw (kHz)  = 1 / per_channel = (NTIME * NFREQUENCY) / per_call
+    max_bw = xgpu_info.ntime*xgpu_info.nfrequency/per_call/1000; // MHz
+    gbps = ((float)(8 * context.array_len * sizeof(ComplexInput) * count)) / total / 1e6; // Gbps
+    printf("Elapsed time %.6f ms total, %.6f ms/call average\n",
+        total, per_call);
+    printf("Theoretical BW_max %.3f MHz, throughput %.3f Gbps\n",
+        max_bw, gbps);
+  }
+  if(outer_count > 1) {
+    clock_gettime(CLOCK_MONOTONIC, &outer_stop);
+    total = ELAPSED_MS(outer_start,outer_stop);
+    per_call = total/(count*outer_count);
+    // per_spectrum = per_call / NTIME
+    // per_channel = per_spectrum / NFREQUENCY
+    //             = per_call / (NTIME * NFREQUENCY)
+    // max_bw (kHz)  = 1 / per_channel = (NTIME * NFREQUENCY) / per_call
+    max_bw = xgpu_info.ntime*xgpu_info.nfrequency/per_call/1000; // MHz
+    gbps = ((float)(8 * context.array_len * sizeof(ComplexInput) * count * outer_count)) / total / 1e6; // Gbps
+    printf("Elapsed time %.6f ms total, %.6f ms/call average\n",
+        total, per_call);
+    printf("Theoretical BW_max %.3f MHz, throughput %.3f Gbps\n",
+        max_bw, gbps);
+  }
+
+#if (CUBE_MODE == CUBE_DEFAULT)
+
+  // Only compare CPU and GPU X engines if dumping GPU X engine exactly once
+  if(finalSyncOp == SYNCOP_DUMP && count*outer_count == 1) {
+    xgpuReorderMatrix(cuda_matrix_h);
+    xgpuCheckResult(cuda_matrix_h, omp_matrix_h, verbose, array_h);
+  }
+
+#if 0
+  int fullMatLength = nfrequency * nstation*nstation*npol*npol;
+  Complex *full_matrix_h = (Complex *) malloc(fullMatLength*sizeof(Complex));
+
+  // convert from packed triangular to full matrix
+  xgpuExtractMatrix(full_matrix_h, cuda_matrix_h);
+
+  free(full_matrix_h);
+#endif
+#endif
+
+cleanup:
+  // free host memory
+  free(omp_matrix_h);
+
+  // free gpu memory
+  xgpuFree(&context);
+  cudaFree(array_hd);
+
+  // array_h and cuda_matrix_h only alias context.array_h / context.matrix_h,
+  // which belong to the xGPU context handed to xgpuFree() above. They are
+  // deliberately not passed to free(): that would release pinned, context-owned
+  // memory a second time through the wrong allocator.
+
+  return xgpu_error;
+}
diff --git a/legacy/dsaX_cutlass_interface.cu b/legacy/dsaX_cutlass_interface.cu
new file mode 100644
index 0000000..fc68d55
--- /dev/null
+++ b/legacy/dsaX_cutlass_interface.cu
@@ -0,0 +1,315 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#include "dsaX_cutlass_interface.h"
+
+DSA_FTD_ComplexGEMM_CUTLASS::DSA_FTD_ComplexGEMM_CUTLASS(Options const &options)
+  : problem_size(options.problem_size), batch_count(options.batch_count) {
+  // Element counts per operand: rows * cols * batch_count * 2, evaluated in 64 bit.
+  int64_t const elems_mk = int64_t(problem_size.m()) * problem_size.k() * batch_count * 2;
+  int64_t const elems_kn = int64_t(problem_size.k()) * problem_size.n() * batch_count * 2;
+  int64_t const elems_mn = int64_t(problem_size.m()) * problem_size.n() * batch_count * 2;
+
+  // Device memory for the batched planar complex GEMM operands and results
+  tensor_A.reset(elems_mk);
+  tensor_B.reset(elems_kn);
+  tensor_C.reset(elems_mn);
+  tensor_D.reset(elems_mn);
+  tensor_D_ref.reset(elems_mn);
+  // Pointer tables with batch_count entries for each real and imaginary part
+  ptr_A_real.reset(batch_count);  ptr_A_imag.reset(batch_count);
+  ptr_B_real.reset(batch_count);  ptr_B_imag.reset(batch_count);
+  ptr_C_real.reset(batch_count);  ptr_C_imag.reset(batch_count);
+  ptr_D_real.reset(batch_count);  ptr_D_imag.reset(batch_count);
+}
+
+// DMH: Replace this with data from DSA-FTD
+void DSA_FTD_ComplexGEMM_CUTLASS::initialize() {
+  // Elements in one real (or imaginary) plane of each operand, widened to 64 bit.
+  int64_t const plane_mk = int64_t(problem_size.m()) * problem_size.k();
+  int64_t const plane_kn = int64_t(problem_size.k()) * problem_size.n();
+  int64_t const plane_mn = int64_t(problem_size.m()) * problem_size.n();
+
+  if (!testing) {
+    // DMH: construct DSA-FTD interface data transfer interface
+  } else {
+    // Fill A, B and C with small integers to simplify correctness checking.
+    uint64_t base_seed = 1234;
+    int fill_max = 6;
+    int fill_min = -6;
+
+    BlockFillRandomUniform(tensor_A.get(), tensor_A.size(), base_seed, Element(fill_max), Element(fill_min), 0);
+    BlockFillRandomUniform(tensor_B.get(), tensor_B.size(), base_seed * 2019, Element(fill_max), Element(fill_min), 0);
+    BlockFillRandomUniform(tensor_C.get(), tensor_C.size(), base_seed * 2020, Element(fill_max), Element(fill_min), 0);
+  }
+
+  // Base addresses of the operand storage.
+  ptr_A = tensor_A.get();
+  ptr_B = tensor_B.get();
+  ptr_C = tensor_C.get();
+  ptr_D = tensor_D.get();
+
+  // Leading dimensions of the packed layouts; D is described with C's layout type.
+  lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
+  ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
+  ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+  ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+
+  // The imaginary stride is one plane; a batch entry spans two planes.
+  imag_stride_A = plane_mk;  batch_stride_A = plane_mk * 2;
+  imag_stride_B = plane_kn;  batch_stride_B = plane_kn * 2;
+  imag_stride_C = plane_mn;  batch_stride_C = plane_mn * 2;
+  imag_stride_D = plane_mn;  batch_stride_D = plane_mn * 2;
+}
+
+Result DSA_FTD_ComplexGEMM_CUTLASS::run(Options const &options) {
+  // Runs the batched planar complex GEMM options.iterations times, timing it with CUDA events.
+  Result result;
+
+  initialize();
+
+  // Configure pointers in global memory
+  struct {
+    Element *base;
+    void **ptr_real;
+    void **ptr_imag;
+    int64_t batch_stride;
+    int64_t imag_stride;
+  } tensors[] = {{ tensor_A.get(), ptr_A_real.get(), ptr_A_imag.get(), batch_stride_A, imag_stride_A},
+                 { tensor_B.get(), ptr_B_real.get(), ptr_B_imag.get(), batch_stride_B, imag_stride_B},
+                 { tensor_C.get(), ptr_C_real.get(), ptr_C_imag.get(), batch_stride_C, imag_stride_C},
+                 { tensor_D.get(), ptr_D_real.get(), ptr_D_imag.get(), batch_stride_D, imag_stride_D}};
+
+  for (auto const &tensor : tensors) {
+    for (int idx = 0; idx < batch_count; ++idx) {
+      // Real plane of batch entry idx, followed imag_stride elements later by its imaginary plane.
+      cudaError_t error;
+      void *ptr_real = tensor.base + idx * tensor.batch_stride;
+      void *ptr_imag = tensor.base + idx * tensor.batch_stride + tensor.imag_stride;
+
+      error = cudaMemcpy(tensor.ptr_real + idx, &ptr_real, sizeof(void *), cudaMemcpyHostToDevice);
+      if (error != cudaSuccess) throw std::runtime_error("Failed to copy pointer to device memory");
+
+      error = cudaMemcpy(tensor.ptr_imag + idx, &ptr_imag, sizeof(void *), cudaMemcpyHostToDevice);
+      if (error != cudaSuccess) throw std::runtime_error("Failed to copy pointer to device memory");
+
+    }
+  }
+
+  // Events bracketing the GEMM launches; events[0] marks the start, events[1] the end.
+  cudaEvent_t events[2];
+  for (auto & event : events) {
+    result.error = cudaEventCreate(&event);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result; // propagate result.error, as every other failure path here does
+    }
+  }
+
+  // Record an event at the start of a series of GEMM operations
+  result.error = cudaEventRecord(events[0]);
+  if (result.error != cudaSuccess) {
+    std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+    return result;
+  }
+
+  // Run profiling loop
+  //-------------------
+  // Execute the planar complex array GEMM kernel via the CUTLASS Library's
+  // dispatch routines.
+  //
+  // Note, for planar complex array GEMM kernels, all numeric type arguments 
+  // specify the data type of the base real types. These are understood to
+  // apply to planar complex representations of matrices in memory and to complex<T>
+  // structures for scalars.
+  //
+  // See tools/library/include/cutlass/library/handle.h for more details.
+  //
+  for (int iter = 0; iter < options.iterations; ++iter) {
+
+    result.status = handle.gemm_planar_complex_array(
+        problem_size.m(),                                 // expected GEMM M dimension
+        problem_size.n(),                                 // expected GEMM N dimension
+        problem_size.k(),                                 // expected GEMM K dimension
+        batch_count,                                      // Number of batched elements
+
+        nullptr,
+        nullptr,
+        nullptr,
+
+        cutlass::library::NumericTypeID::kF32,            // Base data type of complex-valued accumulation
+        cutlass::library::NumericTypeID::kF32,            // Base data type of complex-valued alpha/beta scalars
+
+        &options.alpha,                                   // Pointer to alpha scalar, of type complex<T>
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex-valued A matrix
+        cutlass::library::LayoutTypeID::kColumnMajor,     // Layout of A matrix
+        cutlass::library::ComplexTransform::kConjugate,   // Complex transformation on A matrix operand
+
+        ptr_A_real.get(),                                 // Pointer to array of pointers to real part of A matrix
+        ptr_A_imag.get(),                                 // Pointer to array of pointers to imaginary part of A matrix
+
+        lda,                                              // Leading dimension of real part of A matrix
+        lda,                                              // Leading dimension of imaginary part of A matrix
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex-valued B matrix
+        cutlass::library::LayoutTypeID::kColumnMajor,     // Layout of B matrix
+        cutlass::library::ComplexTransform::kNone,        // Complex transformation on B matrix operand
+
+        ptr_B_real.get(),                                 // Pointer to array of pointers to real part of B matrix
+        ptr_B_imag.get(),                                 // Pointer to array of pointers to imaginary part of B matrix
+
+        ldb,                                              // Leading dimension of real part of B matrix
+        ldb,                                              // Leading dimension of imaginary part of B matrix
+
+        &options.beta,                                    // Pointer to beta scalar, of type complex<T>
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex valued C and D matrices
+
+        ptr_C_real.get(),                                 // Pointer to array of pointers to real part of C matrix
+        ptr_C_imag.get(),                                 // Pointer to array of pointers to imaginary part of C matrix
+
+        ldc,                                              // Leading dimension of real part of C matrix
+        ldc,                                              // Leading dimension of imaginary part of C matrix
+
+        ptr_D_real.get(),                                 // Pointer to array of pointers to real part of D matrix
+        ptr_D_imag.get(),                                 // Pointer to array of pointers to imaginary part of D matrix
+
+        ldd,                                              // Leading dimension of real part of D matrix
+        ldd                                               // Leading dimension of imaginary part of D matrix
+        );
+
+    if (result.status != cutlass::Status::kSuccess) {
+      std::cerr << "CUTLASS internal error - configuration not supported" << std::endl;
+      return result;
+    }
+  }
+
+  // Record an event when the GEMM operations have been launched.
+  result.error = cudaEventRecord(events[1]);
+  if (result.error != cudaSuccess) {
+    std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+    return result;
+  }
+
+  // Wait for work on the device to complete.
+  result.error = cudaEventSynchronize(events[1]);
+  if (result.error != cudaSuccess) {
+    std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
+    return result;
+  }
+
+  // Measure elapsed runtime
+  float runtime_ms = 0;
+  result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+  if (result.error != cudaSuccess) {
+    std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
+    return result;
+  }
+
+  // Compute average runtime and GFLOPs.
+  result.runtime_ms = double(runtime_ms) / double(options.iterations);
+  result.gflops = options.gflops(result.runtime_ms / 1000.0);
+
+  // Cleanup
+  for (auto event : events) {
+    (void)cudaEventDestroy(event);
+  }
+
+  if (handle.get_last_operation()) {
+    std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl;
+  }
+
+  // Compute reference in device code
+  if (options.reference_check) {
+
+    result.passed = true;
+    // Stop at the first batch entry whose result differs from the reference.
+    for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) {
+      // Define the GEMM through templates
+      GemmPlanarComplex<Element, LayoutA, Element, LayoutB, Element, LayoutC, ElementAccumulator>
+        (problem_size, options.alpha,
+         {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A},
+         cutlass::ComplexTransform::kConjugate,
+         {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B},
+         cutlass::ComplexTransform::kNone,
+         options.beta,
+         {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C},
+         {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D}
+         );
+
+      Element epsilon = 0.1_hf;
+      Element nonzero_floor = 0.1_hf;
+
+      result.passed = BlockCompareRelativelyEqual
+        (
+         tensor_D.get() + idx * batch_stride_D,
+         tensor_D_ref.get() + idx * batch_stride_D,
+         batch_stride_D,
+         epsilon,
+         nonzero_floor
+         );
+    }
+
+    if (result.passed) std::cout << "Reference check passed." << std::endl;
+    else std::cerr << "Error - reference check failed." << std::endl;
+  }
+
+  std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
+  std::cout << " GFLOPs: " << result.gflops << std::endl;
+
+  return result;
+}
+
+ int main(int argc, char const **args) {
+  // Query device 0 up front so an unusable GPU is reported before any work is done.
+  cudaDeviceProp device_props;
+  cudaError_t query_status = cudaGetDeviceProperties(&device_props, 0);
+  if (query_status != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(query_status) << std::endl;
+    return -1;
+  }
+  // Parse the command line; a help request only prints the usage text.
+  Options options;
+  options.parse(argc, args);
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  // Compute GEMM with the object's testing flag enabled
+  DSA_FTD_ComplexGEMM_CUTLASS gemm(options);
+  gemm.testing = true;
+  Result result = gemm.run(options);
+  if (result.passed) return 0;
+  return -1;
+}
+
diff --git a/legacy/dsaX_cutlass_interface.cu~ b/legacy/dsaX_cutlass_interface.cu~
new file mode 100644
index 0000000..a51d5a2
--- /dev/null
+++ b/legacy/dsaX_cutlass_interface.cu~
@@ -0,0 +1,315 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#include "dsaX_cutlass_interface.h"
+
+DSA_FTD_ComplexGEMM_CUTLASS::DSA_FTD_ComplexGEMM_CUTLASS(Options const &options)
+  : problem_size(options.problem_size), batch_count(options.batch_count) {
+  // Element counts per operand: rows * cols * batch_count * 2, evaluated in 64 bit.
+  int64_t const elems_mk = int64_t(problem_size.m()) * problem_size.k() * batch_count * 2;
+  int64_t const elems_kn = int64_t(problem_size.k()) * problem_size.n() * batch_count * 2;
+  int64_t const elems_mn = int64_t(problem_size.m()) * problem_size.n() * batch_count * 2;
+
+  // Device memory for the batched planar complex GEMM operands and results
+  tensor_A.reset(elems_mk);
+  tensor_B.reset(elems_kn);
+  tensor_C.reset(elems_mn);
+  tensor_D.reset(elems_mn);
+  tensor_D_ref.reset(elems_mn);
+  // Pointer tables with batch_count entries for each real and imaginary part
+  ptr_A_real.reset(batch_count);  ptr_A_imag.reset(batch_count);
+  ptr_B_real.reset(batch_count);  ptr_B_imag.reset(batch_count);
+  ptr_C_real.reset(batch_count);  ptr_C_imag.reset(batch_count);
+  ptr_D_real.reset(batch_count);  ptr_D_imag.reset(batch_count);
+}
+
+// DMH: Replace this with data from DSA-FTD
+void DSA_FTD_ComplexGEMM_CUTLASS::initialize() {
+  // Elements in one real (or imaginary) plane of each operand, widened to 64 bit.
+  int64_t const plane_mk = int64_t(problem_size.m()) * problem_size.k();
+  int64_t const plane_kn = int64_t(problem_size.k()) * problem_size.n();
+  int64_t const plane_mn = int64_t(problem_size.m()) * problem_size.n();
+
+  if (!testing) {
+    // DMH: construct DSA-FTD interface data transfer interface
+  } else {
+    // Fill A, B and C with small integers to simplify correctness checking.
+    uint64_t base_seed = 1234;
+    int fill_max = 6;
+    int fill_min = -6;
+
+    BlockFillRandomUniform(tensor_A.get(), tensor_A.size(), base_seed, Element(fill_max), Element(fill_min), 0);
+    BlockFillRandomUniform(tensor_B.get(), tensor_B.size(), base_seed * 2019, Element(fill_max), Element(fill_min), 0);
+    BlockFillRandomUniform(tensor_C.get(), tensor_C.size(), base_seed * 2020, Element(fill_max), Element(fill_min), 0);
+  }
+
+  // Base addresses of the operand storage.
+  ptr_A = tensor_A.get();
+  ptr_B = tensor_B.get();
+  ptr_C = tensor_C.get();
+  ptr_D = tensor_D.get();
+
+  // Leading dimensions of the packed layouts; D is described with C's layout type.
+  lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
+  ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
+  ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+  ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+
+  // The imaginary stride is one plane; a batch entry spans two planes.
+  imag_stride_A = plane_mk;  batch_stride_A = plane_mk * 2;
+  imag_stride_B = plane_kn;  batch_stride_B = plane_kn * 2;
+  imag_stride_C = plane_mn;  batch_stride_C = plane_mn * 2;
+  imag_stride_D = plane_mn;  batch_stride_D = plane_mn * 2;
+}
+
+Result DSA_FTD_ComplexGEMM_CUTLASS::run(Options const &options) {
+  
+  Result result;
+  
+  initialize();  
+
+  // Configure pointers in global memory
+  struct {
+    Element *base;
+    void **ptr_real;
+    void **ptr_imag;
+    int64_t batch_stride;
+    int64_t imag_stride;
+  } tensors[] = {{ tensor_A.get(), ptr_A_real.get(), ptr_A_imag.get(), batch_stride_A, imag_stride_A},
+		 { tensor_B.get(), ptr_B_real.get(), ptr_B_imag.get(), batch_stride_B, imag_stride_B},
+		 { tensor_C.get(), ptr_C_real.get(), ptr_C_imag.get(), batch_stride_C, imag_stride_C},
+		 { tensor_D.get(), ptr_D_real.get(), ptr_D_imag.get(), batch_stride_D, imag_stride_D}};
+  
+  for (auto const &tensor : tensors) {
+    for (int idx = 0; idx < batch_count; ++idx) {
+      
+      cudaError_t error;
+      void *ptr_real = tensor.base + idx * tensor.batch_stride;
+      void *ptr_imag = tensor.base + idx * tensor.batch_stride + tensor.imag_stride;      
+      
+      error = cudaMemcpy(tensor.ptr_real + idx, &ptr_real, sizeof(void *), cudaMemcpyHostToDevice);
+      if (error != cudaSuccess) throw std::runtime_error("Failed to copy pointer to device memory");
+      
+      error = cudaMemcpy(tensor.ptr_imag + idx, &ptr_imag, sizeof(void *), cudaMemcpyHostToDevice);
+      if (error != cudaSuccess) throw std::runtime_error("Failed to copy pointer to device memory");
+      
+    }
+  }
+
+  
+  cudaEvent_t events[2];  
+  for (auto & event : events) {
+    result.error = cudaEventCreate(&event);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return -1;
+    }
+  }
+  
+  // Record an event at the start of a series of GEMM operations
+  result.error = cudaEventRecord(events[0]);
+  if (result.error != cudaSuccess) {
+    std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+    return result;
+  }
+
+  // Run profiling loop
+  //-------------------
+  // Execute the planar complex array GEMM kernel via the CUTLASS Library's
+  // dispatch routines.
+  //
+  // Note, for planar complex array GEMM kernels, all numeric type arguments 
+  // specify the data type of the base real types. These are understood to
+  // apply to planar complex representations of matrices in memory and to complex<T>
+  // structures for scalars.
+  //
+  // See tools/library/include/cutlass/library/handle.h for more details.
+  //
+  for (int iter = 0; iter < options.iterations; ++iter) {
+    
+    result.status = handle.gemm_planar_complex_array(
+	problem_size.m(),                                 // expected GEMM M dimension
+	problem_size.n(),                                 // expected GEMM N dimension
+	problem_size.k(),                                 // expected GEMM K dimension
+	batch_count,                                      // Number of batched elements
+
+        nullptr,
+        nullptr,
+        nullptr,
+
+        cutlass::library::NumericTypeID::kF32,            // Base data type of complex-valued accumulation
+        cutlass::library::NumericTypeID::kF32,            // Base data type of complex-valued alpha/beta scalars
+
+        &options.alpha,                                   // Pointer to alpha scalar, of type complex<T>
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex-valued A matrix
+        cutlass::library::LayoutTypeID::kColumnMajor,     // Layout of A matrix
+        cutlass::library::ComplexTransform::kConjugate,   // Complex transformation on A matrix operand
+
+        ptr_A_real.get(),                                 // Pointer to array of pointers to real part of A matrix
+        ptr_A_imag.get(),                                 // Pointer to array of pointers to imaginary part of A matrix
+
+        lda,                                              // Leading dimension of real part of A matrix
+        lda,                                              // Leading dimension of imaginary part of A matrix
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex-valued B matrix
+        cutlass::library::LayoutTypeID::kColumnMajor,     // Layout of B matrix
+        cutlass::library::ComplexTransform::kNone,        // Complex transformation on B matrix operand
+
+        ptr_B_real.get(),                                 // Pointer to array of pointers to real part of B matrix
+        ptr_B_imag.get(),                                 // Pointer to array of pointers to imaginary part of B matrix
+
+        ldb,                                              // Leading dimension of real part of B matrix
+        ldb,                                              // Leading dimension of imaginary part of B matrix
+
+        &options.beta,                                    // Pointer to beta scalar, of type complex<T>
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex valued C and D matrices
+
+        ptr_C_real.get(),                                 // Pointer to array of pointers to real part of C matrix
+        ptr_C_imag.get(),                                 // Pointer to array of pointers to imaginary part of C matrix
+
+        ldc,                                              // Leading dimension of real part of C matrix
+        ldc,                                              // Leading dimension of imaginary part of C matrix
+
+        ptr_D_real.get(),                                 // Pointer to array of pointers to real part of D matrix
+        ptr_D_imag.get(),                                 // Pointer to array of pointers to imaginary part of D matrix
+
+        ldd,                                              // Leading dimension of real part of D matrix
+        ldd                                               // Leading dimension of imaginary part of D matrix
+						     );
+    
+    if (result.status != cutlass::Status::kSuccess) {
+      std::cerr << "CUTLASS internal error - configuration not supported" << std::endl;
+      return result;
+    }
+  }
+  
+  // Record an event when the GEMM operations have been launched.
+  result.error = cudaEventRecord(events[1]);
+  if (result.error != cudaSuccess) {
+    std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+    return result;
+  }
+  
+  // Wait for work on the device to complete.
+  result.error = cudaEventSynchronize(events[1]);
+  if (result.error != cudaSuccess) {
+    std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
+    return result;
+  }
+  
+  // Measure elapsed runtime
+  float runtime_ms = 0;
+  result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+  if (result.error != cudaSuccess) {
+    std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
+    return result;
+  }
+  
+  // Compute average runtime and GFLOPs.
+  result.runtime_ms = double(runtime_ms) / double(options.iterations);
+  result.gflops = options.gflops(result.runtime_ms / 1000.0);
+  
+  // Cleanup
+  for (auto event : events) {
+    (void)cudaEventDestroy(event);
+  }
+  
+  if (handle.get_last_operation()) {
+    std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl;
+  }
+
+  // Compute reference in device code
+  if (options.reference_check) {
+    
+    result.passed = true;
+    
+    for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) {
+      // Define the GEMM through templates
+      GemmPlanarComplex<Element, LayoutA, Element, LayoutB, Element, LayoutC, ElementAccumulator>
+	(problem_size, options.alpha,
+	 {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A},
+	 cutlass::ComplexTransform::kConjugate,
+	 {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B},
+	 cutlass::ComplexTransform::kNone,
+	 options.beta,
+	 {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C},
+	 {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D}
+	 );
+      
+      Element epsilon = 0.1_hf;
+      Element nonzero_floor = 0.1_hf;
+      
+      result.passed = BlockCompareRelativelyEqual
+	(
+	 tensor_D.get() + idx * batch_stride_D,
+	 tensor_D_ref.get() + idx * batch_stride_D,
+	 batch_stride_D,
+	 epsilon,
+	 nonzero_floor
+	 );
+    }
+    
+    if (result.passed) std::cout << "Reference check passed." << std::endl;
+    else std::cerr << "Error - reference check failed." << std::endl;
+  }
+  
+  std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
+  std::cout << " GFLOPs: " << result.gflops << std::endl;
+  
+  return result;
+}
+
+ int main(int argc, char const **args) {
+  // Query device 0 first; any CUDA error here ends the program with -1.
+  cudaDeviceProp device_props;
+  cudaError_t const query_status = cudaGetDeviceProperties(&device_props, 0);
+  if (cudaSuccess != query_status) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(query_status) << std::endl;
+    return -1;
+  }
+  // Start from the Options() defaults and apply any command-line overrides.
+  Options options;
+  options.parse(argc, args);
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  // Compute GEMM
+  testing = true;
+  DSA_FTD_ComplexGEMM_CUTLASS gemm(options);
+  Result const result = gemm.run(options);
+  if (result.passed) return 0;
+  return -1;
+}
+
diff --git a/legacy/dsaX_cutlass_interface.h b/legacy/dsaX_cutlass_interface.h
new file mode 100644
index 0000000..5aa753e
--- /dev/null
+++ b/legacy/dsaX_cutlass_interface.h
@@ -0,0 +1,172 @@
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/host_tensor_planar_complex.h"
+#include "cutlass/util/reference/device/tensor_fill.h"
+#include "cutlass/util/reference/device/gemm_planar_complex.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/library/handle.h"
+
+using namespace cutlass;
+using namespace gemm;
+using namespace library;
+using namespace layout;
+using namespace reference;
+using namespace device;
+
+// Result structure
+struct Result {
+  double runtime_ms;   // Average time per GEMM iteration, in milliseconds
+  double gflops;       // Throughput figure derived from runtime_ms
+  Status status;       // CUTLASS status of the most recent GEMM dispatch
+  cudaError_t error;   // Most recent CUDA runtime error code
+  bool passed;         // Reference-check outcome; starts out true
+
+  // Every argument is optional; `passed` is always initialised to true.
+  Result(double runtime_ms_ = 0, double gflops_ = 0, Status status_ = Status::kSuccess, cudaError_t error_ = cudaSuccess)
+    : runtime_ms(runtime_ms_), gflops(gflops_), status(status_), error(error_), passed(true) {}
+};
+
+// Command line options parsing (testing)
+struct Options {
+  // Declared in the same order the constructor initialises them.
+  bool help;               // set by --help
+  GemmCoord problem_size;  // GEMM extents; set by --m, --n, --k
+  int batch_count;         // set by --batch
+  complex<float> alpha;    // set by --alpha (real) and --alpha_i (imaginary)
+  complex<float> beta;     // set by --beta (real) and --beta_i (imaginary)
+  bool reference_check;    // defaults to false; parse() has no flag for it
+  int iterations;          // set by --iterations
+
+  Options():
+    help(false),
+    problem_size({1024, 1024, 1024}),
+    batch_count(1),
+    alpha(1),
+    beta(),
+    reference_check(false),
+    iterations(20) { }
+
+  // Parses the command line
+  void parse(int argc, char const **args) {
+
+    CommandLine cmd(argc, args);
+    if (cmd.check_cmd_line_flag("help")) {
+      help = true;
+    }
+
+    cmd.get_cmd_line_argument("m", problem_size.m());
+    cmd.get_cmd_line_argument("n", problem_size.n());
+    cmd.get_cmd_line_argument("k", problem_size.k());
+    cmd.get_cmd_line_argument("batch", batch_count);
+
+    cmd.get_cmd_line_argument("alpha", alpha.real());
+    cmd.get_cmd_line_argument("alpha_i", alpha.imag());
+    cmd.get_cmd_line_argument("beta", beta.real());
+    cmd.get_cmd_line_argument("beta_i", beta.imag());
+
+    cmd.get_cmd_line_argument("iterations", iterations);
+  }
+
+  /// Prints the usage statement.
+  std::ostream & print_usage(std::ostream &out) const {
+
+    out << "dsaX_cutlass_interface\n\n"
+	<< "  This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n"
+	<< "Options:\n\n"
+	<< "  --help                      If specified, displays this usage statement.\n\n"
+	<< "  --m=<int>                   GEMM M dimension\n"
+	<< "  --n=<int>                   GEMM N dimension\n"
+	<< "  --k=<int>                   GEMM K dimension\n"
+	<< "  --batch=<int>               Number of GEMM operations executed in one batch\n"
+	<< "  --alpha=<f32>               Epilogue scalar alpha (real part)\n"
+	<< "  --alpha_i=<f32>             Epilogue scalar alpha (imaginary part)\n"
+	<< "  --beta=<f32>                Epilogue scalar beta (real part)\n\n"
+	<< "  --beta_i=<f32>              Epilogue scalar beta (imaginary part)\n\n"
+	<< "  --iterations=<int>          Number of profiling iterations to perform.\n";
+
+    return out;
+  }
+
+  /// Compute performance in GFLOP/s
+  double gflops(double runtime_s) const {
+    // Number of real-valued multiply-adds; each factor is widened to 64 bits
+    // before multiplying so large problems cannot overflow a 32-bit int.
+    int64_t fmas = int64_t(problem_size.m()) * int64_t(problem_size.n()) * int64_t(problem_size.k()) * int64_t(batch_count) * 4;
+
+    // Two flops per multiply-add
+    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
+  }
+};
+
+/// Performance test environment for planar complex
+class DSA_FTD_ComplexGEMM_CUTLASS {
+
+  // Half-precision input and output
+  using Element = half_t;
+  
+  // Configurations for layouts and internal computation
+  using LayoutA = ColumnMajor;
+  using LayoutB = ColumnMajor;
+  using LayoutC = ColumnMajor;
+  using ElementCompute = float;
+  using ElementAccumulator = float;
+  // CUTLASS library handle; run() dispatches gemm_planar_complex_array through it
+  Handle handle;
+  // Problem extents and batch count, followed by one allocation per operand
+  GemmCoord problem_size;
+  int batch_count;
+  DeviceAllocation<Element> tensor_A;
+  DeviceAllocation<Element> tensor_B;
+  DeviceAllocation<Element> tensor_C;
+  DeviceAllocation<Element> tensor_D;
+  DeviceAllocation<Element> tensor_D_ref;  // reference result that run() compares tensor_D against
+  // Arrays of pointers to the real / imaginary parts of each operand, as passed to the GEMM call
+  DeviceAllocation<void *> ptr_A_real;
+  DeviceAllocation<void *> ptr_A_imag;
+  DeviceAllocation<void *> ptr_B_real;
+  DeviceAllocation<void *> ptr_B_imag;
+  DeviceAllocation<void *> ptr_C_real;
+  DeviceAllocation<void *> ptr_C_imag;
+  DeviceAllocation<void *> ptr_D_real;
+  DeviceAllocation<void *> ptr_D_imag;
+  // NOTE(review): raw operand pointers; their use is not visible from this header - confirm
+  Element *ptr_A;
+  Element *ptr_B;
+  Element *ptr_C;
+  Element *ptr_D;
+  // Element offset between consecutive batch entries (run() uses tensor_X.get() + idx * batch_stride_X)
+  int64_t batch_stride_A;
+  int64_t batch_stride_B;
+  int64_t batch_stride_C;
+  int64_t batch_stride_D;
+  // Leading dimensions; run() passes the same value for the real and imaginary parts
+  typename LayoutA::Stride::Index lda;
+  typename LayoutB::Stride::Index ldb;
+  typename LayoutC::Stride::Index ldc;
+  typename LayoutC::Stride::Index ldd;
+  // Presumably the element offset from the real part to the imaginary part - TODO confirm
+  int64_t imag_stride_A;
+  int64_t imag_stride_B;
+  int64_t imag_stride_C;
+  int64_t imag_stride_D;
+  
+public:  
+  // Constructors
+  DSA_FTD_ComplexGEMM_CUTLASS(Options const &options);
+  DSA_FTD_ComplexGEMM_CUTLASS();
+  
+  // Methods
+  void initialize();  
+  Result run(Options const &options);
+  // NOTE(review): public flag; what it switches is not visible from this header - confirm
+  bool testing;  
+};
+  
diff --git a/legacy/dsaX_cutlass_interface.h~ b/legacy/dsaX_cutlass_interface.h~
new file mode 100644
index 0000000..42a3e8a
--- /dev/null
+++ b/legacy/dsaX_cutlass_interface.h~
@@ -0,0 +1,174 @@
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/host_tensor_planar_complex.h"
+#include "cutlass/util/reference/device/tensor_fill.h"
+#include "cutlass/util/reference/device/gemm_planar_complex.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/library/handle.h"
+
+using namespace cutlass;
+using namespace gemm;
+using namespace library;
+using namespace layout;
+using namespace reference;
+using namespace device;
+
+// Result structure
+struct Result {
+  double runtime_ms;   // Average time per GEMM iteration, in milliseconds
+  double gflops;       // Throughput figure derived from runtime_ms
+  Status status;       // CUTLASS status of the most recent GEMM dispatch
+  cudaError_t error;   // Most recent CUDA runtime error code
+  bool passed;         // Reference-check outcome; starts out true
+
+  // Every argument is optional; `passed` is always initialised to true.
+  Result(double runtime_ms_ = 0, double gflops_ = 0, Status status_ = Status::kSuccess, cudaError_t error_ = cudaSuccess)
+    : runtime_ms(runtime_ms_), gflops(gflops_), status(status_), error(error_), passed(true) {}
+};
+
+// Command line options parsing (testing)
+struct Options {
+  // Declared in the same order the constructor initialises them.
+  bool help;               // set by --help
+  GemmCoord problem_size;  // GEMM extents; set by --m, --n, --k
+  int batch_count;         // set by --batch
+  complex<float> alpha;    // set by --alpha (real) and --alpha_i (imaginary)
+  complex<float> beta;     // set by --beta (real) and --beta_i (imaginary)
+  bool reference_check;    // defaults to false; parse() has no flag for it
+  int iterations;          // set by --iterations
+
+  Options():
+    help(false),
+    problem_size({1024, 1024, 1024}),
+    batch_count(1),
+    alpha(1),
+    beta(),
+    reference_check(false),
+    iterations(20) { }
+
+  // Parses the command line
+  void parse(int argc, char const **args) {
+
+    CommandLine cmd(argc, args);
+    if (cmd.check_cmd_line_flag("help")) {
+      help = true;
+    }
+
+    cmd.get_cmd_line_argument("m", problem_size.m());
+    cmd.get_cmd_line_argument("n", problem_size.n());
+    cmd.get_cmd_line_argument("k", problem_size.k());
+    cmd.get_cmd_line_argument("batch", batch_count);
+
+    cmd.get_cmd_line_argument("alpha", alpha.real());
+    cmd.get_cmd_line_argument("alpha_i", alpha.imag());
+    cmd.get_cmd_line_argument("beta", beta.real());
+    cmd.get_cmd_line_argument("beta_i", beta.imag());
+
+    cmd.get_cmd_line_argument("iterations", iterations);
+  }
+
+  /// Prints the usage statement.
+  std::ostream & print_usage(std::ostream &out) const {
+
+    out << "dsaX_cutlass_interface\n\n"
+	<< "  This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n"
+	<< "Options:\n\n"
+	<< "  --help                      If specified, displays this usage statement.\n\n"
+	<< "  --m=<int>                   GEMM M dimension\n"
+	<< "  --n=<int>                   GEMM N dimension\n"
+	<< "  --k=<int>                   GEMM K dimension\n"
+	<< "  --batch=<int>               Number of GEMM operations executed in one batch\n"
+	<< "  --alpha=<f32>               Epilogue scalar alpha (real part)\n"
+	<< "  --alpha_i=<f32>             Epilogue scalar alpha (imaginary part)\n"
+	<< "  --beta=<f32>                Epilogue scalar beta (real part)\n\n"
+	<< "  --beta_i=<f32>              Epilogue scalar beta (imaginary part)\n\n"
+	<< "  --iterations=<int>          Number of profiling iterations to perform.\n";
+
+    return out;
+  }
+
+  /// Compute performance in GFLOP/s
+  double gflops(double runtime_s) const {
+    // Number of real-valued multiply-adds; each factor is widened to 64 bits
+    // before multiplying so large problems cannot overflow a 32-bit int.
+    int64_t fmas = int64_t(problem_size.m()) * int64_t(problem_size.n()) * int64_t(problem_size.k()) * int64_t(batch_count) * 4;
+
+    // Two flops per multiply-add
+    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
+  }
+};
+
+/// Performance test environment for planar complex
+class DSA_FTD_ComplexGEMM_CUTLASS {
+
+  // Half-precision input and output
+  using Element = half_t;
+  
+  // Configurations for layouts and internal computation
+  using LayoutA = ColumnMajor;
+  using LayoutB = ColumnMajor;
+  using LayoutC = ColumnMajor;
+  using ElementCompute = float;
+  using ElementAccumulator = float;
+  // CUTLASS library handle; run() dispatches gemm_planar_complex_array through it
+  Handle handle;
+  // Problem extents and batch count, followed by one allocation per operand
+  GemmCoord problem_size;
+  int batch_count;
+  DeviceAllocation<Element> tensor_A;
+  DeviceAllocation<Element> tensor_B;
+  DeviceAllocation<Element> tensor_C;
+  DeviceAllocation<Element> tensor_D;
+  DeviceAllocation<Element> tensor_D_ref;  // reference result that run() compares tensor_D against
+  // Arrays of pointers to the real / imaginary parts of each operand, as passed to the GEMM call
+  DeviceAllocation<void *> ptr_A_real;
+  DeviceAllocation<void *> ptr_A_imag;
+  DeviceAllocation<void *> ptr_B_real;
+  DeviceAllocation<void *> ptr_B_imag;
+  DeviceAllocation<void *> ptr_C_real;
+  DeviceAllocation<void *> ptr_C_imag;
+  DeviceAllocation<void *> ptr_D_real;
+  DeviceAllocation<void *> ptr_D_imag;
+  // NOTE(review): raw operand pointers; their use is not visible from this header - confirm
+  Element *ptr_A;
+  Element *ptr_B;
+  Element *ptr_C;
+  Element *ptr_D;
+  // Element offset between consecutive batch entries (run() uses tensor_X.get() + idx * batch_stride_X)
+  int64_t batch_stride_A;
+  int64_t batch_stride_B;
+  int64_t batch_stride_C;
+  int64_t batch_stride_D;
+  // Leading dimensions; run() passes the same value for the real and imaginary parts
+  typename LayoutA::Stride::Index lda;
+  typename LayoutB::Stride::Index ldb;
+  typename LayoutC::Stride::Index ldc;
+  typename LayoutC::Stride::Index ldd;
+  // Presumably the element offset from the real part to the imaginary part - TODO confirm
+  int64_t imag_stride_A;
+  int64_t imag_stride_B;
+  int64_t imag_stride_C;
+  int64_t imag_stride_D;
+  // NOTE(review): private flag in this copy; what it switches is not visible here - confirm
+  bool testing;
+  
+public:  
+  // Constructors
+  DSA_FTD_ComplexGEMM_CUTLASS(Options const &options);
+  DSA_FTD_ComplexGEMM_CUTLASS();
+  
+  // Methods
+  void initialize();  
+  Result run(Options const &options);
+  
+  
+};
+  
diff --git a/legacy/dsaX_dbnic.c b/legacy/dsaX_dbnic.c
new file mode 100644
index 0000000..83e3e4a
--- /dev/null
+++ b/legacy/dsaX_dbnic.c
@@ -0,0 +1,435 @@
+/* simple nicdb
+
+will work on NBMS/NBEAMS_PER_BLOCK writers, ip addresses set in code for now  
+
+*/
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+
+// data to pass to threads
+struct data {
+  char * out;                   // source block for transmit(); only dereferenced when tseq is not -1
+  int sockfd;                   // socket descriptor transmit() passes to sendto()
+  struct sockaddr_in si_other;  // destination address transmit() passes to sendto()
+  int thread_id;                // selects the NBEAMS_PER_BLOCK-wide beam group this thread sends
+  int chgroup;                  // written to the first int of every packet
+  int tseq;                     // written to the second int of data packets; -1 requests a test packet
+};
+
+/* global variables */
+int DEBUG = 0;
+int TEST = 0;
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in);
+int dada_bind_thread_to_core (int core);
+
+/* Drop the read lock on the input HDU (logging any failure), then destroy it. */
+void dsaX_dbgpu_cleanup (dada_hdu_t * in)
+{
+  const int unlock_status = dada_hdu_unlock_read (in);
+  if (unlock_status < 0) {
+    syslog(LOG_ERR, "could not unlock read on hdu_in");
+  }
+
+  dada_hdu_destroy (in);  /* runs whether or not the unlock succeeded */
+}
+
+void usage()
+{
+  /* The help text contains no conversions, so it is written out with fputs. */
+  fputs ("dsaX_dbnic [options]\n"
+         " -c core   bind process to CPU core [no default]\n"
+         " -g chgroup [default 0]\n"
+         " -d send debug messages to syslog\n"
+         " -t TEST\n"
+         " -i in_key [default BF_BLOCK_KEY]\n"
+         " -w -x -y -z four ip addresses for corner turn\n"
+         " -h print usage\n", stdout);
+}
+
+/* Thread body: transmit one struct data work item over UDP.
+ * tseq == -1 sends a single test packet whose first int is chgroup; any other
+ * tseq transposes this thread's beams out of d->out and sends them as P_SIZE
+ * packets headed by three ints (chgroup, tseq, packet index). Returns NULL. */
+void * transmit(void *args) {
+  // basic stuff
+  struct data *d = args;
+  int thread_id = d->thread_id;
+  int sockfd = d->sockfd;
+  struct sockaddr_in si_other = d->si_other;
+  int chgroup = d->chgroup;
+  int tseq = d->tseq;
+
+  // zero-filled so a test packet never puts uninitialised heap bytes on the wire
+  char * packet = (char *)calloc(P_SIZE, sizeof(char));
+  if (packet == NULL) {
+    syslog(LOG_ERR,"thread %d: could not allocate packet buffer",thread_id);
+    return NULL;
+  }
+  int * ipacket = (int *)(packet);
+
+  // for test packet
+  if (tseq==-1) {
+    ipacket[0] = chgroup;
+    sendto(sockfd,packet,P_SIZE,0,(struct sockaddr *)&si_other,sizeof(si_other));
+  }
+  else {
+    // fill op, doing transpose (d->out is only read on this path)
+    char * output = (char *)(d->out);
+    char * op = (char *)malloc(sizeof(char)*(NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW));
+    if (op == NULL) {
+      syslog(LOG_ERR,"thread %d: could not allocate transpose buffer",thread_id);
+      free(packet);
+      return NULL;
+    }
+    for (int i=0;i<NSAMPS_PER_TRANSMIT;i++) {
+      for (int j=0;j<NBEAMS_PER_BLOCK;j++) {
+        for (int k=0;k<NW;k++)
+          op[j*NSAMPS_PER_TRANSMIT*NW+i*NW+k] = output[i*NBMS*NW + thread_id*NBEAMS_PER_BLOCK*NW + j*NW+k]; // yes transpose
+      }
+    }
+
+    if (DEBUG) syslog(LOG_INFO,"sending with chgroup %d tseq %d",chgroup,tseq);
+
+    // do transmit
+    // each packet is 12 bytes of header plus P_SIZE-12 bytes of data
+    for (int i=0;i<NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW/(P_SIZE-12);i++) {
+      ipacket[0] = chgroup;
+      ipacket[1] = tseq;
+      ipacket[2] = i;
+      memcpy(packet+12,op+i*(P_SIZE-12),P_SIZE-12);
+      sendto(sockfd,packet,P_SIZE,0,(struct sockaddr *)&si_other,sizeof(si_other));
+      // fixed pause between packets
+      usleep(180);
+    }
+
+    if (DEBUG) syslog(LOG_INFO,"thread %d: written output",thread_id);
+
+    free(op);
+  }
+
+  // Return NULL, not the address of a local: the joiner must never be handed
+  // a pointer into this thread's stack, which is gone once the thread exits.
+  free(packet);
+  return NULL;
+}
+
+
+// MAIN
+
+int main (int argc, char *argv[]) {
+
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_dbnic", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  // threads
+  struct data args[4];
+  pthread_t threads[4];
+  pthread_attr_t attr;
+  pthread_attr_init(&attr);
+  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+  void* result=0;
+
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+
+  // command line arguments
+  int core = -1;
+  int chgroup = 0;
+  int arg = 0;
+  char iP[4][20] = {"10.41.0.114", "10.41.0.87", "10.41.0.66", "10.41.0.60"};
+  // data block HDU keys
+  key_t in_key;
+  in_key = BF_BLOCK_KEY;
+
+
+  while ((arg=getopt(argc,argv,"c:g:ti:w:x:y:z:dh")) != -1)
+    {
+      switch (arg)
+        {
+        case 'c':
+          if (optarg)
+            {
+              core = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-c flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'g':
+          if (optarg)
+            {
+              chgroup = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-g flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'd':
+          DEBUG=1;
+          syslog (LOG_DEBUG, "Will excrete all debug messages");
+          break;
+        case 'w':
+          snprintf(iP[0],sizeof(iP[0]),"%s",optarg);  // bounded: iP entries hold 20 bytes
+          break;
+        case 'x':
+          snprintf(iP[1],sizeof(iP[1]),"%s",optarg);
+          break;
+        case 'y':
+          snprintf(iP[2],sizeof(iP[2]),"%s",optarg);
+          break;
+        case 'z':
+          snprintf(iP[3],sizeof(iP[3]),"%s",optarg);
+          break;
+        case 't':
+          TEST=1;
+          syslog (LOG_INFO, "Will use test pattern");
+          break;
+        case 'i':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &in_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-i flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'h':
+          usage();
+          return EXIT_SUCCESS;
+        }
+    }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+        syslog(LOG_ERR,"failed to bind to core %d", core);
+      else
+        syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+  // DADA stuff
+
+  syslog (LOG_INFO, "creating in and out hdus");
+
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in);
+      return EXIT_FAILURE;
+    }
+
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu",block_size);
+  uint64_t  bytes_read = 0;
+  char *block;
+  uint64_t block_id;
+
+  // set up
+  int observation_complete=0;
+  int blocks = 0;
+  int started = 0;
+  int nthreads = NBMS / NBEAMS_PER_BLOCK;
+
+  // args[], threads[] and iP[] are fixed-size; refuse to index past them
+  const int max_threads = (int)(sizeof(threads)/sizeof(threads[0]));
+  if (nthreads > max_threads) {
+    syslog(LOG_ERR,"need %d sender threads but only %d are supported",nthreads,max_threads);
+    dsaX_dbgpu_cleanup (hdu_in);
+    return EXIT_FAILURE;
+  }
+
+  // create socket connections
+  int sockfd[nthreads];
+  struct sockaddr_in servaddr[nthreads];
+
+  for (int i=0;i<nthreads;i++) sockfd[i] = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
+  for (int i=0;i<nthreads;i++) {
+    memset((char *) &servaddr[i], 0, sizeof(servaddr[i]));
+    servaddr[i].sin_family = AF_INET;
+    servaddr[i].sin_addr.s_addr = inet_addr(iP[i]);
+    servaddr[i].sin_port = htons(FIL_PORT0+(uint16_t)(chgroup));
+  }
+  if (DEBUG) syslog(LOG_INFO,"sockets created");
+
+  // send test packets
+
+  // put together args
+  for (int i=0; i<nthreads; i++) {
+    args[i].out = NULL;  // transmit() does not dereference out when tseq is -1
+    args[i].sockfd = sockfd[i];
+    args[i].si_other = servaddr[i];
+    args[i].thread_id = i;
+    args[i].chgroup = chgroup;
+    args[i].tseq = -1;
+  }
+
+  for(int i=0; i<nthreads; i++){
+    if (pthread_create(&threads[i], &attr, &transmit, (void *)(&args[i]))) {
+      syslog(LOG_ERR,"Failed to create massage thread %d", i);
+    }
+  }
+
+  // attr is reused for every block below, so it is destroyed only at exit
+
+  for(int i=0; i<nthreads; i++){
+    pthread_join(threads[i], &result);
+  }
+
+  syslog(LOG_INFO,"Sent test packets");
+
+  syslog(LOG_INFO, "starting observation");
+
+  /*
+  block has size/shape [NSAMPS_PER_TRANSMIT, NBMS, NW]
+  want to transmit [NBEAMS_PER_BLOCK, NSAMPS_PER_TRANSMIT, NW]
+  for test tone, populate with chgroup*10 + beam*NBMS/NBEAMS_PER_BLOCK + time*2/NSAMPS_PER_TRANSMIT
+  */
+  unsigned char * testblock = (unsigned char *)malloc(sizeof(unsigned char)*block_size);
+  if (!testblock) {
+    syslog(LOG_ERR,"could not allocate test block");
+    dsaX_dbgpu_cleanup (hdu_in);
+    return EXIT_FAILURE;
+  }
+  for (int i=0;i<NSAMPS_PER_TRANSMIT;i++) {
+    for (int j=0;j<NBMS;j++) {
+      for (int k=0;k<NW;k++)
+        testblock[i*NBMS*NW + j*NW + k] = (unsigned char)(i/2);
+    }
+  }
+
+
+  while (!observation_complete) {
+
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    if (!block) {
+      syslog(LOG_ERR,"could not open input block for reading; stopping");
+      break;
+    }
+
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+    }
+
+    // DO STUFF
+
+    //if (DEBUG) syslog(LOG_DEBUG,"creating %d threads",nthreads);
+
+    // put together args
+    for (int i=0; i<nthreads; i++) {
+      if (TEST) args[i].out = (char *)testblock;
+      else args[i].out = block;
+      args[i].sockfd = sockfd[i];
+      args[i].si_other = servaddr[i];
+      args[i].thread_id = i;
+      args[i].chgroup = chgroup;
+      args[i].tseq = blocks;
+    }
+
+    for(int i=0; i<nthreads; i++){
+      if (pthread_create(&threads[i], &attr, &transmit, (void *)(&args[i]))) {
+        syslog(LOG_ERR,"Failed to create massage thread %d", i);
+      }
+    }
+
+    //if (DEBUG) syslog(LOG_DEBUG,"threads kinda running");
+
+    for(int i=0; i<nthreads; i++){
+      pthread_join(threads[i], &result);
+      //if (DEBUG) syslog(LOG_DEBUG,"joined thread %d",i);
+    }
+
+    if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks);
+    blocks++;
+
+
+    if (bytes_read < block_size)
+      observation_complete = 1;
+
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+
+  }
+
+  for (int i=0;i<nthreads;i++) close(sockfd[i]);
+  free(testblock);
+  pthread_attr_destroy(&attr);
+  dsaX_dbgpu_cleanup (hdu_in);
+  return EXIT_SUCCESS;
+}
+
+
diff --git a/legacy/dsaX_dbnic.c.bak b/legacy/dsaX_dbnic.c.bak
new file mode 100644
index 0000000..366f4c8
--- /dev/null
+++ b/legacy/dsaX_dbnic.c.bak
@@ -0,0 +1,381 @@
+/* simple nicdb
+
+will work on NBMS/NBEAMS_PER_BLOCK writers, ip addresses set in code for now  
+
+*/
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+
+// data to pass to threads (main fills one entry per transmit() thread)
+struct data {
+  char * out; // block to transmit from: the DADA block, or main's test block when TEST is set
+  int sockfd; // connected socket that transmit() hands to send()
+  int thread_id; // selects which group of NBEAMS_PER_BLOCK beams transmit() copies out of the block
+  int chgroup; // copied into the first int of the outgoing buffer
+  int tseq; // copied into the second int of the outgoing buffer (main passes its block counter)
+};
+
+/* global variables */
+int DEBUG = 0; // set to 1 by -d; gates the LOG_DEBUG syslog calls
+int TEST = 0; // set to 1 by -t; the threads then transmit main's test block
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in); // defined just below
+int dada_bind_thread_to_core (int core); // not defined in this file
+
+/* Release the read lock held on the input HDU, then destroy the HDU. */
+void dsaX_dbgpu_cleanup (dada_hdu_t * in)
+{
+  /* an unlock failure is only logged; the HDU is destroyed either way */
+  const int unlock_status = dada_hdu_unlock_read (in);
+  if (unlock_status < 0)
+    syslog(LOG_ERR, "could not unlock read on hdu_in");
+
+  dada_hdu_destroy (in);
+}
+
+void usage() {  /* write the option summary verbatim to stdout */
+  static const char synopsis[] =
+    "dsaX_dbnic [options]\n"
+    " -c core   bind process to CPU core [no default]\n"
+    " -g chgroup [default 0]\n"
+    " -d send debug messages to syslog\n"
+    " -t TEST\n"
+    " -i in_key [default BF_BLOCK_KEY]\n"
+    " -w -x -y -z four ip addresses for corner turn\n"
+    " -h print usage\n";
+  fputs (synopsis, stdout);
+}
+
+/* Thread for data transmission. args points to a struct data. Builds one
+ * buffer of 8 header bytes (chgroup and tseq, stored as two ints) followed by
+ * this thread's NBEAMS_PER_BLOCK beams, reordered from [time, beam, NW] in the
+ * block to [beam, time, NW], and sends it on the thread's socket.
+ * Always returns NULL. */
+void * transmit(void *args) {
+
+  // basic stuff
+  struct data *d = args;
+  int thread_id = d->thread_id;
+  int sockfd = d->sockfd;
+  char * output = (char *)(d->out);
+  const size_t op_size = 8 + (size_t)NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW;
+  char * op = (char *)malloc(op_size);
+  if (op == NULL) {
+    syslog(LOG_ERR,"thread %d: could not allocate transmit buffer",thread_id);
+    return NULL;
+  }
+  int * iop = (int *)(op);
+
+  // fill op, doing transpose
+  iop[0] = d->chgroup;
+  iop[1] = d->tseq;
+  for (int i=0;i<NSAMPS_PER_TRANSMIT;i++) {
+    for (int j=0;j<NBEAMS_PER_BLOCK;j++) {
+      for (int k=0;k<NW;k++)
+        op[8+j*NSAMPS_PER_TRANSMIT*NW+i*NW+k] = output[i*NBMS*NW + thread_id*NBEAMS_PER_BLOCK*NW + j*NW+k];
+    }
+  }
+
+  if (DEBUG) syslog(LOG_DEBUG,"sending with chgroup %d tseq %d",iop[0],iop[1]);
+
+  // do transmit: send() may accept only part of the buffer, so keep going
+  // until every byte is out or send() reports a failure
+  size_t sent_bytes = 0;
+  while (sent_bytes < op_size) {
+    ssize_t sbytes = send(sockfd, op + sent_bytes, op_size - sent_bytes, 0);
+    if (sbytes <= 0) {
+      syslog(LOG_ERR,"thread %d: only sent %zu of %zu",thread_id,sent_bytes,op_size);
+      break;
+    }
+    sent_bytes += (size_t)sbytes;
+  }
+
+  if (DEBUG && sent_bytes == op_size) syslog(LOG_DEBUG,"thread %d: written output",thread_id);
+
+  // no result is handed back: a pointer to a local would dangle after exit
+  free(op);
+  return NULL;
+}
+
+
+// MAIN: reads blocks from the input DADA buffer and, for every block, runs one
+// transmit() thread per destination until a short block ends the observation
+int main (int argc, char *argv[]) {
+
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_dbnic", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  // threads: these arrays have room for the four -w/-x/-y/-z destinations
+  struct data args[4];
+  pthread_t threads[4];
+  pthread_attr_t attr;
+  pthread_attr_init(&attr);
+  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+  int created[4] = {0, 0, 0, 0}; // which threads were really started for the current block
+
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+
+  // command line arguments
+  int core = -1;
+  int chgroup = 0;
+  int arg = 0;
+  const char * iP[4] = {"10.41.0.22", "10.41.0.98", "10.41.0.105", "10.41.0.63"}; // overrides point at optarg: no fixed-size copy to overflow
+  // data block HDU keys
+  key_t in_key;
+  in_key = BF_BLOCK_KEY;
+
+
+  while ((arg=getopt(argc,argv,"c:g:ti:w:x:y:z:dh")) != -1)
+    {
+      switch (arg)
+        {
+        case 'c':
+          if (optarg)
+            {
+              core = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-c flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'g':
+          if (optarg)
+            {
+              chgroup = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-g flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'd':
+          DEBUG=1;
+          syslog (LOG_DEBUG, "Will excrete all debug messages");
+          break;
+        case 'w':
+          iP[0] = optarg;
+          break;
+        case 'x':
+          iP[1] = optarg;
+          break;
+        case 'y':
+          iP[2] = optarg;
+          break;
+        case 'z':
+          iP[3] = optarg;
+          break;
+        case 't':
+          TEST=1;
+          syslog (LOG_INFO, "Will use test pattern");
+          break;
+        case 'i':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &in_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-i flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'h':
+          usage();
+          return EXIT_SUCCESS;
+        }
+    }
+
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+        syslog(LOG_ERR,"failed to bind to core %d", core);
+      else syslog(LOG_NOTICE,"bound to core %d", core); // only claim success when the bind worked
+    }
+
+
+  // DADA stuff
+
+  syslog (LOG_INFO, "creating in and out hdus");
+
+  hdu_in  = dada_hdu_create (0); // same call form as the other dsaX programs
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in);
+      return EXIT_FAILURE;
+    }
+
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %llu",(unsigned long long)block_size);
+  uint64_t  bytes_read = 0;
+  char *block;
+  uint64_t block_id;
+
+  // set up
+  int observation_complete=0, blocks = 0, started = 0;
+  int nthreads = NBMS / NBEAMS_PER_BLOCK;
+  if (nthreads > 4) { // more destinations than args/threads/created/iP can hold
+    syslog(LOG_ERR,"%d destinations requested but only 4 are supported",nthreads);
+    dsaX_dbgpu_cleanup (hdu_in);
+    return EXIT_FAILURE;
+  }
+  // create socket connections; any failure here is fatal
+  int sockfd[nthreads];
+  struct sockaddr_in servaddr;
+  for (int i=0;i<nthreads;i++) {
+    bzero(&servaddr, sizeof(servaddr));
+    servaddr.sin_family = AF_INET;
+    servaddr.sin_addr.s_addr = inet_addr(iP[i]);
+    servaddr.sin_port = htons(FIL_PORT0+(uint16_t)(chgroup));
+    sockfd[i] = socket(AF_INET, SOCK_STREAM, 0);
+    if (sockfd[i] < 0 || connect(sockfd[i], (struct sockaddr *)&servaddr, sizeof(servaddr)) != 0) {
+      syslog(LOG_ERR,"connection with the server failed %d",i);
+      for (int j=0;j<=i;j++) if (sockfd[j] >= 0) close(sockfd[j]);
+      dsaX_dbgpu_cleanup (hdu_in);
+      return EXIT_FAILURE;
+    }
+    if (DEBUG) syslog(LOG_DEBUG,"connected %d",i);
+  }
+  syslog(LOG_INFO, "starting observation");
+
+  /*
+  block has size/shape [NSAMPS_PER_TRANSMIT, NBMS, NW]
+  want to transmit [NBEAMS_PER_BLOCK, NSAMPS_PER_TRANSMIT, NW]
+  test pattern: every byte of time sample i holds (unsigned char)(i/2); the buffer is sized by that shape
+  */
+  const size_t test_size = (size_t)NSAMPS_PER_TRANSMIT*NBMS*NW;
+  unsigned char * testblock = (unsigned char *)malloc(test_size);
+  if (testblock == NULL) {
+    syslog(LOG_ERR,"could not allocate test block");
+    for (int i=0;i<nthreads;i++) close(sockfd[i]);
+    dsaX_dbgpu_cleanup (hdu_in);
+    return EXIT_FAILURE;
+  }
+  for (int i=0;i<NSAMPS_PER_TRANSMIT;i++) memset(testblock + (size_t)i*NBMS*NW, i/2, (size_t)NBMS*NW);
+  while (!observation_complete) {
+
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    if (block == NULL) { // nothing to transmit and nothing to close: stop here
+      syslog(LOG_ERR,"could not open block for reading");
+      break;
+    }
+
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+    }
+
+    // put together args
+    for (int i=0; i<nthreads; i++) {
+      if (TEST) args[i].out = (char *)testblock;
+      else args[i].out = block;
+      args[i].sockfd = sockfd[i];
+      args[i].thread_id = i;
+      args[i].chgroup = chgroup;
+      args[i].tseq = blocks;
+    }
+
+    // start one transmit thread per destination, remembering which ones exist
+    for(int i=0; i<nthreads; i++){
+      created[i] = (pthread_create(&threads[i], &attr, &transmit, (void *)(&args[i])) == 0);
+      if (!created[i]) syslog(LOG_ERR,"Failed to create massage thread %d\n", i);
+    }
+
+    // attr is needed again for the next block, so it is not destroyed here;
+    // join only the threads that were really started
+    for(int i=0; i<nthreads; i++){
+      if (created[i]) pthread_join(threads[i], NULL);
+    }
+
+    if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks);
+    blocks++;
+
+    // a short read marks the final block
+    if (bytes_read < block_size)
+      observation_complete = 1;
+
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+
+  }
+
+  // no thread will be created any more: the attribute object can go now
+  pthread_attr_destroy(&attr);
+  for (int i=0;i<nthreads;i++) close(sockfd[i]);
+  free(testblock);
+  dsaX_dbgpu_cleanup (hdu_in);
+  return EXIT_SUCCESS;
+}
+
+
diff --git a/legacy/dsaX_def.h b/legacy/dsaX_def.h
new file mode 100644
index 0000000..415e83b
--- /dev/null
+++ b/legacy/dsaX_def.h
@@ -0,0 +1,98 @@
+#ifndef __DSAX_DEF_H
+#define __DSAX_DEF_H
+
+#include "dada_def.h"
+
+// default dada block keys (the size notes below each key are presumably buffer sizes in bytes - TODO confirm)
+#define TEST_BLOCK_KEY 0x0000aada // for capture program; also the default input key (-i) of dsaX_fake
+// 128*3*384*32*2=9437184 for 1 CHANG 1 SNAP 1 REORDER
+// 128*3*384*32*2*4=37748736 for 4 CHANG 1 SNAP 1 REORDER
+// 128*3*384*32*2*8=75497472 for 1 CHANG 1 SNAP 8 REORDER
+#define CAPTURE_BLOCK_KEY 0x0000dada // for capture program.
+// 128*3*384*32*2=9437184 for 1 CHANG 1 SNAP 1 REORDER
+// 150994944 for doSnap
+#define REORDER_BLOCK_KEY 0x0000eada // for reorder program.
+// 589824 for doSnap
+#define REORDER_BLOCK_KEY2 0x0000bada // for reorder program 2; also the default output key (-o) of dsaX_fake
+// 128*32*1536*16*2*2=402653184 1 REORDER
+// 3221225472 for 8 REORDERS
+#define XGPU_BLOCK_KEY 0x0000fada // for xgpu program.
+// 136*1536*2*8=3342336
+#define COPY_BLOCK_KEY 0x0000dbda // for split off data
+#define BF_BLOCK_KEY 0x0000dcda // for beamformed data; default input key (-i) of dsaX_dbnic
+#define BF_BLOCK_KEY2 0x0000bcda // for beamformed data testing
+#define CAPTURED_BLOCK_KEY 0x0000abda // for capture program.
+#define BEAMCAPTURE_BLOCK_KEY 0x0000bbda // for capture bf program.
+
+// constants
+#define PI 3.14159265359
+#define CVAC 299792458.0 // speed of light in vacuum, m/s
+
+// default number of XGPU ints
+#define NCORRINTS 128
+#define NNATINTS 32 // native number of integrations
+#define NREORDERS 1 // number of ints per reorder
+
+// size of xgpu output
+// TODO (left open by the original author; what remains to be done is not stated)
+#define XGPU_SIZE 835584 // size of single output vector (post-GPU)
+#define XGPU_IN_INC 1 // size of input increment
+#define NBASE 4656 // nant*(nant+1)/2, i.e. 96*97/2 with nant = NANTS
+#define NPOL 2
+#define NCHAN 1536 // regardless of NCHANG
+
+// default port for packet capture
+#define CAPTURE_PORT 4011
+
+// default UDP packet dims (bytes)
+#define UDP_HEADER   8              // size of header/sequence number
+#define UDP_DATA     4608           // obs bytes per packet
+#define UDP_PAYLOAD  4616           // header + datasize (UDP_HEADER + UDP_DATA)
+
+// number of channel groups to expect
+#define NCHANG 1
+
+// number of SNAPs to expect
+#define NSNAPS 32
+
+/* expect consecutive channel groups */
+#define CHOFF 1024 // offset in channels of first group
+
+// default control ports
+#define CAPTURE_CONTROL_PORT 11223
+#define REORDER_CONTROL_PORT 11224
+#define XGPU_CONTROL_PORT 11225
+#define WRITEVIS_CONTROL_PORT 11226
+#define TRIGGER_CONTROL_PORT 11227 // dsaX_filTrigger stores this as its control port
+
+#define NPACKETS_PER_CALL 2048
+#define NPACKETS_PER_BLOCK 2048
+#define NPACKETS_INTS 2048 // number of packets per xgpu int
+#define NPACKETS_PER_FIL 2
+#define NPACKETS 2048
+#define NOUTBLOCKS 15 // number of input blocks stored by trigger
+#define NANTS 96
+#define NCHAN_PER_PACKET 384
+#define NBEAMS 512
+
+// for beamformer
+//#define sep 1.0 // arcmin
+#define NW 48 // number of weights per 384 chans. Also the number of channels formed
+#define NANT 63
+#define BEAM_OUT 23
+#define NSTREAMS 4
+#define NBP 8 // number of previous BPs to average
+
+// for second corner turn
+#define FIL_PORT0 6625 // port for first chan group; dsaX_dbnic connects to FIL_PORT0 + chgroup
+#define NCLIENTS 16 // number of client dbnic processes to expect
+#define NSAMPS_PER_BLOCK 16384 // number of samples per block
+#define NCHAN_FIL 1024 // final number of filterbank chans
+#define NBEAMS_PER_BLOCK 64 // number of beams to expect
+#define NSAMPS_PER_TRANSMIT 512 // number of samples transmitted at one time
+#define NBMS 256 // dsaX_dbnic runs NBMS / NBEAMS_PER_BLOCK transmit threads
+#define P_SIZE 4108
+#define NWAIT 100000
+
+#endif // __DSAX_DEF_H
+
diff --git a/legacy/dsaX_fake.c b/legacy/dsaX_fake.c
new file mode 100644
index 0000000..662ea37
--- /dev/null
+++ b/legacy/dsaX_fake.c
@@ -0,0 +1,320 @@
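+/* dsaX_fake: for every block read from the input buffer, writes one output block holding zeros or repeated copies of a 4608-byte packet read from a file (-f) */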
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+// global variables
+int DEBUG = 0; // set to 1 by -d; enables the per-block LOG_DEBUG message
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out); // defined just below
+int dada_bind_thread_to_core (int core); // not defined in this file
+
+/* Tear down both HDUs: drop the read lock on `in` and the write lock on
+ * `out`, destroying each HDU whether or not its unlock succeeded. */
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
+{
+  /* input side */
+  const int read_unlock = dada_hdu_unlock_read (in);
+  if (read_unlock < 0)
+    syslog(LOG_ERR, "could not unlock read on hdu_in");
+  dada_hdu_destroy (in);
+
+  /* output side */
+  const int write_unlock = dada_hdu_unlock_write (out);
+  if (write_unlock < 0)
+    syslog(LOG_ERR, "could not unlock write on hdu_out");
+  dada_hdu_destroy (out);
+}
+
+void usage() {  /* write the option summary verbatim to stdout */
+  static const char synopsis[] =
+    "dsaX_fake [options]\n"
+    " -c core   bind process to CPU core [no default]\n"
+    " -d send debug messages to syslog\n"
+    " -f file to read packet from [default none]\n"
+    " -i in_key [default TEST_BLOCK_KEY]\n"
+    " -o out_key [default REORDER_BLOCK_KEY2]\n"
+    " -h print usage\n";
+  fputs (synopsis, stdout);
+}
+
+// MAIN: for every block read from the input buffer, writes one prepared block
+// (zeros, or copies of a packet read from the -f file) to the output buffer
+int main (int argc, char *argv[]) {
+
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_fake", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+
+  // data block HDU keys
+  key_t in_key = TEST_BLOCK_KEY;
+  key_t out_key = REORDER_BLOCK_KEY2;
+
+  // command line arguments
+  int core = -1;
+  int useZ = 1;
+  const char * fnam = 0; // points at optarg for -f: no fixed-size copy to overflow
+  int arg = 0;
+
+  while ((arg=getopt(argc,argv,"c:f:i:o:dh")) != -1)
+    {
+      switch (arg)
+        {
+        case 'c':
+          if (optarg)
+            {
+              core = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-c flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'i':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &in_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-i flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'o':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &out_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-o flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'f':
+          if (optarg)
+            {
+              useZ = 0;
+              fnam = optarg;
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-f flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+
+        case 'd':
+          DEBUG=1;
+          syslog (LOG_DEBUG, "Will excrete all debug messages");
+          break;
+        case 'h':
+          usage();
+          return EXIT_SUCCESS;
+        }
+    }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+        syslog(LOG_ERR,"failed to bind to core %d", core);
+      else syslog(LOG_NOTICE,"bound to core %d", core); // only claim success when the bind worked
+    }
+
+
+  // DADA stuff
+
+  syslog (LOG_INFO, "creating in and out hdus");
+
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+
+  uint64_t header_size = 0;
+
+  // deal with headers: the input header is copied unchanged to the output
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
+  uint64_t  bytes_read = 0, block_id;
+  uint64_t npackets = block_out / 4608;
+  char * block;
+  char * packet = (char *)malloc(sizeof(char)*4608);
+  char * output_buffer = (char *)calloc(block_out, sizeof(char)); // zero-filled unless a packet is loaded below
+  if (!packet || !output_buffer) {
+    syslog(LOG_ERR, "could not allocate packet and output buffers");
+    free(packet); free(output_buffer);
+    dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+    return EXIT_FAILURE;
+  }
+
+  // fill output buffer if file exists and holds a whole packet
+  FILE *fin;
+  if (!useZ) {
+    if (!(fin=fopen(fnam,"rb"))) {
+      syslog(LOG_ERR, "cannot open file - will write zeros");
+    }
+    else {
+      size_t nread = fread(packet,4608,1,fin);
+      fclose(fin);
+      if (nread != 1) {
+        syslog(LOG_ERR, "could not read a full packet - will write zeros");
+      }
+      else {
+        syslog(LOG_INFO,"Read packet, npackets %lu",npackets);
+        for (uint64_t i=0;i<npackets;i++)
+          memcpy(output_buffer+i*4608,packet,4608);
+        syslog(LOG_INFO, "Using input packet");
+      }
+    }
+  }
+
+  // set up
+
+  int observation_complete=0;
+  int blocks = 0, started = 0;
+
+  syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+
+    // open block: its contents are not used, it only paces the output
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    if (block == NULL) {
+      syslog(LOG_ERR, "main: could not open input block");
+      break;
+    }
+
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+    }
+
+    // write to output - output_buffer is ready to go; the result is kept signed so a negative error return is seen
+    ssize_t written = ipcio_write (hdu_out->data_block, output_buffer, block_out);
+    if (written < 0 || (uint64_t)written < block_out)
+      {
+        syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+        free(packet); free(output_buffer);
+        dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+        return EXIT_FAILURE;
+      }
+
+    if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks);
+    blocks++;
+
+    // a short read marks the last input block
+    if (bytes_read < block_size)
+      observation_complete = 1;
+
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+
+  }
+
+  free(packet);
+  free(output_buffer);
+  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+  return EXIT_SUCCESS;
+}
+
+
diff --git a/legacy/dsaX_filTrigger.c b/legacy/dsaX_filTrigger.c
new file mode 100644
index 0000000..55f95fd
--- /dev/null
+++ b/legacy/dsaX_filTrigger.c
@@ -0,0 +1,559 @@
+/* Code to read from a single dada buffer, and write to disk upon receiving
+a trigger. Uses pthread threads and shared memory to listen. 
+Sequence of events:
+ - starts null-reading dump buffer, while listening for socket command
+   + for N second dump, assume N-second dada blocks
+ - receives time-since-start, which is converted into a block_start, byte_start, and block_end and byte_end. Sets dump pending, during which time no commands can be accepted. 
+ - Upon seeing dump_pending, read code copies data to output dada buffer, which is plugged into dbdisk. Unsets dump_pending.
+*/
+
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+#include "dsaX_capture.h"
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_def.h"
+
+/* global variables (shared between main and control_thread) */
+int quit_threads = 0; // loop condition of control_thread; main sets it to 1 before joining
+int dump_pending = 0; // set by control_thread when a trigger is taken; cleared by main after the dump or when it came too late
+uint64_t specnum = 0; // spectrum number of the pending trigger (received number divided by 4)
+uint64_t next_specnum = 0; // trigger received while another dump was still pending; 0 means none
+uint64_t procnum = 0; // not referenced anywhere else in this file
+int trignum = 0; // incremented for every packet the control thread receives
+int dumpnum = 0; // incremented by main after each dump
+char iP[100]; // -i option: address the control thread resolves and binds to
+char footer_buf[1024]; // name received with the pending trigger; used for output paths and source_name
+char next_footer_buf[1024]; // name belonging to next_specnum
+int DEBUG = 0; // set to 1 by -d
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in); // defined below
+int dada_bind_thread_to_core (int core); // not defined in this file
+
+FILE *output; // file the send_* helpers write to; main opens one per beam
+
+/* Write `string` to the global `output`: an int byte count, then the bytes without terminator. */
+void send_string(char *string) /* includefile */
+{
+  const int nbytes = strlen(string);
+  fwrite(&nbytes, sizeof nbytes, 1, output);
+  fwrite(string, 1, nbytes, output);
+}
+
+void send_float(char *name,float floating_point) /* includefile */
+{ /* keyword string first, then the float's raw bytes, both to `output` */
+  send_string(name);
+  fwrite(&floating_point, sizeof floating_point, 1, output);
+}
+
+void send_double (char *name, double double_precision) /* includefile */
+{ /* keyword string first, then the double's raw bytes, both to `output` */
+  send_string(name);
+  fwrite(&double_precision, sizeof double_precision, 1, output);
+}
+
+void send_int(char *name, int integer) /* includefile */
+{ /* keyword string first, then the int's raw bytes, both to `output` */
+  send_string(name);
+  fwrite(&integer, sizeof integer, 1, output);
+}
+
+void send_char(char *name, char integer) /* includefile */
+{ /* keyword string first, then the single byte, both to `output` */
+  send_string(name);
+  fwrite(&integer, sizeof integer, 1, output);
+}
+
+
+void send_long(char *name, long integer) /* includefile */
+{ /* keyword string first, then sizeof(long) raw bytes, both to `output` */
+  send_string(name);
+  fwrite(&integer, sizeof integer, 1, output);
+}
+
+void send_coords(double raj, double dej, double az, double za) /*includefile*/
+{ /* writes each coordinate unless it is 0.0 or -1.0 (presumably "unset" markers); the former || test was always true */
+  if ((raj != 0.0) && (raj != -1.0)) send_double("src_raj",raj);
+  if ((dej != 0.0) && (dej != -1.0)) send_double("src_dej",dej);
+  if ((az != 0.0)  && (az != -1.0))  send_double("az_start",az);
+  if ((za != 0.0)  && (za != -1.0))  send_double("za_start",za);
+}
+
+/* Give up the read lock on the input HDU and destroy the HDU. */
+void dsaX_dbgpu_cleanup (dada_hdu_t * in)
+{
+  /* failing to unlock is logged but does not stop the teardown */
+  const int unlocked = dada_hdu_unlock_read (in);
+  if (unlocked < 0)
+    syslog(LOG_ERR, "could not unlock read on hdu_in");
+
+  dada_hdu_destroy (in);
+}
+
+void usage() {  /* write the option summary verbatim to stdout */
+  static const char synopsis[] =
+    "dsaX_filTrigger [options]\n"
+    " -c core   bind process to CPU core\n"
+    " -i IP to listen to [no default]\n"
+    " -j in_key [default eaea]\n"
+    " -d debug\n"
+    " -n output file name base [no default]\n"
+    " -b beam number of first beam [default 0]\n"
+    " -z respond to zero specnum\n"
+    " -h print usage\n";
+  fputs (synopsis, stdout);
+}
+
+
+// Thread to control the dumping of data.
+// Waits for UDP datagrams of the form "<number>-<name>" on address iP, port
+// ctx->control_port. number/4 becomes specnum and name becomes footer_buf;
+// while a dump is still pending they go to next_specnum/next_footer_buf.
+void control_thread (void * arg) {
+
+  udpdb_t * ctx = (udpdb_t *) arg;
+  syslog(LOG_INFO, "control_thread: starting");
+  // port on which to listen for control commands
+  int port = ctx->control_port;
+  char portstr[16];
+  snprintf(portstr, sizeof(portstr), "%d", port);
+  // buffer for incoming command strings
+  const size_t bufsize = 1024;
+  char* buffer = (char *) malloc (bufsize);
+  if (buffer == NULL) {
+    syslog(LOG_ERR, "control_thread: could not allocate buffer");
+    pthread_exit(NULL);
+  }
+  // resolve the address to listen on
+  struct addrinfo hints;
+  struct addrinfo* res=0;
+  memset(&hints,0,sizeof(hints));
+  hints.ai_family=AF_INET;
+  hints.ai_socktype=SOCK_DGRAM;
+  if (getaddrinfo(iP,portstr,&hints,&res) != 0 || res == NULL) {
+    syslog(LOG_ERR, "control_thread: could not resolve %s port %s",iP,portstr);
+    free (buffer);
+    pthread_exit(NULL);
+  }
+  struct sockaddr_storage src_addr;
+  socklen_t src_addr_len;
+  char *endptr, *saveptr;
+
+  syslog(LOG_INFO, "control_thread: created socket on port %d", port);
+
+  while (!quit_threads) {
+    // a fresh socket is opened and bound for every trigger, as before
+    int fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol);
+    if (fd < 0 || bind(fd,res->ai_addr,res->ai_addrlen) < 0) {
+      syslog(LOG_ERR, "control_thread: could not open and bind socket");
+      if (fd >= 0) close(fd);
+      sleep(1); // do not spin while the address is unavailable
+      continue;
+    }
+
+    // receive at most bufsize-1 bytes so that the string stays terminated
+    memset(buffer,'\0',bufsize);
+    src_addr_len = sizeof(src_addr);
+    syslog(LOG_INFO, "control_thread: waiting for packet");
+    ssize_t ct = recvfrom(fd,buffer,bufsize-1,0,(struct sockaddr*)&src_addr,&src_addr_len);
+    close(fd);
+    if (ct < 0) {
+      syslog(LOG_ERR, "control_thread: recvfrom failed");
+      continue;
+    }
+
+    syslog(LOG_INFO, "control_thread: received buffer string %s",buffer);
+    trignum++;
+    // interpret buffer string; both fields must be present
+    char * numtok = strtok_r(buffer, "-", &saveptr);
+    char * namtok = strtok_r(NULL, "-", &saveptr);
+    if (numtok == NULL || namtok == NULL) {
+      syslog(LOG_ERR, "control_thread: malformed trigger ignored");
+      continue;
+    }
+    uint64_t tmps = (uint64_t)strtoull(numtok,&endptr,0);
+    if (!dump_pending) {
+      specnum = tmps/4;
+      snprintf(footer_buf,sizeof(footer_buf),"%s",namtok);
+      syslog(LOG_INFO, "control_thread: received command to dump at %lu src %s",specnum,footer_buf);
+      dump_pending = 1;
+    }
+    else {
+      syslog(LOG_ERR, "control_thread: BACKED UP - using %lu src %s as next specnum",tmps,namtok);
+      next_specnum = tmps/4;
+      snprintf(next_footer_buf,sizeof(next_footer_buf),"%s",namtok);
+    }
+  }
+
+  free (buffer);
+  freeaddrinfo (res);
+  if (ctx->verbose)
+    syslog(LOG_INFO, "control_thread: exiting");
+  pthread_exit(NULL); // no result: a pointer to a local would dangle after exit
+}
+	    
+
+	
+int main (int argc, char *argv[]) {
+  // Reads the input buffer block by block; when the control thread has flagged a trigger, writes one .fil file per beam
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_filTrigger", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+
+  /* port for control commands */
+  int control_port = TRIGGER_CONTROL_PORT;
+
+  /* actual struct with info */
+  udpdb_t udpdb;
+
+  // input data block HDU key
+  key_t in_key = 0x0000eaea;
+
+  // command line arguments
+  int core = -1;
+  int beamn = 0;
+  const char * of = "."; // -n points this at optarg; "." replaces the formerly uninitialised buffer
+  char foutnam[4096];
+  char dirnam[4096];
+  int rz=0;
+  int arg=0;
+
+  while ((arg=getopt(argc,argv,"i:c:j:db:n:hz")) != -1)
+    {
+      switch (arg)
+        {
+        case 'i':
+          snprintf(iP,sizeof(iP),"%s",optarg); // bounded copy into the global buffer
+          break;
+        case 'c':
+          if (optarg)
+            {
+              core = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog (LOG_ERR,"ERROR: -c flag requires argument\n");
+              return EXIT_FAILURE;
+            }
+        case 'b':
+          if (optarg)
+            {
+              beamn = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog (LOG_ERR,"ERROR: -b flag requires argument\n");
+              return EXIT_FAILURE;
+            }
+        case 'n':
+          if (optarg)
+            {
+              of = optarg;
+              break;
+            }
+          else
+            {
+              syslog (LOG_ERR,"ERROR: -n flag requires argument\n");
+              return EXIT_FAILURE;
+            }
+        case 'd':
+          DEBUG=1;
+          syslog (LOG_INFO, "Will excrete all debug messages");
+          break;
+        case 'z':
+          rz=1;
+          syslog (LOG_INFO, "Will respond to zero trigger");
+          break;
+        case 'j':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &in_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-j flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'h':
+          usage();
+          return EXIT_SUCCESS;
+        }
+    }
+
+  // DADA stuff
+
+  udpdb.verbose = DEBUG;
+  udpdb.control_port = control_port;
+
+  // start control thread
+  int rval = 0;
+  pthread_t control_thread_id;
+  syslog(LOG_INFO, "starting control_thread()");
+  rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb);
+  if (rval != 0) {
+    syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval));
+    return -1;
+  }
+
+
+  syslog (LOG_INFO, "creating hdus");
+
+  // open connection to the in/read DBs
+
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer");
+    return EXIT_FAILURE;
+  }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      syslog(LOG_INFO,"binding to core %d", core);
+      if (dada_bind_thread_to_core(core) < 0)
+        syslog(LOG_ERR,"failed to bind to core %d", core);
+    }
+
+  int observation_complete=0;
+
+  // more DADA stuff - deal with headers
+
+  uint64_t header_size = 0;
+
+  // read the header from the input HDU
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "main: could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in);
+      return EXIT_FAILURE;
+    }
+
+  // mark the input header as cleared
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared [input]");
+      dsaX_dbgpu_cleanup (hdu_in);
+      return EXIT_FAILURE;
+    }
+
+
+  // stuff for writing data
+  /*
+    Data will have [64 beam, time, freq] for each block.
+    extData holds NSAMPS_PER_BLOCK spectra per beam, put together from two consecutive blocks
+   */
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  unsigned char * extData = (unsigned char *)malloc(sizeof(unsigned char)*NSAMPS_PER_BLOCK*NCHAN_FIL*NBEAMS_PER_BLOCK);
+  if (extData == NULL) {
+    syslog(LOG_ERR, "main: could not allocate dump buffer");
+    dsaX_dbgpu_cleanup (hdu_in);
+    return EXIT_FAILURE;
+  }
+  uint64_t specs_per_block = NSAMPS_PER_BLOCK;
+  uint64_t current_specnum = 0; // updates with each dada block read
+  uint64_t start_byte, bytes_to_copy, bytes_copied=0;
+  char * in_data;
+  uint64_t block_id, bytes_read=0;
+  int dumping = 0;
+  FILE *ofile = fopen("/home/ubuntu/data/dumps.dat","a");
+  if (ofile) { // the dump log is best effort: carry on without it
+    fprintf(ofile,"starting...\n");
+    fclose(ofile);
+  }
+  // main reading loop
+  float pc_full = 0.;
+
+  syslog(LOG_INFO, "main: starting observation");
+
+  while (!observation_complete) {
+
+    // read a DADA block
+    in_data = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    if (in_data == NULL) {
+      syslog(LOG_ERR, "main: could not open block for reading");
+      break;
+    }
+
+    // add delay: only proceed if input data block is 80% full
+    while (pc_full < 0.8) {
+      pc_full = ipcio_percent_full(hdu_in->data_block);
+      usleep(100);
+    }
+    pc_full = 0.;
+
+    // check for dump_pending
+    if (dump_pending) {
+
+      // look after hand trigger
+      if (specnum==0 && rz==1) {
+        specnum = current_specnum + 40000;
+      }
+
+      // if this is the first block to dump
+      if (specnum > current_specnum && specnum < current_specnum+specs_per_block) {
+
+        dumping = 1;
+        syslog(LOG_INFO,"dumping is 1 -- first block");
+
+        // loop over beams: copy the tail of this block, from specnum onwards
+        bytes_to_copy = (NSAMPS_PER_BLOCK-(specnum-current_specnum))*NCHAN_FIL;
+        bytes_copied = bytes_to_copy;
+        for (int i=0;i<NBEAMS_PER_BLOCK;i++) {
+
+          start_byte = i*NSAMPS_PER_BLOCK*NCHAN_FIL + (specnum-current_specnum)*NCHAN_FIL;
+          memcpy(extData + i*NSAMPS_PER_BLOCK*NCHAN_FIL, in_data + start_byte, bytes_to_copy);
+
+        }
+
+      }
+
+      // if this is the last block to dump from
+      if (specnum + NSAMPS_PER_BLOCK > current_specnum && specnum + NSAMPS_PER_BLOCK <= current_specnum + specs_per_block && dumping==1) {
+        syslog(LOG_INFO,"in second block");
+        // loop over beams: append the head of this block after the earlier copy
+        bytes_to_copy = NSAMPS_PER_BLOCK*NCHAN_FIL-bytes_copied;
+        for (int i=0;i<NBEAMS_PER_BLOCK;i++) {
+          start_byte = i*NSAMPS_PER_BLOCK*NCHAN_FIL;
+          memcpy(extData + i*NSAMPS_PER_BLOCK*NCHAN_FIL + bytes_copied, in_data + start_byte, bytes_to_copy);
+        }
+        syslog(LOG_INFO,"finished copying");
+
+        // DO THE WRITING
+        // footer_buf came in over UDP and ends up in a shell command and in file
+        // names: accept only plain name characters and check that everything fits
+        static const char name_ok[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_.:+";
+        int name_valid = footer_buf[0] != '\0' && footer_buf[0] != '.' && footer_buf[strspn(footer_buf,name_ok)] == '\0';
+        int n = name_valid ? snprintf(dirnam,sizeof(dirnam),"mkdir -p %s/%s",of,footer_buf) : -1;
+        if (n < 0 || (size_t)n >= sizeof(dirnam) || system(dirnam) != 0) {
+          syslog(LOG_ERR, "could not create dump directory for src %s - dump not written", footer_buf);
+          name_valid = 0;
+        }
+        for (int i=0; name_valid && i<NBEAMS_PER_BLOCK; i++) {
+          n = snprintf(foutnam,sizeof(foutnam),"%s/%s/%s_%d.fil",of,footer_buf,footer_buf,beamn+i);
+          output = (n < 0 || (size_t)n >= sizeof(foutnam)) ? NULL : fopen(foutnam,"wb");
+          if (output == NULL) {
+            syslog(LOG_ERR, "could not open output file for beam %d", beamn+i);
+            continue;
+          }
+          send_string("HEADER_START");
+          send_string("source_name");
+          send_string(footer_buf);
+          send_int("machine_id",1);
+          send_int("telescope_id",82);
+          send_int("data_type",1); // filterbank data
+          send_double("fch1",1530.0); // THIS IS CHANNEL 0 :)
+          send_double("foff",-0.244140625);
+          send_int("nchans",1024);
+          send_int("nbits",8);
+          send_double("tstart",55000.0);
+          send_double("tsamp",8.192e-6*8.*4.);
+          send_int("nifs",1);
+          send_string("HEADER_END");
+          fwrite(extData + i*NSAMPS_PER_BLOCK*NCHAN_FIL,sizeof(unsigned char),NSAMPS_PER_BLOCK*NCHAN_FIL,output);
+          fclose(output);
+        }
+        if (name_valid) {
+          syslog(LOG_INFO, "written trigger from specnum %lu TRIGNUM%d DUMPNUM%d %s", specnum, trignum-1, dumpnum, footer_buf);
+          ofile = fopen("/home/ubuntu/data/dumps.dat","a");
+          if (ofile) {
+            fprintf(ofile,"written trigger from specnum %lu TRIGNUM%d DUMPNUM%d %s\n", specnum, trignum-1, dumpnum, footer_buf);
+            fclose(ofile);
+          }
+          dumpnum++;
+        }
+
+        // reset
+        bytes_copied = 0;
+        dump_pending = 0;
+        dumping=0;
+        // deal with next specnum
+        if (next_specnum != 0) {
+          specnum = next_specnum;
+          strcpy(footer_buf,next_footer_buf);
+          next_specnum = 0;
+          dump_pending = 1;
+        }
+
+      }
+
+      // if trigger arrived too late
+      if (specnum + specs_per_block < current_specnum && dumping==0 && dump_pending==1) {
+        syslog(LOG_INFO, "trigger arrived too late: specnum %lu, current_specnum %lu",specnum,current_specnum);
+        // written as a sum: the unsigned difference wrapped around during the first block
+        bytes_copied=0;
+        dump_pending=0;
+
+      }
+
+
+    }
+
+    // update current spec
+    if (DEBUG) syslog(LOG_INFO,"current_specnum %lu",current_specnum);
+    current_specnum += specs_per_block;
+
+
+    // for exiting
+    if (bytes_read < block_size) {
+      observation_complete = 1;
+      syslog(LOG_INFO, "main: finished, with bytes_read %lu < expected %lu\n", bytes_read, block_size);
+    }
+
+    // close block for reading
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+
+
+  }
+
+  // close control thread: it normally sits blocked in recvfrom() and would
+  // not see quit_threads, so it is cancelled before being joined
+  syslog(LOG_INFO, "joining control_thread");
+  quit_threads = 1;
+  pthread_cancel (control_thread_id);
+  pthread_join (control_thread_id, NULL);
+
+  free(extData);
+  dsaX_dbgpu_cleanup (hdu_in);
+  return EXIT_SUCCESS;
+}
diff --git a/legacy/dsaX_fluff.c b/legacy/dsaX_fluff.c
new file mode 100644
index 0000000..3e3f2d1
--- /dev/null
+++ b/legacy/dsaX_fluff.c
@@ -0,0 +1,415 @@
+/* will reorder raw data for input to xgpu */
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+#include <x86intrin.h>
+#include <smmintrin.h>
+#include <immintrin.h>
+
+// Per-thread work descriptor handed to massage() through pthread_create().
+struct data {
+  char * in;      // input block; main() passes the same pointer to every thread
+  char * out;     // output buffer; main() passes the same pointer to every thread
+  int n_threads;  // total number of worker threads sharing the block
+  int thread_id;  // this worker's index: selects its packet range and its cores[] entry
+  int debug;      // non-zero enables this thread's debug messages to syslog
+};
+
+/* global variables */
+int DEBUG = 0; // set to 1 by the -d option; enables debug messages to syslog
+int cores[8] = {22, 23, 24, 25, 26, 27, 28, 29}; // CPU core for each massage() thread, indexed by thread_id
+// forward declarations
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write);
+int dada_bind_thread_to_core (int core);
+
+/* Release the lock held on a DADA HDU and destroy the handle.
+ * in    - HDU to tear down
+ * write - 0 when the HDU holds a read lock, 1 when it holds a write lock;
+ *         any other value leaves the HDU untouched.
+ * A failed unlock is only reported to syslog; the HDU is destroyed anyway.
+ */
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write)
+{
+  switch (write) {
+  case 0:
+    if (dada_hdu_unlock_read (in) < 0)
+      syslog(LOG_ERR, "could not unlock read on hdu_in");
+    dada_hdu_destroy (in);
+    break;
+  case 1:
+    if (dada_hdu_unlock_write (in) < 0)
+      syslog(LOG_ERR, "could not unlock write on hdu_in");
+    dada_hdu_destroy (in);
+    break;
+  default:
+    // unknown mode: nothing to release
+    break;
+  }
+}
+
+/* Print the command-line options of this program to stdout. */
+void usage() {
+  fprintf (stdout,
+	   "dsaX_fluff [options]\n"
+	   " -c core   bind process to CPU core [no default]\n"
+	   " -d send debug messages to syslog\n"
+	   " -t number of threads [default 1]\n"
+	   " -i input key [default CAPTURED_BLOCK_KEY]\n"
+	   " -o output key [default REORDER_BLOCK_KEY]\n"
+	   " -q quitting after testing\n"
+	   " -h print usage\n");
+}
+
+/* Worker thread: transposes this thread's share of the input block.
+ * args points to a struct data.  The thread pins itself to
+ * cores[thread_id] when that entry exists, then handles packets
+ * [NPACKETS*thread_id/n_threads, NPACKETS*(thread_id+1)/n_threads).
+ * Within each packet the input is read as NANTS rows of 768 ints and
+ * written with the two axes swapped, using an output row length of 64.
+ * Always returns NULL; nothing is reported back through pthread_join().
+ */
+void * massage(void *args) {
+  // basic stuff
+  struct data *d = args;
+  int thread_id = d->thread_id;
+  int dbg = d->debug;
+  int na = 64;
+  const int n_cores = (int)(sizeof(cores)/sizeof(cores[0]));
+
+  // set affinity, but never index past the end of cores[]
+  if (thread_id >= 0 && thread_id < n_cores) {
+    const pthread_t pid = pthread_self();
+    const int core_id = cores[thread_id];
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset);
+    CPU_SET(core_id, &cpuset);
+    if (pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset) != 0)
+      syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
+    if (pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset) != 0)
+      syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
+    if (CPU_ISSET(core_id, &cpuset))
+      if (DEBUG || dbg) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id);
+  }
+  else
+    syslog(LOG_ERR,"thread %d: no core assigned, affinity not set",thread_id);
+
+  // extract from input data structure
+  int nthreads = d->n_threads;
+  int * fluffed_int = (int *)(d->in);
+  int * out_int = (int *)(d->out);
+
+  // do block transpose - https://codereview.stackexchange.com/questions/229876/fast-matrix-transpose
+  int tile_size = 4; // set by benchmarking
+  for (int i_packet=NPACKETS*thread_id/nthreads;i_packet<NPACKETS*(thread_id+1)/nthreads;i_packet++) {
+    for (int i=0;i<NANTS;i+=tile_size) {
+      for (int j=0;j<384*2;j++) {
+	for (int b=0;b<tile_size;b++) out_int[i_packet*na*768 + j*na+i+b] = fluffed_int[i_packet*NANTS*768 + (i+b)*384*2+j];
+      }
+    }
+  }
+
+  if (dbg || DEBUG) syslog(LOG_DEBUG,"thread %d: transposed",thread_id);
+  // returning the address of a local (as pthread_exit((void *) &local) did)
+  // would hand the joiner a dangling pointer, so report NULL instead
+  return NULL;
+}
+
+
+// MAIN
+
+/* Entry point: multi-threaded transpose of DADA blocks.
+ *
+ * Parses the command line, connects to the input (read) and output (write)
+ * DADA buffers and copies the header across.  Every input block is then
+ * handed to nthreads massage() workers that transpose it into a scratch
+ * buffer, which is written out as one output block.  The loop ends after
+ * the first input block that is shorter than the input block size.
+ *
+ * A worker that cannot be started is logged; the block is still written
+ * with whatever the started workers produced.
+ *
+ * Returns EXIT_FAILURE on a bad option value, on a DADA set-up error or
+ * when the scratch buffer cannot be allocated.
+ */
+int main (int argc, char *argv[]) {
+  
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_reorder_raw", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  // TESTING and initialization
+  // threads
+  struct data args[16];
+  pthread_t threads[16];
+  pthread_attr_t attr;
+  pthread_attr_init(&attr);
+  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+  void* result=0;
+  
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+
+  // data block HDU keys
+  key_t in_key = CAPTURED_BLOCK_KEY;
+  key_t out_key = REORDER_BLOCK_KEY;
+  
+  // command line arguments
+  int core = -1;
+  int nthreads = 1;
+  int arg = 0;
+  
+  // getopt() reports a missing option argument as '?', so optarg is never
+  // NULL in the cases below and needs no check
+  while ((arg=getopt(argc,argv,"c:t:i:o:dqh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'i':
+	  if (sscanf (optarg, "%x", &in_key) != 1) {
+	    syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+	    return EXIT_FAILURE;
+	  }
+	  break;
+
+	case 'o':
+	  if (sscanf (optarg, "%x", &out_key) != 1) {
+	    syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+	    return EXIT_FAILURE;
+	  }
+	  break;
+
+	case 'c':
+	  core = atoi(optarg);
+	  break;
+
+	case 't':
+	  nthreads = atoi(optarg);
+	  break;
+
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+
+	case 'q':
+	  syslog (LOG_INFO, "Quit here");
+	  return EXIT_SUCCESS;
+	  
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+
+  // each worker indexes cores[] with its thread id, so the thread count may
+  // not exceed the number of entries there (args[] and threads[] hold 16)
+  const int max_threads = (int)(sizeof(cores)/sizeof(cores[0]));
+  if (nthreads < 1 || nthreads > max_threads) {
+    syslog(LOG_ERR,"number of threads must be between 1 and %d", max_threads);
+    usage();
+    return EXIT_FAILURE;
+  }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      else
+	syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+  
+  // DADA stuff
+  
+  syslog (LOG_INFO, "creating in and out hdus");
+  
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      return EXIT_FAILURE;
+    }
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      return EXIT_FAILURE;
+    }
+
+
+  
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+  
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
+  uint64_t  bytes_read = 0;
+  char * block, * output_buffer, * blockie;
+  output_buffer = (char *)malloc(sizeof(char)*block_out);
+  if (!output_buffer) {
+    syslog(LOG_ERR, "could not allocate output buffer");
+    dsaX_dbgpu_cleanup (hdu_in,0);
+    dsaX_dbgpu_cleanup (hdu_out,1);
+    return EXIT_FAILURE;
+  }
+  memset(output_buffer,0,block_out);
+  uint64_t block_id;
+
+  // set up
+
+  int observation_complete=0;
+  int blocks = 0;
+  int started = 0;
+
+
+  
+  syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+    }
+
+    // DO STUFF
+
+    // set up data structure
+    for (int i=0; i<nthreads; i++) {
+      args[i].in = block;
+      args[i].out = output_buffer;
+      args[i].n_threads = nthreads;
+      args[i].thread_id = i;
+      args[i].debug = 0;
+    }
+
+    if (DEBUG) syslog(LOG_DEBUG,"creating %d threads",nthreads);
+    
+    // stop at the first failure so that only started threads are joined
+    int ncreated = 0;
+    for(; ncreated<nthreads; ncreated++){
+      if (pthread_create(&threads[ncreated], &attr, &massage, (void *)(&args[ncreated]))) {
+	syslog(LOG_ERR,"Failed to create massage thread %d\n", ncreated);
+	break;
+      }
+    }
+    if (DEBUG) syslog(LOG_DEBUG,"threads kinda running");
+    
+    for(int i=0; i<ncreated; i++){
+      pthread_join(threads[i], &result);
+      if (DEBUG) syslog(LOG_DEBUG,"joined thread %d",i);
+    }
+    
+    // write to output
+    blockie = ipcio_open_block_write (hdu_out->data_block, &block_id);
+    memcpy(blockie, output_buffer, block_out);
+    ipcio_close_block_write(hdu_out->data_block, block_out);
+    
+    if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks);      
+    blocks++;
+    
+
+    if (bytes_read < block_size)
+      observation_complete = 1;
+
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+
+  }
+
+  // attr is reused by every pthread_create() call in the loop above, so it
+  // may only be destroyed once the loop has finished
+  pthread_attr_destroy(&attr);
+  free(output_buffer);
+
+  dsaX_dbgpu_cleanup (hdu_in,0);
+  dsaX_dbgpu_cleanup (hdu_out,1);
+
+  return EXIT_SUCCESS;
+}
+
+
diff --git a/legacy/dsaX_makeFil.c b/legacy/dsaX_makeFil.c
new file mode 100644
index 0000000..e9d6e3c
--- /dev/null
+++ b/legacy/dsaX_makeFil.c
@@ -0,0 +1,276 @@
+/* skeleton for a voltage-to-filterbank converter: currently copies each input DADA block to the output buffer unchanged */
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+// global variables
+int DEBUG = 0; // set to 1 by the -d option; enables debug messages to syslog
+// forward declarations
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
+int dada_bind_thread_to_core (int core);
+
+/* Tear down both HDUs used by main().
+ * in  - input HDU, expected to hold a read lock
+ * out - output HDU, expected to hold a write lock
+ * Each HDU is unlocked (a failure is only logged) and then destroyed,
+ * the input first.
+ */
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
+{
+  const int read_unlocked = dada_hdu_unlock_read (in);
+  if (read_unlocked < 0) syslog(LOG_ERR, "could not unlock read on hdu_in");
+  dada_hdu_destroy (in);
+
+  const int write_unlocked = dada_hdu_unlock_write (out);
+  if (write_unlocked < 0) syslog(LOG_ERR, "could not unlock write on hdu_out");
+  dada_hdu_destroy (out);
+}
+
+/* Print the command-line options of this program to stdout. */
+void usage() {
+  fprintf (stdout,
+	   "dsaX_makeFil [options]\n"
+	   " -c core   bind process to CPU core [no default]\n"
+	   " -d send debug messages to syslog\n"
+	   " -i in_key [default TEST_BLOCK_KEY]\n"
+	   " -o out_key [default REORDER_BLOCK_KEY2]\n"
+	   " -h print usage\n");
+}
+
+// MAIN
+
+/* Entry point.
+ *
+ * Parses the command line, connects to the input (read) and output (write)
+ * DADA buffers and copies the header across.  Each input block is then
+ * written to the output buffer as it is: the voltage-to-filterbank
+ * conversion announced in the loop body is not implemented yet.  The loop
+ * ends after the first input block that is shorter than the input block size.
+ *
+ * Returns EXIT_FAILURE on an unparsable key, on a DADA set-up error, when
+ * the output block is larger than the input block, or on a failed write.
+ */
+int main (int argc, char *argv[]) {
+
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_copydb", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+  
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+
+  // data block HDU keys
+  key_t in_key = TEST_BLOCK_KEY;
+  key_t out_key = REORDER_BLOCK_KEY2;
+  
+  // command line arguments
+  int core = -1;
+  int arg = 0;
+  
+  // getopt() reports a missing option argument as '?', so optarg is never
+  // NULL in the cases below and needs no check; -f is accepted with an
+  // argument but has no effect
+  while ((arg=getopt(argc,argv,"c:f:i:o:dh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'c':
+	  core = atoi(optarg);
+	  break;
+
+	case 'i':
+	  if (sscanf (optarg, "%x", &in_key) != 1) {
+	    syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+	    return EXIT_FAILURE;
+	  }
+	  break;
+
+	case 'o':
+	  if (sscanf (optarg, "%x", &out_key) != 1) {
+	    syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+	    return EXIT_FAILURE;
+	  }
+	  break;
+
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      else
+	syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+  
+  // DADA stuff
+  
+  syslog (LOG_INFO, "creating in and out hdus");
+  
+  // dada_hdu_create() is passed a null log, as in the other dsaX tools
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+  
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+  
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  // uint64_t need not be unsigned long long, so cast to match %llu
+  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",(unsigned long long)block_size,(unsigned long long)block_out);
+  uint64_t  bytes_read = 0;
+  char * block;
+  int64_t written;
+  uint64_t block_id;
+
+  // block_out bytes are read from every input block below, so an input
+  // block must be at least that large
+  if (block_out > block_size)
+    {
+      syslog(LOG_ERR, "main: output block is larger than input block");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  // set up
+  int observation_complete=0;
+  int blocks = 0, started = 0;
+  
+  syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+    }
+
+    // DO STUFF
+    // here is where we convert input voltage data to output filterbank data
+
+    // write to output dada block; the count is kept signed so that an
+    // error return is not mistaken for a large byte count
+    written = (int64_t) ipcio_write (hdu_out->data_block, block, block_out);
+    if (written < 0 || (uint64_t) written < block_out)
+      {
+	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+	return EXIT_FAILURE;
+      }
+    
+    if (DEBUG) {
+      syslog(LOG_DEBUG, "written block %d",blocks);      
+    }
+    blocks++;
+
+    if (bytes_read < block_size)
+      observation_complete = 1;
+    
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+
+  }
+
+  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+
+  return EXIT_SUCCESS;
+}
+
+
diff --git a/legacy/dsaX_merge.c b/legacy/dsaX_merge.c
new file mode 100644
index 0000000..7866d5f
--- /dev/null
+++ b/legacy/dsaX_merge.c
@@ -0,0 +1,580 @@
+/* merges two input DADA buffers into one output buffer, placing each 1536-byte chunk according to the antenna-order tables */
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+/* global variables */
+int DEBUG = 0; // set to 1 by the -d option; enables debug messages to syslog
+int STATS = 0; // not referenced anywhere else in this file
+const int nth = 4; // number of massage() threads started per block; matches the length of cores[]
+
+// data to pass to threads
+struct data {
+  char * in;          // block read from the first input buffer
+  char * in2;         // block read from the second input buffer
+  char * out;         // destination: scratch buffer, or the open output block with -m
+  int * ant_order1;   // output slot for each 1536-byte chunk taken from in
+  int * ant_order2;   // output slot for each 1536-byte chunk taken from in2
+  int n_threads;      // total number of worker threads
+  int thread_id;      // this worker's index: selects its share of the work and its cores[] entry
+};
+int cores[4] = {17, 18, 37, 38}; // CPU core for each massage() thread, indexed by thread_id
+
+
+/* Worker thread: interleaves its share of the two input blocks into out.
+ * args points to a struct data.  The thread pins itself to
+ * cores[thread_id], then for each of its 2048/n_threads outer indices
+ * copies 3*NSNAPS/2 chunks of 1536 bytes from in and from in2 to the
+ * output slots listed in ant_order1 and ant_order2 respectively.
+ * Always returns NULL; nothing is reported back through pthread_join().
+ */
+void * massage (void *args) {
+  struct data *d = args;
+  int thread_id = d->thread_id;
+
+  // set affinity
+  const pthread_t pid = pthread_self();
+  const int core_id = cores[thread_id];
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(core_id, &cpuset);
+  if (pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset) != 0)
+    syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
+  if (pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset) != 0)
+    syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
+  if (CPU_ISSET(core_id, &cpuset))
+    if (DEBUG) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id);
+
+  // extract from input
+  char *in = d->in;
+  char *in2 = d->in2;
+  char *out = d->out;
+  int n_threads = d->n_threads;
+  int * ao1 = d->ant_order1;
+  int * ao2 = d->ant_order2;
+
+  // offsets are computed in 64 bits so that they cannot overflow int
+  const uint64_t ncpy = 1536, in_row = (uint64_t)(NSNAPS/2)*4608, out_row = (uint64_t)NSNAPS*4608;
+  for (int i=thread_id*(2048/n_threads);i<(thread_id+1)*(2048/n_threads);i++) {
+    for (int j=0;j<3*NSNAPS/2;j++) {
+      const uint64_t iidx = i*in_row + j*ncpy;
+      memcpy(out + i*out_row + ao1[j]*ncpy, in + iidx, ncpy);
+      memcpy(out + i*out_row + ao2[j]*ncpy, in2 + iidx, ncpy);
+    }
+  }
+
+  // returning the address of a local would leave the joiner with a dangling pointer
+  return NULL;
+}
+
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write);
+int dada_bind_thread_to_core (int core);
+
+/* Unlock and destroy one DADA HDU.
+ * in    - HDU to tear down
+ * write - 0: release a read lock; 1: release a write lock;
+ *         any other value leaves the HDU untouched.
+ * A failed unlock is only reported to syslog; the HDU is destroyed anyway.
+ */
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write)
+{
+  int unlock_status;
+  // only the two documented modes do anything
+  if (write != 0 && write != 1)
+    return;
+
+  if (write == 0) {
+    unlock_status = dada_hdu_unlock_read (in);
+    if (unlock_status < 0)
+      syslog(LOG_ERR, "could not unlock read on hdu_in");
+  }
+  else {
+    unlock_status = dada_hdu_unlock_write (in);
+    if (unlock_status < 0)
+      syslog(LOG_ERR, "could not unlock write on hdu_in");
+  }
+  dada_hdu_destroy (in);
+}
+
+/* Print the command-line options of this program to stdout. */
+void usage() {
+  fprintf (stdout,
+	   "dsaX_merge [options]\n"
+	   " -c core   bind process to CPU core [no default]\n"
+	   " -d send debug messages to syslog\n"
+	   " -m multithread write\n"
+	   " -i in_key\n"
+	   " -o out_key\n"
+	   " -j in_key2\n"
+	   " -h print usage\n");
+}
+
+
+// MAIN
+
+/* Entry point.
+ *
+ * Parses the command line and connects to two input DADA buffers (read)
+ * and one output DADA buffer (write); the header of the second input is
+ * copied to the output.  For every pair of input blocks, nth massage()
+ * workers copy the data into the output layout given by the antenna-order
+ * tables.  With -m the workers write straight into the open output block,
+ * otherwise into a scratch buffer that is then passed to ipcio_write().
+ * The loop ends after the first short block on either input.
+ */
+int main (int argc, char *argv[]) {
+  
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_merge", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+  
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+  dada_hdu_t* hdu_in2 = 0;
+
+  // data block HDU keys
+  key_t in_key = CAPTURE_BLOCK_KEY;
+  key_t out_key = CAPTURED_BLOCK_KEY;
+  key_t in_key2 = REORDER_BLOCK_KEY2;
+  
+  // command line arguments
+  int core = -1;
+  int arg = 0;
+  int mwrite = 0;
+  
+  // getopt() reports a missing option argument as '?', so optarg is never
+  // NULL in the cases below and needs no check
+  while ((arg=getopt(argc,argv,"c:i:o:j:dmh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'c':
+	  core = atoi(optarg);
+	  break;
+
+	case 'i':
+	  if (sscanf (optarg, "%x", &in_key) != 1) {
+	    syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+	    return EXIT_FAILURE;
+	  }
+	  break;
+
+	case 'o':
+	  if (sscanf (optarg, "%x", &out_key) != 1) {
+	    syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+	    return EXIT_FAILURE;
+	  }
+	  break;
+
+	case 'j':
+	  if (sscanf (optarg, "%x", &in_key2) != 1) {
+	    syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+	    return EXIT_FAILURE;
+	  }
+	  break;
+
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+
+	case 'm':
+	  mwrite=1;
+	  syslog (LOG_INFO, "Will do multithread write");
+	  break;
+
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      else
+	syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+  
+  // DADA stuff
+  
+  syslog (LOG_INFO, "creating in and out hdus");
+  
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+
+  hdu_in2  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in2, in_key2);
+  if (dada_hdu_connect (hdu_in2) < 0) {
+    syslog (LOG_ERR,"could not connect to input  buffer2");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read(hdu_in2) < 0) {
+    syslog (LOG_ERR, "could not lock to input buffer2");
+    return EXIT_FAILURE;
+  }
+  
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_in2,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_in2,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      return EXIT_FAILURE;
+    }
+  header_in = ipcbuf_get_next_read (hdu_in2->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_in2,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in2->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_in2,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      return EXIT_FAILURE;
+    }
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_in2,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_in2,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      return EXIT_FAILURE;
+    }
+  
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+
+  // sort out ant order
+  int * ao1, * ao2;
+  ao1 = (int *)malloc(sizeof(int)*48);
+  ao2 = (int *)malloc(sizeof(int)*48);
+  if (!ao1 || !ao2)
+    {
+      syslog(LOG_ERR, "could not allocate antenna order tables");
+      free(ao1);
+      free(ao2);
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_in2,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      return EXIT_FAILURE;
+    }
+  ao1[0] = 19;
+  ao1[1] = 20;
+  ao1[2] = 21;
+  ao1[3] = 25;
+  ao1[4] = 26;
+  ao1[5] = 27;
+  ao1[6] = 18;
+  ao1[7] = 17;
+  ao1[8] = 16;
+  ao1[9] = 12;
+  ao1[10] = 11;
+  ao1[11] = 45;
+  ao1[12] = 83;
+  ao1[13] = 10;
+  ao1[14] = 9;
+  ao1[15] = 6;
+  ao1[16] = 5;
+  ao1[17] = 4;
+  ao1[18] = 0;
+  ao1[19] = 84;
+  ao1[20] = 85;
+  ao1[21] = 89;
+  ao1[22] = 90;
+  ao1[23] = 91;
+  ao1[24] = 39;
+  ao1[25] = 40;
+  ao1[26] = 41;
+  ao1[27] = 33;
+  ao1[28] = 34;
+  ao1[29] = 35;
+  ao1[30] = 42;
+  ao1[31] = 43;
+  ao1[32] = 44;
+  ao1[33] = 51;
+  ao1[34] = 52;
+  ao1[35] = 53;
+  ao1[36] = 57;
+  ao1[37] = 58;
+  ao1[38] = 59;
+  ao1[39] = 63;
+  ao1[40] = 64;
+  ao1[41] = 65;
+  ao1[42] = 69;
+  ao1[43] = 70;
+  ao1[44] = 71;
+  ao1[45] = 75;
+  ao1[46] = 76;
+  ao1[47] = 77;
+  ao2[0] = 22;
+  ao2[1] = 23;
+  ao2[2] = 24;
+  ao2[3] = 28;
+  ao2[4] = 29;
+  ao2[5] = 30;
+  ao2[6] = 15;
+  ao2[7] = 14;
+  ao2[8] = 13;
+  ao2[9] = 46;
+  ao2[10] = 47;
+  ao2[11] = 48;
+  ao2[12] = 82;
+  ao2[13] = 8;
+  ao2[14] = 7;
+  ao2[15] = 3;
+  ao2[16] = 2;
+  ao2[17] = 1;
+  ao2[18] = 86;
+  ao2[19] = 87;
+  ao2[20] = 88;
+  ao2[21] = 92;
+  ao2[22] = 93;
+  ao2[23] = 94;
+  ao2[24] = 95;
+  ao2[25] = 31;
+  ao2[26] = 32;
+  ao2[27] = 36;
+  ao2[28] = 37;
+  ao2[29] = 38;
+  ao2[30] = 81;
+  ao2[31] = 49;
+  ao2[32] = 50;
+  ao2[33] = 54;
+  ao2[34] = 55;
+  ao2[35] = 56;
+  ao2[36] = 60;
+  ao2[37] = 61;
+  ao2[38] = 62;
+  ao2[39] = 66;
+  ao2[40] = 67;
+  ao2[41] = 68;
+  ao2[42] = 72;
+  ao2[43] = 73;
+  ao2[44] = 74;
+  ao2[45] = 78;
+  ao2[46] = 79;
+  ao2[47] = 80;
+
+  
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
+  uint64_t  bytes_read = 0, bytes_read2 = 0;
+  char * block1, * block2, * o1;
+  char * output = (char *)malloc(sizeof(char)*block_out);
+  if (!output)
+    {
+      syslog(LOG_ERR, "could not allocate output buffer");
+      free(ao1);
+      free(ao2);
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_in2,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      return EXIT_FAILURE;
+    }
+  int64_t written;
+  uint64_t block_id;
+
+  // set up threads
+  struct data args[8];
+  pthread_t threads[8];
+  pthread_attr_t attr;
+  pthread_attr_init(&attr);
+  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+  void* result=0;
+  
+  // send through fake blocks
+
+  /*  if (fake>0) {
+    syslog(LOG_INFO,"sending %d fake blocks",fake);
+    for (int i=0;i<fake;i++) {
+      o1 = ipcio_open_block_write (hdu_out->data_block, &block_id);
+      memcpy(o1, output, block_out);
+      ipcio_close_block_write (hdu_out->data_block, block_out);
+      usleep(10000);
+    }
+    syslog(LOG_INFO,"Finished with fake blocks");
+    }*/
+
+  // set up
+
+  int observation_complete=0;
+  int blocks = 0;
+  int started = 0;
+
+  syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+
+    // open block; each input keeps its own byte count so that it is
+    // closed with the size that was actually read from it
+    block1 = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    block2 = ipcio_open_block_read (hdu_in2->data_block, &bytes_read2, &block_id);
+
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+    }
+
+    // DO STUFF
+    // copy to output buffer
+    if (mwrite) {
+      o1 = ipcio_open_block_write (hdu_out->data_block, &block_id);
+    }
+    
+    // set up data structure
+    for (int i=0; i<nth; i++) {
+      args[i].in = block1;
+      args[i].in2 = block2;
+      args[i].ant_order1 = ao1;
+      args[i].ant_order2 = ao2;
+      
+      if (mwrite) 
+	args[i].out = o1;	
+      else
+	args[i].out = output;
+
+      args[i].n_threads = nth;
+      args[i].thread_id = i;
+    }
+    
+    // stop at the first failure so that only started threads are joined
+    int ncreated = 0;
+    for(; ncreated<nth; ncreated++){
+      if (pthread_create(&threads[ncreated], &attr, &massage, (void *)(&args[ncreated]))) {
+	syslog(LOG_ERR,"Failed to create massage thread %d\n", ncreated);
+	break;
+      }
+    }
+    if (DEBUG) syslog(LOG_DEBUG,"threads kinda running");
+    
+    for(int i=0; i<ncreated; i++){
+      pthread_join(threads[i], &result);
+      if (DEBUG) syslog(LOG_DEBUG,"joined thread %d",i);
+    }
+    
+    if (!mwrite) {
+      // the count is kept signed so that an error return is not mistaken
+      // for a large byte count
+      written = (int64_t) ipcio_write (hdu_out->data_block, output, block_out);
+      if (written < 0 || (uint64_t) written < block_out)
+	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+    }
+    else {
+      ipcio_close_block_write (hdu_out->data_block, block_out);
+    }
+
+    if (blocks % 10 == 0)
+      syslog(LOG_INFO, "written block %d",blocks);      
+    blocks++;
+    
+    if (bytes_read < block_size || bytes_read2 < block_size)
+      observation_complete = 1;            
+    
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+    ipcio_close_block_read (hdu_in2->data_block, bytes_read2);
+
+  }
+
+  // attr is reused by every pthread_create() call in the loop above, so it
+  // may only be destroyed once the loop has finished
+  pthread_attr_destroy(&attr);
+  free(output);
+  free(ao1);
+  free(ao2);
+  dsaX_dbgpu_cleanup (hdu_in,0);
+  dsaX_dbgpu_cleanup (hdu_in2,0);
+  dsaX_dbgpu_cleanup (hdu_out,1);
+  return EXIT_SUCCESS;
+}
+
+
diff --git a/legacy/dsaX_nicdb.c b/legacy/dsaX_nicdb.c
new file mode 100644
index 0000000..df47ebe
--- /dev/null
+++ b/legacy/dsaX_nicdb.c
@@ -0,0 +1,483 @@
+/*
+https://dzone.com/articles/parallel-tcpip-socket-server-with-multi-threading
+
+gcc -o test_ipcbuf test_ipcbuf.c -I/usr/local/psrdada/src -I/usr/local/include -L/usr/local/lib -lpsrdada -lm -pthread -g -O2 -L/usr/lib/gcc/x86_64-linux-gnu/5 -lgfortran
+
+the plan is to have NCLIENTS threads listening on different ports (FIL_PORT0 + thread index).
+each UDP packet starts with a 12-byte header: the channel group, time sequence and packet sequence as three ints
+the rest is a (P_SIZE-12)-byte slice of a NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW char array that needs to be arranged correctly
+The output must be [NBEAMS_PER_BLOCK, NSAMPS_PER_BLOCK, NCHAN_FIL]. 
+
+After a block is full, the data need to be written out (data rate 525 Mb/s)
+The number of receives before switching blocks is NCLIENTS*NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT. 
+switch block when one block is being written out
+
+*/
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+#define bdepth 16
+#define MAX_FULLBLOCK 4
+
+// global variables (shared between main() and the process() receiver threads)
+int DEBUG = 0; // set to 1 by -d; enables the extra syslog messages
+volatile int blockct[bdepth]; // per ring slot: capture rounds stored so far; a slot is normally flushed at NCLIENTS*NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT
+volatile int flush_flag = 0; // set to 1 by a receiver to ask main() to write out a block of output1; main() resets it to 0
+volatile int writing = 0; // 1 while main() is inside ipcio_write
+volatile int global_tseq = 0; // global count of full buffers
+int cores[16] = {3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28}; // to bind threads to
+char iP[100]; // address string from -i; the receivers bind their sockets to it
+pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;	  // guards the blockct bookkeeping in process()
+
+// structure to pass to threads (one instance per receiver thread, filled in by main)
+struct data
+{
+  char * output1; // staging buffer the receiver scatters into; main allocates bdepth output blocks for it
+  char * output2; // second buffer allocated by main; process() never writes to it
+  uint16_t tport; // UDP port this thread binds to (FIL_PORT0 + thread index)
+  int thread_id; // index of the thread; also selects its entry in cores[]
+};
+
+// function prototypes
+void dsaX_dbgpu_cleanup (dada_hdu_t * out);
+int dada_bind_thread_to_core (int core);
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * out)
+{
+  // Release the write lock; a failure is logged but does not stop the teardown.
+  const int unlock_status = dada_hdu_unlock_write (out);
+  if (unlock_status < 0) {
+    syslog(LOG_ERR, "could not unlock write on hdu_out");
+  }
+  // The HDU is destroyed whether or not the unlock succeeded.
+  dada_hdu_destroy (out);
+}
+
+
+// Receiver thread (never returns): binds a UDP socket to iP:tport, assembles packets into one transmit, scatters it into output1.
+void * process(void * ptr)
+{
+  // ptr points at this thread's struct data; only thread_id, output1 and tport are used
+  // arguments from structure
+  struct data *d = ptr;
+  int thread_id = d->thread_id;
+  char *output1 = (char *)d->output1;
+  // (d->output2 is not used by the receiver)
+  uint16_t tport = d->tport;
+  
+  // set affinity: pin this thread to cores[thread_id]; failures are logged, not fatal
+  const pthread_t pid = pthread_self();
+  const int core_id = cores[thread_id];
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(core_id, &cpuset);
+  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (set_result != 0)
+    syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
+  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (get_affinity != 0) 
+    syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
+  if (CPU_ISSET(core_id, &cpuset))
+    if (DEBUG) syslog(LOG_INFO,"thread %d: successfully set thread",thread_id);
+  // set up socket (UDP); slen is a socklen_t because recvfrom() takes a socklen_t *
+  struct sockaddr_in si_other, si_me;
+  socklen_t slen=sizeof(si_other);
+  const int clientSocket=socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
+  if (clientSocket < 0) { syslog(LOG_ERR,"thread %d: cannot create socket",thread_id); exit(1); }
+  if (DEBUG) syslog(LOG_INFO,"thread %d: Made socket",thread_id);
+  memset((char *) &si_me, 0, sizeof(si_me));
+  si_me.sin_family = AF_INET;
+  si_me.sin_port = htons(tport);
+  si_me.sin_addr.s_addr = inet_addr(iP);
+  if (bind(clientSocket, (struct sockaddr *)&si_me, sizeof(si_me)) < 0) {
+    syslog(LOG_ERR,"thread %d: cannot bind to port",thread_id);
+    exit(1);
+  }
+  if (DEBUG) syslog(LOG_INFO,"thread %d: socket bound - waiting for header packet",thread_id);
+  char * packet = (char *)malloc(sizeof(char)*P_SIZE);
+  if (packet == NULL) { syslog(LOG_ERR,"thread %d: cannot allocate packet buffer",thread_id); exit(1); }
+  int * ibuf = (int *)(packet); // header view of packet: ibuf[0]=chgroup, ibuf[1]=tseq, ibuf[2]=pseq
+  // the first packet is only used to learn the sender's channel group; retry until a full 12-byte header arrives
+  while (recvfrom(clientSocket, packet, P_SIZE, 0,(struct sockaddr *)&si_other,&slen) < 12) syslog(LOG_ERR,"thread %d: bad header packet",thread_id);
+  int chgroup = ibuf[0];
+  syslog(LOG_INFO,"thread %d: accepted connection from chgroup %d",thread_id,chgroup);
+
+  // data buffer and other variables
+  char * buffer = (char *)malloc((NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW)*sizeof(char));
+  if (buffer == NULL) { syslog(LOG_ERR,"thread %d: cannot allocate transmit buffer",thread_id); exit(1); }
+  const int npackets = NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW/(P_SIZE-12); // packets making up one transmit
+  int tseq, pseq;
+  int pct = 0;
+  int full_blocks = 0;
+  int i0, aa;
+  int lastPacket, nextBuf, current_tseq = 0, act_tseq; 
+  uint64_t oidx_offset, oidx;
+  
+  // infinite loop 
+  while (1) {
+  
+    /* read message */
+    // fill up local buffer: collect packets until the last one of this transmit, or one from a later transmit, is seen
+    lastPacket = 0;
+    nextBuf = 0;
+    while ((lastPacket==0) && (nextBuf==0)) {
+      // drop failed or short reads so that a stale header is never interpreted
+      if (recvfrom(clientSocket, packet, P_SIZE, 0,(struct sockaddr *)&si_other,&slen) < 12) continue;
+      pseq = ibuf[2];
+      if (chgroup != ibuf[0]) 
+	syslog(LOG_ERR,"thread %d: received chgroup %d is not recorded %d",thread_id,ibuf[0],chgroup);
+      tseq = ibuf[1];
+      // pseq comes off the wire and indexes buffer in both memcpy calls: reject anything outside [0, npackets)
+      if (pseq < 0 || pseq >= npackets) { syslog(LOG_ERR,"thread %d: dropping packet with bad pseq %d",thread_id,pseq); continue; }
+      if (tseq>current_tseq) {
+	nextBuf=1;
+      }
+      else if (tseq==current_tseq) {
+	memcpy(buffer+pseq*(P_SIZE-12),packet+12,P_SIZE-12);
+	pct++;
+      }
+
+      if (pseq==npackets-1)
+	lastPacket=1;
+
+    }
+    
+    if (pct != npackets)
+      syslog(LOG_ERR,"thread %d: only received %d of %d",thread_id,pct,npackets);
+    
+    act_tseq = (current_tseq * NSAMPS_PER_TRANSMIT) % NSAMPS_PER_BLOCK; // place within output buffer
+
+    // at this stage we have a full local buffer
+    // this needs to be placed in the global buffer
+      
+    // output order is [beam, time, freq]. input order is [beam, time, freq], but only a subset of freqs
+    i0 = 0;
+    aa = ((current_tseq / (NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT)) % bdepth);
+    oidx_offset = ((uint64_t)(aa))*NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NCHAN_FIL;
+    // each thread fills only its own NW channels (starting at CHOFF/8 + chgroup*NW) of every output row
+    for (int i=0;i<NBEAMS_PER_BLOCK;i++) {
+      for (int j=0;j<NSAMPS_PER_TRANSMIT;j++) {
+	for (int k=0;k<NW;k++) {
+
+	  // widen i before multiplying so the beam offset is computed in 64 bits
+	  oidx = oidx_offset + (uint64_t)i*NSAMPS_PER_BLOCK*NCHAN_FIL + (act_tseq+j)*NCHAN_FIL + CHOFF/8 + chgroup*NW + k;
+	  output1[oidx] = buffer[i0];
+
+	  i0++;
+
+	}
+      }
+    }
+    // NOTE(review): chgroup in the scatter above is taken from the wire unchecked; an out-of-range value would index outside output1 — confirm the sender's range
+
+    // at this stage we have dealt with this capture round, and must address blockct within mutex
+    pthread_mutex_lock(&mutex);
+
+    // increment appropriate blockct
+    aa = ((current_tseq / (NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT)) % bdepth);
+    blockct[aa] += 1;
+
+
+    // deal with full block anywhere
+    full_blocks=0;
+    for (int i=0;i<bdepth;i++) {
+      if (blockct[i]!=0) full_blocks++;
+    }
+    for (int i=0;i<bdepth;i++) {
+      if ((blockct[i] == NCLIENTS*NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT) || (full_blocks>=MAX_FULLBLOCK && blockct[i] >= (NCLIENTS-1)*NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT)) {
+
+	// need to write this block and reset blockct
+	while (flush_flag==1)
+	  ; // spin, with the mutex held, until main() has cleared the previous flush request
+	flush_flag = 1;
+	blockct[i] = 0;
+	// log - hardcoded bdepth
+	full_blocks -= 1;
+	syslog(LOG_INFO,"thread %d: Writing global_tseq %d. Blockcts_full %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d",thread_id,global_tseq,full_blocks,blockct[0],blockct[1],blockct[2],blockct[3],blockct[4],blockct[5],blockct[6],blockct[7],blockct[8],blockct[9],blockct[10],blockct[11],blockct[12],blockct[13],blockct[14],blockct[15]);
+
+
+      }
+
+    }
+
+    pthread_mutex_unlock(&mutex);
+
+    // advance local tseq and deal with packet capture
+    if (lastPacket==1) {
+      current_tseq++;
+      lastPacket=0;
+      nextBuf=0;
+      pct=0;
+    }
+    if (nextBuf==1) {
+      current_tseq++;
+      memcpy(buffer+pseq*(P_SIZE-12),packet+12,P_SIZE-12);
+      pct=1;
+      lastPacket=0;
+    }
+
+
+
+  }
+
+  /* close socket and clean up (not reached: the loop above has no exit) */
+  close(clientSocket);
+  free(packet);
+  free(buffer);
+  pthread_exit(0);
+  
+}
+
+void usage()
+{
+  // Write the option summary to stdout; the text has no format directives, so fputs emits it verbatim.
+  fputs ("dsaX_nicdb [options]\n"
+	 " -c core   bind process to CPU core [no default]\n"
+	 " -f header file [no default]\n"
+	 " -d send debug messages to syslog\n"
+	 " -o out_key [default BEAMCAPTURE_BLOCK_KEY]\n"
+	 " -i IP address\n"
+	 " -h print usage\n", stdout);
+}
+
+
+// Entry point: parse options, attach to the output DADA buffer, start NCLIENTS receiver threads, then write out each block they flag.
+int main(int argc, char ** argv)
+{
+    
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_nicdb", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  // threads (arrays are sized 16 — assumes NCLIENTS <= 16, TODO confirm)
+  struct data args[16];
+  pthread_t threads[16];
+  pthread_attr_t attr;
+  pthread_attr_init(&attr);
+  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+  void* result=0;
+  for (int i=0;i<bdepth;i++) blockct[i] = 0;
+
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_out = 0;
+
+  // data block HDU keys
+  key_t out_key = BEAMCAPTURE_BLOCK_KEY;
+  
+  // command line arguments
+  int core = -1;
+  int arg = 0;
+  char fnam[200] = ""; // header file name; left empty (so fopen fails cleanly) when -f is absent
+  
+  while ((arg=getopt(argc,argv,"c:f:o:i:dh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'f':
+	  if (optarg)
+	    {
+	      snprintf(fnam,sizeof(fnam),"%s",optarg); // bounded copy: optarg may be longer than fnam
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-f flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_INFO, "Will excrete all debug messages");
+	  break;
+	case 'i':
+	  snprintf(iP,sizeof(iP),"%s",optarg); // bounded copy into the fixed-size global iP
+	  break;
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      else syslog(LOG_NOTICE,"bound to core %d", core); // only report success when the bind worked
+    }
+
+  // DADA stuff
+
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+
+  // deal with headers: the header block must be valid before the file is read into it
+  uint64_t header_size = 4096;
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_out);
+      return EXIT_FAILURE;
+    }
+  FILE *fin;
+  if (!(fin=fopen(fnam,"rb"))) {
+    syslog(LOG_ERR,"cannot open dada header file %s",fnam);
+    return EXIT_FAILURE;
+  }
+  fread(header_out, 4096, 1, fin); // result not checked: a header file shorter than 4096 bytes gives a short read
+  fclose(fin);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_out);
+      return EXIT_FAILURE;
+    }
+    
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+  
+  // get block sizes and allocate memory
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have output block sizes %lu\n",block_out);
+  char *output1 = (char *)malloc(sizeof(char)*block_out*bdepth); // ring of bdepth output blocks
+  char *output2 = (char *)malloc(sizeof(char)*block_out);
+  if (output1 == NULL || output2 == NULL) {
+    syslog(LOG_ERR, "main: could not allocate output buffers");
+    dsaX_dbgpu_cleanup (hdu_out); return EXIT_FAILURE;
+  }
+  memset(output1,0,block_out*bdepth); memset(output2,0,block_out);
+  uint64_t written;
+  // set up threads
+  
+  // set up data structure
+  for (int i=0; i<NCLIENTS; i++) {
+    args[i].output1 = output1;
+    args[i].output2 = output2;
+    args[i].thread_id = i;
+    args[i].tport = FIL_PORT0 + (uint16_t)(i);
+  }
+
+  if (DEBUG) syslog(LOG_INFO,"creating %d threads (one per client)",NCLIENTS);
+    
+  for(int i=0; i<NCLIENTS; i++){
+    if (pthread_create(&threads[i], &attr, &process, (void *)(&args[i]))) {
+      syslog(LOG_ERR,"Failed to create thread %d\n", i);
+    }
+  }
+  pthread_attr_destroy(&attr);
+  if (DEBUG) syslog(LOG_INFO,"threads kinda running");
+  
+  int observation_complete=0;
+  int blocks = 0;
+  // observation_complete is never set below: the loop only ends through the error return
+  
+  syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+
+    // look for complete block
+
+    // busy-wait until a receiver thread raises flush_flag
+    while (flush_flag==0)
+      ;
+
+    // write to output
+    writing=1;
+    written = ipcio_write (hdu_out->data_block, output1 + (global_tseq % bdepth)*block_out, block_out);
+    global_tseq += 1;
+    writing=0;
+    if (written < block_out)
+      {
+	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");	
+	dsaX_dbgpu_cleanup (hdu_out);
+	return EXIT_FAILURE;
+      }
+    
+    syslog(LOG_INFO, "written block %d",blocks);      
+    blocks++;
+
+    flush_flag = 0;
+
+  }
+      
+  
+  // free stuff
+  for(int i=0; i<NCLIENTS; i++){
+    pthread_join(threads[i], &result);
+    if (DEBUG) syslog(LOG_INFO,"joined thread %d",i);
+  }
+  free(output1);
+  free(output2);
+  dsaX_dbgpu_cleanup(hdu_out);
+  return EXIT_SUCCESS;
+}
diff --git a/legacy/dsaX_nicdb.c.bak b/legacy/dsaX_nicdb.c.bak
new file mode 100644
index 0000000..b309424
--- /dev/null
+++ b/legacy/dsaX_nicdb.c.bak
@@ -0,0 +1,434 @@
+/*
+https://dzone.com/articles/parallel-tcpip-socket-server-with-multi-threading
+
+gcc -o test_ipcbuf test_ipcbuf.c -I/usr/local/psrdada/src -I/usr/local/include -L/usr/local/lib -lpsrdada -lm -pthread -g -O2 -L/usr/lib/gcc/x86_64-linux-gnu/5 -lgfortran
+
+the plan is to have NCLIENTS threads listening on different ports (FIL_PORT0 + thread index).
+each time data comes over the first 8 bytes consist of the channel group and time sequence as two ints
+the rest is a NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW char array that needs to be arranged correctly
+The output must be [NBEAMS_PER_BLOCK, NSAMPS_PER_BLOCK, NCHAN_FIL]. 
+
+After a block is full, the data need to be written out (data rate 525 Mb/s)
+The number of receives before switching blocks is NCLIENTS*NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT. 
+switch block when one block is being written out
+
+*/
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+// global variables (plain ints shared by main() and every process() thread, with no locking)
+int DEBUG = 0; // set to 1 by -d; enables the extra syslog messages
+int blockct = 0; // messages stored since the last flush; main() flushes at NCLIENTS*NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT
+int block_switch = 0; // 0 means write to output1, write out output2.
+int cores[16] = {3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28}; // to bind threads to
+char iP[100]; // address string from -i; the listening sockets bind to it
+
+// structure to pass to threads (one instance per receiver thread, filled in by main)
+struct data
+{
+  char * output1; // buffer process() writes into while block_switch is 0
+  char * output2; // buffer process() writes into while block_switch is 1
+  uint16_t tport; // TCP port this thread listens on (FIL_PORT0 + thread index)
+  int thread_id; // index of the thread; also selects its entry in cores[]
+};
+
+// function prototypes
+void dsaX_dbgpu_cleanup (dada_hdu_t * out);
+int dada_bind_thread_to_core (int core);
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * out)
+{
+  // Drop the write lock first; a failed unlock is reported but the teardown continues.
+  const int rc = dada_hdu_unlock_write (out);
+  if (rc < 0)
+    syslog(LOG_ERR, "could not unlock write on hdu_out");
+
+  // The HDU handle is destroyed in every case.
+  dada_hdu_destroy (out);
+}
+
+
+// Receiver thread: accepts one TCP connection on iP:tport and copies each complete message into output1 or output2 until the connection ends.
+void * process(void * ptr)
+{
+
+  // arguments from structure
+  struct data *d = ptr;
+  int thread_id = d->thread_id;
+  char *output1 = (char *)d->output1;
+  char *output2 = (char *)d->output2;
+  uint16_t tport = d->tport;
+  
+  // set affinity: pin this thread to cores[thread_id]; failures are logged, not fatal
+  const pthread_t pid = pthread_self();
+  const int core_id = cores[thread_id];
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(core_id, &cpuset);
+  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (set_result != 0)
+    syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
+  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (get_affinity != 0) 
+    syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
+  if (CPU_ISSET(core_id, &cpuset))
+    if (DEBUG) syslog(LOG_INFO,"thread %d: successfully set thread",thread_id);
+
+  // set up socket
+  int sock = -1, conn = -1;
+  struct sockaddr_in address, cli;
+  /* create socket */
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  if (sock < 0) { syslog(LOG_ERR,"thread %d: cannot create socket",thread_id); exit(1); }
+  if (DEBUG) syslog(LOG_INFO,"thread %d: opened socket",thread_id);
+  memset(&address, 0, sizeof(struct sockaddr_in));
+  address.sin_family = AF_INET;
+  inet_pton(AF_INET, iP, &(address.sin_addr));
+  // NOTE(review): inet_pton's result is not checked — presumably iP always holds a valid IPv4 string; confirm
+  address.sin_port = htons(tport);
+  if (DEBUG) syslog(LOG_INFO,"thread %d: socket ready",thread_id);
+  if (bind(sock, (struct sockaddr *)&address, sizeof(struct sockaddr_in)) < 0) {
+    syslog(LOG_ERR,"thread %d: cannot bind to port",thread_id);
+    exit(1);
+  }
+  if (DEBUG) syslog(LOG_INFO,"thread %d: socket bound",thread_id);
+  listen(sock, 5);
+  if (DEBUG) syslog(LOG_INFO,"thread %d: socket listening on port %d",thread_id,tport);
+  
+  // accept connection (cli_len must describe the buffer actually passed to accept)
+  socklen_t cli_len=sizeof(cli);
+  conn = accept(sock, (struct sockaddr *) &cli, &cli_len);
+  if (conn<0) {
+    syslog(LOG_ERR,"thread %d: error accepting connection",thread_id);
+    exit(1);
+  }
+  syslog(LOG_INFO,"thread %d: accepted connection",thread_id);
+
+  // data buffer and other variables: one message is an 8-byte header plus the payload
+  char * buffer = (char *)malloc((8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW)*sizeof(char));
+  char * dblock = (char *)malloc((8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW)*sizeof(char));
+  if (buffer == NULL || dblock == NULL) { syslog(LOG_ERR,"thread %d: cannot allocate receive buffers",thread_id); exit(1); }
+  int *ibuf, chgroup, tseq, oidx;
+  int remain_data, len, i0;
+  
+  // loop until the connection ends
+  while (1) {
+  
+    /* read message */
+    // read to buffer until all is read
+    remain_data =(int)(8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW);
+
+    // MSG_WAITALL asks recv to block until the whole message has arrived
+    ibuf = (int *)(buffer);
+    len = recv(conn, dblock, remain_data, MSG_WAITALL);
+
+    // recv returns -1 on error and 0 once the peer has closed the connection;
+    // either way no more data will arrive, and a negative len must never
+    // reach memcpy (it would be converted to a huge size_t)
+    if (len <= 0) {
+      syslog(LOG_ERR,"thread %d: connection closed or recv failed",thread_id);
+      break;
+    }
+    memcpy(buffer, dblock, len);
+    remain_data -= len;
+    if (remain_data != 0)
+      syslog(LOG_ERR,"thread %d: only received %d of %d",thread_id,len,(int)(8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW));
+    
+    if (remain_data==0) {
+    
+      // get channel group and time sequence
+      chgroup = ibuf[0]; // from 0-15
+      tseq = ibuf[1]; // continuous iterate over transmits
+      if (DEBUG) syslog(LOG_INFO,"thread %d: read message with chgroup %d tseq %d blockct %d",thread_id,chgroup,tseq,blockct);
+      tseq = (tseq * 128) % 4096; // place within output (128/4096: presumably NSAMPS_PER_TRANSMIT/NSAMPS_PER_BLOCK — confirm)
+      
+      // output order is [beam, time, freq]. input order is [beam, time, freq], but only a subset of freqs
+      i0 = 8;
+      for (int i=0;i<NBEAMS_PER_BLOCK;i++) {
+	for (int j=0;j<NSAMPS_PER_TRANSMIT;j++) {	
+	  for (int k=0;k<NW;k++) {
+	    
+	    oidx = i*NSAMPS_PER_BLOCK*NCHAN_FIL + (tseq+j)*NCHAN_FIL + CHOFF/8 + chgroup*NW + k;
+	    // i0 walks the payload, starting just past the 8-byte header
+	    
+	    if (block_switch==0) output1[oidx] = buffer[i0];
+	    if (block_switch==1) output2[oidx] = buffer[i0];
+
+	    i0++;
+	    
+	  }
+	}
+      }
+      
+      // iterate blockct (NOTE(review): unsynchronised increment of a counter shared by all receiver threads)
+      blockct++;
+
+    }
+
+  }
+
+  /* close sockets and clean up (reached once the connection has ended) */
+  close(conn);
+  close(sock);
+  free(buffer);
+  free(dblock);
+  pthread_exit(0);
+}
+
+void usage()
+{
+  printf ("%s", // the help text goes to stdout as one pre-assembled string
+	  "dsaX_nicdb [options]\n"
+	  " -c core   bind process to CPU core [no default]\n"
+	  " -f header file [no default]\n"
+	  " -d send debug messages to syslog\n"
+	  " -o out_key [default BEAMCAPTURE_BLOCK_KEY]\n"
+	  " -i IP address\n"
+	  " -h print usage\n");
+}
+
+
+// Entry point: parse options, attach to the output DADA buffer, start NCLIENTS receiver threads, then write out whichever buffer they have filled.
+int main(int argc, char ** argv)
+{
+    
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_nicdb", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  // threads (arrays are sized 16 — assumes NCLIENTS <= 16, TODO confirm)
+  struct data args[16];
+  pthread_t threads[16];
+  pthread_attr_t attr;
+  pthread_attr_init(&attr);
+  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+  void* result=0;
+
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_out = 0;
+
+  // data block HDU keys
+  key_t out_key = BEAMCAPTURE_BLOCK_KEY;
+  
+  // command line arguments
+  int core = -1;
+  int arg = 0;
+  char fnam[200] = ""; // header file name; left empty (so fopen fails cleanly) when -f is absent
+  
+  while ((arg=getopt(argc,argv,"c:f:o:i:dh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'f':
+	  if (optarg)
+	    {
+	      snprintf(fnam,sizeof(fnam),"%s",optarg); // bounded copy: optarg may be longer than fnam
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-f flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_INFO, "Will excrete all debug messages");
+	  break;
+	case 'i':
+	  snprintf(iP,sizeof(iP),"%s",optarg); // bounded copy into the fixed-size global iP
+	  break;
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      else syslog(LOG_NOTICE,"bound to core %d", core); // only report success when the bind worked
+    }
+
+  // DADA stuff (NOTE(review): dsaX_nicdb.c calls dada_hdu_create (0) — confirm which signature the installed psrdada has)
+
+  hdu_out  = dada_hdu_create ();
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+
+  // deal with headers: the header block must be valid before the file is read into it
+  uint64_t header_size = 4096;
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_out);
+      return EXIT_FAILURE;
+    }
+  FILE *fin;
+  if (!(fin=fopen(fnam,"rb"))) {
+    syslog(LOG_ERR,"cannot open dada header file %s",fnam);
+    return EXIT_FAILURE;
+  }
+  fread(header_out, 4096, 1, fin); // result not checked: a header file shorter than 4096 bytes gives a short read
+  fclose(fin);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_out);
+      return EXIT_FAILURE;
+    }
+    
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+  
+  // get block sizes and allocate memory
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have output block sizes %llu\n",(unsigned long long)block_out); // cast so the argument matches %llu
+  char *output1 = (char *)malloc(sizeof(char)*block_out); // filled while block_switch is 0
+  char *output2 = (char *)malloc(sizeof(char)*block_out); // filled while block_switch is 1
+  if (output1 == NULL || output2 == NULL) {
+    syslog(LOG_ERR, "main: could not allocate output buffers");
+    dsaX_dbgpu_cleanup (hdu_out); return EXIT_FAILURE;
+  }
+  memset(output1,0,block_out); memset(output2,0,block_out);
+  uint64_t written;
+  // set up threads
+  
+  // set up data structure
+  for (int i=0; i<NCLIENTS; i++) {
+    args[i].output1 = output1;
+    args[i].output2 = output2;
+    args[i].thread_id = i;
+    args[i].tport = FIL_PORT0 + (uint16_t)(i);
+  }
+
+  if (DEBUG) syslog(LOG_INFO,"creating %d threads (one per client)",NCLIENTS);
+    
+  for(int i=0; i<NCLIENTS; i++){
+    if (pthread_create(&threads[i], &attr, &process, (void *)(&args[i]))) {
+      syslog(LOG_ERR,"Failed to create thread %d\n", i);
+    }
+  }
+  pthread_attr_destroy(&attr);
+  if (DEBUG) syslog(LOG_INFO,"threads kinda running");
+  
+  int observation_complete=0;
+  int blocks = 0;
+  // observation_complete is never set below: the loop only ends through the error return
+  int bswitch;
+  
+  syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+
+    // look for complete block
+
+    // poll blockct every 10 microseconds
+    usleep(10);
+
+    if (blockct>=NCLIENTS*NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT) {      
+      
+      // change output: receivers switch to the other buffer while this one is written out
+      bswitch= block_switch;
+      blockct=0;
+      if (bswitch==0) block_switch=1;
+      if (bswitch==1) block_switch=0;
+
+      // write to output
+      if (bswitch==0) written = ipcio_write (hdu_out->data_block, output1, block_out);
+      if (bswitch==1) written = ipcio_write (hdu_out->data_block, output2, block_out);
+      if (written < block_out)
+	{
+	  syslog(LOG_ERR, "main: failed to write all data to datablock [output]");	
+	  dsaX_dbgpu_cleanup (hdu_out);
+	  return EXIT_FAILURE;
+	}
+
+      if (DEBUG) syslog(LOG_INFO, "written block %d",blocks);      
+      blocks++;
+
+    }
+      
+  }
+  
+  // free stuff
+  for(int i=0; i<NCLIENTS; i++){
+    pthread_join(threads[i], &result);
+    if (DEBUG) syslog(LOG_INFO,"joined thread %d",i);
+  }
+  free(output1);
+  free(output2);
+  dsaX_dbgpu_cleanup(hdu_out);
+  return EXIT_SUCCESS;
+}
diff --git a/legacy/dsaX_reorder.c b/legacy/dsaX_reorder.c
new file mode 100644
index 0000000..04955da
--- /dev/null
+++ b/legacy/dsaX_reorder.c
@@ -0,0 +1,515 @@
+/* will reorder raw data for input to xgpu */
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+#include <x86intrin.h>
+#include <smmintrin.h>
+#include <immintrin.h>
+
+// Per-thread work descriptor handed to massage() through pthread_create().
+struct data {
+  char * in;       // start of the input block; massage() copies its own slice out of it
+  char * out;      // start of the output buffer; massage() writes its fluffed slice into it
+  int n_threads;   // number of worker threads sharing one block
+  int thread_id;   // index of this worker; selects its slice and its entry in cores[]
+  int debug;       // non-zero enables per-step syslog output inside massage()
+};
+
+/* global variables */
+int DEBUG = 0; // set to 1 by the -d option; enables extra syslog output
+int cores[16] = {4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}; // CPU core for each massage() thread, indexed by thread_id
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); // unlock (read if write==0, write if write==1) and destroy an HDU
+int dada_bind_thread_to_core (int core); // NOTE(review): presumably provided by PSRDADA (dada_affinity.h is included) - confirm
+
+/* Release the lock held on a DADA HDU, then destroy the HDU.
+ * write==0: unlock read; write==1: unlock write; any other value is a no-op.
+ * A failed unlock is logged to syslog and the HDU is destroyed anyway. */
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write)
+{
+  int unlock_status;
+
+  switch (write) {
+  case 0:   // reader side
+    unlock_status = dada_hdu_unlock_read (in);
+    if (unlock_status < 0)
+      syslog(LOG_ERR, "could not unlock read on hdu_in");
+    break;
+  case 1:   // writer side
+    unlock_status = dada_hdu_unlock_write (in);
+    if (unlock_status < 0)
+      syslog(LOG_ERR, "could not unlock write on hdu_in");
+    break;
+  default:  // unknown mode: nothing to release
+    return;
+  }
+
+  dada_hdu_destroy (in);
+}
+
+/* Print the command-line help to stdout. main() starts with 1 thread and sizes its thread tables for 16. */
+void usage()
+{
+  fprintf (stdout, "dsaX_reorder [options]\n"
+	   " -c core   bind process to CPU core [no default]\n"
+	   " -d send debug messages to syslog\n"
+	   " -t number of threads [default 1, max 16]\n"
+	   " -i input key [default CAPTURED_BLOCK_KEY]\n"
+	   " -o output key [default REORDER_BLOCK_KEY]\n"
+	   " -q quitting after testing\n"
+	   " -h print usage\n");
+}
+
+/* Worker thread: unpack ("fluff") one slice of a 4-bit input block to 8 bits.
+ * args points to a struct data. The slice (thread_id of n_threads along the NPACKETS
+ * axis) is copied from d->in, every nybble is sign-extended to a byte, and the result
+ * (twice the size) is written to the matching slice of d->out. Always returns NULL;
+ * if the working buffers cannot be allocated the output slice is left untouched. */
+void * massage(void *args) {
+  // basic stuff
+  struct data *d = args;
+  int thread_id = d->thread_id;
+  int dbg = d->debug;
+
+  // masks for fluffing: masks[j] keeps nybble j (counting from the low end) of every 16-bit lane
+  __m512i masks[4];
+  masks[0] = _mm512_set_epi64(0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL);
+  masks[1] = _mm512_set_epi64(0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL);
+  masks[2] = _mm512_set_epi64(0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL);
+  masks[3] = _mm512_set_epi64(0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL);
+
+  // set affinity
+  const pthread_t pid = pthread_self();
+  const int core_id = cores[thread_id];
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(core_id, &cpuset);
+  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (set_result != 0)
+    syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
+  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (get_affinity != 0)
+    syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
+  if (CPU_ISSET(core_id, &cpuset))
+    if (DEBUG || dbg) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id);
+
+  // extract from input data structure
+  char *in = (char *)d->in;
+  char *out = (char *)d->out;
+  int nthreads = d->n_threads;
+
+  /* DO ALL PROCESSING
+
+     "in" is input block: NPACKETS * NANTS * (384*2) * 2 pol * r/i. (384*2 is for the two times)
+     "out" needs to be in order NPACKETS * (384*2) * 64 * 2 pol * r/i
+     parallelize by splitting on NPACKETS axis.
+
+   */
+
+  // input and output index and extracted data
+  int idx = thread_id; // PACKET idx for input and output
+  const size_t slice_bytes = (size_t)(NPACKETS/nthreads)*NANTS*(384*2)*2; // packed 4-bit bytes handled by one thread
+  char * proc_data = (char *)malloc(slice_bytes); // for 4-bit data
+  char * fluffed_data = (char *)malloc(slice_bytes*2); // for 8-bit data
+  if (proc_data == NULL || fluffed_data == NULL) {
+    syslog(LOG_ERR,"thread %d: could not allocate working buffers",thread_id);
+    free(proc_data);
+    free(fluffed_data);
+    return NULL;
+  }
+
+  // extract data
+  memcpy(proc_data,in+(size_t)idx*slice_bytes,slice_bytes);
+  if (DEBUG || dbg) syslog(LOG_DEBUG,"thread %d: extracted data",thread_id);
+
+  // do fluffing
+  /*
+     technique is to use nybble masks to
+     (a) unmask every fourth nybble
+     (b) bit shift to left using mm512_slli_epi16
+     (c) sign extend by 4 bits using mm512_srai_epi16
+     (d) bit shift to right
+
+     Will produce m512 for lower and upper bytes. Then just need to copy into fluffed_data
+
+   */
+
+  // variables
+  unsigned short low_u[32]; // 16-bit lanes of low_m (nybbles 0 and 1 of each input lane)
+  unsigned short hi_u[32];  // 16-bit lanes of hi_m (nybbles 2 and 3 of each input lane)
+  __m512i low_m, hi_m;
+  __m512i v[4]; // for 4 packed 4-bit numbers
+
+  // input and output
+  __m512i proc_m;
+  unsigned short * fluffed_u = (unsigned short *)(fluffed_data);
+
+  // numbers to iterate over
+  int n_512 = (NPACKETS/nthreads)*NANTS*(384*2)*2/64;
+
+  if (dbg || DEBUG) syslog(LOG_DEBUG,"thread %d: ready to fluff",thread_id);
+
+  // let's do it!
+  for (int i=0;i<n_512;i++) { // loop over lots of 512 bits
+
+    if (dbg) syslog(LOG_DEBUG,"thread %d: beginning fluff %d",thread_id,i);
+
+    // get input data
+    proc_m = _mm512_loadu_si512((proc_data+i*64));
+    if (dbg) syslog(LOG_DEBUG,"thread %d: copied data %d",thread_id,i);
+
+    // retrieve masks
+    for (int j=0;j<4;j++) {
+      v[j] = _mm512_and_si512(proc_m, masks[j]);
+    }
+
+    if (dbg) syslog(LOG_DEBUG,"thread %d: masked %d",thread_id,i);
+
+    // do in place fluffing
+    v[0] = _mm512_slli_epi16(v[0], 12);
+    v[0] = _mm512_srai_epi16(v[0], 4);
+    v[0] = _mm512_srli_epi16(v[0], 8);
+
+    v[1] = _mm512_slli_epi16(v[1], 8);
+    v[1] = _mm512_srai_epi16(v[1], 4);
+
+    v[2] = _mm512_slli_epi16(v[2], 4);
+    v[2] = _mm512_srai_epi16(v[2], 4);
+    v[2] = _mm512_srli_epi16(v[2], 8);
+
+    v[3] = _mm512_srai_epi16(v[3], 4);
+
+    if (dbg) syslog(LOG_DEBUG,"thread %d: in place %d",thread_id,i);
+
+    // make lower and upper
+    low_m = _mm512_or_si512(v[0], v[1]);
+    hi_m = _mm512_or_si512(v[2], v[3]);
+
+    if (dbg) syslog(LOG_DEBUG,"thread %d: lower and upper %d",thread_id,i);
+
+    // copy back to bytes
+    _mm512_storeu_si512((__m512i *) low_u, low_m);
+    _mm512_storeu_si512((__m512i *) hi_u, hi_m);
+
+    if (dbg) syslog(LOG_DEBUG,"thread %d: copied lower and upper %d",thread_id,i);
+
+    // extract from lower and upper into fluffed
+    // there are 32 2-byte unsigned shorts in each of low and hi
+    for (int j=0;j<32;j++) {
+      fluffed_u[i*64+j*2] = low_u[j];
+      fluffed_u[i*64+j*2+1] = hi_u[j];
+    }
+
+    if (dbg) syslog(LOG_DEBUG,"thread %d: extracted %d",thread_id,i);
+
+  }
+
+  if (dbg || DEBUG) syslog(LOG_DEBUG,"thread %d: fluffed",thread_id);
+
+  memcpy(out + (size_t)idx*slice_bytes*2,fluffed_data,slice_bytes*2);
+
+  if (dbg || DEBUG) syslog(LOG_DEBUG,"thread %d: done - freeing",thread_id);
+  // free stuff
+  free(proc_data);
+  free(fluffed_data);
+  // main() ignores the exit value; a pointer to a local must not be returned (it would dangle)
+  return NULL;
+}
+
+
+// MAIN: copy the DADA header from the input HDU to the output HDU, then unpack each
+// input block to 8 bits with nthreads massage() workers and write it to the output HDU.
+int main (int argc, char *argv[]) {
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_reorder_raw", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  // TESTING and initialization
+  // threads
+  struct data args[16];
+  pthread_t threads[16];
+  pthread_attr_t attr;
+  pthread_attr_init(&attr);
+  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+  void* result=0;
+
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+
+  // data block HDU keys
+  key_t in_key = CAPTURED_BLOCK_KEY;
+  key_t out_key = REORDER_BLOCK_KEY;
+
+  // command line arguments
+  int core = -1;
+  int nthreads = 1;
+  int status = EXIT_SUCCESS; // exit code; becomes EXIT_FAILURE if a block cannot be processed or written
+  int arg = 0;
+
+  while ((arg=getopt(argc,argv,"c:t:i:o:dqh")) != -1)
+    {
+      switch (arg)
+        {
+        case 'i':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &in_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-i flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'o':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &out_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-o flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'c':
+          if (optarg)
+            {
+              core = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-c flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 't':
+          if (optarg)
+            {
+              nthreads = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-t flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'd':
+          DEBUG=1;
+          syslog (LOG_DEBUG, "Will excrete all debug messages");
+          break;
+        case 'q':
+          syslog (LOG_INFO, "Quit here");
+          return EXIT_SUCCESS;
+        case 'h':
+          usage();
+          return EXIT_SUCCESS;
+        }
+    }
+  // args[], threads[] and cores[] hold 16 entries, and massage() divides by nthreads
+  if (nthreads < 1 || nthreads > 16) {
+    syslog(LOG_ERR,"number of threads must be between 1 and 16 (got %d)", nthreads);
+    return EXIT_FAILURE;
+  }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+        syslog(LOG_ERR,"failed to bind to core %d", core);
+      else syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+  // DADA stuff
+  syslog (LOG_INFO, "creating in and out hdus");
+
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+  uint64_t header_size = 0;
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      return EXIT_FAILURE;
+    }
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      return EXIT_FAILURE;
+    }
+
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
+  uint64_t  bytes_read = 0;
+  char * block, * output_buffer;
+  output_buffer = (char *)malloc(sizeof(char)*block_out);
+  if (!output_buffer) {
+    syslog(LOG_ERR, "could not allocate output buffer");
+    dsaX_dbgpu_cleanup (hdu_in,0);
+    dsaX_dbgpu_cleanup (hdu_out,1);
+    return EXIT_FAILURE;
+  }
+  memset(output_buffer,0,block_out);
+  uint64_t written = 0, block_id;
+
+  // set up
+  int observation_complete=0;
+  int blocks = 0;
+  int started = 0;
+  int nlaunched = 0; // number of massage threads actually created for the current block
+
+  syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+    }
+
+    // set up data structure
+    for (int i=0; i<nthreads; i++) {
+      args[i].in = block;
+      args[i].out = output_buffer;
+      args[i].n_threads = nthreads;
+      args[i].thread_id = i;
+      args[i].debug = 0;
+    }
+
+    if (DEBUG) syslog(LOG_DEBUG,"creating %d threads",nthreads);
+
+    // attr is reused for every block, so it is destroyed only after the loop
+    nlaunched = 0;
+    for(int i=0; i<nthreads; i++){
+      if (pthread_create(&threads[i], &attr, &massage, (void *)(&args[i]))) {
+        syslog(LOG_ERR,"Failed to create massage thread %d\n", i);
+        break;
+      }
+      nlaunched++;
+    }
+    if (DEBUG) syslog(LOG_DEBUG,"threads kinda running");
+
+    // join only the threads that were actually started
+    for(int i=0; i<nlaunched; i++){
+      pthread_join(threads[i], &result);
+      if (DEBUG) syslog(LOG_DEBUG,"joined thread %d",i);
+    }
+
+    // write to output; a block that was only partly processed is not passed on
+    if (nlaunched == nthreads)
+      written = ipcio_write (hdu_out->data_block, output_buffer, block_out);
+    if (nlaunched < nthreads || written != block_out) {
+      syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+      status = EXIT_FAILURE;
+      observation_complete = 1;
+    }
+    else if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks);
+    blocks++;
+
+    if (bytes_read < block_size)
+      observation_complete = 1;
+
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+  }
+
+  pthread_attr_destroy(&attr);
+  free(output_buffer);
+
+  dsaX_dbgpu_cleanup (hdu_in,0);
+  dsaX_dbgpu_cleanup (hdu_out,1);
+  return status;
+}
+
+
diff --git a/legacy/dsaX_reorder_raw.c b/legacy/dsaX_reorder_raw.c
new file mode 100644
index 0000000..c0f6b0c
--- /dev/null
+++ b/legacy/dsaX_reorder_raw.c
@@ -0,0 +1,613 @@
+/* will reorder raw data for input to xgpu */
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+// Forward declaration to keep compiler happy
+// Possible minor bug in PSRDada
+int ipcio_check_pending_sod (ipcio_t* );
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+#include <x86intrin.h>
+#include <smmintrin.h>
+#include <immintrin.h>
+
+// Per-thread work descriptor handed to massage() through pthread_create().
+struct data {
+  char * in;       // start of the input block; massage() reads its own slice from it
+  char * out;      // staging buffer set by main(); massage() does not write to it in this file
+  int n_threads;   // number of worker threads sharing one block
+  int thread_id;   // index of this worker; selects its slice and its entry in cores[]
+  int debug;       // non-zero enables extra syslog output inside massage()
+  int write;       // set to 1 by main(); only referenced from commented-out code in massage()
+  ipcio_t * ipc;   // output data block; massage() stores its result in ipc->curbuf
+};
+
+/* global variables */
+int DEBUG = 0; // set to 1 by the -d option; enables extra syslog output
+int cores[16] = {4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}; // CPU core for each massage() thread, indexed by thread_id
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); // unlock (read if write==0, write if write==1) and destroy an HDU
+int dada_bind_thread_to_core (int core); // NOTE(review): presumably provided by PSRDADA (dada_affinity.h is included) - confirm
+
+/* Release the lock held on a DADA HDU, then destroy the HDU.
+ * write==0: unlock read; write==1: unlock write; any other value is a no-op.
+ * A failed unlock is logged to syslog and the HDU is destroyed anyway. */
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write)
+{
+  int unlock_status;
+
+  switch (write) {
+  case 0:   // reader side
+    unlock_status = dada_hdu_unlock_read (in);
+    if (unlock_status < 0)
+      syslog(LOG_ERR, "could not unlock read on hdu_in");
+    break;
+  case 1:   // writer side
+    unlock_status = dada_hdu_unlock_write (in);
+    if (unlock_status < 0)
+      syslog(LOG_ERR, "could not unlock write on hdu_in");
+    break;
+  default:  // unknown mode: nothing to release
+    return;
+  }
+
+  dada_hdu_destroy (in);
+}
+
+/* Print the command-line help to stdout. main() starts with 1 thread and sizes its thread tables for 16. */
+void usage()
+{
+  fprintf (stdout, "dsaX_reorder_raw [options]\n"
+	   " -c core   bind process to CPU core [no default]\n"
+	   " -d send debug messages to syslog\n"
+	   " -t number of threads [default 1, max 16]\n"
+	   " -b connect to bf hdu\n"
+	   " -i input key [default CAPTURED_BLOCK_KEY]\n"
+	   " -o output key [default REORDER_BLOCK_KEY]\n"
+	   " -q quitting after testing\n"
+	   " -h print usage\n");
+}
+
+/* Worker thread: copies its slice of the input block (thread_id of n_threads along
+ * NPACKETS), with the antenna and k axes swapped, into d->ipc->curbuf. The bytes are
+ * copied unchanged; d->out and d->write are not read. Always returns NULL. */
+void * massage(void *args) {
+  struct data *d = args;
+  int thread_id = d->thread_id;
+  int na = 64; // output ants
+  int dbg = d->debug;
+
+  // set affinity
+  const pthread_t pid = pthread_self();
+  const int core_id = cores[thread_id];
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(core_id, &cpuset);
+  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (set_result != 0)
+    syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
+  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (get_affinity != 0)
+    syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
+  if (CPU_ISSET(core_id, &cpuset))
+    if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: successfully set thread",thread_id);
+
+  // extract from input data structure
+  char *in = (char *)d->in;
+  // d->out is not needed here: the result goes straight to d->ipc->curbuf
+  int nthreads = d->n_threads;
+
+  /* DO ALL PROCESSING
+
+     "in" is input block: NPACKETS * NANTS * (384*2) * 2 pol * r/i. (384*2 is for the two times)
+     "out" needs to be in order NPACKETS * (384*2) * 64 * 2 pol * r/i
+     parallelize by splitting on NPACKETS axis.
+
+   */
+
+  // input and output index and extracted data
+  int idx = thread_id; // PACKET idx for input and output
+  //char * proc_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2); // for 4-bit data
+  //char * fluffed_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2*2); // for 8-bit data
+  //char * out_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*(384*2)*na*2*2); // for output 8-bit data
+
+  // extract data
+  //memcpy(proc_data,in+idx*(NPACKETS/nthreads)*NANTS*(384*2)*2,(NPACKETS/nthreads)*NANTS*(384*2)*2);
+  if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: extracted data",thread_id);
+
+  // do fluffing in dumbest possible way
+
+  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to fluff",thread_id);
+
+  // let's do it!
+  int in_idx, out_idx, a1, a2;
+  int in_offset = idx*(NPACKETS/nthreads)*NANTS*(384*2)*2; // first input byte of this thread's slice
+  int out_offset = idx*(NPACKETS/nthreads)*(384*2)*na*2;   // first output byte of this thread's slice
+  for (int i=0;i<(NPACKETS/nthreads);i++) {
+    a1 = i*NANTS*1536; // start of packet i within the input slice
+    a2 = i*na*1536;    // start of packet i within the output slice
+    for (int j=0;j<NANTS;j++) {
+      for (int k=0;k<768;k++) {
+        for (int l=0;l<2;l++) {
+
+          in_idx = a1+j*1536+k*2+l;   // input order: antenna j, then k, then l
+          out_idx = a2+k*na*2+j*2+l;  // output order: k, then antenna j, then l
+
+          d->ipc->curbuf[out_offset+out_idx] = in[in_offset+in_idx];
+          //d->ipc->curbuf[out_offset+2*out_idx+1] = in[in_offset+in_idx] >> 4;
+
+        }
+      }
+    }
+  }
+
+  /*for (int i=0;i<(NPACKETS/nthreads)*NANTS*(384*2)*2;i++) { // loop over chars in proc_data
+
+    fluffed_data[2*i] = ((proc_data[i]<<4) & 240) >> 4;
+    fluffed_data[2*i+1] = proc_data[i] >> 4;
+
+    }*/
+
+  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: fluffed",thread_id);
+
+  // transpose antennas and frequencies by ints
+  // from fluffed_data to out_data
+  /* int * fluffed_int = (int *)(fluffed_data);
+  memset(out_data,0,(NPACKETS/nthreads)*(384*2)*na*2*2);
+  int * out_int = (int *)out_data;*/
+
+  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to transpose",thread_id);
+
+  // do block transpose - https://codereview.stackexchange.com/questions/229876/fast-matrix-transpose
+  /*  int tile_size = 3; // set by benchmarking
+  for (int i_packet=0;i_packet<NPACKETS/nthreads;i_packet++) {
+
+    for (int i=0;i<NANTS;i+=tile_size) {
+      for (int j=0;j<384*2;j++) {
+	for (int b=0;b<tile_size;b++) out_int[i_packet*na*768 + j*na+i+b] = fluffed_int[i_packet*NANTS*768 + (i+b)*384*2+j];
+      }
+    }
+
+    }*/
+
+  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: transposed",thread_id);
+
+  // place in out
+  /*  if (d->write)
+    memcpy (d->ipc->curbuf + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2);
+  else
+    memcpy(out + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2);
+  */
+  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: done - freeing",thread_id);
+
+  // free stuff
+  //free(proc_data);
+  //free(fluffed_data);
+  //free(out_data);
+
+  // Nothing is reported back through the exit value (main() ignores it). A pointer
+  // to a local variable must not be returned: it dangles once the thread has exited.
+  return NULL;
+
+}
+
+
+// MAIN: copy the DADA header to the output HDU(s), then for every input block let
+// nthreads massage() workers fill the next output block in place and mark it filled.
+int main (int argc, char *argv[]) {
+
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_reorder_raw", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  // TESTING and initialization
+  // threads
+  struct data args[16];
+  pthread_t threads[16];
+  pthread_attr_t attr;
+  pthread_attr_init(&attr);
+  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+  void* result=0;
+
+  // run test with single thread
+
+  /*syslog(LOG_INFO,"Running TEST...\n");
+  
+  // set up data structure
+  char * test_block = (char *)malloc(sizeof(char)*NPACKETS*NANTS*(384*2)*2);
+  char * test_output = (char *)malloc(sizeof(char)*NPACKETS*64*(384*2)*2*2);
+  memset(test_block,0,sizeof(test_block));
+  
+   TEST CODE 
+  FILE *fin;
+  fin=fopen("../utils/packet.out","rb");
+  fread(test_block, 96768, 1, fin);
+  fclose(fin);
+   END TEST CODE 
+  
+  args[0].in = test_block;
+  args[0].out = test_output;
+  args[0].n_threads = 1;
+  args[0].thread_id = 0;
+  args[0].debug = 0;
+  args[0].write = 0;
+
+  // run test thread
+  if (pthread_create(&threads[0], &attr, &massage, (void *)(&args[0]))) {
+    syslog(LOG_ERR,"Failed to create TEST massage thread 0\n");
+  }
+  else
+    syslog(LOG_INFO,"Created TEST thread\n");
+  pthread_attr_destroy(&attr);    
+  pthread_join(threads[0], &result);
+  syslog(LOG_INFO,"joined TEST thread");
+
+   TEST CODE 
+  fin=fopen("../utils/test.out","wb");
+  fwrite(test_output, 1, 196608, fin);
+  fclose(fin);
+  END TEST CODE 
+  
+  // clean up
+  free(test_block);
+  free(test_output);
+
+  syslog(LOG_INFO,"TEST COMPLETE");*/
+
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+  dada_hdu_t* hdu_out2 = 0;
+
+  // data block HDU keys
+  key_t in_key = CAPTURED_BLOCK_KEY;
+  key_t out_key = REORDER_BLOCK_KEY;
+  key_t out_key2 = REORDER_BLOCK_KEY2;
+
+  // command line arguments
+  int core = -1;
+  int nthreads = 1;
+  int bf = 0;
+  int arg = 0;
+
+  while ((arg=getopt(argc,argv,"c:t:i:o:dbqh")) != -1)
+    {
+      switch (arg)
+        {
+        case 'i':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &in_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-i flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'o':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &out_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-o flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'c':
+          if (optarg)
+            {
+              core = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-c flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 't':
+          if (optarg)
+            {
+              nthreads = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-t flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+
+        case 'd':
+          DEBUG=1;
+          syslog (LOG_INFO, "Will excrete all debug messages");
+          break;
+
+        case 'q':
+          syslog (LOG_INFO, "Quit here");
+          return EXIT_SUCCESS;
+
+        case 'b':
+          bf=1;
+          syslog (LOG_INFO, "Will write to bf dada hdu");
+          break;
+
+        case 'h':
+          usage();
+          return EXIT_SUCCESS;
+        }
+    }
+
+  // args[], threads[] and cores[] hold 16 entries, and massage() divides by nthreads
+  if (nthreads < 1 || nthreads > 16) {
+    syslog(LOG_ERR,"number of threads must be between 1 and 16 (got %d)", nthreads);
+    return EXIT_FAILURE;
+  }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+        syslog(LOG_ERR,"failed to bind to core %d", core);
+      else syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+  // DADA stuff
+
+  syslog (LOG_INFO, "creating in and out hdus");
+
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+
+  if (bf) {
+    hdu_out2  = dada_hdu_create (0);
+    dada_hdu_set_key (hdu_out2, out_key2);
+    if (dada_hdu_connect (hdu_out2) < 0) {
+      syslog (LOG_ERR,"could not connect to output  buffer2");
+      return EXIT_FAILURE;
+    }
+    if (dada_hdu_lock_write(hdu_out2) < 0) {
+      syslog (LOG_ERR, "could not lock to output buffer2");
+      return EXIT_FAILURE;
+    }
+  }
+
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+      return EXIT_FAILURE;
+    }
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+      return EXIT_FAILURE;
+    }
+
+  if (bf) {
+    header_out = ipcbuf_get_next_write (hdu_out2->header_block);
+    if (!header_out)
+      {
+        syslog(LOG_ERR, "could not get next header2 block [output]");
+        dsaX_dbgpu_cleanup (hdu_in,0);
+        dsaX_dbgpu_cleanup (hdu_out,1);
+        if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+        return EXIT_FAILURE;
+      }
+    memcpy (header_out, header_in, header_size);
+    if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0)
+      {
+        syslog (LOG_ERR, "could not mark header block2 filled [output]");
+        dsaX_dbgpu_cleanup (hdu_in,0);
+        dsaX_dbgpu_cleanup (hdu_out,1);
+        if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+        return EXIT_FAILURE;
+      }
+  }
+
+
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
+  uint64_t  bytes_read = 0;
+  char * block, * output_buffer;
+  output_buffer = (char *)malloc(sizeof(char)*block_out);
+  memset(output_buffer,0,block_out);
+  uint64_t written, block_id;
+
+  // set up
+
+  int observation_complete=0;
+  int blocks = 0;
+  int started = 0;
+  int nlaunched = 0; // number of massage threads actually created for the current block
+
+
+  syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+    }
+
+    // DO STUFF
+
+    // sort out write
+    hdu_out->data_block->curbuf = ipcbuf_get_next_write ((ipcbuf_t*)hdu_out->data_block);
+    hdu_out->data_block->marked_filled = 0;
+    // (hand-rolled alternative to) ipcio_open_block_write(hdu_out->data_block, &block_id);
+
+    // set up data structure
+    for (int i=0; i<nthreads; i++) {
+      args[i].in = block;
+      args[i].out = output_buffer;
+      args[i].n_threads = nthreads;
+      args[i].thread_id = i;
+      args[i].debug = 0;
+      args[i].ipc = hdu_out->data_block;
+      args[i].write = 1;
+    }
+
+    if (DEBUG) syslog(LOG_INFO,"creating %d threads",nthreads);
+    nlaunched = 0; // attr is reused for every block, so it is destroyed only after the loop
+    for(int i=0; i<nthreads; i++){
+      if (pthread_create(&threads[i], &attr, &massage, (void *)(&args[i]))) {
+        syslog(LOG_ERR,"Failed to create massage thread %d\n", i);
+        break;
+      }
+      nlaunched++;
+    }
+    if (DEBUG) syslog(LOG_INFO,"threads kinda running");
+
+    // join only the threads that were actually started
+    for(int i=0; i<nlaunched; i++){
+      pthread_join(threads[i], &result);
+      if (DEBUG) syslog(LOG_INFO,"joined thread %d",i);
+    }
+    if (nlaunched < nthreads) { // part of the output block was never filled: give up
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+      return EXIT_FAILURE;
+    }
+
+    // write to output (bf only). NOTE(review): output_buffer is zero-filled above and massage() never writes to it - confirm intent
+    if (bf) {
+
+      written = ipcio_write (hdu_out2->data_block, output_buffer, block_out);
+      if (written != block_out) // also catches a -1 error return, which wraps in the unsigned type
+        {
+          syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+          dsaX_dbgpu_cleanup (hdu_in,0);
+          dsaX_dbgpu_cleanup (hdu_out,1);
+          if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+          return EXIT_FAILURE;
+        }
+
+    }
+
+    // finish write
+    ipcbuf_mark_filled ((ipcbuf_t*)hdu_out->data_block, block_out);
+    ipcio_check_pending_sod (hdu_out->data_block);
+    hdu_out->data_block->marked_filled = 1;
+    // (hand-rolled alternative to) ipcio_close_block_write(hdu_out->data_block, block_out);
+
+    if (DEBUG) syslog(LOG_INFO, "written block %d",blocks);
+    blocks++;
+
+
+    if (bytes_read < block_size)
+      observation_complete = 1;
+
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+
+  }
+
+  pthread_attr_destroy(&attr);
+  free(output_buffer);
+
+  dsaX_dbgpu_cleanup (hdu_in,0);
+  dsaX_dbgpu_cleanup (hdu_out,1);
+  if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+  return EXIT_SUCCESS;
+}
+
+
diff --git a/legacy/dsaX_reorder_raw.c.bak b/legacy/dsaX_reorder_raw.c.bak
new file mode 100644
index 0000000..0914823
--- /dev/null
+++ b/legacy/dsaX_reorder_raw.c.bak
@@ -0,0 +1,672 @@
+/* will reorder raw data for input to xgpu */
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+#include <x86intrin.h>
+#include <smmintrin.h>
+#include <immintrin.h>
+
+// data to pass to threads
+struct data {
+  char * in; // input block; massage() copies this thread's slice out of it
+  char * out; // destination buffer, used only when write == 0
+  int n_threads; // number of threads sharing the block; NPACKETS is divided by this
+  int thread_id; // index of this thread: selects its slice and its entry in cores[]
+  int debug; // non-zero makes massage() log each step to syslog
+  int write; // non-zero: result is copied into ipc->curbuf; zero: into out
+  ipcio_t * ipc; // data block whose curbuf receives the result when write != 0
+};
+
+/* global variables */
+int DEBUG = 0; // set to 1 by the -d option; enables debug messages to syslog
+int cores[16] = {4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}; // CPU core that massage() pins each thread to, indexed by thread_id
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); // defined just below
+int dada_bind_thread_to_core (int core); // not defined in this file; main() calls it for -c
+
+/*
+ * Unlock and destroy one DADA HDU.
+ *   in     the HDU to release
+ *   write  0: call dada_hdu_unlock_read first; 1: call dada_hdu_unlock_write
+ *          first; any other value: do nothing at all
+ * A failed unlock is logged to syslog and the HDU is destroyed anyway.
+ */
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write)
+{
+  switch (write) {
+  case 0:
+    if (dada_hdu_unlock_read (in) < 0)
+      syslog(LOG_ERR, "could not unlock read on hdu_in");
+    break;
+  case 1:
+    if (dada_hdu_unlock_write (in) < 0)
+      syslog(LOG_ERR, "could not unlock write on hdu_in");
+    break;
+  default:
+    return;
+  }
+
+  dada_hdu_destroy (in);
+}
+
+/* Print the command-line help text to stdout. */
+void usage()
+{
+  fprintf (stdout, "dsaX_reorder_raw [options]\n"
+	   " -c core   bind process to CPU core [no default]\n"
+	   " -d send debug messages to syslog\n"
+	   " -t number of threads, 1 to 16 [default 1]\n"
+	   " -b connect to bf hdu\n"
+	   " -i input key [default CAPTURED_BLOCK_KEY]\n"
+	   " -o output key [default REORDER_BLOCK_KEY]\n"
+	   " -q quitting after testing\n"
+	   " -h print usage\n");
+}
+
+/* Thread body: expand ("fluff") this thread's slice of 4-bit input to 8 bits,
+ * swap the antenna and (384*2) axes, and copy the result to the output.
+ * args points to a struct data. Always returns NULL. */
+void * massage(void *args) {
+  // basic stuff
+  struct data *d = args;
+  int thread_id = d->thread_id;
+  int na = 64; // output ants
+  int dbg = d->debug;
+
+  // masks for fluffing: masks[j] keeps nybble j of every 16-bit word
+  __m512i masks[4];
+  masks[0] = _mm512_set_epi64(0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL);
+  masks[1] = _mm512_set_epi64(0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL);
+  masks[2] = _mm512_set_epi64(0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL);
+  masks[3] = _mm512_set_epi64(0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL);
+
+  // set affinity
+  const pthread_t pid = pthread_self();
+  const int core_id = cores[thread_id];
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(core_id, &cpuset);
+  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (set_result != 0)
+    syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
+  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (get_affinity != 0)
+    syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
+  if (CPU_ISSET(core_id, &cpuset))
+    if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: successfully set thread",thread_id);
+
+  // extract from input data structure
+  char *in = (char *)d->in;
+  char *out = (char *)d->out;
+  int nthreads = d->n_threads;
+
+  /* DO ALL PROCESSING
+
+     "in" is input block: NPACKETS * NANTS * (384*2) * 2 pol * r/i. (384*2 is for the two times)
+     "out" needs to be in order NPACKETS * (384*2) * 64 * 2 pol * r/i
+     parallelize by splitting on NPACKETS axis.
+   */
+  // input and output index and extracted data
+  int idx = thread_id; // PACKET idx for input and output
+  char * proc_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2); // for 4-bit data
+  char * fluffed_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2*2); // for 8-bit data
+  char * out_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*(384*2)*na*2*2); // for output 8-bit data
+  if (!proc_data || !fluffed_data || !out_data) {
+    syslog(LOG_ERR,"thread %d: could not allocate work buffers",thread_id);
+    free(proc_data); free(fluffed_data); free(out_data);
+    return NULL;
+  }
+  memcpy(proc_data,in+idx*(NPACKETS/nthreads)*NANTS*(384*2)*2,(NPACKETS/nthreads)*NANTS*(384*2)*2); // extract data
+  if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: extracted data",thread_id);
+  // do fluffing
+
+  /*
+     technique is to use nybble masks to
+     (a) unmask every fourth nybble
+     (b) bit shift to left using mm512_slli_epi16
+     (c) sign extend by 4 bits using mm512_srai_epi16
+     (d) bit shift to right
+
+     Will produce m512 for lower and upper bytes. Then just need to copy into fluffed_data
+
+   */
+
+  // variables
+  // 512-bit scratch, viewed as 32 unsigned shorts each; kept on the stack so
+  // there is nothing to allocate, check or free
+  unsigned short low_u[32];
+  unsigned short hi_u[32];
+  __m512i low_m, hi_m;
+  __m512i v[4]; // for 4 packed 4-bit numbers
+
+  // input and output
+  __m512i proc_m;
+  unsigned short * fluffed_u = (unsigned short *)(fluffed_data);
+
+  // numbers to iterate over
+  int n_512 = (NPACKETS/nthreads)*NANTS*(384*2)*2/64;
+
+  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to fluff",thread_id);
+
+  // let's do it!
+  for (int i=0;i<n_512;i++) { // loop over lots of 512 bits
+
+    if (dbg) syslog(LOG_INFO,"thread %d: beginning fluff %d",thread_id,i);
+
+    // get input data
+    proc_m = _mm512_loadu_si512((proc_data+i*64));
+    if (dbg) syslog(LOG_INFO,"thread %d: copied data %d",thread_id,i);
+
+    // retrieve masks
+    for (int j=0;j<4;j++) {
+      v[j] = _mm512_and_si512(proc_m, masks[j]);
+    }
+
+    if (dbg) syslog(LOG_INFO,"thread %d: masked %d",thread_id,i);
+
+    // do in place fluffing
+    v[0] = _mm512_slli_epi16(v[0], 12);
+    v[0] = _mm512_srai_epi16(v[0], 4);
+    v[0] = _mm512_srli_epi16(v[0], 8);
+
+    v[1] = _mm512_slli_epi16(v[1], 8);
+    v[1] = _mm512_srai_epi16(v[1], 4);
+
+    v[2] = _mm512_slli_epi16(v[2], 4);
+    v[2] = _mm512_srai_epi16(v[2], 4);
+    v[2] = _mm512_srli_epi16(v[2], 8);
+
+    v[3] = _mm512_srai_epi16(v[3], 4);
+
+    if (dbg) syslog(LOG_INFO,"thread %d: in place %d",thread_id,i);
+
+    // make lower and upper
+    low_m = _mm512_or_si512(v[0], v[1]);
+    hi_m = _mm512_or_si512(v[2], v[3]);
+
+    if (dbg) syslog(LOG_INFO,"thread %d: lower and upper %d",thread_id,i);
+
+    // copy back to bytes
+    _mm512_storeu_si512((void *) low_u, low_m);
+    _mm512_storeu_si512((void *) hi_u, hi_m);
+
+    if (dbg) syslog(LOG_INFO,"thread %d: copied lower and upper %d",thread_id,i);
+
+    // extract from lower and upper into fluffed
+    // there are 32 2-byte unsigned shorts in each of low and hi
+    for (int j=0;j<32;j++) {
+      fluffed_u[i*64+j*2] = low_u[j];
+      fluffed_u[i*64+j*2+1] = hi_u[j];
+    }
+
+    if (dbg) syslog(LOG_INFO,"thread %d: extracted %d",thread_id,i);
+
+  }
+
+  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: fluffed",thread_id);
+
+  // transpose antennas and frequencies by ints
+  // from fluffed_data to out_data
+  int * fluffed_int = (int *)(fluffed_data);
+  memset(out_data,0,(NPACKETS/nthreads)*(384*2)*na*2*2);
+  int * out_int = (int *)out_data;
+
+  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to transpose",thread_id);
+
+  // do block transpose - https://codereview.stackexchange.com/questions/229876/fast-matrix-transpose
+  int tile_size = 7; // set by benchmarking
+  for (int i_packet=0;i_packet<NPACKETS/nthreads;i_packet++) {
+
+    for (int i=0;i<NANTS;i+=tile_size) {
+      for (int j=0;j<384*2;j++) {
+	for (int b=0;b<tile_size && i+b<NANTS;b++) out_int[i_packet*na*768 + j*na+i+b] = fluffed_int[i_packet*NANTS*768 + (i+b)*384*2+j]; // i+b<NANTS guards the last tile when NANTS is not a multiple of tile_size
+      }
+    }
+
+  }
+
+  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: transposed",thread_id);
+
+  // place in out
+  if (d->write)
+    memcpy (d->ipc->curbuf + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2);
+  else
+    memcpy(out + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2);
+
+  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: done - freeing",thread_id);
+
+  // free stuff
+  free(proc_data);
+  free(fluffed_data);
+  free(out_data);
+
+  /* Finish with a NULL exit value (returning from the start routine is
+     equivalent to pthread_exit). The value must never be the address of a
+     local variable: this thread's stack no longer exists by the time
+     pthread_join() hands the value to the caller. */
+  return NULL;
+
+}
+
+
+// MAIN: run a single-thread self-test of massage(), parse the options, then
+// reorder every block read from the input HDU into the output HDU(s).
+int main (int argc, char *argv[]) {
+
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_reorder_raw", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  // TESTING and initialization
+  // threads
+  struct data args[16];
+  pthread_t threads[16];
+  pthread_attr_t attr; // reused by every pthread_create below; destroyed after the processing loop
+  pthread_attr_init(&attr);
+  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+  void* result=0;
+
+  // run test with single thread
+
+  syslog(LOG_INFO,"Running TEST...\n");
+
+  // set up data structure
+  char * test_block = (char *)malloc(sizeof(char)*NPACKETS*NANTS*(384*2)*2);
+  char * test_output = (char *)malloc(sizeof(char)*NPACKETS*64*(384*2)*2*2);
+  memset(test_block,0,sizeof(char)*NPACKETS*NANTS*(384*2)*2); // clear the whole buffer; sizeof(test_block) is only the size of the pointer
+
+  /* TEST CODE
+  FILE *fin;
+  fin=fopen("../utils/packet.out","rb");
+  fread(test_block, 96768, 1, fin);
+  fclose(fin);
+   END TEST CODE */
+
+  args[0].in = test_block;
+  args[0].out = test_output;
+  args[0].n_threads = 1;
+  args[0].thread_id = 0;
+  args[0].debug = 0;
+  args[0].write = 0;
+
+  // run test thread
+  if (pthread_create(&threads[0], &attr, &massage, (void *)(&args[0]))) {
+    syslog(LOG_ERR,"Failed to create TEST massage thread 0\n");
+  }
+  else {
+    syslog(LOG_INFO,"Created TEST thread\n");
+    pthread_join(threads[0], &result); // join only a thread that was actually created
+    syslog(LOG_INFO,"joined TEST thread");
+  }
+
+  /* TEST CODE
+  fin=fopen("../utils/test.out","wb");
+  fwrite(test_output, 1, 196608, fin);
+  fclose(fin);
+  END TEST CODE */
+
+  // clean up
+  free(test_block);
+  free(test_output);
+
+  syslog(LOG_INFO,"TEST COMPLETE");
+
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+  dada_hdu_t* hdu_out2 = 0;
+
+  // data block HDU keys
+  key_t in_key = CAPTURED_BLOCK_KEY;
+  key_t out_key = REORDER_BLOCK_KEY;
+  key_t out_key2 = REORDER_BLOCK_KEY2;
+
+  // command line arguments
+  int core = -1;
+  int nthreads = 1;
+  int bf = 0;
+  int arg = 0;
+
+  while ((arg=getopt(argc,argv,"c:t:i:o:dbqh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'i':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &in_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-i flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 't':
+	  if (optarg)
+	    {
+	      nthreads = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-t flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_INFO, "Will excrete all debug messages");
+	  break;
+
+	case 'q':
+	  syslog (LOG_INFO, "Quit here");
+	  return EXIT_SUCCESS;
+
+	case 'b':
+	  bf=1;
+	  syslog (LOG_INFO, "Will write to bf dada hdu");
+	  break;
+
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+  // threads[], args[] and cores[] have 16 entries, and massage() divides by the thread count
+  if (nthreads < 1 || nthreads > 16) {
+    syslog(LOG_ERR,"-t must be between 1 and 16, got %d", nthreads);
+    return EXIT_FAILURE;
+  }
+  // Bind to cpu core
+  if (core >= 0) {
+    if (dada_bind_thread_to_core(core) < 0) syslog(LOG_ERR,"failed to bind to core %d", core);
+    else syslog(LOG_NOTICE,"bound to core %d", core);
+  }
+
+  // DADA stuff
+
+  syslog (LOG_INFO, "creating in and out hdus");
+
+  hdu_in  = dada_hdu_create ();
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create ();
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+
+  if (bf) {
+    hdu_out2  = dada_hdu_create ();
+    dada_hdu_set_key (hdu_out2, out_key2);
+    if (dada_hdu_connect (hdu_out2) < 0) {
+      syslog (LOG_ERR,"could not connect to output  buffer2");
+      return EXIT_FAILURE;
+    }
+    if (dada_hdu_lock_write(hdu_out2) < 0) {
+      syslog (LOG_ERR, "could not lock to output buffer2");
+      return EXIT_FAILURE;
+    }
+  }
+
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+
+
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+      //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+      return EXIT_FAILURE;
+    }
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+      //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+      //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+      return EXIT_FAILURE;
+    }
+
+  if (bf) {
+    header_out = ipcbuf_get_next_write (hdu_out2->header_block);
+    if (!header_out)
+      {
+	syslog(LOG_ERR, "could not get next header2 block [output]");
+	dsaX_dbgpu_cleanup (hdu_in,0);
+	dsaX_dbgpu_cleanup (hdu_out,1);
+	if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+	//dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+	return EXIT_FAILURE;
+      }
+    memcpy (header_out, header_in, header_size);
+    if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0)
+      {
+	syslog (LOG_ERR, "could not mark header block2 filled [output]");
+	dsaX_dbgpu_cleanup (hdu_in,0);
+	dsaX_dbgpu_cleanup (hdu_out,1);
+	if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+	//dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+	return EXIT_FAILURE;
+      }
+  }
+
+
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",(unsigned long long)block_size,(unsigned long long)block_out);
+  uint64_t  bytes_read = 0;
+  char * block, * output_buffer, * blockie;
+  output_buffer = (char *)calloc(block_out, sizeof(char)); // zero-filled
+  if (!output_buffer) {
+    syslog(LOG_ERR, "main: could not allocate output buffer");
+    dsaX_dbgpu_cleanup (hdu_in,0);
+    dsaX_dbgpu_cleanup (hdu_out,1);
+    if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+    return EXIT_FAILURE;
+  }
+  uint64_t written, block_id;
+
+  int observation_complete=0, blocks = 0, started = 0, status = EXIT_SUCCESS; // set up
+
+  syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    if (!block) {
+      syslog(LOG_ERR, "main: could not open input block");
+      status = EXIT_FAILURE;
+      break;
+    }
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+    }
+    // sort out write
+    hdu_out->data_block->curbuf = ipcbuf_get_next_write ((ipcbuf_t*)hdu_out->data_block);
+    hdu_out->data_block->marked_filled = 0;
+    //blockie = ipcio_open_block_write(hdu_out->data_block, &block_id);
+
+    // set up data structure
+    for (int i=0; i<nthreads; i++) {
+      args[i].in = block;
+      args[i].out = output_buffer;
+      args[i].n_threads = nthreads;
+      args[i].thread_id = i;
+      args[i].debug = 0;
+      args[i].ipc = hdu_out->data_block;
+      args[i].write = 1;
+    }
+
+    if (DEBUG) syslog(LOG_INFO,"creating %d threads",nthreads);
+    int created[16] = {0}; // which threads[] entries hold a joinable thread this round
+    for(int i=0; i<nthreads; i++){
+      if (pthread_create(&threads[i], &attr, &massage, (void *)(&args[i]))) {
+	syslog(LOG_ERR,"Failed to create massage thread %d\n", i);
+      } else created[i] = 1;
+    }
+
+    // attr is deliberately left initialised here: the next block needs it again
+    if (DEBUG) syslog(LOG_INFO,"threads kinda running");
+
+    for(int i=0; i<nthreads; i++){
+      if (!created[i]) continue; // joining a thread that was never created is undefined
+      pthread_join(threads[i], &result);
+      if (DEBUG) syslog(LOG_INFO,"joined thread %d",i);
+    }
+    // write to output
+    // the threads wrote into hdu_out's curbuf (write = 1), so that buffer, not the
+    // still-zeroed output_buffer, holds the data to copy to the bf HDU
+
+    if (bf) {
+
+      written = ipcio_write (hdu_out2->data_block, hdu_out->data_block->curbuf, block_out);
+      if (written < block_out)
+	{
+	  syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+	  dsaX_dbgpu_cleanup (hdu_in,0);
+	  dsaX_dbgpu_cleanup (hdu_out,1);
+	  if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+	  //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+	  return EXIT_FAILURE;
+	}
+
+    }
+
+    // finish write
+    ipcbuf_mark_filled ((ipcbuf_t*)hdu_out->data_block, block_out);
+    ipcio_check_pending_sod (hdu_out->data_block);
+    hdu_out->data_block->marked_filled = 1;
+    //ipcio_close_block_write(hdu_out->data_block, block_out);
+
+    if (DEBUG) syslog(LOG_INFO, "written block %d",blocks);
+    blocks++;
+
+
+    if (bytes_read < block_size)
+      observation_complete = 1;
+
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+
+  }
+  pthread_attr_destroy(&attr);
+  free(output_buffer);
+
+  dsaX_dbgpu_cleanup (hdu_in,0);
+  dsaX_dbgpu_cleanup (hdu_out,1);
+  if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+
+  return status;
+}
+
+
diff --git a/legacy/dsaX_reorder_raw.c.bak2 b/legacy/dsaX_reorder_raw.c.bak2
new file mode 100644
index 0000000..54ad886
--- /dev/null
+++ b/legacy/dsaX_reorder_raw.c.bak2
@@ -0,0 +1,608 @@
+/* will reorder raw data for input to xgpu */
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+#include <x86intrin.h>
+#include <smmintrin.h>
+#include <immintrin.h>
+
+// data to pass to threads
+struct data {
+  char * in; // input block; massage() copies this thread's slice out of it
+  char * out; // destination buffer, used only when write == 0
+  int n_threads; // number of threads sharing the block; NPACKETS is divided by this
+  int thread_id; // index of this thread: selects its slice and its entry in cores[]
+  int debug; // non-zero makes massage() log its progress to syslog
+  int write; // non-zero: result is copied into ipc->curbuf; zero: into out
+  ipcio_t * ipc; // data block whose curbuf receives the result when write != 0
+};
+
+/* global variables */
+int DEBUG = 0; // set to 1 by the -d option; enables debug messages to syslog
+int cores[16] = {4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}; // CPU core that massage() pins each thread to, indexed by thread_id
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write); // defined just below
+int dada_bind_thread_to_core (int core); // called from main() when -c is given
+
+/*
+ * Unlock and destroy one DADA HDU.
+ *   in     the HDU to release
+ *   write  0: call dada_hdu_unlock_read first; 1: call dada_hdu_unlock_write
+ *          first; any other value: do nothing at all
+ * A failed unlock is logged to syslog and the HDU is destroyed anyway.
+ */
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write)
+{
+  switch (write) {
+  case 0:
+    if (dada_hdu_unlock_read (in) < 0)
+      syslog(LOG_ERR, "could not unlock read on hdu_in");
+    break;
+  case 1:
+    if (dada_hdu_unlock_write (in) < 0)
+      syslog(LOG_ERR, "could not unlock write on hdu_in");
+    break;
+  default:
+    return;
+  }
+
+  dada_hdu_destroy (in);
+}
+
+/* Print the command-line help text to stdout. */
+void usage()
+{
+  fprintf (stdout, "dsaX_reorder_raw [options]\n"
+	   " -c core   bind process to CPU core [no default]\n"
+	   " -d send debug messages to syslog\n"
+	   " -t number of threads, 1 to 16 [default 1]\n"
+	   " -b connect to bf hdu\n"
+	   " -i input key [default CAPTURED_BLOCK_KEY]\n"
+	   " -o output key [default REORDER_BLOCK_KEY]\n"
+	   " -q quitting after testing\n"
+	   " -h print usage\n");
+}
+
+/* Thread body: unpack this thread's slice of 4-bit input into 8-bit values in the
+ * reordered output layout and copy it to the output. args is a struct data *. */
+void * massage(void *args) {
+  // basic stuff
+  struct data *d = args;
+  int thread_id = d->thread_id;
+  int na = 64; // output ants
+  int dbg = d->debug;
+
+  // set affinity
+  const pthread_t pid = pthread_self();
+  const int core_id = cores[thread_id];
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(core_id, &cpuset);
+  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (set_result != 0)
+    syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
+  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (get_affinity != 0)
+    syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
+  if (CPU_ISSET(core_id, &cpuset))
+    if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: successfully set thread",thread_id);
+
+  // extract from input data structure
+  char *in = (char *)d->in;
+  char *out = (char *)d->out;
+  int nthreads = d->n_threads;
+
+  /* DO ALL PROCESSING
+
+     "in" is input block: NPACKETS * NANTS * (384*2) * 2 pol * r/i. (384*2 is for the two times)
+     "out" needs to be in order NPACKETS * (384*2) * 64 * 2 pol * r/i
+     parallelize by splitting on NPACKETS axis.
+
+   */
+
+  // input and output index and extracted data
+  int idx = thread_id; // PACKET idx for input and output
+  char * proc_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2); // for 4-bit data
+  // zero-filled: the loop below writes only antennas 0..NANTS-1 of the na output slots
+  char * out_data = (char *)calloc((NPACKETS/nthreads)*(384*2)*na*2*2, sizeof(char)); // for output 8-bit data
+  if (!proc_data || !out_data) {
+    syslog(LOG_ERR,"thread %d: could not allocate work buffers",thread_id);
+    free(proc_data); free(out_data);
+    return NULL;
+  }
+  memcpy(proc_data,in+idx*(NPACKETS/nthreads)*NANTS*(384*2)*2,(NPACKETS/nthreads)*NANTS*(384*2)*2); // extract data
+  if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: extracted data",thread_id);
+
+  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to fluff",thread_id);
+  // unpack and reorder in a single pass, one input byte at a time
+  int in_idx, out_idx, a1, a2;
+  for (int i=0;i<(NPACKETS/nthreads);i++) {
+    a1 = i*NANTS*1536; // start of packet i in proc_data, in bytes
+    a2 = i*na*1536; // start of packet i in out_data, in 2-byte samples
+    for (int j=0;j<NANTS;j++) {
+      for (int k=0;k<768;k++) {
+	for (int l=0;l<2;l++) {
+
+	  in_idx = a1+j*1536+k*2+l;
+	  out_idx = a2+k*na*2+j*2+l;
+	  // low nybble, then high nybble, each sign-extended from 4 to 8 bits
+	  out_data[2*out_idx] = (signed char)((unsigned char)proc_data[in_idx] << 4) >> 4;
+	  out_data[2*out_idx+1] = (signed char)proc_data[in_idx] >> 4;
+
+	}
+      }
+    }
+  }
+
+  /*for (int i=0;i<(NPACKETS/nthreads)*NANTS*(384*2)*2;i++) { // loop over chars in proc_data
+
+    fluffed_data[2*i] = ((proc_data[i]<<4) & 240) >> 4;
+    fluffed_data[2*i+1] = proc_data[i] >> 4;
+
+    }*/
+
+  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: fluffed",thread_id);
+
+  // the reordering is already done by the index mapping in the loop above;
+  // the earlier two-pass version (fluff, then transpose) is kept below, disabled
+  /* int * fluffed_int = (int *)(fluffed_data);
+  memset(out_data,0,(NPACKETS/nthreads)*(384*2)*na*2*2);
+  int * out_int = (int *)out_data;*/
+
+  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to transpose",thread_id);
+
+  // do block transpose - https://codereview.stackexchange.com/questions/229876/fast-matrix-transpose
+  /*  int tile_size = 3; // set by benchmarking
+  for (int i_packet=0;i_packet<NPACKETS/nthreads;i_packet++) {
+
+    for (int i=0;i<NANTS;i+=tile_size) {
+      for (int j=0;j<384*2;j++) {
+	for (int b=0;b<tile_size;b++) out_int[i_packet*na*768 + j*na+i+b] = fluffed_int[i_packet*NANTS*768 + (i+b)*384*2+j];
+      }
+    }
+
+    }*/
+
+  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: transposed",thread_id);
+
+  // place in out
+  if (d->write)
+    memcpy (d->ipc->curbuf + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2);
+  else
+    memcpy(out + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2);
+
+  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: done - freeing",thread_id);
+
+  // free stuff
+  free(proc_data);
+  free(out_data);
+
+  /* Finish with a NULL exit value rather than the address of a local:
+     this thread's stack no longer exists by the time pthread_join()
+     hands the value to the caller. */
+  return NULL;
+
+}
+
+
+// MAIN
+
+int main (int argc, char *argv[]) {
+  
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_reorder_raw", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  // TESTING and initialization
+  // threads
+  struct data args[16];
+  pthread_t threads[16];
+  pthread_attr_t attr;
+  pthread_attr_init(&attr);
+  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+  void* result=0;
+
+  // run test with single thread
+
+  syslog(LOG_INFO,"Running TEST...\n");
+  
+  // set up data structure
+  char * test_block = (char *)malloc(sizeof(char)*NPACKETS*NANTS*(384*2)*2);
+  char * test_output = (char *)malloc(sizeof(char)*NPACKETS*64*(384*2)*2*2);
+  memset(test_block,0,sizeof(test_block));
+  
+  /* TEST CODE 
+  FILE *fin;
+  fin=fopen("../utils/packet.out","rb");
+  fread(test_block, 96768, 1, fin);
+  fclose(fin);
+   END TEST CODE */
+  
+  args[0].in = test_block;
+  args[0].out = test_output;
+  args[0].n_threads = 1;
+  args[0].thread_id = 0;
+  args[0].debug = 0;
+  args[0].write = 0;
+
+  // run test thread
+  if (pthread_create(&threads[0], &attr, &massage, (void *)(&args[0]))) {
+    syslog(LOG_ERR,"Failed to create TEST massage thread 0\n");
+  }
+  else
+    syslog(LOG_INFO,"Created TEST thread\n");
+  pthread_attr_destroy(&attr);    
+  pthread_join(threads[0], &result);
+  syslog(LOG_INFO,"joined TEST thread");
+
+  /* TEST CODE 
+  fin=fopen("../utils/test.out","wb");
+  fwrite(test_output, 1, 196608, fin);
+  fclose(fin);
+  END TEST CODE */
+  
+  // clean up
+  free(test_block);
+  free(test_output);
+
+  syslog(LOG_INFO,"TEST COMPLETE");
+  
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+  dada_hdu_t* hdu_out2 = 0;
+
+  // data block HDU keys
+  key_t in_key = CAPTURED_BLOCK_KEY;
+  key_t out_key = REORDER_BLOCK_KEY;
+  key_t out_key2 = REORDER_BLOCK_KEY2;
+  
+  // command line arguments
+  int core = -1;
+  int nthreads = 1;
+  int bf = 0;
+  int arg = 0;
+  
+  while ((arg=getopt(argc,argv,"c:t:i:o:dbqh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'i':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &in_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-i flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }	  
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 't':
+	  if (optarg)
+	    {
+	      nthreads = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-t flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_INFO, "Will excrete all debug messages");
+	  break;
+
+	case 'q':
+	  syslog (LOG_INFO, "Quit here");
+	  return EXIT_SUCCESS;
+	  
+	case 'b':
+	  bf=1;
+	  syslog (LOG_INFO, "Will write to bf dada hdu");
+	  break;
+
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+  
+  // DADA stuff
+  
+  syslog (LOG_INFO, "creating in and out hdus");
+  
+  hdu_in  = dada_hdu_create ();
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create ();
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+
+  if (bf) {
+    hdu_out2  = dada_hdu_create ();
+    dada_hdu_set_key (hdu_out2, out_key2);
+    if (dada_hdu_connect (hdu_out2) < 0) {
+      syslog (LOG_ERR,"could not connect to output  buffer2");
+      return EXIT_FAILURE;
+    }
+    if (dada_hdu_lock_write(hdu_out2) < 0) {
+      syslog (LOG_ERR, "could not lock to output buffer2");
+      return EXIT_FAILURE;
+    }
+  }
+  
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+      
+      
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
+      //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+      return EXIT_FAILURE;
+    }
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
+      //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
+      //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+      return EXIT_FAILURE;
+    }
+
+  if (bf) {
+    header_out = ipcbuf_get_next_write (hdu_out2->header_block);
+    if (!header_out)
+      {
+	syslog(LOG_ERR, "could not get next header2 block [output]");
+	dsaX_dbgpu_cleanup (hdu_in,0);
+	dsaX_dbgpu_cleanup (hdu_out,1);
+	if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
+	//dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+	return EXIT_FAILURE;
+      }
+    memcpy (header_out, header_in, header_size);
+    if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0)
+      {
+	syslog (LOG_ERR, "could not mark header block2 filled [output]");
+	dsaX_dbgpu_cleanup (hdu_in,0);
+	dsaX_dbgpu_cleanup (hdu_out,1);
+	if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+	//dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+	return EXIT_FAILURE;
+      }
+  }
+
+  
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+  
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
+  uint64_t  bytes_read = 0;
+  char * block, * output_buffer, * blockie;
+  output_buffer = (char *)malloc(sizeof(char)*block_out);
+  memset(output_buffer,0,block_out);
+  uint64_t written, block_id;
+
+  // set up
+
+  int observation_complete=0;
+  int blocks = 0;
+  int started = 0;
+
+
+  
+  syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+    }
+
+    // DO STUFF
+
+    // sort out write
+    hdu_out->data_block->curbuf = ipcbuf_get_next_write ((ipcbuf_t*)hdu_out->data_block);
+    hdu_out->data_block->marked_filled = 0;      
+    //blockie = ipcio_open_block_write(hdu_out->data_block, &block_id);
+    
+    // set up data structure
+    for (int i=0; i<nthreads; i++) {
+      args[i].in = block;
+      args[i].out = output_buffer;
+      args[i].n_threads = nthreads;
+      args[i].thread_id = i;
+      args[i].debug = 0;
+      args[i].ipc = hdu_out->data_block;
+      args[i].write = 1;
+    }
+
+    if (DEBUG) syslog(LOG_INFO,"creating %d threads",nthreads);
+    
+    for(int i=0; i<nthreads; i++){
+      if (pthread_create(&threads[i], &attr, &massage, (void *)(&args[i]))) {
+ 	syslog(LOG_ERR,"Failed to create massage thread %d\n", i);
+      }
+    }
+
+    pthread_attr_destroy(&attr);
+    if (DEBUG) syslog(LOG_INFO,"threads kinda running");
+    
+    for(int i=0; i<nthreads; i++){
+      pthread_join(threads[i], &result);
+      if (DEBUG) syslog(LOG_INFO,"joined thread %d",i);
+    }
+    
+    // write to output
+
+    //written = ipcio_write (hdu_out->data_block, output_buffer, block_out);
+    
+    if (bf) {
+
+      written = ipcio_write (hdu_out2->data_block, output_buffer, block_out);
+      if (written < block_out)
+	{
+	  syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+	  dsaX_dbgpu_cleanup (hdu_in,0);
+	  dsaX_dbgpu_cleanup (hdu_out,1);
+	  if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+	  //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+	  return EXIT_FAILURE;
+	}
+
+    }
+
+    // finish write
+    ipcbuf_mark_filled ((ipcbuf_t*)hdu_out->data_block, block_out);
+    ipcio_check_pending_sod (hdu_out->data_block);
+    hdu_out->data_block->marked_filled = 1;      
+    //ipcio_close_block_write(hdu_out->data_block, block_out);
+    
+    if (DEBUG) syslog(LOG_INFO, "written block %d",blocks);      
+    blocks++;
+    
+
+    if (bytes_read < block_size)
+      observation_complete = 1;
+
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+
+  }
+
+  free(output_buffer);
+
+  dsaX_dbgpu_cleanup (hdu_in,0);
+  dsaX_dbgpu_cleanup (hdu_out,1);
+  if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);	  
+  //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
+  
+}
+
+
diff --git a/legacy/dsaX_simplesplit.c b/legacy/dsaX_simplesplit.c
new file mode 100644
index 0000000..7a80c7e
--- /dev/null
+++ b/legacy/dsaX_simplesplit.c
@@ -0,0 +1,362 @@
+/* copies each input DADA block, unchanged, to one or (with -b) two output DADA buffers */
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+/* global variables */
+int DEBUG = 0;
+
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write);
+int dada_bind_thread_to_core (int core);
+
+/*
+ * Release one DADA HDU: drop the lock matching how it was opened, then
+ * destroy the handle.  A failed unlock is only logged.
+ *
+ *   in    - HDU to release
+ *   write - 0: unlock as reader; 1: unlock as writer; other: no-op
+ */
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write)
+{
+  switch (write) {
+  case 0:
+    if (dada_hdu_unlock_read (in) < 0)
+      syslog(LOG_ERR, "could not unlock read on hdu_in");
+    break;
+  case 1:
+    if (dada_hdu_unlock_write (in) < 0)
+      syslog(LOG_ERR, "could not unlock write on hdu_in");
+    break;
+  default:
+    return;
+  }
+  /* reached only for write == 0 or write == 1 */
+  dada_hdu_destroy (in);
+}
+
+/* Print the command-line summary for dsaX_simplesplit on stdout. */
+void usage()
+{
+  fprintf (stdout, "dsaX_simplesplit [options]\n"
+           " -c core   bind process to CPU core [no default]\n"
+           " -d send debug messages to syslog\n"
+           " -b connect to bf hdu\n"
+           " -i in_key [default CAPTURE_BLOCK_KEY]\n"
+           " -o out_key [default CAPTURED_BLOCK_KEY]\n"
+           " -j out_key2 [default REORDER_BLOCK_KEY2]\n"
+           " -h print usage\n");
+}
+
+
+/*
+ * Copy every input DADA block to the output ring buffer and, with -b, to a
+ * second (bf) one.  Returns EXIT_SUCCESS after -h or a clean run, else EXIT_FAILURE.
+ */
+int main (int argc, char *argv[]) {
+  // startup syslog message, using LOG_LOCAL0
+  openlog ("dsaX_simplesplit", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+  dada_hdu_t* hdu_out2 = 0;
+
+  // data block HDU keys
+  key_t in_key = CAPTURE_BLOCK_KEY;
+  key_t out_key = CAPTURED_BLOCK_KEY;
+  key_t out_key2 = REORDER_BLOCK_KEY2;
+
+  // command line arguments
+  int core = -1;
+  int bf = 0;
+  int arg = 0;
+
+  while ((arg=getopt(argc,argv,"c:i:o:j:dbh")) != -1)
+    {
+      switch (arg)
+        {
+        case 'c':
+          if (optarg)
+            {
+              core = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-c flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'i':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &in_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-i flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'o':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &out_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-o flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'j':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &out_key2) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-j flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'd':
+          DEBUG=1;
+          syslog (LOG_DEBUG, "Will excrete all debug messages");
+          break;
+        case 'b':
+          bf=1;
+          syslog (LOG_INFO, "Will write to bf dada hdu");
+          break;
+        case 'h':
+          usage();
+          return EXIT_SUCCESS;
+        }
+    }
+
+  // Bind to cpu core; success is only reported when the bind worked
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+        syslog(LOG_ERR,"failed to bind to core %d", core);
+      else
+        syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+
+  // DADA stuff
+
+  syslog (LOG_INFO, "creating in and out hdus");
+
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+
+  if (bf) {
+    hdu_out2  = dada_hdu_create (0);
+    dada_hdu_set_key (hdu_out2, out_key2);
+    if (dada_hdu_connect (hdu_out2) < 0) {
+      syslog (LOG_ERR,"could not connect to output  buffer2");
+      return EXIT_FAILURE;
+    }
+    if (dada_hdu_lock_write(hdu_out2) < 0) {
+      syslog (LOG_ERR, "could not lock to output buffer2");
+      return EXIT_FAILURE;
+    }
+  }
+
+  uint64_t header_size = 0;
+
+  // deal with headers: the input header is copied verbatim to each output
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+
+
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+      return EXIT_FAILURE;
+    }
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+      return EXIT_FAILURE;
+    }
+
+  if (bf) {
+    header_out = ipcbuf_get_next_write (hdu_out2->header_block);
+    if (!header_out)
+      {
+        syslog(LOG_ERR, "could not get next header2 block [output]");
+        dsaX_dbgpu_cleanup (hdu_in,0);
+        dsaX_dbgpu_cleanup (hdu_out,1);
+        dsaX_dbgpu_cleanup (hdu_out2,1);
+        return EXIT_FAILURE;
+      }
+    memcpy (header_out, header_in, header_size);
+    if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0)
+      {
+        syslog (LOG_ERR, "could not mark header block2 filled [output]");
+        dsaX_dbgpu_cleanup (hdu_in,0);
+        dsaX_dbgpu_cleanup (hdu_out,1);
+        dsaX_dbgpu_cleanup (hdu_out2,1);
+        return EXIT_FAILURE;
+      }
+  }
+
+
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+
+  // get block sizes and allocate the zero-filled staging buffer
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
+  // never copy more than the staging buffer can hold
+  uint64_t copy_len = (block_size < block_out) ? block_size : block_out;
+  uint64_t bytes_read = 0, written, block_id;
+  char * block;
+  char * output_buffer = (char *)calloc(block_out, sizeof(char));
+
+  int status = EXIT_SUCCESS;
+  int observation_complete=0;
+  int blocks = 0;
+  int started = 0;
+
+  if (!output_buffer) {
+    syslog(LOG_ERR, "could not allocate output buffer");
+    status = EXIT_FAILURE;
+    observation_complete = 1; // skip the loop, go straight to cleanup
+  }
+  else
+    syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    if (!block) {
+      syslog(LOG_ERR, "main: could not open input block (end of data or error)");
+      break;
+    }
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+    }
+
+    // copy to output buffer (bounded by the staging buffer size)
+    memcpy(output_buffer, block, copy_len);
+
+    // do write; stop on the first short write
+    written = ipcio_write (hdu_out->data_block, output_buffer, block_out);
+    if (bf && written == block_out)
+      written = ipcio_write (hdu_out2->data_block, output_buffer, block_out);
+    if (written != block_out) {
+      syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+      status = EXIT_FAILURE;
+      observation_complete = 1;
+    }
+    else if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks);
+    blocks++;
+
+    if (bytes_read < block_size)
+      observation_complete = 1;
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+  }
+
+  free(output_buffer);
+  dsaX_dbgpu_cleanup (hdu_in,0);
+  dsaX_dbgpu_cleanup (hdu_out,1);
+  if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+
+  return status;
+}
+
+
diff --git a/legacy/dsaX_splice.c b/legacy/dsaX_splice.c
new file mode 100644
index 0000000..b91e665
--- /dev/null
+++ b/legacy/dsaX_splice.c
@@ -0,0 +1,201 @@
+/* Splices 16 input files (one per correlator; 3840 times x 256 beams x 48
+channels each) into sigproc filterbank files: one per beam plus one holding
+all beams.
+Usage: dsaX_splice file1 ... file16
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <time.h>
+#include <arpa/inet.h>
+#include <sys/syscall.h>
+#include <syslog.h>
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "multilog.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+#include <src/sigproc.h>
+#include <src/header.h>
+
+
+FILE *output;
+
+/* Emit `string` on `output` as an int byte count followed by the bytes. */
+void send_string(char *string) /* includefile */
+{
+  int nbytes = strlen(string);
+  fwrite(&nbytes, sizeof nbytes, 1, output);
+  fwrite(string, sizeof(char), nbytes, output);
+}
+
+void send_float(char *name,float floating_point) /* includefile */
+{ /* record = keyword via send_string(), then the float's raw bytes */
+  send_string(name);
+  fwrite(&floating_point,sizeof floating_point,1,output);
+}
+
+void send_double (char *name, double double_precision) /* includefile */
+{ /* record = keyword via send_string(), then the double's raw bytes */
+  send_string(name);
+  fwrite(&double_precision,sizeof double_precision,1,output);
+}
+
+void send_int(char *name, int integer) /* includefile */
+{ /* record = keyword via send_string(), then the int's raw bytes */
+  send_string(name);
+  fwrite(&integer,sizeof integer,1,output);
+}
+
+void send_char(char *name, char integer) /* includefile */
+{ /* record = keyword via send_string(), then the single char */
+  send_string(name);
+  fwrite(&integer,sizeof integer,1,output);
+}
+
+
+void send_long(char *name, long integer) /* includefile */
+{ /* record = keyword via send_string(), then the long's raw bytes */
+  send_string(name);
+  fwrite(&integer,sizeof integer,1,output);
+}
+
+void send_coords(double raj, double dej, double az, double za) /*includefile*/
+{ /* a value of 0.0 or -1.0 is taken as "not set" and its record is skipped */
+  if ((raj != 0.0) && (raj != -1.0)) send_double("src_raj",raj);
+  if ((dej != 0.0) && (dej != -1.0)) send_double("src_dej",dej);
+  if ((az != 0.0)  && (az != -1.0))  send_double("az_start",az);
+  if ((za != 0.0)  && (za != -1.0))  send_double("za_start",za);
+}
+
+
+/* global variables */
+int quit_threads = 0;
+int dump_pending = 0;
+int trignum = 0;
+int dumpnum = 0;
+char iP[100];
+char srcnam[1024];
+float reclen;
+int DEBUG = 0;
+
+void usage()
+{ /* one-line synopsis on stdout */
+  fputs ("dsaX_splice [16 files]\n", stdout);
+}
+
+/* Write the sigproc filterbank header for `source_name` to `output`. */
+static void splice_write_header (char *source_name)
+{
+  send_string("HEADER_START");
+  send_string("source_name");
+  send_string(source_name);
+  send_int("machine_id",1);
+  send_int("telescope_id",82);
+  send_int("data_type",1); // filterbank data
+  send_double("fch1",1498.75); // THIS IS CHANNEL 0 :)
+  send_double("foff",-0.244140625);
+  send_int("nchans",768);
+  send_int("nbits",8);
+  send_double("tstart",55000.0);
+  send_double("tsamp",8.192e-6*8.*16.);
+  send_int("nifs",1);
+  send_string("HEADER_END");
+}
+
+/*
+ * Splice 16 input files (argv[1..16]; 3840 times x 256 beams x 48 channels
+ * each, one byte per sample) into 256 per-beam filterbank files plus one
+ * file with all beams.  Returns EXIT_SUCCESS, or EXIT_FAILURE on bad usage,
+ * failed allocation or a file that cannot be opened.
+ */
+int main (int argc, char *argv[]) {
+  // startup syslog message, using LOG_LOCAL0
+  openlog ("dsaX_splice", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  // 16 corrs, 3840 times, 256 beams, 48 chans
+  const size_t ncorr = 16, ntime = 3840, nbeam = 256, nchan = 48;
+  const size_t corr_bytes = ntime*nbeam*nchan;   // one input file
+  const size_t beam_bytes = ntime*ncorr*nchan;   // one output beam
+  const size_t total_bytes = ncorr*corr_bytes;
+  char foutnam[200];
+  int status = EXIT_FAILURE;
+  if (argc < 17) {
+    usage();
+    return EXIT_FAILURE;
+  }
+
+  char * bigarr = (char *)calloc(total_bytes, sizeof(char)); // zeroed: short inputs stay defined
+  char * tarr = (char *)malloc(sizeof(char)*total_bytes);
+  if (!bigarr || !tarr) {
+    syslog(LOG_ERR, "could not allocate splice buffers");
+    goto done;
+  }
+
+  // read into input array
+  for (size_t i=0;i<ncorr;i++) {
+    FILE *fin = fopen(argv[i+1],"rb");
+    if (!fin) {
+      syslog(LOG_ERR, "could not open input file %s", argv[i+1]);
+      goto done;
+    }
+    if (fread(bigarr+i*corr_bytes,corr_bytes,1,fin) != 1)
+      syslog(LOG_WARNING, "short read from input file %s", argv[i+1]);
+    fclose(fin);
+  }
+
+  // reorder bigarr: [corr, time, beam, chan] -> [beam, time, corr, chan]
+  for (size_t i=0;i<ncorr;i++)
+    for (size_t j=0;j<ntime;j++)
+      for (size_t k=0;k<nbeam;k++)
+        memcpy(tarr + k*beam_bytes + j*ncorr*nchan + i*nchan,
+               bigarr + i*corr_bytes + j*nbeam*nchan + k*nchan, nchan);
+  free(bigarr);
+  bigarr = NULL;   // keeps the free() at `done` harmless
+
+  // loop over beams and write out all filterbanks
+  for (size_t i=0;i<nbeam;i++) {
+    snprintf(foutnam,sizeof foutnam,"/home/ubuntu/data/fb_%d.fil",(int)i);
+    if (!(output = fopen(foutnam,"wb"))) {
+      printf("Couldn't open output file\n");
+      goto done;
+    }
+    snprintf(srcnam,sizeof srcnam,"fb_%d",(int)i);
+    splice_write_header(srcnam);
+    fwrite(tarr + i*beam_bytes,beam_bytes,1,output);
+    fclose(output);
+  }
+
+  // write out full filterbank
+  snprintf(foutnam,sizeof foutnam,"/home/ubuntu/data/fb_all.fil");
+  if (!(output = fopen(foutnam,"wb"))) {
+    printf("Couldn't open output file\n");
+    goto done;
+  }
+  snprintf(srcnam,sizeof srcnam,"fb_all");
+  splice_write_header(srcnam);
+  fwrite(tarr,total_bytes,1,output);
+  fclose(output);
+  status = EXIT_SUCCESS;
+ done:
+  free(bigarr);
+  free(tarr);
+  return status;
+}
diff --git a/legacy/dsaX_split.c b/legacy/dsaX_split.c
new file mode 100644
index 0000000..1361e86
--- /dev/null
+++ b/legacy/dsaX_split.c
@@ -0,0 +1,601 @@
+/* will reorder raw data for input to xgpu */
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+/* global variables */
+int DEBUG = 0;
+int STATS = 0;
+const int nth = 4;
+
+// data to pass to threads
+struct data {
+  char * in;       // source buffer read by massage()
+  char * out;      // destination buffer written by massage()
+  char * out2;     // second destination, written only when bf is non-zero
+  int bf;          // non-zero: massage() also copies into out2
+  int reorder;     // non-zero: reordering copy; zero: straight memcpy
+  int n_threads;   // number of threads sharing the 2048 time samples
+  int thread_id;   // this thread's index; also selects its entry in cores[]
+};
+int cores[8] = {10, 11, 12, 13, 14, 15, 16, 17};
+
+
+/*
+ * Thread body.  `args` points to a struct data.  The thread pins itself to
+ * cores[thread_id], then copies its share of the 2048 time samples from
+ * d->in to d->out (and to d->out2 when d->bf is set): a straight memcpy
+ * when d->reorder is 0, otherwise a copy that swaps the antenna axis with
+ * the 48-entry channel axis in 32-byte units.  Always returns NULL.
+ */
+void * massage (void *args) {
+  struct data *d = args;
+  int thread_id = d->thread_id;
+
+  // set affinity
+  const pthread_t pid = pthread_self();
+  const int core_id = cores[thread_id];
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(core_id, &cpuset);
+  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (set_result != 0)
+    syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
+  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (get_affinity != 0)
+    syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
+  if (CPU_ISSET(core_id, &cpuset))
+    if (DEBUG) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id);
+
+  // extract from input
+  char *in = (char *)d->in;
+  int bf = d->bf;
+  int reorder = d->reorder;
+  int n_threads = d->n_threads;
+  if (!reorder) {
+    const int slab = (2048/n_threads)*1536*NANT;   // bytes handled per thread
+    memcpy(d->out + thread_id*slab, in + thread_id*slab, slab);
+    if (bf)
+      memcpy(d->out2 + thread_id*slab, in + thread_id*slab, slab);
+  }
+  else {
+    int block = 16;   // block for transpose
+    for (int i=(int)(thread_id*(2048/n_threads));i<(int)((thread_id + 1)*2048/n_threads);i++) { // over time
+      for (int i1 = 0; i1 < 48; i1 += block) {
+        for(int j = 0; j < NANT; j++) {
+          for(int b = 0; b < block && i1 + b < 48; b++) {
+            memcpy(d->out + i*1536*NANT + (i1+b)*NANT*32 + j*32, in + i*1536*NANT + j*1536 + (i1+b)*32, 32);
+            if (bf) memcpy(d->out2 + i*1536*NANT + (i1+b)*NANT*32 + j*32, in + i*1536*NANT + j*1536 + (i1+b)*32, 32);
+          }
+        }
+      }
+    }
+  }
+
+  // the result must not point into this thread's stack, which dies with the thread
+  return NULL;
+}
+
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write);
+int dada_bind_thread_to_core (int core);
+void reorder_block(char *block, char *output);
+void calc_stats(char *block);
+
+// calculates rms for each pol from the first packet in each block. 
+// block has shape [2048 time, NANT antennas, 768 channels, 2 pol, r/i]
+void calc_stats(char *input) {
+
+  float rmss[NANT*2];
+  for (int n=0;n<NANT*2;n++) rmss[n] = 0.;
+
+  // only the first NANT*1536 bytes of the block are examined
+  for (int ant=0;ant<NANT;ant++) {
+    for (int chan=0;chan<768;chan++) {
+      for (int pol=0;pol<2;pol++) {
+        const char sample = input[ant*1536+chan*2+pol];
+        float *acc = &rmss[ant*2+pol];
+        // low nibble, then high nibble (sign-extended where char is signed)
+        *acc += pow((float)(((char)((sample & 15) << 4)) >> 4),2.);
+        *acc += pow((float)(((char)((sample & 240))) >> 4),2.);
+      }
+    }
+  }
+
+  if (STATS) {
+    for (int ant=0;ant<NANT;ant++)
+      syslog(LOG_INFO,"RMS_ant_2pol %d %g %g",ant,sqrt(rmss[2*ant]/768.0),sqrt(rmss[2*ant+1]/768.0));
+  }
+
+}
+
+// performs cpu reorder of block to be loaded to GPU
+void reorder_block(char * block, char * output) {
+
+  // Swap the antenna and channel axes of a 2048-sample block:
+  //   in : [2048 time, NANT antennas, 48 channels, 32 bytes]
+  //   out: [2048 time, 48 channels, NANT antennas, 32 bytes]
+  // Each time sample occupies 1536*NANT bytes in both layouts.
+  const int unit = 32; // bytes moved per memcpy
+
+  for (int t=0;t<2048;t++) { // over time
+    char * dst_t = output + t*1536*NANT;
+    char * src_t = block + t*1536*NANT;
+
+    for (int ch=0;ch<48;ch++) { // over channels
+      for (int ant=0;ant<NANT;ant++) // over ants
+        memcpy(dst_t + ch*NANT*unit + ant*unit, src_t + ant*1536 + ch*unit, unit);
+    }
+  }
+
+}
+
+
+/*
+ * Unlock and destroy a single DADA HDU.
+ *   write == 0 : the HDU was locked for reading
+ *   write == 1 : the HDU was locked for writing
+ * Any other value does nothing.  Unlock failures are logged to syslog and
+ * the HDU is destroyed anyway.
+ */
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write)
+{
+  int unlock_status;
+
+  if (write == 0)
+    unlock_status = dada_hdu_unlock_read (in);
+  else if (write == 1)
+    unlock_status = dada_hdu_unlock_write (in);
+  else
+    return;
+
+  if (unlock_status < 0)
+    syslog(LOG_ERR, write ? "could not unlock write on hdu_in"
+                          : "could not unlock read on hdu_in");
+
+  dada_hdu_destroy (in);
+}
+
+/* Print the dsaX_split option summary on stdout. */
+void usage()
+{
+  fputs ("dsaX_split [options]\n"
+         " -c core   bind process to CPU core [no default]\n"
+         " -d send debug messages to syslog\n"
+         " -m multithread write\n"
+         " -b connect to bf hdu\n"
+         " -r reorder\n"
+         " -i in_key [default CAPTURE_BLOCK_KEY]\n"
+         " -o out_key [default CAPTURED_BLOCK_KEY]\n"
+         " -j out_key2 [default REORDER_BLOCK_KEY2]\n"
+         " -s stats\n"
+         " -f send fake blocks through [default 0]\n"
+         " -h print usage\n", stdout);
+}
+
+
+/*
+ * Split each input DADA block into output-sized slices and pass them, optionally
+ * reordered (-r), to one or (with -b) two output buffers.  Returns EXIT_SUCCESS or EXIT_FAILURE.
+ */
+int main (int argc, char *argv[]) {
+  // startup syslog message, using LOG_LOCAL0
+  openlog ("dsaX_split", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+  dada_hdu_t* hdu_out2 = 0;
+
+  // data block HDU keys
+  key_t in_key = CAPTURE_BLOCK_KEY;
+  key_t out_key = CAPTURED_BLOCK_KEY;
+  key_t out_key2 = REORDER_BLOCK_KEY2;
+
+  // command line arguments
+  int core = -1;
+  int bf = 0;
+  int arg = 0;
+  int reorder = 0;
+  int mwrite = 0;
+  int fake = 0;
+
+  while ((arg=getopt(argc,argv,"c:i:o:j:f:smdbrh")) != -1)
+    {
+      switch (arg)
+        {
+        case 'c':
+          if (optarg)
+            {
+              core = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-c flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'i':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &in_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-i flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'f':
+          if (optarg)
+            {
+              fake = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-f flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'o':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &out_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-o flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'j':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &out_key2) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-j flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'd':
+          DEBUG=1;
+          syslog (LOG_DEBUG, "Will excrete all debug messages");
+          break;
+        case 'r':
+          reorder=1;
+          syslog (LOG_INFO, "Will do reorder");
+          break;
+        case 'm':
+          mwrite=1;
+          syslog (LOG_INFO, "Will do multithread write");
+          break;
+        case 's':
+          STATS=1;
+          syslog (LOG_INFO, "Will print stats");
+          break;
+        case 'b':
+          bf=1;
+          syslog (LOG_INFO, "Will write to bf dada hdu");
+          break;
+        case 'h':
+          usage();
+          return EXIT_SUCCESS;
+        }
+    }
+
+  // Bind to cpu core; success is only reported when the bind worked
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+        syslog(LOG_ERR,"failed to bind to core %d", core);
+      else
+        syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+
+  // DADA stuff
+
+  syslog (LOG_INFO, "creating in and out hdus");
+
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+
+  if (bf) {
+    hdu_out2  = dada_hdu_create (0);
+    dada_hdu_set_key (hdu_out2, out_key2);
+    if (dada_hdu_connect (hdu_out2) < 0) {
+      syslog (LOG_ERR,"could not connect to output  buffer2");
+      return EXIT_FAILURE;
+    }
+    if (dada_hdu_lock_write(hdu_out2) < 0) {
+      syslog (LOG_ERR, "could not lock to output buffer2");
+      return EXIT_FAILURE;
+    }
+  }
+
+  uint64_t header_size = 0;
+
+  // deal with headers: the input header is copied verbatim to each output
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+
+
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+      return EXIT_FAILURE;
+    }
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+      return EXIT_FAILURE;
+    }
+
+  if (bf) {
+    header_out = ipcbuf_get_next_write (hdu_out2->header_block);
+    if (!header_out)
+      {
+        syslog(LOG_ERR, "could not get next header2 block [output]");
+        dsaX_dbgpu_cleanup (hdu_in,0);
+        dsaX_dbgpu_cleanup (hdu_out,1);
+        dsaX_dbgpu_cleanup (hdu_out2,1);
+        return EXIT_FAILURE;
+      }
+    memcpy (header_out, header_in, header_size);
+    if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0)
+      {
+        syslog (LOG_ERR, "could not mark header block2 filled [output]");
+        dsaX_dbgpu_cleanup (hdu_in,0);
+        dsaX_dbgpu_cleanup (hdu_out,1);
+        dsaX_dbgpu_cleanup (hdu_out2,1);
+        return EXIT_FAILURE;
+      }
+  }
+
+
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+
+  // set up
+  int status = EXIT_SUCCESS;
+  int observation_complete=0;
+  int blocks = 0;
+  int started = 0;
+
+  // get block sizes and allocate zero-filled staging buffers
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  uint64_t nints = block_size / block_out;
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
+  uint64_t bytes_read = 0, written, block_id;
+  char * block, * o1 = NULL, * o2 = NULL;
+  char * output_buffer = (char *)calloc(block_out, sizeof(char));
+  char * output = (char *)calloc(block_out, sizeof(char));
+  if (!output_buffer || !output) {
+    syslog(LOG_ERR, "could not allocate output buffers");
+    status = EXIT_FAILURE;
+    observation_complete = 1; // skip straight to cleanup
+    fake = 0;
+  }
+
+  // set up threads; attr is shared by every batch and destroyed at exit
+  struct data args[8];
+  pthread_t threads[8];
+  pthread_attr_t attr;
+  pthread_attr_init(&attr);
+  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+  void* result=0;
+
+  // send through fake (all-zero) blocks
+  if (fake>0) {
+    syslog(LOG_INFO,"sending %d fake blocks",fake);
+    for (int i=0;i<fake;i++) {
+      o1 = ipcio_open_block_write (hdu_out->data_block, &block_id);
+      if (!o1) break;
+      memcpy(o1, output, block_out);
+      ipcio_close_block_write (hdu_out->data_block, block_out);
+      usleep(10000);
+    }
+    syslog(LOG_INFO,"Finished with fake blocks");
+  }
+
+  syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    if (!block) {
+      syslog(LOG_ERR, "main: could not open input block (end of data or error)");
+      break;
+    }
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+    }
+
+    // a short block is the last one; tested here so it also ends the run when nints is 0
+    if (bytes_read < block_size)
+      observation_complete = 1;
+
+    // DO STUFF: one pass per output-sized slice of the input block
+    for (uint64_t myint=0;myint<nints;myint++) {
+      // copy to output buffer
+      memcpy(output_buffer, block + myint*block_out, block_out);
+
+      if (mwrite) {
+        o1 = ipcio_open_block_write (hdu_out->data_block, &block_id);
+        if (bf) o2 = ipcio_open_block_write (hdu_out2->data_block, &block_id);
+        if (!o1 || (bf && !o2)) {
+          syslog(LOG_ERR, "main: could not open output block for writing");
+          status = EXIT_FAILURE;
+          observation_complete = 1;
+          break;
+        }
+      }
+
+      // stats
+      if (STATS) calc_stats(output_buffer);
+
+      // set up data structure
+      for (int i=0; i<nth; i++) {
+        args[i].in = output_buffer;
+        args[i].reorder = reorder;
+        args[i].bf = 0;
+        args[i].out2 = NULL;
+        if (mwrite) {
+          args[i].out = o1;
+          if (bf) {
+            args[i].out2 = o2;
+            args[i].bf = 1;
+          }
+        }
+        else
+          args[i].out = output;
+        args[i].n_threads = nth;
+        args[i].thread_id = i;
+      }
+      syslog(LOG_INFO, "creating threads");
+
+      // attr stays valid across iterations; join only what was created
+      int nstarted = 0;
+      while (nstarted < nth) {
+        if (pthread_create(&threads[nstarted], &attr, &massage, (void *)(&args[nstarted]))) {
+          syslog(LOG_ERR,"Failed to create massage thread %d\n", nstarted);
+          break;
+        }
+        nstarted++;
+      }
+      if (DEBUG) syslog(LOG_DEBUG,"threads kinda running");
+      for(int i=0; i<nstarted; i++){
+        pthread_join(threads[i], &result);
+        if (DEBUG) syslog(LOG_DEBUG,"joined thread %d",i);
+      }
+
+      if (!mwrite) {
+        // hdu_out carries the reordered data only when it is the sole output; with -b
+        // the reordered data goes to hdu_out2.  Each buffer gets one write per slice.
+        char * first = (reorder && !bf) ? output : output_buffer;
+        char * second = reorder ? output : output_buffer;
+        written = ipcio_write (hdu_out->data_block, first, block_out);
+        if (bf && written == block_out)
+          written = ipcio_write (hdu_out2->data_block, second, block_out);
+        if (written != block_out) {
+          syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+          status = EXIT_FAILURE;
+          observation_complete = 1;
+          break;
+        }
+      }
+      else {
+        ipcio_close_block_write (hdu_out->data_block, block_out);
+        if (bf) ipcio_close_block_write (hdu_out2->data_block, block_out);
+      }
+
+      if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks);
+      blocks++;
+    }
+
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+  }
+  pthread_attr_destroy(&attr);
+  free(output_buffer);
+  free(output);
+  dsaX_dbgpu_cleanup (hdu_in,0);
+  dsaX_dbgpu_cleanup (hdu_out,1);
+  if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+
+  return status;
+}
+
+
diff --git a/legacy/dsaX_splitup.c b/legacy/dsaX_splitup.c
new file mode 100644
index 0000000..32f055d
--- /dev/null
+++ b/legacy/dsaX_splitup.c
@@ -0,0 +1,285 @@
+/* will reorder raw data for input to xgpu */
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+// global variables
+int DEBUG = 0;
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
+int dada_bind_thread_to_core (int core);
+
+/* Release the read lock on `in` and the write lock on `out`, then destroy both HDUs.
+ * A failed unlock is reported to syslog; the HDU is destroyed regardless. */
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
+{
+  // reader side
+  const int in_rc = dada_hdu_unlock_read (in);
+  if (in_rc < 0)
+    syslog(LOG_ERR, "could not unlock read on hdu_in");
+  dada_hdu_destroy (in);
+
+  // writer side
+  const int out_rc = dada_hdu_unlock_write (out);
+  if (out_rc < 0)
+    syslog(LOG_ERR, "could not unlock write on hdu_out");
+  dada_hdu_destroy (out);
+}
+
+/* Print the dsaX_splitup command-line help to stdout. */
+void usage()
+{
+  fprintf (stdout, "dsaX_splitup [options]\n"
+	   " -c core   bind process to CPU core [no default]\n"
+	   " -d send debug messages to syslog\n"
+	   " -i in_key [default TEST_BLOCK_KEY]\n"
+	   " -o out_key [default REORDER_BLOCK_KEY2]\n"
+	   " -h print usage\n");
+}
+
+/* MAIN: connect to the input and output DADA buffers, forward the header, then
+   re-emit every input block as block_size/block_out consecutive output writes. */
+int main (int argc, char *argv[]) {
+
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_splitup", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+  
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+
+  // data block HDU keys
+  key_t in_key = TEST_BLOCK_KEY;
+  key_t out_key = REORDER_BLOCK_KEY2;
+  
+  // command line arguments (-f is still accepted by getopt but has no effect)
+  int core = -1;
+  int arg = 0;
+  
+  while ((arg=getopt(argc,argv,"c:f:i:o:dh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'i':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &in_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-i flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      else
+	syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+  // DADA stuff
+  syslog (LOG_INFO, "creating in and out hdus");
+  
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+  
+  uint64_t header_size = 0;
+
+  // deal with headers: read the input header, release it, and copy it to the output
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+  
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
+  uint64_t  bytes_read = 0;
+  uint64_t nsplits = block_size/block_out; // integer division: any remainder of the input block is not forwarded
+  char * block, * output_buffer;
+  output_buffer = (char *)malloc(sizeof(char)*block_out);
+  if (!output_buffer) {
+    syslog(LOG_ERR, "main: could not allocate output buffer");
+    dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+    return EXIT_FAILURE;
+  }
+  memset(output_buffer,0,block_out);
+  uint64_t written, block_id;
+
+  // set up
+  int observation_complete=0;
+  int blocks = 0, started = 0;
+  
+  syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    if (!block) {
+      syslog(LOG_ERR, "main: could not open block for reading");
+      free(output_buffer);
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+    }
+
+    // split the input block into nsplits output-sized writes
+    for (uint64_t i=0;i<nsplits;i++) {
+      memcpy(output_buffer,block+i*block_out,block_out);
+
+      // write to output; != also catches an error return, which wraps to a huge unsigned value
+      written = ipcio_write (hdu_out->data_block, output_buffer, block_out);
+      if (written != block_out)
+	{
+	  syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+	  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+	  return EXIT_FAILURE;
+	}
+      
+      if (DEBUG) {
+	syslog(LOG_DEBUG, "written block %d",blocks);      
+      }
+      blocks++;
+    }
+
+    // a short read marks the final block of the observation
+    if (bytes_read < block_size)
+      observation_complete = 1;
+
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+  }
+
+  free(output_buffer);
+  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+  return EXIT_SUCCESS;
+  
+}
+
+
diff --git a/legacy/dsaX_store.c b/legacy/dsaX_store.c
new file mode 100644
index 0000000..849c27c
--- /dev/null
+++ b/legacy/dsaX_store.c
@@ -0,0 +1,218 @@
+/* Code to read from a raw data buffer and write to disk */
+
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <syslog.h>
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in);
+int dada_bind_thread_to_core (int core);
+
+/* Release the read lock held on `in` and destroy the HDU.
+ * An unlock failure is only logged; the HDU is destroyed either way. */
+void dsaX_dbgpu_cleanup (dada_hdu_t * in)
+{
+  const int unlock_rc = dada_hdu_unlock_read (in);
+
+  if (unlock_rc < 0)
+    syslog(LOG_ERR, "could not unlock read on hdu_in");
+  dada_hdu_destroy (in);
+}
+
+/* Print the dsaX_store command-line help to stdout. */
+void usage()
+{
+  fprintf (stdout, "dsaX_store [options]\n"
+	   " -c core   bind process to CPU core\n"
+	   " -k in_key [default fafa]\n"
+	   " -h print usage\n");
+}
+
+/* MAIN: read blocks from the input DADA buffer and write each one to its own file on disk. */
+int main (int argc, char *argv[]) {
+
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_store", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  
+  // input data block HDU key
+  key_t in_key = 0x0000fafa;
+
+  // command line arguments
+  uint64_t blocksize;
+  uint64_t bout = 32*NSNAPS*4608; // output block size - assume input is a multiple.
+  int core = -1;
+  int arg=0;
+
+  while ((arg=getopt(argc,argv,"c:k:h")) != -1)
+    {
+      switch (arg)
+	{
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      printf ("ERROR: -c flag requires argument\n");
+	      return EXIT_FAILURE;
+	    }
+	case 'k':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &in_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-k flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // DADA stuff
+
+  // open connection to the in/read DB
+  
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to input buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"dsaX_correlator_copy: could not lock to input buffer");
+    return EXIT_FAILURE;
+  }
+  
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      syslog(LOG_INFO,"binding to core %d", core);
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"dsaX_correlator_copy: failed to bind to core %d",core);
+    }
+  
+  // more DADA stuff - deal with headers
+  
+  uint64_t header_size = 0;
+
+  // read the header from the input HDU
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "main: could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in);
+      return EXIT_FAILURE;
+    }
+  
+  // mark the input header as cleared
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared [input]");
+      dsaX_dbgpu_cleanup (hdu_in);
+      return EXIT_FAILURE;
+    }
+
+  int observation_complete=0;
+  int status = EXIT_SUCCESS; // becomes EXIT_FAILURE when the read loop has to stop early
+
+  // stuff for writing data
+  blocksize = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  int ngulps = (int)(blocksize/bout);
+  int gulp = 0, wseq = 0;
+  char *in_data;
+  uint64_t block_id, bytes_read=0;
+  FILE *fout;
+  char fnam[100];
+  
+  syslog(LOG_INFO, "have ngulps %d, blocksize %lu, bout %lu",ngulps,blocksize,bout);
+
+  // main reading loop
+  syslog(LOG_INFO, "main: starting read");
+
+  while (!observation_complete) {
+
+    // read a DADA block
+    in_data = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    if (!in_data) {
+      syslog(LOG_ERR, "main: could not open block for reading");
+      status = EXIT_FAILURE;
+      break;
+    }
+    syslog(LOG_INFO, "starting new write (seq %d)",wseq);
+
+    // open file for writing
+    snprintf(fnam,sizeof(fnam),"/home/ubuntu/data/fl_%d.out",wseq);
+    fout = fopen(fnam,"wb");
+    if (!fout) {
+      syslog(LOG_ERR, "main: could not open file %s for writing",fnam);
+      status = EXIT_FAILURE;
+      break;
+    }
+    for (gulp=0;gulp<ngulps;gulp++) {
+
+      // write one gulp straight from the open input block, pausing 40 ms before each
+      usleep(40000);
+      if (fwrite(in_data+gulp*bout, 1, bout, fout) != bout)
+	syslog(LOG_ERR, "main: short write to file %s",fnam);
+
+    }
+    fclose(fout);
+    wseq++;
+    syslog(LOG_INFO, "main: finished new write to file %s",fnam);
+    
+    // for exiting
+    if (bytes_read < blocksize) {
+      observation_complete = 1;
+      syslog(LOG_INFO, "main: finished, with bytes_read %lu < expected %lu", bytes_read, blocksize);
+    }
+
+    // close block for reading
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+
+  }
+  
+  dsaX_dbgpu_cleanup (hdu_in);
+  return status;
+  
+}
+  
diff --git a/legacy/dsaX_testdada.c b/legacy/dsaX_testdada.c
new file mode 100644
index 0000000..bbe7640
--- /dev/null
+++ b/legacy/dsaX_testdada.c
@@ -0,0 +1,161 @@
+/* will reorder raw data for input to xgpu */
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+#include "xgpu.h"
+
+// Print len bytes at ptr to stdout as hex; the unsigned char cast keeps bytes >= 0x80 from sign-extending.
+void print_arr(char *ptr, int len) {
+  printf("\n[");
+  for (int i = 0; i < len; i++) {
+    printf(" %08x,", (unsigned int)(unsigned char)ptr[i]);
+  }
+  printf(" ]\n");
+}
+
+// read and write functions
+
+// Write one zero-filled block to hdu_in under the write lock; returns 0, or -1 if allocation fails.
+int write_block(dada_hdu_t* hdu_in) {
+  dada_hdu_lock_write(hdu_in);
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  char * data = (char *)calloc(block_size, sizeof(char));
+  int rc = data ? 0 : -1;
+  if (data) ipcio_write (hdu_in->data_block, data, block_size);
+  free(data);
+  dada_hdu_unlock_write (hdu_in);
+  return rc;
+}
+
+/* Read one block from hdu_in under the read lock and hex-dump it to stdout.
+   Returns 0 on success, -1 if no block could be opened. */
+int read_block(dada_hdu_t* hdu_in) {
+  int rc = -1;
+  uint64_t  bytes_read = 0, block_id = 0;
+
+  dada_hdu_lock_read(hdu_in);
+  char * block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+  if (block) {
+    // dump straight from the opened block; an intermediate copy is not needed
+    print_arr(block, (int)(bytes_read));
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+    rc = 0;
+  }
+  dada_hdu_unlock_read (hdu_in);
+  return rc;
+}
+
+
+
+// MAIN: connect to the DADA buffer selected with -i (hex key) and loop forever,
+// alternately writing four zero-filled blocks and reading four blocks back.
+int main (int argc, char *argv[]) {
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+
+  // data block HDU keys
+  key_t in_key = TEST_BLOCK_KEY;
+  
+  // command line arguments
+  int arg = 0;
+  char *hout;
+  hout = (char *)malloc(sizeof(char)*4096);
+
+  while ((arg=getopt(argc,argv,"i:h:")) != -1)
+    {
+      switch (arg)
+	{
+	case 'i':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &in_key) != 1) {
+		fprintf (stderr, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	case 'h':
+	  if (optarg)
+	    {
+	      fileread (optarg, hout, 4096);
+	      break;
+	    }	 
+	}
+    }
+  
+  // DADA stuff  
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    fprintf (stderr, "could not connect to dada buffer\n");
+    return EXIT_FAILURE;
+  }
+
+  /*
+  // deal with header
+  dada_hdu_lock_write(hdu_in);
+  char * header_out = ipcbuf_get_next_write (hdu_in->header_block);
+  memcpy (header_out, hout, 4096);
+  ipcbuf_mark_filled (hdu_in->header_block, 4096);
+  dada_hdu_unlock_write(hdu_in);
+  free(hout);
+
+  dada_hdu_lock_read(hdu_in);
+  uint64_t header_size = 0;
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  ipcbuf_mark_cleared (hdu_in->header_block);
+  dada_hdu_unlock_read(hdu_in);
+  */
+
+  // do four reads and four writes
+  while (1) {
+    printf("writing four blocks... ");
+    for (int i=0;i<4;i++) {
+      write_block(hdu_in);
+      usleep(500000); // half a second; sleep() takes whole seconds, so 0.5 would truncate to 0
+    }
+    printf("written\n");
+    
+    sleep(2);
+    
+    printf("reading four blocks... ");
+    for (int i=0;i<4;i++) {
+      read_block(hdu_in);
+      usleep(500000);
+    }
+    printf("read\n");
+  }
+}
+
+
diff --git a/legacy/dsaX_trigger.c b/legacy/dsaX_trigger.c
new file mode 100644
index 0000000..9592389
--- /dev/null
+++ b/legacy/dsaX_trigger.c
@@ -0,0 +1,585 @@
+/* Code to read from a single dada buffer, and write to disk upon receiving
+a trigger. Uses pthread threads and shared memory to listen. 
+Sequence of events:
+ - starts null-reading dump buffer, while listening for socket command
+   + for N second dump, assume N-second dada blocks
+ - receives time-since-start, which is converted into a block_start, byte_start, and block_end and byte_end. Sets dump pending, during which time no commands can be accepted. 
+ - Upon seeing dump_pending, read code copies data to output dada buffer, which is plugged into dbdisk. Unsets dump_pending.
+*/
+
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+#include "dsaX_capture.h"
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_def.h"
+
+// data to pass to threads
+struct cdata {
+  char * in;
+  dada_hdu_t * hdu_out;
+};
+
+
+/* global variables */
+int quit_threads = 0;
+int dump_pending = 0;
+uint64_t specnum = 0;
+uint64_t procnum = 0;
+int trignum = 0;
+int dumpnum = 0;
+char iP[100];
+char footer_buf[1024];
+int DEBUG = 0;
+volatile int docopy = 0;
+volatile int dumping = 0;
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
+int dada_bind_thread_to_core (int core);
+
+/* Release the read lock on `in` and the write lock on `out`, then destroy both HDUs.
+ * Unlock failures are logged to syslog; each HDU is destroyed regardless. */
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
+{
+  
+  if (dada_hdu_unlock_read (in) < 0)
+    {
+      syslog(LOG_ERR, "could not unlock read on hdu_in");
+    }
+  dada_hdu_destroy (in);
+
+  if (dada_hdu_unlock_write (out) < 0)
+    {
+      // `out` is released with the write unlock, so report it as such
+      syslog(LOG_ERR, "could not unlock write on hdu_out");
+    }
+  dada_hdu_destroy (out);
+}
+
+/* Print the dsaX_trigger command-line help to stdout (only options that main's getopt accepts). */
+void usage()
+{
+  fprintf (stdout,
+	   "dsaX_trigger [options]\n"
+	   " -c core   bind process to CPU core\n"
+	   " -i IP to listen to [no default]\n"
+	   " -j in_key [default eaea]\n"
+	   " -o out_key [default fafa]\n"
+	   " -d debug\n"
+	   " -f full_pct [default 0.8]\n"
+	   " -s skip N blocks [default 0]\n"
+	   " -h print usage\n");
+}
+
+// Thread to control writing of data to the output buffer.
+// arg is a struct cdata*. Waits for docopy, writes one output-block-sized chunk from d->in,
+// clears dumping/dump_pending/docopy, and repeats until quit_threads is set.
+void copy_thread (void * arg) {
+
+  struct cdata *d = arg;
+  char *in = (char *)d->in;
+  dada_hdu_t * hdu_out = (dada_hdu_t *)d->hdu_out;
+
+  uint64_t written = 0;
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO,"in thread... blocksize %"PRIu64"",block_size);
+  
+  while (!quit_threads) {
+
+    // poll for work, but stop waiting once shutdown is requested so main can join this thread
+    while (docopy==0 && !quit_threads) usleep(100);
+    if (docopy==0) break;
+  
+    // != also catches an error return, which wraps to a huge unsigned value
+    written = ipcio_write (hdu_out->data_block, in, block_size);
+    if (written != block_size)
+      syslog(LOG_ERR,"failed to write all trigger data to datablock [output]");
+
+    dumping = 0;
+    dump_pending = 0;
+    docopy=0;
+    syslog(LOG_INFO,"Finished writing trigger");
+  }
+  // nothing to hand back; the address of a stack local must not outlive the thread
+  pthread_exit(NULL);
+}
+
+// Thread to control the dumping of data.
+// arg is a udpdb_t*. Receives UDP trigger packets on iP:11227; the number before the first
+// '-' becomes specnum, the whole packet is kept in footer_buf, and dump_pending is raised.
+void control_thread (void * arg) {
+
+  udpdb_t * ctx = (udpdb_t *) arg;
+  syslog(LOG_INFO, "control_thread: starting");
+
+  // port on which to listen for control commands
+  int port = ctx->control_port;
+
+  // buffer for incoming command strings (plus an untouched copy), and setup of socket
+  int bufsize = 1024;
+  char* buffer = (char *) malloc (sizeof(char) * bufsize);
+  char* tbuf = (char *) malloc (sizeof(char) * bufsize);
+
+  struct addrinfo hints;
+  struct addrinfo* res=0;
+  memset(&hints,0,sizeof(hints));
+  struct sockaddr_storage src_addr;
+  socklen_t src_addr_len;
+  hints.ai_family=AF_INET;
+  hints.ai_socktype=SOCK_DGRAM;
+  int fd;
+  ssize_t ct;
+  char *endptr;
+  uint64_t tmps;
+
+  if (!buffer || !tbuf || getaddrinfo(iP,"11227",&hints,&res) != 0 || !res) {
+    syslog(LOG_ERR, "control_thread: could not allocate buffers or resolve %s", iP);
+    free (buffer);
+    free (tbuf);
+    pthread_exit(NULL);
+  }
+  
+  syslog(LOG_INFO, "control_thread: created socket on port %d", port);
+  
+  while (!quit_threads) {
+    
+    fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol);
+    bind(fd,res->ai_addr,res->ai_addrlen);
+    // clear the whole buffer; sizeof(buffer) would only cover the pointer itself
+    memset(buffer,'\0',bufsize);
+    syslog(LOG_INFO, "control_thread: waiting for packet");
+    // src_addr_len is value-result, so reset it; read at most bufsize-1 to keep a NUL
+    src_addr_len = sizeof(src_addr);
+    ct = recvfrom(fd,buffer,bufsize-1,0,(struct sockaddr*)&src_addr,&src_addr_len);
+    close(fd);
+    if (ct <= 0) { usleep(100000); continue; } // error or empty datagram: nothing to parse
+    syslog(LOG_INFO, "control_thread: received buffer string %s",buffer);
+    memset(tbuf,0,bufsize);
+    strcpy(tbuf,buffer);
+    trignum++;
+
+    // interpret buffer string: the field before the first '-' is the spectrum number
+    char * rest = buffer;
+    char * field = strtok_r(rest, "-", &rest);
+    if (!field) {
+      syslog(LOG_ERR, "control_thread: ignoring malformed packet");
+      continue;
+    }
+    tmps = (uint64_t)(strtoull(field,&endptr,0));
+    if (!dump_pending) {
+      // publish specnum and the footer before raising dump_pending
+      specnum = tmps;
+      strcpy(footer_buf,tbuf);
+      syslog(LOG_INFO, "control_thread: received command to dump at %lu",specnum);
+      dump_pending = 1;
+    }
+    else
+      syslog(LOG_ERR, "control_thread: BACKED UP - CANNOT dump at %lu",tmps);
+  }
+
+  freeaddrinfo(res);
+  free (buffer);
+  free (tbuf);
+
+  if (ctx->verbose)
+    syslog(LOG_INFO, "control_thread: exiting");
+  pthread_exit(NULL);
+}
+	    
+/* MAIN: start the trigger listener, connect to the input/output DADA buffers, then read
+   input blocks and, when a trigger is pending, assemble and hand off one output block. */
+int main (int argc, char *argv[]) {
+
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_trigger", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+
+  /* port for control commands */
+  int control_port = TRIGGER_CONTROL_PORT;
+
+  /* actual struct with info */
+  udpdb_t udpdb;
+  
+  // input data block HDU key
+  key_t in_key = 0x0000eaea;
+  key_t out_key = 0x0000fafa;
+
+  // command line arguments
+  int core = -1;
+  float full_pct = 0.8;
+  int arg=0;
+  int skips = 0;
+
+  while ((arg=getopt(argc,argv,"i:c:j:o:f:ds:h")) != -1)
+    {
+      switch (arg)
+	{
+	case 'i':
+	  snprintf(iP, sizeof(iP), "%s", optarg); // bounded copy into the fixed-size global
+	  break;
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog (LOG_ERR,"ERROR: -c flag requires argument\n");
+	      return EXIT_FAILURE;
+	    }
+	case 'f':
+	  if (optarg)
+	    {
+	      full_pct = atof(optarg);
+	      syslog(LOG_INFO,"Using full_pct %f",full_pct);
+	      break;
+	    }
+	  else
+	    {
+	      syslog (LOG_ERR,"ERROR: -f flag requires argument\n");
+	      return EXIT_FAILURE;
+	    }
+	case 's':
+	  if (optarg)
+	    {
+	      skips = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog (LOG_ERR,"ERROR: -s flag requires argument\n");
+	      return EXIT_FAILURE;
+	    }
+	case 'd':
+	  DEBUG=1; // plain flag: takes no argument
+	  syslog (LOG_INFO, "Will excrete all debug messages");
+	  break;
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'j':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &in_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-j flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // DADA stuff
+  
+  udpdb.verbose = DEBUG;
+  udpdb.control_port = control_port;
+  
+  // start control thread
+  int rval = 0;
+  pthread_t control_thread_id;
+  syslog(LOG_INFO, "starting control_thread()");
+  rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb);
+  if (rval != 0) {
+    syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval));
+    return -1;
+  }
+
+  syslog (LOG_INFO, "creating hdus");
+
+  // open connection to the in/read DBs
+  
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output dada buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+    syslog (LOG_ERR,"could not lock4 to eada buffer");
+    return EXIT_FAILURE;
+  }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      syslog(LOG_INFO,"binding to core %d", core);
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+    }
+
+  int observation_complete=0;
+  
+  // more DADA stuff - deal with headers
+  
+  uint64_t header_size = 0;
+
+  // read the header from the input HDU
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "main: could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  // now write the output DADA header
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  // copy the in header to the out header
+  memcpy (header_out, header_in, header_size);
+
+  // mark the input header as cleared
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared [input]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  // mark the output header buffer as filled
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  // stuff for writing data
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  uint64_t specs_per_block = 2048;
+  uint64_t specs_per_out = 2048*NOUTBLOCKS;
+  uint64_t current_specnum = 0; // updates with each dada block read
+  uint64_t start_byte, bytes_to_copy, bytes_copied=0;
+  char * out_data = (char *)malloc(sizeof(char)*block_out);
+  if (!out_data) {
+    syslog(LOG_ERR, "main: could not allocate output buffer");
+    dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+    return EXIT_FAILURE;
+  }
+  char * in_data;
+  uint64_t written=0;
+  uint64_t block_id, bytes_read=0;
+  FILE *ofile;
+  ofile = fopen("/home/ubuntu/data/dumps.dat","w");
+  if (ofile) fprintf(ofile,"starting...\n");
+  if (ofile) fclose(ofile);
+
+  // thread for copying data
+  struct cdata cstruct;
+  cstruct.in = out_data;
+  cstruct.hdu_out = hdu_out;  
+  rval = 0;  
+  pthread_t copy_thread_id;
+  syslog(LOG_INFO, "starting copy_thread()");
+  rval = pthread_create (&copy_thread_id, 0, (void *) copy_thread, (void *) &cstruct);
+  if (rval != 0) {
+    syslog(LOG_ERR, "Error creating copy_thread: %s", strerror(rval));
+    return -1;
+  }
+
+  // main reading loop
+  float pc_full = 0.;
+  int block_count = 0;
+  syslog(LOG_INFO, "main: starting observation");
+
+  while (!observation_complete) {
+
+       // read a DADA block
+      in_data = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+      if (!in_data) {
+	syslog(LOG_ERR, "main: could not open block for reading");
+	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+	return EXIT_FAILURE;
+      }
+    
+      // add delay
+      // only proceed once the input data block is at least full_pct full
+      while (pc_full < full_pct) {
+	pc_full = ipcio_percent_full(hdu_in->data_block);
+	usleep(100);
+      }
+      pc_full = 0.;
+      
+      // check for dump_pending
+      if (dump_pending) {
+
+	// look after hand trigger
+	if (specnum==0) {
+
+	  specnum = current_specnum + 100;
+	}
+	
+	// if this is the first block to dump
+	if (specnum >= current_specnum && specnum < current_specnum+specs_per_block) {
+
+	  dumping = 1;
+	  
+	  // find start byte and bytes to copy
+	  start_byte = 4608*NSNAPS*(specnum-current_specnum);
+	  bytes_to_copy = block_size-start_byte;
+	  
+	  // do copy
+	  memcpy(out_data, in_data+start_byte, bytes_to_copy);
+	  //written = ipcio_write (hdu_out->data_block, in_data+start_byte, bytes_to_copy);
+	  bytes_copied = bytes_to_copy;
+	  
+	}
+
+	// if this is one of the middle blocks to dump from
+	if (specnum < current_specnum && specnum + specs_per_out > current_specnum + specs_per_block && dumping==1) {
+
+	  // do copy
+	  memcpy(out_data + bytes_copied, in_data, block_size);
+	  //written = ipcio_write (hdu_out->data_block, in_data, block_size);
+	  bytes_copied += block_size;
+
+	}
+
+	// if this is the last block to dump from
+	if (specnum + specs_per_out > current_specnum && specnum + specs_per_out <= current_specnum + specs_per_block && dumping==1) {	  
+
+	  // find start byte and bytes to copy
+	  bytes_to_copy = block_out-bytes_copied;
+
+	  // do copy
+	  memcpy(out_data+bytes_copied, in_data, bytes_to_copy);
+	  //written = ipcio_write (hdu_out->data_block, in_data, bytes_to_copy);
+
+	  // DO THE WRITING
+	  /*written = ipcio_write (hdu_out->data_block, out_data, block_out);
+
+	  if (written < block_out)
+	    {
+	      syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+	      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+	      return EXIT_FAILURE;
+	    }
+	  */
+
+	  // DO writing using thread
+	  docopy = 1;
+	  
+	  syslog(LOG_INFO, "written trigger from specnum %lu TRIGNUM%d DUMPNUM%d %s", specnum, trignum-1, dumpnum, footer_buf);
+	  ofile = fopen("/home/ubuntu/data/dumps.dat","a");
+	  if (ofile) fprintf(ofile,"written trigger from specnum %lu TRIGNUM%d DUMPNUM%d %s\n", specnum, trignum-1, dumpnum, footer_buf);
+	  if (ofile) fclose(ofile);
+	  
+	  dumpnum++;
+	  
+	  // reset
+	  bytes_copied = 0;
+	}
+
+	// if trigger arrived too late (sum form avoids unsigned wrap-around while current_specnum is small)
+	if (specnum + specs_per_block < current_specnum && dumping==0 && dump_pending==1) {
+	  syslog(LOG_INFO, "trigger arrived too late: specnum %lu, current_specnum %lu",specnum,current_specnum);
+
+	  bytes_copied=0;
+	  dump_pending=0;
+
+	}
+
+      }
+
+      // update current spec
+      syslog(LOG_INFO,"current_specnum %lu",current_specnum);
+      if (block_count < skips) {
+	block_count++;
+      }
+      else
+	current_specnum += specs_per_block;
+      
+      // for exiting
+      if (bytes_read < block_size) {
+	observation_complete = 1;
+	syslog(LOG_INFO, "main: finished, with bytes_read %lu < expected %lu\n", bytes_read, block_size);
+      }
+
+      // close block for reading
+      ipcio_close_block_read (hdu_in->data_block, bytes_read);
+
+  }
+
+
+  // close threads
+  syslog(LOG_INFO, "joining control_thread");
+  quit_threads = 1;
+  void* result=0;
+  pthread_join (control_thread_id, &result);
+  result=0;
+  pthread_join (copy_thread_id, &result);
+
+  free(out_data);
+  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+  return EXIT_SUCCESS;
+}
diff --git a/legacy/dsaX_wrangle b/legacy/dsaX_wrangle
new file mode 100755
index 0000000000000000000000000000000000000000..f839b14c334758201c3b8885fb58a899eb6e804d
GIT binary patch
literal 99600
zcmeEvdtg-6@&C<>gaFxypixm*1q})q)L2l^L|NTvz!VAMgNBf7NHipAvVmZw21Af_
z-Ab)m>J#guzG_h`B1H)ZBwDMnT8&CID(bGG8WlBKbbp`EoO2%=vS{1yAHP3b*gf~m
z%$YMYXU?40y*F2S3eQSRNHEM#l5v{BT-%`vlBW@wSJ*Up#w25~k!OrHjxq)TpNy|b
z*FWvk#mqDfwEQID%W{(vncF)ZrND-nnzb`c(YS$<`~Ie&W@;F20k9lpP1_+^q;9z|
z(<};uPnN~pEqRK^a#^}umM&+ehL+z<P5sD@-&q>pKb-~~KW0i=ev01rm-Bk{ddcpU
zj?jc=YL@#c%2CcA`pMLUGj)CO?9#`E!pBU_b~wJeYW~S59ba8OzPhTWv1xqMq?5;=
zeA0x5+6gC!-sDf(vu6~uxU{!vD#|=U!-ya9MfW=oWcI!=dG3h5509Gh@C(akf5B>}
z;A^&<yOo&{iG-bo@9FqvAA0SmZSOT7GWoOMsK<W))o_5~emtmrAcE84z;}YvK=g%i
z;QImY#<%~c3JMIw=W`Ga1ivm0{hgp^9sNIt#-V>6gah&aTO9hiap=?H(7zG~zA6rU
zS{(S<aq2xRPP@DthffgnZt&{=IUo-G$~f?aU@(x~M#rJQDGvPcIDDRt1OF%vpReQ4
zFN*`eI}V@1IPhEJ@cCyP_$_hZ_r-xf7^mKo<FxOEap<?j;s2*N@XO=CZ;Qh}Hx4`}
zj-0=V!~a)t;BUs^-w+2r9(>&R_Wy9G9>_mO#*uSg9QscIKe9h_<fq1=&j9~{;?E;-
z=s6w^B<C{F?`LEg?ryi#ZjP6`kQ;7f7`-<N*f;}p{A}0sdHv`&X?WH<Zju=$9|Pl3
zoxif)S6a@&D90!%m^Hnm+*j{gP}Sh~)z6xKW_4|iZ&vC2YM;Ulh+%o7q{LTWUsGFB
zU0YV_ud1yvsvCU1%UDAhd7lh|27g^y-I9{B%FB#~%7qJje!%MLt7`le0`oOh`Hcnz
zFe+;6d^JXGoxiH4+^7TV+Iki$Wtpmm+Oo-$OBx!=N^2^N2H(Omf3>hLExU}IN-9dL
zs!2_FU8BFOvJ{1V)xI+DuUuH}Hz08}1T`A`_2rFqC|lal;H&rRg8q7cEz0}rt096_
z-6CBd1*@)IKs^34DQl=BUBi+Fz(p142GNOw0t<b{qJ|o+S4l~QL*)j4d2OTLSO^vB
zmKYUf)wK;iW1%G*ENQU(f`!KX%YF5=sHDulq|R3|pJt|YO6w^ITCCncosCuHMuoCU
z#p3!Zzt7N>84E8HIZ(c-w4|b{rnI{1ax`g)Hn#{3cH{%0mAOkw*m!8QQdnkTX;lqP
zU0Y{B&$3!&c(!3hz0YTyU086&nI$JoIAKDrb$z0B^%Lvr#0e)^*Awl_lOW21g|#)h
zvJ&`cqWt5`MqTkY8Q&xgH?N3K!K^bCDZl(T2;XE)X_xMQ#TlRzmL}yA0Mp$GDgPM;
z=Vxww8xO7;%-PvF=&$@FphOu`bAGsgRn;)|uGzp7cA(@_jd?onKZxZA8JFt(4Zpq#
zLZldrbbf@;CmYLjUe%Xq)o0GX&HmeV7^Og6=KNe`jb;r`>vjvoq&M?f{oqaORKd=-
z6+JCuP3>{r<$&+6NaeTN0UzmrhaK<}9q?WU{3i~0p96lH18&AK%Ho8Z@sM!Nam`Q8
zTW+~b#V^~<{7iDdVOr~x=YZ>}yWqSIxVdH^qR0Vvt{>(&;H=yH%yYoaHXyFj0Z%uH
zk=8ljLmcoX2Yjdl-t2(G@vP5E2Ry@q8pbLI+~t6;cEC+vCVGtn?i|nVbHJVB)>;Rg
zYZmjf-T^<r1R`y7zz=l5I~?#V2fWh(Kga>^a=;IEz`GsrLmcq113t<D?{&Zrb-?=^
z@WUK%L;Ee;{BQ?6%>j2i;F%8i5e|5k1Ae3f?smYBa=^14@X-!<jsrf%0iWc6AMJqW
zIpD`Q;9dtj+W{|fz{fh^a~$wv9q@S$_;C(+r2{_B0k3nwIj1u}O%C`36Nt3g0YBaW
zU+IA7IN+-s@QDuiY6tuT2YihKev$)zp97xjfUkAHPj<l9JK&QX@HPkh6bHP+0YB9N
z_qOax^9ECj|K>Klt!@6qXs5SjYuYmwMKt&Fzq*a+m@Dw>Iy?_o%vTa0>5k$vrk;6*
z#z?2+FJ+z~G14aa`OGsEM%GIHBIX$aBWol-lX-^S$STR7!#qP_q*?N3FwgD~sgwLE
z%ro>w=1Kkp<}n5uks`?-%X~WXd6GYpd4|47j^qz!o*^&dmi%z$8R{aLl22!zAueJ_
zK9PBbwn*=H0F3$OIpi7ABHfbzjCqE#NT=jKVxA!^(kA(LndcA}Su6QBm}e-9tdacR
zndeX!Sta@Bm}f|fG)sO9^9*H?I>~Qfo*^tUPx6m4&(IYqlKcbAGh{{bB>#Kn8LA>V
zlE0aGhNy^J^4BxZ&=kp({58xU#=If<tC(jfiu8U<{V!+U&3w1y>zQZhiF8W-Qsx<Q
zB5jhN&pbm-WUb^cVxA!;vPSYVnP+H;tdjgW%rm4!nk9b*^9&`CI?12HJVQuip5#wp
zo}nXBB>7{RKbHAC$sfu5am?pP{$S=AA|h_d4`-gCA(AQibmkcnB8KD>nP(`7^zM=N
ze-?R$fJnFGKVzO<Khi1rkC<ndkF-htUFJDuiLABCSFA!_KKU2q&)(3C%lzW}S^0rI
z#pioNQ#UR*3~%6zr-uOU4IMgX1(IdE`jGQXjOIT@w=DMtlgDHeA6Py*jC5jKxEt*i
z?m;Eq(8v{F5!n1R`{l&8P3&HUA^+%0!h4{TH#BYZy!L6MFL~nu!$3OkjSb{8ZFE}2
zDir-#YX5jz;KT4~;Mmd^buFI^MlIp8mcGoz|MZq6w|En}qTP%KpuZdRQqd;%+KFxX
zTpJqU>rqqBT5oXbL>SQ93MQ{6qnhBX(P`dL?ja}ykq-q?OJ9P2r8hWiFzG@glLey~
zGHiqkK(n4KdnQA^;M8|uP~o=}{0ak~^{jOr4!*-)>R|b)wC0AbAQu+lAy8prn>To^
zSgG85;!rW0tMyb!>s_{$R=s#`{w4Wy^M96K@(cvXUuNpVs57yxJ>?E?7M-4kaU-vT
zV}<K*h=~v6`Kc+-!w`2vI?H?+-udrnG(WH{@(_qUt^aZbyg-8a_7;=5GFJF9u!JE3
z6^0Q+19=0_cmwU>Z;|x|-VXa07)EOw%01s~7_OCJ#8ralR%`G-1^g1g;Nej`<`o3C
zlE;(C7NQYzOdefAd^d;*-ms87Tq{H58T>`27yeP{g*#M210y^eT>>M5H?R%0kA>8(
z<zuM=D8v6k_YkRG5RUr+UJ$%ubY^%d;sy%2mY1Sw*wmHYz{iy3{coaCA@M@V%J7j#
zXa?^KgHDPN8kuGpeh?&pLg&GVs4y)&216N(rxgUY(@`_S`vZa^DJW79Xe$hSDy>ai
zmUcrAS|I}rHW|h<2KA}~QzIONB%6%O6Hsg$$@9WX00FCcSAk<-oA5!Gp$Op@1T}4R
z)3m@9qfY{lH1hBSzQQF#fH6Jr0hwg8OV#np&8u==dA*_Kqnkm)%%q;57DE24g5WRH
z3IiV(2EGZOQ2@~=x4W+HNI>73dfIk3Y}|T3TPIi!X3$&pq!v`0P#E|iJaIa3r<RY-
zbY109O@X0%7Al8L-QkLJAz5n_jqh51kFf07CM+kDCF}e!{7Y!nGWBHzPX@dNY9+W@
zIg$pUQfu3^(7XiK^7TNW<q89zgpWf@P78ET3zUz}X=!VAwVni8FLarNUVID8kB}68
z^=mY5VC%N8FQBhp4XBtARZ7R*dgtXAO?%2HXlYLn`xh*WHvbH3-xn!UdS5hZnlthm
zx(hhW{w3RAcx-~EM%aT2gjM7O^p|On>sd5sYg-fSJ+ic!B1RNBLmgaa1PVm1Rk83G
zC!Xwp-;G*(*xQ?~G&`imya5cNNy}C@9BgX!6*1A|DNBwA=$oOD4Jh8T2^Y(Gu?`o{
zLH6l^-L~+kZ3TFu1=)3j5Kw7SsW^O#f~KzGbQe;9Q0@zWLx4|^gclLi+Ex%~cP+0#
z+p^i=U%62r3j>TFqrYMe(eTDEqfu!F)k4EYL7ENa&{^b>ino1p5nMDppB8#ehJ%qE
z;G+aqAus$7u#3E{khk?mt|#&VAcKXB<s)zDN8p*bGkgPRv^#_>G}TiMu?!Azj}fl1
zsB$j_l?)tCf`XQ<3E{CI+QhILzT*qHS<9aE#(N@Ns%ab1Jhr7HzmZ_yKCKnhuu@A~
zf_JKT+Ai(Qb74F>8xkXODS5z7dEukMxG?Y~+ouO^FtIK2S6BzCT?6yk{9$L>H?kDf
zgzHhJV%5Zd6$D=TnK$sRw`FH<(X7Hy${S1AZo|qXnK}wB7ajTWX1B51b@()C(Xm_H
zMzJ@Pas*QE<UViUeg8q2f^Pv0r{KfpM!PE}nB&XbXE@5*d3Z#GgrVGbkraE!euwhj
z&EF+?1HInO;nTed+r2M+=g$JefMz(94CC=@p1K)iM&t3`mZ>GYDyBR`F|1vRTzEQc
zANE3p?J2)QK7ryi8~A+w3|K-RlFyA)fI{U9Lmb~*cD7+25UuYcno}IkZVCe5=Fi0R
z<_?afo{4|=2Ij734s%&=ps{2Ar{8*ONA6mR2J&oW6#^K42wDJ?7~`7>3~cd+ya^NA
zOnoMA@n=grPwr^^+}qN3x+gSictT^>w!Xy|=U-$F>vIc2^ZTC351o`)Sk}1J8(i8E
z8nq}tu%lq}2Z{NC*Yh{;99)?2RsQC8Qm2>gI4`vEHVQQ(lzbbjESwxYH{?$-8b%hP
zAh18dejA|QxvqK}Xb*)k=09?UH#2#K$oz)<zB~WXrQXGttVj^lv=xbxFJF-)*e&xv
z^LZC<C2r2pQ~_<%pilkYIc-Gjmz-MA(w0}Stgj70qw!V`1{7X-Lcgn%ATYOMdRg09
zp`o`STpRhD-%Bjm+?_bR?9FpR8&4#mh+UZESwFq(+390nrzdUuCNtIB(w@BSn>mBF
zeY4-$dk~xFB>&3WvOQC>g_E~8zPZe^wHXye?t-6helHCi)=mo*9s<^i4cZ-i25CNL
z?}^9g4U^GDU}XBo)j7t{q=LYw-ev7M$-|PsXxL)(;)0O3voP?2H?+(!cJJ{9K2!8S
z8{QNnEfKwW3q+jLZ0z|iy6u|_H>1)sdVy|E?)w&iInBuq#BT58&5h4^TQ+Awv$O&@
z$GtG6yI|R7wrt_#_QvZzm!5_BcHmoY0IstuE&NECj42m;F}|#sOTT+NT;>zlfm{|N
zfAQSN3Sc1Ia{2~3;>aTwA#x?*S^@=P#OluwJ1p`GNN)9K(7)8!FQ!z2z9*>k<vjRl
zHv|7?AhYMOkuyJ!M%kMnXpT3qG#qH$=?xTj1*Ug<gT-CmK&KaN?@j3Rrew<C?dkKj
zEDalt;}Hq2dkP$bP-pXo!%aC+L)tG8Q+rVPdUW4t<S$ew=n12PB?pVawy`hh*@?;9
zuQAkm(ea226uBi0=D8898jl6{i8l8`9o(P09Na@D_nvtWr^{%}v`Po_vb_OMmz58#
z62Vm^lN7j!X_q&_tUHh3c2uGzZO`-adOd%heTL>aANH99`*Z{vw+4#a+C4jn^R|1!
zOxr!(D%r^d5_Kp8)ogS_burLkKuANvK*vyp=RP$J^fNRv1nBTm`2dUoVS+}mc<a;b
ze=yW1A4j8Nr;b1ore>awU|w2l+mds_Ujh*%#UH`Ui#aSy!W1cA{6#7bPTz_ikss)I
zlAI9S|HjcH&@PIFSAK#Sl4l1SNeVXivAQ;CaPWK*Xw^I&XdW7v5@xb_4#z<B033D0
z6^iZcYEW8(B>XO9YWIjCA~zFFnTmG?r>#ncM}fu6%;5B$-p%hMdPAeqsU;!`br?)7
zlgw&R#Jl-jdR`Fou_fn)(@kAB_u9IG?=137N$#&OnL^<mO%8>BE24?QEuNhTw$hV5
zJ6%_uuk8ROvta{_eC^@KQNp%CwiYhiw86bCu)(#erkhz)XzE%d;kW*!OzLEHX$-3z
z%__%abun4}O0hamv1%l%-^8*?q*RB;u)>%uEt6%k$|S2*idC{=HHECaV5L1Z@-or7
z96KdV9U7{8QJpDWo{3^B=+?^)E(Q6ANoc)O(0UN(S*#kpR?B6pGQbNv`b>@qVuwrc
z1`<s>TJTQx5;)eW;iGnGmwrQ(+>Ex(e$tJ~(IL9gpe>78nfBoAAl4q-lhxwsP9P2Z
zP3%5-i|eX`(56Ap4n)mK7tfWBjDCzxwzMzMxEpZ}foD3_AfDdf(q3;+VqQ21OwiMp
z|0Fv`egG4xcCL?gAVB#dl<x{OzJl`T8PnTPzH?e&`bI>9UeGZr^f7Js>{^F;Z*cl9
znl6X}3_9R;EtY@qX@=0k^`aX?=n*JlMu=^Q{o(u2UMhrMmJF^e%m~3w6eyxEn)x>5
z*<_Lr7Iy|#v3?WK-i%RB_8d(Cd#*HbVAhKP(Cuwc?xe{5;MsBDIav30%9^!bfl2)k
z64iTU_~6LD-AM4ZczW@(v^UkYVi8*>rSu6*06oe$&@Wj0N+8oTVb2l4l>AscMD7bt
ze<e_ee&6EhBd@;HMO~I|laxpj;#u={v_Q(Y>zsAuiP{ofwgf$|z#m1Zpr_Y+s;8s=
zX-ld0yu28zwC7FgmoIYW^{`<Hf1bq%GQ-U2{j4L6NyGv15To0ttx|TjReeCXw&!JA
zRh#wF1KRU!k=pa-m^AHqT#HBH9O90iE=-BwiAP{LqCIiT<CZ77QOfj0+=yU=59={K
z@g1}x>&FQ6BzbQ@-guf<;P!GA=QQHiew!QjI8AO-$*uDvliP1JH~Mu@h9B1^gaVtr
zlmq7_Hs^zF&fHBkIqy%-i!9FESTx%ql4)0yN%ye^7Nr6WY^so{DjJ8b`N*G@L0BM^
z`(6VWnsH}F*wo-9l+~>qmo-fU9UiX~>x>2K*)gm;oUA|pP_yn?vJbvw`80w8`OdZZ
ze)pJD<h4$|w^)3SkZu$7bb2Ry`dnANhdDHBYL~6Z_26koIz~#)2-`h-)}hK^@tz>2
zhX}BNmo|`)F0={jRWv$hY**^)5zLMR)@+(O$Ps?iG+9px8Ng4)e}mKaFuQ@ysa*0`
zSP5%>$J_jlaPaezU(n*`B7?Znn@)f*p=YhHKdOubDMvfF9!##eCf7(Q^fk-&%n*?|
zFHqi-#E2#%mVZJV6fD#9zGyNOvx4{Q!8{91ph6Z6AfX%_(FpZxEGAf}!;ka`1|B<i
z>Y0Sy-&vH?28$5$_V(~!Y{DF9MW-YoCj}N^774Gn39}r8oYz@|nIx>T2{RppoS*yM
znmrt(J-iBpXff(JhI;0FV9D$@3)+)w9f<~TG}GMep50Q+YO!M;v0BXO2VFl$Wi6;o
z4U+j7<tUa+j+-#J%BA?J?%h&n_$9kp)|10B<VoC~mqsttwf8L6DrN?YcYDkJZp932
z4o;1O+Pf~FT~{IN8ma3-%f#|+mjSxPvs-4RK)Zu7hzC8pF^{>r4uYBH>G_4Op3_kO
z!B)oMnzpsY1)L!d;-9RGWW~fnr|{wFc9T9>2f63c7i0)hZiV(&uIsw0mZB-B2o&SK
zW8T%sSohuAEBCx(5$@Y7U$zM?uMFlTLoIYpws>-kBFo>ieDm8{C~^bxB5iw~?VEYY
zZ1`A@L>E}v2g<&n!Gk2u)z&;bpc`%`-#y@~-0(X1u9V{l2@xf<nZwH?^JNTRu$*ag
z%Rvz{Sl&Wz51ZV2rjeDnLmpbY5gTZpzINO(hE3sgb&bWN)JGzGi(0x<)rtiZ*P^00
zO0nC11uo0gQhWO!EzQO_WI3C%9BIjNQH)xMkJhO9MyaLNFKkvPI9QD)tJmNSVx%6%
zZ0JEAP6-_*PjcPRX3f}R8Y9V1^X+ff<J^1LU4wdR``1HzIqPZJ6~i%EIBtH(GD5CH
z@-5_eXn&4~H8C79PC_L{T!ci-BRQ80${duiIo`2bxRnxMTN0x>PXn7!@ICCtW(kE?
zC9u_OSs{<>4s2<H57*Ip`p??*HiQ`=^zxe|^xj1OWb-mv#;Y#bBY5dSrEe(rK&S|h
z5DSIt-!?t@6%|k^&Lj*|or_LVLV72`{=d<sF{8xn=vy56SsZdK4%aIV5V}4556&~d
zrQuW&9gt@T32=pXRt&SSli7Q3nc{eCadN<@!xAS49Fi>#9T4X&Cx<&N4oTpkZG(Ly
zWk7`CozC#Cri6?KW}p4`XX4z)p_bbp5R<V;DEEG6B~z?QZiiFr-GyE=hVPubW$|^&
zc#aMVvlJC6F(mEvhR#73hCu@(bo*NHY{0SX6F3`0qQMBQ1DF)|ZeO0rB^F~r7hI2w
z)zU7QOvelNn5_p^m~es8@P73vT!}8p5@=PpXF{iIU1rOcgq4|y_hF7ks=CRZuq=#_
zE6P(Pyt<Cf5IVWTa`7;+?YQ^UlqJuN1q!pK?JH$6=AR3~u%7TkCD|4#dscRa5;}s0
zw1l|@)`Z<oJu5uuem9hayKUaUDWn-V&%rK4xY2Beo`VN!gR7lEKzCW}pLzDQEZvjn
zABx{({~-J(HKxjOQBR5#uzFw)dmxtesFst%UP+Z7xnW~vNqF|Ha36$L4jPx;XGuHW
zA?@>*LE83z!nesI@>8@N&BpGU-1t*vU&eE99<Nnvsrm6IINpOL$lGP}KHkB54S8Q?
z@%{oRFoAIB#+(WS&&hT)e{W<J2((f%Iq#CjDBcyPTd*ae;|CX8EpqiA2Gr3@Io`u|
zo9Nh6*W%elf9f-P#V+)URZ3NBwNB^zh|VWgb$~Hcxmst#u9hmHBH@Nk1hpg>vXkJn
z>RmlOS39?QV;f320w%Aa!)_cl>UMhO-T`$OeJOM}oH{ThL^df~q7#TMk;d4vZ@Z_*
z=<(q<!89l*Tb4o14b2Th!RKjZNHI=$%A0J}#$+<kcIs6t;>n&}^?zwz+G8{h``ne8
zaykdn!=~YmLV)}?T3Q9I9Uxd?I+nQazzQAGPhO>}55A$>p`TSaoS2?$?^{7U@kf??
zagSxB+wZZAbSyQHcoUj>9Tv^uZ&-mC=|mm%W$bA@%pqbP_ByctSCpr`KHVH1>`_zo
z0RM0lqLCTsJ6$cKP?2fS`A&m6=H-sSO|c;Ov<t;U*;C%X+Y;QlX)uU_=Ual~?hV`c
zdiqT#4XG-1yi@THv2`=T!K{ung|s3yoRJaSLgW=WqO%!Ki@eHvd-V`NUG15hH88Z}
z4u9PcFp;^<)3UVFkeQ(Pa4C=b%_vR(@8^<QJob^8=x!FV1<5l_T%yWwi84SCpmoi#
z{LTed5V-0t%fw?HzVY$`nE0(O?Hdfs5(KQFQG$TDgLceiyVg+Wz*CiQrgDrd`gSx=
zA8wiQGy@0EY$cNZrDo=X?7H2+n|^sU>z;4c9r;w;O%=_Xhx^ZDR{z}ZEx9K;$~Lg<
z(N@{Von@U9d+lpovn^j+VlzC(!SEt7{Kso%Cn*QRcF%GUm`UVWTn9bNu@|^)EjyCd
zHu7u0waRc4Y>+lHGIKjIIIA;_C4wm7oxG)?VtebAC<u-#$++jBpl4-pR_m&ur+M<0
z##Dt(Xz{E}0OC~7^7^-%pOONN*zgj=#Hk=KYuj4Xi@E|}*Wzi0Bir=4tDp|o)jUOM
z@hp$Q%g)|p&vMsQhXQM=b+xG5>|Gt4*19U-xz^?gzq{62Z$HVCRkeuXG#>rBx>#&I
zIsDbj$_a+}PeC6~xYKG#XCF_mfZgZ5YTBJ+H`y!k14Z*3$^^wL8b{fZSgUZ+Z3d**
zI8DlxNMMas?-VdMoo|Z|^}hcP)A6yLU>VFCyixpj<~-Gqqj5NvhUkw~k<hdoJiY1}
zW@T|-fYSr|mp=dw(3xf~{8TmO+K+>Qb$6maCA>durh8hy2~dhmfCk9~C^gOmDAQaO
zb6|J^=Fp>O>m8O2MmucqT{&#<&MT(v`Oq%I5R8ZQ`XTx<z*E>haXjTHTk09IPq0}!
zr+F`v<*zK3&xmDMTuvy-+PhjGS#t`>1vtB?xmlWXz7?=uy4{l5IeYp&D;RH9(Bo55
z>Mk#=_yhIdpJ}mLVY70^zN^XVT^L-(MOowNMK75SxDl3TuJHoJ&ttj_3v=H`O*HlJ
zjIt8=KhI$O{&Y2#?!jzAFlsD?K@`>S@7PGL<-bt2$Hx7<fM;jW^L+e?H#T=!uNmXN
z{i*25Ho5&a%Vftnn)$^t==sgxp(lVw<mXxx#^)4Q>-&IFU%k-cX@}EYDW(jbSTiU;
zm-!OHoUF@Qw#XTXjS*TNnz+5e5uL&q$H`bXv}{gy47(N?rbDs->is^QrERGI+IFz5
z^8>e9Iy)!uL#Xp0OXsCBJ~)&_a7Lf>V&N6J6P)`k$#HL_K9u+&W=T^P?9$6Rr}8wO
zSGD`z8q+xTaxKrRZfM%#T5*GJ5_Gye%tSbUP>i&z^(;~>W$x1FTBnwl%iH{1nb;C*
z9pQxBV2FWZU{Wk((gcYySqe_WWC`OCH?h#zgCRuE(VWxPvY(tYS-DA!z!*_h{G5G&
z0tQ7>XoTGpxpjk=c^vnSWc9kB!jF~x7B#nV*a$X2izsfmj>V&-7v2)HI1Xc}+KGFV
zo0Y}7P-77nQ4X8TWaXB(szF)RFo5o36vk?+Ny<%*^I#^(IEa~JW)C`u1jHO~3|tk3
z^1<S;waUo^l*gRbv|feiKw~c=Kfazm=i>1uOcwV=iE!yv%gkz<lH)|6u^YT#IoaQW
zrFd;=fv%X$<D~AzsS*Nu&KQtjtcOfc81s79pO6ui!N@WcbW7Gd3D%-JX3Y5xUITv{
z>}_vjjFL@_J`P4>aaRZiBMTv;i-@b{>mZVK(6K1h&jJ()9PaAJdLCtE3>(DRk%$dF
zarkTw+g4MXyk9`9?ZNq;DwfuW+b>y6yREAK0n(^#1xpM!f<8{fJbiF717*Szq<7m#
zR!6V4Vvrjo<`m-z>`aBP`kOhtZc`yXEHca7_T%4L<&JQa`vc3JW0ec(a<Pru?|?$@
z$Ed*+0iIFvy>n$`NSEO)a+b}eUl(yL|2GPVUDR2MId`S5j@HjHLxijKIaL7Xda3$~
z(&?iZJthDV%Tx_4ypc1et^>S*LJ3FcrE-+B*AN8%7GH)i`%)VUQte&mh}yF-Ack(T
z4D6gk)L^$N9Qms`@E;!|j*goBrNrO`DW&^Zj*%m?0lmAcZ`}c}F1qXaa90E~cJVH=
z<@${>sg1HjsHT%@Y|htXGdX;TyJyg4c!H^Rgg4!&v<c<@^Aa-GJHqpxH9agzm1Yl?
z4aftj7&4Q1HlZgXZ*wHnB1ayD#kAZ#^R=Xy-FNSMYbjV@%{azsk+9k4TujY{eBqVP
znQ~o*I(hpK#W8o6y6uI#Y!HR@)@HxmB?Xq<B`#geP0ujy1nI4RP0Jp6K>0QeJ+SzF
zDEl-W5_WywYl*+`24&HHdzIU<cNTuuGV4P>h`aW_8k@UoVD78Q{W6RDjPK*l>4hnK
zrp^74-&nGbAqDHl5$~nsKFs3Isp&o&R=8`~A3fW$%=rUzKZ)G`)L}O4^@=+>uDDFU
zTWD;seFMz^Zs58)A3;}KwLLEn93^Vt4unaL>mpv88`S$tocHY1d+Jy*?@VY}x-)gr
zhXeJ`m<}j=!}<)_oVc@-d~eNCut(Rt%dUABYTkvKHM7W8EkCz<^fo}deKQtJqQSOF
z?B)@axe1A2h?zb{d~-KPbPGSaUG|svM^nFu4Ycrl<kk9R!S^##1A}5zWiwhqMh97p
z4%4?=S$(yaI|s+03&omufN_xry;v8^Ohj&Y)Ps+9l(2RFI|O>6FD~C^X)}>R+<?5&
zrpz30&B+Sxm2e)E%d|-zl?0oqHk;vOGu>h{YQSUh`iK7SG~r$`MA)#^(&ULb5aBuG
z{}W9#n+BWB8nS7z*!<sVlB(o7-?H!ty2ad>3}KL$*27e9fNknzh%>N7H^y32%?fev
zpCLDkCtqg;tHbEgI#|7r{mJk=OGQ~HaC(&L9|AFCb~iY7t3!{yj+5v?Hf_9fA$y7%
zlsg_dCKfrC2#(1p&tGO1Vfi{qX8t`6l$@)XbIGfATk}*c3wY9gm|Mx1Tg0VQhRy(y
zm)E1`n<VR3jh;-$aLUUY2r6SG<rkKyW07O>i-BIP7BvBuOMy=NEUB~SD5pGH3NUgW
zl63ApvBz{5yRnUkeV#rUh&Ver1kKnZ6AaFVxi;!kGiZ$U<{fI#hxhR+gUIYqW|jzt
zdsjl9Egaz$FQs9B1!7xYTOoTOJLTo<UB%O25B~jb7nVsJZWnG@`ijxSL)b3x#Vq0#
z=Ojp-2vIe2)^+S=@LC=DBS0*pZ8X{{Z|q&<E3ApUt9zE}eSCC3j?UI?Z07d<Sr!j$
z@0+8+-rVjNZkS@CvGK)xXi=J6VbZC0@u`Wvag)bXe<0eLyjq>&e>RJARM}so`d+g*
zi*#@LU(Mp=cD-=>|G_M5Thm+iz{o$qTbxCnl_LL(w>Y`|Lb(0^U=}skl+hym{!>55
zEY2d2N|FD?EKY9a!tK8`iz8mE!{Gla=5X|lA9Io&UbJo_&bS{0fF*Rq;h3pP>xknl
z(24>vrjl_t_8hrqAMwV?YchCY@7@t_8veU@gBw6qG;=n@n|~R2Jk2Y@+SN}JA<S7L
zC&nxe(jO(Xyo<*gw{T+uI|ypK5F>QaoE~msa#oKyl$eX9=hy><%z*5yCT=o`w>LMN
zm*7!pUT<$+W?x=qUV7V`SMnMk#ix0d%1O4p`R<t98b=N#SL^I5%XQSOnv*b$RKxH9
zfN63yw1)$$$PhJQH$;llKpdN`JjxY9K@mnjD0MBKmNB>U0;^c*s=Wt-e*9oE#UK3k
z;4n!mXG+X$JmqZW)aHMAn1`ct|MGa4XU8-=HnjIcL7u>7{X|eNc6am*Nh_DTVJ_^K
z#NPH^Is<dhov@3V4euxFLDlfL{nt4%Qeq;*?I|d&ra4PGtqP8@D;N#Snv>gXR?xC{
z1y>JTfm#1sHscpsEXkesb9^-&jE~)$@q~Rdb^&cxG8!i%u?`UEUFq4~yma?W|7rNG
z_fNv_V*d&Fy}&=t^)yvaIZewG%Do#Rz(3d1d&6(9(?*P(h%U`4V*5`MsibDzmxOgX
zS-V>AG;uJ!8^?P3-l><bvQ&54bQV=V9xWz~7I_AR$*hsLe52#Hhd1IxSF>1Uvv6*Z
zOeTw6@Mp!MS=aAcSG;@WDl<n`)wr=n6znNM9Wxtoh^^|_{g&1A<Lo+)0wc5Ozb!-^
zE6qB34rj@|D;Q!{V6{X{-#-RVZlLdT2T@|diBQdq1>B$S9jF+?B<|W<*Ua#Q-9x?W
z7tmT#uU~&oKAf>LVlZ@J8DX9blqv8khA?%xQYb`qc_t}CS@>A^!-Qp4BjteJ)czQ1
ze~sxBJ(=XQT?C4`Cs#e`2i5f1g;KSbB96oAN~=m;n!$X^V*iO`q_eK4J0ZXyo0R~e
z+#c+8qoi`k8GqKT+f$4p-FgMo2nSJG2UcmF94J~NV5zOd<%T6BXZ08GlA;B^GkXr7
zInL$|FuTqxz;~IV!mVeip3uc2ZRl(~;~fK-EWrfom1f|P3pUR<hU>V3jR0nY&1+CC
z_HAEO$oBAcV5438@hdIE9}Ozg@EI^dc)!O@*FH2+vBIjf$x*|tuUmW;+I*bbw;y8%
zJ^ZX`#K=D!x?F=GqJ_w@3y*UM@j44%YZYFt3#+k{7VYu|=V*D_!)JVkmY)2KzF*27
zYrB-$7dK;7W;*e7&B`?MVpr=Es93R-1+h#PX2k$naFeWkrs#&X!`7t1It4{xedn7h
zE4u`clT2skVL~xU;AqR4DmwGDJ?EEJBV;@DA1wMmW;ViVEujvH;-HZh4Ky{G*;r^+
zaDp9NUR-Kb;B1{PY`=$(Gb`v}Skl%~-K!@X!u`fBd#FRWtt|V=qgpsTcbjcLcWW&O
zO{p%B`|b;DP6s(SttO{?Oin#Ad|I0W|3qZ7qLal}xhagu_i1UgzR5*bxzZDDUK*Y0
zACBL8|4{rc_7B4E1+kkzYOZ8H^N*f`n<RP9y*>_rK|0Jfw{&qWKU#MO6SUa1eE(RG
z3MzB1P-e%wc)ai~GpKz1CwQtl){OAA@zP)pPj2G#_>-S;wd$Kt84Z@fV7xI_hhKP9
zOivt++n&&waQ>9?-Y=}ScJ^pLBnY4Wh+VUK0|=u<d-%Jrt>Smv#gB6|(p(ld9K|P^
zf;)N|X1Gl#BMQaz(!5HZ8(nI8=5G;yR0KusMU_f_MMDjauthlBAwoJuSi08M;Nh66
zlo=foP_gmW1N7{JP1ddJ-Kevh&j8C#2iv$Chp^p-pAP`ixTa7|c)C4N2UfTSULN*h
z_`-)FgCc7ZK5~x&Rnb`-?vwpkI1k?t;c7h?p&lUIW3bob>2SGTxEuyh4!95zb8yCl
zQI}iI=<Kp%aIg|?f&;1U&({&@k~FcDe0PaeGTULk{aNxftK?C-r0K);{wZW+W@iek
z-`cF4fw}K2u=*TUP(pO6cIczQ&&GqfT8@kC!lydq=wab?R^eOB!l9x%(L@biS#O&z
z0=Ci`w2rtD2X9kO)z5mHl*Z_1U~cuZUq7TJ?2)@xC|)O?Z??v6PA9-s>O~u<tnENj
zVZYcTcU#yEMQ?4;mG?XOTTV4!{3EK<J^d%KO@$XG1yg=vmmTXcQ4`Dl#!>d)+DLZm
zu=Ll92W;p|S-RL!`YF@I!Mr-$yV1*FOdY&1c*>tO6CNjpa)0Ku-%lSDuce8aPy#of
zaMHu}VAcd9CEG2H94*BMTxdH4BWx$9%(g{zI?(A9@liC7d3VR5Qq~S7TyQr*?0ru2
z9KGe^MFW~5mGzusx<Q1m2SY{ZYvM`%Q<<@*N9=X?+I*Z1@`(q05-mQ9H6OD%E|AW0
zsa^J1NN!H0-ecMAFs14&r|Po7Rck<`1h^Ylleg5%;Ym<0%2N)}>_WLOfrI7L8}Bzc
zeX7gTuRXZO!^;g7IVU-pY_)KzhrH0J>=Vk}=ww!FF{22|G_t0*4I{6K!4n3$=@APZ
zCd=Drkxp-reF~FZ>A;x(E<-J<8oQh622fir*ppael0dDq_#kzFRd?(D*{?ss<LpM@
zLgIbDQWU+>*$6+IhDOML0H%xLqxW=B!B*H!vE&>e3v+Ntt5*@)z4Vb5eLm%C%{`R+
zI2c-Ol6{}$$&o##_&Qp#qZZjhImgudDbzz~0y9Lk>9V#;EIr&V?NsR&macJ><~mKa
zhaQJQqolRyA7g09$<HkEsP<?Nj}XhsdxTW?PI<6FSq?8pDZs1XLb;bZRrLJ960--l
zO|uW_TEsKh#!cej7~-<*nV{RYsH{yuCpTwZ!vRrGSWeItqsb;i1+(9rN&SB1uq^))
z4QqyV=trCl>DNk_!kFBX(W;vHKkAiMV19yw`O#!PQ89-o(Z~$VoW|<ywTj<l7ypT)
zIA2^B{!Ez!%a~}yrHiXzV(qW3K^cl{#TZVpA+6dCYX?^09b1FCm&k(A41%&#i&ry{
zyytWW3%sX6-5QV}w+UMB^%4FWDNjkE3-NV4EuKBXSOuS;XAhR^YZbpZ-5$>ju~zL^
zO78FN)#3<NPtCD>0IwUfj;c6+QaiwAq?%IiTK=Nl#?7d8%1qbtjj<pV&%Ucu%^J#m
z{tO6b&aCga+w#)LU1p2wj>y?(7(2Td^ca%F4Ijm{*_sZhAZ5-n*z;EFTHrj+J(*>w
zGQ9g9yQ)cMRc2J<o620vmsxhxl`^7?)z1g_LJYozrb}j^v;`hC;tUiOOYf>wDT=5W
zC<`D8%FfAaccF3_kpWWV9Fp}2EuB%aV_{?)4aZX>HE(sdtUj}m6F-z`56^zh8n|z(
zRR)IYW1xoFDId;<>R;Pd?#Yv~_Q1{eeaOJwgY9%!!OvS<h57hyGIuSnM1Si^SOst&
z?t1OA+(d;6oG36tvyd4DXZl^vY<=T!ljY&ke4f4@c{hhie?ct$V*ScE=y!v;@5NDa
z$xj?yl~>(Yqm<_T3ZdMO@*JM!`Mo)p6k&Aq<8xM)0+yc#ig%b2x~U7?yk+T*gvNtS
zXu3evu!i@GaPh&m#S20jE|RO`IFTYgY%PTUgq1ELrYrD}@~UnaPZ6bjx=@K7%B^*Z
zeEpp$p(meun?UMVv{35codc#bI-LkKTN-483^POyde>|A++7Ogp5(0Jc)JSOjxqx=
z);=P=o+?WFk3_r2*ov+Rzi$t(glA|YNPfP`gPFqrQsvnJ!s<q8XioRxt;H5g<v-fl
z4tEwo6{o7aDuz3YQa|^)j&181Nh`nfQ44M;9K3~~EuXvQw1IaoOs$AI_9l7`M9+Ti
z`q}1z>wfZW)#f5@FmDdUv=7UR5DnM&d)f`_%k|!nr%gwti=>5PpXTx0;jrEh;PLJt
zIB&lvf)boO=&i7JBOeOwLxs2{F07JWOtyPI1VrvCFyGRG)@aWgK_oQ8Wu|>O)V@r!
zFO#iHUa%B=y*>QK8f72Zy9y7b3-O(AVQ<-UFa)~PbFOv6y{@PK?oGh|PS+EL>xs58
z`~zWFSdQSEQ<|54Xf!7C-ZF@p(1T}bKQ!>7u|bU^+5D>!<P%XDJLh%`m_KP*x;w*0
zaR4+?qgwFfJIj=HYN%_SOe}otp`?Sn8*alJi#)q}l3NzaEqX@N#PDqDK&kQA<PVT~
z8}95g@ZKEV8_!JP^MAWUQinJIVUH18jC>ysD)|)c^}|g=gUDn2T~1rRL=O+;-a7^M
z54<P?bNF>A1f`{}ShcP+Yy3~t(gy=IXzm}N)F`uxAJpEqQfSm<X{ip?QXSB$6Z;Bi
zA*EGPT&?cE{l91|t<~zJAEeb;W)=UrR%!jU`sNgeR^xAt*H$k>-~ZSJwN_iOTloXp
zs?Mz9Ki6t#f2~G4wYuPzcv^i6eg9jnc3~R!1GKu?tl~e{Dzm><Gq7T@JaE~~l%4le
zWa>w)@HSPH7@(saV9M}cbhMPXY9e$nN`-MFF#^aG*y1?U-j{Uc#PB_^_<s;fHc$_A
zxe-(t0aZO{!S@Vck7$xZ=g{v&X9<)?tXBHB=Ve)uKW~(M>5k%~3^ZbDb*f?fa)i!>
zR^5PUA2Wd>6V}pq=#@MRfH_BmiZEhYNCxFx)Z1_P4amB^o7hxJ($bc~>o<8VCxWs9
z<;r#q)^b*>4PmQ=Qhp;aM0`e1P5?OxP_yj<)vn0m{@~w=oHC<%2b{GG%XaaY^E20e
ztxeFe-|*h<xwaYa{rJq4DcH|k<JtnARiC+z<xggaB|dXq(ox~c-V6?&HN;#C%ye8`
zD4_e0^WcXCR}zh%%;^BwrPh*!Kf~TVnUY+Bf|<kK!gYmfLdPNu4ty4TRZ7AgqwyY;
z8-ku2@Ve*F%0xT?y5XromrV|IeD%@hZ(M<n&3%WD-4f_bXiLC1RFs4{r+3TdzC^sB
zD6nPoM~5eLzSI}M+ca-T=xhm(0h54db<ndqv~qvwv7uq@!UGhmREt$r+v6DU#%>OD
zIM@YH2MV(iG*rNIYjFDN0ZWg6<sjVs=6NMJ{nibyH6QmXxrdU|H}|E1g(+>wo4Db@
z2Oc<<$d|scz*GX2pz7xWd?98W4uMz4Zm-&Y0{T_pg|XW=EU2ic2=vk%iGf~i4r=dt
zJ~(~lhHa1i{@0caLmV;$JRLz#M^#(zr_pEv9>)i5VCBk{7Hvul?N+{0w{!Em$z#O~
zp!ci{c-nCA1T(@RIU8&b&$vmPcRjoj-=%!Ody2d!0S|beaHHjZc%}E}@SGj}91o9N
zdUjx1I}W_m8bhYN08iiXIeZ5WC&fN@UF6w8cX{F@x(<9!m%)13gWaIEC)tTN@JH4o
zJeM-U^#|hK9|%UF)VmuX)NMC7gv#8+!~F1t5EKIF%}Kse`f9l41{ZMXj7hmnrlukZ
zUx(7<h()A1kL0yE)8w@oT6paS2Tm*do_^yE>Cs`#dIg}nrG48lpMfaK#!L3}tzChl
zarTW;nV1VZr2878Y&e>nLb*Oz1kzO@34i(<EgkMV<6(=sGYqXX4pScy8`q;iWZty2
zY*!9^${Tp93u~YDR{nV<O5EPM6IcASw}$nVx4pGjqjA}1Um74|AwbjY%S`(+%f4iQ
z)wJ36WsZG0Nnf_N=HZGTZ|KG%Br=71awO{sy!N^G!S>c!f~JUb%q!1YUU^$Y9Yfv?
z{xR&b0biQxloOduJq<~C`}H(cEA_^vgIJ5(>t~m^VJOrQ`$o<z7P$*;zn&FD-o)%=
zQze*&52wWgSV}5R@^W&MDc@r$GF(a-Fu$b?>#)fO8FnEFpAjQNCf=pM2Ji+$v^lqv
z1Ku<MX?6f<ax7N~%>tBW2b3lU)D37hpfo$6G&!I-fKCFGW(Sle2Q&{*FQ7C#pfow4
zMS#u$ly*mU-lqGg-{}<5sUHL~^`p^5ziVKf4S7nxS0{>oM^V4|zh(tCc~DvsRkFcs
z{pKk-Dpkd%esv1kq@Y^A;%1C|D{ak+(Y>dywsk6AgZzEAHpOd^Ur%+QT_r}&Hsm{y
zuS32G`A+1!a5GFZrAIJiA~&f_#{jBskqN9^t)RVttdUDmDOiN_BU8CdwOw}}cDu2!
zd#(e)AbJ0Fwjr!wVHQmz#nmws5?a<n^Jq_lW)<XXwyh$nHgDH}wVfKnONS<_C~G??
zhQd+XKu-hfZCz}Z;jOidjekKCZ&*M>gmOQ`v!-z89wgxg;fm>)>QC}8%vTq?VNjSh
zc7xK5up2%f?xB?YIbxG42H{>-dwi*=mln_{xJ7&T;rm44flz1tN|RdlwK*PZ<3v|%
zD_b)&@O4JkC<NQELg_X{>E@ztzf~;yoVpz{ux`^xzj&3YTb{&S%($t*J-AN}{-K{r
z;uHA7@#&eY5qQ17sqy_<qo1gLih7q4b>0MNhH@m~T_LfAB`U*O-9=V2j7AA_vvf1*
zpGUJvGeng4E0HM8G5>T*%S|~ll{coET4j;$xodTe&cRP>d?==Gcv}O4n-mr+tlDfz
zDEVVSV_RiFtvQ5VppAs!Yd&?O-<~NqbizUL1Qs3ia<ssPH<e}18ZVYP2TAzvrkoB3
zoqeHYnQVt;UL^gaXmsVE`A+eDIHj}2JzCu`s;s8^<tpsr+5`jYW6An4nqb2hO4ggl
ziLAFH2_HRxtPjkwWOeTLzfAf;maONQUd5(qd6S~}%}2W=))WcMbV9dNwUQXs_AU7=
zN<Na)hfr=eP7M(Tb|MK^2d&nX5rT)Fy%$)LAL+32?<jd8{8d@m?ho}e2*f|@l^<@&
zooKQRW0gEbN}gGiXU8>r%QMfG$7#ullxHI>ujDaJBUU^3jeUrc!@bIWb6e{m$-Grd
z?_bKPgZL?(a)Z39Ae4JMPEKLL-y;bht0eByoed{U;vd5fCjHq;!x}|@7d8AQAR3ZB
z(;LDfQn}%xjR31tn_k{n$Y>gQl*PH1kFgsAB9C8GX!5VKI;sWx@F@plG{=y3@us7l
z<))}|JaV<lb!@`Or7O(0D|9WNtpd75c>J>>VmG8l{O5Z1CH(oIw?((y9Zby4v(Z?f
z@f~YC3W9<R?#?0)sUr6~i=2r-fo`bf;0i&PkpqNuFv1w`>E@Gt!iwNUnmQ=d)JX1+
zloB}kkOexWN8FS1xe}T_z@r4rB8svkRUuMZn17XZLyKy$jfgL3VHZRUdsou1nw$K=
zAqQu^EJdyBe+@F3_+pmAMU48xo>;_mUF}%J(D|TABg}&1OKT)$M8X%%_86K?2?-Jv
ziDRG;-HgxK*1$Lr*G$;U;yQOgt~PXUu7Ra{M4Nukf5^6o=npN#ssXVoj?FOpL50|!
z!mjs}bU8M&vX{tR>d**(A83WCAotBlJJ^2)iVE`9y^wCdiZp7a1tQ!u_gZ*_u2iZ#
z)e~Oa!Ygb0#JwNA@u|L#S=HmclH3@UB*dv}VW5>$*1~{nmte?qyT*E&U1O)Oa*Zus
z;~I;<?>5%g<{G=O(>1oS+ckD+uOnWxdQlXA!?nFN^Al@@l%+4*TZfuJECyI(uvLj6
z##$@uf>t+Zu<~JI_&GBbKq&WXx*cqkVwyoAc&aCr^}wwaJ8V~L+QD>Bg$7Fzi6gks
z15VC7P191ehbOm^o~s~{OL&v0y)_%ZS?Ejct>bWo84J3A2X54wgRD2yTEu`5Xq_av
z5UH^Q5vzt;^N@R*uT7))_b$g7hZv&eDKW>A1JIeWA#nIyXr_YO2jp)GjB{NraC<ln
z{RP#~qTW!5j8Tm?B?ZM07ClBP5(f>h21|sFabPB`K+jX9A<CP0b{BQ;M1k;&&|D1`
z3oTj<csEK-x($?1kEH!=CO3fzUm2uYqa8CMpaWY}Zx6io*~8vIhxe;a>~OM_$e7Ss
z(Qc%x8_vAJ$LTVN2`8DmEnC6S@>Y4JSKz5M;rh6&6$3Y_73BY=w{l?#o1=M0=_}|i
zwFPA}K=<cin90NZ*_5W`DJIr<By;WH$WP95BRVSzWv^mdS$s;0DW5{g_SPKyDxW$D
zR}P<=gskmTd6F|%xuQD^NuUa@vrs`m<POR&)}>xxoniLpS6L=if0GU|4d;=vY-GTf
z{+IJ_vcy=z1(Kh$WNb?1eu$eA=(oQD3p64R(fEEB5ccA-m#7A~v;<U626=-X9>72$
zd0^h+{ua%k<M{k#dN}AAs(da*5-Vi8Wgu|epkSWOE2sV(q69p<aoA`yj`jvqE}lV~
zt3ff8J5>p+Nc&JzPZl<#Am#{mD#U%2dl(qXV^x|LSo*vplJVWBnE7<UlF?0vp=kJd
zTEM)6fPb$PJj3K-+N=@pGe@WvBc*Jgu7nEZ?!-z3v`U}_C=huG?C|P5dz%lOK5nKw
z96hFofkWE@I99P&mY(eC_EX(z&6&iZ+^3z~2ZO$6wl}1QPnm>tksy@&4=3Fn@MBIw
z_yYDrH9M?qamZ&asU8M>cnMY3M(8i_R1ug{4?Lc9C57iKBX)TI8gR?NIf144+pdk>
zyahXe%?j%@d^f6<V`<zF$5SOy{dTCL8+l1VcpezZU*$Vc8lfF>`wGRt!A8uWP_B4a
zLE!6rxt;Drr>b9~q?kU!9*PMPnqS-rVjru_D;tv0#^Ufdztpun0z^Ro<3LLET-A*D
z3vI#1U3{7le|-{7^ix$^Ax)h74d&if3%Al1En73C(w-4AOUBF9U>)v^X5zy3IX6ra
z=m>mT5ttsX2o&$EfP3}xLDv%=pgf-;C3-i~Zlqs&L(@NlvE=1p%3`;oLHe)O`tM{{
zLc?dxzXyxIr}^E6<**9=V*P3+_>+}^PaO3xAH5P_{F%&mP|W+}@$-ER{u2Mf4RwHS
z>W~s?75Mv!3Ll8T2Kb}s8;FsWIZFP4UJ}Y3Cl-1;Jcq`?I_@}n&0m%ZE>q*CXq=0<
zJAmscP=ALvICb+mXob+okFe$v#^GMj@@>Nf-r&@`6>=3O*66Z<*A>MJn&KD<E#BY_
zq6>oJ2pMM@)A0x8sWk4O49`SmEiXpHm!pjR_dkoD!hF{_|1VsodnbQh5P03y@^4;;
zk|z}EKRD}s2{Cf$Rp*3~6S3Dn7+(YX{0V?PiZl)R1fB^ELB>#L0E1_QE=k<;dA=)i
z2r(JB+Aj&{!2%)!*M!ZBhO)FPyF;+JNtAyNwxDHm!slnWGDlqFn(*A>PtOTWyJ}C*
zFk9Y&KnH!cC7g^u&>97&ykxLi94?sr?Be}VaKGehKthhgdmh8z!Szo5T2&56di{RL
z7KG})b6uSojYg%u(1rM0=6}<wUzGx|pBg1^fX!fpo1x_c$ZIV(FM3bZfTkA<)!b)n
z*~pK7T(}_c8vnj<VQ9>{Y4WF)$1^=YKk#iq;LF0X{FA9^-p$`8g<Zd(qx{S7p7^Hu
z_gnFo*zt#-@dw58qa*O=2cJpsPX1@(N9by47tb~Sdhj!=>^eVx25m)tpui>mG74ML
zFv@BhtIORrwSISFO?7SAW$t=kX}P<$#$8$7SW;E9H-2$_mEUI))HeEsi(!;El+G!s
zD80<*zPPr|UsYSvFgM*jzRX=#Tkmtb-Sewz%H4JKwPn7B2DiV~eP&UyLS0-_>n`_I
zlr~oTS)knA-~(m3Z+_zf_d*btF7Qc#h9wQvwF^K|<*uqJxy-l3eX%a)p5-}zR>>KK
z=bd?O$+?~j$)(l}!D5MKdd{0U%`?-cJAs6ikg2K$y0e~is&YY9b+y~qR95eUmZi9k
z6~$qXvf7#&UzwlUl$Vy5y5~1mRQT#4omj-St5|hlB2e0N4t|inuF(&x3ClSAG?AvU
z`7A80zsy~UR)jF~rSh_BUunIsydR$hKEJ!h*W{0-y;!lGE5tGK?aj9Ww(^yu!9`qE
zq}*3p?RPJ(@>eRWH29%|dptaBF-+qwEI5CbXNJ4MU+VWs6$?wNYEE%imM-!EO{GiG
z{HAImeM8mdaDg$^)s6Batz;O-SJp1{9p5;=vBuwce0izA^!SE4Us*%Rn2HI=8HVCp
zQCdX_VSVMc(x4z#jpi>^_OTQYCth4%TC<?qXB5@fE~qbENLlOsU^0J+ySM>19aC<=
zuPdrc7oe~Fys@et(v{XPXk6&4@f%~B94%T`THoMv(^o6%YZto5G^87F-#8`Vv~e)v
zlG-Fx(qCWdhUt9`M%gJ<r_`R(P+n=+ib_v|!qV{|>-^eASkwW4YnS?~Y8J3Lw6WRQ
z!OaQ<5(6B*VTcV_rglC|un6UAYYg3_GmFjkjA;}9!n)%ZURo}#VRNpn^VPU305`@|
z7;wJ&(lW|2VZsD6=PN(LC<Qj%V3YYlt{Mrdj9GyHZsnPI2#J~eUT%LoaJ|-%cjD&t
zgXbY{v~ij7-+vfElkpvdZyLUI&7t^a;EM&WF&tmcvk$;G3*UqB<$WofKyx%r!WZMT
zyn%Eu(joW`!`Fo`_9cw{@#Wbl4!jNAj9?ss@1gh}jxRo^^QWq=tYJw_SqXacDQM9}
zrPWpCZU7M$O8m#B_ruqfm7ojMmbrcP^|kesSChso9)oZ2Ed*V5PScpCap(r)+`Jll
zia567Goj3=Dj$Cu2&>9V*el&fOvR}XVLCio`G(4cC8g!%^)>|{kX;D+$z_SHm|Q%j
zajsNq%P&stg9U7J*g9%E(IgH!W*Gx5Q;^Km*Q669?20`5H@iSV(V1rypH*_f%z{}S
zv9FVFeEON5{An<_w)gk*KmWXG`DdQ%uBfe-A(?@vy751#&+|W)r@R<1&pdC&jJ<?#
z*hvk7@eOb4>!2IZ@S3i?szHx~dwa7){sHLVH|73paK-S;r|0AomyW#!R3;vi`1;x<
z169_a0#04#uimR5>UBecfl#cfAC4nhiHt?3$fQdv>nj)KAi1v9NvC|O+^2o8p999~
zt0r+~4KmEKM~Sn}boBRFx_(|ePz_oxUk%1gUx}}_T**&W&?zjxMEeOrgGs=gj6RhZ
zRedrHvlEGA4tX?RtC*OmUXe{51M9@rFLW{}mB9W}(KFnqxfk=ggu@mA+MwTCZi*V<
zOBPnuIEw9~z9oL2t`OBVT&@cZP*4AMQ$70PG39e1TU>f8*0|3qE-chqSo8>3qM_Ns
z=9IdsLEK51&0*L6^m|onhChwyPv2jMGYdWWPCw$Lutd!RPqF6rdkJIvbqsw!=JX2{
zwQTzgKtHg7wEWU<IDLuJy-d41=rDo9il?duj*P+ca?JD^;X|Bf`O2%8m<_Nu|9;ZD
zRe=9tbULuwdOGA$eC;Vy1(!~xi!hUwzNDos_ce)Vx<1{XmF8BrS25g4=cq^N&6S@+
zcT-){PR@9&1hdWW7$L0w86(F)1<VSx+?Ky7$@|;X?gIOww?$H2jCE*w@gKeZp-Z?G
z{0LKwcyxXlbS1K>06QSRehNArSkq&1;GCFA%Q)ki#`n{kjC4<n<ud@Ct+ELEz4TTO
zi_t4a4#~(|$7VhdUz^Tms_80A>&r_tQ#Ek9v-QM@Z92|<zGv3#{KB!O{ppvd=&{nB
z&4IHD^UL$S(@VInx5w*#beu89kbOVD;u)G_jQ(-?n&et98Tba6W}EfGpRo>j?Wie#
zd;a;Us)qQ#*h7*}e2V`bL&pDoX}cu!uBz=nZC0~abLbFaP1kRlXYOf;-%6iTbk4A%
zEo*}Y;LByBJx8lAUtI5ZPjw$NrtBEABP+g&PIpVS2$F~=<D#Sf0q7hv7ZUE9AETA3
zS<#hM`Y^A9kmU&0B{DnVirlSrvu8J^xSGFAVQj?|Grv1~OxajXsrgseHuz4tKv||A
zSGXi-_d(|vLnThLJ^S+`%%Q^q>L3#AgRgTA*-tOUe`c(_j<F$D4(GZRg7{5rgTJ;e
zW}1nxt>lsR`~h~;bTJMQV~$vU7F}F(#L`0;bIPeEWTt%duM*j}KoI0)1e90;W9rx-
z3*3Fvl{MCbttQl6QRzJpU-$Ua+^R=lqJMGyOwKM%e;A0K62oWQ{`yAWI1F?C#`+o#
zI5myf?X2_5u0Y)K_Ot|%PHi8m?HQ3^>X<Uq5tO{*2m3ZejDB(GwLzw0FX0FAx6E;Y
zNv_*%Z~hnL&zw;(<Lo1_y~oD67%PXlw7fK!s<+{C1$Nlg!RM4%Cy({60qkhHZhwb}
zv4f-Cob&@Y#$JA5)5kc5X^D9BWu-No3bJDNLhjl7v0&qt12>_uaT&LqwQ0C-hfw7k
z>#!;0(+*)vYIY6F`c^aRdgGSc%QL>Gp1nLHKK<hQS{wn0EtGGFMf{aM7-w;5gS(-w
zw9Ln?kt4)C+{e*cbFmJU#i!!y@Dp>z7Oz6L+QKy*Hg0(<MzJ6F_cqht2B_C6H*joI
z_32<`ZzD1syR*&CjX}vtXR)LxTz5;bQyRpj>nF0J^ZTl+Ptg`&*CRv1VE%HfwsDNq
zSYMC*H!QZRoctSnK1>^Z3+wz#WWo;HSLvQC!kO|aI(qYI?uDgI?)jx<mo3H=mFEg-
zr(nzp*k3*g-v)Q_tTT-<7mgpZaQv9^@ngKFjG2DQnDgCZF48@eN$=_2KW)7Ev)3*^
zjk)IK6Y`n%|ESsB|IIvXAm069|4p7pS0&i~$IdN&Wy}#aCcMPjjw`LEcPzodf}DGC
z%L*mNpQetQRG(Jm1<c)FCr}G#@wMlB11{AqI>)SGAVQ1Ip68YDkYpfQieS-8h+S0W
zTRbqOMW?#bfP@b9-5i??IFsR>HnZgXf{Q#(<Je}^R&dCn=F+xY<!VRLBGebQJj(oW
z@wQ!KNUZW=C$?k^+OFSWZzn};c?sQOXOqUlJ7<L^5~+o^?CwB1@wUB%Af_B3?!oEC
zQR&9C{8{-D2ud5us;Ww~56W6wMWu!sIoZWA(wOlR@v5gW<4?$Gz(KSy#ZTOFW-%Rz
zW9YbB!Q)6PRN+`nModHvbE97G8GLU!IfubuhzP{9MNB^S)@Pvl*mQOob+X9QKqy*W
zZhPPN)-QZaXH{&J6|MNHj*4Ve&apik2boiO@<R(C5M$xZ151F$nujs4D}uM9Tq%ml
zu@3e;gvx;~t?=Ve1K96jj)Oq(Jqs!-TkE4voL(v~tZS@pz|0Nyr{oETB%L|}hjPns
zopw<)IuF-#=0>9{kyax0;=nB%ui3~!eo{p=x`z15Xf%v8jC2purmAT4B+%W6bT!gH
zUy8e?kS?u>Mt2~63h5rCe`ttCN8#|#=Z{9GAkAD9jb4KE_f6606-XB^iAHZldJxVU
zHzGYA=?<jJknTbHGSX40hLMYN%qd7$;Krg$kUoL*3Zyl-UFcS%%W(tNMx=-0*1{b~
zuf~0^dyuY2ItmAm2jL#)DM&BIGZ&X2ZAE$o(mx@+73sKNp*+&7k?ug6aUIGdJqC|z
zjl#iW`wb|M^yr&V9%(PqE0Au$L+7_5J!3V>BfSgh4x}$4-GlUaJPbbyMqh$-3exM4
zUV=2`W|T+D2iR{#dMVP4Nbf|t1L+e;_aOZn(os0{+=X-s(w(=UJkq3FQ6A~Pk=}~*
z;M-6h>G0c89_d7+dytO31LbkxnvZk}(s@WPK{{&<$|JoJ>8(h+k#0o#3DRz)SKNs)
z83(QZLh43(&hMkqJfznkorg5zu4uFwX)V$<NIQ_WA^jL>H`1niP#%Z8e?sa;y5L@v
zN1FWylt=o-{dnjC>GTJp(YujWBi)MhKBPO5j{0LXnubH(XCID6vyskN3%^8q;iJ)L
zCDMnGu0$GrEE;_XY0|oA^m(Mmu8&4{A?-mr6bH-)Ziq(5A-xo7A<_?!RwLc-FL)Lh
zX))5ZNPma46RB@YG}?<af;1Bc%$3{F50HL~v<PYQ_Gq*Y={-nSA&nwki*(dqQJ#6E
zy+|K;9_4Ykz2<Kyk8~5#BBXUMpghuJUqpGN8<4I=>ghyzq+9-u@<_8@MtK}=*C5S7
zdIi!Vq}i{aJknQRMR}xSyHFnKUy*hq&3hf?k>2?R%Hv@D&u^kU(n&i|9_gHaqTP_*
z_ZG?_&3Zc;?L*3sPK<=hXBr7jnF)ssO-);skV-ZN&*XdOMWe@&P=5xdH;mE1lYUJx
z>f`}_M&?-=S?9QhEKX}SPCxk6V^187`|?zM(~x$|L0c1`KZIR^@2b*h^kCjV;2D~3
zoSQi)aS?u9iHn*tGZGioWe!VRRGB$6anU@FD{0Z3LHU_UiHnL77p8-FIwa2@f@U=C
z#<#L88od@NKPTbK8!dMNb`1gf8JQ~*^E0wqlFrF+m!@ZAP0Ps4&q$jw<Pz)pJoF>-
z>cw|Tc{IxV3-~F-HyJl6WI_I?5o?SyGBU4AJR>9PS4n4QxI@WjWMp5PGA$!##i0C*
zNiC^!Gu9?1%}z+qn3SK9lb?}&Mur=-D04;zghg@MjIWgc0<s;BRLOScut7-&CqN;?
z4Uix3-H_o~+Eru+-qVtlv?+-s!h>&P{~h9V9!M;jUkF<8>w$j~_>Ynu_@wufNkJL9
zfNxo-|CXj%N8*zf|32W4sEJ1T)<=tf(!C-h%V(j@9;%H-uV>qt<*CjUu*A66fY
z{sU+WpV(>fp9B2M!2do5f18DG0=~E*8s$ABR{5U_TImn`xxmkK;uGl;%0BCXU%C%`
z7w~@qett~(xmNi;;6DZar_MG=DwGBg{Q*C6F&=Aiman$un*{ucz&{aVzeg?m%>n+o
zCg>M~-_OD~0pGSH8vPyQu=G#5!Ll!3j5PU*X!KH~Hr{8+w;u48E91-81^f-bm&Vk8
ziB*3e@c(FzM$d@xpQ)DrWC8va#^Ga~`uiOE1E0_uA3q29A;2$nmQQT5?B4`@CGZ!=
z;AdO-)xh5c{M9k|t1SF_;L}$`qX)&<Z-iyPF5r_^VqA-{-z}E?`hai2vz{d}_PNNi
zPZs7qTktI9b20UAv+AD&e03;3eh%;lU>>qErhJQ4z6tnW0$&<~zr@0?27VLp<DB>f
zf>!+>_`d=_&xudG*ec%z{1D84&WXW$EPNmExxmNke_5Ef9S8iPnDRfj%1`3_4)|$K
z{FSzRbAZ1G_#>QnrJl&&1pK|g|HO%}x67{v{wd(&$+sT($M%8m0-i4}kbB6?_;r!Z
zzYqBPfbVnGpZK|@UlzuK=2h{_PXhi*;Nz8_1N;S;cgCx~3HaH-$1A@Y_zBm?FTWo6
z@xaF`-vxZiZ`7kkR{S{Mp+E57?t_08;_M#acgNW06U#o6fbRr<nM;`Vt+4se0sc+k
z2l8(>cs2n)3iEMH^X&L`jw(+OV=(9Rsh|lFVAhei(vpF5eLwI&kHM=T&GK9WYy*C7
z41SJPo@;|dygVn*iFb^#qcHbB68Pm#e4=u4@}C0y3g8FQk@!o1ZwLOunDVo%`mX@q
zz#8OeC%#nKK=cQG5b)O|fWXv0snv>68-cIG+9h5e+5!C4z^`}mPkhX(e-H47uZ~7H
z$KW>#-VNBObc|8J|IHa=IZt^;!mPAW9%!m=iALwd@KBCM9p(XlC-Cw5K{N1cfS=+l
zzgXG84cHpsHv;ePFT<s_3|m1n6l<kF#@OjT%T7Ci{~gv$tugp!3!jXM+jm$q4RO|y
zG{_oDMgt$gn(2I}jEOTWJ52+A6xK}VJNeU21yY6BsS-5j-W`o5$J7zEWLOFO@%Kif
z$Hd@|v}Cvs_>=D&xUFdC4&Yw{ew<T=q+=}pVc_4xdTk&bSUzni#w)Dbu87f5HG~_m
zY~bsGKhBBQ{hc84C<M(C(8L=Ps)4_4ANXs5-wgcoh|RXYw-2JB$^RkX7d{fdKRyrq
zOTb?lW5dfW8}0%==g-mTjo@YRPh2JKNd7~y=Di2_mKgjmE&Mp((;kmT$2jrx1g-Q3
zei-nwgKqZM#A2&_HSlKxe<<c2R{cp?)|~5F;HNzqjlSpPpZJzl{vqH~vG#p127j-G
ze;)Yjf&VfF|EY!F1$^aS;`ha&ShFtx{*jpS4_W2M0pAC_Ji=({_qc^G1pfMsSkuOo
z4_f$Y;1i$X({}h+<yTnvYk^+~{L+~AYqHw!A>g0e6phY~DX$g{Y`^D$H`?OkcL9Gg
z@FcV9FOce`{f1&N8V&qIPJH4YEcwO(zZUq1W8}NvlCKc>)1K)U2a;x3eXtt%pKp!7
zCb|~*&w+1@slU#u{~_SL+v3mZp9emGy^Q^1VnBvf|1RLKXpcsJ8N+{x@E->l_gcDv
zx7RGPramuYZ9?J~SV^PFu&S12u0i@KpzrL!8p5g%TLcJuBd|VJlt1IH0Y+K)HNM=l
z>3udD^$}pNvpv>2+sk#fm+S258F}>st+8bd|7Y;YdM+AeGyM4Z(E>kO;71GmXn`Ls
z@S_EOw7`!R_|XDCTHr?u{Aht6E%1M7fi^zOj!&~rx#Z=ie_E!%@_almn{giiKiGPa
z&!2y*Qa<O-PudQ*q<rR`A6YCT<ummBWNJM3H~2B}Uwjv>Wq!3@1mieIKAoC?TWI`r
zYCN_;<#V&nV>?hjSvuwO_s$QG&G^ZB%Ppz7*6!xs6+Y&D12{&I&w)DSeI)#t@Mi3}
z@ng#E)&|4YvwY0@wOK8?e@B&mAeW-w>3pviJgW1iT)jGfsm6b;^IBY8+y7mXrQ4-H
z$91~Bx9jv(oqnLxFLgQy_jmAffKHFr>B%}hOQ*ASTA|ZLI$fdDn{;}wPS@#lyG~!#
z=?6OfQm2D-10JB$qjh?+PS4WmY@Jr<bdgS1==3I?-mBAfI^C|*S9SV<PQTRYAnkAm
z==5lvo~+ZebUIt76*^s{(-k_sNvHSfbe&GO>-1HfexOrj_5b`gt=p{&DF<#^r?O|C
zdFCnZ>|%L8i~A=Nawp`BKT*A{W#TU<PMVN&(pUvGrhr~KtLh4!+T-V%JxxwJ@v1Ac
z=jiIpgd1*&#*(AVuGD#h!pLTwN#sZs{hZ@W`raEAQa0-xbmnCvIIGT*me2<oh8~2T
z^d-J|rhu9<F(K_;D3X|%dM5gMVsh%UpiWFn-O40k&`#iz_(vrNjTws)iHRw>2mwjA
zg3F*|asWwAPQj~)jFc)|Cw+mqHE1jkuoBZ!CLm}f{S!hA8h09z&mm|Hy0)2@w?U9W
zA-OC=J%fJ7(u2}cA48Iswhh0j+?7WCJ*Yp0fF$e^8mV36l$Q29ep7Z3z@NdS-GgsB
zu?fbIagZ+Uj$OzM`2(a)TSN9kZU?KhJDYhqfduyo!Tlt7SP0gT;7=qN`Yxea2?Qtk
zAmQN4@5AM=Q7}Yi`pnCbWF+7xeHJf`jF<3}Ud-z>NLP$B=}LlT^K#g;;G3Pki1+AZ
z6vH0rP2_8290A7ZOK7d+jB|iYznt*2j3uB?zk=|g8K>Yk{RYA_GkB9r`pp;MdPK(0
zA)vZGAmd%~xuqE#ve<H)kWCtbmKbsf?cvHEa^^`Wnc0QR5Kj<fne&J}OQ1KRDTbWg
z44g5TcAmycSne1j>3X!_5N|plXwI?72t|cSk!mEpin@l5q8|(^0He%df4T|SjH3v6
zgoH*$HsHe^y@gDSl%FFJCJ*95su>1@5t>TKemr+fS%<5%H2$Wx5%@85P5G+;`J4I=
z0=Gktlx_j?H}xX|tI)0~UkH%DsoYZ@_6GnnGyZipk_<m3`a}u#Gy%Q;h6aflw-Np+
z4Vj#gh9=6`eFOS+nvs$YqO>&rrWWF7*mw|*%Gh%k%Pt0a#&?R@y97jq+3+@Wqzofr
zGr0}F0m8#q=8@g-!$_4Vm@FgZ0_4)t_?vnke)dbFN{<PUzo}0V$kXDK4gvBv_3s3(
zgkCBC6d-?7KP0dgjhgbA0QsA`hrq`mL`o_&N=xH!>VEjy?_*kdlmPjgdbC`!<x(fg
zCEu5qIz=uy<48SQF5l<nOu77qmviNkc1^96OIjthK`z(x@|SY?H(mzhl2%H+K`y`I
z<sEW46u+qt$mJ2dd>lWC$;m&1!zHFA*Fl4!5&XC!(jSN2h`PHnzqknKj1TaW`K9Vk
z&x2#;zZnbCGTLDE%ssr$%s3Ku$^4erSs8B=@*S_;8FSH0nR5D-J$wrJ7zsq=48N3_
z1j$Sq{si4RQ8IbMr?B%ROU65VCNU|JDH?t+84r@ooZ&QbW?I5+;4^RdLY7HSxC@!e
z;hC&+sOV8Q{9p=}DH;Fpo0%D&a6R}m4gW369FV|%)jWJYbvZD>gUnUK=d;qRg!{p*
zW%z8?Gb-UpWL6Hpoeah#(34jUe}VOkPhdP<J$x(hnG+<lX82#3IbJgN4gWhcIg(jB
z{B>qdNMI~mKYS<*o_V6k);4?wS?4Bf0>M@ziTAkem-+_k-v1h!{N`cj0+N-vhWcjw
z9mO;6<fW0(O^&~(=4lzbNP4dXGWcROVQJ(c(SRCAG$7Q6j6h#Bq5B*A<)Zjd-tC@w
zdII4?8)%3;bzROn^W}QTryv}DHR~KQ8%7wuQZ6UJhQkAL`7vy}|G^Yw@Vodq9*T}I
zQbr>g!n?2cKZKAW|0304q#E)ILJuc2Eo~ltQ;YF4a%dD3BaTd5C~!K?&;*!a#AtyG
z$wA#CM-x2cUPwLi1Ywl}p-28iF7G6hQ+b)hmt7u^RmjWRA^iad{fw7i!RQAZ+{nwf
z!Qp^IuIA;rxXd_=p=H?Z$R}qU&X|+&CS=Sw;x$}?<_*-Hv3MOauFQ;o<RJ27?gx8h
zye5~)#CGKodJ`{SFT$lU_(6~pD9zw92D3OZl20`Tvq}ar$tM}ZZbXfl880sZC}Su2
zyfFxuh$|!-axlewkH{hP!Hk`}96T16cjH3yJYdd&nZpJHn}H+T1BcUCM#dEII&lAO
zxJ=6!2@@PRf+YX1wJ(8>s=EHaZ(ax!62?FfCSl1CPzaikgs=%{K-92^h(Uu3<0P3Z
zl1$Rf1R|B7fOV;@@N=zDvD&({T5+q^YNcxH()y`g)z&4pwYC1bU|nikm;d+Nd(L}r
z-psWA{=eV*B=gQa_uO;OJ@?#m@7vBT4!r~APB@$<#c-$@t(g#}nJN-`0V<tPN}bUd
zA~!f;BI$H(=rN+7L~s-Otpq%RL2?-*3AEQj`Uz7=mI=jyaAXXYCZV}#VYq@OVio!$
zY7S2$bY5sO)p^8^kUKN<eZn6}a82lDNE)6=@Pbec;VTJl2$A0m&!U<eL-QeB_$b1!
z4Q+#T;VO#Z(Hwe-;Obhy*M~kNKC=n`Q0S+qJzPVr{2)y4!Gz}!URmK{G$%Zd5)D=<
zzJpfbg{K2`p%rdJqr!_R=Ta*?lK37+iKngbB+6Mvf&>GqfEq>_(xD_22_R{rTk59m
z4ag5KuY|V7cZN^=2sxo+sb!5ss6tl|yprI&5X~RqRRjk^2@-TQkr#(3(rWl*g2SQt
zB+Mx`^brXyC;GKCa%P6kCiv6>;A=wDsLFL@2Ma=|SA|bI4ETmnJ<+Tufg3{$Nuy^N
zjV>asZy>libU(pokqqaer?o6WYECd{7M0X|`wSvFhr$z$zrA$Kx526m-)ySVlEVS2
zP&a-{52yM&L!Sj{>3HJZ7up4_mWBx)46Q^vO3S_vnhQhE!HP;J5qweT2`W2<;2oix
ziKd+BFAd#Fd?Ezz4&6xf71ZwQL$u*~=``x;n?m#^N9lBezpCsX6Qd&|_#Ht4PCS;<
zGoQro?I2LNQq~8-|JMgG{=eWE3{FA!1%o7iFi5%`{d@e>l#M+E4U{1wswz8t8$cC0
zju7FU_zi}315q~dGQh>52~c_26zYm_s0AifHkDKu2_>Pbvgy>BGefk5D4Rj>(aLVY
zsjTu!6fTUzTqagg*}{LJt0z_y7%0x$1P-Hb0<++x`X%5K8Vkgvd1Nap6b53FcFlQ^
znNlN(M$Zw=0Q4|vP$L=v7d?p*lc!jOFQP7=T(03@klHtTIoJlLOrm=yp)D|lDU%OE
zA{BZH@=lpTaGtV11m%?S&w?fxB+5w>Kw?#rFa`GN;Rwz^?gT4qRv*9u`^kM8rzX?D
zx3+7seOH;*zOTSrO}Pd(#{?)c<!+rMv!f}aypnicGJra1Ia9s~fORP)zlUVxH$?n-
zJzDnMITTq7`K>EV^1x8}0;>8-s`?>7QxQN^S-UB@7;P(`My&Qw@=HiYsNbx=QNP)H
z9vUjY3npN_M}4siDyX0(u4U^L({q&GN!jBqO%53<eg>p9L8os*dfM$IcB#qVb8x6)
zITUD>>GHE6WW__2o~ZM`g!Hr!@i%FE&(lK{Z&Ue6);A!drvDWLu_jxjFxvjRf=tJX
zSTsGH3et)R&<3YdNKg9(HEybP8|dvl&kt41qI88W|2*=iQ*=;knnm4f@40{I2n0M;
z)^xpsIu7$p#a+~%nL7P>N}ombRqFKlsI%hlRG&%b!)lLgqIm}g-(gX2;B}uPx|Fq2
z7kL0A>d5EHG>NJ;Cv|$Up*dC4{24Se%BSKjIwl*Y4q0c-JO;R8sC(vY0E+=szQ+`=
z0x`>4AGiR&O;+tJ>(l}}%65l3jyh=JJ@YZ9lDMuUisDCqF{0FMd=HdnDy!Xk6r#-h
z8t^-)q8k8wlPU^utNv&zvgWz<q?~$=4j?z5Df@_DhxG~Z8w>nTiOmeoe3#ik(xw7?
zi^HLcI0zA`iVmyO(i}b}4z<i-M1^j#bqolqsU7QiOdX&SyLO?mx*12JyL2<w0jNwd
zku+nj+l=4bfP(ta><%*JFi_T0KNohJh271>?o}T<(n?@E`T*+RwAXOMQkBFDlApj)
zZcfVj)(qT()FS>sc>axe&R{|jagHlunalD)sG*T5StH7N_DZm<A<N0gN@?R658JOL
z<Ee~u*>OgV1$OcEMlh)@5PT8{t%}`DOZ$OXPrNE(F9S2J*#v;fdwpWh7e)p4i_99D
z_+13|!kH4|LLb%*6HCD9J!JhCaeA5ar)kletQx43x>>97&zu>S!oNyTVKUeUM4s1)
z%Lt|rHF^y8VONdRGe<BboPj>9*RBT3T4GsC_HQa)?5Fze!&#`h{8VoZXQ8^qPxV1Q
zyoJBU@A#=cK02MM?p{duYo@|5(}y+68U)$Q7m+st>+_k<0hmR=;{Z+uQ2F0nq}nJL
z-vTOCN1FK1U*l+NzpqLnD`1bo)XFNc-k~Z<V|`Q;X>A|XG?gi{bVAq}ZrTeiX37S#
zv9NU-?h|W%2ZbNKHCow{n_naIn`N!b+cUf^f0ospx981mm?1R(OSt52;(wHNi|{Xo
z6>52R1E{<?H~+slMsxH@7(l;fN?Mj!i>(idUkmVuh~F6iCZR(rhbAd~EHR@jZ_lMe
z5X?B}K(Gz67T6a$iqHmLGSL)ql>3)ua57pKOUadzZhMILQA{YhHGYDvbtz|pSRdBU
z_X?Li!i7A2<=H+i9*20xUQ-rQ2DSmz;>DQW@i{+xR~Wx8j7P&+R$ky^>~UjPW@9|W
z^v)>UUxA^n+g(e}_lAg4=KwG+>0+iXrp3ZH+l2BWp}dPkxR!}BG(^r*GM$Fp{UdPE
z&NCy+IM1Jer*WS5G2{Q>Jhj4pz_eH@=)-#SpCau?B5fsHsu_VlW@<huYok>SOK@Go
zoVPfO8fD6S!WhS5>a87swdzhLXMIBi9|QoXoWSNN3i0?v+nIv3C)ml9j&_z?zZ9Nd
z7M{-#&%HjL9>2dXo##z{p3AIR7)4snKMT)NxXa3ia`RlC&ht^GbY|5yi$=QUxdg0q
z+q;P8%ei^BGEW%#h&?ZEyE=~*yO>)K1J{R@Z+%Oce?gdk8-VGDeN67?Y=JeZjAeO6
z$P3^oEAM9#=?Bl)sB+p@zn1G=4d+o0Mp)k!*uQYxT$8m36JsEFp_(I0pEHQ^W-iu>
zM7JqsmWA;1dSP<$UT#ml4NIgW0n*QFQY^+MNcNl}wy3Sdb|NP(M&baoO#(lS@w<V4
z+Jira@q2+EcJXu8dYMK&Oh>?s%@S7CcX*gi_TYyYe=C^I_29qE_<Mj~qw!62>7wR*
zn=>9k#+;vW;weh}lM_Fs1lb`SuTVm!B1+T`aL$`Z++k|W3qH{>UDeBsLOD-J&Z7&F
z^D&os7dfvRY2ZMqv$lg{4U|F0C&>K==UxE(8pe;BhD%=<E9Qy3J%?x_J2h|52bjp}
z`<bE)*1k|v@PeP*bN$yi_c`RA!MRjx6V31S&vNeJ;7+p=G%d4Hfql+CBa*7mhd^{h
zKz6NDg{oh|_@jXTro&v-zsLB;V9`Hfd{S3>0V<sZ3>{SI2PU_`9=FdRAz6X_r@bbj
z>Q^)EJkTBiZe$9yk_ORIrvx-{{SY+K#Ka`#5{FpTCjrwzIyj5*t`08e++`>=U~=>J
z>?VHKa1J%&Dw7j1%@~3~e3x_Apd7CH=}J-G=cZ{y-(F}{b5~;NZuML9Ucg0*6}ym}
zGmgv6Nm=`-&*o#Z7Ar<q>{ajr^RdSUCm%pkPZkHc9NVU{x>rwO57J62j2j{ddjg>H
zMJCMRLEgNMD>W<+F(vmuF7a*w%LTMn8)ZMuwEGtlXS*^8o1`NPTOmca?qQZ^Kp)=$
zGwLd`u`e^e1^AzN@Lywm5AgrZc({&Z9`w-XK|kPQzhgD2MZfoC2vW`42EAJ=4@I<u
z>I!p$YN@m5_h1(mYbD(zxDWc7ve_nDx<~LABp07VH%C^}4UWb2mViqlT+Nxxs{E(a
zc_$Ou3^-9If4$^?9ji%e4dt)$=AXd%Q`?obmU4TMOUJxFQ-Rs?jzL+#psjthiZ(LN
zCK;~+{S<v)LMQ9smW?d6#O)hMPVH(NRF+O22jeKO!7|At=_=t^;yX1~t1SJvVkP{~
zu?-qkr>r@|;}M{iJPk$BQ2`auaUA5;chQc$5UHtsz1_;vchV+`u{6+K{lH}V-iT65
zcR>8WWa74TDSiTzU&ik;b^N=~+KBS|kg)0UZt0~M3M0xNKzixD8axZaE~PIQN0k2r
zShf6R2$0vMZ$sF0!GA&lSOY-})$y<4GT&~edThFs)s@-pe?S_Qy-Jx;ok`2^rBuR>
zmKF$Il<7Lq<94~G>jmA3(P$FsgP=D2$_?E~G=_`W>H+G*+DErj^inV*E46@b!i;h*
zm-p)x;i4*L@gG)%jZ4r_`nT6NPQZ!&1L4NIEkwz{zkSAvyc+|f)XF?)?4Pimaz)U}
zAHnh;$A7sZsPV7R0sZG4SV_8n3;*Scpp{pF`ZD~ND}ovifj#|49$0k^1_b?&yEV{w
z7Ettm_*tvgfsFpgUSu_{g9P+nR@HbC^g#b*HI2<EME_;;8t;NJ(0|$D#vM4(f7!Ce
zyCEn2hYvLV9n#T%_>smtz@PrZS2kV+4E=}qHa?7Aq5tqptG<W^(0|^<K;!GEmHtat
zH~tX0^j{ilyZ}t;zcku-8wlxt?3;}Wve5t7ry8e&KmAXgu<F-1(f`=<tyS|O7yakW
z39MEqPyda_qX_+%9oM)E?CHO3P9ueYq5rZI)UTn1Jf)wwTVKPqX3=NxGZFv(H+)8e
z-KTi@M&7tFui5ZxHF@+dP2Lq7Aygi{OOy8n0!IY%gO}nwavOg5-5HHYf0$lC57UEx
zw9cT?xYkl5UR;LXQP-f|FXq3u5T`<VuAcusQK&+CuAcud;u$QY=j!<%5?owJ&(-rk
zA~;<58h-OXCOA?^&(-sFq>!0Ki;0i2D5Iw6JW5zPv7m^atLF!FqM?XxMCa$}L}L*>
zSI-}z6KjiZCC2$W(Og8&)$@ba1(5LEB6_Z#KguF+(pf~$)$@yVllqFvNU&m^NEO{c
ziLq83d<Ki?xqAM1s~(99i|Dy}{se0U5<80Mxq5!sq8BT67SVI{{7DwQjCpAhJy*{^
z!s-Lr?jm}wo?m5s7K!VN=(&1+wNBhrM9<apXY0i6Mf6-fzeXnx6w!0_{5tEKDDzNJ
z5p<tFN6U7wh@PwG&$Z~qq$gD#Jy#z&^4lOAP0!WyZy2SYtLNWD?JfK%@cCaMP!$dn
z$D66;!9sejo_{O#R55fzCGvyBBSe-!EjVl$0KzUuLSyNJSB)*fQ+*UKpy%rO$616g
zpy%rO4F+zba!WOwAG{KY;0qubNk3!gIeWn+B*Peb&R%e-2I)C_!DSl!3v@lIj06k4
z0xEj-WvmL8;53GwvyYlc7=Eva-&!i5$GC+Rfkx!y2gzv0&~x;n=^CWx=tXtJa11?1
zFPg7GdX8SSh`^E|jIz;%1Qys24M%W>pIQT61$OxL2&L4xHl5TkB{!rcOTJ7T#*=yA
zM&|1goWb@yLWZvfSYSWGY_%TWM`5kUQ&dR-5taR3vpbiGN=_r>M1wrY$iowDJ!<r1
zBjvc)BVda<{y~)G26?{}r<-h}ryAq|MvmVFq&1q5eurZ1dk}UpVWTD>jlG8HH(Wx=
zVoK7Dp3-(aq!>eVDWuBsD0A%NNYW#XiT?ob3u-8iJ~YJ|mk$e_G!80$6y%c%0K5-i
z@+U$B#1y@1n_@L#&9#0CT}keO!=n)(R0oLFczQ%Wi54<PW6?c{9=Yh%_ryl<D8I&3
zI!u(M+li9i3*17K%>ZryFnJ*peSs@YqH5d%(^Wn_1vi&AgGqS=xr0pUEPtmyLo62|
z>v?dB(5h$hZYH8og~F)7?t92c!kn&QqREW4s!Aca=JcV%>E5iI=Ik|PNHVNa)l_NZ
zZK$voTu5=jr`6O(E>5p%oz8gOwgS6bq^VfXMD#xC)HY9vOLLX@jHg7!tDJZS#V7xs
z6F)_Qgedrgh8y|)kz^U#wyFQLZSPdmu7s5YhRB3$vXJStFbfRrM0z?c%mPEdK~XjB
zS|V%KWYh_Utl1@NHe_byMFwnB*G~t4mDi)hc&P@7><Bf@th_WGy#jWmS$S!8ny%c?
z%_zd1(g6NLilCWAH;$4$Zu`^>S;&O+LZ;GPHqDZC!(kHxm9i`-c(YQL1qDwXC3F6)
zI2uO(hv8Mf-$KMZwGh$<&GcT1N@;rczmWT9$Q-CZrj<{w+n;UiBdom%*)IXWRWz>Y
zG&mEsX2<&R@GyUva!$powN{XbV#vwwLWWLRO(V`@@axDJfkn5*m*UE31U4AqWGf|O
z$||7x9!6qhB?+aIuOg{}=1n`iANV9hE(j7Ge1vsGsHD&iO$@D{FvcpLfW}N$g@wlg
z1@Xo!J9JFpqQVs=&<5l2OmoDMiZJ<yff5zX*Qp?-2wh-VD@rT?qoyKbHYf_~9X`jE
z((RHWMvPcdVi2QCEIU+KNOe)k&}gT?0+c^&g5~C63E&DfrUVJ%IF_;CNF_={a`^tK
zXy@Ta;%9t`hs^{eMo4{l4M2D<fCni>GHBXO$udt8yeGP?p-?-M+~jek0l1>c)z)#P
zhTV+P6(wV=u_GJ8*DWihhD{maR1|=&%iXe+jChh@G!;n}gK{f47fqz(G*1%5)0voZ
zkvzha1o4rD=;^T|NrV}bt=f`x*7^y}g&WYgnIlxK=2v;y4huxJAenXf4s5_1-m?sh
zjw%Exl{?y7ju2IZAg+c{wI^S9;%ra8M$~u_@QqY6?uB}=BZ%)D2%?N2^&%7D=T<Ax
zE4?){&#UG%CC`VFJ$geIqJ?P6LINhC=rM-vHOoqCwV^KZr0J61=_6cx8O14eEa^e1
z<5;iS(i^-YL<;M57cX_Y7+0XERvqspTTvo}%N_IxvXh9`(Ft4{-H5ADw;{N`s%CXN
z#?(j(SzMY38Dm34+j;0UT-K%I6Bx;>(opF3)h;644Ui`NZonP+c`~j@yb)&nYd1h?
zaRkOY6tEi65;Mj}rH}DK&zM7>6mb&dG=N7-MQ{e<PYx4*QY@o>jN4VF4tjAwm3XnF
zka6|uBy9nQyS-<|>UgIQ3Nyy)1h*q;yoFt^lq_{wQL@aFiy9~T2g4*8W|PIF;S4;(
ztlTpSv>`;y=$I<xGy*D+;E^#cU2w6f>8{NDp!!I!4b5<So!lE)#!O=~mFYR~0ci+W
z%uy~PhsC6elr|)7R|@KoJWmqJ8<DQdd`}X@BR#4GanPNg3S`Pr1(>J<`ZCC}{)O4Y
z!a-8vZ}vd4e>V1D>mF!}&mLSbFFLQjs=F&OFj&=*7?|DCl}aiviAi*Kwao5lZOx>g
zFRaoEWzu0o@N|kq99x#fQ|ZX8;>=E+-xuf72!|i*cB(YnnDY+M*b)JU*?7FVlbV`L
zwRQE@IWXD)7>g|IA&Ri?QG_5*b+vcJ+YE)S#UaLS<$!HmFLd?9O)(+B<yfkd;;~mH
zv2R+utx7UbWuM9MV3OD>jW*6@&Pc^m2JTAW3`4@L*G^LdB<$)nnJ2DTeF7kKqi%)C
zN8_T1m*3r$WEKMnfFwU!g}Ch4${ia(^pm{Y{6wm10G}yQO~CYE+x9NJg@_cEJZ9q#
z2i7$MEg}3(2C7StNT`4{f#xT>IufyNgB|SX8!)7cTL!wiQ&nAwW79eU^qOU?9a|%I
zrOr3RSQZ83zOG$uU2AWx4nA_soi7GI{wltK2wX@%XV_zZX@4frVVxIv+Bz&S^YGwb
z?Qy~O;KRtY?NPyMItO3SzuTz559sG@JJj6s$kn?JHXeI)UckO*`!Dk>`zI=xlJxJ8
z{<#YN4m=W8U^2!AWYN1mLvH(HWvywty05_=+iahgvd5;5vHRMCbsNBH@AhUCy~GNx
zMB*Pql(^Pvwoh-~X;<#BH>QF&0e|^+WLx%aR<IhWmxkJrxY|m+i^N+)!4y*0Tftx2
zJ-gpU{omQX1Bka&@L~Hdi|B9L7O1p<y2TWKW#~pE{-MAt&;FV9HzW=W{m!1-4ALJD
z?MCuZE0`Ksv1BI@-_yj8S*i1ocvL5zuu_{i_gfnAEOP((g{w9!0?F5h7K6bHR;n9`
zTZZmL;uR~^vp;WyeWT|6M^qTFuO14HMb@9Jlul54{*FrboI!P7s<rTe&bw6e`@q`a
z;4cqQnfJF*Js(@mO;?|D+9g1Iw2f4^Q)hmpGcOMWZ`72ej*A0}0sm_o!FvP22=w%a
zZKQ{X1HTI{xy~+p`Ye0vjT`Mpw(qz*SiA5z`>^(B?)u^H9}U*-(WcOuXVclfatIxv
zp|`EzVh4$KYUClS`M;<MKiu{-ct4LO*{vkRb3+fK@Z&15o;u>k=o=vas<n485U8^5
zLh>;!&-H;|D)>@x!%on@v5k5i;@+qe?RkNHT+dU1zy>QXpzHiLQEBL@KoFhCNLYo2
zo(?p#m=`&+e_}!Xra>1`A8EpmtlvS3hqp=k8zjlGciA(a-Vgj8+n_Ydekl-qI4}a{
zO^*u#>P-Cpe@_PgYyVI4ir10z1cYZjwD%hAh>>Imp4B;fUAfn}rrM07hI`6%p77r6
zTghk9WB}hoU`{uR%nUcGH>^NG96k@1YBN&`N@;h-H;3%e=x+N*ij4ezxD0}6+~2<)
zRslFzZ68T_*Kemzd{fh1zdZ%JDIoNgc7iZuJ`OJ56<kinHJgezz<JpAK~26o`0y=u
zU!cMY1ktCTNA&?bv&hoFDjDy_*0@HsB&1xAho?9799(tLlHigRtM+{JjTiSk2E(!j
zf^X0%e=+W;ZfI#}I`^0j8wfF*eiqTsDtqvk_NK+(skJ9HQ`(L#4z9AN?6BJ+zytyi
z2RAHIm>zo+HVeTfB$zF>#-k~<{DfuePf`hdcaT)cShPLX)76bko*tuJiFqm--GmJp
zacb{LIrC5y+r&gQA0fq~%IJzyqIJ<))YOI<DvCN_M22HD%9C)E=1ElS-SJ2h(sZmj
zBx9SgIW}rU<%EoO_xASDW>YFkHiBu6SU?`yEej{wBuN)ETbZ=Pl5w{IgLQMM#kc_h
z3Hsxk-JGsOtiMmS(~HusaF~_j{g_3ylnT$Xqx6W9+NGt^yra?HEs1!)s|x?_LlCO9
zGls|FGHFNC_VaO-+XvRzlnbka?tz|!qE3j?$cr0kAxt|sq1{nhE-Zl7oU(1*Dxr4@
zi}xp0EY;h^n~G^27)3$Gehjc!KlWe-i+EylSATB;Py!nis$`V*%R$Pl&2&9l-WJ+@
z(=3>xDeP#8l?$la&6Yk!9H@hKAcGEisF`MPN{^CZ!N8$%?wM#}AW2eE2XwhD=t}l#
z#nPGzd9;q%+M9^;0w@Zmo=v*-G6!wx=_58QUC%%&K8Pov?a9_yqFoInXay8+i$MAP
z+Jn)j9cW-*R{}(OPb7tX^QhToTS-`jF+(!IdC_Dml#dQ4X|^U}J!m&tO7cbfx(AXO
z-ASuW7s*?Cxebg%*|4BKyb#e(yQ(LZ-l0e{)f-5qNOJ?aKI-4vdEO3&UU`Vn8x=H?
z`mi!qT|F_h31U&@&_q9YLoe9*$h1KdffPx&R)>dauNn7==+<~YbZ@#a=?t@20s~+7
zn>2+s?4mD{wwNl&K-<yjkVTJ68hq3c8ekfuu#<X6f2;>?V>2!ZQeC}?sMce$tEZ24
zydO;U$6C=Ey*07)v}ZFr^ct=|+Ei}~EM~J|(ArC7wW;c*Ecp~;<)%@j9y37Z)kA?E
z9|X#@I$FEqbY&oy10aThz%FSArevL~RXxhLP?yo@#8Byl?O<zKQU!Qwk1#++-Bu_3
zNd#l~W}LWI0Mj%3D<-$Pwj%qJ;fs}4747YAi}tp+!=s{Dn=w%2Y*u=AajAj^1S9|(
zTq4l^gS7oD?*geU5u)i1j+1|vu}6kAr;hYOeTYWDpTH+~b(7BK>0Vo)&`oXe&0Ve9
z_`7=(9TEM5qPnZJW{h(sx2!iZ)V`6b(}Pg8clW~oLma4xdVqAN)d<PDSzx`UFny-w
zO&`YGc9h2(8@1j>JDVax<h~emu1|B!sotFC!sY3@bziF306dpiTNKjZ5(w(SP}II5
zN;@K|)YfPZuCOt5wqUzB@WH;UXfZaVgZ-*r>ShcF(xr@t<FzRco`zjt$%>8JL2r;f
zYZVhQSrs(c);rMB4M#|hh37yFn{Lv(h(oRk*SzfHVfy$pK0=Rf*M<fXhNouixH;WD
zGG*)P^}I+<umkgat5&r-Y0dh`@wK%P9X=#dg~8U^xnxtK8(>e*Abu0QRdd=By;}e#
zlX3G~XHIkUDH*3T$dFwh1K2uPJCAZ=o$WaF^lomUVzGD<zeLtaM18H@xUeOlx34vk
zB7&A4L(sjY4?uelk%1mSy@3XMDWj#Q%OsN>Bp;F_4hU14L2NT&@ykNiS35v0GP<{#
z+Cbl*;tD&l?O9b%e_MZ|4d?dSd98Fxb|lF}bq@Bnq^!RF_SR0)M68V}X&+4X<1|Qf
zznsxWl)rg!ZfzYkAN80-oJbAWO1zwmJ|@HRqdg7-+>#<WTf5sar*s;2x&{(T1Efd8
zsO>Wj*rLVK)ov!WI_$d~MVd039sQYfJ@WgwYZ$XRk?10Jt=(H%46B|kv91(bLu)t9
z55V_r8Zf#2ZGBy6QdM6J&OOpyTh$uFM%Pt(54$<gzCQyut~hrddRB7lyv1tkYZlai
zdukvNU($gkPk$HM-w~-g1;*B{T~!rklN6T8z5P{Kx?)VxnuYueIZN1k3w_hpmW(w=
z+gfnD0}e4BOAOEgK?Yb)yr;GA{A{xl6yJ~a1<moAgL}6Mi(tK=M90wjX-jmFro0tv
zkB3)Vv1Ub+Gb)XrgG(LgGp;VGnUaLY;cEh34%14K7ae+<=+QeDLey9{hO23kp1#`2
zOUld-HzdxK94}1{$i=kY!7eq$_q-G@&zu<svwwdIZ@$g$?CpuqHZg-{L!al(MssUx
zYcNQ$Gpr;A<B1M*+3aLLW}|2~1TZUjOzcoogdB~#6R8?V#yaB8T`TGcnjJBZZcfGE
z@?<K*iWB|_O8~PPg)({BPLrQ%=T(iFf@ozyiy2xZw#CkePCE6B1$k)RNkJlK%|n{d
zOP4mfK<&nSo-~UOOpuTgs|+(`>-&4r7+nCc-v)D@@@jEaLsz$nUYZ5kk||n!@?uTz
z$eE;-2z^i5-XDh(KR-!JNu!N)|C@rJho~?PGeOddjAjtRx;oPnvvW0|7ZYX)<4828
z4yHz0&=)o7q<XeA6G?aPmaarQ7H>2Y)~a?`FfB=F)u(3wnr#NW7ZO<I4PuE%MvMh)
zS3k<2-Ny1EBPBkikzihd0kvu)NhFDFR{|5N)}u<cVjtj5Dn8g{OfuP-ehZ-stGYoC
zitG~$M18^t?ZQ|?*JVsr>E6N2UAP-vMQ(+B1$>VlFg=(zaygAea-gLp2BUHP0<DGm
z;2HGQjB_n+0F}Z{$ao4%cMtIVgwK{*FJK4m?$cqZvjyDEs62(2PWDUc3_KNJLILUR
zemV@Hq~!A$a1>Bp&NhUmkmr}Bn(9c`B%p|7snZ1%d5Bk&Bb|J$2SNemBP~NHDG>>T
z0(zxu+V7D|gYq@1)E`haPOT-Z6T%5x=ICjyfT9^oohG2vO(g~XHr_O#17rqpfB&5j
z(i_A|;Ek>j0=|U@_H>|I1QeZEYNLQ<+@+xUccQ-JkZlxDzHmZG0p;5q@E35M)5fy}
zbep}=Na+4v#uo8KKHI9`ll_vA4zbiZViT`9#(d827~JMG`y6Sb=o<wy#X85W-9UaB
zNX_#3&Nm&4T`Tp;x3893D+*lfC~NJfVJ_{HkDDzO6>ymA3#i)t0=sI6iW(kt+8h;7
z+HR{oyM`p;Dk3VN^rois_oGH;sx0d7NloW(stn7AT+<YAk0bBtncPqJGm<Za-f|gb
zmdeode6C}WQ`cGn&v8&21&leUbpndg(754kS=2SWEsMH_w`I}I@HT&wPUGs%bgEq|
zpj2<EW&u5tyqQgsH?v9dW;RLQ%#fsszbZJB&oE;FuXa$*01x6dJvu-opnU8HH?WV#
zx;wznHstQFe1$d6;l0+cTJRR0KX<EhSr}*sYwmNd1_cy#VYp}VKF8l_pz?PaQ2y#9
zHiXMNqiHoP>^xU!0p-I@jBL@--K;CLRzO#6QGfL^V6JytE1)ZpGr!50#*~-Y+S4;z
zn=#^~I+-`f1<FTgQXk>u%dQK#G32IliC)S*PUQq%#JH*I0yXVIIV}28+>@*dep0qy
zNWInjdzV=I8%#K9u`6cHeu1SwK7|NED*Z&PbG6p1Ie}+#YvE8{+$J9!N_#ZY-yn_j
zw@4$UNg|6z{!fGveNvYRECU~ADxio*WPa%gC&I}`%%l^67kjM2FK;$Wkr*v!yF)|3
za~Wl;J}X1#GByu6J|$!P2#G{y8HJZ*_ZfBNWGA$6Y<|VcXUi3iq&Z3ODu6yvql9G1
zs8QGnZ9vV-ppiT`42u(>C4*LIBq~e3`r9W{{cS*6zEGCMCS;U&D;Yffv;j3YgT}vV
z0l!UqO}+n-Hd~t|I?D`R+JT=0Tgl+(rwyn%88i}qMZ&Crp9EORpuLOhe9*01BBuOD
z+JKs$LGwKG`8A($Mdt4jSxHKe6o@B5Q8L*1X|t(Mf{@6%2|fZ}U;7RrlJFy#Gkg?I
zFcaZ}Da%hH!AGX@`~=Y@lt_l?e%frJ%lfFr<tL#;GWhvvv+<J&P23selh7d<Zq1t~
z?oC{oH_z|Vym@~8`0FPbGu`Pt$DK-jMCY$3Tl?hGYguSO7Kvbzp)Wrz%;M4vvk*ne
z<slgMeaW;cgQ1@`+t8EHClaXy{3Mb|20uS-HdiBjBtQxHNdS`!etz0)@(CXaVFG?1
z-H@Zbe%fsO=nEe@B+v->&Clfu{j}NmN%WKZm@oL{bX`)X7#L_z%~iLbHe1~?2xMLd
zKM72dp=&>FHe(edmCz&LCqYRv`1xtGxmMvLR}J7N0ZB6W`DwH96FxF6gP#N=$>8Uw
z&Bjj#vkYYLdnngf^3!JPkA1wPkhLH9Nq~|J`TVrm_+7|rgWY__)Fj}&4(fCPB|d<;
zwI`|9T|xmRUVtH#)TjzGFtW@unQ}CV9QQcvPZRK44r;xCB8<5KD5+tG@H7G6a8T<7
z6!}b9Nx9Ny+@TZp4ym#w$U{ai_-V8Cf`m}8T-gM?fKmBs;Q7D*up@)L6fJOyHVG(2
zO|y*Z-CuvfY*#eGshF8(qmjI_e?ueh<yMx90jw9W#X+4TU<acL)aC=*5y3G$X}_R0
zIayn7=PW^Wb5=<087w$h7Cd*7DJuB4B2xD`t`ZECDPEI^Clv?O=0<^~#(>(=Ah6UD
z<k}l22}bHA4*#iQ(r+ENV*#Hnb*6yJ8I`ZLsy<b*U($_?4+6VYRZO$?pU#+#E)%~z
z!o)9AU&LSr`8>l^rfaO7UNL`q-So}Nug;rxC{Q<j(&omi^Ck_T%7cab@8bdo`8+jy
zK~JgurY85--w|GYi1wk!<pPWLZMEf*?UIm&N>{9Qy5jq+4Px;XmQti9oGXZd{+&3|
zCs+^*C`z}~sRD{D`r1_zViBP8*XZwjQh>0R-bRfAO2_A5pRrUI1-#=ZqGrFq<*X38
zIu}@MA)xB^3w)77SF>N>-43pHzrbH{aCMGpJnP_U_6z)ygR9*yuxN<tRXGm6mQt9p
z1gvIKOEn2N-ZD5d5&2m)GosBYxK_ZCToBJ^3K$e#SsvL8GJ{<%BYt;R-Q=QfVel~*
z#gKEF#gNGFY07NQ%HEXBg6oBc6wEU1O4dr|jk^UO2q#A-0DihGtVEX-;f0QWLn1nc
zB@5`5%9w0LYX>!3eDsRTk%tVNVI1FQn%}sz^7VKIEt4pGc&!hpTEJx<tI2#wco`SC
z!YO!~fTJC&*&twvgUYzL&ScaijBa&!Hw$>5gIXuxcO6u-fNpi`I5XoUt)!7I@FZ8t
z$qX)aQGOO-M$F)IP^HvnRZ)Aye!+;rpo;~3KZDrM)~}C%`mCQs&0$Q8&z5RwXE5cW
zwlOFiEfsr?QLi{ut##bO2B)Pl0mT9Wm{R@6OH&Ny*L-d$H|ewZ{5YS*i+$SiCx6QF
z5;aOZ?L!e$!%KYj*C)5{&KqxXBp8<5`t#Gvt-PNx{Y!k7hmd}Hc@lXG`(e3#c2NAX
zuK;~bPRH?lK9SFB_$)Vd<f)iEr;{5(rnLJjasR%^<>iLUZJc%~4J7zGIsbk>%j46>
zIQ<ho%eA0fpZ#z74f8{y_xb!0pXD~MpTETKp1{|B6?~Rky?%O$ge%V;nwfu+&*$@5
zuCM*&CCK?3d{15Ot;<buKfT;Hk^2<>dm4UvSyoAiF2(jGuV(n^@kOIPM)7$JpT)dn
z!V*Wr+0IGu$1>)4KFg%%5HfHIpXDa4e`fR3|HrhA$&ljx`wx7CrJAzhZI#N3&vW**
z^zyRt(dYA7>GRd|h$i7bY<#RK`$2m1Y!y_83cVNu<4p`idCG<#rD(?m|6!{_XMay`
zo{hQorm<d(fw2jWI&^q8e8_p5$(w7d!_@uPc`?GLSRKrYAEU14C4;w=jW|jN9`Ry?
zPl>bts5jSE<D7j*)A5I^@XHY|O2alRTlNX{X4(p-JDiO^tRh+Q?1{Wg0Nc)PsOcHD
z;yZ=uXwdV>&%m=s$-wgrmx1S*Dgz(!VQ3jhuZ+F@i9e$_2c9GkAOy`JpGcG>e2q_}
z8pg~1vq<XWMEqLV#;=j}NpjFZgWAf{j~4(>{AE8|qNU?f{97v19&Q6ZTe}Wu`eIdl
z2LK#0oSp$}q2G&Ho;;na)C)Lg<MW!PFIElA=XR!lD+m2MIq<?K!fQt1^Re)Mz%Zu%
zpko3QWg!CETE>gBtzmqh55I!(yBR;uuu(0HKfw4B#($Ra&olmT#$V0&LyQ-Tzl-t3
z-!g<eSEwH_zJ~F_|24+1WxU8kdqz^bIvEeoqK`4~o7wa~H3z;1c<RUSa733E_qr$t
z{Ruhn>vG_8jSC77X~AE8Ro1aaeEpkE{w~lXP)CNp+N$|v`m3ExFZ(VFd)mtoapE%k
zCxw!<viLXJEo>~t(E;|ux;QxenDL1#4d>I)A9M^e{?@ZhI%ZDlFB;DY?+M@F68~$4
z400#uQQY|~Q;V9O1HX{@oN}6BAa=Eq@#|>d<M_Nesg1yofj^NAg=GA;G5x*k4dDk&
ze*xn!Xfg=#m|tXkL#x5dJbwe@A6sGYHz7^OZ5nT2@9+1S{-litE7Ke8VoCaWZjQm9
z%6$HcbHvMl;X*Wo=+~_^2yV8T06eu<<|DE5zapSRB+LAD6!Td<&G31K>z&8=gM4L6
zp%LkzjWDU+O)O`NK`Yt{lkly33}S%sNygvZZxGyVdIuz;A4(ehQB1!V^d$ee!x4?)
z=BckSpNr2lgy%5*eHzaR&*|A5_*a<EMSO|6o!j*X#;?BGFkv%QZv#ID{^!Yk29KmZ
zC^{?2|GhIzI?Q<5c9!Hn!14?I7~qScf9V${S7G3Z&(6;n%+Z!PPt$mza?kTMe>W}A
zD(2I1s==Jg8SRYUn=ptO+}_VHe!=GqUhMoC<d4m=PP<ssGgHs$OU!37P24zSQGYMv
zmtAb|@0*i)DhHq6=D`1*`8<A!VX&L|jLJ89%U@^^a+7Z=<4>7y@DH*+=QI8l9w^&b
z4<|7GcE0QOUyN_kc&^NI>d1lL!F+B#$q<(^pKBREh2^PbUbiv6oawoF>LJE|wAB#q
zVtNWuNp^A!Gx!DLf6n+b+2Nhe__s8k6Q0w@Oz)4KNs%py|G}ssevaiw+$7*Lk20qJ
zfbIWXCYjIp-**~BA@f<o_&@V_U(WPrYJ4XDUZ!8f_IxyxY|TM`MGide>Ph4Ae&2Y!
zpZPy|qG8O_nEC<ZFXgMrXSv>=Gam`7D&zWBOn)7B)Hj%(q94+DS6nZn)jyg36ZZe#
z;|$t6l=Q!g9i+@hlNrB!lOdG(cb>)zSNFV%>DM+GT#W61BjX1*82n*eq!0KA!`{>O
z9Q>~WehlJzx%<bPQa3UG@9_Y=nVH?g_~^+75oLLv$ie@Y#2=cI>p{`;8_egPI>Sfi
z{l7E*jTVC!KV}ygeNH;Y;3>8+9pg3Lz~0{)rvC!-Y39sDjK4o_5VWqP<5b3f?rf93
zobhKf{v@`mZ6;fFGG1<?NC^7@jb|dyX%EvkTxXD6z4{8{5BSFWgE{#8km<+qJbowF
z@e1P~?=y(+G5*aQeC$z1pZ6>^gv+>NCNuse_5-5-dd6SZZRio=Mjy2GDA~E<3HBO;
zR?Uo;n@i=a&o+%`BF||6{EBIv^LoUCRq7ocUkCX*^eE09%)#dhuJ=#e;X;2W<M;A9
zP{t$eNlWrSu-Gs@kJn4TVEi-efQ6p6ASC+Z*nxeQ<@u2Dudg$Rsf-U78u|eHNxHtJ
z<0#<Ck4gMx85eb$-q3h|PhvjB>}Te4Chb<J^)?*wlB-Qj|7DhEIp<!)c!Yq^$Hm+)
z*E9YqUO!8}f1UBp%|Y}B9d~QIfxW-aFnxK0!OHyjE5`fx4H(YB=Ut{h<8y}3JzUA?
zBBRgJIR?SaQxV25*>3Q>tW>oc&xJjw6-=Lhg+a=7D8*2x{vF2*KEq`ajDP0@gP6s5
z+US+&U)*C5;-Bx)cqa0k_T|7I$bo-?`5(r9@@D4s0^?_}0yZ=LFN}ZtJcHorN6{wP
zV?A;RNc+NO!ylCc-;e{}r12s36wAqCsrDT7Tbcek?yt!_06w3C{>B{mdznvOt7#Dr
zV|6eG{m+^HLLQ)(vm*YGgP!)Xrv9D4^K6XiDGGTuy&b9XAvJ~-AoJQ1#(#H>Vel;T
zS;6?HPBn;?jHmF-RB!MSgLs(npJn{7_(DqL`JBcxk>|9R>BFq&BRKC);K~0S9FAxV
zvr`Y|;PVXA7r$=k+5M}Z<e-0p=&`QwUmv}lgPuZNQ+p+hl|1+>0p8T8#dLnEnEs=q
z4eA?~IUmRP02{ve$@PpM$@4pRr)t-Dhphu_W%`9z8B_`L-^KU}USRIx64&P7bEoiO
zdBh$bW&9mU!$AD=4}hnB8TWcbXYjC4&oX^&%FzD^>wY?Z!uWH!T}K+UdQIaw;W@nr
ze1#`1;P_(Ff9CUoF&ZCIm$1UVzy)X*N9wP`XBb2S<7YAcP{bgvVEjVHf1T%3nO{~h
zem(PfiyOR=@xNg|+`xP`X*?%9r|nFC|4xJaE$3au_&Dod>isJ4r04MK5lx9@r#`-+
z@tp9S9@Bh02nJtdJ`FsMgwLOH(EDRWwDzZxDa1rVAU6a&i+3O_S-d}*>Y>;?2}EQ<
zFoWJ`M|W=v0*bXEjs*f03=Ap>Xw+6cf8N3c*@^H~eWELhxa+a=5x6VWe?9^cAXr7T
zZJ?*;d=&Ad5Ui#vl}5@TL@AcT+I1&1HEoEVxPHy?O)E}W6OC5a*UYO|6sHEU{`%uR
zU~XbbpbefN7b%4G>FTC-#oA&~1T+Acm0SJ%np_#wv@G@4FQAw`Su+;q$XH0hezKKV
zI6r&F+!_P~%2r}-O^!O|*5;^VZf&kQ>gtVloDiz5Dwb^R>WZ4UUC`K;P6Rec_QhIt
zR5RA74ueE-7bvu%6MP^VJ#pRAlTV1Aux5EQ3fa>-6|oDJZ&<VR<Q2!K=O9c9LV`u3
zi18CWp^-T>E?=jjC#^nZ+0xa~Q%*ebv=f@5O-q*{h8YME8L>6mnjFyPl_o94TjQW2
z677gWUo<qFw0gy|<D+%eb=7knV~BFZJA_9<v2+@av6PN97O7*pA;twlSczG26e|ie
z#-Rv3q=<Jy5vwwyU?EB}0(r5tCXNP|>W@<pEkqun__!3COotH5kcFaWb;iM(qp3tu
z6S1TC8}Wn=VnUecil&nh7)u6r)5*uvux_G!q3)wlC<vB<P#PJv5I54Q*C;4#qLf7*
zRZKysK$=db^C*xWq=K~Ch7g#gUHZtB<*;NF(9VtOVM??i?iSjwHYXD%##Fj^h}hQV
zlNf;z>kuC+g@7V`t&p>)w+|r(DPk3ChU!dEED{POVj_S<<2qIj3mK1Z;xK#Zl_8W0
z!hfXWJUmU{6pD>!V(l1iI_QaGCZ3Qfh#Z2zEjhV4(ZX~$ISe=gqvq-lh34|A=ZJhR
zCe7^i(ABoe;!~g`Z<In$tF#=(I2|E$Y#*kP5OCcHJ+&#lK12mVs4YZvp*U^nSzMUn
z-{^Qp6wZ!uT}cyyMh^_Ky%Zw+a3HZ%zuUEx<x!_2Tw99{qZ3Ef5>hd}Zr5>%RU%-S
z8Nea~nTW9v9W%`xy0BqgT!#uWa#HLUEfs~{p>QzCzU;k9Ly?%KOT*4gyFdiF>GG9z
zqQ0<pv@+6sg|P@_BGWip(3)W@(eZnXj|CSRhT7jUM?D<c2cToeL4jnTPsbqgwv%Hs
zrHju@yk_a)YohVNk#+PSHlJxD!l~+*PU(EG#7Oo=I}zUoPASdPw8fo~i+GrA=oj)m
zCM1^2qZX4ff*0vvkx>e1C|$|s<i?uP@dhPV8z6iRg<tYU({UNksjl~!34<OR&^k71
z^$rW_Cx7@pL<((PxG=ie1l)70po!U;(P+YSr<G2xF`HmpZaEs$X{y3vOSc>Sr|ZY`
zYpTqw(bGwer-^Rq?cXGt@eKiPxK<tqYcy7<BPK?Z-SK!I(dZC@D9cd=(}nHXMDeN8
zaUMZPmE^EveF)bT=P-tzd^2CVX6kqV=>tNL-0>_EuB-+;{B(UfSfa}ls_TpQw_-UN
zr7@x7k|78bNNF0%5FO-HA&*}4UC%%ddb1^o_`d!@=2n+Qnf@lgSx}^49hVkCvYhaW
zt^{nD)MnT?l}CIx6Vr^e+|lEbra6DtXX*%H(%)@eNoT>}coA{#7#ccs7zLVg+48zE
zTcBNN8LBZ;w@2fSsioVkRw2c4#y)D8(OXl#*+R$0a_gn$Fw?Yv#GZ=GT#VrmM|dwS
zi#x?>pQg~u{k{F^Ok0R39ffwoa@y08kQZx#bR1XC%Pv?T1>MYOiKAIDM>IF);{+5c
z$lwlrqMx;op;^XXDoX%CqD8#73{GS+@S#|=@xlwMHoZ>DN=cn#)M?_1azn|Cq!)Z7
zX8GeZHCrEP{bXCvIi*NIip%N=&PGw$d?IqZH4{iTJue!?Lf5$t@x*<0$!G|1v^2d&
z{&y>6j1TEEkDci;?KLwG8G5O0pwF$1&DI?i=uvY4VjM5%v)H7kGLCMJ<%SOPM_V4E
zOr2Tjf|-m~AhA;QJ6gnr9YmB>gbVM+P|FaD%!{TTu1~Iv(gPlQgfTugo!pr~oR*P*
z_)D+N>0^(wJe(8S@p9ZkoS>BS@@NS~>5sP1`p?apgUa9oGl)IGX+6Z~dR3h~zX!1%
zamI>7pF0IYtjIXRs#hlvy0d!evK3V+JghJ&Y-CeSkvZF{5c0a3Qk^kO5!G$yC&15~
z5j|1oY{oSPg^u(lqR8uycgLszLw(&TRjt)u4Y0bS7fHkf$1mMK0TE_{R7_RJJNb@D
zCt}PyDO1$kIx!^#MhysC-wK&@aZF398n<oq-ID6Ic*{TsC}W8Ztdu#aqcTcXOAAGs
z=M)7)6&P9)5M3Z3bIQCw^*_Z-L;<?hDff5feQ7yQ@-gJwq7HT%AbLb3<>kF<IY%5y
z1IYbNLDvHCmY4Uk<UCyn`Rp&h9QgM?`J`9srwwE2kTXnAA0zlI_v<*R)&RhxCrAG0
zxxAe99*W4P+ppAiDIW$*2i@MZ@h|nud*5=tK{ALKhhKm6c<&RB{PKRdoG+w8I7EJ7
zC+AJb``A-n-XoXuZe7~QPh~`YM&nyidJof;U*0#DbA$z!`lY<Ie;1cu#th}XbU6=m
zhf#UD7w#|rMc}9kzP;_dzkVS<Y?**O@4@UG)MbFY<!|D0a$ZZF<2i7fRv&V{8Yypi
zc^_WRA}<$n<QM#xedXo3shsb<NJ^Ts)GOFqeC6f+dO729e%j%0{~cUj>_OhUkn@X?
zwBo8EJ*n<Nn%Xbr<$ZlQkMmXNORC3w<>kG8Ie*}5u;@+1cm@C-5$Q2RIC&2?_%fwj
zdod|DEqv%bDm>zF$``*J(WJ$nvi$3yCfgG;k?#q@TwW9-_9^vBv#v*3(w|fQ)d*79
zRPR-NQg2eG=aBMpehYct^3MB4!=_NgoSa`NCx}A;2q*Q+^S8(_Q~2tSFj`LU0U(kZ
zLGW41SNib6PR<`AUtllkieZzw$%_Q+r=`y|{WvP}$6@-#wv_z5&!jSGOLLT;$=`uK
zEgi=U{B_O9QT`A=0KCAbaDVwZT;8w0ZvL{fnvBzPNO?IQ%jG>J8oc2sLlKh)SWF=0
zgx-7yASw6!ua$<f*f(JP<tZ&=-z|-IOinSBH<cM|1x~B*FX|BgPyR`4&rq7*16`*4
a$T41QX8%!kw(<{dHRaDRCx?kIss10uN5`!I

literal 0
HcmV?d00001

diff --git a/legacy/dsaX_wrangle.c b/legacy/dsaX_wrangle.c
new file mode 100644
index 0000000..19507d4
--- /dev/null
+++ b/legacy/dsaX_wrangle.c
@@ -0,0 +1,378 @@
+/* will reorder raw data for input to xgpu */
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+#include "xgpu.h"
+
+#define N_INTS 128 // number of blocks summed into specs before each spectra file write
+
+// global variables
+int DEBUG = 0; // set to 1 by the -d option; enables the extra syslog/printf output
+const int n_all = 3194880; // 2080*384*2*2: number of floats simple_extract writes per block
+
+// Accumulates autocorrelation data: for each of 63 antennas and 384 channels,
+// adds the even-indexed (real) value of both pols of baseline (ant, ant) to specs.
+void auto_extract(float *output, float *specs);
+
+void auto_extract(float *output, float *specs) {
+
+  float *acc = specs;
+
+  for (int ant = 0; ant < 63; ant++) {
+
+    // position of baseline (ant, ant) when baselines are counted as
+    // (0,0), (1,0), (1,1), (2,0), ... : ant*(ant+1)/2 + ant
+    const int diag = ant*(ant+3)/2;
+    const float *src = output + 4*(diag*384);
+
+    for (int chan = 0; chan < 384; chan++) {
+      // src[0] and src[2] are the entries for pol 0 and pol 1 of this channel
+      *acc += src[0];
+      *acc += src[2];
+      acc++;
+      src += 4;
+    }
+  }
+
+}
+
+// for extracting data: each output channel is the mean of two consecutive
+// input channels, and only pol entries 0 and 3 of each baseline are kept
+// assumes TRIANGULAR_ORDER for mat (f, baseline, pol, ri)
+void simple_extract(Complex *mat, float *output);
+
+void simple_extract(Complex *mat, float *output) {
+  const int num_bl = 2080, num_chan = 384;
+
+  for (int baseline = 0; baseline < num_bl; baseline++) {
+    for (int pol = 0; pol < 2; pol++) {
+      for (int chan = 0; chan < num_chan; chan++) {
+        // lo is input channel 2*chan; hi is one channel (2080*4 entries) further on
+        const Complex *lo = mat + (2*chan*num_bl + baseline)*4 + pol*3;
+        const Complex *hi = lo + 8320;
+        float *dst = output + 2*((baseline*num_chan + chan)*2 + pol);
+
+        dst[0] = 0.5*(lo->real + hi->real);
+        dst[1] = 0.5*(lo->imag + hi->imag);
+      }
+    }
+  }
+}
+
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
+int dada_bind_thread_to_core (int core);
+
+// Drops the read lock on `in` and the write lock on `out`, then destroys
+// each HDU. A failed unlock is only logged; the HDU is destroyed regardless.
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
+{
+  const int read_rc = dada_hdu_unlock_read (in);
+  if (read_rc < 0) {
+    syslog(LOG_ERR, "could not unlock read on hdu_in");
+  }
+  dada_hdu_destroy (in);
+
+  const int write_rc = dada_hdu_unlock_write (out);
+  if (write_rc < 0) {
+    syslog(LOG_ERR, "could not unlock write on hdu_out");
+  }
+  dada_hdu_destroy (out);
+}
+
+// Print the command-line options accepted by main() to stdout.
+void usage() {
+  fprintf (stdout, "dsaX_wrangle [options]\n"
+	   " -c core   bind process to CPU core [no default]\n"
+	   " -d send debug messages to syslog\n"
+	   " -i in_key [default TEST_BLOCK_KEY]\n"
+	   " -o out_key [default REORDER_BLOCK_KEY2]\n"
+	   " -s output spectra files\n"
+	   " -h print usage\n");
+}
+
+// MAIN: reads blocks from the input DADA buffer, runs simple_extract on each,
+// optionally appends summed spectra to a file (-s), and writes to the output buffer.
+int main (int argc, char *argv[]) {
+
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_wrangle", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+  
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+
+  // data block HDU keys
+  key_t in_key = TEST_BLOCK_KEY;
+  key_t out_key = REORDER_BLOCK_KEY2;
+  
+  // command line arguments
+  int core = -1;
+  int arg = 0;
+  int output_specs = 0;
+  
+  while ((arg=getopt(argc,argv,"c:i:o:sdh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'i':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &in_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-i flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+	case 's':
+	  output_specs=1;
+	  syslog (LOG_INFO, "Will output spectra files");
+	  break;
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      else
+	syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+  // DADA stuff
+  
+  syslog (LOG_INFO, "creating in and out hdus");
+  
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+  
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+  
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
+  uint64_t  bytes_read = 0;
+  char * block;
+  uint64_t block_id;
+  ssize_t written; // signed so that a negative (error) return from ipcio_write is seen
+  float *data = (float *)malloc(sizeof(float)*n_all);
+
+  // spectra outputs
+  FILE *fout, *fmjd;
+  char fnam[100];
+  float *specs = (float *)malloc(sizeof(float)*63*384);
+  float mjd = 0.; // keeps a defined value when the MJD file cannot be read
+  int ctr = 0;
+  if (!data || !specs) {
+    syslog(LOG_ERR, "main: could not allocate memory");
+    free(data); free(specs);
+    dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+    return EXIT_FAILURE;
+  }
+  int observation_complete=0, blocks = 0, started = 0;
+  syslog(LOG_INFO, "starting observation");
+  while (!observation_complete) {
+    // open block; a null pointer means no block could be opened (error or end of data)
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    if (!block) {
+      syslog(LOG_ERR, "main: could not open block for reading");
+      break;
+    }
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+
+      // the MJD only names the spectra file; skip the read if the file is missing
+      if (!(fmjd = fopen("/home/ubuntu/tmp/mjd.dat","r"))) {
+	syslog(LOG_ERR,"could not open fmjd");
+      }
+      else {
+	if (fscanf(fmjd,"%f",&mjd) != 1) syslog(LOG_ERR,"could not parse fmjd");
+	fclose(fmjd);
+      }
+      snprintf(fnam,sizeof(fnam),"/home/ubuntu/data/specs_%f.dat",mjd);
+    }
+
+    // DO STUFF - from block to summed_vis
+    if (DEBUG) syslog(LOG_DEBUG,"extracting...");
+    simple_extract((Complex *)(block), data);
+    if (DEBUG) syslog(LOG_DEBUG,"extracted!");
+
+    // write to file if needed
+    if (output_specs==1) {
+      if (ctr==0) 
+	for (int i=0;i<63*384;i++) specs[i] = 0.;
+
+      auto_extract(data, specs);
+      ctr += 1;
+
+      // after N_INTS blocks, append the accumulated spectra and start over
+      if (ctr==N_INTS) {
+	if ((fout = fopen(fnam,"a"))) {
+	  for (int i=0;i<63*384;i++) 
+	    fprintf(fout, "%f\n", specs[i]);
+	  fclose(fout);
+	}
+	else syslog(LOG_ERR,"could not open %s", fnam);
+	ctr=0;
+      }
+    }
+
+    // write to output
+    written = ipcio_write (hdu_out->data_block, (char *)data, block_out);
+    if (written < 0 || (uint64_t)written < block_out)
+      {
+	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+	free(data); free(specs);
+	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+	return EXIT_FAILURE;
+      }
+
+    if (DEBUG) {
+      syslog(LOG_DEBUG, "written block %d",blocks);
+      for (int i=0;i<10;i++) {
+	syslog(LOG_INFO, "%g", data[i]);
+	printf("%g ", data[i]);
+	printf("\n");
+      }
+    }
+    blocks++;
+
+    if (bytes_read < block_size)
+      observation_complete = 1;
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+  }
+  free(data);
+  free(specs);
+  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+  return EXIT_SUCCESS;
+}
+
+
diff --git a/legacy/dsaX_wrangleAndWrite.c b/legacy/dsaX_wrangleAndWrite.c
new file mode 100644
index 0000000..6cd4a33
--- /dev/null
+++ b/legacy/dsaX_wrangleAndWrite.c
@@ -0,0 +1,365 @@
+/* will reorder raw data for input to xgpu */
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+#include "xgpu.h"
+
+// global variables
+int DEBUG = 0; // set to 1 by the -d option
+const int n_all = 3194880; // 2080*384*2*2: number of floats simple_extract writes per block
+const int nbl = 2080; // number of baselines: 64*65/2 antenna pairs, autocorrelations included
+
+// for lookup table generation
+// index is position to extract from xgpu array to output (Greg-style) array
+//
+// Both orderings enumerate the 2080 antenna pairs (a1 <= a2) of 64 antennas:
+//   xgpu order:  a2 is the outer loop, a1 runs 0..a2
+//                -> pair (a1, a2) sits at position a2*(a2+1)/2 + a1
+//   Greg order:  a1 is the outer loop, a2 runs a1..63
+//
+// On return idx_xgpu_in_greg[g] holds the xgpu position of the g-th pair in
+// Greg order, so the caller must provide room for 2080 ints.
+//
+//   e.g.  g = 0   pair (0,0)  -> 0
+//         g = 1   pair (0,1)  -> 1
+//         g = 2   pair (0,2)  -> 3
+//         g = 64  pair (1,1)  -> 2
+//
+void gen_lookup(int * idx_xgpu_in_greg);
+void gen_lookup(int * idx_xgpu_in_greg) {
+
+  const int n_ant = 64;
+  int greg_pos = 0;
+
+  // walk the pairs in Greg order ...
+  for (int a1 = 0; a1 < n_ant; a1++) {
+    for (int a2 = a1; a2 < n_ant; a2++) {
+
+      // ... and compute directly where the same pair sits in xgpu order;
+      // this closed form gives the same table as searching all xgpu
+      // entries for a match, without the 2080 x 2080 comparisons
+      idx_xgpu_in_greg[greg_pos] = a2*(a2+1)/2 + a1;
+      greg_pos++;
+
+    }
+  }
+
+
+}
+
+
+// for reordering correlations: copies each baseline's 1536 floats (384*2*2)
+// from position idx_xgpu_in_greg[i] in input to position i in output.
+// The prototype must carry all three parameters to match the definition.
+void reorder_gh(float *input, float *output, int * idx_xgpu_in_greg);
+void reorder_gh(float *input, float *output, int * idx_xgpu_in_greg) {
+
+  for (int i=0;i<nbl;i++) {
+    for (int j=0;j<384*2*2;j++) {
+      output[i*1536+j] = input[idx_xgpu_in_greg[i]*1536+j];
+    }
+  }
+
+}
+
+// for extracting data: each output channel is the mean of two consecutive
+// input channels, and only pol entries 0 and 3 of each baseline are kept
+// assumes TRIANGULAR_ORDER for mat (f, baseline, pol, ri)
+void simple_extract(Complex *mat, float *output);
+
+void simple_extract(Complex *mat, float *output) {
+  const int num_bl = 2080, num_chan = 384;
+
+  for (int baseline = 0; baseline < num_bl; baseline++) {
+    for (int pol = 0; pol < 2; pol++) {
+      for (int chan = 0; chan < num_chan; chan++) {
+        // lo is input channel 2*chan; hi is one channel (2080*4 entries) further on
+        const Complex *lo = mat + (2*chan*num_bl + baseline)*4 + pol*3;
+        const Complex *hi = lo + 8320;
+        float *dst = output + 2*((baseline*num_chan + chan)*2 + pol);
+
+        dst[0] = 0.5*(lo->real + hi->real);
+        dst[1] = 0.5*(lo->imag + hi->imag);
+      }
+    }
+  }
+}
+
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
+int dada_bind_thread_to_core (int core);
+
+// Drops the read lock on `in` and the write lock on `out`, then destroys
+// each HDU. A failed unlock is only logged; the HDU is destroyed regardless.
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
+{
+  const int read_rc = dada_hdu_unlock_read (in);
+  if (read_rc < 0) {
+    syslog(LOG_ERR, "could not unlock read on hdu_in");
+  }
+  dada_hdu_destroy (in);
+
+  const int write_rc = dada_hdu_unlock_write (out);
+  if (write_rc < 0) {
+    syslog(LOG_ERR, "could not unlock write on hdu_out");
+  }
+  dada_hdu_destroy (out);
+}
+
+// Print the command-line options accepted by main() to stdout.
+void usage()
+{
+  fprintf (stdout, "dsaX_wrangleAndWrite [options]\n"
+	   " -c core   bind process to CPU core [no default]\n"
+	   " -d send debug messages to syslog\n"
+	   " -i in_key [default TEST_BLOCK_KEY]\n"
+	   " -o out_key [default REORDER_BLOCK_KEY2]\n"
+	   " -h print usage\n");
+}
+
+// MAIN
+
+int main (int argc, char *argv[]) {
+  /* Forwards the DADA header, then passes every input block through simple_extract into the output buffer. */
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_wrangle", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+
+  // data block HDU keys
+  key_t in_key = TEST_BLOCK_KEY;
+  key_t out_key = REORDER_BLOCK_KEY2;
+
+  // command line arguments
+  int core = -1;
+  int arg = 0;
+
+  while ((arg=getopt(argc,argv,"c:i:o:dh")) != -1)
+    {
+      switch (arg)
+        {
+        case 'c':
+          if (optarg)
+            {
+              core = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-c flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'i':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &in_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-i flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'o':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &out_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-o flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'd':
+          DEBUG=1;
+          syslog (LOG_DEBUG, "Will excrete all debug messages");
+          break;
+        case 'h':
+          usage();
+          return EXIT_SUCCESS;
+        }
+    }
+
+  // Bind to cpu core; only report success when the bind actually worked
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+        syslog(LOG_ERR,"failed to bind to core %d", core);
+      else
+        syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+  // DADA stuff
+
+  syslog (LOG_INFO, "creating in and out hdus");
+
+  hdu_in  = dada_hdu_create ();
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create ();
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",(unsigned long long)block_size,(unsigned long long)block_out);
+  uint64_t  bytes_read = 0;
+  char * block;
+  uint64_t block_id;
+  int64_t written; // signed: a negative ipcio_write result must stay visible as an error
+  float *data = (float *)malloc(sizeof(float)*n_all);
+  if (!data) {
+    syslog(LOG_ERR, "main: could not allocate extraction buffer");
+    dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+    return EXIT_FAILURE;
+  }
+  int observation_complete=0;
+  int blocks = 0, started = 0;
+  syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+    // open block; a NULL return means no block was obtained and must not be dereferenced
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    if (!block) {
+      syslog(LOG_ERR, "main: could not open block for reading");
+      free(data);
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+    }
+
+    // DO STUFF - from block to summed_vis
+    if (DEBUG) syslog(LOG_DEBUG,"extracting...");
+    simple_extract((Complex *)(block), data);
+    if (DEBUG) syslog(LOG_DEBUG,"extracted!");
+
+    // write to output
+    written = ipcio_write (hdu_out->data_block, (char *)data, block_out);
+    if (written < 0 || (uint64_t)written < block_out)
+      {
+        syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+        free(data);
+        dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+        return EXIT_FAILURE;
+      }
+
+    if (DEBUG) {
+      syslog(LOG_DEBUG, "written block %d",blocks);
+      for (int i=0;i<10;i++) {
+        syslog(LOG_INFO, "%g", data[i]);
+        printf("%g ", data[i]);
+        printf("\n");
+      }
+    }
+    blocks++;
+
+    if (bytes_read < block_size)
+      observation_complete = 1;
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+  }
+
+  free(data);
+  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+  return EXIT_SUCCESS;
+}
+
+
diff --git a/legacy/dsaX_writeFil.c b/legacy/dsaX_writeFil.c
new file mode 100644
index 0000000..751db9d
--- /dev/null
+++ b/legacy/dsaX_writeFil.c
@@ -0,0 +1,486 @@
+/* This works pretty much like the trigger code: it receives a control UDP message
+to store some data for a fixed amount of time.
+Message format: length(s)-NAME
+Further messages are ignored until the current data recording is over.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <time.h>
+#include <arpa/inet.h>
+#include <sys/syscall.h>
+#include <syslog.h>
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "multilog.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+#include <src/sigproc.h>
+#include <src/header.h>
+
+
+FILE *output;
+
+void send_string(char *string) /* includefile */
+{
+  /* Strings are written length-prefixed: an int character count, then the
+     characters themselves without a terminating NUL. */
+  int nchars = strlen(string);
+  fwrite(&nchars, sizeof nchars, 1, output);
+  fwrite(string, sizeof(char), nchars, output); }
+
+void send_float(char *name,float floating_point) /* includefile */
+{
+  send_string(name); /* keyword first, length-prefixed */
+  fwrite(&floating_point, sizeof floating_point, 1, output); /* then the raw float */
+}
+
+void send_double (char *name, double double_precision) /* includefile */
+{
+  send_string(name); /* keyword first, length-prefixed */
+  fwrite(&double_precision, sizeof double_precision, 1, output); /* then the raw double */
+}
+
+void send_int(char *name, int integer) /* includefile */
+{
+  send_string(name); /* keyword first, length-prefixed */
+  fwrite(&integer, sizeof integer, 1, output); /* then the raw int */
+}
+
+void send_char(char *name, char integer) /* includefile */
+{
+  send_string(name); /* keyword first, length-prefixed */
+  fwrite(&integer, sizeof integer, 1, output); /* then the single raw byte */
+}
+
+
+void send_long(char *name, long integer) /* includefile */
+{
+  send_string(name); /* keyword first, length-prefixed */
+  fwrite(&integer, sizeof integer, 1, output); /* then the raw long */
+}
+
+void send_coords(double raj, double dej, double az, double za) /*includefile*/
+{ /* Emit each coordinate only when it is neither 0.0 nor -1.0; the former '||' test was always true. */
+  if ((raj != 0.0) && (raj != -1.0)) send_double("src_raj",raj);
+  if ((dej != 0.0) && (dej != -1.0)) send_double("src_dej",dej);
+  if ((az != 0.0)  && (az != -1.0))  send_double("az_start",az);
+  if ((za != 0.0)  && (za != -1.0))  send_double("za_start",za);
+}
+
+
+/* global variables */
+int quit_threads = 0;
+int dump_pending = 0;
+int trignum = 0;
+int dumpnum = 0;
+char iP[100];
+char srcnam[1024];
+float reclen;
+int DEBUG = 0;
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in);
+void convert_block(char * b1, char * b2);
+
+void usage()
+{
+  /* Print the command-line synopsis; name and defaults mirror what main() actually uses. */
+  fprintf (stdout, "dsaX_writeFil [options]\n"
+           " -c core   bind process to CPU core\n"
+           " -b write one beam\n"
+           " -f filename base [default /home/dsa/alltest]\n"
+           " -k in_key [default BF_BLOCK_KEY]\n"
+           " -i IP to listen to [no default]\n"
+           " -s integrate N ints MUST BE FACTOR OF 16384 [default 1]\n"
+           " -m get mjd from file\n"
+           " -d DEBUG\n"
+           " -h        print usage\n");
+}
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in) {
+
+  /* Drop the read lock first; a failure is logged but does not stop
+     the HDU from being destroyed afterwards. */
+  const int unlock_rc = dada_hdu_unlock_read (in);
+  if (unlock_rc < 0)
+    syslog(LOG_ERR, "could not unlock read on hdu_in");
+  dada_hdu_destroy (in);
+}
+
+// Thread to control the dumping of data
+
+void control_thread (void * arg) {
+
+  /* Listens for UDP control messages "length(s)-NAME" and, when no dump is
+     pending, publishes the request via the globals reclen/srcnam/dump_pending.
+     NOTE(review): recvfrom has no timeout, so quit_threads is only noticed
+     after another datagram arrives. */
+  udpdb_t * ctx = (udpdb_t *) arg;
+  syslog(LOG_INFO, "control_thread: starting");
+
+  // port on which to listen for control commands
+  int port = WRITEVIS_CONTROL_PORT;
+  char sport[10];
+  snprintf(sport,sizeof(sport),"%d",port);
+
+  // buffer for incoming command strings; one byte is kept back for the NUL
+  const int bufsize = 1024;
+  char* buffer = (char *) malloc (sizeof(char) * bufsize);
+  static int thread_result = 0; // static so the pointer handed to pthread_exit stays valid
+
+  struct addrinfo hints;
+  struct addrinfo* res=0;
+  memset(&hints,0,sizeof(hints));
+  struct sockaddr_storage src_addr;
+  socklen_t src_addr_len;
+  hints.ai_family=AF_INET;
+  hints.ai_socktype=SOCK_DGRAM;
+  if (!buffer || getaddrinfo(iP,sport,&hints,&res) != 0 || !res) {
+    syslog(LOG_ERR, "control_thread: could not set up buffer/address for %s:%s", iP, sport);
+    free (buffer);
+    pthread_exit((void *) &thread_result);
+  }
+  syslog(LOG_INFO, "control_thread: created socket on port %d", port);
+
+  while (!quit_threads) {
+
+    int fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol);
+    if (fd < 0 || bind(fd,res->ai_addr,res->ai_addrlen) < 0) {
+      syslog(LOG_ERR, "control_thread: could not open/bind control socket");
+      if (fd >= 0) close(fd);
+      break;
+    }
+    memset(buffer,'\0',bufsize); // sizeof(buffer) would only cover the pointer
+    src_addr_len = sizeof(src_addr); // value-result argument: reset before every call
+    syslog(LOG_INFO, "control_thread: waiting for packet");
+    ssize_t ct = recvfrom(fd,buffer,bufsize-1,0,(struct sockaddr*)&src_addr,&src_addr_len);
+    close(fd);
+    if (ct <= 0) continue;
+
+    syslog(LOG_INFO, "control_thread: received buffer string %s",buffer);
+    trignum++;
+
+    // interpret buffer string; both fields must be present (network input)
+    char * len_tok = strtok(buffer, "-");
+    char * tmp_srcnam = strtok(NULL, "-");
+    if (!len_tok || !tmp_srcnam) {
+      syslog(LOG_ERR, "control_thread: malformed command, expected length-NAME");
+      continue;
+    }
+    float tmp_reclen = strtof(len_tok, NULL);
+
+    if (!dump_pending) {
+      reclen = tmp_reclen;
+      strcpy(srcnam,tmp_srcnam); // fits: the message is shorter than srcnam[1024]
+      syslog(LOG_INFO, "control_thread: received command to dump %f s for SRC %s",reclen,srcnam);
+      dump_pending = 1;
+    }
+    else
+      syslog(LOG_ERR, "control_thread: BACKED UP - CANNOT dump %f s for SRC %s",tmp_reclen,tmp_srcnam);
+  }
+
+  free (buffer);
+  freeaddrinfo(res);
+  if (ctx->verbose)
+    syslog(LOG_INFO, "control_thread: exiting");
+  pthread_exit((void *) &thread_result);
+}
+
+int main (int argc, char *argv[]) {
+
+  /* Reads blocks from the input DADA buffer and, while a dump requested through
+     control_thread is pending, writes them to a sigproc-style filterbank file. */
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_writeFil", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  /* DADA defs */
+  dada_hdu_t* hdu_in = 0;
+  key_t in_key = BF_BLOCK_KEY;
+
+  /* actual struct with info */
+  udpdb_t udpdb;
+
+  // command line
+  int arg = 0;
+  int core = -1;
+  char fnam[300], foutnam[400];
+  sprintf(fnam,"/home/dsa/alltest");
+
+  // for getting MJD
+  FILE *fmjd;
+  int get_mjd = 0;
+  int sumi=1;
+  int onebeam=0;
+
+  while ((arg=getopt(argc,argv,"c:f:o:i:k:s:bmdh")) != -1)
+    {
+      switch (arg)
+        {
+        case 'c':
+          if (optarg)
+            {
+              core = atoi(optarg);
+              break;
+            }
+          else
+            {
+              printf ("ERROR: -c flag requires argument\n");
+              return EXIT_FAILURE;
+            }
+        case 'k':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &in_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-k flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'f':
+          snprintf(fnam,sizeof(fnam),"%s",optarg); // bounded: fnam is a fixed-size buffer
+          break;
+        case 'i':
+          snprintf(iP,sizeof(iP),"%s",optarg); // bounded: iP is a fixed-size buffer
+          break;
+        case 'd':
+          DEBUG=1;
+          break;
+        case 'b':
+          onebeam=1;
+          break;
+        case 'm':
+          get_mjd=1;
+          break;
+        case 's':
+          sumi = atoi(optarg);
+          break;
+        case 'h':
+          usage();
+          return EXIT_SUCCESS;
+        }
+    }
+
+  if (sumi < 1 || 16384 % sumi != 0) { // sumi is a divisor below; usage requires a factor of 16384
+    syslog(LOG_ERR,"-s must be a positive factor of 16384");
+    return EXIT_FAILURE;
+  }
+  udpdb.verbose = 1; // DADA stuff
+  syslog (LOG_INFO, "dsaX_writefil: creating hdu");
+  hdu_in  = dada_hdu_create ();
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"dsaX_writefil: could not connect to dada buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"dsaX_writespec: could not lock to dada buffer");
+    return EXIT_FAILURE;
+  }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      syslog(LOG_INFO,"binding to core %d", core);
+      if (dada_bind_thread_to_core(core) < 0)
+        syslog(LOG_ERR,"dsaX_writefil: failed to bind to core %d", core);
+    }
+
+  int observation_complete=0;
+
+  // more DADA stuff - deal with headers
+
+  uint64_t header_size = 0;
+
+  // read the headers from the input HDUs and mark as cleared
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "main: could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in);
+      return EXIT_FAILURE;
+    }
+
+
+  // start control thread
+  int rval = 0;
+  pthread_t control_thread_id;
+  syslog(LOG_INFO, "starting control_thread()");
+  rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb);
+  if (rval != 0) {
+    syslog(LOG_INFO, "Error creating control_thread: %s", strerror(rval));
+    return -1;
+  }
+
+  // set up
+  int fctr = 0, integration = 0, nblocks = 0;
+  double mjd=55000.;
+  int dfwrite = 0; // 1 while an output file is open
+  float mytsamp = 4.*8.*8.192e-6;
+  int NINTS = 0;
+
+  // data stuff
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t bytes_read = 0, block_id;
+  char *block;
+  float *hoblock = (float *)malloc(sizeof(float)*64*1024*16384/sumi);
+  if (!hoblock) {
+    syslog(LOG_ERR, "main: could not allocate integration buffer");
+    dsaX_dbgpu_cleanup (hdu_in);
+    return EXIT_FAILURE;
+  }
+
+  // start things
+  syslog(LOG_INFO, "dsaX_writespec: starting observation");
+  while (!observation_complete) {
+
+    // read block; NULL means no block could be opened, so stop instead of dereferencing it
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    if (!block) {
+      syslog(LOG_ERR, "main: could not open block for reading");
+      break;
+    }
+    if (DEBUG) for (int i=0;i<48;i++) syslog(LOG_INFO,"%hu",((unsigned char *)(block))[i]);
+
+    // hoblock accumulates sumi consecutive input samples per output sample, so start from zero
+    for (int i=0;i<64*1024*16384/sumi;i++) hoblock[i] = 0.;
+
+    syslog(LOG_INFO,"read block %d",nblocks);
+
+    // check for dump_pending; the request fields are filled in by control_thread
+    // NOTE(review): dump_pending/reclen/srcnam are shared with control_thread without locking
+    if (dump_pending) {
+
+      // if file writing hasn't started
+      if (dfwrite==0) {
+
+        syslog(LOG_INFO, "beginning file write for SRC %s for %f s",srcnam,reclen);
+
+        NINTS = (int)(floor(reclen/(mytsamp*16384.)));
+        // NINTS counts whole (mytsamp*16384)-second blocks in reclen and can be 0
+        snprintf(foutnam,sizeof(foutnam),"%s_%s_%d_%d.fil",fnam,srcnam,fctr,nblocks);
+        syslog(LOG_INFO, "main: opening new file %s",foutnam);
+
+        if (!(output = fopen(foutnam,"wb"))) {
+          printf("Couldn't open output file\n");
+          return EXIT_FAILURE;
+        }
+
+        if (get_mjd==1) {
+          if (!(fmjd = fopen("/home/ubuntu/tmp/mjd.dat","r"))) {
+            syslog(LOG_ERR,"could not open fmjd");
+          }
+          else {
+            if (fscanf(fmjd,"%lf",&mjd) != 1) syslog(LOG_ERR,"could not parse mjd");
+            mjd += nblocks*4.294967296/86400.;
+            fclose(fmjd);
+          }
+        }
+        send_string("HEADER_START");
+        send_string("source_name");
+        send_string(srcnam);
+        send_int("machine_id",1);
+        send_int("telescope_id",82);
+        send_int("data_type",1); // filterbank data
+        send_double("fch1",1530.0); // THIS IS CHANNEL 0 :)
+        send_double("foff",-0.244140625);
+        send_int("nchans",1024);
+        if (sumi==1) send_int("nbits",8);
+        else send_int("nbits",32);
+        send_double("tstart",mjd);
+        send_double("tsamp",8.192e-6*8.*4.*sumi);
+        send_int("nifs",1);
+        send_string("HEADER_END");
+
+        syslog(LOG_INFO, "main: opened new file %s",foutnam);
+
+        dfwrite=1;
+
+
+      }
+
+      // write data to file
+      syslog(LOG_INFO,"writing");
+
+
+      for (int i=0;i<64;i++) {
+        for (int j=0;j<16384/sumi;j++) {
+          for (int k=0;k<sumi;k++) {
+            for (int l=0;l<1024;l++) {
+              hoblock[i*16384*1024/sumi + j*1024 + l] += 1.*((unsigned char *)(block))[i*16384*1024 + (j*sumi+k)*1024 + l];
+            }
+          }
+        }
+      }
+
+
+      if (sumi==1) fwrite((unsigned char *)(block),sizeof(unsigned char),block_size,output);
+      else {
+        if (onebeam==1) fwrite(hoblock + block_size/sumi/2,sizeof(float),block_size/sumi/64,output);
+        else fwrite(hoblock,sizeof(float),block_size/sumi,output);
+      }
+
+
+      integration++;
+      // check if file writing is done (>= so that NINTS==0 still terminates the dump)
+      if (integration>=NINTS) {
+        fclose(output);
+        integration=0;
+        syslog(LOG_INFO, "dsaX_writespec: completed file %d",fctr);
+        fctr++;
+        dfwrite=0;
+        dump_pending=0;
+      }
+
+      syslog(LOG_INFO,"written");
+
+    }
+
+    // close off loop
+    if (bytes_read < block_size)
+      observation_complete = 1;
+
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+    nblocks += 1;
+  }
+
+  if (dfwrite) fclose(output); // observation ended mid-dump: close the partial file
+  // close control thread
+  syslog(LOG_INFO, "joining control_thread");
+  quit_threads = 1;
+  void* result=0;
+  pthread_join (control_thread_id, &result);
+
+  free(hoblock);
+  dsaX_dbgpu_cleanup(hdu_in);
+  return EXIT_SUCCESS;
+}
diff --git a/legacy/dsaX_writevis.c b/legacy/dsaX_writevis.c
new file mode 100644
index 0000000..02cebb7
--- /dev/null
+++ b/legacy/dsaX_writevis.c
@@ -0,0 +1,428 @@
+/* This works pretty much like the trigger code: it receives a control UDP message
+to store some data for a fixed amount of time.
+Message format: length(s)-NAME
+Further messages are ignored until the current data recording is over.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <time.h>
+#include <arpa/inet.h>
+#include <sys/syscall.h>
+#include <syslog.h>
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "multilog.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+#include "fitsio.h"
+#include "xgpu.h"
+
+/* global variables */
+int quit_threads = 0;
+int dump_pending = 0;
+int trignum = 0;
+int dumpnum = 0;
+char iP[100];
+char srcnam[1024];
+float reclen;
+int DEBUG = 0;
+
+// assumes that only first 78 baselines are written and 384 channels and 2 pols
+const int n = 9216;
+float summed_vis[9216];
+const int n_all = 3194880;
+
+// for extracting data
+// assumes TRIANGULAR_ORDER for mat (f, baseline, pol, ri)
+void simple_extract(Complex *mat, float *output);
+
+void simple_extract(Complex *mat, float *output) {
+
+  /* For every (baseline, pol, output channel) average two input entries that
+     lie 8320 = 2080*4 elements apart (input channels 2*chan and 2*chan+1 in the
+     index formula) and store the result as consecutive (real, imag) floats. */
+  for (int bl = 0; bl < 2080; bl++) {
+    for (int pol = 0; pol < 2; pol++) {
+      for (int chan = 0; chan < 384; chan++) {
+        const int lo = (2*chan*2080+bl)*4+pol*3; /* pol*3 picks entry 0 or 3 of the 4-entry group */
+        const int hi = lo+8320;
+        float *dst = output + 2*((bl*384+chan)*2+pol);
+        dst[0] = 0.5*(mat[lo].real + mat[hi].real);
+        dst[1] = 0.5*(mat[lo].imag + mat[hi].imag);
+      }
+    }
+  }
+
+}
+
+
+
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in);
+
+void usage()
+{
+  /* Print the command-line synopsis; name and defaults mirror what main() actually uses. */
+  fprintf (stdout, "dsaX_writevis [options]\n"
+           " -c core   bind process to CPU core\n"
+           " -d debug [default no]\n"
+           " -k in_key [default XGPU_BLOCK_KEY]\n"
+           " -f filename base [default /home/ubuntu/alltest]\n"
+           " -o freq of chan 1 [default 1500.0]\n"
+           " -i IP to listen to [no default]\n"
+           " -h        print usage\n");
+}
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in) {
+
+  /* Drop the read lock first; a failure is logged but does not stop
+     the HDU from being destroyed afterwards. */
+  const int unlock_rc = dada_hdu_unlock_read (in);
+  if (unlock_rc < 0)
+    syslog(LOG_ERR, "could not unlock read on hdu_in");
+  dada_hdu_destroy (in);
+}
+
+// Thread to control the dumping of data
+
+void control_thread (void * arg) {
+
+  /* Listens for UDP control messages "length(s)-NAME" and, when no dump is
+     pending, publishes the request via the globals reclen/srcnam/dump_pending.
+     NOTE(review): recvfrom has no timeout, so quit_threads is only noticed
+     after another datagram arrives. */
+  udpdb_t * ctx = (udpdb_t *) arg;
+  syslog(LOG_INFO, "control_thread: starting");
+
+  // port on which to listen for control commands
+  int port = WRITEVIS_CONTROL_PORT;
+  char sport[10];
+  snprintf(sport,sizeof(sport),"%d",port);
+
+  // buffer for incoming command strings; one byte is kept back for the NUL
+  const int bufsize = 1024;
+  char* buffer = (char *) malloc (sizeof(char) * bufsize);
+  static int thread_result = 0; // static so the pointer handed to pthread_exit stays valid
+
+  struct addrinfo hints;
+  struct addrinfo* res=0;
+  memset(&hints,0,sizeof(hints));
+  struct sockaddr_storage src_addr;
+  socklen_t src_addr_len;
+  hints.ai_family=AF_INET;
+  hints.ai_socktype=SOCK_DGRAM;
+  if (!buffer || getaddrinfo(iP,sport,&hints,&res) != 0 || !res) {
+    syslog(LOG_ERR, "control_thread: could not set up buffer/address for %s:%s", iP, sport);
+    free (buffer);
+    pthread_exit((void *) &thread_result);
+  }
+  syslog(LOG_INFO, "control_thread: created socket on port %d", port);
+
+  while (!quit_threads) {
+
+    int fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol);
+    if (fd < 0 || bind(fd,res->ai_addr,res->ai_addrlen) < 0) {
+      syslog(LOG_ERR, "control_thread: could not open/bind control socket");
+      if (fd >= 0) close(fd);
+      break;
+    }
+    memset(buffer,'\0',bufsize); // sizeof(buffer) would only cover the pointer
+    src_addr_len = sizeof(src_addr); // value-result argument: reset before every call
+    syslog(LOG_INFO, "control_thread: waiting for packet");
+    ssize_t ct = recvfrom(fd,buffer,bufsize-1,0,(struct sockaddr*)&src_addr,&src_addr_len);
+    close(fd);
+    if (ct <= 0) continue;
+
+    syslog(LOG_INFO, "control_thread: received buffer string %s",buffer);
+    trignum++;
+
+    // interpret buffer string; both fields must be present (network input)
+    char * len_tok = strtok(buffer, "-");
+    char * tmp_srcnam = strtok(NULL, "-");
+    if (!len_tok || !tmp_srcnam) {
+      syslog(LOG_ERR, "control_thread: malformed command, expected length-NAME");
+      continue;
+    }
+    float tmp_reclen = strtof(len_tok, NULL);
+
+    if (!dump_pending) {
+      reclen = tmp_reclen;
+      strcpy(srcnam,tmp_srcnam); // fits: the message is shorter than srcnam[1024]
+      syslog(LOG_INFO, "control_thread: received command to dump %f s for SRC %s",reclen,srcnam);
+      dump_pending = 1;
+    }
+    else
+      syslog(LOG_ERR, "control_thread: BACKED UP - CANNOT dump %f s for SRC %s",tmp_reclen,tmp_srcnam);
+  }
+
+  free (buffer);
+  freeaddrinfo(res);
+  if (ctx->verbose)
+    syslog(LOG_INFO, "control_thread: exiting");
+  pthread_exit((void *) &thread_result);
+}
+
+int main (int argc, char *argv[]) {
+  /* Reads blocks from the input DADA buffer (default key XGPU_BLOCK_KEY) and, while a dump
+     requested through control_thread is pending, appends one FITS table row per block. */
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_writevis", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  /* DADA defs */
+  dada_hdu_t* hdu_in = 0;
+  key_t in_key = XGPU_BLOCK_KEY;
+
+  /* actual struct with info */
+  udpdb_t udpdb;
+
+  // command line
+  int arg = 0;
+  int core = -1;
+  float fch1 = 1500.0;
+  int nchans = 384;
+  char fnam[300], foutnam[400];
+  sprintf(fnam,"/home/ubuntu/alltest");
+
+  while ((arg=getopt(argc,argv,"c:f:o:i:k:dh")) != -1)
+    {
+      switch (arg)
+        {
+        case 'c':
+          if (optarg)
+            {
+              core = atoi(optarg);
+              break;
+            }
+          else
+            {
+              printf ("ERROR: -c flag requires argument\n");
+              return EXIT_FAILURE;
+            }
+        case 'k':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &in_key) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-k flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'f':
+          snprintf(fnam,sizeof(fnam),"%s",optarg); // bounded: fnam is a fixed-size buffer
+          break;
+        case 'd':
+          DEBUG=1;
+          break;
+        case 'o':
+          fch1 = atof(optarg);
+          break;
+        case 'i':
+          snprintf(iP,sizeof(iP),"%s",optarg); // bounded: iP is a fixed-size buffer
+          break;
+        case 'h':
+          usage();
+          return EXIT_SUCCESS;
+        }
+    }
+
+  // DADA stuff
+
+  udpdb.verbose = 1;
+
+  syslog (LOG_INFO, "dsaX_writevis: creating hdu");
+
+  hdu_in  = dada_hdu_create ();
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"dsaX_writevis: could not connect to dada buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"dsaX_writevis: could not lock to dada buffer");
+    return EXIT_FAILURE;
+  }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      syslog(LOG_INFO,"binding to core %d", core);
+      if (dada_bind_thread_to_core(core) < 0)
+        syslog(LOG_ERR,"dsaX_writevis: failed to bind to core %d", core);
+    }
+
+  int observation_complete=0;
+
+  // more DADA stuff - deal with headers
+
+  uint64_t header_size = 0;
+
+  // read the headers from the input HDUs and mark as cleared
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "main: could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in);
+      return EXIT_FAILURE;
+    }
+
+
+  // start control thread
+  int rval = 0;
+  pthread_t control_thread_id;
+  syslog(LOG_INFO, "starting control_thread()");
+  rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb);
+  if (rval != 0) {
+    syslog(LOG_INFO, "Error creating control_thread: %s", strerror(rval));
+    return -1;
+  }
+
+  // set up
+  int fctr = 0, integration = 0, nblocks = 0;
+  fitsfile *fptr;
+  int rownum = 1;
+  int dfwrite = 0; // 1 once the FITS file for the current dump exists (was named fwrite, shadowing stdio)
+  int status=0;
+  float mytsamp = 4096*4*8.192e-6;
+  int NINTS = 0;
+
+  // data stuff
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t bytes_read = 0, block_id;
+  char *block;
+  float *data = (float *)malloc(sizeof(float)*n_all);
+  Complex * cblock;
+  if (!data) {
+    syslog(LOG_ERR, "dsaX_writevis: could not allocate extraction buffer");
+    dsaX_dbgpu_cleanup (hdu_in);
+    return EXIT_FAILURE;
+  }
+
+  syslog(LOG_INFO, "dsaX_writevis: starting observation");
+  while (!observation_complete) {
+    // read block; NULL means no block could be opened, so stop instead of dereferencing it
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    if (!block) {
+      syslog(LOG_ERR, "dsaX_writevis: could not open block for reading");
+      break;
+    }
+    cblock = (Complex *)(block);
+
+    if (DEBUG && nblocks==20) {
+      for (int i=100;i<200;i++) {
+        syslog(LOG_DEBUG,"MAT %d %f %f",i,(float)(cblock[i].real),(float)(cblock[i].imag));
+      }
+    }
+
+    // DO STUFF - from block to summed_vis
+    if (DEBUG) syslog(LOG_DEBUG,"extracting...");
+    simple_extract((Complex *)(block), data);
+    for (int i=0;i<n;i++) summed_vis[i] = data[i];
+    if (DEBUG) syslog(LOG_DEBUG,"extracted!");
+
+    // check for dump_pending
+    if (dump_pending) {
+
+      // if file writing hasn't started
+      if (dfwrite==0) {
+
+        syslog(LOG_INFO, "dsaX_writevis: beginning file write for SRC %s for %f s",srcnam,reclen);
+        status=0;
+
+        NINTS = (int)(floor(reclen/mytsamp));
+        snprintf(foutnam,sizeof(foutnam),"%s_%s_%d.fits",fnam,srcnam,fctr);
+        syslog(LOG_INFO, "main: opening new file %s",foutnam);
+        rownum=1;
+
+        char *ttype[] = {"VIS"};
+        char *tform[] = {"9216E"}; // assumes classic npts
+        char *tunit[] = {"\0"};
+        char *wsrcnam = srcnam;
+
+        char extname[] = "DATA";
+        fits_create_file(&fptr, foutnam, &status);
+        if (status) syslog(LOG_ERR, "create_file FITS error %d",status);
+        fits_create_tbl(fptr, BINARY_TBL, 0, 1, ttype, tform, tunit, extname, &status);
+        fits_write_key(fptr, TFLOAT, "TSAMP", &mytsamp, "Sample time (s)", &status);
+        fits_write_key(fptr, TFLOAT, "FCH1", &fch1, "Frequency (MHz)", &status);
+        fits_write_key(fptr, TINT, "NCHAN", &nchans, "Channels", &status);
+        fits_write_key(fptr, TSTRING, "Source", &wsrcnam[0], "Source", &status);
+        fits_write_key(fptr, TINT, "NBLOCKS", &nblocks, "Ints", &status);
+        if (status) syslog(LOG_ERR, "fits_write FITS error %d",status);
+        fits_close_file(fptr, &status);
+
+        dfwrite=1;
+      }
+
+      // write data to file; reset status so one failed block does not turn every later call into a no-op
+      status = 0;
+      if (!fits_open_table(&fptr, foutnam, READWRITE, &status)) {
+        fits_write_col(fptr, TFLOAT, 1, rownum, 1, n, summed_vis, &status);
+        rownum += 1;
+        fits_update_key(fptr, TINT, "NAXIS2", &rownum, "", &status); // NOTE(review): rownum is already one past the row just written - confirm intended
+        fits_close_file(fptr, &status); // only close what was actually opened
+      }
+      integration++;
+      if (status) syslog(LOG_ERR, "fits_write FITS error %d",status);
+      // check if file writing is done (>= so that NINTS==0 still terminates the dump)
+      if (integration>=NINTS) {
+        integration=0;
+        syslog(LOG_INFO, "dsaX_writevis: completed file %d",fctr);
+        fctr++;
+        dfwrite=0;
+        dump_pending=0;
+      }
+
+      syslog(LOG_INFO,"written");
+    }
+
+    // close off loop
+    if (bytes_read < block_size)
+      observation_complete = 1;
+
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+    nblocks++;
+
+    if (DEBUG) syslog(LOG_DEBUG,"Finished block %d",nblocks);
+
+  }
+
+  // close control thread
+  syslog(LOG_INFO, "joining control_thread");
+  quit_threads = 1;
+  void* result=0;
+  pthread_join (control_thread_id, &result);
+
+  free(data);
+  dsaX_dbgpu_cleanup(hdu_in);
+  return EXIT_SUCCESS;
+}
diff --git a/legacy/dsaX_xgpu.cu b/legacy/dsaX_xgpu.cu
new file mode 100644
index 0000000..d065848
--- /dev/null
+++ b/legacy/dsaX_xgpu.cu
@@ -0,0 +1,375 @@
+// -*- c++ -*-
+/* will run xgpu */
+/* assumes input block size is appropriate */
+#define THRUST_IGNORE_CUB_VERSION_CHECK
+
+#include <iostream>
+#include <algorithm>
+using std::cout;
+using std::cerr;
+using std::endl;
+#include <stdio.h>
+#include <stdlib.h>
+#include <cmath>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <time.h>
+#include <syslog.h>
+#include <pthread.h>
+
+#include <thrust/fill.h>
+#include <thrust/device_vector.h>
+#include <thrust/sequence.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+#include <thrust/scatter.h>
+
+//#include "dada_cuda.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "multilog.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_def.h"
+#include "cube/cube.h"
+#include "xgpu.h"
+
+/* global variables */
+int DEBUG = 0;
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
+int dada_bind_thread_to_core (int core);
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
+{
+  // Tear down both DADA connections: drop the read lock on `in` and the
+  // write lock on `out`, then call dada_hdu_destroy on each handle.  A
+  // failed unlock is only logged to syslog; teardown carries on regardless.
+  const int rd_status = dada_hdu_unlock_read (in);
+  if (rd_status < 0)
+    syslog(LOG_ERR, "could not unlock read on hdu_in");
+  dada_hdu_destroy (in);
+
+  const int wr_status = dada_hdu_unlock_write (out);
+  if (wr_status < 0)
+    syslog(LOG_ERR, "could not unlock write on hdu_out");
+  dada_hdu_destroy (out);
+
+}
+
+// kernel for fluffing: each thread splits one input byte into two output bytes
+// (low nibble -> output[2*idx], high nibble -> output[2*idx+1]).
+__global__ void promoter(char *input, char *output) {
+  // One byte per thread; the literal 32 assumes 32 threads per block (main launches 6291456 blocks of 32). idx is not bounds-checked.
+  int idx = blockIdx.x*32 + threadIdx.x;
+  char v = input[idx];
+  // Each nibble is placed in the top four bits of a char and shifted right by 4; presumably this sign-extends it (relies on char being signed with an arithmetic >> - TODO confirm).
+  //output[2*idx] = ((v<<4) & 240) >> 4;
+  //output[2*idx+1] = v >> 4;
+  output[2*idx] = (char)(((unsigned char)(v) & (unsigned char)(15)) << 4) >> 4;
+  output[2*idx+1] = (char)(((unsigned char)(v) & (unsigned char)(240))) >> 4;
+  
+}
+
+void usage() // print the dsaX_xgpu command-line synopsis on stdout
+{
+  fputs (
+	 "dsaX_xgpu [options]\n"
+	 " -c core   bind process to CPU core [no default]\n"
+	 " -d send debug messages to syslog\n"
+	 " -i in_key [default REORDER_BLOCK_KEY]\n"
+	 " -o out_key [default XGPU_BLOCK_KEY]\n"
+	 " -h print usage\n", stdout);
+}
+
+
+// MAIN
+
+int main (int argc, char *argv[]) {
+  // Copies the DADA header from the input to the output buffer, then for every input block expands the packed samples on the GPU, runs the xGPU X-engine and writes each integration to the output buffer.
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_xgpu", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+  
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+
+  // data block HDU keys
+  key_t in_key = REORDER_BLOCK_KEY;
+  key_t out_key = XGPU_BLOCK_KEY;
+  
+  // command line arguments
+  int core = -1;
+  int arg = 0;
+  
+  while ((arg=getopt(argc,argv,"c:i:o:dh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'i':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &in_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-i flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      else syslog(LOG_NOTICE,"bound to core %d", core);	// only claim success when the bind worked
+    }  
+  
+  // DADA stuff
+  
+  syslog (LOG_INFO, "creating in and out hdus");
+  
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");  
+  
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",(unsigned long)block_size,(unsigned long)block_out);  
+  uint64_t  bytes_read = 0;
+  char * block;
+  char * output_buffer;
+  output_buffer = (char *)malloc(sizeof(char)*block_out);
+  uint64_t written, block_id;  
+
+  
+  // set up xgpu
+
+  // register input hdu with gpu
+  //dada_cuda_dbregister(hdu_in);
+
+  // structures and definitions
+  XGPUInfo xgpu_info;
+  int syncOp = SYNCOP_DUMP;
+  int xgpu_error = 0;
+  xgpuInfo(&xgpu_info);
+  XGPUContext context;
+  context.array_h = NULL;
+  context.matrix_h = NULL;
+  xgpu_error = xgpuInit(&context, 0);
+  if(xgpu_error) {
+    syslog(LOG_ERR, "xGPU error %d", xgpu_error);
+    dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+    return EXIT_FAILURE;
+  }
+  ComplexInput *array_h = context.array_h; // this is pinned memory
+  Complex *cuda_matrix_h = context.matrix_h;
+  memset((char *)array_h,0,2*context.array_len);
+
+  syslog(LOG_INFO,"Set up xgpu with input size %lu output size %lu",(unsigned long)context.array_len,(unsigned long)context.matrix_len);
+
+  // set up data input for fluffing
+  char * h_din = (char *)malloc(sizeof(char)*context.array_len);
+  char *d_din, *d_dout;
+  cudaMalloc((void **)&d_din, context.array_len*sizeof(char));
+  cudaMalloc((void **)&d_dout, 2*context.array_len*sizeof(char)); 
+
+  // do prestart
+  syslog(LOG_INFO, "pre-starting...");
+  char * tmp_data = (char *)malloc(sizeof(char)*context.array_len);
+  memset(tmp_data, 1, context.array_len);
+  for (int i=0;i<10;i++) {
+
+    cudaMemcpy(d_din, tmp_data, context.array_len*sizeof(char),cudaMemcpyHostToDevice);
+    promoter<<<6291456,32>>>(d_din,d_dout);
+    //xgpu_error = xgpuCudaXengine(&context, (ComplexInput *)d_dout, syncOp);
+    xgpu_error = xgpuCudaXengine(&context, syncOp);
+    xgpuClearDeviceIntegrationBuffer(&context);
+
+  }
+
+  free(tmp_data);
+  syslog(LOG_INFO, "finished with pre-start");
+  
+  // get things started
+  bool observation_complete=0;
+  bool started = 0;
+  syslog(LOG_INFO, "starting observation");
+  int blocks = 0;
+  
+  while (!observation_complete) {
+    if (DEBUG) syslog(LOG_DEBUG,"reading block");    
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    if (!block) {
+      syslog(LOG_ERR, "main: could not open input data block");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+    for (int myint=0;myint<NPACKETS/NPACKETS_INTS;myint++) {
+    
+      // do fluff
+      cudaMemcpy(d_din,block+myint*block_size*NPACKETS_INTS/NPACKETS,context.array_len*sizeof(char),cudaMemcpyHostToDevice);
+      promoter<<<6291456,32>>>(d_din,d_dout);
+      cudaMemcpy((char *)(array_h),d_dout,2*context.array_len*sizeof(char),cudaMemcpyDeviceToHost);	// the two-argument xgpuCudaXengine below takes no device pointer; NOTE(review): assumes it reads context.array_h (stock xGPU API) - confirm
+      cudaDeviceSynchronize();
+    
+      // run xgpu
+      //xgpu_error = xgpuCudaXengine(&context, (ComplexInput *)d_dout, syncOp);
+      xgpu_error = xgpuCudaXengine(&context, syncOp);
+      if(xgpu_error) {
+	syslog(LOG_ERR, "xGPU error %d\n", xgpu_error);
+	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+	return EXIT_FAILURE;
+      }
+      if (started==0 && blocks==20) {
+	syslog(LOG_INFO,"now in RUN state");
+	if (DEBUG) {
+	  for (int i=100;i<200;i++) {
+	    syslog(LOG_DEBUG,"INPUT %hhi %hhi",array_h[i].real,array_h[i].imag);
+	    syslog(LOG_DEBUG,"OUTPUT %g %g",(float)(cuda_matrix_h[i].real),(float)(cuda_matrix_h[i].imag));
+	  }
+	}
+	started=1;
+      }    
+      
+      // clear device
+      xgpuClearDeviceIntegrationBuffer(&context);
+      
+      // write to output
+      
+      written = ipcio_write (hdu_out->data_block, (char *)(cuda_matrix_h), block_out);
+      if (written != block_out)	// `written` is unsigned, so a -1 error return would pass a `<` test
+	{
+	  syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+	  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+	  return EXIT_FAILURE;
+	}
+
+      if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks);	    
+      blocks++;
+
+    }
+      
+    // finish up
+    if (bytes_read < block_size)
+      observation_complete = 1;
+
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+    
+  }
+
+  // finish up
+  free(output_buffer);
+  free(h_din);
+  cudaFree(d_din);
+  cudaFree(d_dout);
+  //dada_cuda_dbunregister(hdu_in);
+  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+  return EXIT_SUCCESS;
+}
+
+
diff --git a/legacy/dumpfil.c b/legacy/dumpfil.c
new file mode 100644
index 0000000..0be913c
--- /dev/null
+++ b/legacy/dumpfil.c
@@ -0,0 +1,294 @@
+//E_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+
+// global variables
+int DEBUG = 0;
+
+void usage()	/* print the dumpfil command-line synopsis on stdout; defaults quoted here mirror the initialisers in main */
+{
+  fprintf (stdout,
+	   "dumpfil [options]\n"
+	   " -d send debug messages to syslog\n"
+	   " -p no header\n"
+	   " -f file to dump to [default /home/ubuntu/dumpfil.fil]\n"
+	   " -n blocks to dump [default 30]\n"
+	   " -i in_key [default 0x0000aaae]\n"
+	   " -g ignore first block\n"
+	   " -h print usage\n");
+}
+
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in);
+
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in)
+{
+  // Drop the read lock on `in` (a failure is only logged to syslog) and
+  // then hand the handle to dada_hdu_destroy.
+  const int rd_status = dada_hdu_unlock_read (in);
+  if (rd_status < 0)
+    syslog(LOG_ERR, "could not unlock read on hdu_in");
+  dada_hdu_destroy (in);
+  
+}
+
+FILE *output;
+
+void send_string(char *string) /* includefile */
+{
+  /* length-prefixed write to the global `output`: strlen as a native int, then the characters without the NUL */
+  int nchars=strlen(string);
+  fwrite(&nchars, sizeof nchars, 1, output);
+  fwrite(string, sizeof(char), nchars, output);
+}
+/* Write the keyword `name` with send_string, then the raw bytes of the float to `output`. */
+void send_float(char *name,float floating_point) /* includefile */
+{
+  send_string(name);
+  fwrite(&floating_point,sizeof floating_point,1,output);
+}
+/* Write the keyword `name` with send_string, then the raw bytes of the double to `output`. */
+void send_double (char *name, double double_precision) /* includefile */
+{
+  send_string(name);
+  fwrite(&double_precision,sizeof double_precision,1,output);
+}
+/* Write the keyword `name` with send_string, then the raw bytes of the int to `output`. */
+void send_int(char *name, int integer) /* includefile */
+{
+  send_string(name);
+  fwrite(&integer,sizeof integer,1,output);
+}
+/* Write the keyword `name` with send_string, then the single char value to `output`. */
+void send_char(char *name, char integer) /* includefile */
+{
+  send_string(name);
+  fwrite(&integer,sizeof integer,1,output);
+}
+
+/* Write the keyword `name` with send_string, then the long in the platform's native sizeof(long) bytes. */
+void send_long(char *name, long integer) /* includefile */
+{
+  send_string(name);
+  fwrite(&integer,sizeof integer,1,output);
+}
+/* Emit each coordinate keyword only when its value is neither 0.0 nor -1.0 (presumably the "unset" markers). The tests used `||`, which holds for every value. */
+void send_coords(double raj, double dej, double az, double za) /*includefile*/
+{
+  if ((raj != 0.0) && (raj != -1.0)) send_double("src_raj",raj);
+  if ((dej != 0.0) && (dej != -1.0)) send_double("src_dej",dej);
+  if ((az != 0.0)  && (az != -1.0))  send_double("az_start",az);
+  if ((za != 0.0)  && (za != -1.0))  send_double("za_start",za);
+}
+
+
+
+// MAIN
+
+int main (int argc, char *argv[]) {
+  // Dumps up to nbl blocks from the input DADA buffer into a file, optionally preceded by a fixed HEADER_START ... HEADER_END header.
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dumpfil", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+  
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+
+  // data block HDU keys
+  key_t in_key = 0x0000aaae;
+  
+  // command line arguments
+  char fnam[100];
+  sprintf(fnam,"/home/ubuntu/dumpfil.fil");
+  int nbl = 30;
+  int arg = 0;
+  int nhd = 0;
+  int igblock = 0;
+  
+  while ((arg=getopt(argc,argv,"f:i:n:pdgh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'i':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &in_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-i flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'f':
+	  if (optarg)
+	    {
+	      snprintf(fnam,sizeof(fnam),"%s",optarg);	// bounded copy: fnam holds only 100 bytes
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-f flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'n':
+	  if (optarg)
+	    {
+	      nbl = atoi(optarg);	      
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-n flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'p':
+	  nhd=1;
+	  syslog (LOG_INFO, "Will not write a header");
+	  break;
+	case 'g':
+	  igblock=1;
+	  syslog (LOG_INFO, "Will ignore first block");
+	  break;
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  syslog(LOG_INFO,"will use %d blocks",nbl);
+  
+  // DADA stuff
+  
+  syslog (LOG_INFO, "creating in hdus");
+  
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in);
+      return EXIT_FAILURE;
+    }
+
+  
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+  
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  syslog(LOG_INFO, "main: have input block size %lu\n",block_size);
+  uint64_t  bytes_read = 0;
+  uint64_t npackets = 1;
+  char * block, * output_buffer;
+  uint64_t written, block_id;
+
+  // fill output buffer if file exists
+  output=fopen(fnam,"wb");
+  if(output == NULL)
+    {
+      syslog(LOG_ERR,"Error opening file");
+      dsaX_dbgpu_cleanup (hdu_in);	// release the read lock instead of exiting with it held
+      return EXIT_FAILURE;
+    }
+  if (!nhd) {
+    send_string("HEADER_START");
+    send_string("source_name");
+    send_string("TESTSRC");
+    send_int("machine_id",1);
+    send_int("telescope_id",82);
+    send_int("data_type",1); // filterbank data
+    send_double("fch1",1530.0); // THIS IS CHANNEL 0 :)
+    send_double("foff",-0.244140625);
+    send_int("nchans",1024);
+    send_int("nbits",8);
+    send_double("tstart",55000.0);
+    send_double("tsamp",8.192e-6*8.*16.);
+    send_int("nifs",1);
+    send_string("HEADER_END");
+  }
+  
+  int observation_complete=0;
+  int blocks = 0, started = 0;
+  
+  syslog(LOG_INFO, "starting observation");
+
+
+  while (blocks < nbl) {
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    if (!block) {
+      syslog(LOG_ERR, "main: could not open input data block");
+      break;
+    }
+    if (!igblock || started!=0) {
+      if (fwrite(block, sizeof(char), bytes_read, output) != bytes_read) syslog(LOG_ERR, "main: short write to %s", fnam);
+      blocks++;
+    }
+    if (started==0) started=1;
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+  }
+
+  fclose(output);
+  dsaX_dbgpu_cleanup (hdu_in);
+  return EXIT_SUCCESS;
+}
diff --git a/legacy/fil2dada.c b/legacy/fil2dada.c
new file mode 100644
index 0000000..c49f2b5
--- /dev/null
+++ b/legacy/fil2dada.c
@@ -0,0 +1,521 @@
+//E_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+//#include "ascii_header.h"
+//#include "dsaX_capture.h"
+//#include "dsaX_def.h"
+
+// global variables
+int DEBUG = 0;
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
+int dada_bind_thread_to_core (int core);
+
+/* read fil file header variables */
+char rawdatafile[80], source_name[80];
+int machine_id, telescope_id, data_type, nchans, nbits, nifs, scan_number,
+  barycentric,pulsarcentric; /* these two added Aug 20, 2004 DRL */
+double tstart,mjdobs,tsamp,fch1,foff,refdm,az_start,za_start,src_raj,src_dej;
+double gal_l,gal_b,header_tobs,raw_fch1,raw_foff;
+int nbeams, ibeam;
+/* added 20 December 2000    JMC */
+double srcl,srcb;
+double ast0, lst0;
+long wapp_scan_number;
+char project[8];
+char culprits[24];
+double analog_power[2];
+/* added frequency table for use with non-contiguous data */
+double frequency_table[4096]; /* note limited number of channels */
+long int npuls; /* added for binary pulse profile format */
+
+
+int nbins;
+double period;
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
+{
+  /* Detach from both DADA buffers.  The read lock on `in` and the write
+     lock on `out` are released first; an unlock failure is reported via
+     syslog and dada_hdu_destroy is still called on the handle. */
+  const int rd_status = dada_hdu_unlock_read (in);
+  if (rd_status < 0)
+    syslog(LOG_ERR, "could not unlock read on hdu_in");
+  dada_hdu_destroy (in);
+
+  const int wr_status = dada_hdu_unlock_write (out);
+  if (wr_status < 0)
+    syslog(LOG_ERR, "could not unlock write on hdu_out");
+  dada_hdu_destroy (out);
+  
+}
+
+/*
+void get_string(FILE *inputfile, int *nbytes, char string[])
+{
+  int nchar;
+  size_t nRead;
+  strcpy(string,"ERROR");
+  nRead = fread(&nchar, sizeof(int), 1, inputfile);
+  if (feof(inputfile)) exit(0);
+  if (nchar>80 || nchar<1) return;
+  *nbytes=sizeof(int);
+  nRead = fread(string, nchar, 1, inputfile);
+  string[nchar]='\0';
+  *nbytes+=nchar;
+}
+*/
+
+int read_header(FILE *inputfile);
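+/* NOTE: disabled header reader. As written, every strcmp() test is inverted (strcmp returns 0 on a match) and the first `if` lacks its opening brace.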
+int read_header(FILE *inputfile)
+{
+  size_t nRead;
+  char string[80], message[80];
+  int itmp,nbytes,totalbytes,expecting_rawdatafile=0,expecting_source_name=0; 
+  int expecting_frequency_table=0,channel_index;
+
+
+
+  get_string(inputfile,&nbytes,string);
+  if (!strcmp(string,"HEADER_START")) 
+	rewind(inputfile);
+	return 0;
+  }
+  totalbytes=nbytes;
+
+  while (1) {
+    get_string(inputfile,&nbytes,string);
+    if (strcmp(string,"HEADER_END")) break;
+    totalbytes+=nbytes;
+    if (strcmp(string,"rawdatafile")) {
+      expecting_rawdatafile=1;
+    } else if (strcmp(string,"source_name")) {
+      expecting_source_name=1;
+    } else if (strcmp(string,"FREQUENCY_START")) {
+      expecting_frequency_table=1;
+      channel_index=0;
+    } else if (strcmp(string,"FREQUENCY_END")) {
+      expecting_frequency_table=0;
+    } else if (strcmp(string,"az_start")) {
+      nRead = fread(&az_start,sizeof(az_start),1,inputfile);
+      totalbytes+=sizeof(az_start);
+    } else if (strcmp(string,"za_start")) {
+      nRead = fread(&za_start,sizeof(za_start),1,inputfile);
+      totalbytes+=sizeof(za_start);
+    } else if (strcmp(string,"src_raj")) {
+      nRead = fread(&src_raj,sizeof(src_raj),1,inputfile);
+      totalbytes+=sizeof(src_raj);
+    } else if (strcmp(string,"src_dej")) {
+      nRead = fread(&src_dej,sizeof(src_dej),1,inputfile);
+      totalbytes+=sizeof(src_dej);
+    } else if (strcmp(string,"tstart")) {
+      nRead = fread(&tstart,sizeof(tstart),1,inputfile);
+      totalbytes+=sizeof(tstart);
+    } else if (strcmp(string,"tsamp")) {
+      nRead = fread(&tsamp,sizeof(tsamp),1,inputfile);
+      totalbytes+=sizeof(tsamp);
+    } else if (strcmp(string,"period")) {
+      nRead = fread(&period,sizeof(period),1,inputfile);
+      totalbytes+=sizeof(period);
+    } else if (strcmp(string,"fch1")) {
+      nRead = fread(&fch1,sizeof(fch1),1,inputfile);
+      totalbytes+=sizeof(fch1);
+    } else if (strcmp(string,"fchannel")) {
+      nRead = fread(&frequency_table[channel_index++],sizeof(double),1,inputfile);
+      totalbytes+=sizeof(double);
+      fch1=foff=0.0;
+    } else if (strcmp(string,"foff")) {
+      nRead = fread(&foff,sizeof(foff),1,inputfile);
+      totalbytes+=sizeof(foff);
+    } else if (strcmp(string,"nchans")) {
+      nRead = fread(&nchans,sizeof(nchans),1,inputfile);
+      totalbytes+=sizeof(nchans);
+    } else if (strcmp(string,"telescope_id")) {
+      nRead = fread(&telescope_id,sizeof(telescope_id),1,inputfile);
+      totalbytes+=sizeof(telescope_id);
+    } else if (strcmp(string,"machine_id")) {
+      nRead = fread(&machine_id,sizeof(machine_id),1,inputfile);
+      totalbytes+=sizeof(machine_id);
+    } else if (strcmp(string,"data_type")) {
+      nRead = fread(&data_type,sizeof(data_type),1,inputfile);
+      totalbytes+=sizeof(data_type);
+    } else if (strcmp(string,"ibeam")) {
+      nRead = fread(&ibeam,sizeof(ibeam),1,inputfile);
+      totalbytes+=sizeof(ibeam);
+    } else if (strcmp(string,"nbeams")) {
+      nRead = fread(&nbeams,sizeof(nbeams),1,inputfile);
+      totalbytes+=sizeof(nbeams);
+    } else if (strcmp(string,"nbits")) {
+      nRead = fread(&nbits,sizeof(nbits),1,inputfile);
+      totalbytes+=sizeof(nbits);
+    } else if (strcmp(string,"barycentric")) {
+      nRead = fread(&barycentric,sizeof(barycentric),1,inputfile);
+      totalbytes+=sizeof(barycentric);
+    } else if (strcmp(string,"pulsarcentric")) {
+      nRead = fread(&pulsarcentric,sizeof(pulsarcentric),1,inputfile);
+      totalbytes+=sizeof(pulsarcentric);
+    } else if (strcmp(string,"nbins")) {
+      nRead = fread(&nbins,sizeof(nbins),1,inputfile);
+      totalbytes+=sizeof(nbins);
+    } else if (strcmp(string,"nsamples")) {
+      nRead = fread(&itmp,sizeof(itmp),1,inputfile);
+      totalbytes+=sizeof(itmp);
+    } else if (strcmp(string,"nifs")) {
+      nRead = fread(&nifs,sizeof(nifs),1,inputfile);
+      totalbytes+=sizeof(nifs);
+    } else if (strcmp(string,"npuls")) {
+      nRead = fread(&npuls,sizeof(npuls),1,inputfile);
+      totalbytes+=sizeof(npuls);
+    } else if (strcmp(string,"refdm")) {
+      nRead = fread(&refdm,sizeof(refdm),1,inputfile);
+      totalbytes+=sizeof(refdm);
+    } else if (expecting_rawdatafile) {
+      strcpy(rawdatafile,string);
+      expecting_rawdatafile=0;
+    } else if (expecting_source_name) {
+      strcpy(source_name,string);
+      expecting_source_name=0;
+    } else {
+      sprintf(message,"read_header - unknown parameter: %s\n",string);
+      fprintf(stderr,"ERROR: %s\n",message);
+      exit(1);
+    } 
+  } 
+
+
+  totalbytes+=nbytes;
+
+  return totalbytes;
+}
+*/
+
+void usage()	/* print the option summary on stdout. NOTE(review): the banner says dsaX_fake although this file is fil2dada.c - confirm */
+{
+  printf (
+	   "dsaX_fake [options]\n"
+	   " -c core   bind process to CPU core [no default]\n"
+	   " -d send debug messages to syslog\n"
+	   " -f file to read packet from [default none]\n"
+	   " -i in_key [default TEST_BLOCK_KEY]\n"
+	   " -o out_key [default REORDER_BLOCK_KEY2]\n"
+	   " -n will not read header\n"
+	   " -b number of blocks to stop after\n"
+	   " -h print usage\n");
+}
+
+// MAIN
+
+int main (int argc, char *argv[]) {
+  // For every block opened on the input DADA buffer, writes one block_out-sized packet to the output buffer: file contents when -f names a readable file (wrapping at EOF), zeros otherwise.
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_fake", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+  
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+
+  // data block HDU keys
+  key_t in_key = 0x0000dada;
+  key_t out_key = 0x0000caca;
+  
+  // command line arguments
+  int core = -1;
+  int useZ = 1;
+  char fnam[100];
+  int arg = 0;
+  int rhead = 1;
+  int nblocks = -1;
+  
+  while ((arg=getopt(argc,argv,"c:f:i:o:nb:dh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'i':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &in_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-i flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'f':
+	  if (optarg)
+	    {
+	      useZ = 0;
+	      snprintf(fnam,sizeof(fnam),"%s",optarg);	// bounded copy: fnam holds only 100 bytes
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-f flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'b':
+	  if (optarg)
+	    {
+	      nblocks = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-b flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+	case 'n':
+	  rhead=0;
+	  syslog (LOG_INFO, "Will not read header");
+	  break;
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      else syslog(LOG_NOTICE,"bound to core %d", core);	// only claim success when the bind worked
+    }
+
+  
+  // DADA stuff
+  
+  syslog (LOG_INFO, "creating in and out hdus");
+  
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+  
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+  
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
+  uint64_t  bytes_read = 0;
+  uint64_t npackets = 1;
+  char * block, * output_buffer;
+  uint64_t written, block_id;
+  char * packet = (char *)calloc(block_out, sizeof(char));	// zero-filled and sized for the block_out-byte reads and writes below (was block_size)
+  output_buffer = (char *)calloc(block_out, sizeof(char));
+  if (!packet || !output_buffer) {
+    syslog(LOG_ERR, "main: could not allocate packet buffers");
+    dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+    return EXIT_FAILURE;
+  }
+  // fill output buffer if file exists
+  FILE *fin = NULL;	// stays NULL when no -f was given or the file cannot be opened
+  if (!useZ) {
+
+    if (!(fin=fopen(fnam,"rb"))) {
+      syslog(LOG_ERR, "cannot open file - will write zeros");
+    }
+    else {
+
+      // DMH: FIXME
+      //if (rhead) read_header(fin);
+      
+      //		fread(packet,block_out,1,fin);
+      //		fclose(fin);
+      
+      //		syslog(LOG_INFO,"Read packet, npackets %llu",npackets);
+      
+      //      for (int i=0;i<npackets;i++)
+      //		memcpy(output_buffer,packet,block_out);
+      
+      //		syslog(LOG_INFO, "Using input packet");
+      
+    }
+  }
+
+  // set up
+
+  int observation_complete=0;
+  int blocks = 0, started = 0;
+  
+  syslog(LOG_INFO, "starting observation");
+  
+  /*if (!(feof(fin)) {
+    fread()
+	}
+	else {
+		close and reopen file
+	}
+*/
+
+  while (!observation_complete) {
+    // Refill the packet from the file; on a short read wrap to the start and retry once.
+    // With no readable file the zero-filled packet is sent unchanged.
+    if (fin && fread(packet,block_out,1,fin) != 1) {
+      rewind(fin);
+      // DMH: FIXME
+      //if (rhead) read_header(fin);
+      if (fread(packet,block_out,1,fin) != 1)
+	memset(packet,0,block_out);	// file holds less than one block: send zeros, not a partial read
+    }
+
+
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+    }
+
+    // DO STUFF
+    // no need to do anything here - output_buffer is ready to go
+
+	// fread goes here
+	// count blocks, increment, stop loop and reopen file (or rewind)
+
+    // write to output
+    written = ipcio_write (hdu_out->data_block, packet, block_out);
+    if (written != block_out)	// `written` is unsigned, so a -1 error return would pass a `<` test
+      {
+		syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+		dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+		return EXIT_FAILURE;
+      }
+
+    if (DEBUG) {
+      syslog(LOG_DEBUG, "written block %d",blocks);      
+    }
+    blocks++;
+
+    if (blocks==nblocks)
+      observation_complete = 1;
+
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+
+  }
+
+  if (fin) fclose(fin);
+  free(packet); free(output_buffer);
+  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+  return EXIT_SUCCESS;
+}
diff --git a/legacy/flagger.c b/legacy/flagger.c
new file mode 100644
index 0000000..5262015
--- /dev/null
+++ b/legacy/flagger.c
@@ -0,0 +1,484 @@
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+
+#define NTIMES_P 4096	// # of time samples (assuming 1ms sampling period)
+#define NCHAN_P 1024	// # of channels on BF node side
+#define NBEAMS_P 64	// # of beams on BF side
+#define M_P NTIMES_P
+#define N_P 32
+#define HDR_SIZE 4096
+#define BUF_SIZE NTIMES_P*NCHAN_P*NBEAMS_P // size of TCP packet
+
+// global variables
+int DEBUG = 0;
+double skarray[NBEAMS_P*NCHAN_P+1];	// array with SK values -- size NCHANS * NBEAMS
+double avgspec[NBEAMS_P*NCHAN_P+1];	// spectrum over all beams to estimate median filter
+double baselinecorrec[NBEAMS_P*NCHAN_P+1];	// spectrum over all beams to estimate median filter
+int cores[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25};
+
+void swap(char *p,char *q) {
+   char t;
+   
+   t=*p; 
+   *p=*q; 
+   *q=t;
+}
+
+// Median of a[0..n-1] (lower middle when n is even); requires n >= 1, a is left untouched.
+double medval(double a[],int n) {
+	int i,j;
+	double tmp[n], t;	// sort a double copy: a char copy would truncate every sample
+	for (i = 0;i < n;i++)
+		tmp[i] = a[i];
+	// bubble sort ascending
+	for(i = 0;i < n-1;i++) {
+		for(j = 0;j < n-i-1;j++) {
+			if(tmp[j] > tmp[j+1]) { t = tmp[j]; tmp[j] = tmp[j+1]; tmp[j+1] = t; }
+		}
+	}
+	return tmp[(n+1)/2-1];
+}
+
+/* THREAD FUNCTION */
+
+struct data {
+	unsigned char * indata;	// start of this thread's slice of the input block (modified in place)
+	double * inSK;	// start of this thread's slice of the SK array
+  unsigned char * output;	// lookup table of replacement samples (main passes lookup_rand)
+  int * cnt;	// per-thread flag counters indexed by thread_id; a plain int would truncate the pointer
+	double nThreshUp;	// SK threshold above which a channel is replaced
+	int n_threads;	// total number of worker threads
+	int thread_id;	// index of this worker
+	int debug;	// non-zero enables LOG_DEBUG messages
+};
+
+void *noise_inject(void *args) {
+	// pthread start routine; args points to the struct data describing this worker's slice
+	struct data *d = args;
+	int thread_id = d->thread_id;
+	int dbg = d->debug;
+	// set affinity
+	const pthread_t pid = pthread_self();
+	const int core_id = cores[thread_id];
+	cpu_set_t cpuset;
+	CPU_ZERO(&cpuset);
+	CPU_SET(core_id, &cpuset);
+	const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+	if (set_result != 0)
+		syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
+	const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+	if (get_affinity != 0) 
+		syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
+	if (CPU_ISSET(core_id, &cpuset))
+	  if (dbg) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id);
+	// For every channel of this thread's NBEAMS_P/nthreads beams whose SK value exceeds
+	// nThreshUp, overwrite all NTIMES_P samples in place from the lookup table and count it.
+	// noise injection
+	
+	unsigned char *indata = (unsigned char *)d->indata;
+	double *inSK = (double *)d->inSK;
+	unsigned char *output = (unsigned char *)d->output;
+	int * cnt = (int *)d->cnt;
+	double nThreshUp = (double)d->nThreshUp;
+	int nthreads = d->n_threads;
+	int i, j, k;
+	
+	// copy from input to output
+	//memcpy(output,indata,(NBEAMS_P/nthreads)*NTIMES_P*NCHAN_P);
+	
+	//cnt[thread_id] = 0;
+	
+	for (i = 0; i < (int)(NBEAMS_P/nthreads); i++){
+	  for (k = 0; k < NCHAN_P; k++){
+	    if (inSK[i*(int)(NCHAN_P) + k] > nThreshUp){
+	      cnt[thread_id]++;
+	      //if (dbg) syslog(LOG_DEBUG,"thread %d: flagging %d %d: sk %g",thread_id,i,k,inSK[i*(int)(NCHAN_P) + k]);
+	      //for (j = 0; j < NTIMES_P; j++){
+		//output[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = (unsigned char)(20. * rand() / ( (double)RAND_MAX ) + 10.);
+		//indata[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = (unsigned char)(20. * 1. / ( (double)RAND_MAX ) + 10.);
+	      //}
+
+	      // copy from lookup table
+	      for (j = 0; j < NTIMES_P; j++)
+		indata[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = output[k*NTIMES_P+j];
+	      
+	    }
+	    /*else{
+	      for (j = 0; j < NTIMES_P; j++){
+	      output[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = indata[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k];
+	      }
+	      }*/
+	  }
+	}
+	
+	
+	
+	if (dbg) syslog(LOG_DEBUG,"thread %d: done - freeing",thread_id);
+	// exit with NULL: the address of a local would dangle once this thread's stack is gone
+	pthread_exit(NULL);
+}
+
+/* END THREAD FUNCTION */
+
+void usage()
+{
+  fprintf (stdout,
+	   "flagger [options]\n"
+	   " -c core   bind process to CPU core [no default]\n"
+	   " -d send debug messages to syslog\n"
+	   " -i in_key [default dada]\n"
+	   " -o out_key [default caca]\n"
+	   " -n use noise generation rather than zeros\n"
+	   " -t SK threshold [default 5.0]\n"
+	   " -b compute and apply baseline correction\n"
+	   " -h print usage\n");
+}
+
+
+int main(int argc, char**argv)
+{
+
+  // syslog start
+  openlog ("flagger", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+  
+  // threads initialization
+  int nthreads = 16;
+  pthread_t threads[nthreads];
+  pthread_attr_t attr;
+  pthread_attr_init(&attr);
+  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+  void* result=0;
+  
+  // read command line args
+
+  // data block HDU keys
+  key_t in_key = 0x0000dada;
+  key_t out_key = 0x0000caca;
+  
+  // command line arguments
+  int core = -1;
+  int arg = 0;
+  int noise = 0;
+  double skthresh = 5.0;
+  int bcorr = 0;
+  
+  while ((arg=getopt(argc,argv,"c:t:i:o:bndh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'i':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &in_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-i flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 't':
+	  if (optarg)
+	    {
+	      skthresh = atof(optarg);
+	      syslog(LOG_INFO,"modified SKTHRESH to %g",skthresh);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-t flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+	case 'n':
+	  noise=1;
+	  syslog (LOG_INFO, "Will generate noise samples");
+	  break;	  
+	case 'b':
+	  bcorr=1;
+	  syslog (LOG_INFO, "Will calculate and apply baseline correction");
+	  break;	  
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+  
+  
+  // CONNECT AND READ FROM BUFFER
+
+  dada_hdu_t* hdu_in = 0;	// header and data unit
+  uint64_t blocksize = NTIMES_P*NCHAN_P*NBEAMS_P;	// size of buffer
+  hdu_in  = dada_hdu_create ();
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to input buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to input buffer");
+    return EXIT_FAILURE;
+  }
+  
+  uint64_t header_size = 0;
+  // read the header from the input HDU
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  
+  // mark the input header as cleared
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0){
+    syslog (LOG_ERR,"could not mark header as cleared");
+    return EXIT_FAILURE;
+  }
+  
+  uint64_t block_id, bytes_read = 0;
+  unsigned char *in_data;
+  char *cin_data;
+	     	
+  // OUTPUT BUFFER
+  dada_hdu_t* hdu_out = 0;
+  hdu_out  = dada_hdu_create ();
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"flagged_data: could not connect to dada buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write (hdu_out) < 0) {
+    syslog (LOG_ERR,"flagged_data: could not lock to dada buffer");
+    return EXIT_FAILURE;
+  }
+	
+  /* //read fake header for now
+	char head_dada[4096];
+	FILE *f = fopen("/home/dsa/dsa110-xengine/src/correlator_header_dsaX.txt", "rb");
+	fread(head_dada, sizeof(char), 4096, f);
+	fclose(f); */
+  
+  //// OUTPUT BUFFER
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  header_size = HDR_SIZE;
+  if (!header_out)
+    {
+      syslog(LOG_ERR,"couldn't read header_out");
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      return EXIT_FAILURE;
+    }
+  uint64_t written=0;
+  
+  ////////////////		
+	
+  double S1 = 0;
+  double S2 = 0;
+  double sampval;
+  double nThreshUp = skthresh;	// Threshold to apply to SK (empirical estimation)
+  struct data args[16];
+  int * flag_counts = (int *)malloc(sizeof(int)*nthreads);
+  //unsigned char * output = (unsigned char *)malloc(sizeof(char)*NBEAMS_P*NCHAN_P*NTIMES_P);
+  int nFiltSize = 21;
+  int cnt = 0;
+
+  // make array of random numbers
+  unsigned char * lookup_rand = (unsigned char *)malloc(sizeof(unsigned char)*NTIMES_P*NCHAN_P);
+  for (int i=0;i<NTIMES_P*NCHAN_P;i++) 
+    lookup_rand[i] = (unsigned char)(20. * rand() / ( (double)RAND_MAX ) + 10.);
+  
+  // put rest of the code inside while loop
+  while (1) {	
+    
+    // read a DADA block
+    cin_data = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+    in_data = (unsigned char *)(cin_data);
+    
+    // compute SK and averaged spectrum
+    S1 = 0;
+    S2 = 0;
+    sampval = 0;
+		
+    for (int i = 0; i < NBEAMS_P; i++){
+      for (int k = 0; k < NCHAN_P; k++){
+	for (int j = 0; j < NTIMES_P; j++){
+	  sampval = (double)in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k];
+	  avgspec[i*(int)(NCHAN_P) + k] += sampval / NTIMES_P;
+	  S1 += sampval;
+	  S2 += sampval * sampval;
+	  skarray[i*(int)(NCHAN_P) + k] = (double)((M_P*N_P+1) / (M_P-1) * ( (M_P*S2)/(S1*S1) - 1 ));
+	}
+	S1 = 0;
+	S2 = 0;
+      }
+    }
+    if (DEBUG) syslog (LOG_DEBUG,"has computed SK.");
+    if (DEBUG) syslog(LOG_DEBUG,"example SK value : %g", (double)skarray[10]);
+		
+    // compute baseline correction
+    if (bcorr) {
+      for (int i = 0; i < NBEAMS_P*NCHAN_P-nFiltSize; i++)
+	baselinecorrec[i] = medval(&avgspec[i],nFiltSize);
+    }
+    		
+    
+    // compare SK values to threshold and
+    // replace thresholded channels with noise or 0
+    
+    if (noise){
+
+      for (int i=0;i<nthreads;i++) flag_counts[i] = 0;
+      for (int i=0; i<nthreads; i++) {
+	args[i].indata = in_data + i*(int)((NBEAMS_P/nthreads)*NCHAN_P*NTIMES_P);
+	args[i].inSK = skarray + i*(int)(NBEAMS_P/nthreads*NCHAN_P);
+	args[i].output = lookup_rand;
+	args[i].cnt = flag_counts;
+	args[i].nThreshUp = nThreshUp;
+	args[i].n_threads = nthreads;
+	args[i].thread_id = i;
+	args[i].debug = DEBUG;
+      }
+      if (DEBUG) syslog(LOG_DEBUG,"creating %d threads",nthreads);
+      for(int i=0; i<nthreads; i++){
+	if (pthread_create(&threads[i], &attr, &noise_inject, (void *)(&args[i]))) {
+	  syslog(LOG_ERR,"Failed to create noise_inject thread %d\n", i);
+	}
+      }
+      /*for(int i=0; i<nthreads; i++){
+	for(int j=0; j<(int)(NBEAMS_P/nthreads*NCHAN_P*NTIMES_P); i++){
+	  in_data[i*(int)(NBEAMS_P/nthreads*NCHAN_P*NTIMES_P)+j] = args[i].output[j];
+	}
+	}*/
+      pthread_attr_destroy(&attr);
+
+      for(int i=0; i<nthreads; i++){
+	pthread_join(threads[i], &result);
+	if (DEBUG) syslog(LOG_DEBUG,"joined thread %d",i);
+      }
+
+      cnt = 0;
+      for(int i=0; i<nthreads; i++) cnt += flag_counts[i];
+      //memcpy(in_data,output,sizeof(in_data));
+    }
+    else{
+      for (int i = 0; i < NBEAMS_P; i++){
+	for (int k = 0; k < NCHAN_P; k++){
+	  if (skarray[i*(int)(NCHAN_P) + k] > nThreshUp){
+	    cnt++;
+	    for (int j = 0; j < NTIMES_P; j++){
+	      in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = 0;
+	    }
+	  }
+	}
+      }
+    }
+    syslog (LOG_INFO,"%d channels*baselines flagged",cnt);
+		
+    // apply baseline correction
+    if (bcorr) {
+      for (int i = 0; i < NBEAMS_P; i++){
+	for (int k = 0; k < NCHAN_P; k++){
+	  for (int j = 0; j < NTIMES_P; j++){
+	    //in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = (unsigned char)(in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] / (unsigned char)baselinecorrec[i*(int)NCHAN_P+k]);
+	    in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = (unsigned char)((double)(in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k]) / baselinecorrec[i*(int)NCHAN_P+k]);
+	  }
+	}
+      }
+      
+      syslog (LOG_DEBUG,"baseline correction applied");
+    }
+		
+    // close block after reading
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+    if (DEBUG) syslog(LOG_DEBUG,"closed read block");		
+    
+    written = ipcio_write (hdu_out->data_block, (char *)(in_data), BUF_SIZE);
+    if (written < BUF_SIZE)
+      {
+	syslog(LOG_ERR,"write error");
+	return EXIT_FAILURE;
+      }
+
+    if (DEBUG) syslog (LOG_DEBUG,"write flagged data done.");
+		
+    
+  }
+
+  free(lookup_rand);
+  return 0;    
+} 
diff --git a/legacy/gpu_flagger.cu b/legacy/gpu_flagger.cu
new file mode 100644
index 0000000..07e6f5c
--- /dev/null
+++ b/legacy/gpu_flagger.cu
@@ -0,0 +1,1547 @@
+// -*- c++ -*-
+/*#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <thread>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+*/
+#include <iostream>
+#include <algorithm>
+using std::cout;
+using std::cerr;
+using std::endl;
+#include <thread>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <time.h>
+#include <arpa/inet.h>
+#include <sys/syscall.h>
+#include <syslog.h>
+#include <curand.h>
+#include <curand_kernel.h>
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "multilog.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+
+#include <src/sigproc.h>
+#include <src/header.h>
+
+
+#define NTIMES_P 16384  // # of time samples (assuming 1ms sampling period)
+#define NCHAN_P 1024	// # of channels on BF node side
+#define NBEAMS_P 64	// # of beams on BF side
+#define M_P NTIMES_P
+#define N_P 32
+#define HDR_SIZE 4096
+#define BUF_SIZE NTIMES_P*NCHAN_P*NBEAMS_P // size of TCP packet
+#define NTHREADS_GPU 32
+#define MN 48.0
+#define SIG 6.0
+#define RMAX 16384
+//#define NPERMFLAGS 58
+#define NPERMFLAGS 1
+#define TBIN 128
+#define FBIN 8
+
+// global variables
+int DEBUG = 0;
+//int flagchannels[58] = {737,738,753,754,721,722,723,724,725,726,727,728,729,627,628,629,630,631,632,633,634,603,604,605,606,607,608,609,610,578,579,580,581,582,583,584,585,590,591,592,593,594,595,596,597,598,680,681,682,683,684,685,686,687,688,327,328,329};
+int flagchannels[1] = {10};
+/* global variables */
+int quit_threads = 0;
+int dump_pending = 0;
+int trignum = 0;
+char iP[100];
+char footer_buf[1024];
+char flnam[1024];
+int dumpbm;
+
+// structure for pulse injection
+typedef struct {
+
+  int verbose;
+  float * block;
+
+} dsaX_pulse_t;
+
+
+
+
+// kernel to calculate median spectrum
+// only works on <NTHREADS_GPU/2 in median
+__global__
+void fix_zspec(float * s0, float * v0, int naver) {
+
+  int block_id = blockIdx.x;
+  int thread_id = threadIdx.x;
+  int tid;
+  int ct_lt = 0;
+  
+  // sorted place
+  int place = (int)(naver/2);
+
+  // copy into shared memory
+  extern __shared__ float vec[];
+
+  // for mean spec
+  if (thread_id<naver) {
+
+    tid=thread_id;
+    vec[thread_id] = s0[tid*NBEAMS_P*NCHAN_P + block_id];
+
+  }
+
+  // for var spec
+  if (thread_id>=naver && thread_id<2*naver) {
+
+    tid=thread_id-naver;
+    vec[thread_id] = v0[tid*NBEAMS_P*NCHAN_P + block_id];
+
+  }
+
+  __syncthreads();
+
+  if (thread_id<naver) {   
+    for (int i=0;i<naver;i++) {
+      if (i!=thread_id) {
+	if (vec[i]<=vec[thread_id]) ct_lt++;
+      }
+    }
+  }
+
+  if (thread_id>=naver && thread_id<2*naver) {   
+    for (int i=naver;i<2*naver;i++) {
+      if (i!=thread_id) {
+	if (vec[i]<=vec[thread_id]) ct_lt++;
+      }
+    }
+  }
+
+  __syncthreads();
+
+
+  if (thread_id<naver) 
+    if (ct_lt==place) s0[block_id] = vec[thread_id];
+  
+  if (thread_id>=naver && thread_id<2*naver)
+    if (ct_lt==place) v0[block_id] = vec[thread_id];
+  
+}
+
+// kernel to calculate mean spectrum
+// launch with NBEAMS_P*NCHAN_P blocks of NTHREADS_GPU threads 
+__global__
+void calc_spectrum(unsigned char *data, float * spectrum) {
+  // Mean spectrum: spectrum[bm*NCHAN_P+ch] is the average of data over all NTIMES_P samples.
+  //   data     : input samples, addressed as bm*NTIMES_P*NCHAN_P + tm*NCHAN_P + ch
+  //   spectrum : output, one float per (beam, channel) pair
+  // Each block handles the (beam, channel) pair given by blockIdx.x and must be
+  // launched with exactly NTHREADS_GPU threads (a power of two).
+
+  int block_id = blockIdx.x;
+  int thread_id = threadIdx.x;
+  __shared__ float csum[NTHREADS_GPU];
+  csum[thread_id] = 0.;
+
+  int bm =(int)( block_id/NCHAN_P);
+  int ch = (int)(block_id % (NCHAN_P));
+  int tm0 = (int)(thread_id*(NTIMES_P/NTHREADS_GPU));
+  
+  // find sum of local times
+  int idx0 = bm*NTIMES_P*NCHAN_P + tm0*NCHAN_P + ch;
+  for (int tm=0; tm<NTIMES_P/NTHREADS_GPU; tm++) {    
+    csum[thread_id] += (float)(data[idx0]);
+    idx0 += NCHAN_P;
+  }
+
+  // every partial sum must be in shared memory before the reduction reads it
+  __syncthreads();
+  
+  // Tree reduction of the partial sums into csum[0], halving the number of
+  // active threads at each level.
+  //
+  // The barrier is placed outside the conditional: __syncthreads() must be
+  // reached by every thread of the block, so calling it inside an
+  // "if (thread_id<16)" branch is undefined behaviour. It also keeps one
+  // level's writes from racing with the next level's reads on GPUs where the
+  // threads of a warp are not guaranteed to run in lockstep.
+  for (int stride = NTHREADS_GPU/2; stride > 0; stride /= 2) {
+
+    if (thread_id < stride)
+      csum[thread_id] += csum[thread_id+stride];
+
+    __syncthreads();
+
+  }
+  
+  // thread 0 now holds the block total; convert it to a mean
+  if (thread_id==0) {    
+    spectrum[bm*NCHAN_P+ch] = csum[thread_id] / (1.*NTIMES_P);
+  }
+
+}
+
+
+// kernel to calculate variance spectrum
+// launch with NBEAMS_P*NCHAN_P blocks of NTHREADS_GPU threads 
+__global__
+void calc_varspec(unsigned char *data, float * spectrum, float * varspec) {
+  // Variance spectrum: varspec[bm*NCHAN_P+ch] is the mean squared deviation of the
+  // samples from spectrum[bm*NCHAN_P+ch], taken over all NTIMES_P samples.
+  //   data     : input samples, addressed as bm*NTIMES_P*NCHAN_P + tm*NCHAN_P + ch
+  //   spectrum : per-(beam, channel) value subtracted from every sample
+  //   varspec  : output, one float per (beam, channel) pair
+  // Each block handles the pair given by blockIdx.x and must be launched with
+  // exactly NTHREADS_GPU threads (a power of two).
+
+  int block_id = blockIdx.x;
+  int thread_id = threadIdx.x;
+  __shared__ float csum[NTHREADS_GPU];
+  csum[thread_id] = 0.;
+
+  int bm =(int)( block_id/NCHAN_P);
+  int ch = (int)(block_id % (NCHAN_P));
+  int tm0 = (int)(thread_id*(NTIMES_P/NTHREADS_GPU));
+  float val;
+  
+  // find sum of squared deviations over local times
+  int idx0 = bm*NTIMES_P*NCHAN_P + tm0*NCHAN_P + ch;
+  for (int tm=0; tm<NTIMES_P/NTHREADS_GPU; tm++) {    
+    val = (float)(data[idx0]) - spectrum[bm*NCHAN_P + ch];
+    csum[thread_id] += val*val;
+    idx0 += NCHAN_P;
+  }
+  
+  // every partial sum must be in shared memory before the reduction reads it
+  __syncthreads();
+  
+  // Tree reduction of the partial sums into csum[0], halving the number of
+  // active threads at each level.
+  //
+  // The barrier is placed outside the conditional: __syncthreads() must be
+  // reached by every thread of the block, so calling it inside an
+  // "if (thread_id<16)" branch is undefined behaviour. It also keeps one
+  // level's writes from racing with the next level's reads on GPUs where the
+  // threads of a warp are not guaranteed to run in lockstep.
+  for (int stride = NTHREADS_GPU/2; stride > 0; stride /= 2) {
+    if (thread_id < stride)
+      csum[thread_id] += csum[thread_id+stride];
+    __syncthreads();
+  }
+
+  // thread 0 now holds the block total; convert it to a mean
+  if (thread_id==0) {    
+    varspec[bm*NCHAN_P+ch] = csum[thread_id] / (1.*NTIMES_P);
+  }
+
+}
+
+// kernel to calculate maximum value
+// launch with NBEAMS_P*NCHAN_P blocks of NTHREADS_GPU threads 
+__global__
+void calc_maxspec(unsigned char *data, float * maxspec) {
+  // Maximum spectrum: maxspec[bm*NCHAN_P+ch] is the largest sample of that (beam, channel)
+  // pair; data is addressed as bm*NTIMES_P*NCHAN_P + tm*NCHAN_P + ch.
+  // One block per pair, launched with exactly NTHREADS_GPU threads (a power of two).
+  int block_id = blockIdx.x;
+  int thread_id = threadIdx.x;
+  __shared__ float csum[NTHREADS_GPU];
+  csum[thread_id] = 0.;
+
+  int bm =(int)( block_id/NCHAN_P);
+  int ch = (int)(block_id % (NCHAN_P));
+  int tm0 = (int)(thread_id*(NTIMES_P/NTHREADS_GPU));
+  float val=0.;
+  
+  // find max of local times
+  int idx0 = bm*NTIMES_P*NCHAN_P + tm0*NCHAN_P + ch;
+  for (int i=idx0;i<idx0+NCHAN_P*(NTIMES_P/NTHREADS_GPU);i+=NCHAN_P) {
+    if ((float)(data[i])>val) val = (float)(data[i]);
+  }
+  csum[thread_id] = val;
+  
+  __syncthreads();
+  
+  // Max-reduce the per-thread maxima into csum[0]. The block-wide barrier after
+  // every level orders that level's writes before the next level's reads;
+  // without it the result is only correct if the warp happens to run in lockstep.
+  for (int stride = NTHREADS_GPU/2; stride > 0; stride /= 2) {
+    if (thread_id < stride && csum[thread_id] < csum[thread_id+stride])
+      csum[thread_id] = csum[thread_id+stride];
+    __syncthreads();
+  }
+
+  if (thread_id==0) {    
+    maxspec[bm*NCHAN_P+ch] = csum[thread_id];
+  }
+
+}
+
+// kernel to calculate p-p spec with binning (default 128)
+// launch with NBEAMS_P*NCHAN_P blocks of NTHREADS_GPU threads 
+__global__
+void calc_ppspec(unsigned char *data, float * ppspec) {
+  // Peak-to-peak spectrum. Every thread averages its samples in bins of TBIN and keeps
+  // its largest bin mean; ppspec[bm*NCHAN_P+ch] is the spread (largest minus smallest)
+  // of those per-thread values. data is addressed as bm*NTIMES_P*NCHAN_P + tm*NCHAN_P + ch.
+  // One block per (beam, channel) pair, launched with exactly NTHREADS_GPU threads.
+
+  int block_id = blockIdx.x;
+  int thread_id = threadIdx.x;
+  // separate buffers: reducing the maximum in place would overwrite the values
+  // that the minimum reduction still needs
+  __shared__ float cmax[NTHREADS_GPU];
+  __shared__ float cmin[NTHREADS_GPU];
+
+  int bm =(int)( block_id/NCHAN_P);
+  int ch = (int)(block_id % (NCHAN_P));
+  int tm0 = (int)(thread_id*(NTIMES_P/NTHREADS_GPU));
+  float val=0.;
+
+  // local times start at tm0
+  float vv;
+  int idx0;
+  
+  // find max of the TBIN-averaged local times
+  for (int j=0;j<(NTIMES_P/NTHREADS_GPU)/TBIN;j++) {
+    idx0=bm*NTIMES_P*NCHAN_P + (tm0+j*TBIN)*NCHAN_P + ch;
+    vv = 0.;    
+    for (int i=idx0;i<idx0+NCHAN_P*TBIN;i+=NCHAN_P) 
+      vv += (float)(data[i]);
+    vv /= (1.*TBIN);      
+    if (vv>val) val = vv;
+  }
+  cmax[thread_id] = val;
+  cmin[thread_id] = val;
+  
+  __syncthreads();
+  
+  // Reduce both extremes together: cmax[0] ends up as the largest and cmin[0]
+  // as the smallest per-thread value. The barrier after each level is reached
+  // by every thread and orders that level's writes before the next level's reads.
+  for (int stride = NTHREADS_GPU/2; stride > 0; stride /= 2) {
+    if (thread_id < stride) {
+      if (cmax[thread_id] < cmax[thread_id+stride])
+	cmax[thread_id] = cmax[thread_id+stride];
+      if (cmin[thread_id] > cmin[thread_id+stride])
+	cmin[thread_id] = cmin[thread_id+stride];
+    }
+    __syncthreads();
+  }
+
+  if (thread_id==0)
+    ppspec[bm*NCHAN_P+ch] = cmax[0]-cmin[0];
+
+}
+
+
+// kernel to scale data
+// launch with NBEAMS_P*NTIMES_P*NCHAN_P/NTHREADS_GPU blocks of NTHREADS_GPU threads
+__global__
+void scaley(unsigned char *data, float *spectrum, float *varspec) {
+  // Rescales each sample in place: subtract spectrum[spidx], multiply by SIG/sqrt(varspec[spidx]), add MN.
+  // One element per thread; presumably spectrum/varspec hold the channel mean and variance - confirm with caller.
+  int idx = blockIdx.x*NTHREADS_GPU + threadIdx.x;
+  int bm = (int)(idx / (NTIMES_P*NCHAN_P));
+  int ch = (int)(idx % NCHAN_P);
+  int spidx = bm*NCHAN_P+ch;
+
+  float val = (float)(data[idx]);
+  val = (val-spectrum[spidx])*(SIG/sqrtf(varspec[spidx])) + MN;
+  if (val > 255.) val = 255.; // saturate: a larger value would wrap around in the unsigned char cast
+  data[idx] = (unsigned char)((__float2uint_rn(2.*val))/2);
+}
+
+// kernel to add pulse to data
+// launch with NBEAMS_P*NTIMES_P*NCHAN_P/NTHREADS_GPU blocks of NTHREADS_GPU threads
+__global__
+void sumpulse(unsigned char *data, float *summand) {
+  // Adds summand[idx] to data[idx] in place, one element per thread (idx = blockIdx.x*NTHREADS_GPU + threadIdx.x)
+  int idx = blockIdx.x*NTHREADS_GPU + threadIdx.x;
+  float val = (float)(data[idx]);
+  val += summand[idx];
+  if (val > 255.) val = 255.; // saturate: a larger value would wrap around in the unsigned char cast
+  data[idx] = (unsigned char)((__float2uint_rn(2.*val))/2);
+}
+
+
+
+
+// kernel to make time series from data
+// run with NBEAMS_P*NTIMES_P blocks of 32 threads
+__global__
+void make_ts(unsigned char *data, float *ts) {
+  // Time series: ts[bm*NTIMES_P+tm] is the mean of data over all NCHAN_P channels.
+  // One block per (beam, time) pair, launched with exactly NTHREADS_GPU threads
+  // (a power of two); each thread sums NCHAN_P/NTHREADS_GPU adjacent channels.
+
+  int thread_id = threadIdx.x;
+  int bm = (int)(blockIdx.x/NTIMES_P);
+  int tm = (int)(blockIdx.x % NTIMES_P);
+  int ch0 = (int)(thread_id*(NCHAN_P/NTHREADS_GPU));
+
+  __shared__ float csum[NTHREADS_GPU];
+  csum[thread_id] = 0.;
+  
+  // find sum of local chans
+  int idx0 = bm*NTIMES_P*NCHAN_P + tm*NCHAN_P + ch0;
+  for (int ch=0; ch<NCHAN_P/NTHREADS_GPU; ch++) {    
+    csum[thread_id] += (float)(data[idx0]);
+    idx0++;
+  }
+
+  // every partial sum must be in shared memory before the reduction reads it
+  __syncthreads();
+  
+  // Tree reduction of the partial sums into csum[0], halving the number of
+  // active threads at each level. The barrier is outside the conditional
+  // because __syncthreads() must be reached by every thread of the block
+  // (calling it inside "if (thread_id<16)" is undefined), and it orders each
+  // level's writes before the next level's reads.
+  for (int stride = NTHREADS_GPU/2; stride > 0; stride /= 2) {
+    if (thread_id < stride)
+      csum[thread_id] += csum[thread_id+stride];
+    __syncthreads();
+  }
+  
+  // thread 0 now holds the block total; convert it to a mean
+  if (thread_id==0) {    
+    ts[bm*NTIMES_P+tm] = csum[thread_id] / (1.*NCHAN_P);
+  }
+  
+}
+
+
+// kernel to do flagging
+// launch with n_mask*NTIMES_P/NTHREADS_GPU blocks of NTHREADS_GPU threads 
+__global__
+void flag(unsigned char *data, int * midx, unsigned char *repval, float *bpwr) {
+
+  int block_id = blockIdx.x;
+  int thread_id = threadIdx.x;
+  int midx_idx = (int)(block_id/(NTIMES_P/NTHREADS_GPU));
+  
+  int bm = (int)(midx[midx_idx] / NCHAN_P);
+  int ch = (int)(midx[midx_idx] % NCHAN_P);
+  int tm = ((int)(block_id % (NTIMES_P/NTHREADS_GPU)))*NTHREADS_GPU + thread_id;
+  int idx = bm*NTIMES_P*NCHAN_P + tm*NCHAN_P + ch;  
+
+  // do replacement
+  //data[idx] = repval[ch*NTIMES_P+tm]*bpwr[bm];
+  data[idx] = MN*bpwr[bm];
+    
+}
+
+// kernel to do time-series flagging
+// launch with n_mask*(NCHAN_P-256)/NTHREADS_GPU blocks of NTHREADS_GPU threads 
+__global__
+void flagts(unsigned char *data, int * midx, unsigned char *repval, float *bpwr) {
+
+  int block_id = blockIdx.x;
+  int thread_id = threadIdx.x;
+  int midx_idx = (int)(block_id/((NCHAN_P-256)/NTHREADS_GPU));
+  
+  int bm = (int)(midx[midx_idx] / NTIMES_P);
+  int tm = (int)(midx[midx_idx] % NTIMES_P);
+  int ch = ((int)(block_id % ((NCHAN_P-256)/NTHREADS_GPU)))*NTHREADS_GPU + thread_id + 128;
+  int idx = bm*NTIMES_P*NCHAN_P + tm*NCHAN_P + ch;  
+
+  // do replacement
+  //data[idx] = repval[ch*NTIMES_P+tm]*bpwr[bm];
+  data[idx] = MN*bpwr[bm];
+    
+}
+
+
+// kernel to make random numbers
+// launch with NTIMES_P*NCHAN_P/NTHREADS_GPU blocks of NTHREADS_GPU threads 
+__global__
+void genrand(unsigned char *repval, unsigned int seed) {
+  // Fills repval[block_id*NTHREADS_GPU+thread_id] with one Gaussian sample (mean MN, sigma SIG) per thread
+  int block_id = blockIdx.x;
+  int thread_id = threadIdx.x;
+  
+  // for random number
+  curandState_t state;
+  float u1, u2, va;
+  curand_init(seed, block_id*NTHREADS_GPU+thread_id, 1, &state);
+  u1 = ((float)(curand(&state) % RMAX) + 1.)/(1.*RMAX); // in (0,1]: u1 == 0 would make logf(u1) infinite
+  u2 = ((float)(curand(&state) % RMAX))/(1.*RMAX);
+  va = sqrtf(-2.*logf(u1))*cosf(2.*M_PI*u2); // Box-Muller transform of the two uniform draws
+
+  // do replacement
+  repval[block_id*NTHREADS_GPU+thread_id] = (unsigned char)(__float2uint_rn(2.*(va*SIG+MN))/2);
+    
+}
+
+
+
+// assumed spec has size NBEAMS_P*NCHAN_P
+// ref is reference value
+void genmask(float *spec, float thresh, float ref, int *mask) {
+
+  for (int i=0;i<NBEAMS_P*NCHAN_P;i++) {
+    if (fabs(spec[i]-ref)>thresh) mask[i] = 1;
+  }
+
+}
+
+
+
+float medval(float *a,int n);
+
+float medval(float *a,int n) { 
+  // Median of a[0..n-1]: the middle element of the sorted values, or the lower of
+  // the two middle elements when n is even. The input array is not modified.
+  // Returns 0 when n <= 0 rather than reading before the start of the scratch copy.
+  if (n <= 0)
+    return 0.;
+
+  // work on a scratch copy so that the caller's ordering is preserved
+  float tmp[n];
+  for (int i = 0;i < n;i++)
+    tmp[i] = a[i];
+
+  // std::sort (from <algorithm>, which this file already includes) yields the
+  // same ascending order as a hand-written bubble sort, but in O(n log n)
+  // instead of O(n^2) comparisons
+  std::sort(tmp, tmp+n);
+
+  return tmp[(int)((n+1)/2-1)];
+
+}
+
+void channflag(float* spec, float Thr, int * mask);
+void simple_channflag(float* spec, float Thr, int * mask);
+void simple_tsflag(float* ts, float Thr, int * mask);
+
+void simple_channflag(float* spec, float Thr, int * mask) {
+  // Sets mask[i*NCHAN_P+j]=1 where the FBIN-channel mean starting at channel j of beam i exceeds the beam median by more than Thr/sqrt(FBIN) MADs; never clears mask
+  int i, j;
+  float* medspec;			// median values for each beam spectrum
+  float* madspec;			// mad for each beam spectrum
+  float* normspec;			// corrected spec - median value (for MAD calculation)
+
+  medspec = (float *)malloc(sizeof(float)*NBEAMS_P);
+  madspec = (float *)malloc(sizeof(float)*NBEAMS_P);
+  normspec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
+    
+  int ZeroChannels = 128; 
+  // ZeroChannels channels at each end of the band are left out of the statistics and are never flagged
+  	
+  // calculate median value for each beam
+  for (i = 0; i < NBEAMS_P; i++)
+    medspec[i] = medval(spec + i*NCHAN_P + ZeroChannels,NCHAN_P-2*ZeroChannels);
+  
+  // compute MAD for each beam
+  for (i = 0; i < NBEAMS_P; i++){
+    for (j = ZeroChannels; j < NCHAN_P-ZeroChannels; j++){
+      normspec[j-ZeroChannels] = fabs(spec[i*NCHAN_P+j]-medspec[i]);
+    }
+    madspec[i] = medval(normspec,NCHAN_P-2*ZeroChannels);
+  }
+	
+  // mask
+  float vv;
+  float mythr = Thr/sqrt(1.*FBIN);
+  for (i = 0; i < NBEAMS_P; i++){
+
+    // implement FBIN    
+    for (j = ZeroChannels; j < NCHAN_P-ZeroChannels-FBIN; j++) {
+      vv = 0.;
+      for (int k=0;k<FBIN;k++)
+	vv += spec[i*NCHAN_P+j+k];	// average channels j..j+FBIN-1; indexing with j alone re-adds a single channel
+      vv = (vv/(1.*FBIN)-medspec[i]);
+
+      if (vv > mythr*madspec[i]) mask[i*NCHAN_P+j] = 1;
+      
+    }
+    
+  }
+  
+  free(medspec);
+  free(madspec);
+  free(normspec);
+  
+}
+
+void simple_tsflag(float* spec, float Thr, int * mask) {
+	
+  int i, j;
+  float* medspec;			// median values for each beam spectrum
+  float* madspec;			// mad for each beam spectrum
+  float* normspec;			// corrected spec - median value (for MAD calculation)
+
+  medspec = (float *)malloc(sizeof(float)*NBEAMS_P);
+  madspec = (float *)malloc(sizeof(float)*NBEAMS_P);
+  normspec = (float *)malloc(sizeof(float)*NBEAMS_P*NTIMES_P);
+    
+  int nFilt, idx;
+  	
+  // calculate median value for each beam
+  for (i = 0; i < NBEAMS_P; i++)
+    medspec[i] = medval(spec + i*NTIMES_P,NTIMES_P/16);
+  
+  // compute MAD for each beam
+  for (i = 0; i < NBEAMS_P; i++){
+    for (j = 0; j < NTIMES_P/16; j++){
+      normspec[j] = fabs(spec[i*NTIMES_P+j]-medspec[i]);
+    }
+    madspec[i] = medval(normspec,NTIMES_P/16);
+  }
+	
+  // mask
+  float vv;
+  float mythr = Thr;
+  for (i = 0; i < NBEAMS_P; i++){
+
+    for (j = 0; j < NTIMES_P; j++) {
+
+      vv = spec[i*NTIMES_P+j]-medspec[i];
+      if (vv > mythr*madspec[i]) mask[i*NTIMES_P+j] = 1;
+      
+    }
+    
+  }
+  
+  free(medspec);
+  free(madspec);
+  free(normspec);
+  
+}
+
+
+void channflag(float* spec, float Thr, int * mask) {
+  // Flag outlier channels, beam by beam. spec and mask are indexed as
+  // [beam*NCHAN_P + channel]. A sliding-median baseline is subtracted from
+  // spec, and mask entries are set to 1 (never cleared) where the residual
+  // exceeds +/- Thr * MAD or where the channel is listed in flagchannels. The
+  // outer ZeroChannels channels at each band edge are neither analysed nor flagged.
+
+  int i, j;
+  float* CorrecSpec;			// corrected spectrum
+  float* medspec;			// median values for each beam spectrum
+  float* madspec;			// mad for each beam spectrum
+  float* normspec;			// corrected spec - median value (for MAD calculation)
+
+  CorrecSpec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
+  medspec = (float *)malloc(sizeof(float)*NBEAMS_P);
+  madspec = (float *)malloc(sizeof(float)*NBEAMS_P);
+  normspec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
+
+  int ZeroChannels = 128;		// channels skipped at each band edge
+  int nFiltSize = 21;			// window length of the median filter
+
+  // calculate median filtered spectrum
+  for (i=0;i<NBEAMS_P;i++) {
+    for (j=ZeroChannels;j<NCHAN_P-ZeroChannels;j++) {
+
+      // the window starts at channel j; near the upper edge it is pinned so
+      // that it ends on the last analysed channel
+      if (NCHAN_P-ZeroChannels-j>=nFiltSize)
+	CorrecSpec[i*NCHAN_P+j] = spec[i*NCHAN_P+j] - medval(spec + i*NCHAN_P+j,nFiltSize);
+      else
+	CorrecSpec[i*NCHAN_P+j] = spec[i*NCHAN_P+j] - medval(spec + i*NCHAN_P+NCHAN_P-ZeroChannels-nFiltSize,nFiltSize);
+
+    }
+  }
+
+  // calculate median value for each beam
+  for (i = 0; i < NBEAMS_P; i++)
+    medspec[i] = medval(CorrecSpec + i*NCHAN_P + ZeroChannels,NCHAN_P-2*ZeroChannels);
+
+  // compute MAD for each beam
+  for (i = 0; i < NBEAMS_P; i++){
+    for (j = ZeroChannels; j < NCHAN_P-ZeroChannels; j++){
+      normspec[j-ZeroChannels] = fabs(CorrecSpec[i*NCHAN_P+j]-medspec[i]);
+    }
+    madspec[i] = medval(normspec,NCHAN_P-2*ZeroChannels);
+  }
+
+  // mask: note the residual is compared against the MAD directly, without
+  // subtracting the per-beam median again
+  for (i = 0; i < NBEAMS_P; i++){
+    for (j = ZeroChannels; j < NCHAN_P-ZeroChannels; j++){
+      if (CorrecSpec[i*NCHAN_P+j] > Thr * madspec[i] || CorrecSpec[i*NCHAN_P+j] < - Thr * madspec[i])
+	mask[i*NCHAN_P+j] = 1;
+
+      // for permanent flagging
+      for (int kk=0;kk<NPERMFLAGS;kk++) {
+	if (j==flagchannels[kk]) mask[i*NCHAN_P+j] = 1;
+      }
+    }
+  }
+
+  free(CorrecSpec);
+  free(medspec);
+  free(madspec);
+  free(normspec);
+}
+
+
+// to gather mask indices: the position of every h_mask entry equal to 1 is
+// appended to h_idx, and the number of positions written is stored in *n_mask
+void gather_mask(int *h_idx, int *h_mask, int *n_mask) {
+
+  int found = 0;
+  int pos = 0;
+  while (pos < NBEAMS_P*NCHAN_P) {
+    if (h_mask[pos]==1)
+      h_idx[found++] = pos;
+    ++pos;
+  }
+  *n_mask = found;
+}
+
+// to gather ts mask indices: the position of every entry equal to 1 among the
+// NBEAMS_P*NTIMES_P entries of h_mask is appended to h_idx; *n_mask gets the count
+void gather_tsmask(int *h_idx, int *h_mask, int *n_mask) {
+
+  int found = 0;
+  int pos = 0;
+  while (pos < NBEAMS_P*NTIMES_P) {
+    if (h_mask[pos]==1)
+      h_idx[found++] = pos;
+    ++pos;
+  }
+  *n_mask = found;
+}
+
+
+// to calculate bpwr from spectrum: for each beam, the sum of its NCHAN_P
+// spectrum values divided by (NCHAN_P-256) and then by MN
+void calc_bpwr(float *h_spec, float *h_bpwr);
+void calc_bpwr(float *h_spec, float *h_bpwr) {
+
+  for (int beam = 0; beam < NBEAMS_P; ++beam) {
+    const float *row = h_spec + beam*NCHAN_P;
+    float total = 0.;
+    for (int ch = 0; ch < NCHAN_P; ++ch)
+      total += row[ch];
+    h_bpwr[beam] = (total/(1.*(NCHAN_P-256)))/MN;
+  }
+}
+
+// to medianise zero specs
+void median_calc(float * arr);
+void median_calc(float * arr) {
+  // arr is treated as NBEAMS_P rows of NCHAN_P values. Every column is
+  // bubble-sorted into ascending order down the rows, row 31 of the sorted
+  // data is copied to row 0, and row 0 is then replicated into all other rows.
+  // NOTE(review): row 31 looks like the median row for 64 beams - confirm.
+  const int stride = NCHAN_P;
+  for (int chan = 0; chan < NCHAN_P; ++chan) {
+    float *col = arr + chan;
+    // each pass moves the largest remaining value to row 'last'
+    for (int last = NBEAMS_P-1; last > 0; --last) {
+      for (int row = 0; row < last; ++row) {
+        float *lo = col + row*stride;
+        float *hi = lo + stride;
+        if (*lo > *hi) {
+          const float held = *hi;
+          *hi = *lo;
+          *lo = held;
+        }
+      }
+    }
+  }
+
+  // keep the row-31 values as the reference spectrum ...
+  for (int chan = 0; chan < NCHAN_P; ++chan)
+    arr[chan] = arr[chan + 31*NCHAN_P];
+
+  // ... and copy it over every other row
+  for (int row = 1; row < NBEAMS_P; ++row)
+    for (int chan = 0; chan < NCHAN_P; ++chan)
+      arr[row*NCHAN_P + chan] = arr[chan];
+}
+
+// Thread to control the adding of filterbanks.
+// Waits for UDP datagrams "<beam>-<filename>" on port 11228 of address iP. The
+// named file is read as 1024 x 16384 doubles and transposed into ctx->block as
+// floats; dump_pending is raised only after that succeeded. Requests that are
+// malformed, unreadable, or arrive while dump_pending is set are logged and dropped.
+void control_thread (dsaX_pulse_t * ctx) {
+  syslog(LOG_INFO, "control_thread: starting");
+  // layout of a pulse file (presumably NCHAN_P x NTIMES_P - TODO confirm)
+  const int src_rows = 1024, src_cols = 16384;
+  const size_t npulse = (size_t)src_rows*src_cols;
+  // buffer for incoming command strings (tbuf keeps an unparsed copy for logs)
+  const int bufsize = 1024;
+  char* buffer = (char *) malloc (sizeof(char) * bufsize);
+  char* tbuf = (char *) malloc (sizeof(char) * bufsize);
+  double * tmpblock = (double *)malloc(sizeof(double)*npulse);
+  FILE *fin;
+  struct addrinfo hints;
+  struct addrinfo* res=0;
+  memset(&hints,0,sizeof(hints));
+  struct sockaddr_storage src_addr;
+  socklen_t src_addr_len;
+  hints.ai_family=AF_INET;
+  hints.ai_socktype=SOCK_DGRAM;
+  if (!buffer || !tbuf || !tmpblock || getaddrinfo(iP,"11228",&hints,&res)!=0) {
+    syslog(LOG_ERR, "control_thread: setup failed (malloc or getaddrinfo)");
+    free (buffer);
+    free (tbuf);
+    free(tmpblock);
+    return;
+  }
+
+  while (!quit_threads) {
+    // one socket per request; it is closed as soon as a datagram has arrived
+    int fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol);
+    if (fd < 0 || bind(fd,res->ai_addr,res->ai_addrlen) < 0) {
+      syslog(LOG_ERR, "control_thread: could not create or bind socket");
+      if (fd >= 0) close(fd);
+      break;
+    }
+    // clear all of buffer (sizeof(buffer) is only the pointer size) and accept
+    // at most bufsize-1 bytes so that the string is always NUL-terminated
+    memset(buffer,'\0',bufsize);
+    src_addr_len=sizeof(src_addr);
+    syslog(LOG_INFO, "control_thread: waiting for packet");
+    ssize_t ct = recvfrom(fd,buffer,bufsize-1,0,(struct sockaddr*)&src_addr,&src_addr_len);
+    close(fd);
+    if (ct < 0) {
+      syslog(LOG_ERR, "control_thread: recvfrom failed");
+      continue;
+    }
+
+    syslog(LOG_INFO, "control_thread: received buffer string %s",buffer);
+    strcpy(tbuf,buffer);
+    trignum++;
+    // interpret buffer string; strtok returns NULL when a field is missing
+    char * beamstr = strtok(buffer, "-");
+    char * tmp_flnam = strtok(NULL, "-");
+    if (!beamstr || !tmp_flnam) {
+      syslog(LOG_ERR, "control_thread: malformed command %s - ignoring",tbuf);
+      continue;
+    }
+    int tmp_dumpbm = (float)(strtof(beamstr,NULL));
+    if (tmp_dumpbm<0 || tmp_dumpbm>63) tmp_dumpbm=32;
+
+    if (dump_pending) {
+      syslog(LOG_ERR, "control_thread: BACKED UP - ignoring %s",tbuf);
+      continue;
+    }
+
+    strcpy(flnam,tmp_flnam);
+    dumpbm = tmp_dumpbm;
+    syslog(LOG_INFO, "control_thread: received command to add pulse %s to beam %d",flnam,dumpbm);
+    if (!(fin=fopen(flnam,"rb"))) {
+      syslog(LOG_INFO,"cannot open %s",flnam);
+      continue;
+    }
+    size_t nread = fread(tmpblock,sizeof(double),npulse,fin);
+    fclose(fin);
+    if (nread != npulse) {
+      syslog(LOG_ERR,"control_thread: short read from %s - ignoring",flnam);
+      continue;
+    }
+    // transpose into ctx->block, converting to float
+    for (int i=0;i<src_cols;i++)
+      for (int j=0;j<src_rows;j++)
+        ctx->block[i*src_rows+j] = (float)(tmpblock[j*src_cols+i]);
+    syslog(LOG_INFO, "control_thread: finished processing pulse - setting dump_pending");
+    dump_pending = 1;
+  }
+
+  freeaddrinfo(res);
+  free (buffer);
+  free (tbuf);
+  free(tmpblock);
+  if (ctx->verbose)
+    syslog(LOG_INFO, "control_thread: exiting");
+}
+
+
+// Print the command-line option summary to stdout.
+void usage() {
+  fprintf (stdout, "flagger [options]\n"
+	   " -c core   bind process to CPU core [no default]\n"
+	   " -d send debug messages to syslog\n"
+	   " -i in_key [default dada]\n"
+	   " -o out_key [default caca]\n"
+	   " -t flagging threshold [default 5.0]\n"
+	   " -f output spectra file\n"
+	   " -g output beam power file\n"
+	   " -a number of blocks in baseline spec aver (must be <=16 and >=1, default 5)\n"
+	   " -k IP address for the control socket\n"
+	   " -p adjust noise level according to power\n"
+	   " -m generate random data\n"
+	   " -s time-series flagging and threshold [no default]\n"
+	   " -q modulation index threshold for tot pwr flagging [default 0.0005]\n"
+	   " -h print usage\n");
+}
+
+
+int main(int argc, char**argv)
+{
+  // Entry point of the flagger. Parses the command line, connects to the
+  // input and output DADA buffers and copies the header across, allocates the
+  // host and device work arrays, starts control_thread, and then loops over
+  // blocks: each block is copied to the GPU, passed through scaley once a
+  // baseline (d_spec0/d_var0) exists, flagged per channel and optionally per
+  // time sample, and written to the output buffer. The loop has no exit
+  // condition, so the clean-up code after it is not reached. Returns
+  // EXIT_FAILURE on bad options, buffer connection errors or a short write.
+  // syslog start
+  openlog ("gpu_flagger", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  // set cuda device
+  cudaSetDevice(1);
+
+  // read command line args
+
+  // data block HDU keys
+  key_t in_key = 0x0000dada;
+  key_t out_key = 0x0000caca;
+
+  // command line arguments
+  int core = -1;
+  int arg = 0;
+  double thresh = 5.0;
+  float mod_thresh = 0.0005;
+  int naver = 5;
+  char * fnam;
+  char * fnam2;
+  FILE *fout;
+  FILE *fout2;
+  FILE *f0;
+
+  fnam = (char *)malloc(sizeof(char)*200);
+  fnam2 = (char *)malloc(sizeof(char)*200);
+  int fwrite = 0;
+  int fwrite2 = 0;
+  int pwr = 0;
+  int mkrand = 0;
+  int tsflag = 0;
+  float tsthresh = 10.;
+
+  while ((arg=getopt(argc,argv,"c:t:i:o:f:g:a:k:s:q:mdph")) != -1)
+    {
+      switch (arg)
+	{
+	case 'k':
+	  strcpy(iP,optarg);
+	  break;	
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'f':
+	  if (optarg)
+	    {
+	      snprintf(fnam,200,"%s",optarg);	// bounded by the 200-byte buffer
+	      fwrite = 1;
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-f flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'q':
+	  if (optarg)
+	    {
+	      mod_thresh = atof(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-q flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'a':
+	  if (optarg)
+	    {
+	      naver = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-a flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'g':
+	  if (optarg)
+	    {
+	      snprintf(fnam2,200,"%s_%f.dat",optarg,40587.0+time(NULL)/86400.0);
+	      fwrite2 = 1;
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-g flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'i':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &in_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-i flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 't':
+	  if (optarg)
+	    {
+	      thresh = atof(optarg);
+	      syslog(LOG_INFO,"modified THRESH to %g",thresh);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-t flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+	case 's':
+	  if (optarg)
+	    {
+	      tsthresh = atof(optarg);
+	      tsflag=1;
+	      syslog(LOG_INFO,"TSTHRESH is %g",tsthresh);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-s flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'p':
+	  pwr=1;
+	  break;
+	case 'm':
+	  mkrand=1;
+	  break;
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      else
+	syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+  dsaX_pulse_t udpdb;
+  udpdb.verbose = DEBUG;
+  float * pulsedata = (float *)malloc(sizeof(float)*256*16384*1024);
+  udpdb.block = pulsedata;
+
+  // CONNECT AND READ FROM BUFFER
+
+  dada_hdu_t* hdu_in = 0;	// header and data unit
+  hdu_in  = dada_hdu_create ();
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to input buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to input buffer");
+    return EXIT_FAILURE;
+  }
+
+  if (DEBUG) syslog(LOG_INFO,"connected to input buffer");
+
+  uint64_t header_size = 0;
+  // read the header from the input HDU
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+
+  // mark the input header as cleared
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0){
+    syslog (LOG_ERR,"could not mark header as cleared");
+    return EXIT_FAILURE;
+  }
+
+  uint64_t block_id, bytes_read = 0;
+  unsigned char *in_data;
+  char *cin_data;
+
+  // OUTPUT BUFFER
+  dada_hdu_t* hdu_out = 0;
+  hdu_out  = dada_hdu_create ();
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"flagged_data: could not connect to dada buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write (hdu_out) < 0) {
+    syslog (LOG_ERR,"flagged_data: could not lock to dada buffer");
+    return EXIT_FAILURE;
+  }
+
+  if (DEBUG) syslog(LOG_INFO,"connected to output");
+
+
+  //// OUTPUT BUFFER
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  header_size = HDR_SIZE;
+  if (!header_out)
+    {
+      syslog(LOG_ERR,"couldn't read header_out");
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      return EXIT_FAILURE;
+    }
+  uint64_t written=0;
+
+  if (DEBUG) syslog(LOG_INFO,"copied header");
+
+  ////////////////		
+
+  // declare stuff for host and GPU
+  unsigned char * d_data;
+  float * d_pulse;
+  unsigned char * h_bm0 = (unsigned char *)malloc(sizeof(unsigned char)*NTIMES_P*NCHAN_P);
+  cudaMalloc((void **)&d_data, NBEAMS_P*NTIMES_P*NCHAN_P*sizeof(unsigned char));
+  cudaMalloc((void **)&d_pulse, NBEAMS_P*NTIMES_P*NCHAN_P*sizeof(float));
+  unsigned char * h_data = (unsigned char *)malloc(sizeof(unsigned char)*NBEAMS_P*NTIMES_P*NCHAN_P);
+  int * h_mask = (int *)calloc(NBEAMS_P*NCHAN_P, sizeof(int));	// zeroed: the -g output reads it before the first flagging pass
+  int * d_mask;
+  cudaMalloc((void **)&d_mask, NBEAMS_P*NCHAN_P*sizeof(int));
+  int * h_tsmask = (int *)malloc(sizeof(int)*NBEAMS_P*NTIMES_P);
+  int * d_tsmask;
+  cudaMalloc((void **)&d_tsmask, NBEAMS_P*NTIMES_P*sizeof(int));
+  float * d_spec, * d_oldspec;
+  cudaMalloc((void **)&d_spec, NBEAMS_P*NCHAN_P*sizeof(float));
+  cudaMalloc((void **)&d_oldspec, NBEAMS_P*NCHAN_P*sizeof(float));
+  float * d_ts;
+  cudaMalloc((void **)&d_ts, NBEAMS_P*NTIMES_P*sizeof(float));
+  float * h_bpwr = (float *)malloc(sizeof(float)*NBEAMS_P);
+  float * d_bpwr;
+  cudaMalloc((void **)&d_bpwr, NBEAMS_P*sizeof(float));
+  float * h_spec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
+  float * h_ts = (float *)malloc(sizeof(float)*NBEAMS_P*NTIMES_P);
+  float * h_beam = (float *)malloc(sizeof(float)*NBEAMS_P);
+  float * h_bmask = (float *)malloc(sizeof(float)*NBEAMS_P);
+  float * h_subspec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
+  float * h_var = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
+  float * h_max = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
+  float * h_pp = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
+  float * h_oldspec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
+  float *h_spec0 = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
+  float *h_var0 = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
+  float *d_spec0, *d_var0;
+  cudaMalloc((void **)&d_spec0, NBEAMS_P*NCHAN_P*naver*sizeof(float));
+  cudaMalloc((void **)&d_var0, NBEAMS_P*NCHAN_P*naver*sizeof(float));
+  for (int i=0;i<NBEAMS_P*NCHAN_P;i++) h_oldspec[i] = 0.;
+  cudaMemcpy(d_oldspec, h_oldspec, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyHostToDevice);
+  float * d_var, * d_max, * d_pp;
+  cudaMalloc((void **)&d_var, NBEAMS_P*NCHAN_P*sizeof(float));
+  cudaMalloc((void **)&d_max, NBEAMS_P*NCHAN_P*sizeof(float));
+  cudaMalloc((void **)&d_pp, NBEAMS_P*NCHAN_P*sizeof(float));
+  int * h_idx = (int *)malloc(sizeof(int)*NBEAMS_P*NCHAN_P);
+  int * d_idx;
+  cudaMalloc((void **)&d_idx, NBEAMS_P*NCHAN_P*sizeof(int));
+  int * h_tsidx = (int *)malloc(sizeof(int)*NBEAMS_P*NTIMES_P);
+  int * d_tsidx;
+  cudaMalloc((void **)&d_tsidx, NBEAMS_P*NTIMES_P*sizeof(int));
+  int n_mask = 0;
+  int n_tsmask = 0;
+  float prev_tpwr = 0., tpwr = 0.;
+
+  // random numbers
+  unsigned char *d_repval;
+  cudaMalloc((void **)&d_repval, NTIMES_P*NCHAN_P*sizeof(unsigned char));
+  genrand<<<NTIMES_P*NCHAN_P/NTHREADS_GPU,NTHREADS_GPU>>>(d_repval,time(NULL));
+  for (int i=0;i<NBEAMS_P;i++) h_bpwr[i] = 1.;
+  syslog(LOG_INFO,"done with repvals");
+
+  // start control thread
+  std::thread threadObj(control_thread, &udpdb);
+
+
+  // for pre-start: every beam of the stand-in block gets a copy of the random values
+  unsigned char * tmp_indata = (unsigned char *)malloc(sizeof(unsigned char)*NBEAMS_P*NTIMES_P*NCHAN_P);
+  for (int i=0;i<NBEAMS_P;i++)
+    cudaMemcpy(tmp_indata + i*NTIMES_P*NCHAN_P, d_repval, NTIMES_P*NCHAN_P*sizeof(unsigned char), cudaMemcpyDeviceToHost);
+  int prestart = 2;
+  int gotDada = 0;
+
+  int started = 0;
+  int blockn = 0;
+
+  // put rest of the code inside while loop
+  while (1) {	
+
+    // read a DADA block
+    if (prestart==0)  {
+      cin_data = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+      in_data = (unsigned char *)(cin_data);
+      gotDada=1;
+      blockn++;
+    }
+    else
+      in_data = (unsigned char *)(tmp_indata);
+
+    // deal with bm0
+    /*memcpy(h_data+NTIMES_P*NCHAN_P,in_data+NTIMES_P*NCHAN_P,(NBEAMS_P-1)*NTIMES_P*NCHAN_P);
+    memcpy(h_bm0,in_data,NTIMES_P*NCHAN_P);
+    memcpy(h_data,h_data+NTIMES_P*NCHAN_P,NTIMES_P*NCHAN_P);*/
+
+
+    if (DEBUG) syslog(LOG_INFO,"read block");
+
+    /* 
+       if not first block, correct data
+       1 - measure spectrum
+       2 - measure varspec
+       if first block, proceed.
+       else
+       3 - measure maximum value
+       4 - use three spectra to derive channel flags
+       5 - flag
+     */
+
+    // copy data to device
+    cudaMemcpy(d_data, in_data, NBEAMS_P*NTIMES_P*NCHAN_P*sizeof(unsigned char), cudaMemcpyHostToDevice);
+    //cudaMemset(d_data, 8, NBEAMS_P*NTIMES_P*NCHAN_P);
+
+    // if not first block, correct data
+    if (started==1 || prestart==1) 
+      scaley<<<NBEAMS_P*NTIMES_P*NCHAN_P/NTHREADS_GPU,NTHREADS_GPU>>>(d_data, d_spec0, d_var0);
+
+    if (DEBUG) syslog(LOG_INFO,"copied data and scaled");
+
+    // measure spectrum and varspec
+    calc_spectrum<<<NBEAMS_P*NCHAN_P, NTHREADS_GPU>>>(d_data, d_spec);
+    calc_varspec<<<NBEAMS_P*NCHAN_P, NTHREADS_GPU>>>(d_data, d_spec, d_var);
+    cudaMemcpy(h_spec, d_spec, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToHost);
+    cudaMemcpy(h_var, d_var, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToHost);
+    if (started==0) {
+      for (int i=0;i<NBEAMS_P;i++) {
+	for (int j=0;j<NCHAN_P;j++) prev_tpwr += h_spec[i*NCHAN_P+j];
+      }
+    }
+
+    if (DEBUG) syslog(LOG_INFO,"done spec and var");
+
+    // if not first block
+    if (started==1 || prestart==1) {
+
+      // do total power check
+      tpwr = 0.;
+      for (int i=0;i<NBEAMS_P;i++) {
+	for (int j=0;j<NCHAN_P;j++) tpwr += h_spec[i*NCHAN_P+j];
+      }
+
+      if (fabs(tpwr-prev_tpwr)/prev_tpwr >= mod_thresh) {
+
+	syslog(LOG_INFO,"mod_idx %f (threshold %f), noise replacement",fabs(tpwr-prev_tpwr)/prev_tpwr,mod_thresh);
+
+	for (int i=0;i<NBEAMS_P;i++)
+	  cudaMemcpy(d_data + i*NTIMES_P*NCHAN_P,d_repval,NTIMES_P*NCHAN_P*sizeof(unsigned char), cudaMemcpyDeviceToDevice);	
+
+      }
+      else {
+
+	// calc maxspec
+	calc_spectrum<<<NBEAMS_P*NCHAN_P, NTHREADS_GPU>>>(d_data, d_max);
+	calc_ppspec<<<NBEAMS_P*NCHAN_P, NTHREADS_GPU>>>(d_data, d_pp);
+
+	// derive channel flags
+	cudaMemcpy(h_max, d_max, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToHost);
+	cudaMemcpy(h_pp, d_pp, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToHost);
+	for (int i=0;i<NBEAMS_P*NCHAN_P;i++) {
+	  h_mask[i] = 0;
+	  h_subspec[i] = h_spec[i]-h_oldspec[i];
+	}
+	channflag(h_subspec,thresh,h_mask);
+	channflag(h_var,thresh+0.5,h_mask);
+	channflag(h_max,thresh,h_mask);
+	simple_channflag(h_pp,thresh,h_mask);
+
+	// calc bpwr if needed
+	if (pwr) calc_bpwr(h_spec,h_bpwr);
+	cudaMemcpy(d_bpwr, h_bpwr, NBEAMS_P*sizeof(float), cudaMemcpyHostToDevice);
+
+	// apply mask
+	gather_mask(h_idx, h_mask, &n_mask);
+	if (DEBUG) syslog(LOG_INFO,"FLAG_COUNT %d",n_mask);   		
+	cudaMemcpy(d_idx, h_idx, n_mask*sizeof(int), cudaMemcpyHostToDevice);
+
+	// replace with random data
+	if (mkrand==1) {
+	  for (int i=0;i<NBEAMS_P;i++)
+	    cudaMemcpy(d_data + i*NTIMES_P*NCHAN_P,d_repval,NTIMES_P*NCHAN_P*sizeof(unsigned char), cudaMemcpyDeviceToDevice);
+	}
+
+
+	// check whether we want to add pulse
+	if (dump_pending) {
+
+	  syslog(LOG_INFO, "adding pulse %s to beam %d", flnam, dumpbm);
+	  cudaMemset(d_pulse, 0, NBEAMS_P*NTIMES_P*NCHAN_P*sizeof(float));
+	  cudaMemcpy(d_pulse + dumpbm*NTIMES_P*NCHAN_P,pulsedata,NTIMES_P*NCHAN_P*sizeof(float), cudaMemcpyHostToDevice);
+	  sumpulse<<<NBEAMS_P*NTIMES_P*NCHAN_P/NTHREADS_GPU, NTHREADS_GPU>>>(d_data, d_pulse);
+	  syslog(LOG_INFO, "added %s to beam %d", flnam, dumpbm);
+
+	  dump_pending=0;
+
+	}
+
+	if (mkrand==0 && n_mask>0)	// an empty grid is not a valid launch configuration
+	  flag<<<n_mask*NTIMES_P/NTHREADS_GPU, NTHREADS_GPU>>>(d_data, d_idx, d_repval, d_bpwr);
+
+	// ts flagging if needed
+	if (tsflag) {
+
+	  make_ts<<<NBEAMS_P*NTIMES_P,NTHREADS_GPU>>>(d_data,d_ts);
+	  syslog(LOG_INFO,"made ts");
+	  cudaMemcpy(h_ts, d_ts, NBEAMS_P*NTIMES_P*sizeof(float), cudaMemcpyDeviceToHost);
+	  syslog(LOG_INFO,"copied ts");
+	  for (int i=0;i<NBEAMS_P*NTIMES_P;i++) 
+	    h_tsmask[i] = 0;
+	  simple_tsflag(h_ts,tsthresh,h_tsmask);
+	  syslog(LOG_INFO,"tsflagged");
+	  gather_tsmask(h_tsidx, h_tsmask, &n_tsmask);	
+	  syslog(LOG_INFO,"TS_COUNT %d",n_tsmask);   		
+	  cudaMemcpy(d_tsidx, h_tsidx, n_tsmask*sizeof(int), cudaMemcpyHostToDevice);
+	  if (n_tsmask>0) flagts<<<n_tsmask*(NCHAN_P-256)/NTHREADS_GPU, NTHREADS_GPU>>>(d_data, d_tsidx, d_repval, d_bpwr);
+	  syslog(LOG_INFO,"flagged ts");
+
+	}
+
+      }
+
+    }
+
+    // deal with tpwr
+    prev_tpwr = tpwr;
+
+    // copy data to host and write to buffer
+    cudaMemcpy(h_data, d_data, NBEAMS_P*NTIMES_P*NCHAN_P*sizeof(unsigned char), cudaMemcpyDeviceToHost);
+
+    // deal with bm0
+    //memcpy(h_data,h_bm0,NTIMES_P*NCHAN_P);
+
+    // close block after reading
+    if (prestart==0) {
+      ipcio_close_block_read (hdu_in->data_block, bytes_read);
+      if (DEBUG) syslog(LOG_DEBUG,"closed read block");		    
+      written = ipcio_write (hdu_out->data_block, (char *)(h_data), BUF_SIZE);
+      if (written < BUF_SIZE)
+	{
+	  syslog(LOG_ERR,"write error");
+	  return EXIT_FAILURE;
+	}
+    }
+
+    if (prestart==1) {
+      syslog(LOG_INFO,"Finishing with pre-start run-through");
+      prestart=0;
+
+      // search for spec0 and var0 file
+      if ((f0=fopen("/home/ubuntu/data/specvar0.dat","r"))) {
+
+	//f0=fopen("/home/ubuntu/data/specvar0.dat","r");
+	for (int i=0;i<NBEAMS_P*NCHAN_P;i++)
+	  fscanf(f0,"%f %f\n",&h_spec0[i],&h_var0[i]);
+	fclose(f0);
+	cudaMemcpy(d_spec0, h_spec0, NBEAMS_P*NCHAN_P*sizeof(float),cudaMemcpyHostToDevice);
+	cudaMemcpy(d_var0, h_var0, NBEAMS_P*NCHAN_P*sizeof(float),cudaMemcpyHostToDevice);
+	started=1;
+	syslog(LOG_INFO,"Read init weight from file");
+
+      }
+
+    }
+
+
+    // deal with started and oldspec
+    if (started==0 || prestart==2) {
+      if (gotDada==1 || prestart==2) {
+	if (blockn>0) {
+	  cudaMemcpy(d_spec0 + (blockn-1)*NBEAMS_P*NCHAN_P, d_spec, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToDevice);
+	  cudaMemcpy(d_var0 + (blockn-1)*NBEAMS_P*NCHAN_P, d_var, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToDevice);
+	}
+	if (blockn==0) {
+	  cudaMemcpy(d_spec0, d_spec, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToDevice);
+	  cudaMemcpy(d_var0, d_var, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToDevice);
+	}
+      }
+      if (prestart==0 && gotDada==1 && blockn >= naver) {
+	started=1;
+	if (naver>1) fix_zspec<<<NBEAMS_P*NCHAN_P, NTHREADS_GPU, 2*naver*sizeof(float)>>>(d_spec0, d_var0, naver);
+	cudaMemcpy(h_spec0, d_spec0, NBEAMS_P*NCHAN_P*sizeof(float),cudaMemcpyDeviceToHost);
+	cudaMemcpy(h_var0, d_var0, NBEAMS_P*NCHAN_P*sizeof(float),cudaMemcpyDeviceToHost);
+	median_calc(h_spec0);
+	median_calc(h_var0);
+	cudaMemcpy(d_spec0, h_spec0, NBEAMS_P*NCHAN_P*sizeof(float),cudaMemcpyHostToDevice);
+	cudaMemcpy(d_var0, h_var0, NBEAMS_P*NCHAN_P*sizeof(float),cudaMemcpyHostToDevice);
+	syslog(LOG_INFO,"writing out weights...");
+
+	// write out weights
+	f0=fopen("/home/ubuntu/data/specvar.dat","w");
+	if (f0) {
+	  for (int i=0;i<NBEAMS_P*NCHAN_P;i++)
+	    fprintf(f0,"%f %f\n",h_spec0[i],h_var0[i]);
+	  fclose(f0);
+	}
+	else syslog(LOG_ERR,"could not open /home/ubuntu/data/specvar.dat for writing");
+      }
+      else if (prestart==2 && gotDada==0) {
+	prestart=1;
+	syslog(LOG_INFO,"Pre-starting");
+      }
+    }
+
+    for (int i=0;i<NBEAMS_P*NCHAN_P;i++) {
+      h_oldspec[i] = h_spec[i];
+    }
+
+    if (fwrite && prestart==0 && started==1) {
+      fout=fopen(fnam,"a");      
+      for (int i=0;i<NCHAN_P*NBEAMS_P;i++) fprintf(fout,"%d %g %g %g\n",h_mask[i],h_subspec[i],h_var[i],h_max[i]);
+      fclose(fout);
+    }
+    if (fwrite2) {
+      fout2=fopen(fnam2,"a");
+      for (int i=0;i<NBEAMS_P;i++) {
+	h_beam[i] = 0.;
+	h_bmask[i] = 0.;
+	for (int j=0;j<NCHAN_P;j++) {
+	  h_beam[i] += h_spec[i*NCHAN_P+j];
+	  h_bmask[i] += 1.*h_mask[i*NCHAN_P+j];
+	}
+	fprintf(fout2,"%g %g\n",h_beam[i],h_bmask[i]);
+      }
+      fclose(fout2);
+    }
+
+    if (DEBUG) syslog(LOG_INFO,"done with round");
+
+
+  }
+
+  // close control thread
+  syslog(LOG_INFO, "joining control_thread");
+  quit_threads = 1;
+  threadObj.join();
+
+  free(fnam);
+  free(fnam2);
+  free(tmp_indata);
+  free(h_data);
+  free(h_mask);
+  free(h_beam);
+  free(h_bmask);
+  free(h_spec);
+  free(h_var);
+  free(h_pp);
+  free(h_max);
+  free(h_bm0);
+  free(h_bpwr);
+  free(h_ts);
+  free(h_tsidx);
+  free(h_tsmask);
+  free(pulsedata);
+  cudaFree(d_ts);
+  cudaFree(d_tsidx);
+  cudaFree(d_tsmask);
+  cudaFree(d_bpwr);
+  cudaFree(d_max);
+  cudaFree(d_pp);
+  cudaFree(d_data);
+  cudaFree(d_spec);
+  cudaFree(d_var);
+  cudaFree(d_mask);
+  cudaFree(d_spec0);
+  cudaFree(d_var0);
+  cudaFree(d_pulse);
+  return 0;    
+} 
diff --git a/legacy/planar_complex.cu b/legacy/planar_complex.cu
new file mode 100644
index 0000000..3fb8175
--- /dev/null
+++ b/legacy/planar_complex.cu
@@ -0,0 +1,87 @@
+/*
+#include <iostream>
+#include <cutlass/cutlass.h>
+#include <cutlass/numeric_types.h>
+#include <cutlass/core_io.h>
+
+int main() {
+
+  cutlass::half_t x = 2.25_hf;
+
+  std::cout << x << std::endl;
+
+  return 0;
+}
+*/
+
+#include <iostream>
+#include <cutlass/numeric_types.h>
+#include <cutlass/gemm/device/gemm.h>
+#include <cutlass/util/host_tensor.h>
+
+int main() {
+  // Define the GEMM operation
+  using Gemm = cutlass::gemm::device::Gemm<
+    cutlass::half_t,                           // ElementA
+    cutlass::layout::ColumnMajor,              // LayoutA
+    cutlass::half_t,                           // ElementB
+    cutlass::layout::ColumnMajor,              // LayoutB
+    cutlass::half_t,                           // ElementOutput
+    cutlass::layout::ColumnMajor,              // LayoutOutput
+    float,                                     // ElementAccumulator
+    cutlass::arch::OpClassTensorOp,            // tag indicating Tensor Cores
+    cutlass::arch::Sm75                        // tag indicating target GPU compute architecture
+  >;
+
+  Gemm gemm_op;
+  cutlass::Status status;
+
+  // Define the problem size
+  int M = 512;
+  int N = 256;
+  int K = 128;
+
+  float alpha = 1.25f;
+  float beta = -1.25f;
+
+  // Allocate host and device memory (the tensors are never filled here)
+  cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor> A({M, K});
+  cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor> B({K, N});
+  cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor> C({M, N});
+
+  // C is passed both as the source (ptrC) and as the destination (ptrD)
+  cutlass::half_t const *ptrA = A.device_data();
+  cutlass::half_t const *ptrB = B.device_data();
+  cutlass::half_t const *ptrC = C.device_data();
+  cutlass::half_t       *ptrD = C.device_data();
+
+  int lda = A.device_ref().stride(0);
+  int ldb = B.device_ref().stride(0);
+  int ldc = C.device_ref().stride(0);
+  int ldd = C.device_ref().stride(0);
+
+  // Launch GEMM on the device
+  status = gemm_op({
+    {M, N, K},
+    {ptrA, lda},            // TensorRef to A device tensor
+    {ptrB, ldb},            // TensorRef to B device tensor
+    {ptrC, ldc},            // TensorRef to C device tensor
+    {ptrD, ldd},            // TensorRef to D device tensor - may be the same as C
+    {alpha, beta}           // epilogue operation arguments
+  });
+
+  if (status != cutlass::Status::kSuccess) {
+    std::cerr << "CUTLASS launch failed: " << cutlassGetStatusString(status) << std::endl;
+    return -1;
+  }
+
+  // the status above only covers the launch; wait for the kernel itself
+  cudaError_t err = cudaDeviceSynchronize();
+  if (err != cudaSuccess) {
+    std::cerr << "CUTLASS kernel failed: " << cudaGetErrorString(err) << std::endl;
+    return -1;
+  }
+
+  std::cout << "CUTLASS Success! " << std::endl;
+  return 0;
+}
diff --git a/legacy/planar_complex.cu~ b/legacy/planar_complex.cu~
new file mode 100644
index 0000000..db94a64
--- /dev/null
+++ b/legacy/planar_complex.cu~
@@ -0,0 +1,85 @@
+/*
+#include <iostream>
+#include <cutlass/cutlass.h>
+#include <cutlass/numeric_types.h>
+#include <cutlass/core_io.h>
+
+int main() {
+
+  cutlass::half_t x = 2.25_hf;
+
+  std::cout << x << std::endl;
+
+  return 0;
+}
+*/
+
+#include <cutlass/numeric_types.h>
+#include <cutlass/gemm/device/gemm.h>
+
+#include <cutlass/util/host_tensor.h>
+
+int main() {
+
+  // Element and layout types shared by all three operands
+  using Element = cutlass::half_t;
+  using Layout = cutlass::layout::ColumnMajor;
+  using Tensor = cutlass::HostTensor<Element, Layout>;
+
+  // Define the GEMM operation in terms of those aliases
+  using Gemm = cutlass::gemm::device::Gemm<
+    Element,                                   // ElementA
+    Layout,                                    // LayoutA
+    Element,                                   // ElementB
+    Layout,                                    // LayoutB
+    Element,                                   // ElementOutput
+    Layout,                                    // LayoutOutput
+    float,                                     // ElementAccumulator
+    cutlass::arch::OpClassTensorOp,            // tag indicating Tensor Cores
+    cutlass::arch::Sm75                        // tag indicating target GPU compute architecture
+  >;
+
+  Gemm gemm_op;
+
+  // Problem size: A is rows x depth, B is depth x cols, C is rows x cols
+  const int rows = 512;
+  const int cols = 256;
+  const int depth = 128;
+
+  // Scalars handed to the epilogue
+  const float alpha = 1.25f;
+  const float beta = -1.25f;
+
+  // Allocate the three operand tensors
+  Tensor A({rows, depth});
+  Tensor B({depth, cols});
+  Tensor C({rows, cols});
+
+  // Device pointers; C is passed both as the source and as the destination
+  Element const *ptrA = A.device_data();
+  Element const *ptrB = B.device_data();
+  Element const *ptrC = C.device_data();
+  Element       *ptrD = C.device_data();
+
+  // Leading dimensions taken from each tensor's device reference
+  const int lda = A.device_ref().stride(0);
+  const int ldb = B.device_ref().stride(0);
+  const int ldc = C.device_ref().stride(0);
+  const int ldd = C.device_ref().stride(0);
+
+  // Launch GEMM on the device
+  const cutlass::Status status = gemm_op({
+    {rows, cols, depth},
+    {ptrA, lda},            // TensorRef to A device tensor
+    {ptrB, ldb},            // TensorRef to B device tensor
+    {ptrC, ldc},            // TensorRef to C device tensor
+    {ptrD, ldd},            // TensorRef to D device tensor - may be the same as C
+    {alpha, beta}           // epilogue operation arguments
+  });
+
+  // exit code: 0 when the launch succeeded, -1 otherwise
+  if (status == cutlass::Status::kSuccess) {
+    return 0;
+  }
+  return -1;
+}
diff --git a/legacy/spectrometer_header.txt b/legacy/spectrometer_header.txt
new file mode 100644
index 0000000..88a535c
--- /dev/null
+++ b/legacy/spectrometer_header.txt
@@ -0,0 +1,38 @@
+ACC_LEN      1                 
+BANDWIDTH    -250                   
+BW           -250                   
+CFREQ        1405                   
+CHAN_AV      0                      
+DEC          00:00:00.000           
+DSB          0                      
+FILE_SIZE    0                      
+FREQ         1405.000000            
+FSCRUNCH     1                      
+HDR_SIZE     4096                   
+HDR_VERSION  1.0                    
+INSTRUMENT   DSAX                   
+MODE         ACCUM                  
+NBEAM        1                      
+NBIT         8                       
+NCHAN        1024                   
+NDIM         1                      
+NPOL         1                      
+N_PROD       1                      
+OBSERVER     DSA                    
+OBS_OFFSET   0                      
+OBS_UNIT     SECONDS                
+OBS_VAL      0000.0000              
+PID          P000                   
+RA           00:00:00.000           
+RECEIVER     SANDY                  
+RESOLUTION   4096                   
+SOURCE       DSATEST                   
+STATE        Coherence              
+TELESCOPE    DSA110                 
+TSAMP        262.144                     
+TSCRUNCH     1                      
+TRANSFER_SIZE 256000000            
+NANT         5                     
+UTC_START    2015-08-07-17:07:28    
+FILE_NUMBER  0                      
+
diff --git a/legacy/splice_offline_beams b/legacy/splice_offline_beams
new file mode 100755
index 0000000000000000000000000000000000000000..728af8c0f771d3c851b2b05a00201bb8672e361d
GIT binary patch
literal 32432
zcmeHwdw5jUx%b*L37KSanGnDrSO*+1D7kUJh$aX)@kB%}iWM9tlSwi%nZ&t3LOC^D
zwB|r6X{qISYz5m>slCYQ(NpTFr%_a_$F|s7<*5BiTiY|VDp;+eSaN>vTI-$JvuA>R
zp7TA=_s_Rs_FC&*zxTS<UVG2p@9f;Mw0W7$Fa&RQaiyTv2A85V=#E1PT|v<x=7~b=
zJ>n8E3hg{eQ~eF9K(E@)Xwh6cG~JGp_~zLZ!CFq#I4;>jD)V(;uHy6rMdeZmw<HiB
z+1hcDY@&3_3YQ199S3)+e&!=G@%3oF9?i$)X>Ayn>>v3?pVeBfwOoQe+PEYvZ7Y5s
zZ>9RY@v@m!#uZs`$$U3}kL>(gH$A*!rPeo6T=r@Exzz58+Hk04&aB#SYjrpjN%U0r
z%$rj^XI4$Dt7eAuo8%Mrn&oSxNi^PCl<XgaHqog6$;#NX+h1OL*+2cZ?^x~OS^b}T
z&2f%+`Ge};yln$=bPn`E(A+ub&7kGN*Ns4*HUhnT1p1XD(7Q&UPac8(H0W-8tlL}=
za@p@3fxdJE`Y%9t<73@cf<QJ|H$Mou^a{+208N8AqOGet7!f{SEbb3%^aVOL`r7=V
zun73W;jVy)#iL<#w*|sov7qSG#NJpu*eTjJM?>+TXp08@tq>6C@%!3B5q~(eB}nqR
zqoGK=jU=K)`bb4I?(6i2ATSyXiffvi7GLd~Q8Tk<R;r$^&8V3zd`+vb^R)(}!S)bT
zMps{Vb+{`MT<vcOlZy7vu80=uQ_!41#DM0sP5!g%lG{|~VGi5z5rU7m`1nvEtqgZ*
z+L%zNgpPzS=yu{l>bHse(WZ^pCtg=J&_u)5t=A!@yEWb6cFTUIbNl$kicV_;ABO}y
zVWKk+(UT^65fSuu+C(p9BFZx+dYOsdr|AWtQ5<-?+eD{zfVVv+I<IZix6ed3umAf^
zbo2Um&_pNQyd5&p5$M!**hHsehqoaUy_kt8pEJ=hfT``6iOyqAo!%X9I=s8`6P0e^
zy>lpT8$RaU@wDTJ7@qyd3*2IO%A44iUEF{gwL6IA<cVQyQ+`Wr3f;+Lvi&NxX$Vga
z$@b5wO(8saNVcD+Hihowe%XGC+7z;rdu01@YE!6A_R045s7)a{*)7`-Q=6v3$qlmo
zAhjt}Cs)Y!eblBYak3%B-?j&Bxi#L@c;k;y>uFrmxO&YhZ~wdhLAEy!#2nsjZ}z~>
zR`0I7DOI3(`*(WC@cN--8zg~ie|}T{)5#DT-u`Er26lS-(0$#&w>^D;U)TTN-u^s~
zci>Ke#o61x#N%k%HO@P*=n>4fACQCI{+HkVM%kB#CW@UyWq1A(J~r)m*3q=%nY!lr
z(>;k7n;~gGaLv1Rk-g2kcHB)|10@MaczinSRJQh?svl||c+_(Yl%x~ZH20rs9teB3
zCck?QrVT9d^enCaxwk*;SwUu05hYGFH`ThSR_TTH`$#$(UTTm=YBUfgOf?tPI;iF#
zfV2gTdDQ6B=sLo*di(P{+UUlWu=gb#d5`Z&+xx+i{?(q=`k}@fv`@)3($Lxe4q$Wt
z2g#{uJ`OxAc=A7JZos3XJ3YHmqe<=U&tEsrEs|x#71%53-d!bL&=B8V2$6wFp86~L
zca_wEE~QyS(Q%XfPr{^S<S7}8Q!>6KWqeD8<5cnrCGuBeEg~^D$hzd#OmU9^CvEZe
zpGs~}ya%kj@Qi{<{%R*_x+d#0McoEAc=srdUgcf66t@)vu&d;{44KLIKbDTq&7i6n
z?u2MMNt(i$JWB9%4206OCZ*{iDQ>H$4w`;9#_V}=x6<^J464%92Q4O_=#WgV&(w4;
z*x=I{465>Jk=9fNO}!a1lapmveg&*T=emr}^z@*~LI>(W(st@>hF?`m+uoG6pHcs=
zt)8_I=u8Q85bL9|EB`ydqVw$vv(wqy_7lstOl^;Y2fkgN(&pFNRzToaQUay6WwNW}
zb15p%`6ZsMbor?Vi&nwpg_&wrDK$UFWL6>k=p*Hn8*1ic2u%K3c9pm@sN@Y+2(m+~
zp|vUbuY^s{i8n_}Z$c?GU({+Uq2_1kOdnv$9@&-O3oJUnm(iJy9XUfIMXOG7X{NT%
zDs4te+Z3(M3xWQWKpDFtrL96~TaxPJ5UK-Ht+#4fOTO}9hHoyV?LHiJDumyse$0yo
zXnVJueUsYmmt7^lN>N!2mQ>SZCu*`gQ_VNQg9$M|rDlaz<AR#q83L2Dm6}Z%)O76R
z`cR%J{#qse*Er6U7w?=QFG^;R44mMwh7QY)rfo;5nS4?9FB*;h<QmkM>{{&!ul;o6
zj;D;qZ6DF0TXyFh8C;w=n*0Ad=^_TKV-A${L*8BY%hO@2cjiQSek{8aYJ_*&({$Fn
zWqsqi#`TT2Hu{cu2WCHs*m?V3NiGMcv;pfn!29zdPM%tlPk(^Z)-If6n){C?kD}r2
ze;fXK2R^T0r@j3j(e*~lJLeo-!kaJ8soS2SgX5Vq<=X`o2ieu<IyO8^6jzGk))Y4_
zAoHJ0{Lrfc;N9_Top*lR<4FAD$sI5(*$uVzZ#DJ5c&oSnx85CZoL;fIc_9CJTx;MQ
zwn`R@UI1oz?2Tn^@m|@*OQ>g2FB)sS1Npb3^v*xy?f-pzJT9Fsqb4q$O4bjbXsao^
zcqf7*#QG!TMY>HBf`9|FXGk&A&Vk?i<T<;y|FrkX<W*kdS?`PI;yB9k-zilBi%H-}
z^7<Bi2$)FJdUq^3O0_j)PqBB`qUX^{{`GzM>4gi==D&!xLFV+qEb;CUh=e}`Uw@LW
zZ&VvsL}zg==rUae;rx$FpqY?_+4T~ncn7w6^3umYt_kvzaq14N_T&L`2pB!;zfRsF
z`5*)HCQr_r#3)=^x^6j=JpGs9VXe>o%tz%*JjEGWE1~rXKp^o}t#wR>)-lO*)D5j;
zfLUPH`g_GURr3|6`I67eE?fd-8svsSK6J^bq{k~|-&xoPA5zzb5CHowQhNTQL`B7F
z->uZG?Yj?{R8*EKzPmJ^L)j-+oXccaPWyiJ-iY?~$8$vQ4&V_P_XySvQMb14<6@I_
z4=KJRXf)R5=<SwWIjx(7u00&DhP!ha^!B^bpnJ4nYq)An25mGOG)@_GQ>H<*{wB{#
zkvR?8gD$qmivVR0->u4?VQ22)dw{xifL;S8HGFp|zN4DYJbW8vS5A8bx?0{@?>+Sf
zj<B^EbIDsjB&WbLAJ#lwcHdCh(x>PvLM^^VJ#*GoT|leehc(Znd&W}Fhcz!qrZjmI
zLIQ^Tm$4$F`?++_I_mkb<|PZmGYBCu{Ig;>mS*@C^?X?KdIm$Hj=;3uqw6?>6Bl~1
zl6zCfQ`wzGFb@a$+9P`~{SFkV=z63mc}@Pz+46Os23&@C<dmQLXBB4;&@qW#bUP%-
za3?RM&Rr$9XekX!isT!9XL$HeJJh<ot7LvA0x^SY<*gnE^fVA2t%)A&R(s|pp9U)p
zZ37nMj}aR+w7W{)E5xe#5OwY<Nuo~kf1XL@Zge3Akc$08&&mJ7R}xy`0sn!9YYS8Q
zU8JABz)G*dS3ZWUX!g`4uQXW^F<B8ID<&x`TC~_YlNF=b3Ky(cCv6$yIXRKO9_=cr
zAHj~w3_HMyK#(=XWDUM1CJ&uX$K*Kd<(Qmsz?v^oC)R(|DJE4WYkJTnV^S%tA)BOR
zSFz{hP59E1EC4{Ccj)_fQ~xXU?Yntk%3b;LR9a2ts>c4mHub;XJni@LUc{4swI|z8
zLFb)s#ohI<@%gX0|D)#q_m}jaY8)Q(x_8GB!#n?1i9gc43ey$Hp7i?t?h&ExZ2*BP
zu5xT577h5K{@Y~P8oW*Tx2St*VvAo_BDmKkqAj9xV{0htuI_eEiTS3)gxuHCzO|+e
zw+c762=CIyB}-TOR;_Maxmv`!645}=7x8z}9m7CJC=&FAT17k<4#onw+enR8f86hj
z_jcp<VW6WPHzC{FL?qDRkHkczB@~Z|xV%}2!r$2~BB8bz>sY#c2^^5yX}X1h#qY}v
zvTXR7YQOf!hA+K5HR-IWzsi_kUl>Pc2|nJphlhtyIuOnz%1V^)VYXG_+GY>h^Kdal
zmwyLQdQf`lLK9^YWf&!0-_y5w2fm3sigF*y<5Xh4t;KQbz}3Tkl(daeg)p|P6h@EB
zm{45c*kcs9h&~aYJ-8gWTw|!njTd-!qwsxiDceP<YguXKwPnuDjy`eKg$pm6>6y&@
z6qZW(=pcgL+$h)KQwJZriPl)^+F`q<)NS(@mAV>B9gR-m2DuNPt-uVTWK5lHdj{s~
z=sNQWnCF0rBYyW}$mz?F^CtRE07EfP>)MuqaUrhFi2F#GO0doX<}ffHQp~R{-C%p8
zsMN98DK?<F0(}z^GZzJ8ajEMb+v3v7&)ebnK;Gigs@?gGrFD0WYAl_%qhM9(p#pn@
zQF^#wed$nv{R{c1HG*baR#Z9<u*TA=#ieerL(F0sFo>jB(;S$Gi|ZijCLK#tI=rOA
zOFFJAZHVR)Chs@Xy1M~#^wc3~|9|^`6al_(&-dvuG;-szShwY!Sk!4<p=}9GNNoCX
zVCs2*e#|NOn7mNIQqKiwEufA2|8j1)i`x7gkG{XtwjxiJboYR^!+NsNH)7h}QiX8p
zQkx^j>NXd+WRY8kzW>d0&Y>OTb%@T>v|XSIu|X?l`hIN~+dZUlbhnE(*8eHhO!<c;
zQ~C#Wk`+s*Y;Vx=u!M5I<ir_Mwy)IvINW8M{m|n7uNC~f*edW|?f5rz`ER;>R+lg9
z@(;RvUzekFfX3-^sxIf~(yHf^+Weg8nyar~;I3NJl8D3;?zuIyYwD_JCS<Grv-R_8
z>Sj$-RB;6)szFv7yY&O8oArRM(1yr`GKcnNw|YR1@iN{+fAMmR1f5Ct<73`zjyxJ8
z`*hkJUx>ZUQSb;Ah9eARHd{e80Jgk>M<sDo1VsBH)bpk+CW<Y8H&L!d<nk_UB6?mv
z9v}*PALf4Ev=xML<iCoMw@(8wZ#wC)mr`~209CgV=Kxjn9itvYQQ$a={iq*eXER16
zfRu#74`9df{J*S1T#6<^c>epJ0Ynth!JU7W;EtldKu`Y1RCg8K1;P1$rFvyiFXEMd
zj_U3rayWmO>Q&Acs4fg@)Hws#=Nqyy&sjwpY_iedY#>;kY<Qg&Br9JwRyhBel#G&%
zwa&kTt@#e)URbyRU#`$7GVVvC!}%83Rcw&K-Oi09%_SRg=W^0B+IRpxJ<eIw7-vuz
z`<zdZv<r-z(Ae&riG6;h@gSt_a0W@hMB{sC-0fURJX4H^0oda_Oy*P@6vlndA5o)5
zHugJd%IDY0#zE&#s8J^yhnz1_V}?-&p2N-<iJmEK8*;uuJ+lp3wVoFCv*;RC@C(3-
zPY~+9!tX$JW&VEhw`db#4^S1OL){OO=Z>NU<i$70aTi852wH)IWRx7mjyx!(2|?6b
z(U8>Vm|7vq-iHcTKgqkwAiC=XlGmW>tH77vDC-4|pQ7P-2ar*;D?bdzg3pl+<w+8M
zx2(PeD+~H%wGu)d7m{Hm9{@EEBQ#n#s!^0*1s#qFM8VYkr7Q|4zS6ojw581jI4bf^
z1L|^OMiicuL351*p=h%0PM``LG_T4JL6OrVtItq%2I(x{085;6Wpx%+7gE)J6?Vmy
zw^Fqk)Z+1U%-XlZm*NYzQ}uF)FP^X$)e2Y1FH6udW(|NPugGd4+9k&?BI;PG{#P~j
zqU8H9hKSPaGEwpr+SH@nDESH6i@0qUg_V%#D*45YAQg>*!je~i2Lxh5pz=S#iIU$@
zXZZ%~O5UJq3Hi4T6)DpY<r@)`(g$uQ6rD3l_e)owhn&&_bg-9q67?GrV591HCEx`D
zJVF2&>|e0N>%~|fP-UZzfKq7RaKH_S5M%pb9uvg@H_^mc`j%c+K=KOh&$-=16JswS
zc1_&P#386-;@wQV+k|}mfSb6**oRCqyBRs#guI`L#fTouJbu7U++ys%kev=?(uo6Z
zqKUDS2&st;%>H9iR5%U&h4vvP9y1a9nE1~mQzK9Bcauml_C6EQ%g8U1of>(FiJvyf
z+{47z$Rv$CcEC*{#n^o&cE^L7_<v07N!I=#;LLu8iI-x+loctD*0Q1p$WBc>!N^sl
zNE44S@?H~iKNIgUAvZ8_ADQG-LEgv2(WG4yS2K~a&dQ39!rDUn3MMWk-%FL9hYn~r
zYfZ$%OuQ6~Wn~I^nAz*dBuzZX!~$Y>(R_v?W{(iD{6@^ZLi=%zl+*nwbjSnf0zB0u
z6Gwj!ltTMy%~?V!|C>RE7`ux+u6RX)hRmQ67&C4m(8Fb7Y%hgi+{Y4if-QTMh!-f!
zdYE_$@{G~VU@o+GXkvkbPPwCqctIRH<7$eq^$Nte|FN++8+cOyU_|A6&}m#vi{X4U
z$8Uj8#<JsRV(YoEB^*mN_vcS|oqDe!G2cb^geS2xnyC2!Y7>#8B#djxirWk<XMxUE
zVYK`^dhLU!dq4Fofm>dX&HY<1MoSdnDtlr`7}t@x4}!;im5b=0szN4SOrZH;tRc*^
zz)WI{hKdOd6p|aM|BvXOd`T$KA&i@-jm`5eWIa`;8vHH6Sovb15L2oy!eAnDQzoLa
zj4CcvI;oO}${;FJU)QY9!s1JfHMULIuQuv0HEzgrJbJa~Qe$P_;E5i@rHYlj4CHBu
z2d>cm8NOhjdQuDejS{4}+;f4Xtq~$<`%ipRpqR9g{g1+eX|rL<0*z__(dg(y+@`Dm
z`5KJVl%=S|P?`Fg2ED0G{cOrqd(j>PmKNFv*c>tKtBA`VH3yx;jE!%T*oQ&B7t$^x
z+U46IyQ&;?+Rj4m3#d~V{$9$Sg)kWPVrsn!f!;#<eyvBw3alQ|#;7Gy8Z?H(_*x}3
z()n(ZwO?spW0HL>@Y&iECWO@fom|?tYE(*l=ULYNHfgu^EA2ZqSAiS%E4B7bh{@MY
zI#1-ZYRH5joAtz@VB9)E2=2a(gAqB3%j78ubd_U1Pq_`1xv0p<G5C50U-k_G(^v(7
znPlx(wx2RtuoL*~$d$oz+Gu2D<bGg@+|<i55qX1WdPdQp>XBEzFev3q*B48)FZZx7
zkFhTdPWuuiF!}Nu^2OS(e7V_V#|q%HeWCP#3}2qM`m#3@kvDichc67eT}cy0x35(D
zQi63zj>}9`<hU?6?aP$}CSQEOOtSVXUyhjUm<W8fFTXP(<hZ<O_2pb9B5yEh@nsw4
zPd<I|!WL15FmAsbi_nyP=-NfT450E|_JzUC4E7Wen0%oV(IjiX@}*v@pc$0R>B|i!
zg!JV#t1sb9MBd=OUOC6+sntbz$d{M~wX=EA57-yF0G7Vc*H<|%3~pwy=VkU~4KS0e
z{mPdIOm;j9eD=8f$b^u-oUr=xY9=CYaF4|oI`$cq@+EczCo_5MZzEsm*nfz9VQ|Wq
z%N`^!jms~{7i+)rWfYDP+Gt$*fY0`2stF-|`MuSb#!Q4fV`Uxt3`)(h_&vB2B7GTV
zU#8$1LyilB)4oIqOupOz%p_~S@}<*cM>Fu*z6_WU(w8@_z8uU%<P9FralAaDK`CDn
zw~U5)biDjM%#(9$Kl{SqlrNqI1SVfzAYZKg%9qznc2okN?aNscLi+M&t1shloblKK
zkvDj^WscGDa*+nLE37a!U0R`iIm5n8z@?fT7Y3(&8AV|7<r-inS^JeQO(r|u$mvVS
zgpj^`X!T`hCL(XJ&*BRmFAPfgviV&ykIt1bm?w{ye`H@6obu(epR+H|kT2GL<;x+H
z9fyF=9+zV#g!E<D>dT)p5qX2hEOU%5f*7=f$ED|XT=B_JzeDC(`;~cRSdM6;!})sP
zv(2kDA*6YE`j$Cgf>rUw9$lZ_t7XVb#6tU5v@4?OdXtzZ6e$0WzP|lir5G&8_T`H`
zV)`u_Mi;ZfLszJ8!g$)~TLf)JU@9*4<yGoMxXP4QsZFR%onpC4RacU9L2WOZuC7u)
z%jSrxf6!tp+;|oMfpE>f{90Ug;~HF4eOdQTM(?evS6-Eisu@~K4SF_fK70!iRejp>
z1?ad(b=bDzg6?h;{UJ@qh2HcHSUhQ4=EknTu^4;#MUcM8$ggjg8;bBNCfs9)>eX2K
zT!Y6DK-%PkG?d2uw_#9f=-IHba2g2PVPy?HB38{LHT_$3s|A;DHCz0;^((aGiEs#d
zX!EN=7!BvfA-=Rot;I!wT%@+5A`ff^FJrLh*94{m`yeorto>@XozyC5wmktn&9=cM
zrlo)CC{Sr5gq&@i)`MYECc-wjf%r2{=~WuiCq*GVS3QTvBGQwW*poBt34?E7u;-fu
zCQm%Y@Wk4$JXvS5;&Z@DPoyRM)uq>jke;+!J=vRyunn%w^5md~+|9;a`2|Rko~(dz
za!lf=$T4B?O$_$L2uz;rCr_;X%9G!kthfRAY)^(w2<gc>t0xyC0<`H-vkk7u@}y2f
z_9~<>7OlgSm7cu7o}6Y+7`&dro<#&EPbT6lHp$wrJn1%BF#-5&PX<f~>B(xVCkHbT
zwn4Ailj%R!-SVKVrdy+~FE!|caSt1Ne=BYWRDKResxkYAAbMeP<pcN%V$2>u^C2|l
zc&*ori<oilEHFwhpC>P^{mRQ0%~jyW-4X3&4C~A`O{Xy!4>{8NR<fRF<b5Ut^~Qjq
z)eg*$nYwOdX7hBM&P$H+d|?4R-$Um!+R~oWNmY4XL!JwqxRmELN714$Jks-@XvUQ1
zbv0m=o^J<clC@uXPN!Si$n$r!k>vS1nogcqKu)&jMJD$wo=-7#tzhPLGBD~}!gL;A
z>3Qw1VSsC|L03X`a)_(Q(K<Q2Ri8ms)Y8uN1>@@}$YcrGi<lXcX>}IWf5PN&eendY
zIcXbI1(tC-k*oMkB%Z?*Ke|tDOr&QI7vNMP<9ALorsDUh2pDDj763EJ+OOhg$E6f)
z6u)}lRU|~!WKE~|t<v<tjG430gs{Z#MpM@wW|r~O({B3h8lmF1fjqz0coNXMAnL}L
zAHt+d=qHQDI2phC4-u&Hg^z)3r+aL5HUvPm>z0F9E}xgweGAOx3+q4-Gk%Y{ZM=Mr
zWv6R~nb+a`4Z`=)IrAFqbmuIeFT(Yie}Guqg~m!c)z>eh#>J%Hz8(T*<JXXAKZSN3
z-G{YZ{4wsB%@K2HfcS}QH+pF^?tdFBGWy+!{=CP4u?@CX4d_uSa0_t&2#)u+HJ##p
zMAI|kUG<C!aU#X3j@IcsYh*m<(nRdj5gwH0kU_8FF}^%G{0s)kMYhJJ=gwC(coBm=
z4-=Ty)*FDCWbIdT=WCjaeCY(9eR)dL$(MUHJ;Rr&ubB{*x%0lM>jh@cSar(vp{;zS
z)y1tT_ptf*(+yO4rT#gXFR$JIz`kCg!6{!ayO+S^Yav2B$=a`cZPZ-k>&?KkuNyR-
zeEp23XZqS}LRfs=Yw9}8%o)eYBN`!noljHx`?x(~D>kUQU>0`tDM&78$AqL$!O?{e
z0<jOD(SmAMG-58(r(n13idQhB=~K~XEIf{2{S=?kf@)XbtReGW#!RqXu>f%Tj482S
zaSC+$jG1X$^geXaM;PVCl+p^Py>vq9nhVMe7tVk-k8nD#1PMpWI6To`;9Tfzg7tZt
z4srQqfZ)-=TtOWL)S`-Gs%#fl81_=9llTl!i%L>L=7GtnoB2%@)GTHdxIi|HrV8RH
zodh|x=q#%MD>>wvrDTBVs_{f9*97QQgwd1WO~nO9TJM;&UOehD<88Pu7$<cgtQXW6
zSB+O*UOK+1qTCo=&`|m1#pB5kJW6K&ZHUbHw2+Ai-i2s7RCK60A>B*hiRoTyUSw)Q
z{ijS|G%t1<Yc3#-ZYM@$bOBW-sVW{#IPrYqk~BXK^^1&pY3W+$%`(PQ@<n|GshQfp
z-GGF8@L&25+mTjUIk*_|tC&oJrxEu`C}i%-NIP!Z&w>zqcL7!?5!L3{WJIP0k<q@B
z+FCOZK0r)eI(Arr&HYUkZs#l+-x(=#KE<BNxKS#@j?u<pp*6|I?6g`e*l<kDt<j-K
z4ca_%gNAs1ji9*^DOJKEn9vI{NSI*@GYN#bB7=k(vM5b}!mB~CiJxLf4C8ER3HAo+
zg4(r-ShO}234{}^!CJi9!8d1?e$PXDBvC8VXe9(czOQW$1hOG>nOl6as9MU_jb#8c
z8nuyN9GSjxMZ~)U{9%?}i&SIW1L+QjkQR(gU%ZPV=xYi3J7YDVqbJ@P>Y7o5$6>PG
z8f-%yXGp10k>Nt9OI4ROH7`XCS1^(fRjf;~-k2{KjfU}>0EO!G$2(M4A_8Kos<#K@
zUEOhCpeq_wFcJ}oS0}KGC`yP*#5^AABn)JC;Qa={DC#)qVxji#Xject;&{bDyRWS)
z8V|LFf}{f;T+xyUh2zzs$d$@B!YDyd20v6C=L{R4`t4(k?eE>}xase1cI<E**lyqA
zSgTrhJ07rIHg>x`t_k*H$97GMIIfTw`)DFL9@%d1a9pb>m6Gz{cDtV`7fZ^4ir}^H
z+Avqu|7jJ-J2C7xIrdQL_!jMV(C%hQFLpdG%ZZM!%JQZ-6$<94rFteVW~;^FpF-r2
z6tmTv5wB#(uurtjHEc+sG7uPkCTxF77|U1h-5qz#`1Fle+AgoOO>^bh?5o>g_Yq;6
zZ#X8S`R{jJkw=2<_M^hqW#r*S52*j(j<}{iCv3R;L9|Eic!SD!WaH=J+nPw8*~uHb
z<9BkukoMn6QSC1Yhh(%re22u6b9RSBB~mCE8oT3{7T(dCwNu9Nf6;H7ES*ql<p?O7
zF)5UBu}?;58)&S}{#k7tvHz1bd@|y1w81(CztLmh_yZYlzwL4R1jpk$cG|DQ;9?T|
z(s%=t;ya&p?6G&_Q(<eOXxZMe*FG+1RV)~3^|f{-<eNj9uJ_SPOMKn&D8JJL?<t}8
zlK5lsIvnI%g1)%$#X4|gw<Ox|su5Zs<`K<Dohb0|-Ul%=eD(OT1eS*=y@BL<AJD!Q
zKVFqm+tJk-tW`OgwOBH4t8I<>>+9>PV;#Y8m>WI8NIMooxj@qIG3X{6-f+_Hmb*j*
z%S|NM>ZUwM`o#o(Cy|KbcNNet7nxARhh<j2r-j@j7>of%zh7nf8u!)p^(qzAUES%A
z209jPjD%6`?Cill(p5dPHPW>i)mSX3_A^v5Qx>rxm4KmpmLJtXPyHO~@prVL=<M3m
zLTvtE410og5U4v4?o5!D?yl}YBu;>qP6Y^W?nb4plVHH3GE3EZx~QY2Go+facCrsm
zG6%r86yV=PRP42pvuaY6dUS-hC~d$;<62=ym<)`zMkB2#+v?{8sED=4Xq046EQkoh
z)+m07LXq&dl9INbSQJH1%)g0Xvc#!_`Zx8=uAf2fLyu|%3970Al9lS{*0Anquq_ye
z{5aVe2)Bh3m|RkvluSm+3dK<gYP~&GZINb4wVNocDe<9HAQjK1RQyzX7W%vO&}ho0
zNF+q!0^!XqN>t}&e<-eJLm<2*80`YRdt*ZNMq9f>aH+c6AN6;-!}Zkx{GumVO-I3^
znTWo%gU;=$cMe9DduN#0>Sxu>tAq4-A`)EG9*hK|A^6|!uD%|Vtxdj%s2WF1JQ#>~
zMXT`!A{-E=9}M|?bZXv0zfDRXDst-6F+e1N_YmP0K$kos<$;bK95b!K7#+chIX-`D
zYqWk&x;}FTc*6enSn42_9Dzg>*9zf8XGDZZDXwAUOO1rDE8I#~GBI55U}#%AMJO_d
zghGE;cMIO6B-+AVeq0c>`?|ZxIJJb#oPjmOk9Q<>5q)zoieEd?br9I&!FJiCA3xDb
zf}?dK2=W3Y<7ZUrhf$rkwRW|{&@JE3#J}aD*Eqr9PJuO`8_NI*qU$AJHzu5H$#*u%
zg)c;!<%JVQAd1zFR=Re7*cTRZ;d5s!KO~8c1+}`3a4iS#;kk+=BHG#2DqjzEdm<PK
z^!noJwM`JDS4}KmhzqY|nXin(!Qqc$Bz^MrQa-#@${!OThG--qLis_X6zNVN{flU%
z9r5L%@JD)60T3ZtN+AxNz644Po?Om@kxd~?FcC<EF}4(Yxz^8_g<kz?tnMzvIemQz
zzvR$>NE0s5)fH{kbp#(HE>eNRIVCDu<Qc>Kqc{~_niWz1W}o~49L2pw|Jn_2%fkJu
z7Jsxi5R5=#K=?Mv*zt@bUv&sE&OU)tfSN_>Rb0|6IsM>_Je<<6_1f(3?)Jq3{)jJ<
z=xo9KqIl566%<9BAhbqIs1^!0{%beM6;Q)EK=pHE`1G+fkFYe0+5+-LVRXX7;VXQR
zU=PiGL|aCbESCqQJV&b2u@8%50K<bppl&(I)bZNZNxa?u)_PT7N>OX3DrTu-cKU~l
zzC<Ju3%1UngNBU31llAzgPnAg;^_DHhC}Tg@CP%Yolc_AMu%#mTTSYWA1H?DLO}lR
z5JPWz+ZkkHJmdkSz8Co7bR1$J=>$#d4tnET^p78LML}BBWR*VaA34f_M^w^3w&ZKx
zn{mH`Hbbo8DnH!I6Ym;p9#GxsJu|ka7*})kpK!m9Hbbo9>c46d$y)sYw{F2bE)C?W
zS?TI5rC-p>?oBCO!PRf4l&)j(EX$BTMpUZw{yr_>!L%T*eiP3bX*0x)TxEfFF}S~%
z8*DLfTxEq4r=+C!oB{b-%Fq>DWq$CT=t*;^8k_orHr0k3^;Qh0PoC1Ub78pe<MUw7
zV<V?}=CqOd7?dw4{WHr<E6gV9KM{lQTD|4R*E~GrE!SU7jozoZ+NK?b<=^h*#vQ3q
z_%v5}U>$_pn|F!v0FMigYr4m3(jl#uvrLObDw{ORLo3!0w_3#&>*PdE`h+-68qE^N
z7*6yAmQy^lu$;!pDUmsX`)jS5@?B}m6O`4c>ghcT;Cw_V<?P5Tr7V&2DbI&(!p$n$
z=wUr=(2hJrRZs6(D`yZwBIh$^X=jO?0eL>`ziO>drL`k-Fs)bBSSx24LLz4%W@%@M
zoU?d7>~uUirmaqwhFGBp={>9Dq(F${guyJ;ERItQ&xgHFtNiN7s#zsx)Il8Q_hqSO
zahwr&KJ5EiWj>x3(}oe38RM$Od)CT1aFECud0E<7BIod(54&D#4X3pu_l>o3$yzy2
z4iY(sE=xO0<gB~%VPDo-U)5!f5oeX0)&_B$@|LBV#c`6{`LIiH3z#;qE_0}6m7Eg>
zah#!+rJBWYPTTpg`?bnPbcqp1E|@YWy=SeQ@&$>UQ<kNjC2|(p`LHanR-XWX<z%od
zxtzAeSy{kx_SJ~k_w;F+R<7O$xWy6-u|Dmhs&P`n1f5r82tQXjMP!tC*Z!CCX%BF_
z3(q2G1C*<r)WJ{%Nbj??wB*z>I+vCf*7CGgY!jOW*E2MOJ*`vKvNZ>CYOctoIbhX{
zllZ|qa;gUZC#!DFsd`K<)vY;HAIhy7{8rVk>(NZ=(k2EsSTUSmc8ShS!t~-yu@U8Q
zZrS;;H)(md=n}fuT6N!``@Wvm&ADA!x>+7)gF*L**k5URze(%PI*Yxc`*=D-9%o}^
z>1KJHzjZ$BMm;-u;NZ8{>UU6McnU!t=TBwnW_g@(bv`U7*!XoS4!%cm_4yQO6<6tZ
zOth)5R@~wv1>c)gcY2>Kes#K2)i~3N)tW5eq^TT&IsNHHJcFbSTwGm{wwtR9GwkLT
z8}~Pf&$65Iq*$%V0#13#A(#_LY&!h{TwL9lBCX<TSd&z|xy4C6f0KB>icfma`AMwS
zWC5od<q*sd-*)Tl7;tg*)f8zJS6|a4WjD9j_Z;Fmi{`;p4AyYfnHu3$TrJilC7xU5
zX<*u0Rm&Ei6&6mtVYP6OtDHrX!*0%qX-Mf_!_{x5NUON|aE9I7;@p$JNqp8c<TM&q
z3%j|>88JBo^OthIQsD29@9V1z{y8Rple7kuf4azDU#;H_t$0pT&)471tX~wZc%E*2
zlIFWM{EgO%|Kv8kv3?$&l3O=B9-Z^wu`u`9@yI-$6Q7rQ-%KVxKlPrOO!}zQ`)D%h
z1%ltxl1X=n4S1k#-SB<@yb9GyMG}xGO1*E!+-nz3^#*h^5p}yL7JW!yvTk;~ou@%x
z1DgBn$Rp+GXVS}3@A1i`yTl4g{jh9yQJ(s{ALcGQ5~B`LQj}%0i;C2Hfy`ZYF(&ms
zpiKH${5=!5nI!9WF;4LNd(547dRvb>7rqkYT=X19LZDqB8#c#g7f6`PvDs5UgU;-<
z3*;K+*z5wyc{w(_xHt!u>~Uw&4N&sLJHyrv8m3(IOF*|_-1q4FoXj^x(~nu`^EBP9
zmwm=BN5v57c#<8o(RT?W$2jf;o%C+7$k`+D+4Er}`yY|`(c;Y<Ly*hgCxAzKTGqV!
zsg#pFuU^r3{yP@3iQOM%C3~L#6?7xVJgh)KM#J9*eZ25;bBUz0)O0x;^m4?r!R?kr
zWZTH?YL=rfb0}W4U8m{1nszI)SO-NUhhJ2|hyBgKyEVpK(EDQCIgXbE@Fa&{<$$K#
zK7)!rk(<Sc&ucut4}$IAtLbaqZi#1m?$h)Q7W&sTonMu}_=h#UTjN>&A<$j0-=*nn
z)b}->-^0Lip3w9@jc5APpy%@IB1z9)59xh}x$tj-PI0^2t^8#>-yZ=#T8|gMKZWrZ
zgPu$NOpV{8<+J_sCB8H(k9V=g@3Yv`r0Mrt=*zVnz6j^}wpQc!TlD%h{h)>3rsbT_
z$NQijnA;_tM=4!?)lhLeWRbH+%Rg+Pe`y3c-_-aa3;q!3G*5)4v*sr>{y9zO_|uz_
zXdI7O<h-Qm$1U{NHT?w3?^7DyWcq%$B=C6sS=0GFFHHXkbcIy>ye~wSAK7!-BBzY#
zNCb7TUd=F3;?w7A_Um%cX<gzkcKlPfxmx}ii~Oq?ufL=QwfyTPovllk0WC)ya7*F}
z-SuOXF4*sSP|-D6bZPvG1B%XZ_&3CX?BuJ$%QZtE@KTP&-`$`Sp08(ITF{`Dzd_4q
zd%hy^TBxZwr16I|p6z)|(+^wdk4rhFS-%x{R^x{>-lcW?Leq~~<h(BBXppHmO?Zs2
zb$yF#Imfje9^a2acboXRIt~DuKd)Qx^b#wgpRmxUfj*g;O=Xj`GyD2;sg`qE%h9Jj
zK`&z?`TUY2j^|CF=UU%JN{$Ug@gCenTO0CZ(|-la7w`1pF?$5rxOl4F<!cXjwIF?<
z71=T|pFhzf0$rWm;b0sIsdMMdpO+g%f2u3w^GBopULQU4??uWOUfAnvO>}l5Z31Z*
zsTOi3Lh%f!&$n!4<8@1Yco(maQaCaI$j?}E^YX^)ny$|5QF$vqAO4n{Z>d*H@Ge;?
zeAhHzzqqm4cm1+utCp_zt!`Y5gca~2S2o}a#1hg}nbFqJz<)A~{y3N=V}X(}WcH8D
zf6%EI=lcU<DkVhbOk@gHnK3FACiS3RXVj>-6Z?WP9Z#j6=u{k)`j9C}rRQYQ(yu<Y
zW@n^vvohb7tWprt-Q>MWJCeRqCW+1?QW*;}FNG2|r1kWIWtq;9kr*PQE5}6sMYc@g
zG7VCt7G`$n*D_N!1zzB+Q<2i|cs8d*a1ICpoRI*eQeDy+OBsK~O}+0K;$vMtybZdQ
z(kUn@gwnKql-!Y-bfZ%yO>>2&qR#2ewA`G4la&R7ETmX0<qono5V%x6hfD*q2%|(8
znIj`(s`E-S61!5b(^h{bFO%Qr!=JL_!-xLHUCuvTr*lOzGJ!Jw%wDF@R0>n-1>PAc
zRcSE&y**JA>+M8V5z2T}l^tB8WVC3vsEHs4ucmQvQ+3?mu3PwFK@BBrwN~RL-Zj+f
z@W(pv>+s$P<f#%#wX$bZFe<Zp%ncv<qQS7A7<8>W92Yfm*lJL%Y41WaPH97Uxp*W5
zP?tQ`YJwelQR+apQmUm`)#{`;xDE~eP9#s&D0U36sKLtEiPesN&EAq|2htyD$1<v$
zp-5X7*ZnOm(cmU-QC1_@5hbot=B?({l1%4}|4o&?<{9|lazJi;pNC8Q<Sb(&ta-Zx
z4Z0=5{Cq!%OSc(}I!;R&TRmE4e!g$Sr6&WO&A$Zwc*S`a>))p<T(bT7S`MEdSuB0e
zrbqwGe^^(z+@~F;hm}-iem?)+2pUe=DSm!`GM8_m&%9aVM|mLhE@IZt_qDj3teM$<
z#&Nk3{q$~QZu9*wE*mtz)qYJDTTr1fVSc_(#^sO(vVN=oJ2n4ejicYm(#9pf@0s}N
z8IYC#^Po{o_+c==@0v?}Sqy~Q$X4^#kA|6_@6T~LWCo+o*NI&2wea(OJTBQ@7Q<~W
zzhdF%>tinetnUglKkMc8*DU-?^u*#)CmAt3>-hPm=I8O_`-fa6Q=F>8b~E{3Ed18L
zGaP52F0JwZk%gb{M{?P~hG~AQe)`^0iVqz_wDEl*{`<pLJh!?0C3q;r%+G%ZwfiEH
zmY#2_m2NZtZ_q(=kNNpNDF1!u)B1qr`N@1dj{Cq%5a#Fmr5*YfBlVj%=I4^`dztxD
z-=E!zi}kP!=HvEhREWm<-EOz+a;ZX`-X@!=ohGN+e?g7(*QF?`!u-=Lbi&fc<vCMb
zcIe%Fx2iPgHW$pt?Gg~IH0tYhE5r(2s#Z33{0RI%ZcrREI51j(RoA5>@SnX}F<iq7
z-w6COHNQ1}i<=a~0n31~elD*ZLI3Y=Q2bYwYsDkzZyJGrazHU0%Vf`rHQ8Y~L?y==
z&7Y)`Beu!-%)p1;sU#w{LVPSCmh=-{vJg?rg28^PUi$8un}4NVcy6*VSeoL00P(di
AKmY&$

literal 0
HcmV?d00001

diff --git a/legacy/splice_offline_beams.c b/legacy/splice_offline_beams.c
new file mode 100644
index 0000000..a70a258
--- /dev/null
+++ b/legacy/splice_offline_beams.c
@@ -0,0 +1,132 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <time.h>
+#include <arpa/inet.h>
+#include <sys/syscall.h>
+#include <syslog.h>
+
+#include <src/sigproc.h>
+#include <src/header.h>
+
+/* Stream that every send_*() helper writes to; main() points it at each
+ * output .fil file in turn. */
+FILE *output;
+
+/*
+ * Write a string to the global `output` stream as a native `int` byte count
+ * followed by the characters themselves (no terminating NUL is written).
+ * The return values of fwrite() are not checked.
+ */
+void send_string(char *string) /* includefile */
+{
+  int nchars = strlen(string);
+
+  fwrite(&nchars, sizeof nchars, 1, output);
+  fwrite(string, 1, nchars, output);
+}
+
+/* Write keyword `name` with send_string(), then the raw bytes of one float. */
+void send_float(char *name,float floating_point) /* includefile */
+{
+  send_string(name);
+  fwrite(&floating_point, sizeof floating_point, 1, output);
+}
+
+/* Write keyword `name` with send_string(), then the raw bytes of one double. */
+void send_double (char *name, double double_precision) /* includefile */
+{
+  send_string(name);
+  fwrite(&double_precision, sizeof double_precision, 1, output);
+}
+
+/* Write keyword `name` with send_string(), then the raw bytes of one int. */
+void send_int(char *name, int integer) /* includefile */
+{
+  send_string(name);
+  fwrite(&integer, sizeof integer, 1, output);
+}
+
+/* Write keyword `name` with send_string(), then a single char value. */
+void send_char(char *name, char integer) /* includefile */
+{
+  send_string(name);
+  fwrite(&integer, sizeof integer, 1, output);
+}
+
+
+/*
+ * Write keyword `name` with send_string(), then the raw bytes of one long.
+ * sizeof(long) bytes are written, so the width follows the platform's `long`.
+ */
+void send_long(char *name, long integer) /* includefile */
+{
+  send_string(name);
+  fwrite(&integer, sizeof integer, 1, output);
+}
+
+/*
+ * Write the coordinate keywords (src_raj, src_dej, az_start, za_start), each
+ * as a keyword + double pair via send_double().
+ *
+ * A value of exactly 0.0 or -1.0 is treated as "not set" and its keyword is
+ * left out.  Both inequality tests must hold for a value to be written:
+ * joining them with || would be true for every possible value and the
+ * sentinels would never be skipped.
+ */
+void send_coords(double raj, double dej, double az, double za) /*includefile*/
+{
+  if ((raj != 0.0) && (raj != -1.0)) send_double("src_raj",raj);
+  if ((dej != 0.0) && (dej != -1.0)) send_double("src_dej",dej);
+  if ((az != 0.0)  && (az != -1.0))  send_double("az_start",az);
+  if ((za != 0.0)  && (za != -1.0))  send_double("za_start",za);
+}
+
+/*
+ * Splice up to 16 per-input beam dumps into 256 single-beam filterbank files.
+ *
+ *   argv[1..16]  input files, or the literal "none" to leave that input's
+ *                channels zero-filled
+ *   argv[17]     output prefix; files go into the directory <argv[17]>_<argv[18]>
+ *   argv[18]     source name, also written into every header
+ *
+ * Each input is read as 15*512 time samples x 256 beams x 48 channels of
+ * unsigned bytes, indexed [time][beam][channel].  Input i is copied into
+ * channels 128+48*i .. 128+48*i+47 of a 1024-channel output spectrum, and one
+ * file <source>_<beam>.fil is written per beam.
+ *
+ * Returns EXIT_SUCCESS, or EXIT_FAILURE when arguments are missing, memory
+ * cannot be allocated, or a file/directory cannot be opened, created or
+ * written.
+ */
+int main(int argc, char * argv[]) {
+
+  // 16 inputs + output prefix + source name must all be present, otherwise
+  // the argv[] accesses below run off the end of the array
+  if (argc < 19) {
+    fprintf(stderr,"usage: %s in_0 ... in_15 out_prefix source_name\n"
+            "  (give \"none\" for a missing input)\n",argv[0]);
+    return EXIT_FAILURE;
+  }
+
+  int status = EXIT_FAILURE;
+  int n;
+  char cmd[300], foutnam[400];
+  const size_t beam_bytes = 15*512*1024;   // bytes per output beam
+
+  // memory
+  uint64_t bsize = 2013265920, bls = 94371840;
+  unsigned char * allbeams = (unsigned char *)malloc(sizeof(unsigned char)*bsize);
+  unsigned char * data = (unsigned char *)malloc(sizeof(unsigned char)*bls);
+  if (allbeams == NULL || data == NULL) {
+    fprintf(stderr,"could not allocate beam buffers\n");
+    goto cleanup;
+  }
+  memset(allbeams,0,bsize);
+
+  // load in data if present
+  for (int i=0;i<16;i++) {
+
+    if (strcmp(argv[i+1],"none")==0) continue;
+
+    FILE *fin = fopen(argv[i+1],"rb");
+    if (fin == NULL) {
+      perror(argv[i+1]);
+      goto cleanup;
+    }
+    size_t nread = fread(data,sizeof(unsigned char),bls,fin);
+    fclose(fin);
+
+    // a short file would otherwise leave bytes from the previous input (or
+    // uninitialised memory) in the tail of the buffer
+    if (nread < bls) {
+      fprintf(stderr,"warning: %s is short (%llu of %llu bytes), zero-filling\n",
+              argv[i+1],(unsigned long long)nread,(unsigned long long)bls);
+      memset(data+nread,0,bls-nread);
+    }
+
+    // transpose [time][beam][chan] -> [beam][time][chan] while placing this
+    // input's 48 channels at offset 128 + 48*i
+    for (int ibeam=0;ibeam<256;ibeam++) {
+      for (int itime=0;itime<15*512;itime++) {
+        for (int ich=0;ich<48;ich++) {
+          allbeams[ibeam*15*512*1024 + itime*1024 + i*48 + ich + 128] = data[itime*256*48 + ibeam*48 + ich];
+        }
+      }
+    }
+
+  }
+
+  // make files
+
+  // NOTE(review): argv[17] and argv[18] are pasted unquoted into a shell
+  // command, so shell metacharacters in them are interpreted by the shell.
+  n = snprintf(cmd,sizeof(cmd),"mkdir -p %s_%s",argv[17],argv[18]);
+  if (n < 0 || (size_t)n >= sizeof(cmd)) {
+    fprintf(stderr,"output directory name is too long\n");
+    goto cleanup;
+  }
+  if (system(cmd) != 0) {
+    fprintf(stderr,"could not create directory %s_%s\n",argv[17],argv[18]);
+    goto cleanup;
+  }
+
+  for (int i=0;i<256;i++) {
+
+    n = snprintf(foutnam,sizeof(foutnam),"%s_%s/%s_%d.fil",argv[17],argv[18],argv[18],i);
+    if (n < 0 || (size_t)n >= sizeof(foutnam)) {
+      fprintf(stderr,"output file name is too long\n");
+      goto cleanup;
+    }
+
+    // the send_*() helpers write to the global `output`
+    output = fopen(foutnam,"wb");
+    if (output == NULL) {
+      perror(foutnam);
+      goto cleanup;
+    }
+
+    send_string("HEADER_START");
+    send_string("source_name");
+    send_string(argv[18]);
+    send_int("machine_id",1);
+    send_int("telescope_id",82);
+    send_int("data_type",1); // filterbank data
+    send_double("fch1",1530.0); // THIS IS CHANNEL 0 :)
+    send_double("foff",-0.244140625);
+    send_int("nchans",1024);
+    send_int("nbits",8);
+    send_double("tstart",55000.0);
+    send_double("tsamp",8.192e-6*8.*4.);
+    send_int("nifs",1);
+    send_string("HEADER_END");
+
+    size_t nwritten = fwrite(allbeams + (size_t)i*beam_bytes,sizeof(unsigned char),beam_bytes,output);
+
+    // fclose() is evaluated first so the stream is always closed
+    if (fclose(output) != 0 || nwritten != beam_bytes) {
+      fprintf(stderr,"error writing %s\n",foutnam);
+      goto cleanup;
+    }
+
+  }
+
+  status = EXIT_SUCCESS;
+
+cleanup:
+  free(allbeams);
+  free(data);
+  return status;
+
+}
diff --git a/legacy/test_read.c b/legacy/test_read.c
new file mode 100644
index 0000000..2b5730a
--- /dev/null
+++ b/legacy/test_read.c
@@ -0,0 +1,279 @@
+/* will reorder raw data for input to xgpu */
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+#include <x86intrin.h>
+#include <smmintrin.h>
+#include <immintrin.h>
+
+// Number of leading bytes of each data block that main() logs to syslog.
+#define S 4096
+
+/* global variables */
+// Debug flag; main() sets it to 1 when -d is given.
+int DEBUG = 0;
+
+// Forward declarations of functions used by main().
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write);
+int dada_bind_thread_to_core (int core);
+
+/*
+ * Release the lock held on a DADA HDU and destroy the handle.
+ *
+ *   write == 0 : release the read lock
+ *   write == 1 : release the write lock
+ *
+ * Any other value of `write` leaves the HDU untouched.  A failed unlock is
+ * logged to syslog and the HDU is destroyed regardless.
+ */
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write)
+{
+  switch (write) {
+
+  case 0:
+    if (dada_hdu_unlock_read (in) < 0)
+      syslog(LOG_ERR, "could not unlock read on hdu_in");
+    break;
+
+  case 1:
+    if (dada_hdu_unlock_write (in) < 0)
+      syslog(LOG_ERR, "could not unlock write on hdu_in");
+    break;
+
+  default:
+    // neither reader nor writer: nothing to release or destroy
+    return;
+  }
+
+  dada_hdu_destroy (in);
+}
+
+/*
+ * Print the command-line summary to stdout.
+ *
+ * The program name and the -t default are stated as this file implements
+ * them: openlog() identifies the program as "test_read" and main() starts
+ * with nthreads = 1.  main() parses -t, -b and -o but never reads the values
+ * they set.
+ */
+void usage()
+{
+  fprintf (stdout,
+           "test_read [options]\n"
+           " -c core   bind process to CPU core [no default]\n"
+           " -d send debug messages to syslog\n"
+           " -t number of threads [default 1]\n"
+           " -b connect to bf hdu\n"
+           " -i input key [default CAPTURED_BLOCK_KEY]\n"
+           " -o output key [default REORDER_BLOCK_KEY]\n"
+           " -q quitting after testing\n"
+           " -h print usage\n");
+}
+
+// MAIN
+
+/*
+ * Attach to an input DADA buffer as a reader and log the first bytes (at most
+ * S) of every data block to syslog.
+ *
+ * Options are listed by usage().  -i selects the input key and -c pins the
+ * process to a core; -o, -t and -b are parsed but their values are not used.
+ *
+ * The loop ends when a block comes back shorter than the buffer size or when
+ * no block can be opened.  Returns EXIT_SUCCESS on normal completion and
+ * EXIT_FAILURE on option or DADA set-up errors.
+ */
+int main (int argc, char *argv[]) {
+
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("test_read", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+
+  // data block HDU keys
+  key_t in_key = CAPTURED_BLOCK_KEY;
+  key_t out_key = REORDER_BLOCK_KEY;   // set by -o, not used further
+
+  // command line arguments
+  int core = -1;
+  int nthreads = 1;                    // set by -t, not used further
+  int bf = 0;                          // set by -b, not used further
+  int arg = 0;
+  unsigned int key_arg = 0;            // "%x" stores through an unsigned int *
+
+  while ((arg=getopt(argc,argv,"c:t:i:o:dbqh")) != -1)
+    {
+      switch (arg)
+        {
+        case 'i':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &key_arg) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              in_key = (key_t)key_arg;
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-i flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'o':
+          if (optarg)
+            {
+              if (sscanf (optarg, "%x", &key_arg) != 1) {
+                syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+                return EXIT_FAILURE;
+              }
+              out_key = (key_t)key_arg;
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-o flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 'c':
+          if (optarg)
+            {
+              core = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-c flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+        case 't':
+          if (optarg)
+            {
+              nthreads = atoi(optarg);
+              break;
+            }
+          else
+            {
+              syslog(LOG_ERR,"-t flag requires argument");
+              usage();
+              return EXIT_FAILURE;
+            }
+
+        case 'd':
+          DEBUG=1;
+          syslog (LOG_DEBUG, "Will excrete all debug messages");
+          break;
+
+        case 'q':
+          syslog (LOG_INFO, "Quit here");
+          return EXIT_SUCCESS;
+
+        case 'b':
+          bf=1;
+          syslog (LOG_INFO, "Will write to bf dada hdu");
+          break;
+
+        case 'h':
+          usage();
+          return EXIT_SUCCESS;
+        }
+    }
+
+
+  // Bind to cpu core; only report success when the bind actually succeeded
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+        syslog(LOG_ERR,"failed to bind to core %d", core);
+      else
+        syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+
+  // DADA stuff
+
+  syslog (LOG_INFO, "creating in and out hdus");
+
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      return EXIT_FAILURE;
+    }
+
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+
+  // get block sizes
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t bytes_read = 0;
+  uint64_t block_id;
+  char * block;
+
+  // set up
+
+  int observation_complete=0;
+  int started = 0;
+
+  syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+
+    // NOTE(review): presumably NULL means no block could be opened (error or
+    // end of data) - confirm against psrdada.  Either way it must not be
+    // dereferenced, so stop here.
+    if (!block) {
+      syslog(LOG_WARNING, "no further data block could be opened");
+      break;
+    }
+
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+    }
+
+    // DO STUFF
+    // never read past the bytes actually delivered in this block
+    int n_dump = (bytes_read < (uint64_t)S) ? (int)bytes_read : S;
+    for (int i=0;i<n_dump;i++)
+      syslog(LOG_INFO,"TEST %d %hi",i,block[i]);
+
+    // a partially filled block is taken as the last one of the observation
+    if (bytes_read < block_size)
+      observation_complete = 1;
+
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+
+  }
+
+
+  dsaX_dbgpu_cleanup (hdu_in,0);
+
+  return EXIT_SUCCESS;
+
+}
+
+
diff --git a/legacy/test_write.c b/legacy/test_write.c
new file mode 100644
index 0000000..32dd25d
--- /dev/null
+++ b/legacy/test_write.c
@@ -0,0 +1,452 @@
+/* test writer: for each input DADA block, copies a constant-filled buffer into the output DADA buffer */
+#define __USE_GNU
+#define _GNU_SOURCE
+#include <sched.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <syslog.h>
+
+
+#include "sock.h"
+#include "tmutil.h"
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "ipcio.h"
+// Forward declaration to keep compiler happy
+// Possible minor bug in PSRDada
+int ipcio_check_pending_sod (ipcio_t* );
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_capture.h"
+#include "dsaX_def.h"
+
+#include <x86intrin.h>
+#include <smmintrin.h>
+#include <immintrin.h>
+
+#define S 4096
+
+// per-thread arguments handed to massage() through pthread_create
+struct data {
+  char * in;        // source buffer that massage() copies from
+  int n_threads;    // total thread count; each thread copies S/n_threads bytes
+  int thread_id;    // this thread's index: selects its core and its slice offset
+  ipcio_t * out;    // destination; massage() writes into out->curbuf
+};
+
+/* global variables */
+int DEBUG = 0; // set to 1 by the -d option; gates the LOG_DEBUG syslog messages
+int cores[16] = {4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}; // core that massage() pins each thread to, indexed by thread_id
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write);
+int dada_bind_thread_to_core (int core);
+
+/* Unlock and destroy a DADA HDU: write==0 releases a read lock, write==1 a
+ * write lock, any other value does nothing. A NULL handle is ignored. */
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write)
+{
+  // callers may pass a handle that was never created (still NULL)
+  if (in == NULL)
+    return;
+
+  if (write==0) {
+    if (dada_hdu_unlock_read (in) < 0)
+      {
+	syslog(LOG_ERR, "could not unlock read on hdu_in");
+      }
+    dada_hdu_destroy (in);
+  }
+
+  if (write==1) {
+    if (dada_hdu_unlock_write (in) < 0)
+      {
+	syslog(LOG_ERR, "could not unlock write on hdu_in");
+      }
+    dada_hdu_destroy (in);
+  }
+}
+
+/* Print the command-line help for this program to stdout. */
+void usage() {
+  fprintf (stdout,
+	   "test_write [options]\n"
+	   " -c core   bind process to CPU core [no default]\n"
+	   " -d send debug messages to syslog\n"
+	   " -t number of threads [default 1]\n"
+	   " -b connect to bf hdu\n"
+	   " -i input key [default CAPTURED_BLOCK_KEY]\n"
+	   " -o output key [default REORDER_BLOCK_KEY]\n"
+	   " -q quitting after testing\n"
+	   " -h print usage\n");
+}
+
+/* Worker thread: pins itself to cores[thread_id], then copies its share of
+ * the S-byte payload (S/n_threads bytes at offset thread_id*(S/n_threads))
+ * from d->in to d->out->curbuf.
+ *   args - pointer to this thread's struct data
+ * Always finishes with a NULL thread result. */
+void * massage(void *args) {
+
+  // unpack the per-thread arguments
+  struct data *d = args;
+  int thread_id = d->thread_id;
+  char *in = (char *)d->in;
+  int nthreads = d->n_threads;
+
+  // set affinity
+  const pthread_t pid = pthread_self();
+  const int core_id = cores[thread_id];
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(core_id, &cpuset);
+  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (set_result != 0)
+    syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
+  // read the mask back so the debug message reflects what was really applied
+  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
+  if (get_affinity != 0)
+    syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
+  if (DEBUG && CPU_ISSET(core_id, &cpuset))
+    syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id);
+
+  // place in out
+  int i = thread_id*(S/nthreads);
+  memcpy (d->out->curbuf + i, in + i, S/nthreads);
+
+  // Finish with NULL instead of the address of a local variable: the local
+  // lives on this thread's stack, which is gone once the thread has exited.
+  return NULL;
+
+}
+
+
+// MAIN
+
+/* Test writer: for every block read from the input DADA buffer, nthreads
+ * massage() threads copy S bytes of a constant-filled buffer into the next
+ * block of the output DADA buffer. Options are listed in usage().
+ * Returns EXIT_SUCCESS after a short input block (or -h/-q), EXIT_FAILURE on error. */
+int main (int argc, char *argv[]) {
+  // startup syslog message, using LOG_LOCAL0
+  openlog ("test_write", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+
+  // thread bookkeeping: room for at most 16 joinable threads
+  struct data args[16];
+  pthread_t threads[16];
+  pthread_attr_t attr;
+  pthread_attr_init(&attr);
+  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+  void* result=0;
+  
+  /* DADA Header plus Data Unit */
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+  dada_hdu_t* hdu_out2 = 0;
+
+  // data block HDU keys
+  key_t in_key = CAPTURED_BLOCK_KEY;
+  key_t out_key = REORDER_BLOCK_KEY;
+  key_t out_key2 = REORDER_BLOCK_KEY2;
+  
+  // command line arguments
+  int core = -1;
+  int nthreads = 1;
+  int bf = 0;
+  int arg = 0;
+  
+  while ((arg=getopt(argc,argv,"c:t:i:o:dbqh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'i':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &in_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-i flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 't':
+	  if (optarg)
+	    {
+	      nthreads = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-t flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+
+	case 'd':
+	  DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+
+	case 'q':
+	  syslog (LOG_INFO, "Quit here");
+	  return EXIT_SUCCESS;
+	  
+	case 'b':
+	  bf=1;
+	  syslog (LOG_INFO, "Will write to bf dada hdu");
+	  break;
+
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // args[] and threads[] hold 16 entries and cores[] lists 16 cores
+  if (nthreads < 1 || nthreads > 16) {
+    syslog(LOG_ERR,"number of threads must be between 1 and 16, got %d", nthreads);
+    return EXIT_FAILURE;
+  }
+
+  // Bind to cpu core
+  if (core >= 0)
+    {
+      if (dada_bind_thread_to_core(core) < 0)
+	syslog(LOG_ERR,"failed to bind to core %d", core);
+      else syslog(LOG_NOTICE,"bound to core %d", core);
+    }
+
+  // DADA stuff
+  syslog (LOG_INFO, "creating in and out hdus");
+  
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+
+  // -b: also connect the second output HDU, which the header code below uses
+  if (bf) {
+    hdu_out2  = dada_hdu_create (0);
+    dada_hdu_set_key (hdu_out2, out_key2);
+    if (dada_hdu_connect (hdu_out2) < 0 || dada_hdu_lock_write(hdu_out2) < 0) {
+      syslog (LOG_ERR,"could not connect or lock to output buffer 2");
+      return EXIT_FAILURE;
+    }
+  }
+  
+  uint64_t header_size = 0;
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
+      return EXIT_FAILURE;
+    }
+
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in,0);
+      dsaX_dbgpu_cleanup (hdu_out,1);
+      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
+      return EXIT_FAILURE;
+    }
+
+  if (bf) {
+    header_out = ipcbuf_get_next_write (hdu_out2->header_block);
+    if (!header_out)
+      {
+	syslog(LOG_ERR, "could not get next header2 block [output]");
+	dsaX_dbgpu_cleanup (hdu_in,0);
+	dsaX_dbgpu_cleanup (hdu_out,1);
+	if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
+	return EXIT_FAILURE;
+      }
+    memcpy (header_out, header_in, header_size);
+    if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0)
+      {
+	syslog (LOG_ERR, "could not mark header block2 filled [output]");
+	dsaX_dbgpu_cleanup (hdu_in,0);
+	dsaX_dbgpu_cleanup (hdu_out,1);
+	if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+	return EXIT_FAILURE;
+      }
+  }
+
+  // record STATE info
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
+  
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
+  uint64_t  bytes_read = 0;
+  char * block, * output_buffer;
+  output_buffer = (char *)malloc(sizeof(char)*block_out);
+  if (!output_buffer) {
+    syslog (LOG_ERR, "could not allocate output buffer");
+    dsaX_dbgpu_cleanup (hdu_in,0);
+    dsaX_dbgpu_cleanup (hdu_out,1);
+    if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
+    return EXIT_FAILURE;
+  }
+  memset(output_buffer,1,block_out);
+  uint64_t block_id;
+
+  // set up
+  int observation_complete=0;
+  int blocks = 0;
+  int started = 0;
+
+  syslog(LOG_INFO, "starting observation");
+
+  while (!observation_complete) {
+    // open block
+    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+
+    if (started==0) {
+      syslog(LOG_INFO,"now in RUN state");
+      started=1;
+    }
+
+    // DO STUFF
+    // sort out write
+    hdu_out->data_block->curbuf = ipcbuf_get_next_write ((ipcbuf_t*)hdu_out->data_block);
+    hdu_out->data_block->marked_filled = 0;      
+    
+    // set up data structure
+    for (int i=0; i<nthreads; i++) {
+      args[i].in = output_buffer;
+      args[i].n_threads = nthreads;
+      args[i].thread_id = i;
+      args[i].out = hdu_out->data_block;
+    }
+
+    if (DEBUG) syslog(LOG_DEBUG,"creating %d threads",nthreads);
+    
+    for(int i=0; i<nthreads; i++){
+      if (pthread_create(&threads[i], &attr, &massage, (void *)(&args[i]))) {
+ 	syslog(LOG_ERR,"Failed to create massage thread %d\n", i);
+      }
+    }
+
+    if (DEBUG) syslog(LOG_DEBUG,"threads kinda running");
+    
+    for(int i=0; i<nthreads; i++){
+      pthread_join(threads[i], &result);
+      if (DEBUG) syslog(LOG_DEBUG,"joined thread %d",i);
+    }
+    
+    // finish write
+    ipcbuf_mark_filled ((ipcbuf_t*)hdu_out->data_block, block_out);
+    ipcio_check_pending_sod (hdu_out->data_block);
+    hdu_out->data_block->marked_filled = 1;      
+    
+    if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks);      
+    blocks++;
+    
+    if (bytes_read < block_size)
+      observation_complete = 1;
+
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+
+  }
+
+  // attr is reused by pthread_create on every pass of the loop: destroy it last
+  pthread_attr_destroy(&attr);
+  free(output_buffer);
+
+  dsaX_dbgpu_cleanup (hdu_in,0);
+  dsaX_dbgpu_cleanup (hdu_out,1);
+  if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);	  
+  return EXIT_SUCCESS;
+}
+
+

From 5e0ea2657f33084cd2910db09802af99c81bdc9d Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Sat, 15 Jun 2024 22:42:49 -0700
Subject: [PATCH 08/30] Move headers

---
 {src => include}/dsaX_capture.h            | 0
 {src => include}/dsaX_capture_manythread.h | 0
 {src => include}/dsaX_capture_pcap.h       | 0
 {src => include}/dsaX_def.h                | 0
 4 files changed, 0 insertions(+), 0 deletions(-)
 rename {src => include}/dsaX_capture.h (100%)
 rename {src => include}/dsaX_capture_manythread.h (100%)
 rename {src => include}/dsaX_capture_pcap.h (100%)
 rename {src => include}/dsaX_def.h (100%)

diff --git a/src/dsaX_capture.h b/include/dsaX_capture.h
similarity index 100%
rename from src/dsaX_capture.h
rename to include/dsaX_capture.h
diff --git a/src/dsaX_capture_manythread.h b/include/dsaX_capture_manythread.h
similarity index 100%
rename from src/dsaX_capture_manythread.h
rename to include/dsaX_capture_manythread.h
diff --git a/src/dsaX_capture_pcap.h b/include/dsaX_capture_pcap.h
similarity index 100%
rename from src/dsaX_capture_pcap.h
rename to include/dsaX_capture_pcap.h
diff --git a/src/dsaX_def.h b/include/dsaX_def.h
similarity index 100%
rename from src/dsaX_def.h
rename to include/dsaX_def.h

From 7aca2bcd08885a4485b09da53d1d063bcb038c9f Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Sat, 15 Jun 2024 22:44:27 -0700
Subject: [PATCH 09/30] Remove executables

---
 legacy/cuda_correlator        | Bin 34272 -> 0 bytes
 legacy/dsaX_beamformer_passon | Bin 178600 -> 0 bytes
 legacy/dsaX_wrangle           | Bin 99600 -> 0 bytes
 legacy/splice_offline_beams   | Bin 32432 -> 0 bytes
 4 files changed, 0 insertions(+), 0 deletions(-)
 delete mode 100755 legacy/cuda_correlator
 delete mode 100755 legacy/dsaX_beamformer_passon
 delete mode 100755 legacy/dsaX_wrangle
 delete mode 100755 legacy/splice_offline_beams

diff --git a/legacy/cuda_correlator b/legacy/cuda_correlator
deleted file mode 100755
index a8b94c759c2da5b87ab4c1a740138d0ad7d75073..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 34272
zcmeHw4SZC^x%cdDLN)<6A)p39IcUJBWk~{r04f`jz(xZ^Aeu^Xv&n9f)y;0Y9|+dk
zXuujNx9HmsZr|4O>+J`(Z{MqJZ7)(^h{dn^wzp_&FSoTVwY7KS$1P&(N3G`m|8r(`
z&u(_9?Y;N+{(kpmAm^FqKhN{b%yVYuoHMg$&PQ6?*Vt_~p@UsqDaaXfJQ6aGLY(-J
zWEIvZN`*(%i1}g)kYfCKWD2Pn)AUVcT2rTzbAT4%uhDJ-jdqWWXgbG)g{HcMq|7f9
zJSu9^&t7S=nxZBWpdQOQwobC3-7=!-eijJf7z-tXA&)E`^-@YNrSvqdQ03QD*N<lS
zw^^lcwtK*l_mGGddCX9qZO03V0OR{sngaV)dSpb?fC&ptwca(*V>#bzs8E6%Repv1
z;!)+-RG)`M;ZRS*l11Ub!f+^(+`e#oW5dFRCGL3CT`%iR`;=X~zEkF;{b+z=nP=jU
zE@ygK{jyj8H2&n#uXg(Gf9=_O=Z@aA^^Z)u41fB(-ThddO%%6JCAJWMb@*G|`s~Z!
z`SrcGj(ziu%7c4ebv2&aJ?CFO*fr<HAN|f%|L7Z)-}u5Cm6x<W+YZr!p&kXR%#%95
z0G=#_qoDch+*3gQZwtu(Gs=?BPO<<#r+}T?3gF)=P@eGu_CH+!FDhVvpn&}^6tJ_Z
zfc)JB@Wul9_7%XdD`4jf1>`w=82HN#tp)51Lona@dai){PyzgJ3*i4$pq$?<fPb_A
z?kHe?V*z}90Xq*DsPC->?970?fxp}k29VEhPZhAU2=Yd*5%3!e;Qvs-&aZ%1h}ok1
zn(i!XznAzNv3FxOeiZn0T!juOc|G2COaDw2C(AuqcD|zIwS9d(NGbao*GgV#r(Aq(
zZnmAPm7Rxk^4+iSF@<aUFG)K>3|D5`5g4a?lL3D$!Lhln&Rr{fZJXEm0>N0YKNL>{
zW1H7Choh0;W`9pOD15&D!Dz%6Pe8%v6TYjq`8F!K=CD5=55`UPyvodHO)MA`jI8sA
z!_i(CB%^JtNk)1Tp=iXE2oClR?NAb(5p8mfKe0L#@yB#ZM|7wewwj}nzEFQM=9l`}
zX0P?fax>CKS|fo%NwN^_eiUio>R>Dq4AWQJ`-hV7lpt*$2=?BzF&IyV6T;UQii9M(
z(H{v!2b-gVL*d|d;RBqI_=druu3)4;6bVXJI)YZ(B7LU0jlpOPZeQn5#6sJpOq<r-
zbaUvITf)J%$WYQW)(od<YbcdEd!iREy~)=fOe8{sK^8Zj2=rcd8BcA4@Lj)tQ=%~x
zjr)51@!%2~Z>ekDmJe?x&X%Dxal(giE_}TMH^Ey&u}~z@Cz*H*wH1BQp<qNrhY}$$
zC?XmGn+S>m1z(>(6sC{2`+fdIa68<A0zw#ApT$%k3Lg)Ocp{dFCc|RTY#)&G$NGW7
zGcnP3b1al#D~wwdA;L7Dj|EmubI>1(h(5M8UohZL_>tNZkDG~rP|UUMZL6ERx_pb>
zOU$^h-d*ornvQ2sN+8(rk9QsXE0X_+nK1{bL#F7vPmMU_-JuXAN)O#r%7Or~sehSR
z2@?>Q@*Mgs?~@GtB`&!aUAaWOMFR*-KWa%&o);?RooF$Or_zy})<3cEXGkf6mKkiw
zeKu_A`@ET<&{-_|bVw01?Kz}9@`+irZx^$b{WmvsOGCV;tdTMEpvN!XgL6#Qp{k=>
zMp-{y1_Fi!S5=d8H5OcDDe+nhuE$y?G+J=${lCY8Tkn6p7F_$Bb~-G$9y^J5S#UTa
z9l9-eSsE2$z=BV;;6oOCng!o(!EqweVc3GZ(x?!3S@3cTzSn~5bIas?7JRxze!m5;
zwBQFU_zVmFfCcA0x(){|_}Ll+9kbx)Snww;c$Eb|X2EA!@Z%QzJPUrpg3q?#DGRQj
z5t)3_f}d}Zf760rV8MmDj<Su+wct(*ZdmXN3qH?+S6T22Ex2L9FS6h@7QEVm*IMue
z7QE4dUu?lW7W`5R?zP}@cvETKk(Zs`-No;$GK6>6SfV(!8@<vy@~HEuDKN406p|+9
z{~pJ3+>=bqCqKZ%@e>mW^Is&-E<S!t^3RiJ*B&2}{7=cVOOHPw`De(pE06D&{8QxF
zg~#_w{=4Mab;pM#e~3I!$M}%szd@c|b-Y{hUnS2jI^H4qFOX-~9QR26GvwJN$7>~j
zFL`#waYOQVlV=wkuaNv5<k|Jch2(D`&n`E9@?8M)W8~S@#!pE8M)K@p<Hsc5L%x#y
znB=b~KZE=OlHW+4U1@y3<gX&nE;PPZ@~g?S>x>Ueeg*k+$qz}so;<tIc(>#)CC{!i
z-XZx5$+OFhdnA7zd3KfYTFF<EXBQbaBwt3JU1PjL@^<nUkQb8w<CWmq6~<4#!}^~h
zZ;(GB`4`Ev>5m_i{PX15<j2P(|5Ng8>f;Yc{u%OY;^X@z{}lQ8<o8PcyX4u_$A=}~
zbA=)L_JWt8>AI$Cn~t!#HFY*^Zu&oLXq}rz-|>#V*7UH15A*1~{bfEG2E4nA=JRsp
z9o<=71E6l~iI!?P4ud)=`9zWIZsl`#f)V1-!)WB*(O-=IE>(x@N8X$$-`NSdkyP`@
zn~TdoT8;Qb^<#|-Jm?NBFuePUQ53lDnY!0g1{6kN{-r~GZfy_gA)PLe-Zc8eOaHTc
zF7>X_c<N8YAA4!U#FuOQn8Y6)AAs8}_Gany2arBmR6Xv^PJasyljYfq^j%|3qmP$=
zd_F|HyU)4}Nr%{$q?3;P2z5=}g0p;(o)|w|B<u5#*6)z|sFAb$<G+Cv3|v72Y%HdM
zAJD+^w1E%9K>6HZB(r|gA?w{e_0R+JJf?WvSnB6+fjoPuX2e=XCsO|lnRxd+Tzw2m
zDL0tipRN8e;w`%$u6_>m@RQXZFdvfwExUJCp8zOL)Qye)v}N?ImeKa=s#M@@7_OL)
z;#>v8UdWw<-0mgSw9VGzePXAih|2grMiAIC_LD+|cPiu$fKT<n;^<?9r8-JL74Pl`
zrDgBv6JBK8=!JVKQom&-b|<PUyeKurQ`HiA&@{d0)@rZU_Ihd&OnP^JLgn|Mlza$z
z!m5;bCbbn3ExS>nJs+|?{ZecB{oiXDeX^<i{wL)B<K8`&{~Y2XbvDiK8e{HN<vU{-
zUYkb#WGmlUjo1-kctq2l8zxe#D77b1-C?@?B#gK9KCW6kbq%I=%hcAoiPR4vkUoEj
zYH!Q(w_8S^*B)#tZ+$lPFhq8ZjXqYsYZJO&)5sIH*5&UepQfE5Gncy|;oaR*J><Qt
zrMkPX{C8vc`yT#&<_(RBr#`ci_`s>A@=O28>u9Ooe#%>Z>6kjN<@fKDC!N@Fx_&>M
z^T>&!Q{^`lFUKJ3Ex(_o-UXXACjkBmf6pQQebDm_2$bK|fpQ_YXI_JiV~6*lNyu^?
z<Vf*a`Ss77Dxdy=>RQMj!{6DzsSpCl=IWZ%ytgJMPL)?6diWT$5QUu7MFvkveMF9?
z-g*<`LQ%DCcTx3n_KjWthz_v(1J%ya<0C1Y0|(Bich7^>!^qHd$<2SHOLj_xweO%;
zbVJDX&LhtkQKrl5*jarSOi2HX9!Dd1{8ag+Pv&|GjxksV>9O9UaL{+HPD-JY<M{hI
ziv9!8^WVm$to*KSl=06x$$-3c(h=yuNi(Zc@4O*>G_yM6A|sEBegd(kk(X`Bt6R$N
zPgFabM*hu~T;+w6&|!X=`Y1f@MJun^y}5cC0Ct_K)FbfgOXp$WINoT8(MJvq;Uvgm
z2IIz;q{gj~dK*%@MW*gUKC<tmJ^_Kzlo!2}!+gNIc$ysd%XjUD3%$2L%B#VatxX?j
z+S+tOlkezmoH%-vCv4aMMK?L}5{ELE%trQO=CulWjsI_?(pBE4Q?g0DR^Ii@_!s3M
zbCw$=Jyeez%;B(+<IknAA#X({DSZO%B=t6myf<@w*mPugC6tmtZxj!mjh=g>{D<+S
zweEY~(WjaYV(6YYTzkF;L(@_3Xz^3a43T<)6(4!jc*uzhQES~8l^_gXHpv0y@SV8d
zOWpta#6;Ki57jEGkjs+UL7Cl<dH)R$U0p!t7RvNP#@E#KP@sU&E()!NP}BRHu7BwE
z0#Z9EHHT7Lx*oc(fY1nq-uMH`{m^$HB8&XX>s9qs-R_{wk066`A1`3%cFNoj8J7DG
z1%!qvbT@=#xhrv<Vn(@c`Us^0ltQ@|6%e|OLaQJIf43Kq`7mW>Lk8sz6_B}=GH?9e
zEcZv1koBygbca7gp=Tf@%l+xJlsfrl1rBeg)YmArWea-My-N*|dIjv3BL*7N_`MDc
z0egA>f8?W-wByVi>O$-=*(1Z1XmBHsI>%pxN~XQmz0x-N^bKBI6-It@vSV}G6JtZ@
zINm275t2NyA8M&{Ve0pN^8V;xKa6#H7j}bqN8WUKM_<@B`yhIZeB7A6^#~ItPC$4k
zL9z8HpE5G}9cFr`_x6921|E6W0R!IAlio*CtGu?yy-&ZJI6G}^s<wvvsI5oM{D-f&
zXNiI7jp?0GOhwR@wmx2b8z9?jGN&Uauz=?sStXLiFZ~bbrH#Exg^6QZ$8Uj7raW+D
z8??uN^BOOesQJjt1KvID)lOakaXGye_W+~M<L0sg1C9WlNou(d#mFviLaS7|p(t-G
zDbL|cq`m~F<8f>jFQi=<&5Gn`mb?mQWI&c%-fdv`tU4k`1`MV~3uJ4&qls$I=w{q+
z;O?Zh1=sk-2@JseYezA=Q{OqoHu}<+Q!&(_Z|^4W=(~sLdfYMZB^mu!OJU6`5XRrL
zq2+yOQBN?Q@FfNh4MFnAlTzH-H`?J$?Se7y=&$5m(a}Rp(IfXw&^(;F7@oqwe*wi%
zuO<kl9<inrsg#SE@)hQW`y^ieQ2Ci=<*&ss&LVDs18~Q5#n<aG_@0DinN{izSitRi
z;$kFy3P~L7{^ih%>GU3zULpt7mp(u8!->>kWRwcwkZynS`C@c}4;Srw=Kq7+K;zr9
z@G$76w$azyMxSlJ<OTW6{Kz|w)RB{{&MOI{?q|CFwvWEpKKeRtY?~%#KIa`dYV$6C
zDfuEhlI7}S8UJ59nrU0IJ?$xey3UZDW2Sd^McwP;H@<3Zf4s3DzYY+c&u4d7K8m;v
zb9K!tdRGKi^sN|(u874~Y+E6sNz9ZQz0qVOVGJhY38N=y3`OIiL}**EOmG%G6zMmb
zJ35W7g{|gPwFcK(LChpC?oIa8)wK=|C3cvT<89koTer!{<c7pR48|McJFt8(xVU#{
zXz`Fg*2;NimDI>Ne@y)M<R;YB4~6}`!GUNP)4lO}U!7GeH?jV7iHlE{xa4$+OHY^B
zaJs~0r%P-+UE=b>iHm)8wbs<d@!$uod_y3(Ez}!q#W`o_jQjh8D~$PZV+-fwqmlU5
zvJGZrMVVnN>^0ifZ*E=dYi??9-((>01jmg;)Q~G0nPmlI2u%g^{K0;6NyXSwx0ME(
zwSf(t>o?QD`s82_M5BFHX=HDT2aJ~1Yu9(KqofRMNM@0lX)+!(w$w_)ea4!$^-b-*
zP48ddyrBcKn|4Hc4XiAv{QIIY1B)#Fu+fVmYx_|nEpOV-qebzaWMZ<msbM|lXm9g}
zlfk&*j|J(x0-@HZiA0S!N_$CRp*o|65(bt_j6ia5=rp4BN^~HK9tt06Z(*q=-WQCW
zMsu;!yh$!sp`#CCF-9(k#i_M0mRke8?q<lm;HPLWmS_UTU~n)R+hJ_!^M~U>o}I~|
zCYl%s#%>Pb1o9*{ceXScxdvpWabr_!Ym3Ezxj<$_%yttGB0tqwwizF)y|wPv`db%&
z*p$CE80+Eqw8IDow*|vSO(58l>^Gv3@QzD#yBV;UmeEZE(VLCLKq!uC#MvD<`PUqc
z#e!k@0t4Lqq=D5b**T1xLx}+by#xLTmiTb4ae}PPK8!UxDPqp<Z^GIUZS;k<qufJL
zh#FjA;)z1jcFRtP16HKaUX??rfUy@R1MRR6=g?qdW{1lx3YzBSvY%yf&xi#RNvu5v
z4AWq5G!QiA2g*ci*gq5p%!NK<zPq8%7-TO__`~nh>>}Au{M*ohOdqgpn+LGU7)*qa
zW~{!>H|XCE@x^_{I`1v-gVSQsWd8umDZ#ZpLvdBE%;2rYb&NX+EbP_QVmwL)BZ-hd
zYz|Gz12a7`;{E@Wj?FOZzh6=9hv6Jv<9{g~C}2k@yrV#Rt-^V7hIhJ~<Vo^;+;g*-
zFHiNFP&mjzH)3$v77Z*EG327FZzzeeAkLP{CeiUq=OLS@5GV1}z+e|Pn+Nm(#S>#F
z!JWv}W>g-hTjJK}N6g<DE-q1^T^trsi-1$veUi7s2v-+hEj*$TtC$|FS^{*(2e@?X
z??Ft{9fKw>sV_bd32Ybc$ToL~OYV$dNO1ejUmX0$4VVQhaI6Q;HR0Tl_Kn@{tSDP_
zA<~_uj8oAHMXMAw6s=LTR?$XD;R!`eeVil3oBB3k>f3~=Zxg1zO_=(2rEeFezFq0t
zg{f~>`gWynSNe9PpIg7bl;`QqX+qM3M;U5TR1?Zu0~dDk>EEt|{+vD+X5fQK;Yo$m
zDoPpVEu9NmrEs2Slz|UKS>a8^9m_&9`q~Z`x{C3q+k1|<IG_McwI0htz4scddd%~^
z2Cb+2e;1|#8Fc?YU|qZh{-4$Vg|@4%N7nv7?XT<q_$^ApeVRuZJyQHRnfL(i4+DQr
zsy`g<@rUK8(~G<Egg=s~J*|$s)$#Sk{P-o09H_E@|LEX#x_bXVam;(a!I&g@<`L6;
zTK+G8#AJ_nPmR~NX=CeJ;m&HYyw+v`IZap&BLxrZkm3-k^IN1iKj+|xn-h+Lw*l8<
zBF9IDSMWz%|At57T>B-ic}?djTn|4Qzf$4)ensP36s}98@$CxN*CvgBQsMfyJsSUp
z!u1`4#($`AJq~O9WrfR1$~u~{Qe3E5Fa291Ex$nFdhFMDi^8?1HO}`GXa__FjrRj^
zq`j#UK|UpMUHT0Epv1M;GB~fBl@&Ty+?%Q-E_T#d;^c83bt`H4wA+V=YlL&+_Td_*
zjGBT>)7NRGnkM?fT*RWu;+aNtNsAUa#n~#U=pr*t<)TVUe7YsxVE(4AXjzfGUv<o$
z5Jhu}aA$0Ws}&XhP44f!sy=K7^wXK}-@^m;ThOD~{dhljtX!<yq3bp55HFUCwcs(k
zjWFU6W96bMoesJG1YYO%*HVs0dTh1Y@F?b8D~PxrzckG8I-fqrc?SN@-x_lD3q3bD
z)xe_P|DqZG*h`P!`nln0V7WG#UzHl5^|(chAy@Vi6V4SAcPaf^Gd*2LJ>Dr;Y$~9C
zQ0dRH=(CyR>i-THd^jop<4XUS($_@SU(;V8L0^-#4Cw)Nsi;sO&k^(1dKy0fj6TUZ
zP2!-^@3uHBU-^0cp@=FjVrCdtg&I~o^T)VXGz@ADp98}(O=?5xlBOP$(khnXNR0W)
zdlh}7k(M+gtRI7>%!`3`sHf{m%o@DPA(~qCWL;JHtUA`XE_X)HX7fToE=wxv0kz>t
zFMX{8Ki$B=687deW=5G!O!(cHoQz|U!dT|6D|7Rg{=tM1AM`C-TITi#0x`_r`r@Hm
zf=0uVva&L_`3z;a{ozo5#HcgeJ&nesr-4EflbB;(*(_#$Tt6)=5tW}$FGSUgb0+M$
z3ruwbqH<b;m|Nc<E}XlVhb25L<)MLxWjr+Uu)IMS8@N=nOjMSC&3<l=IceW1PTI=c
z+d}aWUY;@Bm?n+wIM;E~Rbch3rmXCevJc(v*pC_7m1Q4BmBrVad)DwASG~_;@h1E|
z+p#hLgyYusA5Tm?0P6hbi3u(vAOHEp!~kg52`oqA{NDvSjP$#HH8F7<bRXzR(5~kv
zCc2S6@au_*yFibF9s^B*ra&1uRx#La*(hw=D{Ql;l{oj>N-BuUF#)zOl@jJ5zq}CD
zN`CA|C<|@7D%QBFt}35;vvXLiI&bBri>oiB1?sop?*Q!TdsgDq{{a5>13yfl#WiG)
zmbsj*Q=_i!Ev{kP)voPLuAz1$?ZjW|*@=nA*uYo2Dn4#s?W+14hu3B7DO&BS`B?E<
zSM8lsnp}+|C0(wQc86<9nX9qMRompMS?w|)3!T+2XA^zPpB(-ee*@1=Ox(tDH@PZC
z?Cq|qwUmT>ZNIDHDykUQxvG%3R>rT-j>DjdZ;{5+ajSegGo_u)f4Q#Jjx~OoeAl^T
zm!`iTL|bwGVq#(e;_z$5RwU8i-v@s3mlG2upk}={xc1xZe^=(JXmUAQ=pK2levkCy
zI4?m08GpB?{q3c{z4Z4wmnW9b*YdnyPXD59lIXzl@!8`|*jLv$opgqt@xU1mobkXJ
z51jG984sNCz!?vm@xU1mobkXJ51jG984oBAoSfs4v7uQeJvrN?#}qCTao{69LoQvY
z0F86c39B6UsIc<fQ-VRy-RS*q^t7hl$7Yr(mfmNLFC;VQ`EdP?|8cyI!Jy}#^*rPs
z-kpdN-)Hxbq}O>*l$gAn+XSAno4lUpyjaac%E=3)>-=)pveMI)9o>-1%qt&I37nT;
z&?}jIiI%}T%_FJCs}!%xT`N;FbKN?Byb31kvwf<mkA)V=7b$tHy-41v(iQw)PV_!@
zxfv~}@JOgZ(KU)*t7xC1+Z4T1(N8J*WktWO=;Mn1MA6?W`nsZ1rknYlqv!%f8x&om
z=(URWDY{M3I~DzuqF+|@+loG}=uZ^=t)j0hI;B$Oujm3r8x&om=(URWDY{M3I~A2q
zvmDkoH?J^iI(zU+M$%a7UgEA@*pQT5-EDOZ?%E~py827Z1f#yTuA#P}cDZ;T44D1Q
zY*r`n{uYCrsFShmLl;`3*^r#JDkx6lVqX$6`4sIo=ckZtx0iegqV}Q^{LWL@oh9EQ
zu}#@0(~4I^+fh`EpFbn5_&k6`zlM0x$PGv+`Zf8Tj6(5xkYeW)=iOv&Q$9}vMMeC<
zh22?X04(C9ZSirMEV>MbDYf6ivE*VLj+ea39r&CQICUKm;e4BkZ*Y>=={(N(n~YaD
z%h5GU-eSDU`3kI;{E>0P`4!5%&3KLTFpQP_iSb(JcZt8lc%yR~@pl>blrE(mVI$)$
zeTfM+$#j%1rG{NHU8O%HQ)Igl0^OzD&aGsMjk~uElzyL#)5g*cmD*@+nq(5C95qT@
zHcpgoFI~nmR@#<;87^Hwu^F~DFn5$b!<3n}ZLl*^dMTMI+nr$UD&=rdGS4;yvbU6D
zZ^<RLJHYHK9c8AMN@joQUNV<S=0NGE$-GZ850u_V#%=pNbPkr5GSfw}Y-6Q=qP05P
z#~|>iC>v$cr%IjxSXJ^_R?^8W+)6&jsBjKa_&!$G>1<~F3#@I0nBrUxY>GTzQ+dK!
z>}elDgeXs?L_e$189=`P<}BGboW*Cu7MjchATDlI=a@^oj-NxM9J@Y&XS9MZUpwYL
zkK=Sv$rDvci!N@yXup@d!-eu!)Y96$Qsav<N)6$<gJNMR#&d0og)?GdQ_Rc!eKP;8
z6x)G=!`I8G%-?6`KgIb~7Jwo%P9kPIiz>N2%l6qvi?O<K5p3GdV%7|258_~}Aj>Xr
zPG3OSrqj?T!JS*n#7e3;?m2N=6>Ic+)zOh6Y%|M?gqU>+;9r29bpayh!u)x&FYqv3
zyo_t$Y+I-OW*j#=UbA)1wq0X)BD1Q0g#)w{Wo_pyrztxutb>JFZy-I1h^XRrkSmov
zHagp^yC}(3A`S$z9z*0i%(PPpm13x2rg3I!tCLwiIoq}orNBj;?mnO-xupq%ZQe69
z*@2{2Vd?x<M9w>`gsP-D(@?2{FZY>2S~{wP^u2>^;e~LPv~)hoWm@`0UQ2upkbxzO
zgI=I3c@Z*P0V}M7^GR_*k5XR*+^29^w^9dRVw7QSR0*ww&;_?zH0~*+F>2K~rI<BP
z|F+U7oe*>1Q20jRX21C{Qe|%%5yso7;*<mIg2o+i*_6MRQ5Re|<*SU&qn2|y&bKIa
zj9t?W3XliqTDH(qhu%AIUNOX-?Iw6YOMVrWN~fCO*EINJNS2<%A*<B!MGeL&S#^&D
zA5@_1<|Al<vbY<>yj~`s7Y9=6@NDb`htj-&L+NZ2JfSo%j06Tn>G@{H`*j}Yql%?+
zFer6ApuvZ!KF^dqsKATDknF)<nP9AX+I?mEHPs<;(e)^52mZ<gWAmFK!9Pdmf=#f+
zzp2mI7Pvt9H>JWh|KpJ3-;`Gta7~qeQ|H<*z6M(SLx;7^cg>jUaLsm2w^f!@oQtug
zT1=gKB_J49QCz2s&cUd(V(QANZ8LDSD~AY-l90#H?{H0>%48c@=UHjlMu?RvwzzEu
z*)nYjMX}l1W>90=Jkd5|5faZ@fC!53wym0F=6CU|wi(9MC9^12UMw6=U5E-W=@L!n
zrKNJ2RGyKo%zAmMKD}C6n3++sqpoK!N!uWcrlIYVJ=c;7`&H-L>Sk=Tb)MTb_5HG*
zvx-IC3<{n%dWQ|3tV0oJ-+c#OUah)kH5BGhf!S+?^QnMNro~yOxnP8;Yh?0UHS2Ob
zW6ZMxc38YHQ@~ZT2wdbA6pgq{S442qsvD3$T1R$@5DT&Zbj6D^0G!7W(=JZ2P1xRc
zorR+f2%o5Ii<0r!qEMtaoD2jPVJl=`!xFVGa(^Vbs3(LS6Qu}tFkIB%+dElgsn#~-
zlqrEd7lRROJsA$h-Mz?*yC>phv&HTKW{XV|L(zIOp4c%284U-5eTbtkOA$oTl96{P
zYDUfd8O^wS?FUhW%X^ZcaAIL7a;2G704k;sny6i0@f>H_itLUP;;JH>WB*6GWIQ3`
zm&>@*?O4zSe&<Ji1O89Kj3<g5j#5T%?qlroBHO#pnw`#?5pau~KS6@W*@gI+ly7(5
z=SUuMRNP_5yFWI*=!zX3LAX_<xf!JouMqp9eSPs@LIk2oxp!&X2H%>t_Er%H#p<w4
z=>}gMFUQHvAmankSc0$p3I5u|m(aUUlB4GYUF?hSr}%h}kBnF}5FHFIGH=xv4aK52
zE(*l`b#=81;{(BPIL(Cmk=DB?9>Z3r$WgsYmB4nDePMrpTtwn{1y3g7%|N*^r@%{j
zF}%Uaom4mYVnKgE7BR%VKjo>mob-&y5KQ)kqkh!D+$s}$w)PwHAgk3I3&gQ^C-(=%
zUYG&Aw=eJlcFfls4JQX9!iU$uh1!jidu8%Oha$f0jW*@poxV5<D)6RYFg{=@37VhU
zlT)aTJd=?)ex-vq$na_<cju%FLO4&`MW$$J8J3aNHLEW0t`<%RGs148{s7Jv&t=0B
zoSBRlxr?drB_nwKG*Hj8FLwcj5qeT?@wvX!Xs)Zvd3rJy4knV3;1&JBNH7-a1?!LX
z4qRbwP;M;jHx_QFHx|lPG&qEa+{knx&SE0in~261qKm<`>}hycR`?=#i$pSVt6CJc
z-YoNClPodGQeif+><at(2CWrhndKRdL<P1v#e2`FAH0XHOlAF5?R&7XYI4YIE9}2W
zm2C+b590OEOq-Anio}yWJ=~<#w+%1Dp%tr6K|jj&Z@MvSD^{GW+}&F(EyB7LEW8l(
z53<dnU&x&04zvjl5HetVmcb@Eb@b9Sv{^^>gKLq%Z7QElvchJ?bt`HU9Xk488rsB1
zL<Vikrhn6D!aT-Q^fpCd=wm9nF9T^qKgd8TmL>~SOqbs#(p9-kCFQTmWW`QxLo3?E
zb?JOmL?^9L{1kEOZQMvr3b}H5Ole;RmGWssy3-<QcW1OV<z}wE_D?FIOr&quC!NqA
zDVcxE$m$#NNwRo0kfBu33Pooqsy(XfuHZD$_zH!rQdIjgEvO<_D|(Hhxem*fe-Au&
z_ieO$CP_MQ1>T$&cZft@yeK{2X*Gsjdeifq+4vNp=R>pc5|M8n(ScvE>iNfP`BI_h
zF|+Zq^n7PFK6&4BraLezTv6e%ipsbHFZb=O%!j+g{=E3){m^O1feDC%*n=lGI5104
zRg({|6x?kqH#o!$d|x9UJ`*=V`S7#T^Vn8f4snjq^Viw<xq{ma<pu|4qw~$TI>aoY
z_gl8=I0R<N(|t3Of}Nd3hk6{!p!Cn2N&DI}-677milyTYaX~t6jX1J)!p)>Ogppw^
zF^BosX+g}WLtHosBj6Ah6~L<};k?5x60c6=CQJih6O;YSbq6~h?mFgpvQ#*SKzwfw
z?pAp2dPxg#8~iz-{LFGNa9p(IIq$asr~Q*T_V1MPliSaifaj~%bHF+4mHT_(e+5eW
z+{%lA^=IHTE6X{g%Bj~OE(A__y)Gnw+Y0$6DWB;Vcn*@`T4jg($T5JG;dX^{?>GjH
ze@@}tFV2epqr$mGoE3jo;X8Bi*A&iu;H>h~@$kZWaql-PzDVKR@6C#@Q}}QWev`tv
zshd@v)2X!2ecY@#zu!Qdd$(C}PJ|KXer;C#Hwxz#ZC3mb3g@P5R-99^vYpKKNL*ju
zIR8QW+>4C?Zjs@A!1J|}<puC``^gy(X)m80&W_md>)d@;ddv*fk#isU`WKV)+2O1v
zCP*iZt3%Sx<Z+Lat$foWcl^9t;rhEwlw$atv@>};{3h^9QzYy71K|8_hn`^4w;4|>
zTu*M{G|BKU1?;?5055^TeEC)Y&-Z<ibAexo{#dK}?_rg1J@9<r7vY3$KKqwq*3ZZ@
z{;w|}f1R|Sxo(1$p-<v-gr51+co>m<`QB3izhBwWb9UNJmGoQY`YwN~kDL!Gc|DJ)
z<&PkeFW+Amz+a{v`)t7t46Wg<0`i#F5|wtB7*_WWmnfy##H|KYd?^OfO1vzm*R!=q
z8{~Pu^mHV$lVL*vJA(!AeFgAu6u^Heaa><i`_WH9F9K(I^jxnl&#MLOxY2$pc^y&L
zDgAU(G$p?u_<mYG|I`$~mleRjDD$0s-?$0#EdQ7~kNWy`J#n1h?ws@6SHS+wzzuaS
ztYmiq`TGjsPZq#`3Y_IUp~};tGTC3i&o30P^Hu?TCj3MDy1v>EHNZKZ<lgV?LjS6O
zeD3#(mP+}I9qo%I>3_{-=rslK-U9et1@O;E+=ba%k2D@sS$wsC{KEzC7YpFUa0vaQ
zmlm}D%YhpzFDtnWINLdQD`n8*{7Q*SDC=-t0eo8l`@0I@pD$qNn+hM!ectJfCE^Ke
z8iolrpYN*XjlTA_O`GxQC2aYiKD>ktIDPnbP9(_B?gXMf^DQ}F0Fza5pFg=>a38pE
z5c|5hmn~h+38_3n^7}<Te=O$Tfw?ohriQQQV7k+Xz3K+}r6mg|KWUUDniw3~-k0#z
zA}sSa*2)N(g$?rSPMBmgH(!g!+}!!GFIeZsu8^Y7pXdoiTIA<>u5YhzFhAOq`O=g5
znWxS~D9n8&+v{sh0ek?eISEBCX0`Z9Dx@soj8zEJm04etY6_c*bxToxf1*Whl(?Dm
zb?@b+!S+czEoL^fY|q`eGW{aqq?GKJ3foV+rzKKPv#BLgPqV8fQctt3B~nkbuO(7X
zv#}*oPqVWnQctt*2vSe8w`F_o=9ZIu-k#px(zmvK!)j~~>BD@cuMHE$ocFwbQ=+}m
z=L=yQK73XxYcT{{ncjq*z>+;^ylXZ#t!wqQu5aNd%Va0gaeQ9V*XmUQ-j<ER*Yf`L
zP3zj4AxE3(X3npKSpdwB#?XvZnKrChv#E8nZ*$Y?_EzSQ`8buwBfm3+Z%gqjQe4?g
zPwb+_>ThagOPQ-L8H``0;$DIBi(&e6R)u$9RTJTundNu1CY#}RzvN7<oICb$_FT=&
zXZId_G|l6|3J-g%<?C={&2P8mOHI!<s|5l|TR)+ePbd8mGz?7fs1F-4%5H7_##g2m
zbffx;SaxyKpM`6U1oGu#F0{zf%kRqNlPO@afN1)AW|`9QRGCIHqgc9N`OIXREEh1i
ziY33OmTl2oP|3#RZmpK@&t-2Wi9%%0;AfPVFe9#$`F!}YpXwCU&ick&z7PA!C6SDw
zOkMRZqWbH7`6|k9v1RL87PQQ7y=5n7eabF7$z0*e#?W?f9bv86FD)N?la4|w)L+XZ
zpZ%phTnuu*i^uQZDf{XZaaeqd=jO=H;p>ejGvD~j`Szc@8s+;wVLlIKeGM;@6&rkJ
z_3`xIW=t1IevFW9onuD+%%rSzmmEto8Jg!c`!kRHBBZ|E$od|n`9Z!M<;ni!cLtS^
z@+Y=i)Vnk4-5S&TH)1g`7#qS?X;^Y|H?3}4nDF;24$I|kE=2_vh62J(ZorRqNOxdI
z1oljdl~9?Y_7=!ud`ROyuvAd7p>RTAM@S5cZbaSvQAD@{ns7@mx?@rK0YU72#hVD+
zcW@6kvr}&h_7CD_Nw|Xp>IP{bfDANenl&#WrZpYEu@_<}f`2g7i}FPi%o+%9h1~d1
zp};m<gQjV7%Pc=O|Lw;Op4vgn+`v*L_4LGo+jI&pd)O|{Ou;2UN3klWqsmg&-R)n`
zaNd-2NBaI*uX8jiU`~!{%38xkU~o&H*4OJHeCM0NkjKnHKfh|E4iT%qUN=c7{c0_!
zXs&(>aD1I0s>{F6?vaxF6u{r*G2k&y2Hd7+h4tX3;!o?R>>hCZ-MJ*<O_d=Y>!hEw
zetKRNI3Cr~`g*<Q1S^H0f3I$GrVSk@y#asxt+>u#uj@EjD1;kLk}kiN)AS~!^S9@k
z*XuvqmHvLE&+@Y>y8H?joYy9<_4PVYm70LARf4(x)Aiq}^j9l6{rh>Z($~MYr#@!)
z()nxszX8TJ#qu%e_YF>#n>=-_L7k2?G>Vv2U$0M9RG1>ex?)O0!FaYd!zVzj`g$E}
zubLFe^;a(cxg7odl^$t$ztYD~zp{ePSA&0th&6vbAN94lri5NL)$+OZ|9hpctEtz|
z4l4cboC4>t;_Es3dY!GR#>`QE=}u=3nydeCj=o-x8(wKD>GP~j<mz*gp7qz|*Xwrs
zlzu=fDw<pWpXKPM*Y_?r75KYiYsk(21teltL-t|H>3Qj~b>#4qH<P4~x%#gng-w-X
z0E3?IKCbk0<#YLWpu_rWeZ8)D;yvoyq0GLc_4WE=O6gaq`s@Cs^VjWgUrzpdo$}2M
zNI=l?CYRwgJ{9>}%b#A)>@YR>V@+$&dYbZcDpq|x57wd9?{zt~4XvknzPUh5*I&=C
zbt`@TgxneoD}(r4m49sxuJxDZ;M$y~HA+86Ong%3pOc{VG|ZO>axtboru09ct|MAL
zSC&8C&&4P^@BOAmSx)=UmA$5b{$*QC{T(?9Ijq>K^mG09#SfVJpU@}9WYcBUcKCZ-
z9akcfG<KU@+n*^vf4iJd|MnhJf4@o6$6VRzNXaRpl<ne$5rONY){w)>ymN{u#cO&^
jtu78O2p8jzr&PCJJ+5$k&zJx32a$kqTq~IDq`Lk)d9f~T

diff --git a/legacy/dsaX_beamformer_passon b/legacy/dsaX_beamformer_passon
deleted file mode 100755
index b08ed99873c198055c7e078c5d1cf0100e9af070..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 178600
zcmd3Pd0<r4_5TDCh=@*5w5V93MjKoc6(v>bL^Aq<K_iQ@nm`DPvczNp!GZ=SQKn<s
zXl;uH3s!6WTCGd14G|%%iF@2ZP@+<O&rpfr5){qv^SR5LJK>F6`^PVuym!vM=iGD7
zJ@?#mml<vf7F^OJDXFJV{q*qt!k2{i>X89~k$-)@nUC}kF#iO6Lw$$f?_l34zFvS+
z1db&v3&cG;$qYaGnJPga8%)OU&>j*ov`0X^=;uI*_UWf$BV=QHv7dl?mCq!_P(OXX
zbWu)v%KE@C+28}irV6}%+KTB+Euf-fZPK+Wx>iM}pXo||{nX^tjlW}5`IsI7K!gK)
z`0~Wx*pqenQ~2-DG`_bUEeTtXzF6S(bCQx%KQ-Nzprf4M_a{wN9HrVzWS4PDe*M(_
zaMtWuQ_nm1tl87fm_2K5Wz`u~L(e<oymQa2n0MycLT~bC-AgYYD_SBO%9oB`%G@8n
z^Z}MvrUfTt&g(ocby3#bZ7-&5dgk5QJ6JXczohGp-&HBT^t7bDy&mkB?5pnKOJZ3$
zek<{tdHihyH@#ndT+5O^udFC}dh<Vaob!6qKfXBP`ExJp$k`hmc=o0*j(GgfUmQ`D
zf8CRVgRd8WB;n5ou<sEbr9}|L3%?OcdEs{^!0$`I|CI#zH3|4^O@MELalPdE96EX7
zm%+GR`1J|!_oGw2>Mu*c|EK6kuloN;z<+51y=Euie|ZA_1qt|PC8&R9g8uzy0(rs-
z`0q^6k9Gn+gA?@E>ID3Y6YyD*06#AQpMDAC{AmI{7bmFSkiZ`fB&eU9p#Izh?H-??
zU!F_g54{uUJ1hbJ)CBlF3FMrgfd9P-<asy&|KB9Q`xCVL>je0pB=F~>67WAff&5ts
z@XsaS^UnnB-kQJ;rzha^%LID;0({c(+x_RO1b+1dDtg74-U<AtC4s(wNFe9Q3Gg!#
z*x_#p+Fg-=&ru2Fc`t!p?<8o~pMZa50{n~w?M_XApOOIoRRVbyB#3W|63Fvv0)H5p
z0AHK{|6&4pu1wJG-URrQ66l+fzz<JNQ2$8uV>*7j|Ex~HXJ7(8S0|`{W&-?Q65z)t
z(D$MQ_Ph~-dyTKl67VTY;Q#j|(CeZE_0K}P>D`S6cyIzaM<u8~IzhV=65#y_@JvSg
z`HuIk8a`D>rSttfVTZ$g#YwpWe*xs<Pc8I10rekH^~XuH?_-7c<w-soe<a2W>z66~
z09F5X)ox}`)<-q*Gfa)I^pgV$BOgIkzgY3nawb7e%70-uKEF`)0}8L}Z-bod_m0eX
z{_7P!>(qF>59@-G%4t)|LtG1FXP=qnE6N`;vS?aqdFhN<6`|7dF(Zf1o;SC2%#^9K
zOMOK}Gv>^jTT~GOK~a&fXvF-YQL5hX*;6ViN-HF}Co$U@9V#!KGAFOJB2+$ap^unL
z%1cYd%cZ5Ef+-cDV0rnxa)B5*W%lfOCE`_*4F_$ALXb|kdr9TolF+Pqb49(p()qJW
zO2H{N6e^!JwGy>ix3IMIy4-HuQAyU8ZI#X`DO;#mkDaUKzhp}2;#qU2lq*rul*zk7
z$`;4SWjlOc<=l{BSU9h2IFuPaZ|?M2Gb+ocxbmX5&=>m6nma?Va~Twt&nqh}56z-Y
z<21f%N_jVoD`s%+G~Hxn#ms}-jA!JMiH4;$MlYOOGP8W%+*vocy&@#Me99b&D3}5(
z%)G3$d~WG%!*sb7Afz{yg=V6grqP|?OGTyS<#Xp1p&zBvvn$XQtX2`4R&v^DtaYBR
zXu_DQF3-+_a8*@Prp}t5oqayu=}$$Wg=M8hv*u2p7o=WQQ;JGp$D+!)rB!96C81Ji
zJK^%tp`o+pRTNE~Qc-$t{@huiyzJn7)SgjQRRjZ8%$qx9_N>stqWKV~dqZcV(TZ6f
zZB3XqyO2!S<WzY0Tr?ainF$V~LnMNd^A;4%E}c6gG?O~cD=)mPMAa8=3r8&~ttu%k
z6MZeD%s!_aE-`Bk3P>>g?BJ-&AUt)gs>;sJJ{ua$D#75QFXWeu4rOPT%$!nQ6e^!G
zD^!tR5*$4Re}b?Cb?(M3?`)wo<t?2%&6W1@%MPk}?t+<9LacnS<%SE(O&F7Z5Rp=C
zM&*?9X+`CwvrA!Y$>E2JFMEe=|4>SJ$VjwZHhW4*X>iUQ7<|r;6CnqDrj(VzPC>~;
zbhGGEFof<oZU@hsfgb7BztRX(O0KV*RW70tY(5=Ptb2d|Q2i)Yi3oAQQFG>Skj#<e
zqvHDV&~(BxO5h$MQ2UC;&RsAI9X+bFq7ow$g(Ye*!GX#U%Zf^7UgxWrIcG*Gqfi-q
zDl}c<<hyS=!pmIWJUA$V1_lblF6vKV^;s43N^)|FDk@5*%$@G5KyVDr1{!g{<T^4b
znm%RLY}O{e0`5=}A~Y@JgZv_D!;7a?mVt(|0wxPd5GoJNo907VXx?n!^wN3LeH_mS
z@<3rQp*kHHW2b7CA;x?q2L+xzZ-#G9>6{8^1seR%M@dB)X)6|1pjapgt)Usqpdair
zzhbUZyQpZoX^hacd6gk@WcZ(6GJ78UdX6g-2EC8;Gv@fF-cVXTj{=I2G?gKZ#+y<;
z1Fc}XEcc<!%30HV)1}d-FDRdd=?9K61vu6Coa=-%v}C#NBLId#U`bvmmpxKc#IB{~
zVZ=F8X3b@9%q#Oj*Ah%8gxJIMKi{PV`4<l_I{VDC&m7{rbolV1a}a?AZrnJ$pX(bw
zZd~rg`9;}ho@1iVJUiQ^JlB0wRlx0h7nf}mgB^GKdSZ8w|Md`mQ?O@9s2Be0t={-d
z6?j=k;#n&dzr9h?Q<ReC5Jl5NU?iVZUmsu(_4O58`l5t8lB_LTP5}?r=q30ON10Nw
z%Pnh3N!T`9_W2IOU(y_|%4L0Wm7FD<gc`lUB}wo}!jlm9NRxcak)w0JG#$T{$Id#$
z2VO5AAL9?p|7MgIp0e;zwz<tHXR4{`+s@dIl}|5F$)6-hKOeao%LDig4ZZ`-ruyCm
zJ#nu-aviw$_Weh}*LVH_wNreZ2LCG*eLCt(`Fa}i_42hS`B3cZKWo-uKG?Q5S|R@<
z1$?NlpMrlP;6A?N6ny9HzlWI)@tv$-$v;`}Cw=G`+M|ar3+4RjxM-?)(zYS-CjWFA
z_!$OX=f4m!uD;ISiRWIj{?U0l@$)p2PnAa#UuNKSo=*Jr27W+5%1JZozF={>frnY$
zpG*Uf4s?IA47|?ySvb_dvu*toFz~dI{;>?a>RM4$XyBEJ1%8}?M+dk+#Rk5Qi^chn
zfro>-KV=5KuZ#8hsti0F#{H=_@Q1rtpKqyw_Z#?n15ba^Kg$e!KaIq5m4P?cvkw?}
zuGRI=qXwR57W${bz#pZNcs3dMqYZqEf&YntZ#D3Gt;gc62L2eMew%@>Fz~j4Ki0r^
z82IB1e5ZleXKbQ4H|Dp;8}(BS{7(&hnt>l=;0GA^bOWDm;7>5{nFjtu1D|E!PcraB
z4Sa@y4;c894ZLOGPciU?2L5LTew=~NH1Nd+ez1X`Y2Z&a@MQ-6Gy`8{;7>R3)dv0y
z1HaV3pK0Lh4LtWs^v^N_pQVv_t}^i12L1s9f3|^t)WDx(;2R7)XKww|WZ;KrB%Un>
z{yYQUYT(Z|@LLW1Py^p);BySTZQy@y;5!Wb1qQy;zz;L<zKdi2&pl}UlWO2E(nvhh
z41B=A4>0h#20q=uUu@tr4g7EepJm|l4E#_7A2jd*1AmEuw+#HH2ENe1TLyldfzLPa
z#Rh(afuCvMFEj9E2Hre)tupW<jr!FF{&EAq)WBb1;Oh;1p@Cm!;IB0Bs|-B%3iZze
z27a_g;`yk7A7kJf47@zymW53Q{wkw>i-BKY;9Cv+)dqg6f!BM3EN(OK<Bj^ZfuCsL
zI}H3J1K(-jCmVR*@R<K!W8hN_e35}qGw{U*et?0WV&KyaylhDpW*YbsqkfiwpJw2P
z8u(HJA29IK4ZLOGXBzlI1Anc7A7|jNGw{U*o@dhfXQqL#(nvh5@a|MAno>9LM4wgL
z6iTsqpk{?PrEZj!x`y10qOOx`@Yg>mfH%TILZ__@KPTTnm?@;wD&TU$JqR}m_*%j}
z2|p^}sf3v-I;#XcfiP1<r(VFL2tx>;Q!U^TgqadLWdgpKa4O+q0sovZQ$VLsz-JR?
z>gNOmd@5n4d`^~tPbAD#&q){Xv4ol8IcWkuf-qA%$0y)EgqhMg9p3<Pau32x<(xJF
ze|a2Wrf^QHfcFw+>gF^F_#?th*_=lO{2pPZYR)PFzeSiSno}>}R|qpTbE*aWEMcZ(
zPMLr=5N0ao6brbKFjFw6P{5B9W{Twm1pFXjrdCdtfd5RGDV38h;6D;(D&?dJ_zuEM
zp&Xxpe?^$7lhg4v^}m^LI^i|}-$0nDlG7^Sa>6GPZW8ddgij*;sDP&uW=iC&67U4V
zOog0!0gobl3gK!2k08v{$0-x=#e_2n7Yq33gqi9%g#tdCFjE{SAmCF8GqrKD1biZ4
zrZi5vfR822RK`gY@DYTW!Z<zw_aV&G#pyU8`kydU7N<?XUmgpXsfyDo;Jt*IqBu<g
z{)jMB6X#I@zekuUiL*+;ZxLoH;?xWH6~atGoN56-OPHyLQzqaIgqd<U#R6_5%v8fE
z6!7DOnPNBr0Y6BXsfCjz;6D?-kZ`(y|45iAhLa}XI|wtiaC`#(6=9|nPRCcG{|N^O
zw+Z+L!b~BYRsok2zL;>6fUhMyobaOpo=TXhg0o7%69_X!aOwp-itr_bs|7rQFjE4j
zOu!ctwg?vs_~(R~0yu>NKAZ3e!T|xFN|;l=lO^C233IA<(gl1hVNUT*nt+cW%&Fb+
z3AhhoPU%j^m!kg(b1HY*1pMVOfH{RbtpeUlm{YgYB;bz-bINue74Un6IaNEW1pF3Z
zPSH-ifL|fZsoAL(@Uw(D^*Ut&-awdBu2U@FM#7wGok9UWPIv<0fPfz)%qiE&67Zi1
zbE<XH1^h?CoMN3c0pCHGQ>)_>@UIASN_9H+i~c8EOt?+JHxT9&>a+^DoG_<Or%Axq
z5-uV9sDP&u=2Ypd67U4VoFbii0goa)op7~)M-ZMtxJ<wo6Xpcu6btz0gl7>h6!6)E
zuO%E1@Z(3F=$l>-Sp1A$*MJ*~_GjqYYkjNV?V0PlR{Up0nuFQTSdnLQ8=z#@+EqmX
zpB33?MN;Sw_Ujl5m=)7kxAw{Pp(2S;Sw40un&RSs&;FaP*>Gs$nr%~1bBV5L^-Wwe
zP>7mW>zZZXQnPv#YM!BM9)M;E<-7tld+M4;Bqy%fG#WLx!B#@|)4ru<*_EjIh_1Oz
z$l0W<WJPx7u4aa?_95(2*$X?ny2edd{c~_*g<y6QEE%h_igm`K&iHFqk4{kMLDtDe
zoub@vtIJd+lM9%x{XbUfi%PlUb0@5>OHk_p*81QJ)S5VM^@9m2{gsv0qtdw5&8XrE
zJ3(p4=I&>myHN*n?@GYuFRZf=btrdIiZnoXx$k48F_egM|1?3Zds*vD)`Hv@B&hUf
zRyqWgVDE7W>fFOR|NdOaJ>RXPdQ5iC+P|^Rdenj3R&-d`(I@)s6Jgnj>#8pg_?!i=
z`;Z(~bAaVuoqJUy-*U(1j>(UFI(GD0=3bGn*ZmQ^v$tDIc2^;}n`T9mPv#_UMQSpb
z?PWLFw}YYmJQ`_y5LF}V*V1LOo7OQmv_D0mRktR?CU7ePQ(p8=0R(Gbxp6K_>U*M@
zckExHVq{x(Q{!eZ5iHh$y)V2m$$nJuxJ}eVg*W3W*ms~bdRs;ZnnR%zUs#F4oE4(M
zo9<Xg!GhWUd5V$3{s_XXV?1lF$=Hp@TaDtigReWp>!<3~_9GzWHsD%;uh}<u-t<{p
z0=<0G^CE#{L2pI!GScjJw6xEkcE;!rV8hj{yU*Y6^b8gbkW`?sl1bW}m%(|=*VwyP
zuCE~SL1d5p!lzwb;m)p*KWtA4cOJ7~S69a7wtl(3>Ha;hG*%s!>)Y#3OYwnq%rXr<
zz#=a)JHy)NpU^*Jx<B*Tz5ae<GE(hoIF1!9$Vh`da(kQBv`badfW}F>06j%R+apl4
zj(f^>I#gNbYhowoWfa=|1c*8W$JspqT9FM_BsnAeo*#0dNo)N_J*+R@3cuIWU(*vg
zE2W+AZy&_Zd=@Hng}+SN>mT2;*FRxDtTNPJ^AZwWRk%I;Wskl7Nr#<{`l`}4R2rA;
zuXzXQy{ffq^<s#|dPm55>%puI^@>v);6T2$)hLd<nY)@Z-rzSPd)F=^A}=Eq48EN1
zpF2AvwIDhWX6oNHJ<mTc<xnAB+1qSj$l(A+)+Ilc-3923G*R@;@mqI^=1(TP&;P;3
z>HhIsQ2r9=GhYffC*_Bm(tz7M-9M=X`1RBMWzT&)J=g!uR$w=Q;u$<&obI2{gy##O
zn)D1xGl_j3*k}Alug~=l4s_(@yi~q2sblabIdAxD8X!MR29nH=V8)YAgAi8q!VJ70
z|1-XqwC?qv+7#Xoi>4j++54jQ=P=wR{BDGSY*22-I}OwU1D*Say27N;H1f1c_M&ia
zxT%Nel>M;#sT&N^4wP<Dq@5tW@L!?RC7o!~iY7x0(a=^j^eW!jyRV?3mr=WaeeL$D
z3nDx1@MhdkYFaoTvaPW~d)Yd65`x7pI6ClkE<Q!#p9jA2PcD`oiP}#BYgfP|(I><h
z&5vxsgm3QxrXcc#J#jS~YJ33W!@qPr)>=gV{%L`1708Exw9l3dII{C2&17(|V9;eB
zD;W5fo;QHBUG`UNN$X!)MzXK|CE8^a$d7Cy*=#}fq5VFdtkxx({LtQ{$Z`PZM{i8C
z>Ym8pNU{)G_~eiM*^NNdHX%7_{1^1}*Sv?tB5T{ft`-sp4yPK=3+zp4_6cR8zQ3jx
zTrEUR!7=g<1zEedt1CZxQ<??45N)q7ml9neBv^!a^xiy0e#yb)Hwkjoa_N^SdOs>R
zwm^_roIy97wU5o+m}>tzN|A3$6&Y4d#wzM{gTO695QuDMgqUg1Bqm~8=;zMvtahPk
zE3(y!>=7+k>+RmOQmV3&RrgQ^uhNrIrd78hqY!{S=o2}Kq@&ha>YE?gY=5~&A~OZD
z3Ds<c$r2d2ko|_j3=o)PU^XetP=WDf`RvCPCg5UL5feQ(!-}rSATN;vHRABoXQx1t
zbv|D)yaS4?<3TM&EwgtE5Qm}!Q|-3_7S!FDQKodMDyX|HqZJXmpspsP15XQZ8VU;{
zi!!nbBCR7MUlc^9Wel~#P3Z*)>Vc7w=gBI4WMp%G<i&!>n+1_E8O3&g2v89DaIJ6E
zPki~|O{w<2kBQi`)&~pZhc~C%?+L^gaLEBcY)Q3W27>nXL0gM8ngn8}LTpU6A0Z-I
z*(t_4rsO6Fh}m#}&mMvWki80R+Lz(a+E?JXaKA1ro2+PsNBtDmFJt}VjQTI3enAxC
z=Z80>7UYb{NcY#Cia8kbQwKQ4d~Iapn}R5;P!Oe6qA)}Df9%IPC>{I(^JW%2PS`9^
z5V<d782}j0THn@#`N&d$!aHI3H1r#XAsx%Ua5>u#WgrKdlu?Mm3~pJRAqQA!D6Jr}
z*NVIoZc3uP?H=h=ZcPTAP4zr^Aw@|Qez7Kl9;Zs0D9z{g$1otgconB)6yo(hyyA_y
zJfM0OAjaVJPPd?mgPJ64jWuMhO)AuTK-I(RGWBYo51ZCDRi050t;wLfTF86o$YexE
z<#48OT73JIQxL+2`3^hLSKAdjx-liyG&TnIRnRE>b&|g(4CfYU0J_A|we0hS!Wi1w
zO{=RZR$j)?bUgQ2e`AM~St-5&17Mn(jA~%5=nC;Hi2OU4{rQOKnse#!mq%;P<tR`F
zgWFn>kL{<S$p}>f4SlGX4i-pT0VT&&?|?Qw1av`-qE7)6CE_8f$shvzv0{E?J%>y&
z>ZQh5bSe^iKeR#Au00J1w$0EoB6{b!S$NV_*cBt9w+X35VhR|n*fVSrue$H>s{4e>
zKZ;@t<9ZdJkvlUU5YO8r3}0*$<+o+n0uPR&L3A--uwuJ-<E!=>yz2gE1IrYFXbA8s
z!1f5CPGvDWoif4)DAYQx{pc8En+HcBBP;SmMm3%gZm9ro%P3<pcva!q^@Na3Js7G)
z3j^A_x~?hu-rZFEzv-q?7)$J?TVNBmod#vuP3$hSn`F1ikpp{ucQ<VXLwK#lE^5L%
zeMpR+$ZZ)7grPUP$mkN-=K#G8oy!_V50OcWViMpJRdv6Td4e7kBp_`iiFOUuqijXs
z0Y9L6oH7d+Q?QT*<;;0)>pvnK9OMDqQSkDBCg}k!$^#x%9?+231CDmOx+aVt5q(1R
zf*#6N-BUjM!>-k!Q`5x9ki<fOO||MU_uE+UEx})|hkB{?rBzp7jq20=gKF?#K0OYT
zQB$&&(^|RD%4w;5$EsV(5Q#*xmlfWEg6Aq5IDc6^XEPO8#!!<V{svRWJ=m@&=y{)r
z_u<*-Lw{{GP>q9h;8Kw+d%Iov5d}l;Xh$Z2s^*r#Z4Lf+jz`!du-rCpF2p#RKG
zH}nYo9A!wvnc=3L+_;tNk84sM6we7wECbwzvXuJ;j+ya<Rwyv$Zs@i#$y)Li2X<vx
zng&s706-N8nT-4kHnK1;&dmNCQ@GvmVON*)G>%NLR6o$02RmJ%3$s7BSD_k3Q!Ac~
z3}*wvXX7VWJ;tU^MDwNSEv_x8m9vsn-gK+LUzkvrqKXwg?3<JmeT_Gxo}{-9$@SS2
zc6W6NgD{SBX+O<6XP{W?SB|lkY~t#F;^f>(xs!9R$t~L0cxdnV)^nd`vvx8Xa86>|
zgAb}ef3e?(GENUvun$4i)i;8Q4(M}!j+6=Ov`HDrf2L&=V+sjo?<$C_$!J9>mR1|_
zXM4(>ZxWbi`v)WLih@~qe<-5~@0Zo(4T0@uXH*5VkrREz)Bt0p9z~q4it{2NBv`;y
z1CtkN3r0T8#eyQbW=AU|z@)mP3}zDj!=TDqjm)!bKLVkpiZQT#sGW|&|G{j9kq$=S
znF^jXAYME1<g0DLKzhPr^CAL9@7y6HpPaBT&V}?pXu=u?_KY?bpNK|<;(42dbyQN(
zP*_k!AJ&6_oLTV3*YB9HL_;vRm`?6O&m(&&{=O4dI+}rSnRr+T(P?;c!Ww{Qf(dKJ
zF7AeX@183DArscO_SBy+a`l|W=F?~}^pxl^m0GZ;WY7J8Ijb6WhU<zsYbJZin6rur
zd(K&%=p6PhYZ#rxt}atd0(^3CsIDb*RDA%(#hWP7S%)A2X%$Jdcc@NfD*_K#&so)q
z^`nZlkOt*q&bsphZzo7JX9XY-+P3hZ6J+5@Cm4!n0w<XI9=A=uw-aRj&z#_{e@iDg
zgZje>Iw&)oAWhgorGMHPxbcZ{n4Axq9IpxT%X-OB%pUB+rG!;Sfa-e;M?ruE5}&Z1
z?sUNzu<q$)s$2}$k1&`pT&H#)t~@dD$}Wnb2ZWnRt<h73c(SL);hCVP8rra}Iw5x=
zS`fz45g3bAm0L6+3f=*;=0yuK3apYv8AF4$&-iN#z_TD}PoodzTunUkE*RA%|1{j$
zr!p-sn&S7J=nHZ@c45*IGclHVnM<&G5fn*lGOB$jShAU5?vhOcyF|_u;ywITvcL9y
z?D}Ggz?w8SyeT<<$rgUI1SdzXGp(da8O6Cv4v>+*_GVNq!2Hgsic?L13(c7Yk&mrt
z74}8mvM>1+3UW63Z{3P{KbkVP>jZ2dg^q<<S&iqrISHnIPSQ<KhwY@IJGh<|Gb<OM
zC;XaRR%EO~J+zXJ#ZH$0mP0U^iiPuiVnNKca{wFm(KQ3h;Cc2DJIQU$z*ZQ|PC=&<
z=pfMfz69u?cDn+y2)v_!Lb`4kI|MHf5&kN@@&U18w(k}qF3K2S{}yjdUZM(9XfIK3
z<Lm|E?ZOOuu0o~S)6`p;Jwd%y*@fb5jY!7r64|s^-`ZCxARU&l^A(Uq;6ept5;#i%
zLkXOyfB=D`6fl55Du9B>dj;5Jz&_!Z{b+3GaBO?$*A)!&Szl}mzw7tcJj5P?afRQ*
z(BZb5!cFPeZ(Z8UUsH|3+`2ItN#XrHtddu9m*yRXoxF((+atWcXAFCu!uAaBPmW>n
zJtKlca(I7A47(eKCf}6s{$4R`xx)4e@9!PMj#Sv*;r*#GY^K7dhW8&5!}e6zL&E#}
z#IWz6_t{#X@cu(%SdKYj4-N0{8^bat5!*Ms|F9T#p28j$-hX%uyBk(!T^H5(2{y|b
zcnL@BPvu5ha#wd9iVgY~S9hSKAgLL-+!~R{VLT(B0{BEm6YAroITu?pR=62^>{D0|
zd+bxN|2_*l>Qk`SzCC5x3#>N-L<dq<;N3s?(vDzGOZmp6mch^Fyy&m7u`0k2OTW18
z+;pqt^YFVjh1aL!EFyPl!4O|v8TLK4phjNO2RMR2xkC5w*YH48pwXQgjiZfa8jb$c
zXdF{4(P(t2MyL2|uGMJts7AwuMr$;>RHJ+QYcAAi^r=Rt`fE<sXmqMZAL6e$T%*ye
z8r{cV^92vss1CYSqYw4hv}rW@Ripd*Yo5_)bgV`n=C65Np_>j5e~F%yCH|V_0^MdU
zgPBr%&TRl2_p*D%ovVSkbJRTuHD>ws8Ub#|$o9bwa|Bj_vH>pRb0XMaZcN8sG0tO&
z5T^xp18U3iSuEej#&9C!a~{D3pT<uK_9eKA;AaHkT0ZA?f}a!YL-1yTUl2TmU<JWW
zf~f?j5!_FZ!Pgl}@JoWd2wp;P8RT+?V7sz$Cf;K5IcE^piE{=gh_}@@L2@bXJWN^8
zgo#jg*J?g;R~Lg{BVU~vC~f?TZSd^Fc@GErjRy!KGWeWV34TqGHgh%+{DvS#oX`0u
zLGD1yhVBQ59Xrlh7(8-}t;VNhe5;9aR-ib)uC{tt5>B^j3-RFW&lN|m|4D4>H69l?
z^%S5G3t69i(RNN}62Ku>Qo#k1s<OrTJ5T?j!zS-Pfy+~W&E?RDxkv}-Ai)qUIPF#2
zc$kUNb)1O4hh?EM8XFq77`5MeOZRRR6POhpa){vhc_>FrF^$tG$x*1}yoQXfaUAR{
zZ7*lZ=9H5=Pw=62pIy&NkFe6cXu>&EusiJ7FHSgUjXVEs+RHgs)EKr547kQVg$nSP
z$N4$>5!&6P3Vy-@o((%Eu&X2F{xEi<8^`kPUa>DdY?P{eP9xn7%k>jry~sxArw|BQ
zog<1r5B-!{4Uk%01OqwWVAg9qj*KRd<<opC<l8*HU4pmi_0!MP`z5^Zx$a>e=l1~n
z#;V~{eQOD>;|Jw=@6$OJOk?%4|C1ki`5G(oo)zBJQ8*^QxhV}dORVPgt9tO!{OB10
zpZ&8PU0t7IOC}Hg^apSpYn@S#hZXKT+=~2r{&5XZh-X%zev{X+psUT@znQ#|w@PE}
zt!4RGYstrg!TN7{f`Jw3!1>NaR?-&h<!?eqx!exb-28)llX0rb_N#|o`S}UhE9o~D
zg!WvRd-9f)UjUV~PqehI1z(V6g)j0|rnLVSbS|?`NYK?f*|`BUF?rC7{Xp&X-`dqh
zNpr`H#jS?XR^)3~GR?TgkjLF^tM>ECqpaFjLTT%YF(m94-{4*Z<ti8q3x4UhBClGJ
z{uv8!KJ!(Q6<zELO}A>dR|c8vrDCHbSzU^loVq1>kgswAdL)gzC6K*_M#R9|ll_Vn
z`M^F0&Wa;ED-uMen2h@Y!89a@xCLN4Xw<?HHQBe{ukU5}pG?kJ+S|qd+J5CtdIs)e
zFb$1Uq>49kE(A-l5eLDnIv#OgLdJ<?2qzwOi;&iEbA{XFGqF{a&hdk1hI%raBFq$v
z&vVstD4+IL*b>e#kV;p%N>`4=?hs`W{lu-V^yrw3%yq?Az%iNN=Ak;aT|ya!IKdic
zMG7(kd1&A23S|u7Zvmt%v;X!Qmu>C$+Iwkx>c=iN^phb7D%Mdm9Gl@!2%3TE05VOd
zqFD@Z)gWXoXkCDop?V=z$B_{a6;XlGvcdqU7*Hzasi(!KSR|=R>8Mnw3dgADI9+L(
zswCL10$~3G-2)>H!U|AC+L)aRVJGV3+9?1#S?hRHjCLxdonW3&Mj2G9qDsZI(@fZC
zD1W&x&%G?W`c+}7yY1%J>D<-8VKK9tvc%boY9_E341+yY9K_KR%9OD}Shv7VS=}9g
zb*i+zge!Eno3=Xe!hZI{=tNlI1X@A*sTv}cqhJL}s>VJ2B@1>Kx=yYu=^U_wMLQJ3
z4l`+oLfT;*Y>>%c+Mya8?5bCU9hTec;5%Zp8smS-?iSREvj>fzz#c!+fAT<=>}3r0
z6D>yP5f;HS&WUI-nt_#gu<3f#DuqmDjRyeOQ_#z><?*y7Rfu`i<*+5S+=%f4J4%0&
zJ1IZJpR!&Sc3f#MfMfmt=}(i<!#~iU<T=<6@u$pI<xlhASl#@I_v7*-pYwKHZshf`
zqp>%QE%|3KnIQNu7Gc4kn*AS48ZzDoYyVR@(W?7xHM(bCF3v%4>4Ec*aqa<hzh0Fe
zhc_o-b_su-Rry?QcoQxa;p_uj6{)@fNYj=aV2V?D59X5r#_s#{dhBqwABK4_U))3L
zN)KHnS_wuv_g(BCm;97qj@Pv0r~GHG$63Z3Jg<m2d6D<lK7vd@+=Xjk7$XOqS-_F)
zH(x-$8172P83@42hrwT|03jhE<n-5`h<Vl-g+C~bjkmF!TGw&6oSBOKHe9Np?ry>b
zbxYfLrV-hgANeT1ZZ&V0jf}KvKdbwf?}j8CqT@nYC)<nmu~J0+@CF2@rL1j5N7#7H
zl&>9l&62O3cpb{uoGq1Guu@FVU(&>D8u>Z<EAP)=vVolQm#pVwWX`LVH)9gUHj3QS
z|K&@d3vWurozRJsofpvcar)$5m5YSobEFMx*T7uVE<f^`6?qQor5jNNlL*!|*MMBy
z?Qt5{p|-!K5|&-t+8e1>WG64H4AT`h`)jU588#paB2FXiFD|qcMBX2XtwCn(c0bzA
zEd&PlJ?sNuz>}!<b7fxQKXU^*^;q=Oq5J&Tw4ej=OuH;u0hc9n+om9ApTFi-w1J|Y
zs(3sOuEswDHw4X<>-}do{qhN1UE`ruMqm3$G>v+AZU2*Ohm%Bq-X3`ydfBI7U?4j8
zuS*WBZ|h|(%dbnWvFfnJ)r>~a$IaE9T&D0f{9&gxxD#i3SjJ>Gt#^9z(g5~}N}vcv
z@?ZJ>uNSH59{UkIqbcKei%l`uv++@PU4q`k{bZCQa3jPGK=3<P9!i~o-b>eG0vEow
zHP(YFZUMOfbi&8mH`>ebZsgM<A3T6={R~cRABIp|+f+4`mVW3CI_n;L-CY7Vo$t#8
zZm0b>i913#P3|P88hB|Zv0`mZ$3dD{q`)V>#9-3(#<}$xx4NpSHSsH`*M3W5XLo!f
z@i*&uf<wFfj}BVUIDdgPMWp)n`Cr5+{89OmNDP0({GW=6>U+-ru=Wp(*9Xu4ZocT|
zLEYy6XZ8I5{6X>h|1*2sjn4dm_BeR{`YwCSd`8*hk>|YZA@eZz{x)(tj47E%<E|U>
zkOb=moSbse@Wa;&51?D1^p9RI1d#tB5j65YW4-Xp7NP9D_6B%?STB4_zL!Zo@ds-Y
zcfEksLAUiny2p9}^MkQo_|AMU4I|_`^F0ouZ^`$rNAJOGKR(}c^X4+<%~c1j9|Fz7
zbSv#!;V^n0{?_%@|H^(){5#iMG+|t-Y+6gF%%WRXac&rn?uWJhoAuUr*(+^}u-D!8
z)$loOugI5?j*YSW$BOpGWb$fP;Q)LZDQ9$Q=<rC(M58>X=S2UV>mw~#{8dA2u0t-k
zF*TIBPpyM;j`rW#WYlk%hWXZnXLo}u;$iS{_liWG`5iFQn3;5UjCl?UEc>vMeh;Qj
zV@`r+e?H?9opHFFhjd4kdu|=r|Ad9m8^>^qU|ejWG6Hk;On2RzhV|Znb>fp+SofxL
zy(h+|JKnSCw_Nk4@|XJq#ddU)=!E<2SI`@Y_dn|7DV~gLAUt^cgW-(94!U^ip!Y_y
zl%nx_Ag^KUK*98GdmvZF_CVBr50VwN4^kWu>-+Dy4|4BDH8?-o3_aaGb*_s8>z#?+
z;LGA+IOA7VbMKD+NI|1Fp@WBKSvk%4ocKwo$F#(`HjW@+{e7@I>f0A+y7zz(_UBkM
z97KjmPAPH=3YLer(8pA!0Y?jn{e|iNH#pf~PAL;x3+M??V*~FQE$l1c0_cR}xFVLb
zn%-R~(7Ow<eFc7IkNYd%e_tVegXn^L>={s1jn^gL_<!c}jgIWQadP#rciue#-}=-)
z^3J*z?#Q+wNyS(fdDyxpVy=+_z9S&3IpuK>Jb<pSPX=qbc}vQ5)kOiU@6qG!|4{8z
z54&D4_eZuiHlT{j_fhigCHcO;ju7~cK-~dHf$t0JHD3~~D<j`S$hSf99XmSmI$Z+m
zRin?)4cSfA!+t87JU#N!+O6b^-u@cNDUHUx&Aw`Hb8F>rxeUcX_N{BBJoDic=!h)>
zf;?}vo!~2`p2J*u$Z=gg<yi>b+n3u%K&<KYTu;lz^lMx?eieC>5h+eI#2foF(Bwrv
z#77Hq*Hj_m+4q65Rr{X;fAHT}aQSOK#V9SPo6YFej8BJ*tXquh-2D8*p-re5?!?Zm
z_*6ow|JHU?1qSC5{6`na1S`(kd-;wF_XD;hXZq-}0p1kHCnJEOcj9t>hm~Z9jvk3U
z;(l^__!00{cD2G^qlwcq{I{Me*z+=Vnq)8b9R0Ot6ORUlS`An<^XH~5$!Gbj)3_^2
zac()E^||rZh!=d=l6pSBmPataQ4nnXRY0wCATUKaa%62+<>6A43?WJYicyZ)8PGG=
z6=WsfsAq@2_8&rw>r%1Ap{8~_8n9MNfx2$8PP;4>w=n&;t`OBWU_Y8}a0x33DQFlO
zK;ENaflI@u4blp7_8_LbBv80t@2<|c+fjR+5M<L#y-35gK@1WVAE~e05kCPAA$G)J
zY%7|K{cE-r`G^<mUth<@@G&|H;rtVajLPn^&k-R~(_piw`6qYnEwEgET@bz?Hh(#H
zaNFHp2QN}BpkqbsLGV{il#3={IhLe}FH7J)1fo?c<=bg5(bPQ^?Qk>z>AtbgkCFCt
zRzkQ~yv6}rucc@NUFO`4Q$)hTA%F}xyMdAM0{Grbhof^loNK@UyCm(!{Bx<+GxA0I
z?T`nbPI4|oZ>{?Sy<_gvth%|qp9G5lwm)Mx2yrgR&-n&@ash!zd-vYU5Ov~uub-&7
z(_V<7<@OK9=C{U-FL#`Pjj1EnwJ{FN1u5^_9E~2wa`~(^XuGxvAGGk-{1!c9)m;Y-
zF^W&KLoC9u#HNT9NzPbsksOgYz4wj8mf2-iWO(Y9JRFQ36n*@)rP80&Mi`<{9`Yz2
zs<4-~(+9~Ey+j{kC+#U<KYd{ThUOwRdRlB><Ck!SGbTd|D{?_W^yJI%RuE}Z<7<~5
zUz;E=^QqW4|05L=ADZBT1H;Q-`)9sHuEW{SSw3fYmxyovC;MkC9i8T!29W;LFm~>B
zbLTCXn=UMdSoy~G6MSj-R&0D!eJ5U<Q{Kj3ewXH!l(+C!jw(;#$ph~#DbL^ycdnn(
zQQil_b^LB_`y}Y*ET!+seSpYLIjWu$y^3=t!ydvIH?s%Ckh;LRl(Gss_+fGe9ML%!
zDB)i*U#9+bw~WU5BIdtv=jHxex8NuppHQ8N^L^!~)x);E1rhx<>rvG3)hRgaX5^o!
z^WExUt7Z9QoRT(t=}~@{EPtBi`#s7R8uVXyl#e&)J3Y#CWcl0V|G7u`F^V2{2pT@~
zDBr(L=yyEJKk+EvCd<EO`96>G)w29<w!g=t{4QC(nDn1|lrNO!LjDeq^6|3#Y0~fY
zD9=&!EdSV}{1`=lB+CEeQNDkx(Eo1S=4p7|qkNkzUyM%>HoWaozFL+~hC?^3@hHDb
zmJd4$<^T35Unt9u$F;47O&;arW%<|GHg9;{qdZ5JzfJw0@F+h<mOoAX9gp(;Z^ra*
z@hIOW%Z2{iJjz$ga-n~NNBLc{T<HI<NBKfoF7$8mC?7A&h5oG`<vFT-`ro4-<;SS@
zk3f05NBRCYr2q5M&a)on+hqCI?7!_E<*Q}6uz#aR`CYPH*#CWx@`bWo_}>PP^6|1<
z*#A|J@*G({nfgEOQGSdpA4dH@^eEr|dQAT<9_8C)xzPVjkMh;BT<HI#NBLc{T<E{k
zqkN$(7y7UDC?7A&h5j#kl;_Cu<5QvE-#yBYk>y`=eDC%s-~XD>U&#NQNBK5cF67_g
zQNCK1-_7~qDUb5IWcgx_?_D0{3uU<&KN~&D$IEgteqQq^&ynRq|9^OtA0x|!{<cT?
z{#RrAH+z(CljTDHtsdp8Wx3G*UmoRm$#S88n@9OVSuXTn=TSahmJ9t~@+i+y?e~Ix
zAMq$ZMzzoJ{gFrc{#S(lV*EVgQNB%<i}CZ8NBL@5F6_VBqx>#eF6{rFNBKfoF8pu3
zNBMYJF6{q`M|qAczq=Rod(5N!7+JoU`hVb2zW?Qz{+m6@x5;v${~I3Vt7W;+|DPV^
zcgb?0|2rP#3uU>`|7nl%@v>a#|AI$(4$DKO4G)`TxOdf0($yl*#GV7UB_?i^tNUgD
z6j^=E8J}a_!Sy~;(#}@3!k_q<&P8J@{69R7T(s*mRofmX=tFgyc8#VT<Uu=;v_(ih
z<qpa~)%GlnLEE!IFLqIOKC0V}?V=n<`U_ZF>`d%<Ldt%-mIm|}>h@Q<^fB4DLnfz0
z*S1%9&>wh7>0abPzdW9PlA_Pn9MS$7O+Uzkp7wNl>e}`|5Bf!#e&-`v|Cs#8#nbQj
zht$8H=1BcD{Ynq|?U2cN|5I(h6^i~!gZ{vaT7M7v<)lAJ*S04q`j<6F>aXbsdC*TJ
z{k^)jJ<vn`MVfx+-#zq?mwyM`f%`+*nj`hs^ea8+w?ihUr><?UQ1sVp4CoKMp!N5l
zUrzery0$$@(SNc}S(N%~`avG_6G^{O*R}_G$iGO_?|j%p|9JU#z#XW6KP?gU*Yqnr
z=(j^A=Y6;R6^j04d^|#r-viHU{XOWHlm1>^+n%K8&GENJ(+~2XpGf-H_#3F`uiPJN
zf03r&`H+YHq(4d5ws*iC4$@!Kuk@hb4w+2-6@8rk&uRTV=$Dh;&|lHV>96SrdC*TJ
zy`jINkJDe%@BEvG{-ihbhdWUJ+xN!occG?V=|R68GC3u>w!K2pUunet1J7#xJ?NK{
z{v=)7o}}na|68N!2YJv>Bz?^P1}b{f{);sI&Idj8Cw<KRJKzon>96TmdeCo&Os4*d
zK2HB<wEiCS%SmtOuju3S*Ytxt=qHli&|lHV>96T`{-1~bq&M`3J5c{CJB<F<^ea8+
zw?ii9BwgEHq3BKd5433gJ?NK{J|_PpMgQ_AvG&(!`avG_6G?xsu5Axg^yc_kr0I7)
z;GsY1W8-HB+~FYoHT_Bt`t6X()L+rZ>EEpN_n==+dP9FjAE&>jALKzlk@SZCiat(%
zO~3Q69{Q8s&>!wV{rTlTI(}cM=~sHtZ--3Iy}Gu&LeZP?<G>cJzX$zt(#PV*Bt>uf
z-x^In$b)_&>0|ykP|=(AU!>`G-tVD5>0|ca0e3h^e@(yAgMK??GWA#Var$r8`g_nX
zC%vJ+qL0&G(+~2XpGbN`e?=dszoy^$7Z3eOZ|Dzqp#IsrjsDm4D?RA9Lnf!Eu5GVS
z^q=gB(I42P_4lA(PWp|ywmnJFKlYy({TfX_$b)_&>BG9VJy6kKuOkBTmqnU>=Y1af
zlYY3aZSR0P9HhUdU+F==9Wt5vEBZM7H){Po=$Dh;&|lHV>96SrdC*TJy`jINkJDe%
z@4VMTf6^QJ!yTyqV;{xrf1##d=|R68GC5&g+g_pQuh$WQ`R@j;zX$zt(ht|Q?MaH>
z^uINlevk+KMAFCnZ=j+#?Y~IV@BFie{-lrDe+S&*ApJG{N)P(&kjd0v(Z}h(UhD5c
zznt`j{)#?Me@#EggMK3E4gD2;oc@}A=RF?!lituD?m+#ow~hYS^ea8+w?igpxUOxl
zQ1qt!2b#409`wseACrHQqW{E+wZBHw5AvX&NcxSswmneMo8xDZrr){BLx0l8#?KD8
z!$JCM`jsB^+aZ&wzoL)Rf1TFfgMK;b4gD2;oc@}AkO%!l(i{3K`Z)bH{mzvh`jg(!
zAMQZ?KlvbL{|hz!N)P(&kjdGoYuhUny%|3atkwE^&@U%_EPhN<^rru<(e#5n=qHjs
z=6?efy=nhNnttb>JoG1h%>Fyz4hQM4=~sHtZ--2#{)#?M|EIP79`wseZ|JY+<Mh|`
zgFNUblHSl?(Z}hp>381kp+D)VzdX;a9yS=YeWBhB4cNHbSdZH^sp8g<y#bDbJ3E8*
z9im>SC+-kMT&^UMcZfDaQo(^Al`yUp)~ovgU-1Hcc=yn_i#wUPxHAyq-~*C_cx58f
zy$I;9eV9k(;Y~y3osf;|0)XwCR^W3dkxllcU3_llEu1tf{46i}@RC62DBdc28Rz+X
z|87NEtS?$|bvFg4{U2MnuYcke9{hj6tNZ7pQ7ek;XM%ya?shH|$Oo9!SdDvNYg>=x
zH!gpe-<+Jmt8uxJNf~J)acyQmKCX<U=i@#}CIs^3ZUKeQz7#%XPX!0w$VlOzBPDx&
zqB7@A+_HEbHe<_ALj&GI7y=4;o-XcB;1+r(*csNT5#L1(eQNQgV*PLx#b5I=YFKq^
zGOF>!XH?Z!*uQH3A6tJ74e;Jbnf>e^@v-8~_C_Fh=io!!6~aAAe0EP#uEn{I&;A|c
zK&3yaN~epPMd7X_T<NUI;LRJn@%1)5_>Nb8RE=0GdZ$3+gXH+s?jiD<-&eB)H=wIU
z1=W(`u@pG<dZ&7|e+Og1x$^DZkih;Be<B<0G)lAGUrSTiJpu6Eo(;h8iz6xNl1;3l
zTtnSwK;M<&E7ouot@~9%h}DGZ;^q{4d;q^U40Z8QrYZ{^i|bo887-*sgYOsap}@G#
zavuEgM_wdsMZ5N!ckv>jv*#PwVmo_gqn(TgY*DixxPrR-ga&dD+<?lsz+zu~Knwyp
zZCc}9DfmFiE%+m@&q#&stc`R!+<Mt#2f@yY;u~AhTz+^H?qhujue@kEuyw&Kt2yYS
zn@1+&`pujH05ee!Uqiqvk&$5fmSC0~5@a<2O2<oNFQY^o?MJ^dBpP)XBud2}Q=&g@
zP!jF5|G~BolIUSLuf;p4%{_%cybp981R8D%G(3IFa95z6_A}7U-UY&VsYWTOjy0w7
zQL2`{@lvJ6q&gl_Ii>7Q+?v8j-rRol!9u1|NDSf*(ZU{kCXK5{zDND7qQ~~wO``rz
zdnoF+|Ec{R-k0I+DS6*A%NYF#FBo0nn=*Z&Gp*>vbY5~|baby+x&bnaqreEb_TPkX
zxX-h>pspm{KK&1nEjl_gKZ4Nc@cRQlv^!BHIywy>W53sm4$tHV_cN_VD9xXntic=b
zN%o}q<72lBNmTNCL4>dK#d!J9iuUR%#4WrrybgD61}j{Q3T9CC*IW#C_I4=k4CPy{
zblmp7@-o^hs88!M47+jiVPPU0m+<US<OeIiCCE0~cR*@qABF>O=!G{V!GHGvWph06
zv;CM}vaHBrJ5ssJij3VF8QEq<$8NPEt>SBclUl75ao;i$?6ks*ZC~XX_-C=^K89;u
zQTX`!#zAoSwIaUN4Z9Veg|o}y3JtV~^Aw7#XwbImtmNoeu&wNj26sWk&*q^3pGag`
zq%d7Pqk(i7wDMGNPm6Is-r$~3?)^1)T;mHb-s-EwEi-T-bu@rm?!m1tTvsmymyHaS
z=t4fXT1mR?0MX58L`m8lu=w7<KhJEWc}8>kg?(Bgm75}Co0^08w+m3YIcW3Q9Bh-1
zU3@^I7HOc7kGcl>DGIcK?!N^pTnXz-15FmUfP*bU{0P+u+EHJ0?54FnKm<cwbti%v
zWo?NR!eN3f(LgG``F(_a_iy1<tdW5l7QbW>lrTlgYmbPAqa!!vM?S&#0f{gAvg(Eu
zpk+9KP|V%~=Y}HeBtclsX#tKdzUR5wuBPwHX3;$~FeOZ5_YAV?4jX(sLfj&%TQ}r*
zvaK67h<ZH(QJRCo5RSOSr|6<%cUkM->tWRm6nzCSnpY2p*t8bc5UE2Ml;Jfx*^I89
z)EdQS1{YpoKX!-Gbtvj-UBP5beKHh&|L@%tCR0c@5eKfW!rx#_D22npUHF0tSW75<
zL2#G<*3rri!h_bg;j1;8?L}e~3Dd&{nMyd0d0~T-F{?Lnl1EF&u%)_TXW?OQrikEf
zvdWHQm8Do^X;yFj2CRN9S-pr90<1p8!@eP&6~2MH+5X^nX3yZ;wS~zBXjZ==E9M)J
zDlAz&MOJ5lmGV^Q6&6dnOb*KR@AXtY!%iL`QRu9`qO+`MnHD&pi`X&fJ&1G0Z18e>
zE=#i$ql?@!qX;JJ-|cB70Yc3l-5W-90UkEJz<#b?_426OspEQdZR*3aYawObke<{>
zJMfvXgX+)r0pVa<5}Vd$&)ML=`B-#mG`K@%!eSg_7^h<xcSb5dMVLd}8QGQ>33fym
zcUY7%I@XS&BpU3*H)!D}7gxcdA;+PN1E4Pi{nkh&KBYXi1@ujzZ_SI0T!UcHfqD!I
z97qTXyPxch>~Z97ux*KgfH4Q$>b_*&vyc(=^oQI4aRNwmfCz@sU0HA#86d7p7M+Ed
z7Dy*A<%$|^0?h6aaCB^Iq@L|-M00g2#HTV^!1kMT8JKloGDx?YlbK1mGzy=Y0FV44
zuQk<!U}(1j5!7+7AG1d_CWkFPe9>621Ai8G^!DE}A2LN#R%7HQ1*LJIUv%tiku+_>
z_7kEhcg2@Ow9e?r*CGWN_TgYBd3E-lztz=ksMJkFc(%L+J&-asjz>VK$5@oKAsT!Q
z{wRcs20N?^f-U81UF$Rl0&z+<2ZnZo3r$$f8m{nHxQrk(%-pA&bwp>f;OImJ8O?e1
z(#|ne9~4}h1DS5qx?Oreb08*Cb6}jV(;O(K2f#UmI|jE#UZf|UxZm}}KmVWWiRqxy
zp7?$>Jn?=64(W;Sq8HgdCLsKq9tbm>^6@VmhyXrx7C*`@{CZ4`+gYr^YKUl0SA*L)
zm)ma@H~O^<#Qt>%1u^!54syOT)8+i-1FoEB8l3qJ5cW5iBBY!-<gCV_lNM_xjqW3+
zA%-uh0u8KHsFV$zj-h$GYKR1NLrTEkjXO^wNXd3y2CeGlgsiDMbU0SB9t_s!#j&<b
z))O`B_J!ZVmn_#JD3I@l8NxQ?`_5mbZKTLqCf}Ye-xI{Ji3VHY?4ABw-)Bk)lQ)Y+
zW&?P}A{`?o6T;@;fhW;qbnJmBvO@&e$jkV5D}+JpCfr3qr!!$IQq3bBJQ1b3(>rcL
z*OVZf3zM}MkpcWPlN_QW4-jmmb4r)|&F3yZe~jM=2EP{aJJRLnCxe92YbQXMP)Y0R
z`&|dhFu4AaT%Xofa;89EP1}CC5SfXk^d8J)a8u4$TwkbZKE4PYlLlt6OkenYDVf*b
z21|=^EJ@NiHL?>Hta6!vBm78l1dkPUqIX<-sWN5_ZbhW+&2~ksVipanDl#W<D-K}A
z5wVH`jEYR_+=}>2N3(rGtYVr`k?A=<mnmd~gEZTjzj5{a%Z<?Ufv78Ux+ZMqix5gh
zA`eH?=57x1OX8SCs~+q97q`buKN!cx4n^Z~DCT08qgWz0uEOLhUWK2k(Jk7vXQ3NZ
zw+tnR{lAjpHV0Dag{t-T1xm%V=-5y3)the2!1q@SOHg!hTVKNzs;%c2qpdpC7J5FO
zZ?l-7!@*BQQYzx1n8c&OPm#ymS_Z+idD?%eninh2V_m@TznHbbX+T`agZSqulK28s
zc>DE;EB9%Xo3N+r`fr^_(X_`JK7>VBv;DT#H#S|_LL{T2@X9A*6~Empqp^yvS4IQL
zPz!^TJ)Rt=NZ*(ui?LR*2@7eP1L?xaK+_yZX2-{SB!<A^PSos-`taew5OcNaKYO}i
zF8S8LsiYes_of?miWw3jiqNJU&^~&qh}8_1XUDj4!0KRm9J!sRxwYq!m2igu#$9D!
zg3bKrkbLcZ<U7`mqCHl^Z(+Etw_LP!A}cIRpcMP&3vj8CEA9L5aWy;Hkfr&TkmW(R
zungSzvTs8z#7B45yl9H6*2EaAvkg}FkyVk)iZL5{kcTOuVe+2%oc?5;vBz~r&k)VG
zZ<Q<ZAfufS+xY@MB1Km1&|YRc_-aD4dy@sndtw}i7#uGq$G>#vh*%TH5%VNeqHiN4
zB9CM)85KDwapSltgq;*_o`@0|_vf<<MR_}iv8KQi<&py53hG)Rz;%ZesRSP^0hsLR
z*3>S9ju2|~%_{WX9{j3L`I;=|tF2;-;IWm`f^|dwa06UOuc_X;ReSPlGN4kNp_ryx
zE6$ZdT0>#~3Pp{K64}vWlfyEX!yS?Xgl@K9XPyBr^Dhvh1G7<VMsbCAN*uG9CbP?2
zX2F;^Szy%SijxHnyZ$7V;o_)n$TcR1LtG9$!9m#u+eXrW2*bO~@GetA#sfWOr(PqR
z`!uw2qilsFbwdW5jof#Ow%VWI)M|5~BdTKfuAB`Eej|-%3{aS*uuzI2o*h=*2n=Bu
zG}2%7uZzzF9?L#~XNQPr&|m2QCSyCc)byz4c#31lrCl_cj%T;)-UBNnT&Rh!;{F7#
zM3;ms31`LUo?HD-reS+<DNcrhHfJN*+yy~<0T)Jq3ImeDLc4&&>)A$&>*6*`oB4fI
zN|r!67AOR@?PrRqJv12=ZI$ptrL^cs^u#mRD8Ra`MJ4BY2kh-hZiLytLUuK>Os0@)
zh0=Ayo~^<-{aSZJ`>|eqaH|;v)R4vYS@1x3@qr$pzWAFQ>V>~OD|<JPZF`C!a7SPk
zM<AB<C}MKhA)eBYIP9e*;n~=r?m}qgq;XkH+A~mHOItu`ub{N#;arK<Xf_Vl<jO1g
zDh7Ad61JATCG+Dw*pEj_LEd|oyOuB)S?T1xuU7N^3=M!Ggu^gqk`Nt{X>@;wQ;!Nt
zDUqDBX$*_q35ErG5;|Ts(e05LF*!^fTNXf$>$Dv0W#Qm%`ctPK6}vGi>ZPjgYMsva
z5uHz1z(8ZF^4DGkySl32pW77g5mAdmbCBSxaIupm(sOsiMs%Cx7|`Ka#H3HZcKhkx
zKf1Owb+~UnbogtH9^C7sEinj$E%A)AW#^V)yRW?zf0ML9nQXZR6<ZEicQ*C~pS9AE
z!U62pmatbVlgU8YsY7o~?q2Xz_2PD4<ze{BxRiCAj?T)%H~9h;`kkv))ZGDs6-Hu-
z8{2MsgW-4I&B`CTS%uSy_T<>U6~vQ+7nCm@aE)~1U9OQ%r3Px~`Dvt^!`evasHiXI
zo=SWnvvy_#4n!72y@2%kK6-l4^B8K15fC~8L^Lu3eXIYLfoMn@bgF4kBQG}sH^qYB
z|2Ix}C`ZcXWv<}nrok<h5d0Zhh5_HX0fescRp_RZ*2yMSyp!>daSwhqrX^Jtx{-P)
zoDc!mEeI4E(OJjSLQ8sYhnfPYYb<kfCx&JpEF?PuCeqtH;l-^!kqHVPo`QIwtc6$s
z|L^9KN<4^#5MV@iY2bh%NI;voNYW$-0<^9U%a>g327#%6a7{ee@Qu+Sn0Pw8TP9=(
z%OVK4Q=<q1!X1=jF4?_`I!B(Ki7qkD^n#uEUZh|(%3zRM7SdH@X8ytN-L}(FuKlt%
z+y1Cpckd^{-6UzCnEwnkUE{$RZMH$XuL87xcWM7=(qik&P3)Dg4ZF(qwec~AKQkD<
zMTV1IhSR{XIaq@V`r$l-_h_&Ndx4uDr9Ucd9h|dhmEk7XAgyL(=5}IqOlvABqM)#H
zHdIXCQhO5!!Eq@Ww;vM?E{%?<t&aw)b2e1=mZeGI;L;=%To9}&-&y^%AgIKKmoQ9%
z1|nlNKZ<tIRs`(A!D={itiBp5XhR;Ngo8D4<?4u~#((SahFZ4@b*rsq(Y)IFNbt58
zM@&Aqx$EuK{DM}wh+-O#asBcbVe1@wWtDV-Lqk7DN4|Tf>jdT)|5Z8c{_h*L6L9V(
zdntaTu-H&0D!if+U!>=)+vSj6m8q&+iOh<%dm)%>=ewBg&e831JHcgYMSmy!cT}<L
z$P65Wr6T%cwIb>?`%=KD{V*5o1@K$waSDv>c)jqG&FHlsCj<B78)v>>-O$ygMq0N7
zC`BYdy+i`kJ3#`Jrk72eQvL;Vs4S=Sx329o3>(~dJ!~**v9|ru7%U7!Fdo+H_*yCP
z8sX_$r*J&sC^;L-IWCWgu{6`X(PVkF%W|Wz49Ud-C0Sd`)QL4yNRCqxu$1roZj>9a
z9=pSp+033!WCQ<Pqytu|lu`|O!AcGJ5iwTNW30^B*PE=aby;zZrxv~9O>W7Xu`j$k
zpx1bjvCku2hK1cz8@;*%tqA=44yP)iK7xUB0NI2nk#i{wB5NIiMHW)0nqNxWW8=O!
z65JIHKA$-8#^x^D)iHkRm2POe@pji_ry1RRMH%$G`Ij&mkUr;nB?{wniof>Xs*BV@
zi>Do?yHZRsd1B2V{amICZgp`65-SmPd1#VuMf<nXkZRo!Uf;)<cHL@7hmtWL<B0D&
zbQXO}1>o9q3tgRmRqyI-CWg^@(D~OGOj75?Vt#-uay2P}GsYy^wVivwx!aN)|CFXW
zl!#FzHcP6s;8r*}k_k9_OnDm5lKt+Mxf+YQ?s6?);U5A`!ZhK(<xbTl7<5bkm_JBH
zDids<FIkG*rPKeUTw2yNg}5?_NsI}II>HIL!4QYX#3WgWqzMuuS&HT%S;9QTO)PZw
zArPYd6y~&h=sy#4WN?VBFCj36y`E!$0!D?V@N6Xd;%u3(__nMK6(_QJMJUS$i~Sav
zWEnOBOZ153hWGe#P+G8E+~U~AQnl6Ruc?+6+lm&^1!GN{jAG+vzki$VC(E|oz8We`
zcv=vdAmboBHI27=L_o~4;_#AGta5C3m6HZ6k2&46UWVvMWd|ZZeuJIn;_>%L7Qd|~
z!lfmbnX_XxIZs3?+rSH!3vdAiOYz;+0z)yG$4PDFy~S{7zu2?F;2@cR7<s+_5dcDE
zFmjy)=1*2T3GSj>DnvRA%x~Z|@V7&(<|f7{v8f>n26u?b$Sn}Fix5{108|p|pi@EB
z%>s(`z1>(BQ&z^XUd)b!*w7P)&wARH`Ijh?ykA7DZO1vBBulNt*K?9gxvi}8AnV9&
z1y>B16ed=yFVNT`NPikT0O@s`8-vo}h&shM|2mADJ~%~}%a_eE#M?q<k=wo)b?Htp
z=zd1JS1L7KouZ5HTxS1q2On~Qg}Qp>YFn3yDA`9ge@To@w;|%M`3h9RE|R6rU8$?l
z`&l|f_-mh)1UT30t==e|k|?|(b}mk1Yo;=S;+mP*7%<S00uhe7*VGvkgpgAZg!wkL
zsXKh|$`L~CF;+BX&aYepn>obCvtZ!sL(0JIXLVPdQM0?0n7kmR7(T9JWQlA*?e40Z
zJK%+Y)@+Y~yJ9GEh;P+B*KL;RT`6`5WjZP6sa%~VfWzmz#gO3%rrZ%;8Iju54VerD
z(1@V1TM&U|7xSOFqb_JqY(V}*0YiB<p%RguoC%f4&f~C{lDmDXk`#W`_U&6s9hN)$
z_EjQbv(H>iy*(2KPOH#zU57S#{fFY{-KDnJ!d+|-*=q6IZFfnc#qQE*PBdY;>1iXH
zb(FuUWOp8vzD+}Wi{A-ay<wak6My7V*FQ303O#Tyq4-ypYw;iYUfh-Uy?Uu@oISUC
zavwnM``k{u{JXfPQZoF~w<}}ZZ;x>|4~0IO0q(zZxp#o>TRT>8SF-;k#{C>`?oW{W
z`7ZZ6BzFv4;WFK>(8SQO3p5^XfUoLA&=s!Q90-7;^x@cLo=$yhMsHB#;|GZI!Ch)k
z9V_NtN#Vu2de8sRYkbBHK(RNh&XD!K8b{1`wk!zw!3VayJJ#}Uw7eTFD`w6n*CZW!
z8=x8SM61OeAhA0TGfAv_M3LTvM0`Z1j~QR@=E%`LVvg8fJ`%SUu9%L_c<VYH#%6ps
zBg6-Ikz|w<W7La`Zg&~|R9&}n$7%<64t|Cq6mMRSd65UbSQm5Jz`?eNIw0$El%v@0
z`R}1Z61y)N#cnyZd8`yd5Eikly+lvAI$7y<P%hG@wTjK=Yh5;XkPTtUW}xS>_=1Q2
zcbago7$fuIm<Z=n1j0W=6UFAMS*|AOWJCCWtx0dmaI|aTvsI6!BN@UZFM1EDp2s$I
z!E6zOajj3*0tFIY-jP`oZ&8^Q;_jbM+!fz-vl|M3N{?p1;k}z_P|*fYDbY#bDpI=M
zp+oiSA@N%s^=k4pj-q?TDknM^ish(LapjRSWTCNcGO|$~z>F%y5^ktS{o9Qq?2j{4
z$V73;r*5LEWC2ea5XqI8feX2oI$Qy?Q=nZXoUGQZ8I?`PaS8;yuwt^L{DLLgm~Uji
zIQ5XrMO6XorATY+FsZd&SlA_&Qh<?pO4538W{-Rq+p(1he!)&L6EVR#4qezTG7P4}
zTpV>u%V4tCSJa>nFY-!*i1bjeybo@KJaI;$uCymmAinpNBAH%x3S=GJ#9GE8wquAW
zKUTr}!r{fQ`KovdyA^zqM!aTbLGnn5bUJ>np8;OWoQLVj%+REbGF*Cfie!@fT9C9a
zRvY;kf1IA(du)1t{}Pu64*B$SaIgRY!A&jqI++m7PX#3$G}xiG@u`Wrxp_fw_kzE&
z*v($uLGfRk#hg|ioc_JcVv;NnB>#<BOm2e(xBnl^!oJns@>_iB>U(&LNm49G{u{HH
z+>Q|3{(mrwOg3c_vS`}(GK)!)FG&6yvzXkzDxm;B)+|Q6mZ!l#D&}zdO&oLN&bTz!
zNJ#7a4c$r#6>&~<sa3==2?l~7&QxOJjXy|k|CV@T@@k#h-BcAnF5ckkos4E?N0fuR
zXLK**1}06UrUWr_V$R|u{ZUf*z22Csxii5vyL;teTswC$nbwcvAW+ofViSc(IAWlx
zT%{{-sjk+q;DH63<gulCN$mAz{fd9@Z7JXJBl%R<OGtpt)%V3gs|*N~%M@7ex{ge%
znO|Y1It-rx(B3Jhb~{o}hG+>%wjig0I5)d6=z2~89W9_fC^edJgtIi**0WKaNg0>z
zy77a-C4cbyN3r4;lbGxiDHmf-ZT_22^Kg%)J3T(lvm+0m8#?$?LBYs+^^u@G&xu5u
zCxA1m+~o5_D6W&wmI+8cuY=pk<g=eJ7gV)z>!g9b5<gQSk>QpVl$L2uRjb=TuUG>a
zMgzZL1APu|;8yPjbo+0|7(cwgmE640aSa(ih92!E`I+C&*pFh}$Vsob!%aCrq+@CD
z)9S^ajtcz(f6GHd@pnP!Z2Y}Cbh>{nRZlry$x}DvP>2Bk6n3&NDN>?1=U_;)iTLqT
zMQe(d)mRd&`;fK2_8wgZ(+_%0*p%mskI#2iH*NaVc&Pp)oKdLmYy>fxRr1a^GM{Gq
zsn=Z=^I|McMm^o{tH`3lWl^o#_dhxI)1~zqBCFZBStAr|FG3rmDshS}+t~Mx+x3UW
z+BnH*V;bA=>o(d4k@DaM4%H2~o)$Oue}<24VC-`XQN)6?p&DQeOZ;MkzQc=56x;qM
zs~Mhf{HwP8B1%i@b^AW@k+C2uLKl`13wldcOGInYdWJChx|G39v?-4s4wDw<$U3aj
z?W8PJ(iZ;pIB4HbJ4JgM`D_sa#a+skAN7N3>hMCU+CdRd$LdO{N?oeK{K5sHbI3@6
z{<W<T;I@zypl--0v<XU~g#FJ+dTfjZNn0bxahtLd4x%HV(#a^2kx{EH#TCn3>C0EM
zZHgAbW|cHNKXaVPEnp6v*HFGyiwd_MBS*qklB~mE<00?Kz=<W8z*?dkc;<rLGiCrX
zt}Mh(0KsVWDoBkjl$T^-v;AwZQCAD^tZ)r~3TkP?SK_{py=<}`9miwwA&8@eRYnWT
zx4C=@VtjsP@EJ=!g)X1h4PAbPAfklu$B0iihyx@((j{J|h~?Z#i*B`|<CHwuH`$Ax
z&e^CgOvUEft<vlZs<A55PCQbv(q>-Zul*MqmMjD5SnP;oVN4t%j9ap?Lq$}=r{<%h
z@EmD}-$nAklg}~FrXq_FSKnf^y56V&jvs~!Jo@rV8J!uT{N-*ZWE%RH;jWQ=r1n5(
znUYY2MB$)LSRu70qbdt@1KF|Q^7rfA2F%{Mhz&eGQM)+9lCqW@UhSC>t~^F-emeg&
z($3ek_~32kL^ttp7lc}>tHs6p%owL*3^lvPg45}mQ@fZx-OYiw5!u}6<nonn3L~zK
z6{}k=s>!9nuIj~Iqe4gEZ+WOM{w@gh!r!aocY&nH`ZNC+Ik+XsOZVzL00wE%eeUYw
zuQ^2x2aQ_buMyw30`n=93@YcBNwe1t83G;Mpi(>zo+{5Zosaa0Fg&?xkMLxFt-6Iu
zrb}Qj-i%e@7oHb468pU&1``gTQZ~$U``R4Q+i{u7-ZDPca+z)!X8Z=pA;u!f!!h#H
zj857_@>>n^Y)x*AG-S9{pb_IJZj=UQatrrb?U}zv{E-n97B8GB&{0{3BKTq=m>>DL
zo+2b0BK$qBDbcDa0R!e4_yRrMQZ!KI-d%4)n{C+XXc0Rd?Bh0^!nXNB`~)D4tA(n<
zC*ymlzzWycVntrE>&JkptX3uXh)W7oMS(coC;P#09)3fFzxG&!dZ2L0fZx=lKkdRf
zx)Uzf0hdF{%?TGGA_r$o_-c;ZjpmTOZ8VI~b+yVToXN5mNJ$e5%JnfyGn(8$%FQn2
zNs3bYaCzuLilD(!g4O&OD>E<`k<|?@tF5vhph~lS$w%}gL7WpKz5q4!)G~_1!(8I!
znz*j8OxQq9UO(BXI}U6`Z)hDWIeBY2<v4q3wmZ(i+#P3~=qC{++r_mNir3n%*<uP#
zlQtLaq7Nh(b79e7x7Z`DTf{nw-a1M(-tA)w?YK2&Uxkb8dZeEp-&K{QJvv4^*f7yf
z(jIHj>SOg-?~r=yb*_t;9sM$?cZ`j-{IoW4G*HIfJqBayV8P%izg0|lo>VtvGiY7=
zHQ=CBnWzd++&*E_!~P%$2KouM*rMiaDSY5?^c|uJwv$ta#6&b5=zfZr6BDr=|2Qvb
zW1)oaB9w^!b31_}1|6r;JY%%HaIL2)=CYlawHr9=WEWtp$<cBFTE!QGtb%<Io;d1i
zV|>gGxr%%uE}sR8kM54Eg}>*=Xw5YiuMgTIUD^v2ZM2@vN&&bG$8AV)dJ<KO@{})U
zX{R0v4sNHMG)j;CPZT}<I`0MgHB2ZDO3ou&C7c`~54+4xGMT+`rDjGENM%LEnm%S2
z@ij5{poQvs#6rVlfo~;gwW1tT=rA#Wk-0BHE3%o`FjE~Mw_LC%u}W8AH2IKfvFSE7
zK3!Kx6TuT+6YB&nBsR@-9l`8`&4uWM7MiXLKWa}04QzznBunN11H9ZzooUh6Q>H8K
zbwdW?vZUK5zbT9jwgXyxV?sMCCX_j+rh$ij4usMnqDj%lRN^rIWQM0oJxKi-I#5${
zohJK3%|mqqMQ_nRPNpGqHqH;q{%E%Q3(Jb{5t8FL<yWd1d^<`${)x`IAuq#bu8I#`
z;WpHcd((6j)r#;8_HmUMtC-^O4+@%7oEorVZGv}kJL^}R5LL#}FQ#7(RM7LpgY2@S
z4E-LAjXL@fv!Oc_N(rO6_e8HM=1)v_jd!-eypGHpWVb<-E=R2hz@gdh=y1s^W8~)>
z<hPJ~p(MvLrpq~82_l1uyT7&yG&GhQW0+#YbaFSW8KtVrVv$dc5_vAFMv3zv>fyT?
zSiSuh1`B*qgFIVO15~r-w=LikcH#9&=I4WYgo6hJUm1I%!2?*WKPK5F81?wf5F1r~
zCB^>EK^-5nv)n<2ZyR$@s+c~h`-R%u%l$Pk#k#l}tzJ0FU$Z72CBxY}({yY+_X@*F
zdyjOT)VbGn4VhRn`LwaIi#d-GNqFJo$eP`iyo^z5Y7nz?m1+euySXLfT#n9PgwF`*
zfj3k)rGpy3sLWrp#I>7hl=IbK_4(iqh|z(`c&kW2X$ySNhzTf0yck?F<r5?_0VM%~
zp!hZy{|GcLCS;&wbz_2tc4BCDAOYQK6vH{>M9r)2dV08UR3-C1;rCa)>CW4WOWa{&
z?sy$d)tAJq+#V3LhWF}uyB!N+SRuqKu7dd)ZDj7RnTg@no>UKXCvLrNb>n%ua1$IT
zFhG+?yTz!G-vrAmkI*a!iS7$_wu|rP(Dkp5ufITjV;siyoqF4gljOql4X)CwmQIyQ
z^S%Q1fkqmh^)M_h+`3%|qv8!eXXPs3`gvsR4lQ9ib%C3Q7w<@_JXT}-2&|md@Bv1y
zJ=nK7!On^a;_Wo%Q7pIJb?_rt=ptbHBM(WhYJ>4)rIeSa=)v%2fg$pj`Jhm%OtnkE
z6;Efy9MJ}k?6otRP6RnNREYgCB#4~z$`yNVF4YZLVK(uHSQBD53Ld5=1#lMqE<n-$
zN1@;2Y(>|E-(w#Po}r8&;1UUgnJm9a!W;m$yeJLL={~$#?6Q>pBN}bC(@hdIaiN4I
z8EzIGI^DmgX`kgkwFy@#A{(riKMLbg;UOC|dc!{dxF)pG0h5c$&#-dKL6rsiSyX;z
zpZ}Wm-m*NnQ~H1qKN=WEv14cFg>d_ZZl8X`a<jpz3pS}Jg$;6F_fPj6f2m>3@4@4}
zUI+8oc8S4*-V5tj@<XDXXb{tlEg!r2*b@8@m=1~gln<!0wK>o~_L>%Z?HhYdjlCwj
zuY4f`_<pnfXG_{gY-QQW5DMS86UBnK<XIR3L+n}qlSf$owXax7_`lWvFQ5NkO(*-n
z80*ab{3ey^#UJ`AlX=$+VkWiYv$Y@k@Wo@jDvzQ&EJM&|9>l@A1@q_m;l-aG9xLL6
zsEY=MaVeBEC9MkUe^TTbIP#V&A;{ZfIKIIsxVt?$JV#vF^HpWrmj{*9_$>AJka{z2
z`uXsEL27gkPvPeQcMC}kae%^>BX%75{WxgE$@?Lup+UsQ`nQ_4#FsvAz$pI4g8d^e
ziHK%j557=Zv=y(`0^Q;dQA?c@%td#952gC+CcamH$CN^+CX1eGkv-J{tulpHYUh4A
zy2)Fsdr%@=>_=#|tmpUA>KD3+AFfqucdbTU%Dz%sZN4~>R?ng2$7;0!`<dUPuV(5d
zez;bByKA-R5<{y!xrwy;3?)BStKCS;zK2$K=q7%+R%zX}x<6=Wb&O!l>lJD0qgVKH
zRd1I(T+8qyTrwr0ng|_?QZ_CYBY<28Tbzbr-r4h}Z2RuO4~iv?7-U;9<oZ)#1XTIK
z3x3UK-H<2q44qFFd=XF)D4&t<m?Z|dkv}jn_L`2M%BTth^&d97x6gNDe+AXm--$Gk
zK%`LP!kx$8#G?V$t}Dch=@zoB$-gk>GMNfQ#4b@AORpFPZs8_;Pj$2XTS_Gk2*rYw
z3*FA)h6W`UqP#ApUErVyqbCzWCId42&Snh(HrxNcNW6;-XFNJlr2tEI*F#Tyb$V?T
zbnG_0HwSO4#`l2WOV~x}UjOM$kzhSjkG=j=gZ04Ik!r91|FQQjfK^pz+VI-@+;cmZ
z1V|9c4kQ{RgmZEuL@oh|LIp`hYO8e)NzMtRh9vHDf(OJJR9aJtP`~#7>r7|rICjQf
zzOPI>?I_<^v09-WzYg~6ZF)sMW}2#PQ*Ar8b(;Tq-?jGToFrgJ`~Uw;-AeXeYrX4T
z?|N_RU6;MjJ5Tk8>z@FYt_Q5r_rDZ=N7n<C^)TS*4^_U$D!vDqTafA0JVs#jr?d$@
zLJ9CiN*nRqA8tIg2ZIE?BYxDUeY6(eNBQ8Pt`Fj)z=!YA@n-1p$NV?196$AM&p+{-
z@c5}GUa4O4#qrbH2@U@acY6?vZv5gCujqKyaQur;JYTJye)g4dd}Z^4+UdI{>VRat
z>%)h-K79C|a`feR=GgAJ5>>!PHE`l{82FYvF@DO0ZX9i(GBpB$jdy+I(3TI+SbN#m
zv4=F?_4PwrK63n9V@tnD*bjS)o_Hk)6c)9IH|obf`H4?lMupG*#?Gcrpb<3vm2vt|
z<}4V-zq#b;fv1~bU*lh0^7QeY{r&yplf({vd{VK4df)ZTp)L0u|I%k4{D4hEv5SWB
zu2YA)P7R!x{Q1<BhS&4~JAThS_t>z$JlLlWbv<?SiKhs#>z?th6WE9X|LE)UCrEoo
zFURwE*CWtB{JWjr_+C|44_(6>-yvwZ9fHPRhK_u%K=bepX4m%+$u<C&oOD<L?+Fxb
z`QFP=9B3R@eK~wX*Y`+Qiq?_dyyJ<NR6R_hyRbrZ;!(0>-Vo<W3hh<led@V54+P?f
zFt0_}2n@z1Eb@4q087Xu?FCEv%=CBG16>$pu*kQeO`Ph!0k?_X4mL4b5k}01e2Ujj
zc3R;kJzM^To(&7;3*Y$;oAm@jU#_G`P^W-^g<g#4uE&@6)N!xz?gr=ZR0ILBT23YE
zHWPK15T?Vek95*)^*`V?@zhF19nPX*vXGUh#UQEu9@#2M?QuxWI>>b3#khu&@yEW6
z3DL=eXYdzT#mR%`@I=?igA?j$<H>`Qsu+*2IFABgR0+93=P}|uRyvR5Wfg3_^B8v?
z+tuUAgYb-eZ9M!TEN#j8?xS<4o$+tIyz!GK4`LCUuagJgZasA!qo<8`ad!lND|;QJ
zcD#kE9d13mmTs#*fZN3VXbB%TpnhW|g1JRY?y+rh`ifPST2!)@|8Ax#bD<@VPy;_+
ziV)@TT_~UU8tKD0=9dx^f3Vy_N*}jm8XoHe4fK@;&=8U|tR;kV@@R<Q0}UjAjkNdk
zA!0k0PJ(zOw&Rg<iS2l-#3QjCj|Luz?Rc!mBe5NiaXePa$96mt+mYLeM`Al3H{y}l
zj>m31f)ILqJ06Mec--;$!_>c*NTb~S13^~*h-uuv@1rxv(MRgvk~PfZJ-AJ@u0UlM
zIDP`)@IeyhaVbaaUq1Wt#*ZnGL*3Y;v2aJ^n%MIn=S85aT?M4Vueum#8=qA*ND$x2
zgJ4sA1PxEUkg;0>yX-2mwR7K|v9){ia4A!_t5R!&c@VDFW_smTEuCUT3LhFJYyJXw
zb$q8}-}6utsOV2|oA^~Lu@5nh)K7ln#O${9b1`aI`nh;D;P*fmWhZa3tAV&-*{cBB
zB@U<}c$AY9n<RT@D!QL#hsJIGDnd~0cBhQ{inUwEqvm@mo%;43*|##;w;1(pzU<p8
z?!NtBTW9RsRRsU1&_mfbC@}>>o!o&=<Y8@Hz4}A1xiiKuU|R`(;^f3jauHFL7f7SP
zson}69{(#zY9IE9A%>l}O$@g%(`-`7O}uc8O{&{XJVo$7Yq#9QJ%mh=MASU_eK)nV
z@<mAbW4TtZDhcl5Hr1kgFi|~z1hEi4i2!HdaAD~eZ3>wyUjmH7l^J`D+F!tdCT0?x
zS>d5jIy4Cd#cNKaptnN;$G<B%v#EnQvjw+_?ag+(><ehi>)9<fXX;VIQqZpv{5POC
zQi$u^<cm+b$-P7#%9MhXcG-ZX)eH2;ruF9#!SP>8TK^n7IFRCB;x_T|rs=fSIkdWs
z>6-+<#HMwtTZGl$BT}-~>r?D&Ui_|`Jj30!-+aPdG@~SJ%C9HN=g@F`xb^?EGvz<R
zZQ^H%>6G8R+2-;h7neUmln+CDB$pj~Fo_}1tg~N#?ezQzB0IiB(z8v{b3M_MoPnP2
zUTxFk=41=e^Fx@rq{m_nbM5orIS(muS^zAF7wz2|>o}BJ6y&9S)>Bf-<5x<G{}MZ7
zkp0JSoA~b-=^&MFD`Uep4(mth#RPwqgug|?|26gSb_YJP@i1m)QVpYz0gc=mnWV#e
z<V}D48LCbT<`@fisYcic!+&oZDmBb`cYeVpgwub0l6a84#9i-OpXYkCt!dZmJAsi)
zHMq`cFnsshq(ip>|MAOA;u%nlVS^@#&u%<C=PdWF^@nt`)^2Js+o2Dc(S8d$NY33|
z<<qjtr`%PpfTMsJDmoq|9?CkrN?!y-6F_u^-kxJrl)XnmA7W_g5x=R(HL$(FvtFVe
zevdc7Ofu7PTCd0YFU2{w)L6ndLp;atk`jBol_eYli4*_WNQ_n3I4E*qrZ1An$#QBr
zR1uVb@ci#n-3W=>Ls%Tjhn|drE!ZQp24J~A$ZH8(Pgz6)M0#S_mXiz8E&p79UaU|8
z@^koy`oD%U$gaDwI=R!aj^F+&_vV5d+^ZT0HTI?u{h%={u3{;9RT1w^*~xeENpQ4%
za^h1APg0=8Xo3x8*CWM+N2V1TS?Ouy%mqLQ?7lT|a;a>TvCxNH@1iHgj{`3nQPRf8
zUg0xwpPS^{hbH)Tco)=q^Z3CLzVZ44hCCy@q-!j^WXsX;lI;(Km)vwLyd-@hykz(3
z@RHFp;U%|Ey8Ok#ji`zr<2rdTg5!%U@skHD)#J&7B~~VaYijY(K}iOy;4zM|4;p~M
zOoDDAOOYvO9BzG;)DAp)_%iAqMgCksTW8$Z3s7r`?r`h>R<QrTuv`q3OcIUYGGm+(
z(3X{=0_Vh;b%dMgV%*fq$%FOyTZ#UhJlKFIn!Ij2+yyl{7)NrWjINPt=hR_>Mo7<G
zJ$$eesV~sC$%y;`G%?PMxa%=y$KJVcmrFtL#GgRG(z*RB;kO9J=BXn1<iu^usq@Uy
zjfW2tVl<;z$?Q1Xs>{B!f=>L|5_a|tK*mxa&6Bl3O5-vjef5tpOPe?gvgBZ~+lED0
z@gb^(DvVotibonbhYNt@#~_-%%OgQfJ^}fYc%-$Df6Mu8N&4=TC9?jLqDUSt-gxM9
zq%v42Xt5Rwt^!6|TO;Ii#vcnZ*3Zegmzs^ek=jZVHPY6T2N$R(^qpHf#FJ~~B});>
zgZ=eH&0UYt%^r_9);=`n>{T=%O9;uYS1x6SSea6%Q1j$L_&%jmh;-dL)sCd2Q=OcW
zv&KV*c_s=*Ql@mSf6p34bRS~sHy%2ySSo)24~s&-fF0yHMEEpp^3kVD+2OPO#eYjE
zF_zE-DN9i@7I|8q$AKBr+XQt(9nXSq0^lSdCaLIO*)frSL*Pn>;P83|D)H;!G@LN@
z0|cZz?&m-bE658aQs;bpvne!qAaI=m_yWlWO-ZGG8jK(B`UQ6I#AxltL%wgWmq8GX
zeurB-CBYK(w`l2wN-XYxOxmR+XDn0CDthcjJI8N-#^uTA1Eh$2z74icJ-!fCC;o{A
zDOb$++iJjhH9Hu^vr&9B9L3Be-#SIq;np7|T(rLn6@L6I(Bb=Z$8obi#qu1v9v(@z
zOrQ+11^Uerz2r1LxRs^u{%D=mU9{Eugd2Mab@y!>53AvmgOKkMz~R<!yWu_xJ*FUp
zzH<Iyg$^q(2!!p4!`}ja;v+<qVnQLoaVD5TkCPMUAQAK(gLeM#z90EEgX813zeuC|
zIK(u*<=f5{^Dl@G9J*{dLt8c6^_oj;A@AKZ@mICbZu&09Wp!u?>luf9XF8b;hei=W
zq2BfvH;w;#9Uob1b9eREAc1w^$7G>s1>$FQ0y45-(g;%AsnP8dtF2a^!o#L<j03)X
zvK6edo;fu7BE7MPpZvwU9&eScZ6+4C{yWA4TRZO|r1;TLso)hHCF2Wb;0~=cM(|*b
z#du)}EEzx5KR!CqKfe83KZ3Xd4GjN%7mB(jai`*!a6g0lFE$?Df-fAt$RE3u{OSS8
zg>$vSwc($Dr(bsc{N+*nnC9<}-wnRtCv}g~ZTv?r0`9J*i8Ow8IK3EMdUSbjI+IOi
zcOM@??&GHzJlKz=vWYIXjOR!$<4+!^5}tmd=GCC;3opjk0rL+h?nU#oI=YTjCW?ez
zl2s*_QpvxOi1D+z)Hue43LY0m--h3@B@%CZGF%Pcvihx8)QIDgz|2ied=VtjH-0Zg
zQ5%Y2yyO3T{H=bg{6ST&RwO=p*FQ~7tU;?2|BTl9!_~wDA+|q;;YQF;j6P12^NlNi
zw`u%0;rINUo)3E)Hy`%WDQdCtSvZ=KJKu8Ht7FAHgnt67<r?xngL@EZjkZ;akq~kv
zJ#@|Ccj~Xcye=Fmrjjx|MLj4EaTXy6H$JhagldQDPjM-ZUmo3rw}{?yc*C8qzPvsh
zskkrP_?1237oP8LwCLS5ergkbH1Dnn?<V+spy~7i&#e=iR(|Ey2<k+=_W^{^MPK;r
z#+Cmpn?*8YmLj?7@W}7N@0|+|x9RXcZS(jyREOUiMpl;k;r&yb;zZX!gT8$NY1OwU
z_WU0v9{sa`<nJ?IzwT}8#;53KtT!L7`&CDc*uL@bGXLc@V%w(iUu_!y#^xpT%cH@K
zPyD-Q;t0kt{1$cANPO4o&*t&xH;?~n129Zg;Ai}v#P=&-7=0er6qG+-yzWodC7(R%
z^#5g-GSlzFZanOJD;X@53_FIA#IJsQ160?6PkaqoS0RgpNOS=6lO$1fY4`&7&h${)
z9Ox67y_sx!x9A&9^@!1-n}>#P9Wv6UIc#!P7d>s4*`l{6GtigJWX*x0ogE!LBO~dd
zl;|)-UCL1Z_ZJDXS6neXI+z-|G;0h4!nid(uya>7WAqOUrj5D`po>KEhHIMpMtc*9
zuH7Tqy;rBrq4eOUJzZUU_Mm9R2pY(y&F;2ja#zn_|F(D%x$Pt7@a|#C?z0OLRM6U!
zN+Gwud)GEfUelJ{WsYXD?bM?r8fhNsF}qv^iKdakp1$<1;Xzc(G$j+~D{MMnVe|P4
zTh3S5dcMN8^A)Z*Ut#<C3Rliv*qltn-KEW$^etTieCrt;Ob;gehfSlu2jWYKRHo<c
z$=-C&ZshGwo1)ts-f8yiHZs{BGn-Bsy?c%AnY0Og5F65a2Kv&*HKV(G(`E<d7>3Fl
z>e-z}P6jz!(z}Pvy~Y)T!+kd!*Y;%7M*Y<r_bp-gzK;HmfsWyhogMo+dOEV1TT;8k
zvOc4KuxF=Xrf(S?Fw+^M$J{x(J3W*Ybve;Tif0TBXARJvNgFq(_Zt0B8#GlUmJQ5S
zbJ_5$B|B#=>7TWvXV#K^vz1&wFgR$WbLqZOFd&QZ2OY>l*M|g`u)-QWFp8m}o{TXt
zWDJjHM@F;m(oO4K`P(xvQKPSCaG=-h$qoz;Ip!yeh=;)EWJ~!P*<qu1U?@fRzF`yg
zni9Rkqxl(qW*RtwA9VDf2(%cxQll9c`vCztq;n|16Z8beKbRcqP-;ZzGkb^ncA3LN
z1N+i~wTCbc_d<F0Fva4^fuVuSF6h*)1KC}c7ZG*4MyaG{&{*Hy*w~0R)BXJ*)2Q1i
zZZ!wMv>~IHl?kN7nGuoKZoh^UG@G{kU%n$tZb|lmLs@gQFKY~qLK37wd%(Pus*vx{
zSpV{vw|N+yOj!alMusy3lIg@A<^Kqkqo1QgOe6DYc!=1S92lBjPCZH+3IKdo^i0E3
z2#2`=T?N)^+&DbKblp^Bz>ZkJhLks)YhOB(A;DbHy<HaFI5cde()~T7gITJOGBWV@
z$mt#3Y3v4Y&rX;hOENP!yb};G2{;KdZi8Hg(pRv46e23XHUPmugV{uHRIM1-c5S_O
zL)W#*^_#a|adq<Qu6Hz1!^2A3^I_Lt>3|Uwl!tACp_&StgXQ(@N{}*a*|^WJ8%(q|
z$EmV3SzD&a$w;y}3&~rI)G%uI4ev^ua5e_HKm+;NJZq(#T?P+d#whWpNXlV`QK^Nv
zpJ-0r%wt~?utV8R*I=kiv?P;m&zorLR(|-(VRM}WmPjVEX3tQD9C2dB=unTj*U3os
zZ|c5U4WHL+gLhA{Gfs|V%}v`@B)ht<+184dhK4bkr-%9m29w#IoybhK52QfIKz6SK
z)|BWOytQX<2G~c_$q_T18t8*9b#-+Oc6DvLDv7^)u8SwH<H2~^==*y6iB6F3SkXT^
z)R!GL!J_=$s7`iur6fJ;^65!5!3p*yl?`_HZ%b}Vy2A%cK8d(9sM+Q5B;<kryZZ+$
zIIDkcGHhx~rL!0q6LAcfqr|wLK{<9x_?_u&a(7QInc39?wk3D-SeG10n@Q=SIvlS{
z(aKfntQ;U^ZGjl=HP;8Ifu;=E6zr3SRWv2Np~R?$88~ow6^!i##w?&=yU7hVwYIT_
zXQc1Vq(|UhG1B&-v_V=D+^yT6!GR8?;XdIGQa!02qZcNRw&Bz0a{Lv0y_tdX;9U5W
zSu2#Y;8Q@dSyy|zd(4~F7^H@UzQHsJqW}-F&oy2t*c&C<n;1BczUg>LOECuIRY|3L
z2D6ff%oXVgmtlP2acT3WZP#_Vho;><0~jNB_3TM2@2G}Xa+FeIG8is1M&01xDF0FZ
zMb>kaNe%}r;GS%c6iT*7VsI09G}FmlJ36fHx2XGWbvM<0NZm)&eNf&bs(wV(kEr?)
zRX?KYM^ycYsvlAHyH)*eRli%+?^gA@RsC*NzgyMsR`pM-^3$sPv?@Psl`DLw6~5D|
zozpi<`bSjzBPxGH!H+2Thplqe&WLJfM71-b+L=`4ld61Dl}}pb3h$)CJE_{6RCL5u
zJ8_jCSMYHKA6NL}3V&STk1PDgRQWMgeoU1gv&t2|V+!9f)y^^1{!w+0sC&1%pMIC(
zlS)sjbX=v6sq|5G@3!DAICYPx`{aPc=ieglwaexGN1NrnTivbvxJv(}y5FJh#|z*r
z_}T8?yE?x;ZB2gq%xanLRrlHO6y~p1@S@-<7$p)dJfTj~u(=bn9h{q%*CSW-^23SP
zIxCCs1@+Y^)6}Hq?8!oOkqr`$D3^F93h?D2M`9NI2|1`uLoY1+J7&ROfG^SHMugs4
z^^d9gBNn}dgsF4^K6@%}4Yw<KZePd+ryWr-Gaurr{iDfFZmGLR-aA$L%%8~gq`Dg_
z|HMw2&z11!sKPU*%6BL{-70-r>3JcZqkZ{!PFr|X{bTAb4(G$QtNx5A{38l(ht*!M
zgfDE*NGUv>k8cq6HMy1^$L?;FOzQk7r$y(d)m=FUVaT+2q*GSiq3)+SUqdD>9(wT4
zTksD_AvzTv>u%$dc&+y7eh0wa|E5I#e@4MIFw#fU$0SDW4vdsMRWp!@;Xj$ZtVraS
zkMt!m6b|>vWi1*q73gf$^U5>n-GHl)=jw6|m}fQ^^hE0uSW=L98vDdRYS|h94y2MP
z8gOe?8*$E4VUV59?Ao2ga+T>oAZH*YlLd6yWJ@GB){Wl8jXL!6L>vn#uI@U01enzi
z7aVbjVjZ>}6+KLY<3gJ4m}y#Bp@?u(_Z926UzxoA+D+GWS^Z90cxQ*dwrkx6w4(TZ
zVfbxZH>|tjYMwpP)R-2M21kF7_PSnYeWDlh&nvcGbImj&T>O;5?(B@W`s;$5k?{&H
zHIPy9?sRQt8O;C(y-8*3rD`l8S<^Ku99avXta2<cnZtW$YOD|fO5K+ooK_L-Vx}=;
z5j9m%MnQj)W6};@T2Y-%V<A{xs#Q0f)TvZj>0p5Z=GT*pDOL--(nA)-%6cueq44L!
z6=>~DEvR}}(&!ybC)2|zNk4T3Mq%qE<i`LR3xHA-b@%P+8QPgP6uex6a?zvk+SQmv
zrUY9}%jEg(GaN6KCBgsIunc33aVtG1Y3f25ilG;*w~87uF}WNVa#cG|dwa8K)gYS7
z>{FFyXs1wa(xPvyOWg$8W`(zXjdA7n&6`y(Y<M^<?x7`NYq4q|!|Ej2=HhE1{Iq5*
z_ooU9;S2k4#pbScZat#OAFoQJp%AkY<LF%;d;xNDuF`8c{FwoMW(Fzx*>1>R&SG8`
z-(7H6I>!nvR#<7t0&~igF$x_T!4y878r*9MU^@N+>ak)+4*s>lDaY#c(?yTO>#P&>
z(;{@w9;~gfEh%bK=^T@$+Eex{dv2S(^x<weS39!bw7%==yVYH*A8vmv33J%*8X@d{
z=g~1!1*-u?x2<oI@<N_ECU8D@ha_oY)P{m*{ZZ>2q!NbAud&FO4K9mam7WxPHtAqz
z#8=Qkw*o78Y|FR<GnS0opQ-YKeiI^PY590&fOERcgkBKdwy?Z@<<Y?jj&&U5GvRgM
z9Ha^kt9+@XLMjJNqh1A0B+~|CTi11OTeo?M#edSv4mDO9^)zq}AimtTaZ8fczn$^A
z0FENYJg^Jn+kTD0nAiVV@mk=jUp(;55N2ELLZ5{w`qtuA|Ld7&R?8Ly9V+9?O23hJ
zLi|SHL;nVb?J!pLRz|7T5W&qTxPoxbTBBtBWZRN(Zug;ZI$1N|rCo9-7Bf?~nptDD
zacN!OrIx`;yb?~CpPcNlXXEkD)&2}{t_Xzy&ySCsfNWX9_3cU{exuEPIOQZqA+*$J
zsJ=N-j76@(&mo4>F^Mc|)YtVbQIHD%;BY40alPbB0amCUV9x{R8Y|f!lROLYQJ$+0
zJE;#$!FlkyW5t4gN&MI5)9V@o^67BTPeDl5D$QhvNAki+xMxWZ%kwqxO~K_UL>@cx
z@!4>*vLhcJ#8{!F47@D*NWbI;2AmxQ9e`Z6rWYY%h9|P;hwB?PfmQ)3BbNQ139qqi
zjUg=p0sW2UwG>5K`Y;nbQ4F0iXs@aP!&!FJ9HN1Rw{@}mB}F+)drtTPk0!SR<eE8C
zu)40#QUqC__2B%B$m`!M@QNX;v1ag^@Z0RT-U3&0n~wkbb=O|A>6)u*a4ds_b0cO5
zHxcJqV6xo|c0(zfaZT%2N4}EhYu5~NuvE8DA@ca(lA9ZT2E~{rCkH-HF)U8Z2H)2+
zL?ItFY+zpub2CiLXla2qmE<}OjAA-nuT>#ryQ3pm{7EZ?a45A*!{)v%W+k^->rK;)
z3$!y$GiHas)x=RM$qT6)%#keZZX37ah)HGyN2F*8q=wbQSW4Ww3G+?dEc<rro6}t|
za(ai<xerTWC0#Srw|x}ZaiO+Z`gXp0Gy68#o^n>sav^{*99hPVLCFo5M+(ia+1be+
z%nDaPvV_Z~2M6twFUX?4Q6AWz!mJtxF-A=~RDenKfE#}%okn<=rW0};n1lBN%95El
zi(Uyw+PsFhfO~uTZoU<vDD4x<6@j{@l&EXLzl^c{x+_H8JC@b$URIY{R=2UEZc9hq
zHlywaWubJNT)Xkp|L~_TeQO`yt#z#vQ^Ix0=<bb)!(3?j@r{b_Xwx-`vo%<u#a|wC
z7{8TI=YDA0x&xcxq#b*4@P#vIX$m!#t&zu<)OI~D<8n5w^13rs2W-4fyf@>N+=g>S
z4Ko4Sa88_;%wUo}6D*Nn!?P>fGmyS@W=I=On$nDbuI?K&HVG(`*tp@^<hD&Wbh(-1
zuvKxvMTd-~9lBC-<<SOI4-P$&{j-)ke9Z%~>ocE7l6l2W`!0UFA@b|9FJnGiFm|~+
zDzpl5x65t5y9(Xq4!;>7uO4!YP?1<%BsQ$OZXG*<o=o4sKvL--&#9$X$_()tBb>Ia
zTZYXxqi$JKJcB(FMv6bP*0ZW9N1O*os}gj)z;;zwz~m7VUIPPxdu`x?^?1JpgCRT+
zoe4$oF}*)CwMU|J>d3R3bfyd)9ZWgecfs}<kEN^<jpU;3U*(n%&%P<xtH)8%)wIJy
z9DpOn1e=ceA!~`X=>opOdAsWMpc-%AU~Pk;T*W>8IK4?z`wJj161M{Hm2@<kd9BWH
z;GCM&!jaLz3?eu1pD6GC$#%R75fPKL>4r-en%0T?F_oq~x&lHISIbBK8+ARqAMfbe
ze5I~w&G;M@UpijX8Fm2#72Z>{*@x?D-_@d1v}4b&Q~1z^Xw2-wRsxQe;K{o0-7V96
z$I+*uR8BMw?P(ks>L2C@Gd<YY(?<sp-v9)Xuvz{#A>YHhllcHPZ*)BvDO$b=r9lZ1
zRQHIwSE{?A?)B;(SNC?lgA?j5>qAB=E$eGS*4KoruL)US6S6+`Q5jwrvc9hB>q6Go
zRefF6*HwL8)z9zU@1G*^7N`&-=~SrJsk?O}c=uI|#7y-MD4+|~F9#fWFzq^}a^mVv
zFw{0P3-?NuPjUtgBPwa%UQth{?Hu`F)wk|`Wc%>X()R*z@vzFU?p8gbh3dW5%UzG!
ze65#N&$9oc&&isW{eSx}@-e+p_8(Vx;~$jx-S%JqAG7}~BI(BSy#8T%iVMNWbB-5+
z+BrDf+cU^sM?S&}4P-fuNDm8CL)rLwaaiOQzEtvhkF_TL|9E`Y&HF!^4m*nUN7Lmz
z)&Jw@^&iTPU)S;UKcv0AZo>X>cJaDN`-AH5>!VKT*@a#^a7q5PfZDQnEx;CcnlP|k
z5T0W3--BigHw~H>#BJfd(5n@0X1o?yha0a2I*;TFR!}t0^a`qr%-M=<%4P&`@p`uM
z9!nTA0$>v5?#+xassppUE$iyTc_8d=oCn5D$ZTMWmb@2qT@*~qx7(do;%+U!kY7hj
z_TwGYv}hy_7iRS2=YWT=YyNq@&{g$!)>oTdeLpXZ&4=?sso2(ex9mdiBsq}wf$BHk
zjh63#|K9ILlPKLYidrmxh$LP&Ptmcufu`Y+fn@({pJybuUE6h)>m7AlK>s!88J3~^
zam1c#?ThA_e>5F-T-YB?m)&F3ABUj%J>}hh2)Ffy1mSPE5MCcue?T|zx(WNknd|E&
z?GI{JuZz0hV+wSEx6oaI;RR<^P$OGfuLaoR-)or;CX>T`JLShPMC}E;S1aS4WM+3$
zJeeBUlk9J6N#auwx8mC+yOMjFWIV3M`rn7t*D|{Vb+2_YcVR%sG+z^}!~YAB02U>0
zxFDrnt(no?yUk>>xv33&f9+KUs7m{^F!O?)nE4w^7n*z01|EW#8K<3Yt(*y9kKB3A
zQ;C`2Xma+tC&9CREXwNPY!g;T56%N&OaDADmeG+W&jMEP;VO%?{MoHT(6oG;jJy)5
zJ?^Zo7Cx+!Y>iLT6E~R1pC<qQGwJ*tsQlCxBt2q5ekM}%?Ld#Ho0k71ax~=A{u*7@
z{U`ifM|*vIS^0iE5WKY)XXP(e`PTlKmEWoItvyRCpMGGT@RStb->34eJv$5j|5o{-
z0{FjE`6}C{|5=r<j$hdMKUMkk9xYu+@>PrtPol@#pSSXtsC;YB)XHCp43DT)wVm50
zhOf2vOKHEAu4`1jOga0hNycMAS*1ek0mD4vVkbjAZ3JUHqGDS9H$b>YES#4AQ83IS
z=1j{^f-xRZJuUx-oR44IDY!0zA}j*@WK0$uRSNJv<a@-t0%#$=&-uo*{3Zz8BQBYi
z|1{@YyUb2;X(_?iI2r2cGn`*FE&penzi3+ib!CKS{<Qq}asJ|I`G3#(mrl$7Dd%4_
zEx$NS_${q+iZADUR4TY=A3YXv>~u_(-wS<+MO<=MhSLT4#v+b<$5j4m$SW3c<T$2S
zcpUtXMGS=%XA4{8<G+E=v53RZF|<e50QjavD>KyJZ&H55#ZT4lZRk=g;_z)ub)C)=
z)~mYKxkd10KRt&_4vdY}i{R9OvHBoCN<b>J&K=}6j)Zm4z)SeK4T;3!G}V1Q?OFJ(
zb|)1-rnUF5gm}H}ov|8H_YuX<Y3-52#`SvJ>$V)0L?_NDA<On6$Yp*S-bW-ysl9du
zOV4zX{O*C|3dI+s%(>(E6Y^PpoT^eC%aTv1_AS3Lru+uMydJsLD>*4r#fZ}HX?zSq
zffY@!r@apswwI?T)O*Fp{oLH^mRsTZm|6Amfj6eToYF&y4R)stY#Kl5^Xa&}%Y%Y!
zmOPtxv3%zInCm&}dItP*0FcHJJU!Fp1-$(Xc>5Xf%TP{wMDX-X7s1oBLC=7<p8?;1
zyelam@bpX<!Q0P(x1RyO6y@ZB3EmnR2;P1My!{Lqf~SjM=-HrWz}U~gc^UFbC?D|j
zOc%l1&w#g|0gqP?@!@(2PtSA_y!{M#`x)@0!z3EQZ|O6^+s}ZvpWD2MX#l&z%V!Ne
z7flK80<V-oL7BHG$Wg)$%@Cd{LwM&=FDNF01SV_jG-pos2#KpiOrn8K*upXV^O6@p
zn%d-c7fL1a&d?S`3_G++v{h;2rt_B5+gfM0b(D~z9kPXKZOzaIjnX@`dbCk*x8W5=
zUK^f*{^2}eUb#jm;HLGzM$zPDf9gF?KWW6?p*4Vp2Ac*SNoX=^r{5@=G$^8!OGI=h
zAQSwHAkc`zH=d71a0E0)Kx4$F>Ad|8dqpGacozi4B<^vN$9Z_Li1Fvk`uFAYQEFob
z8PklWP>+5;+(B^yzONgqYV`pqoin}vMb53BkmfA#Gfo`mJ)qL<Dm|$tTvqvMmA1+a
z@Sm;@{3F?0aRtHCnddm!CN6@fbVR{ZIr(VP5x&%&;N1Bn1G=a<j(^>*G>r*oR6V*;
zy%ATs0qIItn*8yH6+M(q>3Ub1{O=?9M;Dcof89t4T$Hx_Cuh9*=S>d0$$>XH@CI?<
z`Nj>`{kA65DeqM$fc7o9AN=hTRm&*<*YtZSZ(eV5;7ty^$$>XH@FoY|<iLLs4(PP8
zMGHM){5u^MTlQKV8}o?g!{REWCuvc1>MoBs5f+tpIpE%abnPB(irSwN_Xeo_s=aE$
zL=))xGNjeQYz@-AW%!8~U8l(q!+*6}MD}@g@h3<>_i{j-3X75)mZ4BS62Tj?=fWuW
zdqfiD|1u>wZ3>+*PMrb#x#gn76n>NlQ4aX0!=l#o@COY%{(FTu8y0~aRvHyaR6k(q
z5}xWHZMWAg+pEHYB~9r(I{s2Q+GG6NP!4>xIVNAy|4s0BT!>DpASAydIc#AfJ#}U(
z0C?~A4A(|E)_+hR_<Xc_HL`>81-l@R2ax9aD6h)-rmB(hR@m+Hy4=W)mzW=@KgAsF
zV|s&DI?{*ouY;Zt=pl59o=5kgwYMWlI)eTj*z41I_3U33mWT(-1ef2S%E7<)?3MiW
z{!64pytGcNv-L&xZ%Y+m`I70G@q^)vf8i|nb;0=0Zq3)%$*_b!FTGemn1NnD(L1pf
z{R^Ofs{Mz{xIY1;xxBEwH^4qFtp5-QT{ZYOqyGDK@y<N`_Y?V!{tJ(|N$Ee?hlyvx
zNBWL*Am?Xz%98r6%2gWrE%A|lyX%uae|x(A0)L2>XjngCUsbsv>!Y9aQP!Wr$|Zgf
zLe`JmDRk-~+avThatIrWB%dHwTNagxNG=qz?Sbs`PG6pVM&Bp(b1HZz+kf<Z(C&BO
zo!pWm?9bkFm+%53^oi^zO8s}}n}MD!irtp}N6e6-@8_^b_?z2+-=YuxsTTG`{Pg&-
z;{f`j`v4C8n+lFediHrjM2{$egN1*4Pe$5@q^CMxf1yu#_!l*z{{g4}us??$@?T+Z
z#0==vzo|2zkNuZL5Bsl(Ll5`^`f5#o0li?4MKAD2ZF<2!r03>>zQ`W+zV*3k?_BsN
zO7q3=z5w;lPyGWwsedeA;`aon*<Hx~{=YxITxh(SZhvZluMRzg{ss4fyOm$+<P9I#
zUu`zX=ELI;>1$5Y&d~5E{WZ!%Qv`377y9qotbYoYApJk;lm9*gd$sg!X*LM+nOw_X
z0qAXn{Rp-n57`gKyQOB3i<rmIk0|V!D4uFR!ul4&_%cTHoWOVreOPMx{RGR)rst|L
z{&+pqzFs8D?eVq-nY9>i-Qx%388u6IyqSty_<bcOEc{=g@vPKK_<V#9>8hNU=`B_E
z9yQCnr6`|@pg#1)!mm^L(@akpDTv~iH)@7`#EvOL!pCxDW%vo#F!u-bV;GM)9#Q?P
z&6Sl>`(@NV*`FD~E&%GE<@(-OE*!Su(VtjG?lXx62FtfL7p9R~gjM||S&a85lASz=
z<6J}iz97(z0Rk7LO)m*ZUW*or;+zOIXKwU}FJk=sD)^l%@<RS6SU!MXY8Lrv=p>p%
zDuVv|F|gyJ{==Wt=0q7RMTB)pKk<j~mryzQxi}YKeJZ9p@Tavo9~?gTP4JK}+7E2s
z4xh9h{*>%H=M_7wa?;OzKHf56{UrYY{xN?k-U7We-gg6@`w0J2YX;aq1gRYL^;}Tw
zAO(YYOaD>A<t#7Aqc&UW=sWz|2I%uqf~DOH;42*eqZn^kK9v%GZ8k8)X=)GhsoR6`
z2;*6R`UiXmGG&t8KgN#LgJh3kfpw{MY>zZP;IlHsUlED(_*k2bkbS;ocKeG?QGKhN
z>~D+|fZ`qESEfYbt2BTQ{x=iky<|b@a~sg#GRJ<O*(#Q1BXInaYx!#*LDrX`(8que
z*T#Dx|44-7OaBCwn<YWD#ewohDrflv31TePr&3#=o?Z@ke+k#6a?+=LP(SY&@8xnD
z_-*@+Erh=Kxqr0RA-+;g`r)_rgT`yfdyLBHuS?z~Zh1qU9C?RW-lbH|@-A`stK@I<
z(~>_Kypn&d8R7axR(&(d>0(L)zED4=>jSuj@yXGLQprB050Q*t@|NTe{V4ReB;N@2
zN6FWKKV<SmC&sI-!T^8n`t_o<g3^e$iC<$^ef27~r>}&CFGF>4B@FhD{)`F`Dkx3-
z4u-xH{_^9jq^GJmy{qQI&$9hPpBLwfLSJh6;@@zZ_+?6d3Y#B6hd<Cinh(&PAJh9&
zwr`*wyF+{p{)OasKl)>pll*ef2k5V*&v)jyvxGp|tBeOUY7hAD%t>={%B6g)`mtO&
zkvcZ@s)Vo27Ek?#{Jm9wzgK`+bOqTTygSJA36zIfKi(Z?|HbLAQGd(0f9y}#o=aHO
z1<Mb8$rTrq`LTZje<HacIb5Xwjr9%jS}`fy$hGJ{$2a)JUZlhRdo_Xatj;Vcp;L-X
z-&eOHK8muvOr7~9`;*B2NGanxcSnf)%X=YDwv8aH>Ku0VbJ91<@w-}FRRv9;{?dFG
z{HO7k&II`9g35E@NL8-Pofh*^pJfXe>i>L<SGEMWKg3L26dycA{y6GGg)ytYgrDIh
zeu_`a?Eb3yB(bUIP#<Z~AB|KZgXsTP4f2=tN0q<4Yp<e)$}xYrs|wEe6|jub6G-pH
zNP+>C(~Ce)Wl5Rg!e13D4U&LZh)-=5Rk83C8C;m@I})slm2kRP(yLX)%8G4xl22u@
zgydOD^nl*l%HlAk%P0+ds>+4wNUI3afnC^JE#%MgA^k%As$7)g2W}7c1b#$qen5VX
zKKz2?=~zrm(Qq@>tn@V|^{tr3bI7aqK9yGbi~jP^&in^GwTt9<CWt=RM~vn>B(f8d
zo?7&u+PC~8+J~-@KK_d4)81Gmr2xO`Ztm}b4^tB5lfTdyAI1|__yuu}HxRD{N&h$=
zdIs$`N&6Hk{;1zqQUVV|D6k>EkjEo-c;w&UZ-QnJ0AdWKK8hb`ea1inS*)*y8D!D1
zKO_FD`ALZSKZC!G<iZX=(Er+O$<+V+#AT~3doT2-wOKK>ZQYhft@2n-%9r^Cc^YO!
ze!xolQ}A;tO!A&O@gC+c`Qy0DAG_q~A^ULBuy>}Xn5<`w)m|`%gG6F%El~PMzk<2a
zl5+=xtC%I64i`WCiOaqqv@!O7i_KsZn$I$0|4|F``qV$JV0`4CYenQIfzKaj8ByAl
z!wHWOt~7@<oR-~VA42JNN{>CxS^#=^J%<!p$%DodJi~uG^JTaHuF4fT{^Tt5HHUZy
zreXEB(4SZ3Vs7|p{(OF}2q&i0Mf_9pxBP!BH;20F_;<6M_1kJMk}EB;{T1^+bOHZg
z<oJKUSLG@=T}u9e>c2Z$Fdn$#0Ui%H{1MK05DKBDYdk1IKuu}*bI2F^fe9AbANa}R
zfs7}j{xUTll$Ft`a_$e02c=~^KG1p}<Q3UpF6Rg5und7TjR!32*~SA5_wMl^mW#!z
z2|xJ<&=c7g&hzIOPb%F08uZ5U=vDsOiuciexZLvRcnRRHy(+%2{C5c2o9Dj~?YjMU
z&=)g^9vUB>BL9W>ndD3Kp?_$f*DFV^{ROZ8Rq17+7{L?Nxvjw}J;L&eSpHr2ak|Lz
z?|NyG4Nv?IhA^Q}{v7?U4HiWx&HkSKd$fq`J!JcLB|q{%?B8ST-=l<I(x><Xdk4O#
z8Its=`6B2qQvAbw5$!FK@l%NWCE6qZ&;Hc*|4P4X|L@W_=wqakmsh2Jk^e`33BT?C
zE15I!=UBf1J&r#gVSip5lltbP{=k37GMIx<ee&<{&-MGt%JThpne_j*|1JUm`){x1
zzq!8czl$9IJ)^%a<M}p^U+}kedoW+cdKAaU;Ah<)pm(JYWH|hh_Jw%hKqktrj_vJi
z8I5<+-W+{V<D2xqh?k3aex>|5<e|#deA7K1Kt91-K>1_vgvPgu@_c_=A^hZT$tKA^
z1Tp?ge~mQi!~c?I3Z5f#c>v9mzaR6GJTRVw<ah%4pxlC&^J7Z;18jeU1pKhd86V`0
z<Eglmzb5_7VSfOss9W(0=A(?C<jefj{j|Se@z*e;?D~0pJ#_^5OG^nQjY<OZL2ss1
z`4fZZ3$z}_{)N}Ws=@!T!+(|sFRw`XKps5*-QjL87?%FLljkcK&v^9Yyl0M}y^xGQ
zgLZp<_8`38IemRGmMbE1I@NfM@ty*h$)x4~W4S6+AG^aYH|OS!4_FT-b?8+452zn=
z=EIN&;stUCxS-WYTk*ezKaiUTZ#P*D0O&uhe`(^VHC`eO{D{EE)-ZnXzut_L*l+z{
zz8}bio%*nkV5VHg^W;Nee@nAPXn1mMjHuxCmkQQj)?uu_)S5-yp8(01*27H17Xan>
z2<<I3D`FK$lg|^UX#QPRBn5^vruy|}tc=R}V*F(hg2m{g+KW{XJl9A0Wy=5ad~)GJ
zjt@?df|0#HkN6h-otnC1%}A%RCpBNP_=Wz@Wn5GR>vOLDvweY{m`&eCqOVL1sDS77
zhL}@dt~c-mjDrNk*XVBv;pcRuk?4(}f1OM(;?JNNE$7&W+6TXaW|`9-><94;>;WvN
z^)HsUjF+RtU)tY-G#{}03w?mTV21m4tAD6ZrVCxL`tPa_e6d`l>~|Cal7B*bfm}Jr
z<=?Ml{i!u$dHPA~g(M1|zrx<J-opJO^2wg-%_1JpI1PC&HDlQLQT><vD6`WjC;8>`
zBkb@O@R3}M=Qnac5%YmuK&B<V-bgNj4&kEsh4~}Luc-fCwhzcZK=h8N{X?wxV}QYR
z?heE=&~Gyqw&nj6rOTr#Lg4vk5bHmVe5gDcP!5XAmzrgcd|AJXcz!J9&HXEbZMftu
z^_ls@@(xGYS5pS)r}@TdE|l{PTfa|3ehbT3zaOy5q2IwwSo%Y<PuOpsKfw5a`aIti
z>!ClekIOztOIwpNUMBs!FQ66-P@m|zucXxOFU13Dekbjj(ysX$@aNfYuVDJ2t++t{
zq%0?S^L4GjdK5{6F0xP3R}MF2JPCPN{-ccj$6DKeggG4+-Frpwfgh0opY(_qCBG&q
zIm`Ze;r{!RdzJL+Kmz!s@53E*^LiZQ2YnG_P8|P29{1<GYsr~LD>THj)#3xD*Gm}I
z8b`vU@Av0?YstN0KR8JGbe~#3UwcBX*W91;`^Y!0^<x-8xxQb=#0nQwiR69M!uK4e
z*}!)x_TNAGiBDWMMdKy>?-w%xA_o_|5d72qxggUMm}}7!4A5A$_AKZD{`+$QF2{Z#
zxA#!a7kmKR!FqO>^=n+n^=9J#uVjDOK4|?1^!Vh@zrnzf%V{|}I%(w%?Q$huN`qeV
zM`svB%C~B-+V7z*fZrDV%rD^A%@rK)SnHvW?gO{p4p?FumFLQ3{6|WO^arp%L;PMF
zk^H|8>sb`8Grz&Vqh`<xipSQTk^Dq^JfUR%bGh8FS$pQ^%#T>E&BOaU<WI0)@&Pjx
zq*!Kc8Nvdzhw}y;uXxma3-~DVw&i($jw5jq(j~v^fR8xVM*52SM@@f_<xlN#`+*QS
z<hAGSVEIJ%d)qjkirmTN(fz(QYJWw=5pMs{eJG{<P(p(Gbvb`I*_W5_!#)pWDtL6~
zc$4FuxpD)B*8e8FD4)aeo}B-cF+G&Q{pb4g<N`0jpY@_V5aRyZ^hFBji#qhd|JUVW
zOi#q7XMr?-!dFl9=@Ay7oxWJQTRZKE+2xB!|HzTyvh{0zLE0!tFLKi0&nnYfLGeQ+
zQ8BU=>B<V4uNs`@^0^f6csYGUNBLY@f0XcCzJT8ilkgstFQB(_UFA3+W7RK4{fJKb
z!{xoj?Ej9D!yx<m4)n)5Z$nw+Z_LsHeTTMK`u-HH9~ZIzA#k$ikC{Q<-wG=G)pPUm
z?BV`gG3#SM>EEOKV2>6)=wB>X!t}}Uz1r4io%EUISHk+r>u+Dh{BXZsLi#&p>!YWH
z#=}z5N7(m)%rfbZNPZ6sq#O9P8#&(*&<F24Vx6w^|IvNGX6bKXf6)Gc%w>-LF7lv#
zztcYJ=Tb+Xxjf*|gZ|y0D{|}+^dHD9arC|3gLJ*4->82y2XjOZEc?fKC#KiW`dbbC
z53P0E=TWna_mdoZxzy?ZBE;5aI79(I+cVR9iNg<+!@hZZmHb?ew6BC*8S@kLKe`Xj
z(&7)(U+3_H>95V}KkT3BW&1~c_`4vvTc(fwODIJ69eNf!{9yjM<O_dyAhXEDAEMVK
zZ^BRf0!56U>8o-2kC$r>WQ@H2=F5Ni4E&5Z{A7A#4*$^JflRg2f29wkPs~527cZPq
z2I-&D2XZKmepWgCW%?@f`j;o~*Tlbar@t)EV1a(d^7<d$AIj6uqgXGb8Mnhfp2<r3
znZDwn(;s|^B3Hrkwf2XPn#BeB7%ZX@*Cp>_TmP7UUO9bY{w#((=NHghRN=Ho_I8m&
zFWa|YI$ZY8QOF}S-+?Fk%9y_S%D$n0Jb!c3ldpfe3>cUm(ys~^J<EX~>TAoB%I6f&
z<8k`K_JHY=4UaFI<mNIx6+{oyACS{orl0KJH`fiXn<aVv*O!yz$?>Lont%4?Wb?9o
zImRb7TUO&!hq+)G#38SVt)j!6?@EV}zNo<}7WMZcz1&IfDnq*QGP`^ttaX?dFLk9+
zzN{eKu*5EhyjPhe^J%=|Nyp$tfM4L?t1ZL$R55=!$}5Oxsii2t>{7e_1kpF&!8Z}s
zL2tcX&h#yJ(hn28OE0m@W%~;nk-_cvSEGChMb`3S{Pnd~lHwuI!wh4328#i|z@g_{
z81$CRCk8RT)N+(x?6g-;?O(ds#)t7FzkgSw{E|g>x$K{t9_pV%5BIOG#)en@AqG1A
zsWt5K=Oz6l5EuP!dYFD<sEdAw9;SaWM+b~k(N7GP@*#fU(2I3R`W^bDd<?5tF#R=%
zxYecTCx$xo8!DherlOx1>Z0FG57SQ!bJ6e6!}M3%(^ZO3i2uY;hyI23behhmLBAaO
zB>fIOOn;^H_rM|bqhtXw)S<tMB42qa`iY?~`rY&}{lqXyKk2)QHwgpNUqi#Yyjb5V
z<{L;k^p`vIF#U!--A8|*Kl9yb;HNL-sH?Y(>~R4x)S*A-(9i8hF&d~#*#j|{+wUiS
z6;rq+FK(ZY6S>mP_LJfl)6eaVEJyz<=2QP2{>*pUllE0f43qpJeY?nMkLAJV6J&dm
zKb0g<$sh7BSkJXCrVrzvb=8-FKa~aLc)8rV9%TM710{dRzG(wSUEDq&x0D%@Kl6#f
z4u9nOECaAUl*CA2vj5f8e`dJT{}@eY<fZyg43YiciuTKWR<URx`YI}kAzXiCImwed
zDDhte{6TzmL6yI=xG=ugr_W~rZN=kHQGAa%fYl)E2l2a)<2SWFK=D4MIo{{;A}=YB
zwSI7au7v0J#cF<!_@IdRV~xKbFiT{7K=W6wALjEPPW=d{OK83T{$l>ClfbO`0nfkx
z1R2yQ%B}gYGhcAcZ{>WUjrif3FSHT=ocSwCao&vafu5sgDF{>aa=IkU{X6q=Sk7nb
zBztK5eh}*}`zzWgzG3{>k3szi_m}n;L0@!#WgEpqQ<1xAK9BNPgxcr$7v%>swN89?
z7W0x^3x@-|-hlR_`>PoLii&Uu^(%P(WX0o*e@+|eztx|QnI$Zr5;Y&I%PnXj1?KtA
zDVh(@<uu15Tz{dW?@p%gK*sOHGnY{ORjt~W^ZmNqe6C-q;;RFh8XDeJyy!!J_;iXZ
ze)Lv4@r?$3DtG2XXy2EMb9HAv-ISNc{7CwD6`$Roi#hYjr)Yix?ppc_e3<Xy(1fA~
z@o-&k5!1Vn=TB5_EU?S#iC%X+M)QTJE54+3`64FBj@M#~oca{M=}x>?UyiiLiRU<e
z_T|N=h4I?a9JZ&SQEVT~4_Ew#`NUDPVlBzn(wD;Z2dv-ldbLwulJD?Svd>A==!%Qt
zL(B)G`#YTV6X@rCIX@P)(B7-}^7@l6N70jl2Ypz7sj%aDwvRb{zL?wRw4c`l){?!j
z{$B2^$8i0*j{ZY_z*k@|oo&2-PxE`6uZiw&SM=S@`svF_aajF{?qAnN9Ao`I7p{oz
zU#Z$-dx3tyn_Bk5^p){}Qj1@8xizdGok~Bz?+S9r4!<v>-b;Rg{<_>MhHqE;kND7u
z-|DY{zpkZlLteY?MY_fvA0d9OueXY2{L$*f+gK0lFb%^lUqtas+)oA;C!HRt4#NNQ
z*V!s4jrAOi4^Di6@!OY^=BW6C@eVuW)`j(1tQYx-K)D`7^99vB>BCpIqI^Mw%6XOS
zUBxI@-I4l&@uk7vit+|(fAR?6YyH%rT1q2chX3fqleGVbbil`TVgS>BE*yltt+C5L
z<wyMrFO_qDCaJ%bx`8y)HyI9r9xtVtp67`kO5w8Usnv-A-2No-$A}O;2J?&XHR!|u
zruRjnPw%wtx7Gvt3@_DZ__KsxiHGn*-mvF2Hhg^m>GBSM%l<tm^yvQ1HX09|^^<jN
z<Um;;dA_&46G;ir@~n>#|LSdiMcVE1dXEnOv&u;m{q<g=-+}jaIP_?M_qID}@<;7X
z{}wHWzIL#_c96bee;`laPaOe#tJ9weIv=pokq^@O`bGL3ThH{-pWZT!^pp6<@@4*4
zL@3Sj1V1}WpA)}aM)<)$>O1+<S|9PB_fHMVn_Le0)%vNwjy}a4{<FL){e)2J2g!Q@
zAMa)TVEyy0vhmkz(EoPHU(})X&7--Ba(lf9{;$JqC2`2>>=F2*IgY=T{-T1$Z}t}$
zKaZMo=h*%a`T_Z{0Hu5`3U-*0l@>a2PO!ZY0jz(pC-P^GzJ#GaD;@uZ^L+Q^%EC+^
z`Co2tURls;U->UP9^rJQ<4-v~cbY$?^<&mg_NN`H{k!2`x5B@!YpX;#>+4fA9$c>I
zf&T=5^7Q5GaxdhA1#=~zbEH2mdw-DDSEYZH`bPHSviFB|q+R+wO}}V;!R4<YFY*s0
zAjdx}V0&eGPllm?1}~Mf{;+*V9DP0)1pi(3^WsHl-{;s1?3esK#oN|+K>mu_Q~nC@
zUPoWg21!5W*yF=jq`xA4XMJS-D|hT!%71qMGa+EVF8^{?2md?Se@K5)jr2Mm53Tsl
zk_V4JUtJFRVx4GS+7sLF<&OQ=hqVKl4#%G1f9i7U*&kTr9oxHQkC&0WTBUt+yhG{s
z^^QG}y}0bT_8Q3NJofnFA=G!-<HI!n${(-t<h_gh&E?LwMkdOj4;@mT)JM>NG>7^S
z7|Uy#{gS@8{E-?j9e+gj8*}U%`hL_bDbKg>(pbKIpT|FqNc$&(+5RVwz#m0)B3Syv
zF!pB}{8Y~N-%I>k>G+3w>aWW`^%MQItbcMmllny!l)poHgKqniNuEzbL6v>H>uTsr
zhZE1ABL8x^JAOxehvj|ZpNj9uen{Whe*U!>{E+L%%wP5=MJ2YsfxpWeFCHv|{e%kq
zPjP|&S?|oh*uSi5BYZp`Vt;gnGauvlZFL*XpLo8*{-=}mb-f+G`yKr}Oa4a2Z>#Kb
zJ#YMk|LFuemwmwAtTgnA`~&fm<;VV`-LdyYVf4>mp6?$Pcu5}YfBK1E0iM7+`x#!B
zz0mmVbL<iAMfb0F;%$t_(fwEC#m9N$(K)iOILBKNve6e0A$}@%@IS=sr>rmRA5+D!
z_lPt8P0;>Vo;~M}hgc8F^Vht85p~Afvs5nQ2j)M<qXU^1hrjG^RyzKc(=n%g)F=Ny
z{FVAk<NE^EU*=ySzb6X)1H~Jz@p%{dgO(Pclh=dylKeRSllE7Qw88vfeLlMs<FRWz
z!Fo)G>2d6j*Pm;dzie+QDrfQ}{z0O@f#VytU$*y(2oWI1qpcLLcxiot>yK0yL0(=e
zSN4zk3pl<};}y{tA$X>b@ip)Q8so=$at+2K6>-z~E{tE59RD%@V1G3jf9g=5FRX7+
z{GlVk{l|Qu#+*C%5>-s|uNu>c8`$jSEKb)X5O2HE^X6U3h4{ne3m2*xAEyn|!~4}j
zbP76OY?%6-ISh|9)~jmF1q<v64A;N-;$>WpKb)?vruBWkIK8N8k*j^f4Df!pg}=sJ
z+`QOTUSrOmj~UpQRUhvc%%gaQ;el_l=`{?bh^N%vV$)Yc^9kY!r46$jr;6z6Wcq8&
z%1T7|W0WK5tEyU(m%iwtrLHvk&-?9Gd+7hgwBPQeE2`xJbtmJ;`O`({pFF?H^aK{8
zKRkp=`eO6vBTZwHOh+m!@#xSWt*S!WqJObje$hop)0~3#kLY|3FYwCqIeOJzE|K2{
zK)h6yYilE1Q{=RsqVp+psOR$h5cDVS{0`D|o(MA?`F<7XjY7ZLke;IVGm*Y0Bk*-g
zT;#ORdeFZ5nmDx&|NO-azmAGRM70kKf|mz62!6uD^&<wJRsE=`^M=`!LFYrN89wGJ
z*YW)fZjUzUK%aeHh|d#k#rc<jpX0AD%JVfy;{|k#X9nWsXNW)ZB=3nn#Ao>420q>(
z#5_8$MC)IBklq1)tM`5AeY(>$;Lv%9pE5sq{y#$J(Gc$zzW)M#RS;<->wiP%k-YRi
z%Ond6^+7M*Pp-gmee1lUm*7dhBfYQ4?{DDyBoxo9^Y~QG@hiVi1p2DJ?UnBjko<s;
z&MS>s_(9*@dwo*iZ%6qArLkUT;F;2hC+^-Gs9=HH<=Bthp~`nrd9;GH-!8}T#s^gS
z!&F{gAx@|?<DXk0&RA(GuaxUu+#a<*zk**Nw$lqL#8CxbPvr|M_`o2QFCu^B^HJCM
zg8c8^Yg9l|F8UW$@JBBhz6|hz01bV7;eFV<_tsX3dyt}X?$0F^;*?5bJgdR^Y<t1u
z>8*gTuV95Dc<$e26=H*ee~{qI%hie-*29hfzM(?gt;#We5xtd2^7V9ByL)eAg?I`n
zrjPm)2#`JV1$&|P)e4OAFXiRZ{{Vu^eLh1%BVEn)7uji9zt&1WwH)Q;<(F7#RlnX&
zZ{_;%kMdIO+x{5yd)Q+I&6k;{XEnxW`*-w*$~i*X2$%_t;iF>;UZu-Vs5IV}EPP+L
z&_2i>9<cP4^p~8gmB#qX`g;cWh<_6#e_C&b2cmSn(qBWRS^uEFohr@phW_qwmqUNY
zRGQ@r{e8e)4*fl0rIr3ps`gnwp}%oe&S~f`UiFal5WUdfV=6tH{!Uo<2tV`}>%LMy
zX3^hWs+{`+{e4)aXVc$v3P1M``dh2uXVc%5DxXb%A5!JqpZ^5<dj|a{{hUpI4fMyY
zzdQ2N<)jZ({vXlbl{4tCtxrmS&nSM@EB&R>oAsOY2l{KMG|L<M+vzTc{_ap|mM`>o
z%v}!seZWd9{XK29NBE(?6aljRP#XH%?M_2~kE--+`g_X4r}Q^y!Ox<<8&x^?2l{)D
zO3$XhPbvJ|Kj<$-yhQJ8`ny4u&!)e3tMdOW`b+w9KK&Kwk6V8`^V8*|5C2j9{qhX@
zYwMHJ-xG?T^-6!wsI=vODLN&2vAm(bhDvie^tV%`S-#NU9qw}I@0gWV`g_c3PwDTZ
zI}QDfD|q(D(BBc2o=tzTt|;|i>F<O}FPcSvaZbzW5A=7JO3$Xh4=H?>{+?6m+4Q$o
z;hRl=Q||u!C-A?dFXz)=(igY>l76_-<)ja@=<mh1L9eWf;)8!y@55Ch6I;LU`QPe$
zSwBV;i0^mN`_*F%52XUUGk?MRC-^=_;rF|oc%90Zi|Cy`p3n09<23YtT&-ua|HSv+
z?#geE@S}I~eTy0J$M#QEcu_v}>KOJXc|L>nH{|gAhvO-{aGjTK3mSRpIYm|)<69y9
zckiX+fPg1{u>E<b#0HgSee_L<dsKRY^dGPLKBLlXKlpwS?T|1(sT|+0y3R^3L>lk6
zeAr4;IbOH@o;!`tZy1Q!?fUrsOG>5LzVQ7Y>*Dwv^ggW0*`DP0pBUunt-ufZUsUC6
zKc#%WM3x^xIr<aFjE>r4`zcf3YeAahZ}cy#%Gn>3253KpFU&_V|CthxxXa7bFYN$=
z=Nst%t5!Mrhfv6_e~RMoqN0#ONAtxRv$&W<$(Kx*@N|(fWZM2MI(lEQ2Jg!_<rJU$
zltL5y5v2XRf2HtgG=Bg$<i+>`{C)>#KtJjIQBaHZOVa;0{`+auzke2#wAA4S`POFT
z`z6-<dU@YF!+fKPz9&Lg4gO(&8T0L+fyZK=Z)3l4k5{Mtcb;#fe?0$wyOYNL3X~Ps
zzhIUGea8MQnVU>Q{`h{U`hFGO-$(!B_?{RP?g8mfwjy0Z-`AAsXO4(s>wPkM9}Vpx
z%^xfhUmp?dAK*XaCH+SP?{BmGD-G^1egBE}1NeJS=zlEd305P?=a10-o%p_%eIZTX
zZ=f9h9txMkcC3rbL;OB{P`;lNsoewpgn~`^?`3&b;roare^pNJmleTV3Vy!|_3?cT
z`Uy9?{n~eXt?y;=`HcDaJ|dzEc>%um-SYi?$cxMIeF3cms>27$UX0TB4sftar4bM7
z*7;#-1oEWyBEBe%^`<K@pOe=KeqZ&C{yyw<{ZrOQ`UL*xeIFF|ZNINY`pNp{_ZWB#
zk-p)3S5^348p(^lHwpR=><#kw#>8{-d!?7=d>;G#HSjBn?@3ULQ$g$dc3$zjskM^d
zq~AH6@%hyE3Zk%QQqZaA0Z;rk^}P2_@%;^c|BD9=_`6GULC-su?)dNgemUSHrtanT
z)c1U<av?ftfCWDMWqjT|e17ZuW)+6C$DGG^8p}&+{mNUFD`xsE{kb&f_1$Fz@AMKs
zz^|xT93%^!`h=pF*2AE%%rD|6+9!T_sekA%t<Q1)-j4d54oP~yqv~h9&inBwhrJim
z{M&wi9{Tu%e4mf`<INQFP$+_+ALBWF-+}Sd`XAQk;15FNADCXqi~JpX6zhHYz4Ci;
zmi*vPLUdk_;lVHZeiGTA{J!W`{vH!sC^drhKz{~zx`=<!7kWRRFQNzUi=#dMKH+TN
z<NW{h`^V6K`rah<hy5?}+pB|enWpm`^!*3;Q=G?C?-wY4O8d7DmO=mg&ik5dKl%O|
z@eTVIUhl;BJ@I`9^}Q|lPw*G(L;QUv8P9DM)8e)J)$j)zcs=rdPqFC4Go|YhPuw3U
zw%7M|Q5vroQirG<^wr#_*1nOX_=(PQvi*`if?g`;@sG<9znLCB;LG->%E_MiI=7Yh
z;UWECen9^B{5M4($D`qC^dGoOx(Dn1L*fUK$KM;n`flF$jlqw?^{%_eJyYCvCH|s6
zwEktVL1TOYJmDjM%j>;Q$ux~Gyxxm)<4(Uvj{lwf{RrP(ep;@!zc&}X)59Nhk^Tw%
zxhKH>g5URqK3M$X?`e|#jLUez`y5_tM?85|l@Mcuk0xP=m$4s8-IM#F4^IES^4%0~
z(s)7rd8iEgCl>!ZC4H7XB@DKws41zp>4X2J?(_H3h#q_oI)GnJVtQWd`U~naz2Fb8
zA6n_xO22^$Zz%mcW~G12N+u3}EdHu+PuX9d{y<-uzwvzfWW>-poxiCv=)0T0%OP(!
ze-R%te~(J~z^~tTd|r3-idFZ2sQ0@kzTfi8XRp8X)g^N_Tr+gnuOF;S8P5OxMqR%h
z1Y7<6kKIW(D84DYI#tFK1?{jYxq!)g%~W4gZtX1;x45pt`n2yxLqp-!trqb=EUklK
zE(MT^hFP7c&YhsI5GUnNs6~EkdF5S&uuEj=NFERprx(ebhg1a%jUvYe-zJ}?)z8+-
z0&DMq>eF*!{eP19r`12I>JLgZ%wyW;EUdp)D(kfRud4bfSAAkhVf~fTnNO>~gW@?{
zcgq{M&vA)LTemBvK&I6{r|LiCs&C-Au>GIRQvaSyWW^ow#_iY4Q2%e`U^A`#2~{6M
z4`0N85@})k%~HRo)!#tMhHIC+ar=w$Tv-1zs1H@2R{xl)e_GYIZkGO9_b0%Ad%U!O
za38CeiB6SA^b`NBdJ&bs43%f9e?rxdyXq7F3hUGQMq!_*%H##=lRKyD2>L%$`^FMk
zepKF=f7DK4`;Q=xOrX#Q98>joNc&(cGuh8)!I+uqN0xFG^(|rfJd^$4bG0+pKdLtE
za<2AgvY!t?elxWnY?N&p^2UE>vY(&KQoml+zsJ@7O!XfCLuYFL*mBwM`(5>kmO@z%
zL3lIOUzL#cY3t3pYVc54|KDb*|EVTf%gw);>@ThSccJ`iTc+2a$^NJrB2kXYvJU^H
zLld~h_^%wJpweC|t+K^)c$lgFn5xf;3mCFLsASO??(Xa3V91R1RU9%#6>-t+GR*qN
zDB-%IO8(R6UoC;gl%Y|5sPuv{S6zC%Sjk_$wC)s+ZI{p1@3~WbsPvU%4hTOzYu7I*
zc%Mv<DZx;EoFl3nD*(Gs$-j{PvG*%Ls)&njP|3&`C0y4+CI4ykACW*~O5vzJd|BC;
zt1dl0jDN&`)eL3H>x1&9+<=ukO@IF!C1Am{`b$-PH9NKIQ(NxKMCF<4H>mnts45t^
z7uwI&=>PfZkEyzJTa15&^@;Bk0>u9Ue`dHIR(aN)+M<j4LG>&@OL^|A`&}|+-QD$A
z)>i-AmGG?GruYX{!KtTd8&Mu#FDop?y%UcbW|LRTEdCocTnRk>=5zV!@*++A^4i8d
zX)`l0JY=kBOcXU{d(55btdZHBT+v$8*po_`=}ab>8Q7OL+FFW=iW<|oY}y<$8v93w
z`i%O<ksh;Wx6#<!++<Kr(oARf^bE$A7<)#{fuU@FQGHRZsBLSRE27DAu{IfRxL5?c
z6GcmkE`JCgHg4QGINaMaXf*Z=4(uE<62R1MByYH;ExXGc&1Bm$do$Ve?zU7q+cPkj
zh-cEbjHZXO13iPyc<j0{ant2jh|(Xn_l>4{l6|AS%^;@<A0^cjO(TOned%4pgDLc`
zDVb;!i>jOGPc!{#p+Bwkr;Yxspg-;OXJwl(wl*gd@f9K#&gmsnMYWl%d1GSbO~z{D
zw&oQLMj{?>Fq)ehj8gngIjqPlv~~E!=PSjQ1|xp^3wo@j(Gy9Y)t@!9$$_Tfk%46Y
zjZH0gh)G{tD{8m2G#G6_(*`Vw*5;U)46bZ2R<<`7O$j_CRswTVQ=-9W4T(ul3#v55
zk>8Fg&A2zs6Gls-Q5*vX)LDUli6g>DG~>m*mJOn_t?Bm5Z}$GOY#=kxJD4^a!7Otx
z7}^R+rp@mD?p@oG^`+j+N84H!h_Zw?T#MOkVmseo4mLq7w-q&->77R7h?&MTX=GK~
zn#*J2L;a9%UH_`41U?wl8xvQWtD07Tc28T2QD>%BC7Rb<j=hD(%&wk|!t;>Vvq!7C
z;f#1m>uO2$WP5H*G_~IZfWg!%pY}1a4^_~g8zY+c7h>5>moF35t;=9RKk=5_AgbH&
z_<<&X?;hUMIQj$evIcI{^+5qrjk(K<GP?$UDHgRhED(RydU;Xbo@`HJcKE2z`>2St
z*NEp*D^PKCu<_P`RJt+KEsWI4D)AYxvE}k2@Sri%q=_os*CwJX|A)~3+~>JfTL=6%
zUMfO=F5U^|*Jb}qMB3gW#0T}@Wn%e?hMeZ{926RIn^%g+3ZK~41}d_TY1#oX8M(2}
zY`zJ#`V*HI_4b&JnY}QJH~`Ywk;a|bbs`?B8)?uY=BVc-9RzpQjU<d-v&UG!KAvp)
zj;5FC+FM1UuD|JPBG!CT!xukd&GW_jrEM)YeN^ZkEz&ZE#R^f=-(+Z^79oDCb)w0o
zUAnkl)0+`6YdF-quoX`?>Y`Z`_2YMzDXV&gPxG&92iLzPsyqH&^J*cjI<){_{QP^Z
z$xO8^(2ZRKYqU7zw@;il!Gjfv*7l|q?d{DYdb#;tU2D^?g?cq7<_dj_uD9u8p3r|K
zbet&rIezk7%wPFwt>pcBH1Tn<pyjYG(wg_LMNJ}cRu|Z_epp082X*a)C{JM3^MtNt
zMKrZlG^Y|xReE&jmwEzxNeH8<p-JxoQ<^IE>K6Q<bads1J>kV(p<kvK&Gkg%OZ7<V
zN?m*pO>5dWMWnw;YwAzLzktoAp?*!=rZ<J^%o-uSBm6yDH1UvtRf;}MT%qe*kjC$)
zV9>7>p(dgILI?w5)#hYzEV2E1-AJ`8_ZVj4ETRE@QA<aI$5*at*NbS=K8;9fF4wBt
z%HiBUBEl7#XYJBN+dNIv*LzClY2ujBHjAwgdvm3xi*I@({eLF{m6|B=X!AvJr50U*
zuXHwHi765H`aY@05?|KEodIpV$MYG@TdDOfO*A#Nw=dMZn>-8J7HZ;?;;kmM4>@yM
zkW7e@-+Cg6S3Ti{nkUlyjIXp+7r*h9q3ATV423%_!jOr6E#%_$y(ns$;zC=n5&*vp
z=-lUEw-#;MrK8gT)O(gTwY084Q<0`NpID_YKvUYNCU*IvP1V|*)@n`sOc!$lp^JRt
zs1{zMdm}B&eWivb?)2cV_Po{Ml}JRI{<nTn|4)VfW$of-L(`uW=<7<esm0KuZ7rx0
z0s{To=C*b?2u&>YVBPTXM7*O2!^7o8w-;>*Y5(qTA2Ek_4`Vdx?(1$#CU^A=_HRpE
z9|~V0Jl@^GlGUMbqbMr9nQy_-&{%Jx4a%BXwI-y$H>7{fi*G^dcLdk0YF?QQMVhi9
zyf&#_RpuQJijH9TE>RMTZ}5gz2=UQCcm;l<>TTFd+UKQ5@j*|xCFIeT>9N$GdBfZA
zBUCN=@>J6r;nP;?^{K?K@Gg`;{221_LkY)=gBXb4uX{F!{J-;Q_X%yesBHe2uDwr`
zenW>75Lbze3DfQwN-ZrC@4#o`+e%A1SGF|A7X*ya-m3%Nmjhap@Vw2dXZ>Qp3s#9|
zL~v11|9T+2IOy{{<QHwF;zqCU8lk=9gYbgi72;99?_%M3msmVFe7}e$eoY}ueM|e_
z3eE6|j|Gv^vjN|UU^F4L)3DxUiRSjUrg&TPiXc9&S)~00l|q{55)r&c)3$rHgkSXe
z#ch5apXA(Gs)H||4ER5!X@|U?n|w<X?eD>>#6JDF`1rD>M54KQWm{YPy~2OMCoUEG
zh|pgs)}HWchA6rR_Q6(Fv`J`w-E%Sg)q90+9dxZ!>u!RlZR^l|FKMwRpQil+xhn$R
zYfECdiC=rQ$2FWd)yj0Sx=f4|`wo;uoADAeJ}tbdL`!Spon@YNdVSN%R!zJm?8O&I
zn_9I?dQ$%bZ5+j~;G8L|X^#I)ENO<Q@M^ktsVHy0E)<9rY2Ogud+@?)k@i)%8SJY1
zOZ{p8j?s1~O!MCb#AL}^F`)GJUs@#QlKilI6pepN==fcO&xOPfL^R|tjRW)*UrXpV
zV<86Awnaj;`^3*Yad`HCKN1QTmEKxf)7&zzRJ&T&|3>KBJkiuYYD-cpO@B?I?XTea
z;zRz%OyWyIUtd!4C87OBh|fjzw-<Z=M);7q)hoUd4C4W74~zPnz9fPl7vlTEyVw{0
zxbVc_Mwd5TUKR*NR~)FQOPj@ki{r)64<W+AfG@lhvE9QJJ25&B4);A&5o`NcMKn2A
z97v|pBg5II8$TA(f9j9)en?xotvf6#Hb;c_gORx{ulmD@fNyT=V)zDKzb~Tq!-e*X
z`u^1R0KVhvzapUhE{p^FS8way-0*3kJ*&lfzAe0UBHC~nG))g}2<W#3!nIl;oC^fP
zEudmF5MC4TXn*bz)dB5sq5VpT79Ah2720Jmt$@f;!UHCUZw>gh`CjeTK=F?w+I61N
zTRf4rCq154p??TJrck2)qbA@hR(eHg%wu>W{g0LkY@+qQPn2iQdIA@Fv@vfqg@uAW
z;lOUMSRsHztcg~y*e-PK{QxpO+Ov`PP<r&3$Mdy_*dv<b{c%nBzFX!6+uAk!CRK2;
zs2w&t%f$zURvC?qe5!bE&GV&^ra94DFbFq}4h>*iFtI(d4k2Pk%%}f1zt*UUPiPuG
ze0>r5QSIkqJ*=zke|bI2%Hco6y?Rv~Ltv+0gGcZ5*JK;_d%!4Bu0>b2c*VPer5M4N
z?kv_m=J)3C8nN(vzRdqzQ1pmJW&*E6-RkpQ@7HhgX)QwkjJJ5dC|`<09>;`lc}PDQ
z6hA15wjS}7v_PGjK8sf=f+j+o#{RaJPifKqcSn}>x6BjTUuxoDz>85}4?<dPryg0k
zE{wSHwo7Szy}i>Traa=`eLW2Z1+Ux1oSsjTor<IfBcXVq!rMMaJWzZE0_%|`UmVfi
zLZN@7LVwPyE%xf~*J!!<-QJqejm2V7eCcj~B=rMPipXPLu&hN?BwED0_GZzHfG{(^
z2N77$;9$RAj?Y89bxDI!v?t*2?Z>^o=r*ya!3c>2V*T|V4J<=Um57S65_&0g>F2$j
z2pDg_9kTG2hxeBTKc0y9rh9hx51YG9xD~OgyMN&ObHi~_>Uo<MSagy2tsZU{b4&lU
zG&~|I7Bq=6BrB?$+ePiXmUgkcxoJeqt8RT_o|u~`4tTbdUDDG2jd|kJy4YHFNqc)D
zp#MORuWDP-T~;0n%=PYC;CpIb*rSym>uSx6?%s_UKi<?kC%j24n=`OU_&%Z|(h_%s
zv>!&oM?|T&Ll3`DhNZkncpY5GSipa$(E0=617iN-)|)naOA)_Z<1Jq~=GCy_TP^}8
z0^X|9;4MA?>v-wmQ(g@#Za*y6E)M7$gy<466Ens=-jd3)SmIEnf6OO7>(}sk$z{Rv
z!2c`}NueW<{%MK-x24*KfEXwXKQ<Sk7Pi~2D=T_Osi&k&uP+XKM(98AN83ikg4XN&
z+E%~VTozgRr+(jG2>tVTqt2)OKkU7CbX3(IKl<JKlsPk#%$bs2$RvbN1DTdgs0k%>
zr3)yAkYpf|gph=yq9Q0*Kv6-kqbOp%*u7S4=(SwAR&4ij?Ojw<Y{~Dl&zu>Oklc6Q
zAMdTV-nwV4b3S{2zu(<^JxLC|HyS!rq&2TOlnIIa8lq*YtEiYLJ%NTY`XYtR!a+Ln
zE?bF>Pp7H_O2S`SSdo@jyBs~HFL10RmA=L%p7d~_GVqnfgLf)aLqk?Rkhwx*h`AZx
zEb#$M5?#G3i2uTzp-jg@^pRIJG;Ft(SJe!zx*?sXd5^H#n!(k}>IXOYiW_~!OMS)9
zsOLt|rk~lTIW(EmM_Il;&BgDd15rb+;@|PU&B&2G1H|I9##xyUWd~O-Q^uke8r8gP
zRSV{+W@V#-cV`QO=a8%9C5Nj-VDuea5^N!Q-^QPpk?*xuwEv7`M%QLo|HLn8h@D`=
zK<<deS#pTHkrIjHvrK%><j%+^tf)_<<T^0I*HKv{>^rMK(~DTP{vGAG&wK~}Xrknd
zvkEvpfwoJg0?v|{p2V}6li@dg$Gy4D;k#IV*ncBBhFh%=EO3PnGiUfx=53CBoXt~n
z5R&kZlAw=jE-k|Pz5W}^bcPpmS7a5c#^YaqBwYF%!ykpR^f%G4{Dv?xeG5;f-<72L
zyDWT+2Y2+_wr4^ivO(yH49iTJS9s)^a!zR>&v%t#%R)jf+%4VDoTbm9Uk!!Jc(?Q3
zrQt`|5Pwl2A1DM_PM)ae&n5PCu28Hex9=G@8ou)>-&gdgoNxx|L7wR<waKVAU6JEo
zf26b|$k`yQ{S6{Lo4NG&sX+e+&$eltl`}RXk&3vk@qAeN3B%`3^nz`@>E=WVFjSzg
zCpvvz4Q}(BS(5LcirfFNTOT18xP3v3+m~-~`Z5hJR*D(g)Ni=Qmz|R2e^^d9lXMr)
zqH1{SQts~N9gSf{D3$AjSibYjw}`)n5>HwB<M11*_kjvuB-2ZFTNbgGm^4<Vy;=Hl
zHP7ukJ=N`-oa*$2#NjM9Tz2|COo58(G0gJ$|B#Xbs_ODznWqou{df_Cd*Rh!psx_)
z4a^<5km$8!N+#ub*$#gW=b4yhm_zP}9X>6Qy1ru~=%J0|`e4?Z0N<c5IFi%ZO#hN+
z+WH|Us#1-t_v3wOZeN;^z~{)cpf7fUv7J2E7rdCW2m1<_AH`7PHzg<7st(Na1^<Qq
zz7_e_){ov!;1`lBG+J@`zBeQ<q~y@Q46pS=6r!(sXCD0n-uqbY`UWCC{~b2?ET8%r
zKGWageQX0beNIxN><oRGuHCCqK$4$J9O4f(%6(j+YBY7B6DZFYdWTU(BK*l0%G{Oc
z_PG=L_=<<9`tJ#Y+`f}Qn2UE_VxF%U$(sgsI^GNjm3(1%t^%HWJc}me)3u1VI;HC?
zJoih~a~08O103PG@E?1V$xX68fEVy9P$O@3IDO0ULjqzi%Xz+%3hpj>obi6bsccnu
zCBi=qaJv!jvP75f!Gzpq-$UZ_u=xBnA=K;(e`WY-0E$BpTO}jG1>&<*d`=gijpDOS
ze0JfF3NC4fYNRK;GKr9+xGkRR2jRC(E`DQQXJA!2d)cMj*q5E8y28^F)fG<lC$grH
zjA7>79CeUe8p0AVPMJpz^l-115-}%O&mDtM<X@3c-Yh5d=P#iznWT-(rGz0APSfu#
zV6-Y-xjvWU8og_=d+V7#>Oa$2HA6+Q&{teiQsN))up}qZ!z7iE$A1NG)c06^>ysUx
z>*2R2*{9fep9Bfbnv$>m%fYIMEl;-}a<FqqI@d;5V}QXp?z<ar)6+aQTdJF%5MWEP
zoM&E>OJ7TTjg4QG%g?a!mAQO^l?QFkz;-K5%5?jjHro!>p61an_E`A}8~y2}kGu?j
zj@mD~0xcSU#o`Js*POu*Eq2uJN0Ajg&fwiHJsm%DXd~ODWJ&k1t5sLzD!e=JZpV8i
z-n|}I<U0u!e3nJ`u^jy{<#^6~mvjqd+0s-tPgQa~IsVbOvpk3Cp=8AsyoB@RHfQiA
z$=TfILme4RQ27We?+;pqh275hTAQAYx;*qN<8?N^#739<Ar?9Z^^=AFLvx1KICbnw
zOvH~2Kf-@dx#|kOrXtM`iO;ucUL@#Cb_Hz~=+Z3ES-C5ChlW9a4$V&1KcGx~C+eSU
ziT-VM%ur~ZNL@N;7vDA}z8A{kdxKD``xKR7DzGxdylA9@L!a$W7fJfAjMN<e$Cwp)
zP0~kGswWWfOEYBB?Ai=jrA&T(8Xd6d3oyGgI#a(PUE+S*sG8znE-CzCWk^l&d^EiO
zVmL>}i!(G<z#ml-=cn_R6=}3e?-f{<C-4syc{^55$g)-CgPe|*Wd4GJ03<;1fg;zF
zt*9^mQBms2o=pr_M$e+e^IY7k%Cjk><SCMLM%VGgY)mt$d^RO4P2{KO(rii!_Sg8g
zjJL}Wg!L?iv7aq2->}r2<`XmFPsQsCH|8m8GhM#LsjlXQCF~cBZ86fy%xkVWRVnp_
z!>Gp_74&^E*YjR3#+lOf#LibxJJ|+me1{_EacA)CfHFvP`WB?=g<65jmyv>Y=DTE;
z9v0<)1tyVtrJzvfh>ur%a#QFQ#T~rM-rpBqn`*}u5VtA%CKa{iWrmS28Rl&NCKomP
ziykq)cgQ}dFCw=;Io0LgkVI=Ro?V@+t|2{7<+iiPlCQDng;}aatvbvTdvhbcH&K&C
z33S+w4oGh{6xqqDe~SniuF<oyk$Z=0{56ZbUdHJw{hK7!7G(DMHukG05ALSA@iN>r
za-N+($++Dv^e+Fe9=eBlf<yW!g<QXfW#eyA?#L()ocst4&M5ey^GhFC_4}zeM61-T
zOdltux&3K2w|^q=)YM$RZ-wIY56={V7=e}=beF#t<1Xe2TvDKKWqsI{GAv&JSLP;(
z7X$0iemE7?353)D1S}tx_k|nw>vHsdl1*Ce;NPaW{aI;F|C@;-O+HaP{+)^Z3Mt?3
zZ?GrMRv?zePPDrtxv;tOn-{_CHqT9hM}+rjDCz5XZ@ARA&$4Epg}uXB*pjeU<0})C
zfW>;Vs=vh2`I%O?e<|KMm{`iSiZJ!IBTSiAm*1xboBe_35^_UD;Y9UjmA&HC-;>gL
z4#dlJxBrOd^8X=2_(jd_-+{M5^ZJ9kGueAmW(X-ho@cQVn>XUiwj;{fPj>!Ivhbu1
zbVs5u{x}v(YOlpe>Pa4U9n=4A$+BK)Wp~TUF1udH`|=9i<xf{~oAGfq4=$%sI`>-m
zQeunB;c0oDD<||1Z4m#PgMSChkG*hE*WJEs9due1kC_{IZ*~u(hb7^Xwn;%kr1Qm2
z;RXU!=XCprrkk2c1ww(SDBk6E2OdsSpOPS*#WGXe5%FrN;)OfcrSPa)=xII^QI8<?
zMuM;hxZx0zX10>fZ|m#wM`Q#hf+;lrXF57W2n{nxk9mYsUW-WVtt^Xuoan}Mfh#gB
zkvzlc8e}6Gl<13$Rhp39A;IBd%qhgqG7p12Yjl3TN6xk6gr+C;artK?!`*X*<8wX2
z2jbHzKI_EiTshS2FUhs=l_}~%Nzb+Pq9G}?p6Enj;>9$49u=RX>G;%1_#|b(L^#)?
zZ%G)$l5}?@Nk`@{C~!wUgg?c5z#-mX#9!2aQ7(FN6SxsA?@O#s(qFY?*>WxHOHD3!
zh?I7RKaki438^s3<@7)0q+8O+&|ZW@IisMDzZ5|UzNQV}6P@nJL?@ODPVMWCWEqac
z97E0TOG-g8$VVw2$WAbv(KagxP&)tA<?_GA;Xqt*6Da)(@|puTx%vc(5c+i?k;<Eu
zOw{u3$RMXT;=gwUQi+zLP)EZfun5lF+^b_!j}tECYDni?OPV{-h8)x8mfsqVRLN})
zJOEb%=eyi^4R;cZG|r|e=)f6{49m{z(<LZy!~b}W^Gu|{R1ZSbi~erpEH+UVac9l7
zI74Tm+*=YRr0{_@1u1b+u2oNTTHS$^Uar75!p(G#sTiia1CPT4kHZ5#m^s%5k7m0A
z%<T$9`an`B-o$y9*axIrocRI2i1f=62J%CQk3%BT*Y$EovJvT1k>t@AksheCB8Bs5
zT|9;VG<+#ic)3UTtq}dr$8+3)UT|lmaOc?PTL&Dw*qI!7HYY1|X{!D!3fX=*x8I3L
z#9vbQR=6bIb)c<t!ZqXqGl>TvD)OS~i&#&|bOmxzGY39%nF<-f-i2WAawP{o@#X{<
zB@g8Pbh#t{bRlKaGDOObwYnp<usQm|_Rs?w3Sd8xRh?%(Vqr&;;YemQY14T-Qnuag
z4FrQ40%5o|Qg;RZA;Pc$uDt3sYioDtQ(`A4BE~9E7CypzoGZwt)K4-Td)4CQ`F(g<
z4$pEauPSs;7S;AbtIqr`Wti*<T{K=s@1Z}F@~Nd5i*J#Js-$sjzLx@uQbp1d>R;?T
z8*RC+RH&Zd{9bYuW`MJldu7^|Ku?c?Ve)V#p9+fuXA`fGq$-k8=f65x{e#%$PUUfR
zKrr+N@p7CmLC=ZXAoV_eRq16GN<5orwH9^8S-hZ^1$t+)62#1MrgjG72N^p@rgOLz
z9;kL0j+CBFNJNefbG3^?WQEka1o;k&CwOg+w1BZsbtP4GhTbSu&tU8iU6?8H!^wx3
z=`mZ5@K~u+$-{q<CR(6A(WzVlBaR$6&$o++p#2l=D>?19MZM-JxpbKPFXB(Y`cyhu
z5kV+sY-1{YEZN|J?Tp<nCUFrQo+R<}l4ZnNNhv(=JpLhLWy85A^psb+hS&#AT4Uje
zvIqJLU;OA4{tF|1X^|dvt9RJ{Sns}}p>w>hz+En7#2nhymtWJ*6`0T5t+VDNygHJ`
zOZwE_@W&_><CJ97_kqip=$-19jkNH0lS^xg{f~i>K?8FwuOcOsA(ivjk8()vGU+es
z87-~LnrjEw{AHL;Nw7RWjn19J4yDqM8T@x9og+({EL|o^8IrV%OM^JSwtyD&XKa)_
zf3hpER+V2JtNbw271*N6y@)+GMZ?r4?l1XEo;&yeSL9KyKwT35iRtPn?hYg-vVK_j
z-K8iOjdBMjr($tW{wfbM*L*!wyd-@$j{ke0EAXA7T#l^(A={giS+UArrQW_!i`5xg
zSZ2R%ywp!@GncYeY1>@(6}fB9!*twPD$6TWz8y*D$-EIWdlwCopPS_kRIVFkwMv+8
zln!&t6nHwD_abZK6ha>Qj<Ne0`@3pgJeO@Jh9&Hh)#M4%On;y<oHN=I=DB?$Pb7`T
zl63I^$(}`StnqsX210!;Ud0_=pt2v6Xbh$oi17)t+{Jt5CR{no=|4%wyqb4l=^3P*
zO#Dw7>xEeJ$ti}Ir*t^hdrwY0g{ZtYJB5b&ii;LtVZ+07i53u_G~b3r)s!SYf}Ev4
zkz)bTv(xDyR(1WS%9cln)i@!Ey(;<$xcSah3<~X65xvPc+_(y>{=zHMrVcM27=Dwu
zpRs8k)I5#?#{;DJSw`>y;>!)Upm=_&)y<rNA$>68i2k)}w)z9{EiU1c5!5Sq2C+L`
zYy>3?Bf6DyZ0{RJ`e%&TW;Yv#W%~=6wu)FGV{mym<%FS;3)ui)U?Jy!cMDq_lv4z6
z%7fVef6-FLONHNIcaDDq5uNX150br(u`69s%U!`ICR*ySf<dkYS(M{DiMYIqXBJOn
z><|Vw#vzc!TH;)6`xO(gIlf(tQKp;Drhx-}pGkbM;V5D@W2^(;^CjW4U(|#%xHHf<
zpD!T$7ZNWueMW0kjL@(+juo&-AgHkCWZPJc{@~IuMcwA3X$q!3oQ3yeU#p`0)lZM0
zUs5tj6|2~xY-NbAsFy<QZ8w%{__tExzc7#Oh8HtKOE60&BG^nK3Qxi)k$#w`dl3;w
z*ua4Q0?sB23CxBD0z(~Fle|>+2127GhnM7?%D_PQcFr;kZ!REx6L!NCyV)<~3jR$-
zc92I)uHe{tmd7NvNXSIp!SaHeEg&pN|3d6C!|Vvs@50pV3_Fr=U_dO*eQRSz^y@Z>
z(OMU>SV9DGq`?-EC?yZUhb?keaJ<CFBnnXr=`FD0pDuO=Wg*Sj_h#nMvq0{jAZ)0S
z_;YsC-Xp~5kQ44@1_~|wM>DncU6?fw*7D3?qJ`(hY`9rrSb2iQF8}iuOCN06FgTM*
z&%}|O%Q1CKNFOe*<Awl|>LxqQL2wgC1%mzyu@ZW@#QI_oo0#}7PHU{}5y}euEU}<n
z1P2lKsltT_vcPnwh_|1mtl-a*I~26AbM2#$<X<U<I}1p8tR$=G@e(RS|6ZYcl?tSB
zG0u74p6IxoluuDX7)di(&J13o@v9979HdW19F*DF6gDK}+aq&&-VQ$}JgU*<!lPKn
zr+?cMF=sQzE(&wW4T?zLbxKz8dm2B`%Z%Tpuw-@~lFn7U$4K51mR0<e#vgE*0^5;R
z*9n2l;OEuuz*K|YM*77tcwsT(Ix93^;t$8f1$l^(v_mDn)u1A21E^EzYEhW8iZdiy
zr6av5H4!tqq8gEXEISCoy<9{cCY-;An?E~5qK7btKhVW+$0hQ@5hTZ8IV*%@e8y>d
z?;Iqf&q&7N9Lcg&=7UW)_0Lf_N94gtSMXMgHbN8=GBTR}tt9iG2O|l>B7vu4(E|cd
zPG|guL}#GFo%mV=f1H!{$T{IZVE;{tB5^}V+!eXPhY0<XiX^>IMT1GVbE1245kFbQ
zKTGsMqKJAZPIl{$Ak0P{ahASir4ri!Y?-prD3Zc9_K==bzk9JX+$MH{ts6E5DPFQ?
zF&c}Jb~>xr@}ON5&RGtA2eHqjM2oCUcVv}ZuDMGSL?FzvEw9sEB`+k1%=ID5D!CWz
z4;f#TH6&1)!?9Jx0dKp)p9=wcbUMyZW`(zD>~13r$e`!p&riX54!(e}4<KOyB?sp?
zGQ$sOuHe;P5lm#Uy%I#Y*T^12?dm(2%6dK^cvq8tvrH+$xej{^>Vu<(0hIWkISeG_
zBIz>2GbJ~y7wH7XDd=(pk51$x$R3b#gT8RGnShIBI6TjhU5t#dJjIm$ia{D%RL-CQ
zCBam?J6IO82Sr@Tg*OW@V|ZfBRa^*=9Apkx7+K&B#-`*5FZLrN3(WDdhmg7VSuDt5
zrzav^=><#}1_Qcok(s;XDS1fHk0=nSj8agt&&&!UuhfvfA5FJWiuH36v2p8&h>OfH
z%5m^9ql!X29yH3aDCp<QiNhsI>y0S2|BPtqZ+HZ;-{x@4)}`JNxLaN2w!JH-5B+Ps
z<q6DwS+oI)a-g2h#4sNNpXs9PhX*Kh{z7dW_LQhHzL$jgzGPNwMZdxJSTg%dhWc@m
zzF%Ur+d;piIE!yeRVUQZb7^!#kt;aG6%6`IJ|yhfcvRsJ&6Ad-(GH7lOUq;fRprBE
zHcaB<NqT*jTA!SUU5Tn~ITjFYIhD3FhqU=5_6}JebFu;%LVQ+AA8gQM`ToF{ST6ih
zb%wup%4ae<fs}oWrpr`bo*a4#t2SF3k#+r&c5@1SYf;9N<!MF{75=q9&3G?P0}|<;
za{8285c>~GyqK)-rtlkV3X<fcEXN<5Z&SU2k}Riw6y5KyQ}_!66?Wey=o?vo$`PKC
zjxzcF2Fpe?<-Uy!E0|Ilv!qKj)2?S_q~z3PWfU+?(f3vKr4!QKMQ;}Hze@C(9a9+2
zKwBzHlay)?HK=qGOA5T+*vA(bYh_;McoQv=UDsHuekXqX5S-_x7nbU?79{e;nYncX
zhboB|5?kZeFQindGR1*7`9jjalpOMAv>Sa?5XkP-#}nq*EkZNS22GkP;52mgQC)Db
zCQP0!=~*nr75v@cE%O+Xp<e|LvkpR+Q@q=WBDPPk(%G}t#7l_2@#xR-bXN&}lsq0?
zj`5h@ec1-3z8@o}@CZGoEZuH5l>>c6Rwt&<lPF2cDBU~8@~hPud{tHY6TLoK%O=Vc
zV-~u(mrx&8p_X$_qf0b89la5p&wD!w8=V3}(G^RQ^RXSLrk8{X$}$TC(*xdocd$Xw
zWqq2s$#=OhpB^~1w>KEtVujihn3E$Ie$CMITPntJd1%9OXfL<DWT;=vnG^{IeNo0M
zIVcsYlP#kded?hd#4(eL)xY2u9=Jlg`shcoSiLcay=EKY51dhL|4H^^gF<!Xs@l56
zOIll|MP^pctUOZZXm8SoV6WaiR>$gUYzcEaRz+cD6jm7E4i2xj-gBDt%_42@X}Q6Z
zo0IsY(^%_ReeY>{ivPfJ{ln#IYRS3PY`s=I+!y-%G-sqvY^`~vnyo=Ew3@J!ZBqii
z?5oAL`-sgQN52hbzb)qG_L>c{^t9dBpLjBxujo#m#P?{{a$>rUT?WaNFhPAXUs{K;
zrlPMSk2E9I2F(vDm>&9SLISoRY>P@ZI`vm4NIbhJT!XH$oxPu?Bo(M~KD$R|*g&&d
zQTLw6E|A&hlRf?{g}3!)>t%+`%lYValazt<l|_~0QIvz#v@s34!?2(1$}IKS6WJ<R
z#-8%&y0VSf4w?P5(6$X57t&B%sY<6$Wqu~nV+Haw>R0UBMjSstghdsJ5T`|HT<mE0
zd7`=%GkH3{nlOBVnszF1&5kWOzo3>)liyM$5Sta@qs>*;voN&8LIySYi^IV{D0p{<
zii>ORNTfP!du+ZfQGGLkdDEn?5>$*OzDz*R*LD^0$qASQ!d#|`K7LUepMn+UNBgVT
zbFx#|MT0B~7$^2K_E;Ruu-s1UcakQN^+E^BNaG)o_8aQ+-;8yJNy3vX!6mOHn%Q4@
z-9eL)HVk`+l)}CYwZpk+DSu${seReoscbWM`U{wniV=;4>0BCN;k%hD{3EeteVolr
ziPjej===WcG~#oI`<la{7Kv$va^WQ2zO;99xIvP93bn4Ot_c=}D#kq7fc;^77?bjd
zjnmi{laeCOHHrkfjhL5y)QTWhRO<Itj1C1+b&nfYKFn5AxAGiZ7ESaxF`P6SNM&O}
zzA)B7a5?Dh<O-~+cUoT_B#vs|)@Z>F*0(qf)1={63F>#T2Vvw&Hj(s_CAi?!LlIdj
z9jyI0g<a@jV@bZB(O*cZorgl|3SxA8>jXMW^+eXVy*2(_F8ksJ_5>o`%?1pn8Q58N
z>$yCGPA;k$Q{)@O>}5n-u&paBTZD>>U<_`xoyz#I6e>OURfNSi+BbGwNjO+u$t<i2
z!{YDquw%;VV$R}0*aFDyF2)u!`Kwd0KB<mr;;HBo1!<+n=^xuf7;z!eujZ0ylUAtd
zM0e>JX{{}a1S<UgiqbbZzn@z#I-5PEI3xGUV#;DIqd)sQtuuSmUD)=OGDMj@hjR1o
zD2j}(7*kqQF}D1=LDFxz40*RdvA>*456S#4B5&TEkO-7xUw&~IL)K@-Ze}z!Xxgmk
zETc1Wae2uYe;_y}gnr3^K{Dz_5a}+}yQl@T7YMh92>;H!!IM*e>T;?HmW&w}EO}hw
zPb9f~cP3D8rd@zrq!e0CY)i8IcTZ8U(sxM`yT#?jfd`UStVKt6bWx<F;xddGNPf}n
z4OZrHb=`7j@UrDj{FN-vSM-isDrWYniMR=n?_sQc7F~kjGo$$f)fp#P9w)7a*dDRM
zB;B3K*EUdM7SCl$6LyJbv2;oc?{H%hj528}Hh>08g5f~%!6D8S7fN)?H1>{_f&I23
zr4pNHVHq~bt4P>n{BMi+;e1~$A#EdH%j7GWv*<31G>{!t14C^%LQu81p~{=rxNJpZ
z^RoK-ypa;_uXdKcZbiR{=FP~fs&So<;_Iqu<(L4M_Alj22cZh#ZxG!=V%lVtLwR92
zA@^*>SZK@$XU!@L9U<w7LD<ZVqxUxS(m%owpG4xn!uWfr>Nx$dG2urK^;3Bs@$p>#
z9;?G2EVW$Sm)bach*@vK_Sn_P)v^|$gV^GwNIM+-8--O4WVs~WWF?6))Knj5p}_Iq
zRMxb?@~U0EWJXSr7gvM^0t5Yn(`Yi&Heu_dEMJV>Q-0-1PN!tkL`|t|ph?*xTO#gN
zNt>^*fXoWWR?ldYSiYq&vA1S8>u(y&{z~j!a{3<WL(icx%Sz0`GOmm6Y48p#jM(LW
zNv@WEN!YtgANA&!GI>YZfMDfS>2%g87UJrQSlm}xny3#8gGwznVAt}MREkYJZ5lsV
zE4MD9xpswH+0!IW>h4yt6{TomZ|lcs!?W4PnC>b3j3{8^i5Nvi&;fQ}k@`cDaK{i7
zJ5!aB#I-)I;P~}o_U$zVGg`DzRqHjrriQ;l*6Y)F`-uuJlP}_2xeVK@vaz7!@`p%!
zme_dChT|}~q-79mV=Ncj!HHcfi4FV(5-JjBQELKasLJXLY&WKhdhvfUPw3QCWa)1+
z0*%XB^L6g4#m0HRE7(}O_*7yR4;$8G3vdL{6`Dh3Ew%Mc=ZOOZ%}uo~--<QfK20|y
zSl(y+JjT{(O$ch~Fx%>(Q~7hqW&POqlI<(WGDG#0_Ep>^(^cswCWXm1N8vwWL^szL
ztSp^>0Y(ba&8UDL=}%LbJ2*gt@A=y}R8bOK6sdcADw|cJw9KJ@_9y&;&Mxr{8a-kq
zr;QD@(^D!AUhspGgqxub6tZlZQ&bZeUGbLHIrf`@@_16-wy|bh`%l<hDUHNbAoiEd
zC)>qL3UXea&NeY;1f1y>KAG5@0nVnIC2KZEg-X57br&tck5&|3xY*U4mPn|t7okJF
z!Qv{;LO&tP-4xEGi!7-_zhtz$7gaKM@s0$gHIdzEXJ4HoKdC9$s6Ig!ZE`D`8Iq78
zUb4{hI1Faxt2LU-sh6e=#dX)%ZAhMyst~5($falv^Q>o<2m_c;Ta3Q$Pj)5ChJA?3
zQ4^L1#|A@IF}YM)D)sgSzRQv#sO1tbkBlxKU0hT$ZtTTIWxtg*MFJ&bMwgEXSgmVu
zECUrrGIs5;jj(W9e|93v3Y^!Uha^WX^9F;r(z$`L{_^tj(W4V+;h3UuI5e(aayD(V
zT9GMVktB3CTPQu~=Tv8%4u6cRV8j!UM&v9nB>hCfd;phbi!6^#&I7Dy7gp<pvtDo#
zXK&Xiii`70>zJF<L5KBf&Z@YmBOaEOWxbI5Mtd#TWE-4gOCEid4IM6$kSo^XHW9|J
zwe<@PJ-HP^cO@triLK-O`Yio78!5MBfjGr&-EO0~3Qwnup?f%g4|mne?0nKf#Lzto
zVQ1tanZIJA;ST8)oBX-T&nqWXS-rB=M626Z<%L1J!|L%ZS5b~+r&S*3bosXTW?|BL
z+3A)<DVwZUC@O{*Id*<B(TxdeuAO#d=wa;Dy~hpU)CWT)NuFfS@rCV)sU>0i5MQ{|
z?qOfnp}BV!4@gcxRvjo~X+ZuH8GZK%c9^8$cJ{Ch8@K5XC2_djTkId7ha-g2PK#7R
ztV+WG%XYx(EVgmfwsJaQc7f(fLOnonH6C0~mvMSmLKQBhN$Ov$>=(Z}%FYMa=pAN#
zn#onk(n(f+3ldzWN11*Q(>TE+Hc#L-#&09q)`#BF6qk*cEk*b#4^@dp`@lx`%BcF~
zpNONgyOApwQcmz08%|)P(4#fd^@Y+?#8K-iGJU|X73|+Oi-oa!RVtC#Osg#-vw<4D
zh7G_t)AEVJw)Up)b?G#XUP;C75cL}@SoA?31v_x1Le94kzbA!nm@AGGXOq$*(Tx%m
z6jIoE=UZO2vZI+w8DWVe;Uzm=?3Ta61v+-=Obc?Dz7hv6alqmVLR+Filc}g><SFj0
z(2hLDCFxC)uHtm_=8U4(uw{_gYh<(7SyL9OO?nVNmSb$WAA9g{+*(~`m(G$gEsgno
z`~g~8AyXelr*ida;$bevN9f}AvGPjJu;lO+ODV>b$LSV}&CV<IQeViIVrAvo&gNSz
z_W8t;8BUjAJ32kCIak~+;S#Vpvfb|V{i>?C-S1rtq8nKzb}4>LmIOQdIGa5oSs#|H
zLsa?J1dsm@r*sRKhmm!+q@soTMJ@CPF`#(HLRS()7X}wX{=iuALLTJw_f!j5|FFo&
zEBV-7Y1gD;MS60Z?6TAIqQ3=_8*>C+oHi(=>k@dSl~tgwTM@8dj$OkXbLa9E80=Xj
zeKm$b68ch20mY6!6t0Ooo7s}>IHEv{IgVMX$#&M7B%dYv3C^Z}*m)i6TRij|jc;J+
z^E!QFNo%*!DKg5lZ7<p?g_meHW@lKM<aYJ|Ntp9K;IT?phW6y*E%FSqe{ExzCRx9>
zCHV*TLm9qLvNT}D=Ep?+8ylyBOn?Pz@^DW^7qBjq_hnh(Z*6?8LOyhJ1~BDGMz5#p
z-$JZLu^vT!cFSiF`c%|wj5dvUsZIaRCMQnAo=}f8hootg@X}bC>zDfz4js~VxB7#P
zzwg2s8v0STZdRp$DExevoZ?@YO#hHA8yHQo>OaE%%VcqnQh(xy8LMaBK+yq3xj>~~
zRX#qOc9VL8%mzDfT*h6JBC~!T>76ubkCisyAeTjYPqKan8!)YMuI&>GJ5y3hDK}X9
zfr^Tfzkw40S@Z!`4)5(x-W1WjE93lmJKYyie^vP7^JpoSYq%X%UY^XqX|<yUIzy44
zCU>aLZMzU%hyr>q3H$CXP@f{p3m8q)m0QQr1(`BRxF=1YyIIRC4s6{dUu;GFEuF-u
zw~UjE($sMp&M&^(i|u6cVhbN>rR@?r<-9?mwd2*F^ZYA(#V?|{-)Xa*y;-VcsiiAf
z!rpXxJeys?Y&OR7v!yE--8J5d;}@HI$$fG34$bO#u6vk77qd|~GT~u1@hiQlUg<X`
z{e+5~)9A_k56qM~l=li1)G!-Q&|ln~Ab)wIa_fyhBm}Fmtcd+{OL4%tw!X4zF;34;
zJYi;0r7!qY@B8ZOLBsI`U-2|tg;lYvxx6p^=)%FZNL_t>i8%a>RY{2B@NiA-%DU>>
z%BI%li6gchmB)>%88@ze+_;(LmH4ilRa7}^T0UE~F3%hOa<g*jP5Ld=%oJAEP+4^H
zv+I<F7t+&QKY^{05QSqey=-tp)oPr1uJO93PQ_6a->w@_+-#@jr-WYF$P{~@;l(bJ
zUe+vb>_pFJ&)>iv9b`SBH-EpM-oVe0au_u#%A>udgBbBVk+0uP>zo6RCMYN+E79G*
zLf=ap*m9?WUxOZ&l?U@_zREsS_|ta2WR=HXR>QXE@vmiCrOMY`N?)iL^Pu|llonF3
zp{}tIgM5+$=+pnGNewpImBQz3q}{n}SB>REhkCy+K|xclNMrp{DA5_JUZ!52;0ax@
zF%YT@*;4qmm`t`w<;W;`Y>kF9rIaFi?eY_OXb@P~$_7cOBj}e@I|dzn$=ULyN{jQ@
zO<VPe7Ku-=Ico;n9S)1McWru0ejd}EH1lTKKh>@>euGu|$}Od3^S|lxH(uMnag^+%
z1m2tM|HiouqHi0qK1eSn>i@=gdlUf@-Nj>ZoZ?csLX*C;<oS!gvt$I(P`#)7LzO<&
zA)Rq0-Hx?&lDFHui)$}q(jPhUGpNRH0>K^Hn@yOh-^y}pd=F>p-!X}Q<3=lCb1;^f
zu6&1+s$Si}47VICf!&!*(+Ofsnz@qx)Lgbl2v>a;tqAxhNJ-WqIdXvw>zuTXQD2zj
zjGe&<*CRY*S1>N##>KANt@)NqcJk91d&njYVyM@ox3=4ca(;I*%Og7|D@hv4nVO0n
z?Hnh90Ev8<m3^F^IFzUOu-SVM&bRg|#9kM5<j;!FS6SKGT-DejPT=@TRy0;MpYEcW
zm63_lCXccv%&{JJ%`B-LH|~U)h>l?6vPvAxZ>+AXuWYSa3}R)tuI7O|3R+L^ONacv
zs`^z`r?<eq6}6R~?ThNFt6FQvZMEi%-)gPW|D<!}e9d|C?_T|D>r|bw3-qzJ63c8G
zcgVTJeR-E#Ru09<E8ZT|tEJcE2-n}Y&O=_lz<Rk|`p`x!9WnBr4ZRsf)3MONrrGGw
zRg7u0JDrb_s70e+^BF44VP18ZJ;^tOEW_=Xo>xNXg7(u#+L;_CeYH)JUe|I;yrfEf
zHSKIW-EWch&#^<1K^e{8y66S$u(Sz#{%xoE`RE?={Z^5r*ldY@@ikaS)-0Peez7K(
zS@|bc**PGi_)EJ7uM@8t^w2UF-)5%)ZX7>$G~0Q&FD<txhKcSjLC+_5V8~yJ#dt9(
zMu*YQvbl<H#&J#357-ns*=oJn%BC@CIOAWDESLvuiPsTnI$cNlRTxCn+6jHPa&ixS
zlG#tP$xkx<B%6cZnxW6NiPqOX58XDFh{o@`zOF!+>3_u%8pdQZ7`11JDWA`+_NM~4
zHNoc_*U;2@`s7s4xf55Sf?C<n_HF0@E_*Of>)O!!63;zj7NN(Di%UW%vK=0lDGfVC
zArucW3*Xkh2kNz+w)VL=Fm))Ut^JS=eQ{g+1y1mRtHIw}($?N&A=(GL0JtUG)*jYy
z`BQ0IyZGta&Y^AXWx&0|AO}1=ysdpFkVe8@^!dG`;12LG@Br{Yd0RWiN3?cyTl+NN
zPT&&Y-Z5?MJAnJfwza<?=;PYjj{*->w6*tx|C|$G9{%wHKL<_&VzV|i0lmPrz%XzN
za2jwYa0zfPa4m2ja0l=Ja4&Gn#J2W>z_pWLj|dmg3)~3|1NQ=_0S^K90iBZ(KR}v-
za3lQtfJ=bWrna>|3_Lsy{uAMz2|m(cZ&O=4j#-j(Il>DpYDWAaev4Kjeh|MaSHXSY
zw$%tHaKRe5?;^SmSO(m9Zd?08;J9`0Kk)E+m;<ic0RI3_+6eyue+7!(mwo}v1FL{#
z!25s;fyo!bJTS5e=7HOR`+z500`tI{OJN?fUVmN&^T5U1U>>*wxDfd3l`sz+b`8t}
zYk~WKmjVv~6Rw5%B$xwwf!_klfR-Cz9(e0bFb`C3hI!xt;6C8hx4=9wZ#T>*WAg>j
z3)~AV13GVodEk8DT3{1!C-76?KHz{oFb|vz#MwPs2lN7W0Ly@P0~Z2c0j>o;cN@$D
zKL+jtzJEK+1Ahl%OFjK~2h0QAcfvf-b{EV8^MGrC<-nc5(A_W(TnIb_d<rO*1`Ytd
zzy*JSdEnoH3xNao!aQ&(a3}CP;6C8Adte@T8&C`}Uj=%B^X`Lr;1=LQ;Dq~O9%y|4
z=7DcLgz^C#{4mM^a4HZ}Aao(HAMnpdkZ*yvKZf!M9P&h4`xfBPeaQE~&A<b|Lr)>!
z0Y7>M`4bJgZ9noo@Fn0};4jak`~xSwfN}$T3-~aQ{SEmRI1YFecp)$it=#MbFb{kO
zI2SnYb(jbK0Nepw@dnHTuLd3j9swQ&Rvm<SjO50?1M|S|fOCPLybJR{`8}8iJ^*|e
zxa@tH2et!`0x$Xi=CPn)IRx{-1;Dw$`+%!~PXTuT8~zRRz|4<f9@zK^%mbaD!aQ~%
zoevBHKm82mfxmtZ^T4%Vz&vovmq<6@fv?)y4+G1-hP)T)2%H8K53;lIurp>5TkT|-
z2^MV&v*70|Vq}Q2?`vzv05<+~FE3VmqVi;co2aA6;jD0^P15bFw6!!UW7vQ|J|+vJ
z?u`Y8QHKh#*dyFJ1#c1P{RMHX!?~VMw>h-Q_A&UAuW)w_-ZJn}1mcgNZ^b*Ii+zHA
zCte}n(H%j57H?Pg1^s=z!hH^mJ%WA&Z@+Hn323mqUFaen@<C4n{Rv^oSVt3IhG2}d
zFLSIO>sZSsJ64xFng)S21#){K_o<K@@7Til76?v+;0`v{u@jOzSUJdoE&Q<tswT8K
zXiDObpl=0zNjLO6L0{N~F5>xF(C30)AZ#fY@x0LZSF+Fw`#uIgT-4T%CLsO<q51>#
zBGCUXsG}XuO}yNZwm}-}@UEA~JBoO%&EW+FHjWm-Fw6?~Fr23}xHni(4EL7A-4pRJ
z3;cfIe<Aqe5pcc(@lcL<*n@c3%T556gDlu0e$Rva=aB!aAdDCOpAY%*b|brtkGd=D
zy%TcN;NO2n<q*G{%!D{Teuez|LLP1TB|;wl+#tfYUYh7AI>%-NtK6ZDjaqM%4I#$P
zm^Q^IITq<~zcSoiW0W6+2|VG~EwB%xPm+h1j23ZrDIe`fyF?o8@NSkzJNj)>#yW~N
zsO66EddoxvQF@En9N}_DQMscZWFQMOh|Ni+v&Ow$=fS?Y2v>g*BjX+W7=ObI*9(ZR
z1BkB|5MTR@_!8l|U&x~zv#31c_jV(`x{F^S|1spl{<ikgsQdxOKZX1lq{l(ne+c#;
zg#8EN_G8|LCItRF`-j8+v5-$2k8+0i7ICs!#K|V9+~M7TBMJT1D-#`Me6!8b4-}+8
zGzKc7G4M59JrCwTpU~ERJL=MC`Y(t7CfMix@B9CY7?_1njJ40kx7;r3Biw95durO+
zXG4uU3Y@&mp^1DX%E4jKGm$6cv!ms!o4m9Txua87Do0kD0`o7!-km5H;vS|BWpi9N
zWplD)8ctw#S2D*$^Q9>3S?Fswq1-$Jb%*+6io?tAaHOF@6ZHpJxO4%x{Is=W%a7si
zB_jWCmf-Ftk^eU+6^^3y>Qu)x=_1qyqAEZ^h*t%|_o!3xZh}2~gBYs_0!m58`uNOf
zjfJXXGLGxTD&@&Wd_D_v{X&R0Va|Mv`VaIX(Er<dA^~FvC&pa=tvt>Y(qYK|xAk#7
z<ac5W_utk-*Ft`8_`j?d5%(`c-mA2=&qF?nwo6kSY2!syqwXzQ>~KyJm5+Cvp*`vU
zdF@F?GL1JJo!?oF)Q!?lHtNH;8m&}Z{zTDI2oF#Hzi+mqJss6>cdF4=i}vh@C@+iJ
z+W$a&$=iD<Z)m@{G#L4|Q;|dCyw|aZqc!Jf&C#yz6m>tIoAJ)V*z>5`*1i>eRFRL(
zHWDduTpKytu{Tlr!0vcB5$#^$3BYne`I>d5Sb4XlwcIY9&%4Tgcbw_1G0KUM9Ss+Z
ze$gNB=ixqPzX`>80b)j!Ls1ScsBLQ(6BzOv(ReY-A_}hP8%<l;v1>FLC0O)}rU0*u
zw6$Xz)vPbZB&t#dLl?s6JJf@}Z8649X8zoO-RSDL^^)|2Bt(oy*s~RKp`{pu35wBg
zf!vu<5y8J7{Mq1tiFPy2mp+k&gy6pceom9wUyYgPe>E%+{3GB$4*ngT{JGokr{E_g
zp#8^O!Q~=NW9FsJJZ66o{6|_*-+>+H^8<1FXMw+LMO*t#o%q+q`B>7Y@4+|v{W1He
z8*Ib>TfzSf{286hPmP<uAN+L8Vf4g*1N_P0FY07|LEQWi@Hc}$y<<8^Q;b9~!jCzh
zCr)o`_jNR1f6RYD@LvW0p-%C4Z#@2Hfgd>&;p@bA#rdt^pR%T{9ZP=4;!oNdkKe7}
zzj=0B`)Qs0R~z@={ov=U>)C&AfL{cDRVVjPiMxLU{N?N0+DCWFKO^G#Ckb<1S8O!L
ziLvyl?GS$Oulo=DS>WFa{#hO4kFSo$e=GR%1#RsocjC{9^S6RO3jB?o`0L{Q`@!E0
zetM_)ONz(e8{pq?5#}m7#osmY_&WlA&LwT_m7U^eems7}kF8I|oYae*+<z|aeh_?Z
zOV9jS;NJ`W1)a>VkDG4=KjX5V^XFFZPXT{WNArsf<3{<oAN(cYFYL(YC&%r71N=SU
zPwK=U7v~=V|26P?Dt}3s3w#0mm7UBlkDCvI-|O<WcG35Vm;ZB)`EM5ZqruPZ$Uo&6
zzZLv3;FomdHy`6~1%E#HJ^Al`@TdI;{u|&|fG?V#xc}xKv;PSA<G?@Kk<WiL1dQ^J
zInC@VdNv;fKMVYx%+CV<$bT^33jUA(!TeV6U)tKU`}c$YBKSSI{|5LsZNr>W$MDVU
z5PtBl2EV8HNy41$Rp1}#Xr6x;kDnm;4Oh0c5A4W~9J7BG_^ZLM>d5EO3?SmS75u+~
zk8F4>eI^|<zZLvF;BW58=NH7?zaRYM?QQMLJMp6xw6OmT@FU<a=)|8JH-7~DUEr5>
z<acOalQ5_J4EX1D<nw3-7xrVdhWhU4IefFgUjqKTPUdID-ERf|M(|_vHAX+6szdm}
zzZLw8I@M3-$Lpv2!FOHVbNP7#{C?m++{yj>;_e>-e;@cyb>cr}@bf_vKL)MB-1;jW
z>sQfFdB&(>g(;&R8V))2ddyV{Lad$O(TpbI=@jrsg5Og<TLb<G@P~IazbYCJ!XMXy
zKM(xYuKqZ!vp=4N+$}rX+VAQVUwh*5^)dMVo7&pX@5Eml=gXMG-@dD@9UG&Lg+o%~
z?MXiP>uzalpV`r${DgRXjRpS!!S86lh_8u;6T*}cU-ghXw7U)etOFqyUs7A#ADh5`
zW=~st-%k9zxIeHEOfO=spu2Pv@%=LROR<J9sG~omzH$4%27eXS7P<?EFrR=5_jd3%
zK^^xW-x$v~gTPP08bnX!ehT=x;P=!n)Pq0nKkzq!UkAPz$H(25o>zqjg#YdYKl%Qi
z%jL`9pAP=oPVskUJpR50|D6ZX@9Gr3t?}?BU@c|FLv8KrJMqts^9O<d0Qh}6@)sIx
z!+%r2e+2yb9r=8A+<ZOwN5F6B)Gt{Q@0V->zx>g*_K!N5|7YC%o#4w@Q@NuPe|Mb!
zGWeH*|8pn)_i_H$;MYFhb9qcagHZ$iJ)O+o88<%&{GY&oq!a&vIDZQGmp*~Do=*IW
z<NSK?+rhuM6MsXTzX|-)o<zT~Q~Kd%{aE^g|2)===5#V29XJ^IAN)W61OIFA15dZL
zqx*L(e@!%;H_|U59r+jhJ3I3Et#SVi0{<@X@9yNk+v5J40{*b)y3_;G3Gwn*4}RnR
zp2tO-z&{NBican~#ofOX{Bh6s+^>Hb{0%Rd^I!4$1zSkW`1>0C)3H{zrjz~A3Rl#(
z31UqSe2ZCstmh|+$@^Wl=uDSg7!!TcAmm<nv8{bryq>|*5URUG%o2z>v2uI0fkIx?
z1vQXA_)=SYtsoqm4>&e2H$lvqPY`qFQypc^-Ob7w<MlgX550W+dd{<;ABFwKI+Jk2
zeA?UD$O4r4$C$ERr5m{N50#$h%C#!pA}LR+bdfCRU*l*r-L44o<EkLPtO~))Eg-{&
z<Q(9IO!*OKdYJMp4g|0ggApG6F3RzHvhs=rN1K!vEOZZ!R?|0%a-D^aD#{KEJ*|SU
z-lF_wp-U{vQ42j|3BhmeQh%1!G70<L|HiQ0NBWg1P~XOt`?&HL$D{<}^{jJZq4*#E
zF9!Y>1OJPG|HZ)nmt$bs31x=&rVTK0Y7~o1y1~RpG@0V*iYJObMF%?e+ji1lgO)@*
zF>`lx(Z68o6QW|Yuy>iE!c?E}Op(eBL>It#_Ka9$U~J6@T_fWWO{{qKnS9Z&5|24O
zO+U4@FBAG5yi5R@&eAkTR4=;j;yKF81Q1#?;}L5`;xRfAz?gd&l5~1R*VuT<qW&|Z
zfi%-14CRamOIyZ+F0Jv*O)oPrM&Fwj)yMq3$KW3OXJX9#w)SXv)}}_~@yl)_e6esG
zHualQqI}bu|JUGvnGRhYywXhH-6lS0;(im~GVxOre>PFgkA@?~!~zqGO{_3+j)@Ty
zSDLuN#4AnQZQ_F_?l<u*6F)WaXA{*vru!xqm{@FLg^6=ajF`C6#0@51Y2t1ZA2e~l
ziEo+ssfj<Es1}&+n^<6Cv56HX&M`4!;z|=Yn0Td$yG?x1#Qi3|W#XqM{%oS!*L2^+
z0uzf(tT1toi4hZ5nz+HlD^1*O;)5pcH}NeKKQ-}Z6V-mE`z98cSZrd2iE~Vhn7Go!
z4JKY`;%*ZkG;zO)Z<+Y1i9egD_BY)(vB1P)6Dv%dV`9X_l_qX5@k$eSoA{uK`%Qey
z#7|B9*+g}K>Ar~tCKj7W$34uT935P4;P^3PhI;$WUbLdIb%i%HI5@bduy}=`^PTN0
z9$XY0?DO}JGH~-4?m#OpDm4a8(TYqnESW}8%Z!26aSlcYSvVzZiB-D`v2$BI#(=9E
z`MHpk4Z6ngUo1+%2>=qK$nG5COuG_noYK1mq8xXx37uwnKmb#B7(8V(OiQwY-|~T{
zWPm7t1aW!&sbI(-3H^CO34`+iil%B82`y7^5EjTX{wxHh>;(~jAA!U{VWm6}AGPQK
zd@X(PIcWJFM>I(@2tS+z3TeL!<}c?7rFKxLj|#O@)6q6qeiiC8?Hkx``Aw+3+O0z7
zccJd5Jps!se+YGv_Ozh?Db!&tLD1WTy3869_7D@A3D&O!gBhA>){rp44b5EZt3o5Q
z(;%?WDz=j04>e7ICDxaPMq|RaO*qvDO@g6mwTjls;$XFa)z%W>M>h*Xv({Q5#5~wZ
zkEulKYl4%+L>s@}IzVXB*am1Wu!?5MlFOO^TdeO3PJgxznjO{)g`)!u%}(nUp&4js
z_E@(I%^*Xw*LuCs3}!dL%){b#K(Y)m{I<_3cKca;Y!d{YB^%D_ljTaw^B|^Kt`&jQ
z)(E552_<O_Lil<STumDz)HjLHI!V<^K~s(NvWo;4-p2nD#G`5=DGlniF{%gDo1jTF
zGKZ$%z@f-8djXD(_((azUg-mf=#`Z%&<myWVSJ>V5Ak)8TO^cQ*yvb+55HXKB?tWP
zEE2X|ZkV{)Pzn>|SSQ5l4Kb0{QL*}CV)aq63BvtK!~F$9>~wsj%4(rB+{cd@h5M>@
zyYPSz@f?PVB?=>5OBTEKSp_ray<sOy6i$mLxd9*M6k1UPQX;rk!6hu(23=~AV7i4d
z>9Rv-<B}w?%di~8=!2_(Wvp}e2U%>eN$&+kDy+{q(-itz#a3cAo3Fxmmh>Z=oyn$i
z4UVOK29L+2%3zkI;Kxb$l_6}H3LDaY0e=M)l=c8r*O>N+OvlnM5RwN#78ODIb5MwL
zrx_2JLRM5Y!qFDtDDxRkzL3dgz!#Y07SXMll@%sljECjECajzW*7vX}dmI!Qy-cAr
z!@8(NR_Sn2xoOC-skbQ<>!Y&5JVcgZQ#Sk+wP|FxHceNfHVL1#z|LN=K=y`%Lt%>u
zgZ2XTdfE)v5YQ`4x)C<3^mWWPIj@>R!ywe_{SGEB>&e6y9Zk$Iy;uR`51J;d?UZwp
zNuL3FwA{P_u2GuSllON7QQaeopm!ajOufxeIuM2G?LwI=OlhS^Z?Z~d;}!uy6d!H8
z$e~tgc#Nn(T$>Ul_KOj3hfP*{lsG6xd;^l!6j5barTiGNMM$PyW)OoW(J1EYkpm6y
zo=CaXf}J6L>ur_x#Y}!8h?!C1LX()+Skpk(?5M-um}EABXf-MWt5g&tJ|>LkMkT{0
zv3ETr7vZ&$P~|6FZyU0xwxzYQwV{c6&xcp1;kDsPY20&z6J8`<X?B5F^boK8HC7-d
zfW@miS)WTGCtmfNf+Ya)+H-hcu{XrnA0mPv6OQ>JV;gPjR&Z8;*tJnU9VYp8U^4ZM
zZrJQvL6w$*{T>t(VWHG$YDB9gS2Z_7`yqI2x>83>{Me5~Nmc8>Fm!B|a+;|ugG2aB
zLX`d&p=qc??P7@iDzsHNaV}h(hQepLS$NCPwn9r<w0YK|jlNYAa;TYDABV>-IWir%
z7t!KOMR4-TZXXE}{Aw!Efss>C{SLJcvrqKE9UUU@qX3~t8z(vJcEM)QYKd`Kas7bR
z)G8A_LTiiJgg9qxqDPoW$fb#%Az&sJK!J1`%tob0-Rqk^(c`rT(}kF>kfg;t;e;mc
zBbR7H-6e)$PiMm-TxDt7M;kUI9Wx{@VMz|g?GRd&ftdY5o7#a3`_r<S&ohI~PMvF?
zXM{5yC%rvFFyq2?3^D107c(zfhdU?IE*lL4S;ByDK4u_W7(i_j6Bl91frg@Hy~o@e
zl|CGU_S}vHoOq<XV;&rZUuNO21A|G3HaenF(TGILFNXUl2Aw%X1)T_}bNU`5^pF$&
zLqtl)l0w7)rqkmw2gj9T{uErbgBD!Nv0w?Vb}Yl;a<FKQGFmF!)FV8`es?6|%TUzB
zOb8pYqNRCAU1N3qikjLXtHZ^W#lgb*y2cf&3l}%87_z9YwS^3k`np9!7FSmvFA|EG
zj>?%<Vf$KBZKF7>UE4Ca8g3E(jaS#WI52pLaJHqjrf!)(s%|~K2{JKqO>G2flvzUr
zN|ZaeUAHW%#NV=)HMK_7O)KhKYN5n4e9?-!`qsj_#*tB{YG5QPgc8GJJq;|c$Xq%^
zlVm3CJZG+<ZZ*`y4Yf8{DwqrXdFOl#{T~!nx5|=a70Ok(txK{Fmsy+E?>w#Fdgz8|
zZ-G&!&4qfOAwNaCURv>p<Xp#b2QaQ)K^1@>g>@70gnM;rt6QUyBYYXF4b$>xc`tm5
z_x4PjeA(Tf9K1{tr<G+^p?xAi9j<DV=Pe#K&o^}5JV6*DKEuRkvb6eoY3cAseNxt3
zp_i(LYm?7|2E7)>5N&mhHAoim&vA-~<e^IZu)A_uB!b^3Q_Zp!_=h!BPCT))V&ar>
zR8!aNGjuIgD{CujVYFq*vgTH=7|)0wlvlP=1Aa!htlH3tL?n^gm5o&mwb;AZDl{}?
z$+CvpA<@y@5d47lv>`PuRX$%)Vat-*`ue!0ZZUY(Lt2`v4Hs86*R>kw`KdBeU$wY}
z8e4EE-(W3T5vg2M)dGJuR5jz~fEI(=Tw7IRc(JarX+^7%>pJAWV<NaBfFkwFst|%!
z{Ghv;>X$Vx_8K1}Sk=unEla5qfx;L6`fO<>{O7#6693U#(SYb`TGm35=%3cXH}YIv
zV<mdsQCem7lBHC;x(=pV;8DU~)oWXpbnpoh+Z3fpb^S5~V?|>N{={Be<E>s&)eJjD
z6x1P|R(J4PTt@g&q)#;Ha6}}CaHz7fsj3F4CDO7uh?Egh(O8T|){4dzEwwd%k@_Oo
zf-PcFr0Vz+W_!o@d|d|I-ol2~6^*qca4AD=b6qvGRn65)MxcTy#6N}$7kdj&^m_}9
zT-4A6MN4Z{_0mG5Vk`3WvgSgRc0{cx^wo&%%0~Rh-q19i9`&q03V|pDqYyH)Smyv&
zMjAQ>Nch=E;l^czY|@Og?ntC1Rdq-o5q>lDi{PIXP0_p}suFOGyab0EmeoY_xsh#*
z2x@Fuv1k$gC`^_3!#RpUtWb!EGV*^k8Y|7jHxe3&B?2wfm658tdN|)9N3=FoH=Q1r
zgG&t}&!JoxkuP!>JQ}Y8q6!lK2o)8W@t`>~9$W$)Qx1reX2q1T2@shud=X>TPR}e;
zx6njf7!p(7A1BQeQwQRS*~N@*UX*5jn@v31L|C-RRPH!Nidpo^F_Ni@LxHJ^`JYid
zC<9Ga_dz+{;Ov+kF@uawio0VfVyp^NuhI{{ql10i(^ak;t736kYa0F!A`9ap@#sEg
zYGzl*W3l#klVPLyFz&bnechD#_LyvJpmv;W%pKiSAS1<;xL(tEkkT=wvDF&HXza&y
z#z7NMAsAD}rqH@irl`H>U78A{Z%o<I;H=|(8_Q0Y#tqJnDb=`daa&51oe(D(TUlbR
z3OwGTSuu-tn{xPxUrc#loHQe*d?HSo8&e*0ZidM`4q~C?<hvoJ7IQV?VKgTep+k?6
zqP%0@*dRTYtu<T-w-R?v_PHTO@teXZeKF;aCh3^)F$<%ruJOS!vC;J@HVHK*GFnt;
zDpwsNtu>V~8yQu_O8N(Jsp^4d2*dG^R>hRD9Kh*VHD?+$XJ63~G6_x-Jth{I7%Qo!
zHV!fRP?Ivs#EQ5?XT@YwJKe;t#k;fg|Ah`6EDH0o<D?|?{$_Pcm#DQ{x*T5@>}Z*U
zRl=j)%B%5p!_M*++~8K$kz}YPtU|`t={hncM0;$#uQMGN?WLD>BpGVly%=8~?Z}jH
zS;pGl-N`3XSvTtg5+)I1>%E=r(J8jx+L`X8g;IG(lA)F`%b4ctj_#(1Q$}|q@1X<T
z(vzqtt2_B*633OhJ`$x+Y#q2GPomWL`fq1?8oe-LQAd)YmMERNTc?z8JI>)z-N~b}
zFxPiGS|(u^UelQF<g;mKxAa~ejmOpa+k9NzQ6Z7Hqc*M<e?I8$k#K`gcaMa-*SdR*
zU#54%ki><{T^<I#Oy9S6A&cu;P#TYqXR&V2k>B=CE$d`NRJ($HxC?!-N$<M<U@Yhi
z*A`wdHX6^uAL5ybmvxiwXMis3kL_;|vf|l*mvxhWZUMc!aD5~UiF_U^H&n5Y=n4q?
z*Pc*jkVN>!qr+pupF2!{Mi<U-u^s3_ev2s|>vkB+R^8AwZu;LWzsA{6*xwg)M#st*
z<|2$|xIxdRJG;csVw1kJ3;kS^etQ@C4JQ4zF7(Gu`kpTI_e}b&UFg4>^xa+PshC+1
z@pDTTx|o3%^qafTXPWd~UFgeA`b}Nvmzwk&yU_18>34LYKMA^#Pt1H6Yw&Pupy8jp
zy2yV7dUyHca1V4br`etSM@Fh0Uw=xN=j={C0d)KWg!be{^P$~L_dW(ai`I@=WKf_r
zo<I-w3^VQ7XWFyJl&|PP{=^>WF?)6njQS^L&-|GEg;9E}L(yW=514e9>Fy?zez*($
z7L)EY*RPXI`6o<zkx7qrRNgk}(@c7@DSz0cH<|R<J|e3Ujn6G6J+|s2R%%2%@9kp$
zFq3}3q{sX-%cOs9(jBI|xZ|<gbUdpEI&RK}yIFLBnU1k~V|x$ss7|`I|3<@}Bno$F
zH~(VLkFTGFce~4Pk3t>`hsU+oe>dzozTJG=kWV?T{T26(VOILM_WF0jp5xn7mx@Vn
zVWi75(4af0aDSP>S#I)6dXS$D`i-bh4w~gR7N6&t_MG-r)E==3Ks?((cR@b3u!C%4
zJXe|WCHIvXdNH>sp4(0NSo<Ec{|Q4rD%0`%dJlBmHi_^$XxQ^*hWL4=Au(RgMLs-~
z6fH;nOnNV@o(cKbyk)F&-^ZYvLLFcz=ptOPMKM&%#xvTqXZ<f_hCY^WPd4c<Jr$)N
zHMp1<HtCl=7p2DrTjztGC;U?xrNi6CBkp$SE*<}A+OyNNXR|5)MGx|Sn)1cF$_z=L
zDK9Pq=+1upEz*de>z<6t$NXPv(hvO+rLQpUIT3Ux%H0;T+?iyW1G)&;PBUDwG3j!X
ze(2PwJ+biq#k9ZIEm3-G<hkFZ|LuB{ZaQ$#q|bXbN;l4mf%ma#|K6%7eXuF7VT2~q
zF}4gE%bz;vB0pU7K~#Q_s2K2MfG+a?-Y)rnf+_#jBV~r%i^dl#_$K|0@1yj4OnQSs
zH#HsLX;c0Qvs@Twn!$e;U0ZKA{bBs@tK0PYr3ZSVEgG)3A1E_i-HCQoJQ*haw$I88
z{aQ0a=Noi`)aipCj3M5RjbCE%Hr}9TQ)~q}MxSchfBpWb{R>RnPVT|}wWj=*lcVzg
zH0AH>LH-}6eA-J<`KL^I+zH-o{QPdp+wYCa4>0AO=m3axA8F?2n@oBx=px@XnfW$W
z-cK>)O`#63tOxpf)Bb0kE;FQKBNN>9Y2?G$T6Zkot_NMXJI&0usNRj|b(6lOy3EjD
z5+zeII?Uab-=PNGL9-r@$`3N-$C>oKg;Bb3M+bP5da!4iDZlUPsJw9}1mw@|LH=^k
zMR_^UrMx_2+VjZ`QG1NFb=dPJ=%PHv_7%k9;eAtn?Tn~=Z2nGMVkXjS@4+bDI7<n8
z@XtRZ-qxD&RsfoK@YC;Z<0oR!9rW(|WrkcV|19r8ep?Up+dy~WQv20i>b(a|``4S{
zU1Yd|<4Y!e;fqmsA2#V9gD%p2OPBnXjs~T>^bHzx2hIL7YEKo5S~L}O(cXp4_Abq|
zf1@d1aY<C(xT6HTD?k_NMdtPjuPOhDA#Z9rz^|YSe=b~DW>7yhdH+U3Cj9B`6Q!GE
z8i<Z-cm5x3&=LR7MCD`toYRB+${y$!_dwrm+W)&*|HOQCUk~#5b20qspvAkQk`I~j
zi9Z+{^dBCJ(qrlPc@OsdVao5D6P16{v?s*{d$@zVQaOpIzwyN;CP9b0zVFHm%10*M
z2f9edX=XZ3FzK}hJ)8EJ`E9OA-)`7rYC6DEJ<#9mf&Oz3^xl1iKaZc++K#U0|7q=7
zfaEBT^B_UMPI5p%E}KVu1PcSPHamMSoiXvA?pD&FlXSj^EM2fiv%7P5D{dd&$LWp)
zM=3)k_z@sZQX*A0Bvdd|pe*umAdW$R6(Ch$Ikt;9#-IuzmDp7&Oenxq3g_#`zdQ5q
z%t??;-R;cu^#AU^yZ`>Dd*a6T0|3&zHm~iNd5r+a+}^9-$Z{>&ml9W30Zw%Ipv5;9
z6u)+@;2S=0gyEb;<mVa1zxZ2%{}FxqoWk#WRN#gW|ChpDou^ZZPw!*kQP;5na2l6e
z&y#U9bi1ALq33)(%W3{mQ|?jxqrW8hmni%<6Y{?!0sa)=as1&e#_w_d>R+VX52~P|
zS5b+s&i$sqU)6FiW4PXgPQwZCoq$uhy~>x&`tCNy?{QvGea!URT?#*-^X2!njISo-
z|G4IVyp&~3jsb|Pv$61u>;DfkyazFL1)pQ&0Dp(VC-(`Q>|%7xCgf8EoW{}7aZC%k
zb5{cX6M$1cWVIiRzKSJTY&|zHyvMmh`TUUPzbOHKo8rIuaF#I*ofH!AZ&mz9UKae<
zH2*(WxOr>D?1~;q$ma)&zftwT8#SMo0H^tD-XS#emwf1nuD+)A98-LHdxzlP`fsVv
z=M?@Pg`aUu;J>QyL4}{M_5TZnk1PDGX9VBO(^-Xo^Tz@&ua^Y#fYZ3tRPGyn=}#5^
zu^$QkX9V4OK;c=f|FXiLWVqghPR}X+p}8zWPdiN>o{x!7?X4-j8TeIz6J4b({!mr?
z+g_J^?vres-vONJKWWj`LyWITq0{rfEae_jJDbsCzpMFw`Vq<B^xI1b`TSJNwe{P6
z7<Q;WhkjS`nbopC4mh=Izqaew6#iL-AGg|jKj73q-d{>SHz@u?4A+~`>7NqdKhk`@
z_Lk&xp7#H%3HTRWK;?EHzn%b}1DxBd^kekK7tf@8y7!4b&H3P?9!iHBwWwPZKA`P2
z<8lw+G>@{XU!A4+U)6G*FJu`)vF7M_mhhoZ+Uw{SIe)<k|GtamZ;!KhUzWkX=5vw4
zd!H2eK25X<aBA<-1%a<qc~JwrS8>9p8#SMU4+%cU%mTt6Gdzqb*Y_sCzmov}p9FZ%
zMN-cbN{1I}yRv{&f9}`*oYX4p0G#OUmB+ITG4wyc{7R8Rr@Z2?YX6vVuPOZ7FGxOy
zzpX0#IUV2M(7Zm!@KDBjd57X>CnX=+x1i(E1pN13EcO4{ce9Mgv1o8L$Z!Tle)a->
z3HrfNeQB>g{Xzo%y^MbjTogtab5QYLWPI!&&1M<$xWZpoc>mo3zg_9%-O7KC&j{Si
z#|sqxQRV-)YCfA7t~a66NCJFO^LgY6$=}yB%L@PCmj(W33jZwNOb?dc_+`a^>VCoJ
zSa(Q6@z<Gee@x(~>9{<h@FAT?uI5j1+R3hE?@PYK_=*%doejdH{NMeo<Ug&?HYz-=
za?rHv#WN`Xi=A_}=0myQolxM@Jnbz?J`{7Bj+;0ixeotQe0VDNZk3yc9_~uO|69es
zL+PJmO(Da_6n^L%S<Zex|2obO0cSqx30%p+Ik#8J{hOZ%9B6!O>?L^j@nMGdIRE-s
zmLI;szj)32l6d^7URf8Mu*UZez^T1ck7PLqOmjYt^a}j@Y8T=dR(SHL!vE-K7I%2p
znt$=(>R!RWb8nU*_bB{Lg_pl5aE`T!XTO+G&tAZ39Bsb+A;t%Pw(5CxLO%Hf_}3HQ
zPiy{HsGf7X*8lqo-*-al&oPzo?1aMaTsOj)f#Sc+aJ>ng-v2Aw&tJ+i^n3cuRrrxl
z2;A_uWx$C}j^_pbkm5h3_!qn&@Xss!Ifm;^=(O%qDfjyC2>wl{$;0<4{NslO{;;OG
zoZ)&CI_**Xn_dz8ztd++fRjFRMD-R!54R}%tPRpX+ZF$*g#2GhfWMgl-|zvU=O-<C
z9tE7}`NwZcJ*Tz3GYm&upvwiW)p4F#FMRJ-&F6G|(9(QPd_(ZBQTS~NfBM@3zhCqJ
zio!Q3oy$|ixl;HO&kJN)^Z5tBX`O!5s{d<>|J2U}|F;yMUVVt;1Lrfm$N7ZelUkom
zfD=DIYRUJZgnV`bPV{!dmvaA8%Y8xN`xSnM!q0=CkE_pR4DWG*TO=Qj(Tg-L;JunQ
zeA=0iPoVky_0cS&8~%0!;Box(frNaHCBWauc9<ULob@A||2nPI6%5y#&}lpYey!#+
z_iC20Z_tbmB;bD<aH{{ypA`Hnw2UV-pZ&W9{=C9}n2^s~fYZGD54DFtBjMxH%Z2`*
zeJ#uPdj#wZGhA;%r`ZJfqULkA@)@(<{Ott%KLDKSbJ()89ZSIfw*>f^=n(23=jJTe
zl45GpaWmk(d=>pU40s$JKFaVO=R%biLz?W_gnWLIkk1)cN`G!Rk>wo5G@o-Bt~a66
zM#bOsdX}N4|F2T`l@ANt$k~5?Z+!iyfKPg9P5A@R_-JXl=_@541v`JD`ER&g;2e`0
z5ARd>(P@F7ru@Mv)SHcFYknTy2%dM%=#)1xK0V`k4m=`)g%UnASBEE{2mgdh0QU;7
zSoIc`s(F9OE8@-mhUd3doI<r+TMC*%F*P(WOmFYUAmJtbO38yyp1%r5gJylznXkh)
z#4EPSW%2?D-Fd)lMo{63xH8}L(m48q!2!OYn@xirJV^F>dAv|hpUy*FYks3qt)vRA
zqCX$FsY0#h%=^uJsWRp_eaD-d$l$Ay^mMjj+P9@d-`ksQmX?+oAuR}7{pM&33Ae)&
zfLvkbc1<^j@MvYlb=?eH07?bC^G|L}<Av#_>ypm~eZ>z>Qsaf(w2z;hoAGwSU!(%O
zR-rjQ3DEM4hi~~!H#2F}_@G}edg8ZGMPh#;Rch4zRA2i4r??$j-sY2(hmVI-&-9ZM
zLkS)%+(A=1{3A=!FYW+F$#0JF_cdq8g#%qO2(Ok_qdC}E1*Mb+i$N2<4{o{<e5!>}
zF8NCneYnnrP&ZTOqk#NGU(K(FpSYRYl3xfG;f4fPo{5a-uK6^x=F`44pZ2f$bYRV=
zgKIt=TJ!1VHJ=Wz@w5-0HVr>E{km;p_m+_f3|uSkjeBFcEwkIu+0)I5&7N1P(oC{E
zj*Oof-1@FdVdm$ZQmp{@AD;h{Q@NR$ecsmDU86JOyLWkBsz1GX*tsrP#iT6MgE9t{
z@Zo@0aK<M`;Se*vYkbCOmRp!UfE9fV?qac(lw1Gs5GH#}F#~DPVC(~zT$$ohxPu7~
zGMGNGDKf;I;<D-^52V-!NoDjWmC>J6#(*om<IfW-Y_6h<{6?Wv@)qH*7SwTja1pGa
zQNtXdKGq)fysc9sJ9A!c*BH36uxN7)Z!IskT@kjAO*!7!zFi|b$47w|VWMzl3IpJ(
zRLAsH+S$Ez>vV3$n;F?Mk)ski?rYiXwu$jAqh2P}m+F^s_r%N3^WZs#50S!k$#0gb
zl@4YmMa;zJ4DO)%j(NZj7OM5t+;VOZ4sop|I2$U_u#a86y6TWeyHKUqglC{fu373Z
z6)lK7${IDRbO7PqSPr77Otl_p80C8ht9AJF&CTqw7?_(8Mn<ixf;p)tmb&R#8LGP(
z*B)}B=#hsXTo0R7u>3lYA-UDTN!4pqi{u<u;)ZC;2wr4nO5AZrl#{l(q+zMU%r?1N
z?Jn#m@5&s67TC6NFzeK@Gi1yK&@1@sJf?Bdnv@#i9$0t8p*9wYs#}?FVKk%_VcL~8
z!VT2%f}+UTUEz<L@;9C3^QJLvb^D{u%JjkQ*9T$9D3JUtmWar5B~#<;gL;+S2&Li%
ztPO!kCP=BOPrKw)bePA|=tK2|lw^${AoAF@r;CiyDw1B^G^}08&Z2F4X2(A9ylm70
zC|K`AhZu$m;n(1xTg*-FD3o<@_U`pCz^ROD+D>kuP}2gx9uys_<+ea&bv6tCXM)t5
zsYW&Bx~VjIUvj6%4mGAFT!705ikVik(1fdEP{!y-QP~%gJSjc$B~>p~>AhlLcmx${
z)QCMkh_AaM-W<M1t@8k63@PT#4KW7_>XT5_7X77pd<8so$&K5|MZ{&OBPbjW+L{mT
zZ~@iCm=zYY+0ngwN4AW6ZmQ3891}Lw8R0S!vMvVj<Fskul&0ob1~i3!tJb33hJIfP
zEPkb=L-h!lPr$?3IH1~vD?7+!3_FcZ9VX>&ZemioVeZzZ;-PZTXyAkETLxIS#Vn(7
zn+ApW<uQ0YgCi$&G_g0ZHn@JXzb$n(j|;KbV5Ja%4U=f-?sIacCPt4ow@Srm7sbxI
zwZw<^Ur=FRU8;||T6=SDy0bJ0kq`)qXlkDM3D^XCUwiq^bnPB>i)FaPcA;ivuvkE8
z)q@^}b)mT^EDQ5g+a%Lexe>`S?%N3;U#wr;!E7<EYs*0yGtlZ5EOn}w4!jr%kqp}F
z;cM_13N59u7(mnzzW^WP;prk_Lk~$*M8pfBlB5Vk<_M83M*u)DCP>tAOQN3AY{^m8
z8>^K92@>SeB@Rzg;Vuf!xo)d+U8Q=kLIVOv@)!ozo?7iH_LJCl`#eMvs02$MrbMMV
zz6+G%Vr7KI3TPW(EZf|{Nxj=zt&eojVm{;E`K~*E&0RSfzS-`kMfiINLoj}_ue;M)
zs=3-hi+<(329__MdC%pRa=Gbk9)6c+(%y^<eeRveoH5XMgn~)XEH^4b5mCYn(IS@A
zq{@<=fOx?|i<Y-_tT+&(A;Jd0halDGqZxS~*6u+a5gV|Ej6OmcG)<8`{QC>a`;BfI
z(qxDjDY`&ZEdW<_%GY(Pgy0VuR+A3~rG>?2W3n(wvS!+IPz(ibXbbXUiRxQHGy|e>
zTWdTCb<(m8X<V`^tgQI?(z5H4Az=y4^(a&ZRP*_2F4957TioF`Z>cW8*ubp5i?uWw
z1;4u|sr%>aK@uB|uJB>$5F1FY0$$uLqb1-*WY&{{5ToTMsz=^jKNh4)@hav|#++jr
zHi@R4K^fr`;%X{%0yU5WnuI7*6mTQ*F`R8#0YTKPc8Q=aK4!)^UW76OBWc*Gum*?$
zpSBpGt>xy7l9r>T#x7{`aSfv8j`1R4ThJV7!rYW^HDfW~Hj^h2CjDivN@Nz3-d#~;
z9|DPc+)O#QpLlAaX-(5^&BJcBn}%hmwCqD#Hgi)un7O7^he05Yj)t$&oTjc0V-5V+
zsWEPpIdqr0_F%2GzDSrDaYLikVx@t5(3=}*ksc)rvbsk6=+u=KWHE$cDw`mTg-%2p
z8)XI<SDEBKNXXW#9ciL0RTnz7RBj4(%aWb+XwolA<yKkQx-iGC6lSeUeF54*KvyLm
z>C=wAwasqg8q#LDGLu9p??jI>lSbymWF1fH%~H^a%L+4F7cd&_D8vssmFzHi(E^uC
z!9hcE)B}tOP2R5|zD^(w>!j(7zDy;F<I`?LM`&2wpsjQ0=(}<2d)DSVdGX+ft0pT;
zSRMj|Lz*itO-5xlV0~RocQc~$lo6Mw4q<1OSFRViLAp#=x&glk$v!zhxi}5QnSP0u
zZEf$Qv>Xv4u!E7#XxlzaCh@G+5H1W_jcQ?d*s}%TP`HcOVxi%cq7;jA-+{@66eFr~
zG^uOLJ?a}<0Ic0CRub&)j4nFbLWb87m{qJh<jDrtX&S{)f_8-|P2q+*a5mo=;~lAl
zZeJZ!6lOBfD`>&0*t%vR(F36~;WwHrqT;3)S)RgCkHY13_fo{T#w+ADaYLVg+PxYX
zI<qDxQBC$4id5E3Ezv&eKe*^OdG#{0b5w1n+7NC8_$XFIaZOR?5N10vr6amu60Ih1
z*{+3&<6X1t8I#B;X@J;O)1t9|MA|Z`Rvb#J)0u2jjcm$n#R5_m*WKz`M`xJEk_wB{
z3M_MRO(q4=Dx^)>Z3gA`%lfv;ODPydnNx9RLKk&HG1jyhhG}9mj@Tcy4#LE&_=zbK
z)tN@KeU}*-nM-0c3O56Nzu*}aeSRAHd5CinMRXDKXohu*x4m7Li_8EuuA%ZQe<J1q
zUf<9{1R7tk0^1&TKP?5`Qu)bJ&?ucm>o+`YxtpoQaFq5KttCW=v0$zT)jIgM*f=IJ
zY`o{gvEwLw+7e87u)A?j=WAF6d8Lkw<JPlSln{SWk|AoFguQfRZB8LnVC10|uaixO
zpM&s4Ggmw!DPGpGFjnJzSR$GBK=UV*pbjobhG=3>w9^3@(?ul8Dk4L!#M&`0^PrpD
ziAh1`u*@fWOE)t*r4i|73ecr}%iJUV)j)hpveh=LmZmL_X=F_0sE<S;$gjfuwlYth
z(4NlG4cbh+Xl3EnX35-485#|AmpC?JwlwM<+s&>03?uwb!Y6Yj%W@MY0CYL?N=!)V
z7qQxfn~%7qFn^=f?q+^1pqO8%wSX4*l~!$XvuT-F|4hR0ju054Z!ZPCJXYyKMxn}3
zU9PA8oo;R}RT~^xuzwuedZV`GM3CP0ivwv`{p$g5i08`XSi}E(P_-;59L>^eECm5m
zO$N4m&m#fh6&fwx03;QjCTy#*I127H3L{Kup@p!CY^d#)CEDZ=Enp>&##k%)wRtr0
zlP;!h5y&acoV8YNG9$1YC}3&n(MDBttuRU}k~k?S3Bx8}qJlPg7_*P`TeWsd-DXHl
z?E&q5s4k^2s=cVCD5_kZQqLeIizHZmV|F^wAd1ZzoxgaSBgw!UW7!$dt%I1YuZT_$
zHiW7ytgOHSUB{a^{t^tA-f}u3977XHSe7gonP<sxgd}ccG;jk8py2RMIqdH9#NcE0
z>ei?-GVpkJ#vxr~!ECBwqr0>GZPg$_g2=8wC#0?PHfu(amUTF8d)WT3R%NWxNJ0<I
zlgd}E!yp~C9qTGCmn(|(cf_*(Mybg#;sgmmTcLRpe^FZL3>X5I&g{hWH!n{?&8Anw
zW>`u0W-AA2BYX<``n>->I*MJ&w3$mR#+gUAMTNDHI!Wz#Q;mbjMytns`0cV-+E*3k
zpPMQh3g~RBh8QQkuc7lxOxk_{Myxt?=g`hp<5Tsf#g5xtfaNsAc(G+|2cqH@A6(O5
zR#J=E09(OGz1EF!On{h8Q)YnEYi$F^EaB}6Aog?n^^Cj+D*#p~v}hqQvXvJQ-BJVk
zOfgoUAqr?~P*77G#Ow}2t5ja-BpcY{iI#XD3ptk3&}mE71}jxnY{nzB$rL99V&*Mg
zM+<HF@IFu<-q6dD&GK!|$>k|FbPfh90ln6PrBtoDR4S~F&v?0<*{AUq%6K4c@T7Y&
z<F|#(zOh!W&eRlBm70TSmd4J}`I!9zT9naNKdsB=nIlK%SDOMB!xRuIImE?5iZkO~
z%C_dGrUjm_QN5{a$H_2_$nrbhdgVB&#%dX%nsJ7%%;!aOg)rGcz2>AUFnXo1^}h*Y
zsCS5IpQ0em#ZCCsLyGPeeMIO^6;~_BPtJ%D&QF#h%`kA08xPO0yX8{?y{h4LrWEZd
zIVoJG7OJ?wUJ`zpiKOb-;KzHYsme0FONqCJ=u!-#(~v_@U7S>~sBgnA7Eyq?L&*i~
z0r9Rn-V`O92p1$T%1Zv``al5*Q7OM%Dj;38NtFSFi-tbyr0~kF6jluKtp%j>D+`#^
zDY>QnD0A)S^L3cC^$Fe@M7e^69yXUyF3hFoWZp-9r{@cikX}(X@8fS?m&F}EJ*9w)
z_^$I~US~FtOL1`~{!RLsby?gwh`-9Gu6M{!7jXO6#Z@?cb6u97H8nllGa`qbp570C
z4c|I6<sV*`W&FbmxR&qmIR`zTrDt>6g^QoZze)ebx-9PKJCJ;G_R>QfKZk#6yGd`}
z52yFd>6Lkt-h5AT60g+LVZI+Jcan#>rt>H9Pv34d=}r9D&+7~8uPCyqzri!->+qbu
z1!?Y0oY_P0|E7aJHAM&2pZ-n#2}Z~D_@_QK=}kP^Big`w2nmNN-=sI~zY)(U{T4#P
zVdB>Ag`YDW=KGhFo@mWZ|NDSZ6{&7??7uXtXFeekYxJP9!pF_H38z1#=?@WM;y7wD
zmRopC$YIWZh}&>_6E8Q5_xR~B^_9Es`!C@^IK7FhJE`e^`p+h%oK3j~dY6^n#NVA*
zPq}siGxH68?DpRa7-5?6+rNH<QTJ>52RHF;`=_^kB^M9i6i#pA`M%ec^6z{pn!xUl
zzqitxxW5m+M`~o~#N=Y9f7(iK;sYPo^w%1~(+uqP|G-Lb;s~cNlJb9nh%R*4<x}&*
z?Kkm;KQbigkA>3GrCt8(fYDSZ9zchQOMK7Or`CQNrn8aGq&MHMJ*w&L`r7wrBR$pM
zq&IPnpFG9%6o{Uth)HkaAs^TDz0!>K{50hoIzEiF;qpz~<d;7x2@M|5_%P|s`C_a7
z?fA-BNnbM+OGtm2mEQO}*G46Muck96Q?9xH5YiLO6l3B!-m2;8i)G=XH++L@nkM*j
z(iYsLAF$wrONTjAWO{QgPvrNQmcKs?#I=n(o{)aym{dGxHP}u+n~?s%grqnA^QIbh
z`a(kb^}8kg=~n;Q>8qOF?!PlXCJFB~1EUo%>CO3iO>gRF&PP-MdEJ_@CY`|}|6ChJ
z&%Uxx%D>Yf>e<FU7jUbJjQPF$CH)tz1lBE=ZFLc+-##bMCoh+R>2v`9W;hLBeHUPH
S<)8BzNq<b`kmS<c*8c@<49|W5

diff --git a/legacy/dsaX_wrangle b/legacy/dsaX_wrangle
deleted file mode 100755
index f839b14c334758201c3b8885fb58a899eb6e804d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 99600
zcmeEvdtg-6@&C<>gaFxypixm*1q})q)L2l^L|NTvz!VAMgNBf7NHipAvVmZw21Af_
z-Ab)m>J#guzG_h`B1H)ZBwDMnT8&CID(bGG8WlBKbbp`EoO2%=vS{1yAHP3b*gf~m
z%$YMYXU?40y*F2S3eQSRNHEM#l5v{BT-%`vlBW@wSJ*Up#w25~k!OrHjxq)TpNy|b
z*FWvk#mqDfwEQID%W{(vncF)ZrND-nnzb`c(YS$<`~Ie&W@;F20k9lpP1_+^q;9z|
z(<};uPnN~pEqRK^a#^}umM&+ehL+z<P5sD@-&q>pKb-~~KW0i=ev01rm-Bk{ddcpU
zj?jc=YL@#c%2CcA`pMLUGj)CO?9#`E!pBU_b~wJeYW~S59ba8OzPhTWv1xqMq?5;=
zeA0x5+6gC!-sDf(vu6~uxU{!vD#|=U!-ya9MfW=oWcI!=dG3h5509Gh@C(akf5B>}
z;A^&<yOo&{iG-bo@9FqvAA0SmZSOT7GWoOMsK<W))o_5~emtmrAcE84z;}YvK=g%i
z;QImY#<%~c3JMIw=W`Ga1ivm0{hgp^9sNIt#-V>6gah&aTO9hiap=?H(7zG~zA6rU
zS{(S<aq2xRPP@DthffgnZt&{=IUo-G$~f?aU@(x~M#rJQDGvPcIDDRt1OF%vpReQ4
zFN*`eI}V@1IPhEJ@cCyP_$_hZ_r-xf7^mKo<FxOEap<?j;s2*N@XO=CZ;Qh}Hx4`}
zj-0=V!~a)t;BUs^-w+2r9(>&R_Wy9G9>_mO#*uSg9QscIKe9h_<fq1=&j9~{;?E;-
z=s6w^B<C{F?`LEg?ryi#ZjP6`kQ;7f7`-<N*f;}p{A}0sdHv`&X?WH<Zju=$9|Pl3
zoxif)S6a@&D90!%m^Hnm+*j{gP}Sh~)z6xKW_4|iZ&vC2YM;Ulh+%o7q{LTWUsGFB
zU0YV_ud1yvsvCU1%UDAhd7lh|27g^y-I9{B%FB#~%7qJje!%MLt7`le0`oOh`Hcnz
zFe+;6d^JXGoxiH4+^7TV+Iki$Wtpmm+Oo-$OBx!=N^2^N2H(Omf3>hLExU}IN-9dL
zs!2_FU8BFOvJ{1V)xI+DuUuH}Hz08}1T`A`_2rFqC|lal;H&rRg8q7cEz0}rt096_
z-6CBd1*@)IKs^34DQl=BUBi+Fz(p142GNOw0t<b{qJ|o+S4l~QL*)j4d2OTLSO^vB
zmKYUf)wK;iW1%G*ENQU(f`!KX%YF5=sHDulq|R3|pJt|YO6w^ITCCncosCuHMuoCU
z#p3!Zzt7N>84E8HIZ(c-w4|b{rnI{1ax`g)Hn#{3cH{%0mAOkw*m!8QQdnkTX;lqP
zU0Y{B&$3!&c(!3hz0YTyU086&nI$JoIAKDrb$z0B^%Lvr#0e)^*Awl_lOW21g|#)h
zvJ&`cqWt5`MqTkY8Q&xgH?N3K!K^bCDZl(T2;XE)X_xMQ#TlRzmL}yA0Mp$GDgPM;
z=Vxww8xO7;%-PvF=&$@FphOu`bAGsgRn;)|uGzp7cA(@_jd?onKZxZA8JFt(4Zpq#
zLZldrbbf@;CmYLjUe%Xq)o0GX&HmeV7^Og6=KNe`jb;r`>vjvoq&M?f{oqaORKd=-
z6+JCuP3>{r<$&+6NaeTN0UzmrhaK<}9q?WU{3i~0p96lH18&AK%Ho8Z@sM!Nam`Q8
zTW+~b#V^~<{7iDdVOr~x=YZ>}yWqSIxVdH^qR0Vvt{>(&;H=yH%yYoaHXyFj0Z%uH
zk=8ljLmcoX2Yjdl-t2(G@vP5E2Ry@q8pbLI+~t6;cEC+vCVGtn?i|nVbHJVB)>;Rg
zYZmjf-T^<r1R`y7zz=l5I~?#V2fWh(Kga>^a=;IEz`GsrLmcq113t<D?{&Zrb-?=^
z@WUK%L;Ee;{BQ?6%>j2i;F%8i5e|5k1Ae3f?smYBa=^14@X-!<jsrf%0iWc6AMJqW
zIpD`Q;9dtj+W{|fz{fh^a~$wv9q@S$_;C(+r2{_B0k3nwIj1u}O%C`36Nt3g0YBaW
zU+IA7IN+-s@QDuiY6tuT2YihKev$)zp97xjfUkAHPj<l9JK&QX@HPkh6bHP+0YB9N
z_qOax^9ECj|K>Klt!@6qXs5SjYuYmwMKt&Fzq*a+m@Dw>Iy?_o%vTa0>5k$vrk;6*
z#z?2+FJ+z~G14aa`OGsEM%GIHBIX$aBWol-lX-^S$STR7!#qP_q*?N3FwgD~sgwLE
z%ro>w=1Kkp<}n5uks`?-%X~WXd6GYpd4|47j^qz!o*^&dmi%z$8R{aLl22!zAueJ_
zK9PBbwn*=H0F3$OIpi7ABHfbzjCqE#NT=jKVxA!^(kA(LndcA}Su6QBm}e-9tdacR
zndeX!Sta@Bm}f|fG)sO9^9*H?I>~Qfo*^tUPx6m4&(IYqlKcbAGh{{bB>#Kn8LA>V
zlE0aGhNy^J^4BxZ&=kp({58xU#=If<tC(jfiu8U<{V!+U&3w1y>zQZhiF8W-Qsx<Q
zB5jhN&pbm-WUb^cVxA!;vPSYVnP+H;tdjgW%rm4!nk9b*^9&`CI?12HJVQuip5#wp
zo}nXBB>7{RKbHAC$sfu5am?pP{$S=AA|h_d4`-gCA(AQibmkcnB8KD>nP(`7^zM=N
ze-?R$fJnFGKVzO<Khi1rkC<ndkF-htUFJDuiLABCSFA!_KKU2q&)(3C%lzW}S^0rI
z#pioNQ#UR*3~%6zr-uOU4IMgX1(IdE`jGQXjOIT@w=DMtlgDHeA6Py*jC5jKxEt*i
z?m;Eq(8v{F5!n1R`{l&8P3&HUA^+%0!h4{TH#BYZy!L6MFL~nu!$3OkjSb{8ZFE}2
zDir-#YX5jz;KT4~;Mmd^buFI^MlIp8mcGoz|MZq6w|En}qTP%KpuZdRQqd;%+KFxX
zTpJqU>rqqBT5oXbL>SQ93MQ{6qnhBX(P`dL?ja}ykq-q?OJ9P2r8hWiFzG@glLey~
zGHiqkK(n4KdnQA^;M8|uP~o=}{0ak~^{jOr4!*-)>R|b)wC0AbAQu+lAy8prn>To^
zSgG85;!rW0tMyb!>s_{$R=s#`{w4Wy^M96K@(cvXUuNpVs57yxJ>?E?7M-4kaU-vT
zV}<K*h=~v6`Kc+-!w`2vI?H?+-udrnG(WH{@(_qUt^aZbyg-8a_7;=5GFJF9u!JE3
z6^0Q+19=0_cmwU>Z;|x|-VXa07)EOw%01s~7_OCJ#8ralR%`G-1^g1g;Nej`<`o3C
zlE;(C7NQYzOdefAd^d;*-ms87Tq{H58T>`27yeP{g*#M210y^eT>>M5H?R%0kA>8(
z<zuM=D8v6k_YkRG5RUr+UJ$%ubY^%d;sy%2mY1Sw*wmHYz{iy3{coaCA@M@V%J7j#
zXa?^KgHDPN8kuGpeh?&pLg&GVs4y)&216N(rxgUY(@`_S`vZa^DJW79Xe$hSDy>ai
zmUcrAS|I}rHW|h<2KA}~QzIONB%6%O6Hsg$$@9WX00FCcSAk<-oA5!Gp$Op@1T}4R
z)3m@9qfY{lH1hBSzQQF#fH6Jr0hwg8OV#np&8u==dA*_Kqnkm)%%q;57DE24g5WRH
z3IiV(2EGZOQ2@~=x4W+HNI>73dfIk3Y}|T3TPIi!X3$&pq!v`0P#E|iJaIa3r<RY-
zbY109O@X0%7Al8L-QkLJAz5n_jqh51kFf07CM+kDCF}e!{7Y!nGWBHzPX@dNY9+W@
zIg$pUQfu3^(7XiK^7TNW<q89zgpWf@P78ET3zUz}X=!VAwVni8FLarNUVID8kB}68
z^=mY5VC%N8FQBhp4XBtARZ7R*dgtXAO?%2HXlYLn`xh*WHvbH3-xn!UdS5hZnlthm
zx(hhW{w3RAcx-~EM%aT2gjM7O^p|On>sd5sYg-fSJ+ic!B1RNBLmgaa1PVm1Rk83G
zC!Xwp-;G*(*xQ?~G&`imya5cNNy}C@9BgX!6*1A|DNBwA=$oOD4Jh8T2^Y(Gu?`o{
zLH6l^-L~+kZ3TFu1=)3j5Kw7SsW^O#f~KzGbQe;9Q0@zWLx4|^gclLi+Ex%~cP+0#
z+p^i=U%62r3j>TFqrYMe(eTDEqfu!F)k4EYL7ENa&{^b>ino1p5nMDppB8#ehJ%qE
z;G+aqAus$7u#3E{khk?mt|#&VAcKXB<s)zDN8p*bGkgPRv^#_>G}TiMu?!Azj}fl1
zsB$j_l?)tCf`XQ<3E{CI+QhILzT*qHS<9aE#(N@Ns%ab1Jhr7HzmZ_yKCKnhuu@A~
zf_JKT+Ai(Qb74F>8xkXODS5z7dEukMxG?Y~+ouO^FtIK2S6BzCT?6yk{9$L>H?kDf
zgzHhJV%5Zd6$D=TnK$sRw`FH<(X7Hy${S1AZo|qXnK}wB7ajTWX1B51b@()C(Xm_H
zMzJ@Pas*QE<UViUeg8q2f^Pv0r{KfpM!PE}nB&XbXE@5*d3Z#GgrVGbkraE!euwhj
z&EF+?1HInO;nTed+r2M+=g$JefMz(94CC=@p1K)iM&t3`mZ>GYDyBR`F|1vRTzEQc
zANE3p?J2)QK7ryi8~A+w3|K-RlFyA)fI{U9Lmb~*cD7+25UuYcno}IkZVCe5=Fi0R
z<_?afo{4|=2Ij734s%&=ps{2Ar{8*ONA6mR2J&oW6#^K42wDJ?7~`7>3~cd+ya^NA
zOnoMA@n=grPwr^^+}qN3x+gSictT^>w!Xy|=U-$F>vIc2^ZTC351o`)Sk}1J8(i8E
z8nq}tu%lq}2Z{NC*Yh{;99)?2RsQC8Qm2>gI4`vEHVQQ(lzbbjESwxYH{?$-8b%hP
zAh18dejA|QxvqK}Xb*)k=09?UH#2#K$oz)<zB~WXrQXGttVj^lv=xbxFJF-)*e&xv
z^LZC<C2r2pQ~_<%pilkYIc-Gjmz-MA(w0}Stgj70qw!V`1{7X-Lcgn%ATYOMdRg09
zp`o`STpRhD-%Bjm+?_bR?9FpR8&4#mh+UZESwFq(+390nrzdUuCNtIB(w@BSn>mBF
zeY4-$dk~xFB>&3WvOQC>g_E~8zPZe^wHXye?t-6helHCi)=mo*9s<^i4cZ-i25CNL
z?}^9g4U^GDU}XBo)j7t{q=LYw-ev7M$-|PsXxL)(;)0O3voP?2H?+(!cJJ{9K2!8S
z8{QNnEfKwW3q+jLZ0z|iy6u|_H>1)sdVy|E?)w&iInBuq#BT58&5h4^TQ+Awv$O&@
z$GtG6yI|R7wrt_#_QvZzm!5_BcHmoY0IstuE&NECj42m;F}|#sOTT+NT;>zlfm{|N
zfAQSN3Sc1Ia{2~3;>aTwA#x?*S^@=P#OluwJ1p`GNN)9K(7)8!FQ!z2z9*>k<vjRl
zHv|7?AhYMOkuyJ!M%kMnXpT3qG#qH$=?xTj1*Ug<gT-CmK&KaN?@j3Rrew<C?dkKj
zEDalt;}Hq2dkP$bP-pXo!%aC+L)tG8Q+rVPdUW4t<S$ew=n12PB?pVawy`hh*@?;9
zuQAkm(ea226uBi0=D8898jl6{i8l8`9o(P09Na@D_nvtWr^{%}v`Po_vb_OMmz58#
z62Vm^lN7j!X_q&_tUHh3c2uGzZO`-adOd%heTL>aANH99`*Z{vw+4#a+C4jn^R|1!
zOxr!(D%r^d5_Kp8)ogS_burLkKuANvK*vyp=RP$J^fNRv1nBTm`2dUoVS+}mc<a;b
ze=yW1A4j8Nr;b1ore>awU|w2l+mds_Ujh*%#UH`Ui#aSy!W1cA{6#7bPTz_ikss)I
zlAI9S|HjcH&@PIFSAK#Sl4l1SNeVXivAQ;CaPWK*Xw^I&XdW7v5@xb_4#z<B033D0
z6^iZcYEW8(B>XO9YWIjCA~zFFnTmG?r>#ncM}fu6%;5B$-p%hMdPAeqsU;!`br?)7
zlgw&R#Jl-jdR`Fou_fn)(@kAB_u9IG?=137N$#&OnL^<mO%8>BE24?QEuNhTw$hV5
zJ6%_uuk8ROvta{_eC^@KQNp%CwiYhiw86bCu)(#erkhz)XzE%d;kW*!OzLEHX$-3z
z%__%abun4}O0hamv1%l%-^8*?q*RB;u)>%uEt6%k$|S2*idC{=HHECaV5L1Z@-or7
z96KdV9U7{8QJpDWo{3^B=+?^)E(Q6ANoc)O(0UN(S*#kpR?B6pGQbNv`b>@qVuwrc
z1`<s>TJTQx5;)eW;iGnGmwrQ(+>Ex(e$tJ~(IL9gpe>78nfBoAAl4q-lhxwsP9P2Z
zP3%5-i|eX`(56Ap4n)mK7tfWBjDCzxwzMzMxEpZ}foD3_AfDdf(q3;+VqQ21OwiMp
z|0Fv`egG4xcCL?gAVB#dl<x{OzJl`T8PnTPzH?e&`bI>9UeGZr^f7Js>{^F;Z*cl9
znl6X}3_9R;EtY@qX@=0k^`aX?=n*JlMu=^Q{o(u2UMhrMmJF^e%m~3w6eyxEn)x>5
z*<_Lr7Iy|#v3?WK-i%RB_8d(Cd#*HbVAhKP(Cuwc?xe{5;MsBDIav30%9^!bfl2)k
z64iTU_~6LD-AM4ZczW@(v^UkYVi8*>rSu6*06oe$&@Wj0N+8oTVb2l4l>AscMD7bt
ze<e_ee&6EhBd@;HMO~I|laxpj;#u={v_Q(Y>zsAuiP{ofwgf$|z#m1Zpr_Y+s;8s=
zX-ld0yu28zwC7FgmoIYW^{`<Hf1bq%GQ-U2{j4L6NyGv15To0ttx|TjReeCXw&!JA
zRh#wF1KRU!k=pa-m^AHqT#HBH9O90iE=-BwiAP{LqCIiT<CZ77QOfj0+=yU=59={K
z@g1}x>&FQ6BzbQ@-guf<;P!GA=QQHiew!QjI8AO-$*uDvliP1JH~Mu@h9B1^gaVtr
zlmq7_Hs^zF&fHBkIqy%-i!9FESTx%ql4)0yN%ye^7Nr6WY^so{DjJ8b`N*G@L0BM^
z`(6VWnsH}F*wo-9l+~>qmo-fU9UiX~>x>2K*)gm;oUA|pP_yn?vJbvw`80w8`OdZZ
ze)pJD<h4$|w^)3SkZu$7bb2Ry`dnANhdDHBYL~6Z_26koIz~#)2-`h-)}hK^@tz>2
zhX}BNmo|`)F0={jRWv$hY**^)5zLMR)@+(O$Ps?iG+9px8Ng4)e}mKaFuQ@ysa*0`
zSP5%>$J_jlaPaezU(n*`B7?Znn@)f*p=YhHKdOubDMvfF9!##eCf7(Q^fk-&%n*?|
zFHqi-#E2#%mVZJV6fD#9zGyNOvx4{Q!8{91ph6Z6AfX%_(FpZxEGAf}!;ka`1|B<i
z>Y0Sy-&vH?28$5$_V(~!Y{DF9MW-YoCj}N^774Gn39}r8oYz@|nIx>T2{RppoS*yM
znmrt(J-iBpXff(JhI;0FV9D$@3)+)w9f<~TG}GMep50Q+YO!M;v0BXO2VFl$Wi6;o
z4U+j7<tUa+j+-#J%BA?J?%h&n_$9kp)|10B<VoC~mqsttwf8L6DrN?YcYDkJZp932
z4o;1O+Pf~FT~{IN8ma3-%f#|+mjSxPvs-4RK)Zu7hzC8pF^{>r4uYBH>G_4Op3_kO
z!B)oMnzpsY1)L!d;-9RGWW~fnr|{wFc9T9>2f63c7i0)hZiV(&uIsw0mZB-B2o&SK
zW8T%sSohuAEBCx(5$@Y7U$zM?uMFlTLoIYpws>-kBFo>ieDm8{C~^bxB5iw~?VEYY
zZ1`A@L>E}v2g<&n!Gk2u)z&;bpc`%`-#y@~-0(X1u9V{l2@xf<nZwH?^JNTRu$*ag
z%Rvz{Sl&Wz51ZV2rjeDnLmpbY5gTZpzINO(hE3sgb&bWN)JGzGi(0x<)rtiZ*P^00
zO0nC11uo0gQhWO!EzQO_WI3C%9BIjNQH)xMkJhO9MyaLNFKkvPI9QD)tJmNSVx%6%
zZ0JEAP6-_*PjcPRX3f}R8Y9V1^X+ff<J^1LU4wdR``1HzIqPZJ6~i%EIBtH(GD5CH
z@-5_eXn&4~H8C79PC_L{T!ci-BRQ80${duiIo`2bxRnxMTN0x>PXn7!@ICCtW(kE?
zC9u_OSs{<>4s2<H57*Ip`p??*HiQ`=^zxe|^xj1OWb-mv#;Y#bBY5dSrEe(rK&S|h
z5DSIt-!?t@6%|k^&Lj*|or_LVLV72`{=d<sF{8xn=vy56SsZdK4%aIV5V}4556&~d
zrQuW&9gt@T32=pXRt&SSli7Q3nc{eCadN<@!xAS49Fi>#9T4X&Cx<&N4oTpkZG(Ly
zWk7`CozC#Cri6?KW}p4`XX4z)p_bbp5R<V;DEEG6B~z?QZiiFr-GyE=hVPubW$|^&
zc#aMVvlJC6F(mEvhR#73hCu@(bo*NHY{0SX6F3`0qQMBQ1DF)|ZeO0rB^F~r7hI2w
z)zU7QOvelNn5_p^m~es8@P73vT!}8p5@=PpXF{iIU1rOcgq4|y_hF7ks=CRZuq=#_
zE6P(Pyt<Cf5IVWTa`7;+?YQ^UlqJuN1q!pK?JH$6=AR3~u%7TkCD|4#dscRa5;}s0
zw1l|@)`Z<oJu5uuem9hayKUaUDWn-V&%rK4xY2Beo`VN!gR7lEKzCW}pLzDQEZvjn
zABx{({~-J(HKxjOQBR5#uzFw)dmxtesFst%UP+Z7xnW~vNqF|Ha36$L4jPx;XGuHW
zA?@>*LE83z!nesI@>8@N&BpGU-1t*vU&eE99<Nnvsrm6IINpOL$lGP}KHkB54S8Q?
z@%{oRFoAIB#+(WS&&hT)e{W<J2((f%Iq#CjDBcyPTd*ae;|CX8EpqiA2Gr3@Io`u|
zo9Nh6*W%elf9f-P#V+)URZ3NBwNB^zh|VWgb$~Hcxmst#u9hmHBH@Nk1hpg>vXkJn
z>RmlOS39?QV;f320w%Aa!)_cl>UMhO-T`$OeJOM}oH{ThL^df~q7#TMk;d4vZ@Z_*
z=<(q<!89l*Tb4o14b2Th!RKjZNHI=$%A0J}#$+<kcIs6t;>n&}^?zwz+G8{h``ne8
zaykdn!=~YmLV)}?T3Q9I9Uxd?I+nQazzQAGPhO>}55A$>p`TSaoS2?$?^{7U@kf??
zagSxB+wZZAbSyQHcoUj>9Tv^uZ&-mC=|mm%W$bA@%pqbP_ByctSCpr`KHVH1>`_zo
z0RM0lqLCTsJ6$cKP?2fS`A&m6=H-sSO|c;Ov<t;U*;C%X+Y;QlX)uU_=Ual~?hV`c
zdiqT#4XG-1yi@THv2`=T!K{ung|s3yoRJaSLgW=WqO%!Ki@eHvd-V`NUG15hH88Z}
z4u9PcFp;^<)3UVFkeQ(Pa4C=b%_vR(@8^<QJob^8=x!FV1<5l_T%yWwi84SCpmoi#
z{LTed5V-0t%fw?HzVY$`nE0(O?Hdfs5(KQFQG$TDgLceiyVg+Wz*CiQrgDrd`gSx=
zA8wiQGy@0EY$cNZrDo=X?7H2+n|^sU>z;4c9r;w;O%=_Xhx^ZDR{z}ZEx9K;$~Lg<
z(N@{Von@U9d+lpovn^j+VlzC(!SEt7{Kso%Cn*QRcF%GUm`UVWTn9bNu@|^)EjyCd
zHu7u0waRc4Y>+lHGIKjIIIA;_C4wm7oxG)?VtebAC<u-#$++jBpl4-pR_m&ur+M<0
z##Dt(Xz{E}0OC~7^7^-%pOONN*zgj=#Hk=KYuj4Xi@E|}*Wzi0Bir=4tDp|o)jUOM
z@hp$Q%g)|p&vMsQhXQM=b+xG5>|Gt4*19U-xz^?gzq{62Z$HVCRkeuXG#>rBx>#&I
zIsDbj$_a+}PeC6~xYKG#XCF_mfZgZ5YTBJ+H`y!k14Z*3$^^wL8b{fZSgUZ+Z3d**
zI8DlxNMMas?-VdMoo|Z|^}hcP)A6yLU>VFCyixpj<~-Gqqj5NvhUkw~k<hdoJiY1}
zW@T|-fYSr|mp=dw(3xf~{8TmO+K+>Qb$6maCA>durh8hy2~dhmfCk9~C^gOmDAQaO
zb6|J^=Fp>O>m8O2MmucqT{&#<&MT(v`Oq%I5R8ZQ`XTx<z*E>haXjTHTk09IPq0}!
zr+F`v<*zK3&xmDMTuvy-+PhjGS#t`>1vtB?xmlWXz7?=uy4{l5IeYp&D;RH9(Bo55
z>Mk#=_yhIdpJ}mLVY70^zN^XVT^L-(MOowNMK75SxDl3TuJHoJ&ttj_3v=H`O*HlJ
zjIt8=KhI$O{&Y2#?!jzAFlsD?K@`>S@7PGL<-bt2$Hx7<fM;jW^L+e?H#T=!uNmXN
z{i*25Ho5&a%Vftnn)$^t==sgxp(lVw<mXxx#^)4Q>-&IFU%k-cX@}EYDW(jbSTiU;
zm-!OHoUF@Qw#XTXjS*TNnz+5e5uL&q$H`bXv}{gy47(N?rbDs->is^QrERGI+IFz5
z^8>e9Iy)!uL#Xp0OXsCBJ~)&_a7Lf>V&N6J6P)`k$#HL_K9u+&W=T^P?9$6Rr}8wO
zSGD`z8q+xTaxKrRZfM%#T5*GJ5_Gye%tSbUP>i&z^(;~>W$x1FTBnwl%iH{1nb;C*
z9pQxBV2FWZU{Wk((gcYySqe_WWC`OCH?h#zgCRuE(VWxPvY(tYS-DA!z!*_h{G5G&
z0tQ7>XoTGpxpjk=c^vnSWc9kB!jF~x7B#nV*a$X2izsfmj>V&-7v2)HI1Xc}+KGFV
zo0Y}7P-77nQ4X8TWaXB(szF)RFo5o36vk?+Ny<%*^I#^(IEa~JW)C`u1jHO~3|tk3
z^1<S;waUo^l*gRbv|feiKw~c=Kfazm=i>1uOcwV=iE!yv%gkz<lH)|6u^YT#IoaQW
zrFd;=fv%X$<D~AzsS*Nu&KQtjtcOfc81s79pO6ui!N@WcbW7Gd3D%-JX3Y5xUITv{
z>}_vjjFL@_J`P4>aaRZiBMTv;i-@b{>mZVK(6K1h&jJ()9PaAJdLCtE3>(DRk%$dF
zarkTw+g4MXyk9`9?ZNq;DwfuW+b>y6yREAK0n(^#1xpM!f<8{fJbiF717*Szq<7m#
zR!6V4Vvrjo<`m-z>`aBP`kOhtZc`yXEHca7_T%4L<&JQa`vc3JW0ec(a<Pru?|?$@
z$Ed*+0iIFvy>n$`NSEO)a+b}eUl(yL|2GPVUDR2MId`S5j@HjHLxijKIaL7Xda3$~
z(&?iZJthDV%Tx_4ypc1et^>S*LJ3FcrE-+B*AN8%7GH)i`%)VUQte&mh}yF-Ack(T
z4D6gk)L^$N9Qms`@E;!|j*goBrNrO`DW&^Zj*%m?0lmAcZ`}c}F1qXaa90E~cJVH=
z<@${>sg1HjsHT%@Y|htXGdX;TyJyg4c!H^Rgg4!&v<c<@^Aa-GJHqpxH9agzm1Yl?
z4aftj7&4Q1HlZgXZ*wHnB1ayD#kAZ#^R=Xy-FNSMYbjV@%{azsk+9k4TujY{eBqVP
znQ~o*I(hpK#W8o6y6uI#Y!HR@)@HxmB?Xq<B`#geP0ujy1nI4RP0Jp6K>0QeJ+SzF
zDEl-W5_WywYl*+`24&HHdzIU<cNTuuGV4P>h`aW_8k@UoVD78Q{W6RDjPK*l>4hnK
zrp^74-&nGbAqDHl5$~nsKFs3Isp&o&R=8`~A3fW$%=rUzKZ)G`)L}O4^@=+>uDDFU
zTWD;seFMz^Zs58)A3;}KwLLEn93^Vt4unaL>mpv88`S$tocHY1d+Jy*?@VY}x-)gr
zhXeJ`m<}j=!}<)_oVc@-d~eNCut(Rt%dUABYTkvKHM7W8EkCz<^fo}deKQtJqQSOF
z?B)@axe1A2h?zb{d~-KPbPGSaUG|svM^nFu4Ycrl<kk9R!S^##1A}5zWiwhqMh97p
z4%4?=S$(yaI|s+03&omufN_xry;v8^Ohj&Y)Ps+9l(2RFI|O>6FD~C^X)}>R+<?5&
zrpz30&B+Sxm2e)E%d|-zl?0oqHk;vOGu>h{YQSUh`iK7SG~r$`MA)#^(&ULb5aBuG
z{}W9#n+BWB8nS7z*!<sVlB(o7-?H!ty2ad>3}KL$*27e9fNknzh%>N7H^y32%?fev
zpCLDkCtqg;tHbEgI#|7r{mJk=OGQ~HaC(&L9|AFCb~iY7t3!{yj+5v?Hf_9fA$y7%
zlsg_dCKfrC2#(1p&tGO1Vfi{qX8t`6l$@)XbIGfATk}*c3wY9gm|Mx1Tg0VQhRy(y
zm)E1`n<VR3jh;-$aLUUY2r6SG<rkKyW07O>i-BIP7BvBuOMy=NEUB~SD5pGH3NUgW
zl63ApvBz{5yRnUkeV#rUh&Ver1kKnZ6AaFVxi;!kGiZ$U<{fI#hxhR+gUIYqW|jzt
zdsjl9Egaz$FQs9B1!7xYTOoTOJLTo<UB%O25B~jb7nVsJZWnG@`ijxSL)b3x#Vq0#
z=Ojp-2vIe2)^+S=@LC=DBS0*pZ8X{{Z|q&<E3ApUt9zE}eSCC3j?UI?Z07d<Sr!j$
z@0+8+-rVjNZkS@CvGK)xXi=J6VbZC0@u`Wvag)bXe<0eLyjq>&e>RJARM}so`d+g*
zi*#@LU(Mp=cD-=>|G_M5Thm+iz{o$qTbxCnl_LL(w>Y`|Lb(0^U=}skl+hym{!>55
zEY2d2N|FD?EKY9a!tK8`iz8mE!{Gla=5X|lA9Io&UbJo_&bS{0fF*Rq;h3pP>xknl
z(24>vrjl_t_8hrqAMwV?YchCY@7@t_8veU@gBw6qG;=n@n|~R2Jk2Y@+SN}JA<S7L
zC&nxe(jO(Xyo<*gw{T+uI|ypK5F>QaoE~msa#oKyl$eX9=hy><%z*5yCT=o`w>LMN
zm*7!pUT<$+W?x=qUV7V`SMnMk#ix0d%1O4p`R<t98b=N#SL^I5%XQSOnv*b$RKxH9
zfN63yw1)$$$PhJQH$;llKpdN`JjxY9K@mnjD0MBKmNB>U0;^c*s=Wt-e*9oE#UK3k
z;4n!mXG+X$JmqZW)aHMAn1`ct|MGa4XU8-=HnjIcL7u>7{X|eNc6am*Nh_DTVJ_^K
z#NPH^Is<dhov@3V4euxFLDlfL{nt4%Qeq;*?I|d&ra4PGtqP8@D;N#Snv>gXR?xC{
z1y>JTfm#1sHscpsEXkesb9^-&jE~)$@q~Rdb^&cxG8!i%u?`UEUFq4~yma?W|7rNG
z_fNv_V*d&Fy}&=t^)yvaIZewG%Do#Rz(3d1d&6(9(?*P(h%U`4V*5`MsibDzmxOgX
zS-V>AG;uJ!8^?P3-l><bvQ&54bQV=V9xWz~7I_AR$*hsLe52#Hhd1IxSF>1Uvv6*Z
zOeTw6@Mp!MS=aAcSG;@WDl<n`)wr=n6znNM9Wxtoh^^|_{g&1A<Lo+)0wc5Ozb!-^
zE6qB34rj@|D;Q!{V6{X{-#-RVZlLdT2T@|diBQdq1>B$S9jF+?B<|W<*Ua#Q-9x?W
z7tmT#uU~&oKAf>LVlZ@J8DX9blqv8khA?%xQYb`qc_t}CS@>A^!-Qp4BjteJ)czQ1
ze~sxBJ(=XQT?C4`Cs#e`2i5f1g;KSbB96oAN~=m;n!$X^V*iO`q_eK4J0ZXyo0R~e
z+#c+8qoi`k8GqKT+f$4p-FgMo2nSJG2UcmF94J~NV5zOd<%T6BXZ08GlA;B^GkXr7
zInL$|FuTqxz;~IV!mVeip3uc2ZRl(~;~fK-EWrfom1f|P3pUR<hU>V3jR0nY&1+CC
z_HAEO$oBAcV5438@hdIE9}Ozg@EI^dc)!O@*FH2+vBIjf$x*|tuUmW;+I*bbw;y8%
zJ^ZX`#K=D!x?F=GqJ_w@3y*UM@j44%YZYFt3#+k{7VYu|=V*D_!)JVkmY)2KzF*27
zYrB-$7dK;7W;*e7&B`?MVpr=Es93R-1+h#PX2k$naFeWkrs#&X!`7t1It4{xedn7h
zE4u`clT2skVL~xU;AqR4DmwGDJ?EEJBV;@DA1wMmW;ViVEujvH;-HZh4Ky{G*;r^+
zaDp9NUR-Kb;B1{PY`=$(Gb`v}Skl%~-K!@X!u`fBd#FRWtt|V=qgpsTcbjcLcWW&O
zO{p%B`|b;DP6s(SttO{?Oin#Ad|I0W|3qZ7qLal}xhagu_i1UgzR5*bxzZDDUK*Y0
zACBL8|4{rc_7B4E1+kkzYOZ8H^N*f`n<RP9y*>_rK|0Jfw{&qWKU#MO6SUa1eE(RG
z3MzB1P-e%wc)ai~GpKz1CwQtl){OAA@zP)pPj2G#_>-S;wd$Kt84Z@fV7xI_hhKP9
zOivt++n&&waQ>9?-Y=}ScJ^pLBnY4Wh+VUK0|=u<d-%Jrt>Smv#gB6|(p(ld9K|P^
zf;)N|X1Gl#BMQaz(!5HZ8(nI8=5G;yR0KusMU_f_MMDjauthlBAwoJuSi08M;Nh66
zlo=foP_gmW1N7{JP1ddJ-Kevh&j8C#2iv$Chp^p-pAP`ixTa7|c)C4N2UfTSULN*h
z_`-)FgCc7ZK5~x&Rnb`-?vwpkI1k?t;c7h?p&lUIW3bob>2SGTxEuyh4!95zb8yCl
zQI}iI=<Kp%aIg|?f&;1U&({&@k~FcDe0PaeGTULk{aNxftK?C-r0K);{wZW+W@iek
z-`cF4fw}K2u=*TUP(pO6cIczQ&&GqfT8@kC!lydq=wab?R^eOB!l9x%(L@biS#O&z
z0=Ci`w2rtD2X9kO)z5mHl*Z_1U~cuZUq7TJ?2)@xC|)O?Z??v6PA9-s>O~u<tnENj
zVZYcTcU#yEMQ?4;mG?XOTTV4!{3EK<J^d%KO@$XG1yg=vmmTXcQ4`Dl#!>d)+DLZm
zu=Ll92W;p|S-RL!`YF@I!Mr-$yV1*FOdY&1c*>tO6CNjpa)0Ku-%lSDuce8aPy#of
zaMHu}VAcd9CEG2H94*BMTxdH4BWx$9%(g{zI?(A9@liC7d3VR5Qq~S7TyQr*?0ru2
z9KGe^MFW~5mGzusx<Q1m2SY{ZYvM`%Q<<@*N9=X?+I*Z1@`(q05-mQ9H6OD%E|AW0
zsa^J1NN!H0-ecMAFs14&r|Po7Rck<`1h^Ylleg5%;Ym<0%2N)}>_WLOfrI7L8}Bzc
zeX7gTuRXZO!^;g7IVU-pY_)KzhrH0J>=Vk}=ww!FF{22|G_t0*4I{6K!4n3$=@APZ
zCd=Drkxp-reF~FZ>A;x(E<-J<8oQh622fir*ppael0dDq_#kzFRd?(D*{?ss<LpM@
zLgIbDQWU+>*$6+IhDOML0H%xLqxW=B!B*H!vE&>e3v+Ntt5*@)z4Vb5eLm%C%{`R+
zI2c-Ol6{}$$&o##_&Qp#qZZjhImgudDbzz~0y9Lk>9V#;EIr&V?NsR&macJ><~mKa
zhaQJQqolRyA7g09$<HkEsP<?Nj}XhsdxTW?PI<6FSq?8pDZs1XLb;bZRrLJ960--l
zO|uW_TEsKh#!cej7~-<*nV{RYsH{yuCpTwZ!vRrGSWeItqsb;i1+(9rN&SB1uq^))
z4QqyV=trCl>DNk_!kFBX(W;vHKkAiMV19yw`O#!PQ89-o(Z~$VoW|<ywTj<l7ypT)
zIA2^B{!Ez!%a~}yrHiXzV(qW3K^cl{#TZVpA+6dCYX?^09b1FCm&k(A41%&#i&ry{
zyytWW3%sX6-5QV}w+UMB^%4FWDNjkE3-NV4EuKBXSOuS;XAhR^YZbpZ-5$>ju~zL^
zO78FN)#3<NPtCD>0IwUfj;c6+QaiwAq?%IiTK=Nl#?7d8%1qbtjj<pV&%Ucu%^J#m
z{tO6b&aCga+w#)LU1p2wj>y?(7(2Td^ca%F4Ijm{*_sZhAZ5-n*z;EFTHrj+J(*>w
zGQ9g9yQ)cMRc2J<o620vmsxhxl`^7?)z1g_LJYozrb}j^v;`hC;tUiOOYf>wDT=5W
zC<`D8%FfAaccF3_kpWWV9Fp}2EuB%aV_{?)4aZX>HE(sdtUj}m6F-z`56^zh8n|z(
zRR)IYW1xoFDId;<>R;Pd?#Yv~_Q1{eeaOJwgY9%!!OvS<h57hyGIuSnM1Si^SOst&
z?t1OA+(d;6oG36tvyd4DXZl^vY<=T!ljY&ke4f4@c{hhie?ct$V*ScE=y!v;@5NDa
z$xj?yl~>(Yqm<_T3ZdMO@*JM!`Mo)p6k&Aq<8xM)0+yc#ig%b2x~U7?yk+T*gvNtS
zXu3evu!i@GaPh&m#S20jE|RO`IFTYgY%PTUgq1ELrYrD}@~UnaPZ6bjx=@K7%B^*Z
zeEpp$p(meun?UMVv{35codc#bI-LkKTN-483^POyde>|A++7Ogp5(0Jc)JSOjxqx=
z);=P=o+?WFk3_r2*ov+Rzi$t(glA|YNPfP`gPFqrQsvnJ!s<q8XioRxt;H5g<v-fl
z4tEwo6{o7aDuz3YQa|^)j&181Nh`nfQ44M;9K3~~EuXvQw1IaoOs$AI_9l7`M9+Ti
z`q}1z>wfZW)#f5@FmDdUv=7UR5DnM&d)f`_%k|!nr%gwti=>5PpXTx0;jrEh;PLJt
zIB&lvf)boO=&i7JBOeOwLxs2{F07JWOtyPI1VrvCFyGRG)@aWgK_oQ8Wu|>O)V@r!
zFO#iHUa%B=y*>QK8f72Zy9y7b3-O(AVQ<-UFa)~PbFOv6y{@PK?oGh|PS+EL>xs58
z`~zWFSdQSEQ<|54Xf!7C-ZF@p(1T}bKQ!>7u|bU^+5D>!<P%XDJLh%`m_KP*x;w*0
zaR4+?qgwFfJIj=HYN%_SOe}otp`?Sn8*alJi#)q}l3NzaEqX@N#PDqDK&kQA<PVT~
z8}95g@ZKEV8_!JP^MAWUQinJIVUH18jC>ysD)|)c^}|g=gUDn2T~1rRL=O+;-a7^M
z54<P?bNF>A1f`{}ShcP+Yy3~t(gy=IXzm}N)F`uxAJpEqQfSm<X{ip?QXSB$6Z;Bi
zA*EGPT&?cE{l91|t<~zJAEeb;W)=UrR%!jU`sNgeR^xAt*H$k>-~ZSJwN_iOTloXp
zs?Mz9Ki6t#f2~G4wYuPzcv^i6eg9jnc3~R!1GKu?tl~e{Dzm><Gq7T@JaE~~l%4le
zWa>w)@HSPH7@(saV9M}cbhMPXY9e$nN`-MFF#^aG*y1?U-j{Uc#PB_^_<s;fHc$_A
zxe-(t0aZO{!S@Vck7$xZ=g{v&X9<)?tXBHB=Ve)uKW~(M>5k%~3^ZbDb*f?fa)i!>
zR^5PUA2Wd>6V}pq=#@MRfH_BmiZEhYNCxFx)Z1_P4amB^o7hxJ($bc~>o<8VCxWs9
z<;r#q)^b*>4PmQ=Qhp;aM0`e1P5?OxP_yj<)vn0m{@~w=oHC<%2b{GG%XaaY^E20e
ztxeFe-|*h<xwaYa{rJq4DcH|k<JtnARiC+z<xggaB|dXq(ox~c-V6?&HN;#C%ye8`
zD4_e0^WcXCR}zh%%;^BwrPh*!Kf~TVnUY+Bf|<kK!gYmfLdPNu4ty4TRZ7AgqwyY;
z8-ku2@Ve*F%0xT?y5XromrV|IeD%@hZ(M<n&3%WD-4f_bXiLC1RFs4{r+3TdzC^sB
zD6nPoM~5eLzSI}M+ca-T=xhm(0h54db<ndqv~qvwv7uq@!UGhmREt$r+v6DU#%>OD
zIM@YH2MV(iG*rNIYjFDN0ZWg6<sjVs=6NMJ{nibyH6QmXxrdU|H}|E1g(+>wo4Db@
z2Oc<<$d|scz*GX2pz7xWd?98W4uMz4Zm-&Y0{T_pg|XW=EU2ic2=vk%iGf~i4r=dt
zJ~(~lhHa1i{@0caLmV;$JRLz#M^#(zr_pEv9>)i5VCBk{7Hvul?N+{0w{!Em$z#O~
zp!ci{c-nCA1T(@RIU8&b&$vmPcRjoj-=%!Ody2d!0S|beaHHjZc%}E}@SGj}91o9N
zdUjx1I}W_m8bhYN08iiXIeZ5WC&fN@UF6w8cX{F@x(<9!m%)13gWaIEC)tTN@JH4o
zJeM-U^#|hK9|%UF)VmuX)NMC7gv#8+!~F1t5EKIF%}Kse`f9l41{ZMXj7hmnrlukZ
zUx(7<h()A1kL0yE)8w@oT6paS2Tm*do_^yE>Cs`#dIg}nrG48lpMfaK#!L3}tzChl
zarTW;nV1VZr2878Y&e>nLb*Oz1kzO@34i(<EgkMV<6(=sGYqXX4pScy8`q;iWZty2
zY*!9^${Tp93u~YDR{nV<O5EPM6IcASw}$nVx4pGjqjA}1Um74|AwbjY%S`(+%f4iQ
z)wJ36WsZG0Nnf_N=HZGTZ|KG%Br=71awO{sy!N^G!S>c!f~JUb%q!1YUU^$Y9Yfv?
z{xR&b0biQxloOduJq<~C`}H(cEA_^vgIJ5(>t~m^VJOrQ`$o<z7P$*;zn&FD-o)%=
zQze*&52wWgSV}5R@^W&MDc@r$GF(a-Fu$b?>#)fO8FnEFpAjQNCf=pM2Ji+$v^lqv
z1Ku<MX?6f<ax7N~%>tBW2b3lU)D37hpfo$6G&!I-fKCFGW(Sle2Q&{*FQ7C#pfow4
zMS#u$ly*mU-lqGg-{}<5sUHL~^`p^5ziVKf4S7nxS0{>oM^V4|zh(tCc~DvsRkFcs
z{pKk-Dpkd%esv1kq@Y^A;%1C|D{ak+(Y>dywsk6AgZzEAHpOd^Ur%+QT_r}&Hsm{y
zuS32G`A+1!a5GFZrAIJiA~&f_#{jBskqN9^t)RVttdUDmDOiN_BU8CdwOw}}cDu2!
zd#(e)AbJ0Fwjr!wVHQmz#nmws5?a<n^Jq_lW)<XXwyh$nHgDH}wVfKnONS<_C~G??
zhQd+XKu-hfZCz}Z;jOidjekKCZ&*M>gmOQ`v!-z89wgxg;fm>)>QC}8%vTq?VNjSh
zc7xK5up2%f?xB?YIbxG42H{>-dwi*=mln_{xJ7&T;rm44flz1tN|RdlwK*PZ<3v|%
zD_b)&@O4JkC<NQELg_X{>E@ztzf~;yoVpz{ux`^xzj&3YTb{&S%($t*J-AN}{-K{r
z;uHA7@#&eY5qQ17sqy_<qo1gLih7q4b>0MNhH@m~T_LfAB`U*O-9=V2j7AA_vvf1*
zpGUJvGeng4E0HM8G5>T*%S|~ll{coET4j;$xodTe&cRP>d?==Gcv}O4n-mr+tlDfz
zDEVVSV_RiFtvQ5VppAs!Yd&?O-<~NqbizUL1Qs3ia<ssPH<e}18ZVYP2TAzvrkoB3
zoqeHYnQVt;UL^gaXmsVE`A+eDIHj}2JzCu`s;s8^<tpsr+5`jYW6An4nqb2hO4ggl
ziLAFH2_HRxtPjkwWOeTLzfAf;maONQUd5(qd6S~}%}2W=))WcMbV9dNwUQXs_AU7=
zN<Na)hfr=eP7M(Tb|MK^2d&nX5rT)Fy%$)LAL+32?<jd8{8d@m?ho}e2*f|@l^<@&
zooKQRW0gEbN}gGiXU8>r%QMfG$7#ullxHI>ujDaJBUU^3jeUrc!@bIWb6e{m$-Grd
z?_bKPgZL?(a)Z39Ae4JMPEKLL-y;bht0eByoed{U;vd5fCjHq;!x}|@7d8AQAR3ZB
z(;LDfQn}%xjR31tn_k{n$Y>gQl*PH1kFgsAB9C8GX!5VKI;sWx@F@plG{=y3@us7l
z<))}|JaV<lb!@`Or7O(0D|9WNtpd75c>J>>VmG8l{O5Z1CH(oIw?((y9Zby4v(Z?f
z@f~YC3W9<R?#?0)sUr6~i=2r-fo`bf;0i&PkpqNuFv1w`>E@Gt!iwNUnmQ=d)JX1+
zloB}kkOexWN8FS1xe}T_z@r4rB8svkRUuMZn17XZLyKy$jfgL3VHZRUdsou1nw$K=
zAqQu^EJdyBe+@F3_+pmAMU48xo>;_mUF}%J(D|TABg}&1OKT)$M8X%%_86K?2?-Jv
ziDRG;-HgxK*1$Lr*G$;U;yQOgt~PXUu7Ra{M4Nukf5^6o=npN#ssXVoj?FOpL50|!
z!mjs}bU8M&vX{tR>d**(A83WCAotBlJJ^2)iVE`9y^wCdiZp7a1tQ!u_gZ*_u2iZ#
z)e~Oa!Ygb0#JwNA@u|L#S=HmclH3@UB*dv}VW5>$*1~{nmte?qyT*E&U1O)Oa*Zus
z;~I;<?>5%g<{G=O(>1oS+ckD+uOnWxdQlXA!?nFN^Al@@l%+4*TZfuJECyI(uvLj6
z##$@uf>t+Zu<~JI_&GBbKq&WXx*cqkVwyoAc&aCr^}wwaJ8V~L+QD>Bg$7Fzi6gks
z15VC7P191ehbOm^o~s~{OL&v0y)_%ZS?Ejct>bWo84J3A2X54wgRD2yTEu`5Xq_av
z5UH^Q5vzt;^N@R*uT7))_b$g7hZv&eDKW>A1JIeWA#nIyXr_YO2jp)GjB{NraC<ln
z{RP#~qTW!5j8Tm?B?ZM07ClBP5(f>h21|sFabPB`K+jX9A<CP0b{BQ;M1k;&&|D1`
z3oTj<csEK-x($?1kEH!=CO3fzUm2uYqa8CMpaWY}Zx6io*~8vIhxe;a>~OM_$e7Ss
z(Qc%x8_vAJ$LTVN2`8DmEnC6S@>Y4JSKz5M;rh6&6$3Y_73BY=w{l?#o1=M0=_}|i
zwFPA}K=<cin90NZ*_5W`DJIr<By;WH$WP95BRVSzWv^mdS$s;0DW5{g_SPKyDxW$D
zR}P<=gskmTd6F|%xuQD^NuUa@vrs`m<POR&)}>xxoniLpS6L=if0GU|4d;=vY-GTf
z{+IJ_vcy=z1(Kh$WNb?1eu$eA=(oQD3p64R(fEEB5ccA-m#7A~v;<U626=-X9>72$
zd0^h+{ua%k<M{k#dN}AAs(da*5-Vi8Wgu|epkSWOE2sV(q69p<aoA`yj`jvqE}lV~
zt3ff8J5>p+Nc&JzPZl<#Am#{mD#U%2dl(qXV^x|LSo*vplJVWBnE7<UlF?0vp=kJd
zTEM)6fPb$PJj3K-+N=@pGe@WvBc*Jgu7nEZ?!-z3v`U}_C=huG?C|P5dz%lOK5nKw
z96hFofkWE@I99P&mY(eC_EX(z&6&iZ+^3z~2ZO$6wl}1QPnm>tksy@&4=3Fn@MBIw
z_yYDrH9M?qamZ&asU8M>cnMY3M(8i_R1ug{4?Lc9C57iKBX)TI8gR?NIf144+pdk>
zyahXe%?j%@d^f6<V`<zF$5SOy{dTCL8+l1VcpezZU*$Vc8lfF>`wGRt!A8uWP_B4a
zLE!6rxt;Drr>b9~q?kU!9*PMPnqS-rVjru_D;tv0#^Ufdztpun0z^Ro<3LLET-A*D
z3vI#1U3{7le|-{7^ix$^Ax)h74d&if3%Al1En73C(w-4AOUBF9U>)v^X5zy3IX6ra
z=m>mT5ttsX2o&$EfP3}xLDv%=pgf-;C3-i~Zlqs&L(@NlvE=1p%3`;oLHe)O`tM{{
zLc?dxzXyxIr}^E6<**9=V*P3+_>+}^PaO3xAH5P_{F%&mP|W+}@$-ER{u2Mf4RwHS
z>W~s?75Mv!3Ll8T2Kb}s8;FsWIZFP4UJ}Y3Cl-1;Jcq`?I_@}n&0m%ZE>q*CXq=0<
zJAmscP=ALvICb+mXob+okFe$v#^GMj@@>Nf-r&@`6>=3O*66Z<*A>MJn&KD<E#BY_
zq6>oJ2pMM@)A0x8sWk4O49`SmEiXpHm!pjR_dkoD!hF{_|1VsodnbQh5P03y@^4;;
zk|z}EKRD}s2{Cf$Rp*3~6S3Dn7+(YX{0V?PiZl)R1fB^ELB>#L0E1_QE=k<;dA=)i
z2r(JB+Aj&{!2%)!*M!ZBhO)FPyF;+JNtAyNwxDHm!slnWGDlqFn(*A>PtOTWyJ}C*
zFk9Y&KnH!cC7g^u&>97&ykxLi94?sr?Be}VaKGehKthhgdmh8z!Szo5T2&56di{RL
z7KG})b6uSojYg%u(1rM0=6}<wUzGx|pBg1^fX!fpo1x_c$ZIV(FM3bZfTkA<)!b)n
z*~pK7T(}_c8vnj<VQ9>{Y4WF)$1^=YKk#iq;LF0X{FA9^-p$`8g<Zd(qx{S7p7^Hu
z_gnFo*zt#-@dw58qa*O=2cJpsPX1@(N9by47tb~Sdhj!=>^eVx25m)tpui>mG74ML
zFv@BhtIORrwSISFO?7SAW$t=kX}P<$#$8$7SW;E9H-2$_mEUI))HeEsi(!;El+G!s
zD80<*zPPr|UsYSvFgM*jzRX=#Tkmtb-Sewz%H4JKwPn7B2DiV~eP&UyLS0-_>n`_I
zlr~oTS)knA-~(m3Z+_zf_d*btF7Qc#h9wQvwF^K|<*uqJxy-l3eX%a)p5-}zR>>KK
z=bd?O$+?~j$)(l}!D5MKdd{0U%`?-cJAs6ikg2K$y0e~is&YY9b+y~qR95eUmZi9k
z6~$qXvf7#&UzwlUl$Vy5y5~1mRQT#4omj-St5|hlB2e0N4t|inuF(&x3ClSAG?AvU
z`7A80zsy~UR)jF~rSh_BUunIsydR$hKEJ!h*W{0-y;!lGE5tGK?aj9Ww(^yu!9`qE
zq}*3p?RPJ(@>eRWH29%|dptaBF-+qwEI5CbXNJ4MU+VWs6$?wNYEE%imM-!EO{GiG
z{HAImeM8mdaDg$^)s6Batz;O-SJp1{9p5;=vBuwce0izA^!SE4Us*%Rn2HI=8HVCp
zQCdX_VSVMc(x4z#jpi>^_OTQYCth4%TC<?qXB5@fE~qbENLlOsU^0J+ySM>19aC<=
zuPdrc7oe~Fys@et(v{XPXk6&4@f%~B94%T`THoMv(^o6%YZto5G^87F-#8`Vv~e)v
zlG-Fx(qCWdhUt9`M%gJ<r_`R(P+n=+ib_v|!qV{|>-^eASkwW4YnS?~Y8J3Lw6WRQ
z!OaQ<5(6B*VTcV_rglC|un6UAYYg3_GmFjkjA;}9!n)%ZURo}#VRNpn^VPU305`@|
z7;wJ&(lW|2VZsD6=PN(LC<Qj%V3YYlt{Mrdj9GyHZsnPI2#J~eUT%LoaJ|-%cjD&t
zgXbY{v~ij7-+vfElkpvdZyLUI&7t^a;EM&WF&tmcvk$;G3*UqB<$WofKyx%r!WZMT
zyn%Eu(joW`!`Fo`_9cw{@#Wbl4!jNAj9?ss@1gh}jxRo^^QWq=tYJw_SqXacDQM9}
zrPWpCZU7M$O8m#B_ruqfm7ojMmbrcP^|kesSChso9)oZ2Ed*V5PScpCap(r)+`Jll
zia567Goj3=Dj$Cu2&>9V*el&fOvR}XVLCio`G(4cC8g!%^)>|{kX;D+$z_SHm|Q%j
zajsNq%P&stg9U7J*g9%E(IgH!W*Gx5Q;^Km*Q669?20`5H@iSV(V1rypH*_f%z{}S
zv9FVFeEON5{An<_w)gk*KmWXG`DdQ%uBfe-A(?@vy751#&+|W)r@R<1&pdC&jJ<?#
z*hvk7@eOb4>!2IZ@S3i?szHx~dwa7){sHLVH|73paK-S;r|0AomyW#!R3;vi`1;x<
z169_a0#04#uimR5>UBecfl#cfAC4nhiHt?3$fQdv>nj)KAi1v9NvC|O+^2o8p999~
zt0r+~4KmEKM~Sn}boBRFx_(|ePz_oxUk%1gUx}}_T**&W&?zjxMEeOrgGs=gj6RhZ
zRedrHvlEGA4tX?RtC*OmUXe{51M9@rFLW{}mB9W}(KFnqxfk=ggu@mA+MwTCZi*V<
zOBPnuIEw9~z9oL2t`OBVT&@cZP*4AMQ$70PG39e1TU>f8*0|3qE-chqSo8>3qM_Ns
z=9IdsLEK51&0*L6^m|onhChwyPv2jMGYdWWPCw$Lutd!RPqF6rdkJIvbqsw!=JX2{
zwQTzgKtHg7wEWU<IDLuJy-d41=rDo9il?duj*P+ca?JD^;X|Bf`O2%8m<_Nu|9;ZD
zRe=9tbULuwdOGA$eC;Vy1(!~xi!hUwzNDos_ce)Vx<1{XmF8BrS25g4=cq^N&6S@+
zcT-){PR@9&1hdWW7$L0w86(F)1<VSx+?Ky7$@|;X?gIOww?$H2jCE*w@gKeZp-Z?G
z{0LKwcyxXlbS1K>06QSRehNArSkq&1;GCFA%Q)ki#`n{kjC4<n<ud@Ct+ELEz4TTO
zi_t4a4#~(|$7VhdUz^Tms_80A>&r_tQ#Ek9v-QM@Z92|<zGv3#{KB!O{ppvd=&{nB
z&4IHD^UL$S(@VInx5w*#beu89kbOVD;u)G_jQ(-?n&et98Tba6W}EfGpRo>j?Wie#
zd;a;Us)qQ#*h7*}e2V`bL&pDoX}cu!uBz=nZC0~abLbFaP1kRlXYOf;-%6iTbk4A%
zEo*}Y;LByBJx8lAUtI5ZPjw$NrtBEABP+g&PIpVS2$F~=<D#Sf0q7hv7ZUE9AETA3
zS<#hM`Y^A9kmU&0B{DnVirlSrvu8J^xSGFAVQj?|Grv1~OxajXsrgseHuz4tKv||A
zSGXi-_d(|vLnThLJ^S+`%%Q^q>L3#AgRgTA*-tOUe`c(_j<F$D4(GZRg7{5rgTJ;e
zW}1nxt>lsR`~h~;bTJMQV~$vU7F}F(#L`0;bIPeEWTt%duM*j}KoI0)1e90;W9rx-
z3*3Fvl{MCbttQl6QRzJpU-$Ua+^R=lqJMGyOwKM%e;A0K62oWQ{`yAWI1F?C#`+o#
zI5myf?X2_5u0Y)K_Ot|%PHi8m?HQ3^>X<Uq5tO{*2m3ZejDB(GwLzw0FX0FAx6E;Y
zNv_*%Z~hnL&zw;(<Lo1_y~oD67%PXlw7fK!s<+{C1$Nlg!RM4%Cy({60qkhHZhwb}
zv4f-Cob&@Y#$JA5)5kc5X^D9BWu-No3bJDNLhjl7v0&qt12>_uaT&LqwQ0C-hfw7k
z>#!;0(+*)vYIY6F`c^aRdgGSc%QL>Gp1nLHKK<hQS{wn0EtGGFMf{aM7-w;5gS(-w
zw9Ln?kt4)C+{e*cbFmJU#i!!y@Dp>z7Oz6L+QKy*Hg0(<MzJ6F_cqht2B_C6H*joI
z_32<`ZzD1syR*&CjX}vtXR)LxTz5;bQyRpj>nF0J^ZTl+Ptg`&*CRv1VE%HfwsDNq
zSYMC*H!QZRoctSnK1>^Z3+wz#WWo;HSLvQC!kO|aI(qYI?uDgI?)jx<mo3H=mFEg-
zr(nzp*k3*g-v)Q_tTT-<7mgpZaQv9^@ngKFjG2DQnDgCZF48@eN$=_2KW)7Ev)3*^
zjk)IK6Y`n%|ESsB|IIvXAm069|4p7pS0&i~$IdN&Wy}#aCcMPjjw`LEcPzodf}DGC
z%L*mNpQetQRG(Jm1<c)FCr}G#@wMlB11{AqI>)SGAVQ1Ip68YDkYpfQieS-8h+S0W
zTRbqOMW?#bfP@b9-5i??IFsR>HnZgXf{Q#(<Je}^R&dCn=F+xY<!VRLBGebQJj(oW
z@wQ!KNUZW=C$?k^+OFSWZzn};c?sQOXOqUlJ7<L^5~+o^?CwB1@wUB%Af_B3?!oEC
zQR&9C{8{-D2ud5us;Ww~56W6wMWu!sIoZWA(wOlR@v5gW<4?$Gz(KSy#ZTOFW-%Rz
zW9YbB!Q)6PRN+`nModHvbE97G8GLU!IfubuhzP{9MNB^S)@Pvl*mQOob+X9QKqy*W
zZhPPN)-QZaXH{&J6|MNHj*4Ve&apik2boiO@<R(C5M$xZ151F$nujs4D}uM9Tq%ml
zu@3e;gvx;~t?=Ve1K96jj)Oq(Jqs!-TkE4voL(v~tZS@pz|0Nyr{oETB%L|}hjPns
zopw<)IuF-#=0>9{kyax0;=nB%ui3~!eo{p=x`z15Xf%v8jC2purmAT4B+%W6bT!gH
zUy8e?kS?u>Mt2~63h5rCe`ttCN8#|#=Z{9GAkAD9jb4KE_f6606-XB^iAHZldJxVU
zHzGYA=?<jJknTbHGSX40hLMYN%qd7$;Krg$kUoL*3Zyl-UFcS%%W(tNMx=-0*1{b~
zuf~0^dyuY2ItmAm2jL#)DM&BIGZ&X2ZAE$o(mx@+73sKNp*+&7k?ug6aUIGdJqC|z
zjl#iW`wb|M^yr&V9%(PqE0Au$L+7_5J!3V>BfSgh4x}$4-GlUaJPbbyMqh$-3exM4
zUV=2`W|T+D2iR{#dMVP4Nbf|t1L+e;_aOZn(os0{+=X-s(w(=UJkq3FQ6A~Pk=}~*
z;M-6h>G0c89_d7+dytO31LbkxnvZk}(s@WPK{{&<$|JoJ>8(h+k#0o#3DRz)SKNs)
z83(QZLh43(&hMkqJfznkorg5zu4uFwX)V$<NIQ_WA^jL>H`1niP#%Z8e?sa;y5L@v
zN1FWylt=o-{dnjC>GTJp(YujWBi)MhKBPO5j{0LXnubH(XCID6vyskN3%^8q;iJ)L
zCDMnGu0$GrEE;_XY0|oA^m(Mmu8&4{A?-mr6bH-)Ziq(5A-xo7A<_?!RwLc-FL)Lh
zX))5ZNPma46RB@YG}?<af;1Bc%$3{F50HL~v<PYQ_Gq*Y={-nSA&nwki*(dqQJ#6E
zy+|K;9_4Ykz2<Kyk8~5#BBXUMpghuJUqpGN8<4I=>ghyzq+9-u@<_8@MtK}=*C5S7
zdIi!Vq}i{aJknQRMR}xSyHFnKUy*hq&3hf?k>2?R%Hv@D&u^kU(n&i|9_gHaqTP_*
z_ZG?_&3Zc;?L*3sPK<=hXBr7jnF)ssO-);skV-ZN&*XdOMWe@&P=5xdH;mE1lYUJx
z>f`}_M&?-=S?9QhEKX}SPCxk6V^187`|?zM(~x$|L0c1`KZIR^@2b*h^kCjV;2D~3
zoSQi)aS?u9iHn*tGZGioWe!VRRGB$6anU@FD{0Z3LHU_UiHnL77p8-FIwa2@f@U=C
z#<#L88od@NKPTbK8!dMNb`1gf8JQ~*^E0wqlFrF+m!@ZAP0Ps4&q$jw<Pz)pJoF>-
z>cw|Tc{IxV3-~F-HyJl6WI_I?5o?SyGBU4AJR>9PS4n4QxI@WjWMp5PGA$!##i0C*
zNiC^!Gu9?1%}z+qn3SK9lb?}&Mur=-D04;zghg@MjIWgc0<s;BRLOScut7-&CqN;?
z4Uix3-H_o~+Eru+-qVtlv?+-s!h>&P{~h9V9!M;jUkF<8>w$j~_>Ynu_@wufNkJL9
zfNxo-|CXj%N8*zf|32W4sEJ1T)<=tf(!C-h%V(j@9;%H-uV>qt<*CjUu*A66fY
z{sU+WpV(>fp9B2M!2do5f18DG0=~E*8s$ABR{5U_TImn`xxmkK;uGl;%0BCXU%C%`
z7w~@qett~(xmNi;;6DZar_MG=DwGBg{Q*C6F&=Aiman$un*{ucz&{aVzeg?m%>n+o
zCg>M~-_OD~0pGSH8vPyQu=G#5!Ll!3j5PU*X!KH~Hr{8+w;u48E91-81^f-bm&Vk8
ziB*3e@c(FzM$d@xpQ)DrWC8va#^Ga~`uiOE1E0_uA3q29A;2$nmQQT5?B4`@CGZ!=
z;AdO-)xh5c{M9k|t1SF_;L}$`qX)&<Z-iyPF5r_^VqA-{-z}E?`hai2vz{d}_PNNi
zPZs7qTktI9b20UAv+AD&e03;3eh%;lU>>qErhJQ4z6tnW0$&<~zr@0?27VLp<DB>f
zf>!+>_`d=_&xudG*ec%z{1D84&WXW$EPNmExxmNke_5Ef9S8iPnDRfj%1`3_4)|$K
z{FSzRbAZ1G_#>QnrJl&&1pK|g|HO%}x67{v{wd(&$+sT($M%8m0-i4}kbB6?_;r!Z
zzYqBPfbVnGpZK|@UlzuK=2h{_PXhi*;Nz8_1N;S;cgCx~3HaH-$1A@Y_zBm?FTWo6
z@xaF`-vxZiZ`7kkR{S{Mp+E57?t_08;_M#acgNW06U#o6fbRr<nM;`Vt+4se0sc+k
z2l8(>cs2n)3iEMH^X&L`jw(+OV=(9Rsh|lFVAhei(vpF5eLwI&kHM=T&GK9WYy*C7
z41SJPo@;|dygVn*iFb^#qcHbB68Pm#e4=u4@}C0y3g8FQk@!o1ZwLOunDVo%`mX@q
zz#8OeC%#nKK=cQG5b)O|fWXv0snv>68-cIG+9h5e+5!C4z^`}mPkhX(e-H47uZ~7H
z$KW>#-VNBObc|8J|IHa=IZt^;!mPAW9%!m=iALwd@KBCM9p(XlC-Cw5K{N1cfS=+l
zzgXG84cHpsHv;ePFT<s_3|m1n6l<kF#@OjT%T7Ci{~gv$tugp!3!jXM+jm$q4RO|y
zG{_oDMgt$gn(2I}jEOTWJ52+A6xK}VJNeU21yY6BsS-5j-W`o5$J7zEWLOFO@%Kif
z$Hd@|v}Cvs_>=D&xUFdC4&Yw{ew<T=q+=}pVc_4xdTk&bSUzni#w)Dbu87f5HG~_m
zY~bsGKhBBQ{hc84C<M(C(8L=Ps)4_4ANXs5-wgcoh|RXYw-2JB$^RkX7d{fdKRyrq
zOTb?lW5dfW8}0%==g-mTjo@YRPh2JKNd7~y=Di2_mKgjmE&Mp((;kmT$2jrx1g-Q3
zei-nwgKqZM#A2&_HSlKxe<<c2R{cp?)|~5F;HNzqjlSpPpZJzl{vqH~vG#p127j-G
ze;)Yjf&VfF|EY!F1$^aS;`ha&ShFtx{*jpS4_W2M0pAC_Ji=({_qc^G1pfMsSkuOo
z4_f$Y;1i$X({}h+<yTnvYk^+~{L+~AYqHw!A>g0e6phY~DX$g{Y`^D$H`?OkcL9Gg
z@FcV9FOce`{f1&N8V&qIPJH4YEcwO(zZUq1W8}NvlCKc>)1K)U2a;x3eXtt%pKp!7
zCb|~*&w+1@slU#u{~_SL+v3mZp9emGy^Q^1VnBvf|1RLKXpcsJ8N+{x@E->l_gcDv
zx7RGPramuYZ9?J~SV^PFu&S12u0i@KpzrL!8p5g%TLcJuBd|VJlt1IH0Y+K)HNM=l
z>3udD^$}pNvpv>2+sk#fm+S258F}>st+8bd|7Y;YdM+AeGyM4Z(E>kO;71GmXn`Ls
z@S_EOw7`!R_|XDCTHr?u{Aht6E%1M7fi^zOj!&~rx#Z=ie_E!%@_almn{giiKiGPa
z&!2y*Qa<O-PudQ*q<rR`A6YCT<ummBWNJM3H~2B}Uwjv>Wq!3@1mieIKAoC?TWI`r
zYCN_;<#V&nV>?hjSvuwO_s$QG&G^ZB%Ppz7*6!xs6+Y&D12{&I&w)DSeI)#t@Mi3}
z@ng#E)&|4YvwY0@wOK8?e@B&mAeW-w>3pviJgW1iT)jGfsm6b;^IBY8+y7mXrQ4-H
z$91~Bx9jv(oqnLxFLgQy_jmAffKHFr>B%}hOQ*ASTA|ZLI$fdDn{;}wPS@#lyG~!#
z=?6OfQm2D-10JB$qjh?+PS4WmY@Jr<bdgS1==3I?-mBAfI^C|*S9SV<PQTRYAnkAm
z==5lvo~+ZebUIt76*^s{(-k_sNvHSfbe&GO>-1HfexOrj_5b`gt=p{&DF<#^r?O|C
zdFCnZ>|%L8i~A=Nawp`BKT*A{W#TU<PMVN&(pUvGrhr~KtLh4!+T-V%JxxwJ@v1Ac
z=jiIpgd1*&#*(AVuGD#h!pLTwN#sZs{hZ@W`raEAQa0-xbmnCvIIGT*me2<oh8~2T
z^d-J|rhu9<F(K_;D3X|%dM5gMVsh%UpiWFn-O40k&`#iz_(vrNjTws)iHRw>2mwjA
zg3F*|asWwAPQj~)jFc)|Cw+mqHE1jkuoBZ!CLm}f{S!hA8h09z&mm|Hy0)2@w?U9W
zA-OC=J%fJ7(u2}cA48Iswhh0j+?7WCJ*Yp0fF$e^8mV36l$Q29ep7Z3z@NdS-GgsB
zu?fbIagZ+Uj$OzM`2(a)TSN9kZU?KhJDYhqfduyo!Tlt7SP0gT;7=qN`Yxea2?Qtk
zAmQN4@5AM=Q7}Yi`pnCbWF+7xeHJf`jF<3}Ud-z>NLP$B=}LlT^K#g;;G3Pki1+AZ
z6vH0rP2_8290A7ZOK7d+jB|iYznt*2j3uB?zk=|g8K>Yk{RYA_GkB9r`pp;MdPK(0
zA)vZGAmd%~xuqE#ve<H)kWCtbmKbsf?cvHEa^^`Wnc0QR5Kj<fne&J}OQ1KRDTbWg
z44g5TcAmycSne1j>3X!_5N|plXwI?72t|cSk!mEpin@l5q8|(^0He%df4T|SjH3v6
zgoH*$HsHe^y@gDSl%FFJCJ*95su>1@5t>TKemr+fS%<5%H2$Wx5%@85P5G+;`J4I=
z0=Gktlx_j?H}xX|tI)0~UkH%DsoYZ@_6GnnGyZipk_<m3`a}u#Gy%Q;h6aflw-Np+
z4Vj#gh9=6`eFOS+nvs$YqO>&rrWWF7*mw|*%Gh%k%Pt0a#&?R@y97jq+3+@Wqzofr
zGr0}F0m8#q=8@g-!$_4Vm@FgZ0_4)t_?vnke)dbFN{<PUzo}0V$kXDK4gvBv_3s3(
zgkCBC6d-?7KP0dgjhgbA0QsA`hrq`mL`o_&N=xH!>VEjy?_*kdlmPjgdbC`!<x(fg
zCEu5qIz=uy<48SQF5l<nOu77qmviNkc1^96OIjthK`z(x@|SY?H(mzhl2%H+K`y`I
z<sEW46u+qt$mJ2dd>lWC$;m&1!zHFA*Fl4!5&XC!(jSN2h`PHnzqknKj1TaW`K9Vk
z&x2#;zZnbCGTLDE%ssr$%s3Ku$^4erSs8B=@*S_;8FSH0nR5D-J$wrJ7zsq=48N3_
z1j$Sq{si4RQ8IbMr?B%ROU65VCNU|JDH?t+84r@ooZ&QbW?I5+;4^RdLY7HSxC@!e
z;hC&+sOV8Q{9p=}DH;Fpo0%D&a6R}m4gW369FV|%)jWJYbvZD>gUnUK=d;qRg!{p*
zW%z8?Gb-UpWL6Hpoeah#(34jUe}VOkPhdP<J$x(hnG+<lX82#3IbJgN4gWhcIg(jB
z{B>qdNMI~mKYS<*o_V6k);4?wS?4Bf0>M@ziTAkem-+_k-v1h!{N`cj0+N-vhWcjw
z9mO;6<fW0(O^&~(=4lzbNP4dXGWcROVQJ(c(SRCAG$7Q6j6h#Bq5B*A<)Zjd-tC@w
zdII4?8)%3;bzROn^W}QTryv}DHR~KQ8%7wuQZ6UJhQkAL`7vy}|G^Yw@Vodq9*T}I
zQbr>g!n?2cKZKAW|0304q#E)ILJuc2Eo~ltQ;YF4a%dD3BaTd5C~!K?&;*!a#AtyG
z$wA#CM-x2cUPwLi1Ywl}p-28iF7G6hQ+b)hmt7u^RmjWRA^iad{fw7i!RQAZ+{nwf
z!Qp^IuIA;rxXd_=p=H?Z$R}qU&X|+&CS=Sw;x$}?<_*-Hv3MOauFQ;o<RJ27?gx8h
zye5~)#CGKodJ`{SFT$lU_(6~pD9zw92D3OZl20`Tvq}ar$tM}ZZbXfl880sZC}Su2
zyfFxuh$|!-axlewkH{hP!Hk`}96T16cjH3yJYdd&nZpJHn}H+T1BcUCM#dEII&lAO
zxJ=6!2@@PRf+YX1wJ(8>s=EHaZ(ax!62?FfCSl1CPzaikgs=%{K-92^h(Uu3<0P3Z
zl1$Rf1R|B7fOV;@@N=zDvD&({T5+q^YNcxH()y`g)z&4pwYC1bU|nikm;d+Nd(L}r
z-psWA{=eV*B=gQa_uO;OJ@?#m@7vBT4!r~APB@$<#c-$@t(g#}nJN-`0V<tPN}bUd
zA~!f;BI$H(=rN+7L~s-Otpq%RL2?-*3AEQj`Uz7=mI=jyaAXXYCZV}#VYq@OVio!$
zY7S2$bY5sO)p^8^kUKN<eZn6}a82lDNE)6=@Pbec;VTJl2$A0m&!U<eL-QeB_$b1!
z4Q+#T;VO#Z(Hwe-;Obhy*M~kNKC=n`Q0S+qJzPVr{2)y4!Gz}!URmK{G$%Zd5)D=<
zzJpfbg{K2`p%rdJqr!_R=Ta*?lK37+iKngbB+6Mvf&>GqfEq>_(xD_22_R{rTk59m
z4ag5KuY|V7cZN^=2sxo+sb!5ss6tl|yprI&5X~RqRRjk^2@-TQkr#(3(rWl*g2SQt
zB+Mx`^brXyC;GKCa%P6kCiv6>;A=wDsLFL@2Ma=|SA|bI4ETmnJ<+Tufg3{$Nuy^N
zjV>asZy>libU(pokqqaer?o6WYECd{7M0X|`wSvFhr$z$zrA$Kx526m-)ySVlEVS2
zP&a-{52yM&L!Sj{>3HJZ7up4_mWBx)46Q^vO3S_vnhQhE!HP;J5qweT2`W2<;2oix
ziKd+BFAd#Fd?Ezz4&6xf71ZwQL$u*~=``x;n?m#^N9lBezpCsX6Qd&|_#Ht4PCS;<
zGoQro?I2LNQq~8-|JMgG{=eWE3{FA!1%o7iFi5%`{d@e>l#M+E4U{1wswz8t8$cC0
zju7FU_zi}315q~dGQh>52~c_26zYm_s0AifHkDKu2_>Pbvgy>BGefk5D4Rj>(aLVY
zsjTu!6fTUzTqagg*}{LJt0z_y7%0x$1P-Hb0<++x`X%5K8Vkgvd1Nap6b53FcFlQ^
znNlN(M$Zw=0Q4|vP$L=v7d?p*lc!jOFQP7=T(03@klHtTIoJlLOrm=yp)D|lDU%OE
zA{BZH@=lpTaGtV11m%?S&w?fxB+5w>Kw?#rFa`GN;Rwz^?gT4qRv*9u`^kM8rzX?D
zx3+7seOH;*zOTSrO}Pd(#{?)c<!+rMv!f}aypnicGJra1Ia9s~fORP)zlUVxH$?n-
zJzDnMITTq7`K>EV^1x8}0;>8-s`?>7QxQN^S-UB@7;P(`My&Qw@=HiYsNbx=QNP)H
z9vUjY3npN_M}4siDyX0(u4U^L({q&GN!jBqO%53<eg>p9L8os*dfM$IcB#qVb8x6)
zITUD>>GHE6WW__2o~ZM`g!Hr!@i%FE&(lK{Z&Ue6);A!drvDWLu_jxjFxvjRf=tJX
zSTsGH3et)R&<3YdNKg9(HEybP8|dvl&kt41qI88W|2*=iQ*=;knnm4f@40{I2n0M;
z)^xpsIu7$p#a+~%nL7P>N}ombRqFKlsI%hlRG&%b!)lLgqIm}g-(gX2;B}uPx|Fq2
z7kL0A>d5EHG>NJ;Cv|$Up*dC4{24Se%BSKjIwl*Y4q0c-JO;R8sC(vY0E+=szQ+`=
z0x`>4AGiR&O;+tJ>(l}}%65l3jyh=JJ@YZ9lDMuUisDCqF{0FMd=HdnDy!Xk6r#-h
z8t^-)q8k8wlPU^utNv&zvgWz<q?~$=4j?z5Df@_DhxG~Z8w>nTiOmeoe3#ik(xw7?
zi^HLcI0zA`iVmyO(i}b}4z<i-M1^j#bqolqsU7QiOdX&SyLO?mx*12JyL2<w0jNwd
zku+nj+l=4bfP(ta><%*JFi_T0KNohJh271>?o}T<(n?@E`T*+RwAXOMQkBFDlApj)
zZcfVj)(qT()FS>sc>axe&R{|jagHlunalD)sG*T5StH7N_DZm<A<N0gN@?R658JOL
z<Ee~u*>OgV1$OcEMlh)@5PT8{t%}`DOZ$OXPrNE(F9S2J*#v;fdwpWh7e)p4i_99D
z_+13|!kH4|LLb%*6HCD9J!JhCaeA5ar)kletQx43x>>97&zu>S!oNyTVKUeUM4s1)
z%Lt|rHF^y8VONdRGe<BboPj>9*RBT3T4GsC_HQa)?5Fze!&#`h{8VoZXQ8^qPxV1Q
zyoJBU@A#=cK02MM?p{duYo@|5(}y+68U)$Q7m+st>+_k<0hmR=;{Z+uQ2F0nq}nJL
z-vTOCN1FK1U*l+NzpqLnD`1bo)XFNc-k~Z<V|`Q;X>A|XG?gi{bVAq}ZrTeiX37S#
zv9NU-?h|W%2ZbNKHCow{n_naIn`N!b+cUf^f0ospx981mm?1R(OSt52;(wHNi|{Xo
z6>52R1E{<?H~+slMsxH@7(l;fN?Mj!i>(idUkmVuh~F6iCZR(rhbAd~EHR@jZ_lMe
z5X?B}K(Gz67T6a$iqHmLGSL)ql>3)ua57pKOUadzZhMILQA{YhHGYDvbtz|pSRdBU
z_X?Li!i7A2<=H+i9*20xUQ-rQ2DSmz;>DQW@i{+xR~Wx8j7P&+R$ky^>~UjPW@9|W
z^v)>UUxA^n+g(e}_lAg4=KwG+>0+iXrp3ZH+l2BWp}dPkxR!}BG(^r*GM$Fp{UdPE
z&NCy+IM1Jer*WS5G2{Q>Jhj4pz_eH@=)-#SpCau?B5fsHsu_VlW@<huYok>SOK@Go
zoVPfO8fD6S!WhS5>a87swdzhLXMIBi9|QoXoWSNN3i0?v+nIv3C)ml9j&_z?zZ9Nd
z7M{-#&%HjL9>2dXo##z{p3AIR7)4snKMT)NxXa3ia`RlC&ht^GbY|5yi$=QUxdg0q
z+q;P8%ei^BGEW%#h&?ZEyE=~*yO>)K1J{R@Z+%Oce?gdk8-VGDeN67?Y=JeZjAeO6
z$P3^oEAM9#=?Bl)sB+p@zn1G=4d+o0Mp)k!*uQYxT$8m36JsEFp_(I0pEHQ^W-iu>
zM7JqsmWA;1dSP<$UT#ml4NIgW0n*QFQY^+MNcNl}wy3Sdb|NP(M&baoO#(lS@w<V4
z+Jira@q2+EcJXu8dYMK&Oh>?s%@S7CcX*gi_TYyYe=C^I_29qE_<Mj~qw!62>7wR*
zn=>9k#+;vW;weh}lM_Fs1lb`SuTVm!B1+T`aL$`Z++k|W3qH{>UDeBsLOD-J&Z7&F
z^D&os7dfvRY2ZMqv$lg{4U|F0C&>K==UxE(8pe;BhD%=<E9Qy3J%?x_J2h|52bjp}
z`<bE)*1k|v@PeP*bN$yi_c`RA!MRjx6V31S&vNeJ;7+p=G%d4Hfql+CBa*7mhd^{h
zKz6NDg{oh|_@jXTro&v-zsLB;V9`Hfd{S3>0V<sZ3>{SI2PU_`9=FdRAz6X_r@bbj
z>Q^)EJkTBiZe$9yk_ORIrvx-{{SY+K#Ka`#5{FpTCjrwzIyj5*t`08e++`>=U~=>J
z>?VHKa1J%&Dw7j1%@~3~e3x_Apd7CH=}J-G=cZ{y-(F}{b5~;NZuML9Ucg0*6}ym}
zGmgv6Nm=`-&*o#Z7Ar<q>{ajr^RdSUCm%pkPZkHc9NVU{x>rwO57J62j2j{ddjg>H
zMJCMRLEgNMD>W<+F(vmuF7a*w%LTMn8)ZMuwEGtlXS*^8o1`NPTOmca?qQZ^Kp)=$
zGwLd`u`e^e1^AzN@Lywm5AgrZc({&Z9`w-XK|kPQzhgD2MZfoC2vW`42EAJ=4@I<u
z>I!p$YN@m5_h1(mYbD(zxDWc7ve_nDx<~LABp07VH%C^}4UWb2mViqlT+Nxxs{E(a
zc_$Ou3^-9If4$^?9ji%e4dt)$=AXd%Q`?obmU4TMOUJxFQ-Rs?jzL+#psjthiZ(LN
zCK;~+{S<v)LMQ9smW?d6#O)hMPVH(NRF+O22jeKO!7|At=_=t^;yX1~t1SJvVkP{~
zu?-qkr>r@|;}M{iJPk$BQ2`auaUA5;chQc$5UHtsz1_;vchV+`u{6+K{lH}V-iT65
zcR>8WWa74TDSiTzU&ik;b^N=~+KBS|kg)0UZt0~M3M0xNKzixD8axZaE~PIQN0k2r
zShf6R2$0vMZ$sF0!GA&lSOY-})$y<4GT&~edThFs)s@-pe?S_Qy-Jx;ok`2^rBuR>
zmKF$Il<7Lq<94~G>jmA3(P$FsgP=D2$_?E~G=_`W>H+G*+DErj^inV*E46@b!i;h*
zm-p)x;i4*L@gG)%jZ4r_`nT6NPQZ!&1L4NIEkwz{zkSAvyc+|f)XF?)?4Pimaz)U}
zAHnh;$A7sZsPV7R0sZG4SV_8n3;*Scpp{pF`ZD~ND}ovifj#|49$0k^1_b?&yEV{w
z7Ettm_*tvgfsFpgUSu_{g9P+nR@HbC^g#b*HI2<EME_;;8t;NJ(0|$D#vM4(f7!Ce
zyCEn2hYvLV9n#T%_>smtz@PrZS2kV+4E=}qHa?7Aq5tqptG<W^(0|^<K;!GEmHtat
zH~tX0^j{ilyZ}t;zcku-8wlxt?3;}Wve5t7ry8e&KmAXgu<F-1(f`=<tyS|O7yakW
z39MEqPyda_qX_+%9oM)E?CHO3P9ueYq5rZI)UTn1Jf)wwTVKPqX3=NxGZFv(H+)8e
z-KTi@M&7tFui5ZxHF@+dP2Lq7Aygi{OOy8n0!IY%gO}nwavOg5-5HHYf0$lC57UEx
zw9cT?xYkl5UR;LXQP-f|FXq3u5T`<VuAcusQK&+CuAcud;u$QY=j!<%5?owJ&(-rk
zA~;<58h-OXCOA?^&(-sFq>!0Ki;0i2D5Iw6JW5zPv7m^atLF!FqM?XxMCa$}L}L*>
zSI-}z6KjiZCC2$W(Og8&)$@ba1(5LEB6_Z#KguF+(pf~$)$@yVllqFvNU&m^NEO{c
ziLq83d<Ki?xqAM1s~(99i|Dy}{se0U5<80Mxq5!sq8BT67SVI{{7DwQjCpAhJy*{^
z!s-Lr?jm}wo?m5s7K!VN=(&1+wNBhrM9<apXY0i6Mf6-fzeXnx6w!0_{5tEKDDzNJ
z5p<tFN6U7wh@PwG&$Z~qq$gD#Jy#z&^4lOAP0!WyZy2SYtLNWD?JfK%@cCaMP!$dn
z$D66;!9sejo_{O#R55fzCGvyBBSe-!EjVl$0KzUuLSyNJSB)*fQ+*UKpy%rO$616g
zpy%rO4F+zba!WOwAG{KY;0qubNk3!gIeWn+B*Peb&R%e-2I)C_!DSl!3v@lIj06k4
z0xEj-WvmL8;53GwvyYlc7=Eva-&!i5$GC+Rfkx!y2gzv0&~x;n=^CWx=tXtJa11?1
zFPg7GdX8SSh`^E|jIz;%1Qys24M%W>pIQT61$OxL2&L4xHl5TkB{!rcOTJ7T#*=yA
zM&|1goWb@yLWZvfSYSWGY_%TWM`5kUQ&dR-5taR3vpbiGN=_r>M1wrY$iowDJ!<r1
zBjvc)BVda<{y~)G26?{}r<-h}ryAq|MvmVFq&1q5eurZ1dk}UpVWTD>jlG8HH(Wx=
zVoK7Dp3-(aq!>eVDWuBsD0A%NNYW#XiT?ob3u-8iJ~YJ|mk$e_G!80$6y%c%0K5-i
z@+U$B#1y@1n_@L#&9#0CT}keO!=n)(R0oLFczQ%Wi54<PW6?c{9=Yh%_ryl<D8I&3
zI!u(M+li9i3*17K%>ZryFnJ*peSs@YqH5d%(^Wn_1vi&AgGqS=xr0pUEPtmyLo62|
z>v?dB(5h$hZYH8og~F)7?t92c!kn&QqREW4s!Aca=JcV%>E5iI=Ik|PNHVNa)l_NZ
zZK$voTu5=jr`6O(E>5p%oz8gOwgS6bq^VfXMD#xC)HY9vOLLX@jHg7!tDJZS#V7xs
z6F)_Qgedrgh8y|)kz^U#wyFQLZSPdmu7s5YhRB3$vXJStFbfRrM0z?c%mPEdK~XjB
zS|V%KWYh_Utl1@NHe_byMFwnB*G~t4mDi)hc&P@7><Bf@th_WGy#jWmS$S!8ny%c?
z%_zd1(g6NLilCWAH;$4$Zu`^>S;&O+LZ;GPHqDZC!(kHxm9i`-c(YQL1qDwXC3F6)
zI2uO(hv8Mf-$KMZwGh$<&GcT1N@;rczmWT9$Q-CZrj<{w+n;UiBdom%*)IXWRWz>Y
zG&mEsX2<&R@GyUva!$powN{XbV#vwwLWWLRO(V`@@axDJfkn5*m*UE31U4AqWGf|O
z$||7x9!6qhB?+aIuOg{}=1n`iANV9hE(j7Ge1vsGsHD&iO$@D{FvcpLfW}N$g@wlg
z1@Xo!J9JFpqQVs=&<5l2OmoDMiZJ<yff5zX*Qp?-2wh-VD@rT?qoyKbHYf_~9X`jE
z((RHWMvPcdVi2QCEIU+KNOe)k&}gT?0+c^&g5~C63E&DfrUVJ%IF_;CNF_={a`^tK
zXy@Ta;%9t`hs^{eMo4{l4M2D<fCni>GHBXO$udt8yeGP?p-?-M+~jek0l1>c)z)#P
zhTV+P6(wV=u_GJ8*DWihhD{maR1|=&%iXe+jChh@G!;n}gK{f47fqz(G*1%5)0voZ
zkvzha1o4rD=;^T|NrV}bt=f`x*7^y}g&WYgnIlxK=2v;y4huxJAenXf4s5_1-m?sh
zjw%Exl{?y7ju2IZAg+c{wI^S9;%ra8M$~u_@QqY6?uB}=BZ%)D2%?N2^&%7D=T<Ax
zE4?){&#UG%CC`VFJ$geIqJ?P6LINhC=rM-vHOoqCwV^KZr0J61=_6cx8O14eEa^e1
z<5;iS(i^-YL<;M57cX_Y7+0XERvqspTTvo}%N_IxvXh9`(Ft4{-H5ADw;{N`s%CXN
z#?(j(SzMY38Dm34+j;0UT-K%I6Bx;>(opF3)h;644Ui`NZonP+c`~j@yb)&nYd1h?
zaRkOY6tEi65;Mj}rH}DK&zM7>6mb&dG=N7-MQ{e<PYx4*QY@o>jN4VF4tjAwm3XnF
zka6|uBy9nQyS-<|>UgIQ3Nyy)1h*q;yoFt^lq_{wQL@aFiy9~T2g4*8W|PIF;S4;(
ztlTpSv>`;y=$I<xGy*D+;E^#cU2w6f>8{NDp!!I!4b5<So!lE)#!O=~mFYR~0ci+W
z%uy~PhsC6elr|)7R|@KoJWmqJ8<DQdd`}X@BR#4GanPNg3S`Pr1(>J<`ZCC}{)O4Y
z!a-8vZ}vd4e>V1D>mF!}&mLSbFFLQjs=F&OFj&=*7?|DCl}aiviAi*Kwao5lZOx>g
zFRaoEWzu0o@N|kq99x#fQ|ZX8;>=E+-xuf72!|i*cB(YnnDY+M*b)JU*?7FVlbV`L
zwRQE@IWXD)7>g|IA&Ri?QG_5*b+vcJ+YE)S#UaLS<$!HmFLd?9O)(+B<yfkd;;~mH
zv2R+utx7UbWuM9MV3OD>jW*6@&Pc^m2JTAW3`4@L*G^LdB<$)nnJ2DTeF7kKqi%)C
zN8_T1m*3r$WEKMnfFwU!g}Ch4${ia(^pm{Y{6wm10G}yQO~CYE+x9NJg@_cEJZ9q#
z2i7$MEg}3(2C7StNT`4{f#xT>IufyNgB|SX8!)7cTL!wiQ&nAwW79eU^qOU?9a|%I
zrOr3RSQZ83zOG$uU2AWx4nA_soi7GI{wltK2wX@%XV_zZX@4frVVxIv+Bz&S^YGwb
z?Qy~O;KRtY?NPyMItO3SzuTz559sG@JJj6s$kn?JHXeI)UckO*`!Dk>`zI=xlJxJ8
z{<#YN4m=W8U^2!AWYN1mLvH(HWvywty05_=+iahgvd5;5vHRMCbsNBH@AhUCy~GNx
zMB*Pql(^Pvwoh-~X;<#BH>QF&0e|^+WLx%aR<IhWmxkJrxY|m+i^N+)!4y*0Tftx2
zJ-gpU{omQX1Bka&@L~Hdi|B9L7O1p<y2TWKW#~pE{-MAt&;FV9HzW=W{m!1-4ALJD
z?MCuZE0`Ksv1BI@-_yj8S*i1ocvL5zuu_{i_gfnAEOP((g{w9!0?F5h7K6bHR;n9`
zTZZmL;uR~^vp;WyeWT|6M^qTFuO14HMb@9Jlul54{*FrboI!P7s<rTe&bw6e`@q`a
z;4cqQnfJF*Js(@mO;?|D+9g1Iw2f4^Q)hmpGcOMWZ`72ej*A0}0sm_o!FvP22=w%a
zZKQ{X1HTI{xy~+p`Ye0vjT`Mpw(qz*SiA5z`>^(B?)u^H9}U*-(WcOuXVclfatIxv
zp|`EzVh4$KYUClS`M;<MKiu{-ct4LO*{vkRb3+fK@Z&15o;u>k=o=vas<n485U8^5
zLh>;!&-H;|D)>@x!%on@v5k5i;@+qe?RkNHT+dU1zy>QXpzHiLQEBL@KoFhCNLYo2
zo(?p#m=`&+e_}!Xra>1`A8EpmtlvS3hqp=k8zjlGciA(a-Vgj8+n_Ydekl-qI4}a{
zO^*u#>P-Cpe@_PgYyVI4ir10z1cYZjwD%hAh>>Imp4B;fUAfn}rrM07hI`6%p77r6
zTghk9WB}hoU`{uR%nUcGH>^NG96k@1YBN&`N@;h-H;3%e=x+N*ij4ezxD0}6+~2<)
zRslFzZ68T_*Kemzd{fh1zdZ%JDIoNgc7iZuJ`OJ56<kinHJgezz<JpAK~26o`0y=u
zU!cMY1ktCTNA&?bv&hoFDjDy_*0@HsB&1xAho?9799(tLlHigRtM+{JjTiSk2E(!j
zf^X0%e=+W;ZfI#}I`^0j8wfF*eiqTsDtqvk_NK+(skJ9HQ`(L#4z9AN?6BJ+zytyi
z2RAHIm>zo+HVeTfB$zF>#-k~<{DfuePf`hdcaT)cShPLX)76bko*tuJiFqm--GmJp
zacb{LIrC5y+r&gQA0fq~%IJzyqIJ<))YOI<DvCN_M22HD%9C)E=1ElS-SJ2h(sZmj
zBx9SgIW}rU<%EoO_xASDW>YFkHiBu6SU?`yEej{wBuN)ETbZ=Pl5w{IgLQMM#kc_h
z3Hsxk-JGsOtiMmS(~HusaF~_j{g_3ylnT$Xqx6W9+NGt^yra?HEs1!)s|x?_LlCO9
zGls|FGHFNC_VaO-+XvRzlnbka?tz|!qE3j?$cr0kAxt|sq1{nhE-Zl7oU(1*Dxr4@
zi}xp0EY;h^n~G^27)3$Gehjc!KlWe-i+EylSATB;Py!nis$`V*%R$Pl&2&9l-WJ+@
z(=3>xDeP#8l?$la&6Yk!9H@hKAcGEisF`MPN{^CZ!N8$%?wM#}AW2eE2XwhD=t}l#
z#nPGzd9;q%+M9^;0w@Zmo=v*-G6!wx=_58QUC%%&K8Pov?a9_yqFoInXay8+i$MAP
z+Jn)j9cW-*R{}(OPb7tX^QhToTS-`jF+(!IdC_Dml#dQ4X|^U}J!m&tO7cbfx(AXO
z-ASuW7s*?Cxebg%*|4BKyb#e(yQ(LZ-l0e{)f-5qNOJ?aKI-4vdEO3&UU`Vn8x=H?
z`mi!qT|F_h31U&@&_q9YLoe9*$h1KdffPx&R)>dauNn7==+<~YbZ@#a=?t@20s~+7
zn>2+s?4mD{wwNl&K-<yjkVTJ68hq3c8ekfuu#<X6f2;>?V>2!ZQeC}?sMce$tEZ24
zydO;U$6C=Ey*07)v}ZFr^ct=|+Ei}~EM~J|(ArC7wW;c*Ecp~;<)%@j9y37Z)kA?E
z9|X#@I$FEqbY&oy10aThz%FSArevL~RXxhLP?yo@#8Byl?O<zKQU!Qwk1#++-Bu_3
zNd#l~W}LWI0Mj%3D<-$Pwj%qJ;fs}4747YAi}tp+!=s{Dn=w%2Y*u=AajAj^1S9|(
zTq4l^gS7oD?*geU5u)i1j+1|vu}6kAr;hYOeTYWDpTH+~b(7BK>0Vo)&`oXe&0Ve9
z_`7=(9TEM5qPnZJW{h(sx2!iZ)V`6b(}Pg8clW~oLma4xdVqAN)d<PDSzx`UFny-w
zO&`YGc9h2(8@1j>JDVax<h~emu1|B!sotFC!sY3@bziF306dpiTNKjZ5(w(SP}II5
zN;@K|)YfPZuCOt5wqUzB@WH;UXfZaVgZ-*r>ShcF(xr@t<FzRco`zjt$%>8JL2r;f
zYZVhQSrs(c);rMB4M#|hh37yFn{Lv(h(oRk*SzfHVfy$pK0=Rf*M<fXhNouixH;WD
zGG*)P^}I+<umkgat5&r-Y0dh`@wK%P9X=#dg~8U^xnxtK8(>e*Abu0QRdd=By;}e#
zlX3G~XHIkUDH*3T$dFwh1K2uPJCAZ=o$WaF^lomUVzGD<zeLtaM18H@xUeOlx34vk
zB7&A4L(sjY4?uelk%1mSy@3XMDWj#Q%OsN>Bp;F_4hU14L2NT&@ykNiS35v0GP<{#
z+Cbl*;tD&l?O9b%e_MZ|4d?dSd98Fxb|lF}bq@Bnq^!RF_SR0)M68V}X&+4X<1|Qf
zznsxWl)rg!ZfzYkAN80-oJbAWO1zwmJ|@HRqdg7-+>#<WTf5sar*s;2x&{(T1Efd8
zsO>Wj*rLVK)ov!WI_$d~MVd039sQYfJ@WgwYZ$XRk?10Jt=(H%46B|kv91(bLu)t9
z55V_r8Zf#2ZGBy6QdM6J&OOpyTh$uFM%Pt(54$<gzCQyut~hrddRB7lyv1tkYZlai
zdukvNU($gkPk$HM-w~-g1;*B{T~!rklN6T8z5P{Kx?)VxnuYueIZN1k3w_hpmW(w=
z+gfnD0}e4BOAOEgK?Yb)yr;GA{A{xl6yJ~a1<moAgL}6Mi(tK=M90wjX-jmFro0tv
zkB3)Vv1Ub+Gb)XrgG(LgGp;VGnUaLY;cEh34%14K7ae+<=+QeDLey9{hO23kp1#`2
zOUld-HzdxK94}1{$i=kY!7eq$_q-G@&zu<svwwdIZ@$g$?CpuqHZg-{L!al(MssUx
zYcNQ$Gpr;A<B1M*+3aLLW}|2~1TZUjOzcoogdB~#6R8?V#yaB8T`TGcnjJBZZcfGE
z@?<K*iWB|_O8~PPg)({BPLrQ%=T(iFf@ozyiy2xZw#CkePCE6B1$k)RNkJlK%|n{d
zOP4mfK<&nSo-~UOOpuTgs|+(`>-&4r7+nCc-v)D@@@jEaLsz$nUYZ5kk||n!@?uTz
z$eE;-2z^i5-XDh(KR-!JNu!N)|C@rJho~?PGeOddjAjtRx;oPnvvW0|7ZYX)<4828
z4yHz0&=)o7q<XeA6G?aPmaarQ7H>2Y)~a?`FfB=F)u(3wnr#NW7ZO<I4PuE%MvMh)
zS3k<2-Ny1EBPBkikzihd0kvu)NhFDFR{|5N)}u<cVjtj5Dn8g{OfuP-ehZ-stGYoC
zitG~$M18^t?ZQ|?*JVsr>E6N2UAP-vMQ(+B1$>VlFg=(zaygAea-gLp2BUHP0<DGm
z;2HGQjB_n+0F}Z{$ao4%cMtIVgwK{*FJK4m?$cqZvjyDEs62(2PWDUc3_KNJLILUR
zemV@Hq~!A$a1>Bp&NhUmkmr}Bn(9c`B%p|7snZ1%d5Bk&Bb|J$2SNemBP~NHDG>>T
z0(zxu+V7D|gYq@1)E`haPOT-Z6T%5x=ICjyfT9^oohG2vO(g~XHr_O#17rqpfB&5j
z(i_A|;Ek>j0=|U@_H>|I1QeZEYNLQ<+@+xUccQ-JkZlxDzHmZG0p;5q@E35M)5fy}
zbep}=Na+4v#uo8KKHI9`ll_vA4zbiZViT`9#(d827~JMG`y6Sb=o<wy#X85W-9UaB
zNX_#3&Nm&4T`Tp;x3893D+*lfC~NJfVJ_{HkDDzO6>ymA3#i)t0=sI6iW(kt+8h;7
z+HR{oyM`p;Dk3VN^rois_oGH;sx0d7NloW(stn7AT+<YAk0bBtncPqJGm<Za-f|gb
zmdeode6C}WQ`cGn&v8&21&leUbpndg(754kS=2SWEsMH_w`I}I@HT&wPUGs%bgEq|
zpj2<EW&u5tyqQgsH?v9dW;RLQ%#fsszbZJB&oE;FuXa$*01x6dJvu-opnU8HH?WV#
zx;wznHstQFe1$d6;l0+cTJRR0KX<EhSr}*sYwmNd1_cy#VYp}VKF8l_pz?PaQ2y#9
zHiXMNqiHoP>^xU!0p-I@jBL@--K;CLRzO#6QGfL^V6JytE1)ZpGr!50#*~-Y+S4;z
zn=#^~I+-`f1<FTgQXk>u%dQK#G32IliC)S*PUQq%#JH*I0yXVIIV}28+>@*dep0qy
zNWInjdzV=I8%#K9u`6cHeu1SwK7|NED*Z&PbG6p1Ie}+#YvE8{+$J9!N_#ZY-yn_j
zw@4$UNg|6z{!fGveNvYRECU~ADxio*WPa%gC&I}`%%l^67kjM2FK;$Wkr*v!yF)|3
za~Wl;J}X1#GByu6J|$!P2#G{y8HJZ*_ZfBNWGA$6Y<|VcXUi3iq&Z3ODu6yvql9G1
zs8QGnZ9vV-ppiT`42u(>C4*LIBq~e3`r9W{{cS*6zEGCMCS;U&D;Yffv;j3YgT}vV
z0l!UqO}+n-Hd~t|I?D`R+JT=0Tgl+(rwyn%88i}qMZ&Crp9EORpuLOhe9*01BBuOD
z+JKs$LGwKG`8A($Mdt4jSxHKe6o@B5Q8L*1X|t(Mf{@6%2|fZ}U;7RrlJFy#Gkg?I
zFcaZ}Da%hH!AGX@`~=Y@lt_l?e%frJ%lfFr<tL#;GWhvvv+<J&P23selh7d<Zq1t~
z?oC{oH_z|Vym@~8`0FPbGu`Pt$DK-jMCY$3Tl?hGYguSO7Kvbzp)Wrz%;M4vvk*ne
z<slgMeaW;cgQ1@`+t8EHClaXy{3Mb|20uS-HdiBjBtQxHNdS`!etz0)@(CXaVFG?1
z-H@Zbe%fsO=nEe@B+v->&Clfu{j}NmN%WKZm@oL{bX`)X7#L_z%~iLbHe1~?2xMLd
zKM72dp=&>FHe(edmCz&LCqYRv`1xtGxmMvLR}J7N0ZB6W`DwH96FxF6gP#N=$>8Uw
z&Bjj#vkYYLdnngf^3!JPkA1wPkhLH9Nq~|J`TVrm_+7|rgWY__)Fj}&4(fCPB|d<;
zwI`|9T|xmRUVtH#)TjzGFtW@unQ}CV9QQcvPZRK44r;xCB8<5KD5+tG@H7G6a8T<7
z6!}b9Nx9Ny+@TZp4ym#w$U{ai_-V8Cf`m}8T-gM?fKmBs;Q7D*up@)L6fJOyHVG(2
zO|y*Z-CuvfY*#eGshF8(qmjI_e?ueh<yMx90jw9W#X+4TU<acL)aC=*5y3G$X}_R0
zIayn7=PW^Wb5=<087w$h7Cd*7DJuB4B2xD`t`ZECDPEI^Clv?O=0<^~#(>(=Ah6UD
z<k}l22}bHA4*#iQ(r+ENV*#Hnb*6yJ8I`ZLsy<b*U($_?4+6VYRZO$?pU#+#E)%~z
z!o)9AU&LSr`8>l^rfaO7UNL`q-So}Nug;rxC{Q<j(&omi^Ck_T%7cab@8bdo`8+jy
zK~JgurY85--w|GYi1wk!<pPWLZMEf*?UIm&N>{9Qy5jq+4Px;XmQti9oGXZd{+&3|
zCs+^*C`z}~sRD{D`r1_zViBP8*XZwjQh>0R-bRfAO2_A5pRrUI1-#=ZqGrFq<*X38
zIu}@MA)xB^3w)77SF>N>-43pHzrbH{aCMGpJnP_U_6z)ygR9*yuxN<tRXGm6mQt9p
z1gvIKOEn2N-ZD5d5&2m)GosBYxK_ZCToBJ^3K$e#SsvL8GJ{<%BYt;R-Q=QfVel~*
z#gKEF#gNGFY07NQ%HEXBg6oBc6wEU1O4dr|jk^UO2q#A-0DihGtVEX-;f0QWLn1nc
zB@5`5%9w0LYX>!3eDsRTk%tVNVI1FQn%}sz^7VKIEt4pGc&!hpTEJx<tI2#wco`SC
z!YO!~fTJC&*&twvgUYzL&ScaijBa&!Hw$>5gIXuxcO6u-fNpi`I5XoUt)!7I@FZ8t
z$qX)aQGOO-M$F)IP^HvnRZ)Aye!+;rpo;~3KZDrM)~}C%`mCQs&0$Q8&z5RwXE5cW
zwlOFiEfsr?QLi{ut##bO2B)Pl0mT9Wm{R@6OH&Ny*L-d$H|ewZ{5YS*i+$SiCx6QF
z5;aOZ?L!e$!%KYj*C)5{&KqxXBp8<5`t#Gvt-PNx{Y!k7hmd}Hc@lXG`(e3#c2NAX
zuK;~bPRH?lK9SFB_$)Vd<f)iEr;{5(rnLJjasR%^<>iLUZJc%~4J7zGIsbk>%j46>
zIQ<ho%eA0fpZ#z74f8{y_xb!0pXD~MpTETKp1{|B6?~Rky?%O$ge%V;nwfu+&*$@5
zuCM*&CCK?3d{15Ot;<buKfT;Hk^2<>dm4UvSyoAiF2(jGuV(n^@kOIPM)7$JpT)dn
z!V*Wr+0IGu$1>)4KFg%%5HfHIpXDa4e`fR3|HrhA$&ljx`wx7CrJAzhZI#N3&vW**
z^zyRt(dYA7>GRd|h$i7bY<#RK`$2m1Y!y_83cVNu<4p`idCG<#rD(?m|6!{_XMay`
zo{hQorm<d(fw2jWI&^q8e8_p5$(w7d!_@uPc`?GLSRKrYAEU14C4;w=jW|jN9`Ry?
zPl>bts5jSE<D7j*)A5I^@XHY|O2alRTlNX{X4(p-JDiO^tRh+Q?1{Wg0Nc)PsOcHD
z;yZ=uXwdV>&%m=s$-wgrmx1S*Dgz(!VQ3jhuZ+F@i9e$_2c9GkAOy`JpGcG>e2q_}
z8pg~1vq<XWMEqLV#;=j}NpjFZgWAf{j~4(>{AE8|qNU?f{97v19&Q6ZTe}Wu`eIdl
z2LK#0oSp$}q2G&Ho;;na)C)Lg<MW!PFIElA=XR!lD+m2MIq<?K!fQt1^Re)Mz%Zu%
zpko3QWg!CETE>gBtzmqh55I!(yBR;uuu(0HKfw4B#($Ra&olmT#$V0&LyQ-Tzl-t3
z-!g<eSEwH_zJ~F_|24+1WxU8kdqz^bIvEeoqK`4~o7wa~H3z;1c<RUSa733E_qr$t
z{Ruhn>vG_8jSC77X~AE8Ro1aaeEpkE{w~lXP)CNp+N$|v`m3ExFZ(VFd)mtoapE%k
zCxw!<viLXJEo>~t(E;|ux;QxenDL1#4d>I)A9M^e{?@ZhI%ZDlFB;DY?+M@F68~$4
z400#uQQY|~Q;V9O1HX{@oN}6BAa=Eq@#|>d<M_Nesg1yofj^NAg=GA;G5x*k4dDk&
ze*xn!Xfg=#m|tXkL#x5dJbwe@A6sGYHz7^OZ5nT2@9+1S{-litE7Ke8VoCaWZjQm9
z%6$HcbHvMl;X*Wo=+~_^2yV8T06eu<<|DE5zapSRB+LAD6!Td<&G31K>z&8=gM4L6
zp%LkzjWDU+O)O`NK`Yt{lkly33}S%sNygvZZxGyVdIuz;A4(ehQB1!V^d$ee!x4?)
z=BckSpNr2lgy%5*eHzaR&*|A5_*a<EMSO|6o!j*X#;?BGFkv%QZv#ID{^!Yk29KmZ
zC^{?2|GhIzI?Q<5c9!Hn!14?I7~qScf9V${S7G3Z&(6;n%+Z!PPt$mza?kTMe>W}A
zD(2I1s==Jg8SRYUn=ptO+}_VHe!=GqUhMoC<d4m=PP<ssGgHs$OU!37P24zSQGYMv
zmtAb|@0*i)DhHq6=D`1*`8<A!VX&L|jLJ89%U@^^a+7Z=<4>7y@DH*+=QI8l9w^&b
z4<|7GcE0QOUyN_kc&^NI>d1lL!F+B#$q<(^pKBREh2^PbUbiv6oawoF>LJE|wAB#q
zVtNWuNp^A!Gx!DLf6n+b+2Nhe__s8k6Q0w@Oz)4KNs%py|G}ssevaiw+$7*Lk20qJ
zfbIWXCYjIp-**~BA@f<o_&@V_U(WPrYJ4XDUZ!8f_IxyxY|TM`MGide>Ph4Ae&2Y!
zpZPy|qG8O_nEC<ZFXgMrXSv>=Gam`7D&zWBOn)7B)Hj%(q94+DS6nZn)jyg36ZZe#
z;|$t6l=Q!g9i+@hlNrB!lOdG(cb>)zSNFV%>DM+GT#W61BjX1*82n*eq!0KA!`{>O
z9Q>~WehlJzx%<bPQa3UG@9_Y=nVH?g_~^+75oLLv$ie@Y#2=cI>p{`;8_egPI>Sfi
z{l7E*jTVC!KV}ygeNH;Y;3>8+9pg3Lz~0{)rvC!-Y39sDjK4o_5VWqP<5b3f?rf93
zobhKf{v@`mZ6;fFGG1<?NC^7@jb|dyX%EvkTxXD6z4{8{5BSFWgE{#8km<+qJbowF
z@e1P~?=y(+G5*aQeC$z1pZ6>^gv+>NCNuse_5-5-dd6SZZRio=Mjy2GDA~E<3HBO;
zR?Uo;n@i=a&o+%`BF||6{EBIv^LoUCRq7ocUkCX*^eE09%)#dhuJ=#e;X;2W<M;A9
zP{t$eNlWrSu-Gs@kJn4TVEi-efQ6p6ASC+Z*nxeQ<@u2Dudg$Rsf-U78u|eHNxHtJ
z<0#<Ck4gMx85eb$-q3h|PhvjB>}Te4Chb<J^)?*wlB-Qj|7DhEIp<!)c!Yq^$Hm+)
z*E9YqUO!8}f1UBp%|Y}B9d~QIfxW-aFnxK0!OHyjE5`fx4H(YB=Ut{h<8y}3JzUA?
zBBRgJIR?SaQxV25*>3Q>tW>oc&xJjw6-=Lhg+a=7D8*2x{vF2*KEq`ajDP0@gP6s5
z+US+&U)*C5;-Bx)cqa0k_T|7I$bo-?`5(r9@@D4s0^?_}0yZ=LFN}ZtJcHorN6{wP
zV?A;RNc+NO!ylCc-;e{}r12s36wAqCsrDT7Tbcek?yt!_06w3C{>B{mdznvOt7#Dr
zV|6eG{m+^HLLQ)(vm*YGgP!)Xrv9D4^K6XiDGGTuy&b9XAvJ~-AoJQ1#(#H>Vel;T
zS;6?HPBn;?jHmF-RB!MSgLs(npJn{7_(DqL`JBcxk>|9R>BFq&BRKC);K~0S9FAxV
zvr`Y|;PVXA7r$=k+5M}Z<e-0p=&`QwUmv}lgPuZNQ+p+hl|1+>0p8T8#dLnEnEs=q
z4eA?~IUmRP02{ve$@PpM$@4pRr)t-Dhphu_W%`9z8B_`L-^KU}USRIx64&P7bEoiO
zdBh$bW&9mU!$AD=4}hnB8TWcbXYjC4&oX^&%FzD^>wY?Z!uWH!T}K+UdQIaw;W@nr
ze1#`1;P_(Ff9CUoF&ZCIm$1UVzy)X*N9wP`XBb2S<7YAcP{bgvVEjVHf1T%3nO{~h
zem(PfiyOR=@xNg|+`xP`X*?%9r|nFC|4xJaE$3au_&Dod>isJ4r04MK5lx9@r#`-+
z@tp9S9@Bh02nJtdJ`FsMgwLOH(EDRWwDzZxDa1rVAU6a&i+3O_S-d}*>Y>;?2}EQ<
zFoWJ`M|W=v0*bXEjs*f03=Ap>Xw+6cf8N3c*@^H~eWELhxa+a=5x6VWe?9^cAXr7T
zZJ?*;d=&Ad5Ui#vl}5@TL@AcT+I1&1HEoEVxPHy?O)E}W6OC5a*UYO|6sHEU{`%uR
zU~XbbpbefN7b%4G>FTC-#oA&~1T+Acm0SJ%np_#wv@G@4FQAw`Su+;q$XH0hezKKV
zI6r&F+!_P~%2r}-O^!O|*5;^VZf&kQ>gtVloDiz5Dwb^R>WZ4UUC`K;P6Rec_QhIt
zR5RA74ueE-7bvu%6MP^VJ#pRAlTV1Aux5EQ3fa>-6|oDJZ&<VR<Q2!K=O9c9LV`u3
zi18CWp^-T>E?=jjC#^nZ+0xa~Q%*ebv=f@5O-q*{h8YME8L>6mnjFyPl_o94TjQW2
z677gWUo<qFw0gy|<D+%eb=7knV~BFZJA_9<v2+@av6PN97O7*pA;twlSczG26e|ie
z#-Rv3q=<Jy5vwwyU?EB}0(r5tCXNP|>W@<pEkqun__!3COotH5kcFaWb;iM(qp3tu
z6S1TC8}Wn=VnUecil&nh7)u6r)5*uvux_G!q3)wlC<vB<P#PJv5I54Q*C;4#qLf7*
zRZKysK$=db^C*xWq=K~Ch7g#gUHZtB<*;NF(9VtOVM??i?iSjwHYXD%##Fj^h}hQV
zlNf;z>kuC+g@7V`t&p>)w+|r(DPk3ChU!dEED{POVj_S<<2qIj3mK1Z;xK#Zl_8W0
z!hfXWJUmU{6pD>!V(l1iI_QaGCZ3Qfh#Z2zEjhV4(ZX~$ISe=gqvq-lh34|A=ZJhR
zCe7^i(ABoe;!~g`Z<In$tF#=(I2|E$Y#*kP5OCcHJ+&#lK12mVs4YZvp*U^nSzMUn
z-{^Qp6wZ!uT}cyyMh^_Ky%Zw+a3HZ%zuUEx<x!_2Tw99{qZ3Ef5>hd}Zr5>%RU%-S
z8Nea~nTW9v9W%`xy0BqgT!#uWa#HLUEfs~{p>QzCzU;k9Ly?%KOT*4gyFdiF>GG9z
zqQ0<pv@+6sg|P@_BGWip(3)W@(eZnXj|CSRhT7jUM?D<c2cToeL4jnTPsbqgwv%Hs
zrHju@yk_a)YohVNk#+PSHlJxD!l~+*PU(EG#7Oo=I}zUoPASdPw8fo~i+GrA=oj)m
zCM1^2qZX4ff*0vvkx>e1C|$|s<i?uP@dhPV8z6iRg<tYU({UNksjl~!34<OR&^k71
z^$rW_Cx7@pL<((PxG=ie1l)70po!U;(P+YSr<G2xF`HmpZaEs$X{y3vOSc>Sr|ZY`
zYpTqw(bGwer-^Rq?cXGt@eKiPxK<tqYcy7<BPK?Z-SK!I(dZC@D9cd=(}nHXMDeN8
zaUMZPmE^EveF)bT=P-tzd^2CVX6kqV=>tNL-0>_EuB-+;{B(UfSfa}ls_TpQw_-UN
zr7@x7k|78bNNF0%5FO-HA&*}4UC%%ddb1^o_`d!@=2n+Qnf@lgSx}^49hVkCvYhaW
zt^{nD)MnT?l}CIx6Vr^e+|lEbra6DtXX*%H(%)@eNoT>}coA{#7#ccs7zLVg+48zE
zTcBNN8LBZ;w@2fSsioVkRw2c4#y)D8(OXl#*+R$0a_gn$Fw?Yv#GZ=GT#VrmM|dwS
zi#x?>pQg~u{k{F^Ok0R39ffwoa@y08kQZx#bR1XC%Pv?T1>MYOiKAIDM>IF);{+5c
z$lwlrqMx;op;^XXDoX%CqD8#73{GS+@S#|=@xlwMHoZ>DN=cn#)M?_1azn|Cq!)Z7
zX8GeZHCrEP{bXCvIi*NIip%N=&PGw$d?IqZH4{iTJue!?Lf5$t@x*<0$!G|1v^2d&
z{&y>6j1TEEkDci;?KLwG8G5O0pwF$1&DI?i=uvY4VjM5%v)H7kGLCMJ<%SOPM_V4E
zOr2Tjf|-m~AhA;QJ6gnr9YmB>gbVM+P|FaD%!{TTu1~Iv(gPlQgfTugo!pr~oR*P*
z_)D+N>0^(wJe(8S@p9ZkoS>BS@@NS~>5sP1`p?apgUa9oGl)IGX+6Z~dR3h~zX!1%
zamI>7pF0IYtjIXRs#hlvy0d!evK3V+JghJ&Y-CeSkvZF{5c0a3Qk^kO5!G$yC&15~
z5j|1oY{oSPg^u(lqR8uycgLszLw(&TRjt)u4Y0bS7fHkf$1mMK0TE_{R7_RJJNb@D
zCt}PyDO1$kIx!^#MhysC-wK&@aZF398n<oq-ID6Ic*{TsC}W8Ztdu#aqcTcXOAAGs
z=M)7)6&P9)5M3Z3bIQCw^*_Z-L;<?hDff5feQ7yQ@-gJwq7HT%AbLb3<>kF<IY%5y
z1IYbNLDvHCmY4Uk<UCyn`Rp&h9QgM?`J`9srwwE2kTXnAA0zlI_v<*R)&RhxCrAG0
zxxAe99*W4P+ppAiDIW$*2i@MZ@h|nud*5=tK{ALKhhKm6c<&RB{PKRdoG+w8I7EJ7
zC+AJb``A-n-XoXuZe7~QPh~`YM&nyidJof;U*0#DbA$z!`lY<Ie;1cu#th}XbU6=m
zhf#UD7w#|rMc}9kzP;_dzkVS<Y?**O@4@UG)MbFY<!|D0a$ZZF<2i7fRv&V{8Yypi
zc^_WRA}<$n<QM#xedXo3shsb<NJ^Ts)GOFqeC6f+dO729e%j%0{~cUj>_OhUkn@X?
zwBo8EJ*n<Nn%Xbr<$ZlQkMmXNORC3w<>kG8Ie*}5u;@+1cm@C-5$Q2RIC&2?_%fwj
zdod|DEqv%bDm>zF$``*J(WJ$nvi$3yCfgG;k?#q@TwW9-_9^vBv#v*3(w|fQ)d*79
zRPR-NQg2eG=aBMpehYct^3MB4!=_NgoSa`NCx}A;2q*Q+^S8(_Q~2tSFj`LU0U(kZ
zLGW41SNib6PR<`AUtllkieZzw$%_Q+r=`y|{WvP}$6@-#wv_z5&!jSGOLLT;$=`uK
zEgi=U{B_O9QT`A=0KCAbaDVwZT;8w0ZvL{fnvBzPNO?IQ%jG>J8oc2sLlKh)SWF=0
zgx-7yASw6!ua$<f*f(JP<tZ&=-z|-IOinSBH<cM|1x~B*FX|BgPyR`4&rq7*16`*4
a$T41QX8%!kw(<{dHRaDRCx?kIss10uN5`!I

diff --git a/legacy/splice_offline_beams b/legacy/splice_offline_beams
deleted file mode 100755
index 728af8c0f771d3c851b2b05a00201bb8672e361d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 32432
zcmeHwdw5jUx%b*L37KSanGnDrSO*+1D7kUJh$aX)@kB%}iWM9tlSwi%nZ&t3LOC^D
zwB|r6X{qISYz5m>slCYQ(NpTFr%_a_$F|s7<*5BiTiY|VDp;+eSaN>vTI-$JvuA>R
zp7TA=_s_Rs_FC&*zxTS<UVG2p@9f;Mw0W7$Fa&RQaiyTv2A85V=#E1PT|v<x=7~b=
zJ>n8E3hg{eQ~eF9K(E@)Xwh6cG~JGp_~zLZ!CFq#I4;>jD)V(;uHy6rMdeZmw<HiB
z+1hcDY@&3_3YQ199S3)+e&!=G@%3oF9?i$)X>Ayn>>v3?pVeBfwOoQe+PEYvZ7Y5s
zZ>9RY@v@m!#uZs`$$U3}kL>(gH$A*!rPeo6T=r@Exzz58+Hk04&aB#SYjrpjN%U0r
z%$rj^XI4$Dt7eAuo8%Mrn&oSxNi^PCl<XgaHqog6$;#NX+h1OL*+2cZ?^x~OS^b}T
z&2f%+`Ge};yln$=bPn`E(A+ub&7kGN*Ns4*HUhnT1p1XD(7Q&UPac8(H0W-8tlL}=
za@p@3fxdJE`Y%9t<73@cf<QJ|H$Mou^a{+208N8AqOGet7!f{SEbb3%^aVOL`r7=V
zun73W;jVy)#iL<#w*|sov7qSG#NJpu*eTjJM?>+TXp08@tq>6C@%!3B5q~(eB}nqR
zqoGK=jU=K)`bb4I?(6i2ATSyXiffvi7GLd~Q8Tk<R;r$^&8V3zd`+vb^R)(}!S)bT
zMps{Vb+{`MT<vcOlZy7vu80=uQ_!41#DM0sP5!g%lG{|~VGi5z5rU7m`1nvEtqgZ*
z+L%zNgpPzS=yu{l>bHse(WZ^pCtg=J&_u)5t=A!@yEWb6cFTUIbNl$kicV_;ABO}y
zVWKk+(UT^65fSuu+C(p9BFZx+dYOsdr|AWtQ5<-?+eD{zfVVv+I<IZix6ed3umAf^
zbo2Um&_pNQyd5&p5$M!**hHsehqoaUy_kt8pEJ=hfT``6iOyqAo!%X9I=s8`6P0e^
zy>lpT8$RaU@wDTJ7@qyd3*2IO%A44iUEF{gwL6IA<cVQyQ+`Wr3f;+Lvi&NxX$Vga
z$@b5wO(8saNVcD+Hihowe%XGC+7z;rdu01@YE!6A_R045s7)a{*)7`-Q=6v3$qlmo
zAhjt}Cs)Y!eblBYak3%B-?j&Bxi#L@c;k;y>uFrmxO&YhZ~wdhLAEy!#2nsjZ}z~>
zR`0I7DOI3(`*(WC@cN--8zg~ie|}T{)5#DT-u`Er26lS-(0$#&w>^D;U)TTN-u^s~
zci>Ke#o61x#N%k%HO@P*=n>4fACQCI{+HkVM%kB#CW@UyWq1A(J~r)m*3q=%nY!lr
z(>;k7n;~gGaLv1Rk-g2kcHB)|10@MaczinSRJQh?svl||c+_(Yl%x~ZH20rs9teB3
zCck?QrVT9d^enCaxwk*;SwUu05hYGFH`ThSR_TTH`$#$(UTTm=YBUfgOf?tPI;iF#
zfV2gTdDQ6B=sLo*di(P{+UUlWu=gb#d5`Z&+xx+i{?(q=`k}@fv`@)3($Lxe4q$Wt
z2g#{uJ`OxAc=A7JZos3XJ3YHmqe<=U&tEsrEs|x#71%53-d!bL&=B8V2$6wFp86~L
zca_wEE~QyS(Q%XfPr{^S<S7}8Q!>6KWqeD8<5cnrCGuBeEg~^D$hzd#OmU9^CvEZe
zpGs~}ya%kj@Qi{<{%R*_x+d#0McoEAc=srdUgcf66t@)vu&d;{44KLIKbDTq&7i6n
z?u2MMNt(i$JWB9%4206OCZ*{iDQ>H$4w`;9#_V}=x6<^J464%92Q4O_=#WgV&(w4;
z*x=I{465>Jk=9fNO}!a1lapmveg&*T=emr}^z@*~LI>(W(st@>hF?`m+uoG6pHcs=
zt)8_I=u8Q85bL9|EB`ydqVw$vv(wqy_7lstOl^;Y2fkgN(&pFNRzToaQUay6WwNW}
zb15p%`6ZsMbor?Vi&nwpg_&wrDK$UFWL6>k=p*Hn8*1ic2u%K3c9pm@sN@Y+2(m+~
zp|vUbuY^s{i8n_}Z$c?GU({+Uq2_1kOdnv$9@&-O3oJUnm(iJy9XUfIMXOG7X{NT%
zDs4te+Z3(M3xWQWKpDFtrL96~TaxPJ5UK-Ht+#4fOTO}9hHoyV?LHiJDumyse$0yo
zXnVJueUsYmmt7^lN>N!2mQ>SZCu*`gQ_VNQg9$M|rDlaz<AR#q83L2Dm6}Z%)O76R
z`cR%J{#qse*Er6U7w?=QFG^;R44mMwh7QY)rfo;5nS4?9FB*;h<QmkM>{{&!ul;o6
zj;D;qZ6DF0TXyFh8C;w=n*0Ad=^_TKV-A${L*8BY%hO@2cjiQSek{8aYJ_*&({$Fn
zWqsqi#`TT2Hu{cu2WCHs*m?V3NiGMcv;pfn!29zdPM%tlPk(^Z)-If6n){C?kD}r2
ze;fXK2R^T0r@j3j(e*~lJLeo-!kaJ8soS2SgX5Vq<=X`o2ieu<IyO8^6jzGk))Y4_
zAoHJ0{Lrfc;N9_Top*lR<4FAD$sI5(*$uVzZ#DJ5c&oSnx85CZoL;fIc_9CJTx;MQ
zwn`R@UI1oz?2Tn^@m|@*OQ>g2FB)sS1Npb3^v*xy?f-pzJT9Fsqb4q$O4bjbXsao^
zcqf7*#QG!TMY>HBf`9|FXGk&A&Vk?i<T<;y|FrkX<W*kdS?`PI;yB9k-zilBi%H-}
z^7<Bi2$)FJdUq^3O0_j)PqBB`qUX^{{`GzM>4gi==D&!xLFV+qEb;CUh=e}`Uw@LW
zZ&VvsL}zg==rUae;rx$FpqY?_+4T~ncn7w6^3umYt_kvzaq14N_T&L`2pB!;zfRsF
z`5*)HCQr_r#3)=^x^6j=JpGs9VXe>o%tz%*JjEGWE1~rXKp^o}t#wR>)-lO*)D5j;
zfLUPH`g_GURr3|6`I67eE?fd-8svsSK6J^bq{k~|-&xoPA5zzb5CHowQhNTQL`B7F
z->uZG?Yj?{R8*EKzPmJ^L)j-+oXccaPWyiJ-iY?~$8$vQ4&V_P_XySvQMb14<6@I_
z4=KJRXf)R5=<SwWIjx(7u00&DhP!ha^!B^bpnJ4nYq)An25mGOG)@_GQ>H<*{wB{#
zkvR?8gD$qmivVR0->u4?VQ22)dw{xifL;S8HGFp|zN4DYJbW8vS5A8bx?0{@?>+Sf
zj<B^EbIDsjB&WbLAJ#lwcHdCh(x>PvLM^^VJ#*GoT|leehc(Znd&W}Fhcz!qrZjmI
zLIQ^Tm$4$F`?++_I_mkb<|PZmGYBCu{Ig;>mS*@C^?X?KdIm$Hj=;3uqw6?>6Bl~1
zl6zCfQ`wzGFb@a$+9P`~{SFkV=z63mc}@Pz+46Os23&@C<dmQLXBB4;&@qW#bUP%-
za3?RM&Rr$9XekX!isT!9XL$HeJJh<ot7LvA0x^SY<*gnE^fVA2t%)A&R(s|pp9U)p
zZ37nMj}aR+w7W{)E5xe#5OwY<Nuo~kf1XL@Zge3Akc$08&&mJ7R}xy`0sn!9YYS8Q
zU8JABz)G*dS3ZWUX!g`4uQXW^F<B8ID<&x`TC~_YlNF=b3Ky(cCv6$yIXRKO9_=cr
zAHj~w3_HMyK#(=XWDUM1CJ&uX$K*Kd<(Qmsz?v^oC)R(|DJE4WYkJTnV^S%tA)BOR
zSFz{hP59E1EC4{Ccj)_fQ~xXU?Yntk%3b;LR9a2ts>c4mHub;XJni@LUc{4swI|z8
zLFb)s#ohI<@%gX0|D)#q_m}jaY8)Q(x_8GB!#n?1i9gc43ey$Hp7i?t?h&ExZ2*BP
zu5xT577h5K{@Y~P8oW*Tx2St*VvAo_BDmKkqAj9xV{0htuI_eEiTS3)gxuHCzO|+e
zw+c762=CIyB}-TOR;_Maxmv`!645}=7x8z}9m7CJC=&FAT17k<4#onw+enR8f86hj
z_jcp<VW6WPHzC{FL?qDRkHkczB@~Z|xV%}2!r$2~BB8bz>sY#c2^^5yX}X1h#qY}v
zvTXR7YQOf!hA+K5HR-IWzsi_kUl>Pc2|nJphlhtyIuOnz%1V^)VYXG_+GY>h^Kdal
zmwyLQdQf`lLK9^YWf&!0-_y5w2fm3sigF*y<5Xh4t;KQbz}3Tkl(daeg)p|P6h@EB
zm{45c*kcs9h&~aYJ-8gWTw|!njTd-!qwsxiDceP<YguXKwPnuDjy`eKg$pm6>6y&@
z6qZW(=pcgL+$h)KQwJZriPl)^+F`q<)NS(@mAV>B9gR-m2DuNPt-uVTWK5lHdj{s~
z=sNQWnCF0rBYyW}$mz?F^CtRE07EfP>)MuqaUrhFi2F#GO0doX<}ffHQp~R{-C%p8
zsMN98DK?<F0(}z^GZzJ8ajEMb+v3v7&)ebnK;Gigs@?gGrFD0WYAl_%qhM9(p#pn@
zQF^#wed$nv{R{c1HG*baR#Z9<u*TA=#ieerL(F0sFo>jB(;S$Gi|ZijCLK#tI=rOA
zOFFJAZHVR)Chs@Xy1M~#^wc3~|9|^`6al_(&-dvuG;-szShwY!Sk!4<p=}9GNNoCX
zVCs2*e#|NOn7mNIQqKiwEufA2|8j1)i`x7gkG{XtwjxiJboYR^!+NsNH)7h}QiX8p
zQkx^j>NXd+WRY8kzW>d0&Y>OTb%@T>v|XSIu|X?l`hIN~+dZUlbhnE(*8eHhO!<c;
zQ~C#Wk`+s*Y;Vx=u!M5I<ir_Mwy)IvINW8M{m|n7uNC~f*edW|?f5rz`ER;>R+lg9
z@(;RvUzekFfX3-^sxIf~(yHf^+Weg8nyar~;I3NJl8D3;?zuIyYwD_JCS<Grv-R_8
z>Sj$-RB;6)szFv7yY&O8oArRM(1yr`GKcnNw|YR1@iN{+fAMmR1f5Ct<73`zjyxJ8
z`*hkJUx>ZUQSb;Ah9eARHd{e80Jgk>M<sDo1VsBH)bpk+CW<Y8H&L!d<nk_UB6?mv
z9v}*PALf4Ev=xML<iCoMw@(8wZ#wC)mr`~209CgV=Kxjn9itvYQQ$a={iq*eXER16
zfRu#74`9df{J*S1T#6<^c>epJ0Ynth!JU7W;EtldKu`Y1RCg8K1;P1$rFvyiFXEMd
zj_U3rayWmO>Q&Acs4fg@)Hws#=Nqyy&sjwpY_iedY#>;kY<Qg&Br9JwRyhBel#G&%
zwa&kTt@#e)URbyRU#`$7GVVvC!}%83Rcw&K-Oi09%_SRg=W^0B+IRpxJ<eIw7-vuz
z`<zdZv<r-z(Ae&riG6;h@gSt_a0W@hMB{sC-0fURJX4H^0oda_Oy*P@6vlndA5o)5
zHugJd%IDY0#zE&#s8J^yhnz1_V}?-&p2N-<iJmEK8*;uuJ+lp3wVoFCv*;RC@C(3-
zPY~+9!tX$JW&VEhw`db#4^S1OL){OO=Z>NU<i$70aTi852wH)IWRx7mjyx!(2|?6b
z(U8>Vm|7vq-iHcTKgqkwAiC=XlGmW>tH77vDC-4|pQ7P-2ar*;D?bdzg3pl+<w+8M
zx2(PeD+~H%wGu)d7m{Hm9{@EEBQ#n#s!^0*1s#qFM8VYkr7Q|4zS6ojw581jI4bf^
z1L|^OMiicuL351*p=h%0PM``LG_T4JL6OrVtItq%2I(x{085;6Wpx%+7gE)J6?Vmy
zw^Fqk)Z+1U%-XlZm*NYzQ}uF)FP^X$)e2Y1FH6udW(|NPugGd4+9k&?BI;PG{#P~j
zqU8H9hKSPaGEwpr+SH@nDESH6i@0qUg_V%#D*45YAQg>*!je~i2Lxh5pz=S#iIU$@
zXZZ%~O5UJq3Hi4T6)DpY<r@)`(g$uQ6rD3l_e)owhn&&_bg-9q67?GrV591HCEx`D
zJVF2&>|e0N>%~|fP-UZzfKq7RaKH_S5M%pb9uvg@H_^mc`j%c+K=KOh&$-=16JswS
zc1_&P#386-;@wQV+k|}mfSb6**oRCqyBRs#guI`L#fTouJbu7U++ys%kev=?(uo6Z
zqKUDS2&st;%>H9iR5%U&h4vvP9y1a9nE1~mQzK9Bcauml_C6EQ%g8U1of>(FiJvyf
z+{47z$Rv$CcEC*{#n^o&cE^L7_<v07N!I=#;LLu8iI-x+loctD*0Q1p$WBc>!N^sl
zNE44S@?H~iKNIgUAvZ8_ADQG-LEgv2(WG4yS2K~a&dQ39!rDUn3MMWk-%FL9hYn~r
zYfZ$%OuQ6~Wn~I^nAz*dBuzZX!~$Y>(R_v?W{(iD{6@^ZLi=%zl+*nwbjSnf0zB0u
z6Gwj!ltTMy%~?V!|C>RE7`ux+u6RX)hRmQ67&C4m(8Fb7Y%hgi+{Y4if-QTMh!-f!
zdYE_$@{G~VU@o+GXkvkbPPwCqctIRH<7$eq^$Nte|FN++8+cOyU_|A6&}m#vi{X4U
z$8Uj8#<JsRV(YoEB^*mN_vcS|oqDe!G2cb^geS2xnyC2!Y7>#8B#djxirWk<XMxUE
zVYK`^dhLU!dq4Fofm>dX&HY<1MoSdnDtlr`7}t@x4}!;im5b=0szN4SOrZH;tRc*^
zz)WI{hKdOd6p|aM|BvXOd`T$KA&i@-jm`5eWIa`;8vHH6Sovb15L2oy!eAnDQzoLa
zj4CcvI;oO}${;FJU)QY9!s1JfHMULIuQuv0HEzgrJbJa~Qe$P_;E5i@rHYlj4CHBu
z2d>cm8NOhjdQuDejS{4}+;f4Xtq~$<`%ipRpqR9g{g1+eX|rL<0*z__(dg(y+@`Dm
z`5KJVl%=S|P?`Fg2ED0G{cOrqd(j>PmKNFv*c>tKtBA`VH3yx;jE!%T*oQ&B7t$^x
z+U46IyQ&;?+Rj4m3#d~V{$9$Sg)kWPVrsn!f!;#<eyvBw3alQ|#;7Gy8Z?H(_*x}3
z()n(ZwO?spW0HL>@Y&iECWO@fom|?tYE(*l=ULYNHfgu^EA2ZqSAiS%E4B7bh{@MY
zI#1-ZYRH5joAtz@VB9)E2=2a(gAqB3%j78ubd_U1Pq_`1xv0p<G5C50U-k_G(^v(7
znPlx(wx2RtuoL*~$d$oz+Gu2D<bGg@+|<i55qX1WdPdQp>XBEzFev3q*B48)FZZx7
zkFhTdPWuuiF!}Nu^2OS(e7V_V#|q%HeWCP#3}2qM`m#3@kvDichc67eT}cy0x35(D
zQi63zj>}9`<hU?6?aP$}CSQEOOtSVXUyhjUm<W8fFTXP(<hZ<O_2pb9B5yEh@nsw4
zPd<I|!WL15FmAsbi_nyP=-NfT450E|_JzUC4E7Wen0%oV(IjiX@}*v@pc$0R>B|i!
zg!JV#t1sb9MBd=OUOC6+sntbz$d{M~wX=EA57-yF0G7Vc*H<|%3~pwy=VkU~4KS0e
z{mPdIOm;j9eD=8f$b^u-oUr=xY9=CYaF4|oI`$cq@+EczCo_5MZzEsm*nfz9VQ|Wq
z%N`^!jms~{7i+)rWfYDP+Gt$*fY0`2stF-|`MuSb#!Q4fV`Uxt3`)(h_&vB2B7GTV
zU#8$1LyilB)4oIqOupOz%p_~S@}<*cM>Fu*z6_WU(w8@_z8uU%<P9FralAaDK`CDn
zw~U5)biDjM%#(9$Kl{SqlrNqI1SVfzAYZKg%9qznc2okN?aNscLi+M&t1shloblKK
zkvDj^WscGDa*+nLE37a!U0R`iIm5n8z@?fT7Y3(&8AV|7<r-inS^JeQO(r|u$mvVS
zgpj^`X!T`hCL(XJ&*BRmFAPfgviV&ykIt1bm?w{ye`H@6obu(epR+H|kT2GL<;x+H
z9fyF=9+zV#g!E<D>dT)p5qX2hEOU%5f*7=f$ED|XT=B_JzeDC(`;~cRSdM6;!})sP
zv(2kDA*6YE`j$Cgf>rUw9$lZ_t7XVb#6tU5v@4?OdXtzZ6e$0WzP|lir5G&8_T`H`
zV)`u_Mi;ZfLszJ8!g$)~TLf)JU@9*4<yGoMxXP4QsZFR%onpC4RacU9L2WOZuC7u)
z%jSrxf6!tp+;|oMfpE>f{90Ug;~HF4eOdQTM(?evS6-Eisu@~K4SF_fK70!iRejp>
z1?ad(b=bDzg6?h;{UJ@qh2HcHSUhQ4=EknTu^4;#MUcM8$ggjg8;bBNCfs9)>eX2K
zT!Y6DK-%PkG?d2uw_#9f=-IHba2g2PVPy?HB38{LHT_$3s|A;DHCz0;^((aGiEs#d
zX!EN=7!BvfA-=Rot;I!wT%@+5A`ff^FJrLh*94{m`yeorto>@XozyC5wmktn&9=cM
zrlo)CC{Sr5gq&@i)`MYECc-wjf%r2{=~WuiCq*GVS3QTvBGQwW*poBt34?E7u;-fu
zCQm%Y@Wk4$JXvS5;&Z@DPoyRM)uq>jke;+!J=vRyunn%w^5md~+|9;a`2|Rko~(dz
za!lf=$T4B?O$_$L2uz;rCr_;X%9G!kthfRAY)^(w2<gc>t0xyC0<`H-vkk7u@}y2f
z_9~<>7OlgSm7cu7o}6Y+7`&dro<#&EPbT6lHp$wrJn1%BF#-5&PX<f~>B(xVCkHbT
zwn4Ailj%R!-SVKVrdy+~FE!|caSt1Ne=BYWRDKResxkYAAbMeP<pcN%V$2>u^C2|l
zc&*ori<oilEHFwhpC>P^{mRQ0%~jyW-4X3&4C~A`O{Xy!4>{8NR<fRF<b5Ut^~Qjq
z)eg*$nYwOdX7hBM&P$H+d|?4R-$Um!+R~oWNmY4XL!JwqxRmELN714$Jks-@XvUQ1
zbv0m=o^J<clC@uXPN!Si$n$r!k>vS1nogcqKu)&jMJD$wo=-7#tzhPLGBD~}!gL;A
z>3Qw1VSsC|L03X`a)_(Q(K<Q2Ri8ms)Y8uN1>@@}$YcrGi<lXcX>}IWf5PN&eendY
zIcXbI1(tC-k*oMkB%Z?*Ke|tDOr&QI7vNMP<9ALorsDUh2pDDj763EJ+OOhg$E6f)
z6u)}lRU|~!WKE~|t<v<tjG430gs{Z#MpM@wW|r~O({B3h8lmF1fjqz0coNXMAnL}L
zAHt+d=qHQDI2phC4-u&Hg^z)3r+aL5HUvPm>z0F9E}xgweGAOx3+q4-Gk%Y{ZM=Mr
zWv6R~nb+a`4Z`=)IrAFqbmuIeFT(Yie}Guqg~m!c)z>eh#>J%Hz8(T*<JXXAKZSN3
z-G{YZ{4wsB%@K2HfcS}QH+pF^?tdFBGWy+!{=CP4u?@CX4d_uSa0_t&2#)u+HJ##p
zMAI|kUG<C!aU#X3j@IcsYh*m<(nRdj5gwH0kU_8FF}^%G{0s)kMYhJJ=gwC(coBm=
z4-=Ty)*FDCWbIdT=WCjaeCY(9eR)dL$(MUHJ;Rr&ubB{*x%0lM>jh@cSar(vp{;zS
z)y1tT_ptf*(+yO4rT#gXFR$JIz`kCg!6{!ayO+S^Yav2B$=a`cZPZ-k>&?KkuNyR-
zeEp23XZqS}LRfs=Yw9}8%o)eYBN`!noljHx`?x(~D>kUQU>0`tDM&78$AqL$!O?{e
z0<jOD(SmAMG-58(r(n13idQhB=~K~XEIf{2{S=?kf@)XbtReGW#!RqXu>f%Tj482S
zaSC+$jG1X$^geXaM;PVCl+p^Py>vq9nhVMe7tVk-k8nD#1PMpWI6To`;9Tfzg7tZt
z4srQqfZ)-=TtOWL)S`-Gs%#fl81_=9llTl!i%L>L=7GtnoB2%@)GTHdxIi|HrV8RH
zodh|x=q#%MD>>wvrDTBVs_{f9*97QQgwd1WO~nO9TJM;&UOehD<88Pu7$<cgtQXW6
zSB+O*UOK+1qTCo=&`|m1#pB5kJW6K&ZHUbHw2+Ai-i2s7RCK60A>B*hiRoTyUSw)Q
z{ijS|G%t1<Yc3#-ZYM@$bOBW-sVW{#IPrYqk~BXK^^1&pY3W+$%`(PQ@<n|GshQfp
z-GGF8@L&25+mTjUIk*_|tC&oJrxEu`C}i%-NIP!Z&w>zqcL7!?5!L3{WJIP0k<q@B
z+FCOZK0r)eI(Arr&HYUkZs#l+-x(=#KE<BNxKS#@j?u<pp*6|I?6g`e*l<kDt<j-K
z4ca_%gNAs1ji9*^DOJKEn9vI{NSI*@GYN#bB7=k(vM5b}!mB~CiJxLf4C8ER3HAo+
zg4(r-ShO}234{}^!CJi9!8d1?e$PXDBvC8VXe9(czOQW$1hOG>nOl6as9MU_jb#8c
z8nuyN9GSjxMZ~)U{9%?}i&SIW1L+QjkQR(gU%ZPV=xYi3J7YDVqbJ@P>Y7o5$6>PG
z8f-%yXGp10k>Nt9OI4ROH7`XCS1^(fRjf;~-k2{KjfU}>0EO!G$2(M4A_8Kos<#K@
zUEOhCpeq_wFcJ}oS0}KGC`yP*#5^AABn)JC;Qa={DC#)qVxji#Xject;&{bDyRWS)
z8V|LFf}{f;T+xyUh2zzs$d$@B!YDyd20v6C=L{R4`t4(k?eE>}xase1cI<E**lyqA
zSgTrhJ07rIHg>x`t_k*H$97GMIIfTw`)DFL9@%d1a9pb>m6Gz{cDtV`7fZ^4ir}^H
z+Avqu|7jJ-J2C7xIrdQL_!jMV(C%hQFLpdG%ZZM!%JQZ-6$<94rFteVW~;^FpF-r2
z6tmTv5wB#(uurtjHEc+sG7uPkCTxF77|U1h-5qz#`1Fle+AgoOO>^bh?5o>g_Yq;6
zZ#X8S`R{jJkw=2<_M^hqW#r*S52*j(j<}{iCv3R;L9|Eic!SD!WaH=J+nPw8*~uHb
z<9BkukoMn6QSC1Yhh(%re22u6b9RSBB~mCE8oT3{7T(dCwNu9Nf6;H7ES*ql<p?O7
zF)5UBu}?;58)&S}{#k7tvHz1bd@|y1w81(CztLmh_yZYlzwL4R1jpk$cG|DQ;9?T|
z(s%=t;ya&p?6G&_Q(<eOXxZMe*FG+1RV)~3^|f{-<eNj9uJ_SPOMKn&D8JJL?<t}8
zlK5lsIvnI%g1)%$#X4|gw<Ox|su5Zs<`K<Dohb0|-Ul%=eD(OT1eS*=y@BL<AJD!Q
zKVFqm+tJk-tW`OgwOBH4t8I<>>+9>PV;#Y8m>WI8NIMooxj@qIG3X{6-f+_Hmb*j*
z%S|NM>ZUwM`o#o(Cy|KbcNNet7nxARhh<j2r-j@j7>of%zh7nf8u!)p^(qzAUES%A
z209jPjD%6`?Cill(p5dPHPW>i)mSX3_A^v5Qx>rxm4KmpmLJtXPyHO~@prVL=<M3m
zLTvtE410og5U4v4?o5!D?yl}YBu;>qP6Y^W?nb4plVHH3GE3EZx~QY2Go+facCrsm
zG6%r86yV=PRP42pvuaY6dUS-hC~d$;<62=ym<)`zMkB2#+v?{8sED=4Xq046EQkoh
z)+m07LXq&dl9INbSQJH1%)g0Xvc#!_`Zx8=uAf2fLyu|%3970Al9lS{*0Anquq_ye
z{5aVe2)Bh3m|RkvluSm+3dK<gYP~&GZINb4wVNocDe<9HAQjK1RQyzX7W%vO&}ho0
zNF+q!0^!XqN>t}&e<-eJLm<2*80`YRdt*ZNMq9f>aH+c6AN6;-!}Zkx{GumVO-I3^
znTWo%gU;=$cMe9DduN#0>Sxu>tAq4-A`)EG9*hK|A^6|!uD%|Vtxdj%s2WF1JQ#>~
zMXT`!A{-E=9}M|?bZXv0zfDRXDst-6F+e1N_YmP0K$kos<$;bK95b!K7#+chIX-`D
zYqWk&x;}FTc*6enSn42_9Dzg>*9zf8XGDZZDXwAUOO1rDE8I#~GBI55U}#%AMJO_d
zghGE;cMIO6B-+AVeq0c>`?|ZxIJJb#oPjmOk9Q<>5q)zoieEd?br9I&!FJiCA3xDb
zf}?dK2=W3Y<7ZUrhf$rkwRW|{&@JE3#J}aD*Eqr9PJuO`8_NI*qU$AJHzu5H$#*u%
zg)c;!<%JVQAd1zFR=Re7*cTRZ;d5s!KO~8c1+}`3a4iS#;kk+=BHG#2DqjzEdm<PK
z^!noJwM`JDS4}KmhzqY|nXin(!Qqc$Bz^MrQa-#@${!OThG--qLis_X6zNVN{flU%
z9r5L%@JD)60T3ZtN+AxNz644Po?Om@kxd~?FcC<EF}4(Yxz^8_g<kz?tnMzvIemQz
zzvR$>NE0s5)fH{kbp#(HE>eNRIVCDu<Qc>Kqc{~_niWz1W}o~49L2pw|Jn_2%fkJu
z7Jsxi5R5=#K=?Mv*zt@bUv&sE&OU)tfSN_>Rb0|6IsM>_Je<<6_1f(3?)Jq3{)jJ<
z=xo9KqIl566%<9BAhbqIs1^!0{%beM6;Q)EK=pHE`1G+fkFYe0+5+-LVRXX7;VXQR
zU=PiGL|aCbESCqQJV&b2u@8%50K<bppl&(I)bZNZNxa?u)_PT7N>OX3DrTu-cKU~l
zzC<Ju3%1UngNBU31llAzgPnAg;^_DHhC}Tg@CP%Yolc_AMu%#mTTSYWA1H?DLO}lR
z5JPWz+ZkkHJmdkSz8Co7bR1$J=>$#d4tnET^p78LML}BBWR*VaA34f_M^w^3w&ZKx
zn{mH`Hbbo8DnH!I6Ym;p9#GxsJu|ka7*})kpK!m9Hbbo9>c46d$y)sYw{F2bE)C?W
zS?TI5rC-p>?oBCO!PRf4l&)j(EX$BTMpUZw{yr_>!L%T*eiP3bX*0x)TxEfFF}S~%
z8*DLfTxEq4r=+C!oB{b-%Fq>DWq$CT=t*;^8k_orHr0k3^;Qh0PoC1Ub78pe<MUw7
zV<V?}=CqOd7?dw4{WHr<E6gV9KM{lQTD|4R*E~GrE!SU7jozoZ+NK?b<=^h*#vQ3q
z_%v5}U>$_pn|F!v0FMigYr4m3(jl#uvrLObDw{ORLo3!0w_3#&>*PdE`h+-68qE^N
z7*6yAmQy^lu$;!pDUmsX`)jS5@?B}m6O`4c>ghcT;Cw_V<?P5Tr7V&2DbI&(!p$n$
z=wUr=(2hJrRZs6(D`yZwBIh$^X=jO?0eL>`ziO>drL`k-Fs)bBSSx24LLz4%W@%@M
zoU?d7>~uUirmaqwhFGBp={>9Dq(F${guyJ;ERItQ&xgHFtNiN7s#zsx)Il8Q_hqSO
zahwr&KJ5EiWj>x3(}oe38RM$Od)CT1aFECud0E<7BIod(54&D#4X3pu_l>o3$yzy2
z4iY(sE=xO0<gB~%VPDo-U)5!f5oeX0)&_B$@|LBV#c`6{`LIiH3z#;qE_0}6m7Eg>
zah#!+rJBWYPTTpg`?bnPbcqp1E|@YWy=SeQ@&$>UQ<kNjC2|(p`LHanR-XWX<z%od
zxtzAeSy{kx_SJ~k_w;F+R<7O$xWy6-u|Dmhs&P`n1f5r82tQXjMP!tC*Z!CCX%BF_
z3(q2G1C*<r)WJ{%Nbj??wB*z>I+vCf*7CGgY!jOW*E2MOJ*`vKvNZ>CYOctoIbhX{
zllZ|qa;gUZC#!DFsd`K<)vY;HAIhy7{8rVk>(NZ=(k2EsSTUSmc8ShS!t~-yu@U8Q
zZrS;;H)(md=n}fuT6N!``@Wvm&ADA!x>+7)gF*L**k5URze(%PI*Yxc`*=D-9%o}^
z>1KJHzjZ$BMm;-u;NZ8{>UU6McnU!t=TBwnW_g@(bv`U7*!XoS4!%cm_4yQO6<6tZ
zOth)5R@~wv1>c)gcY2>Kes#K2)i~3N)tW5eq^TT&IsNHHJcFbSTwGm{wwtR9GwkLT
z8}~Pf&$65Iq*$%V0#13#A(#_LY&!h{TwL9lBCX<TSd&z|xy4C6f0KB>icfma`AMwS
zWC5od<q*sd-*)Tl7;tg*)f8zJS6|a4WjD9j_Z;Fmi{`;p4AyYfnHu3$TrJilC7xU5
zX<*u0Rm&Ei6&6mtVYP6OtDHrX!*0%qX-Mf_!_{x5NUON|aE9I7;@p$JNqp8c<TM&q
z3%j|>88JBo^OthIQsD29@9V1z{y8Rple7kuf4azDU#;H_t$0pT&)471tX~wZc%E*2
zlIFWM{EgO%|Kv8kv3?$&l3O=B9-Z^wu`u`9@yI-$6Q7rQ-%KVxKlPrOO!}zQ`)D%h
z1%ltxl1X=n4S1k#-SB<@yb9GyMG}xGO1*E!+-nz3^#*h^5p}yL7JW!yvTk;~ou@%x
z1DgBn$Rp+GXVS}3@A1i`yTl4g{jh9yQJ(s{ALcGQ5~B`LQj}%0i;C2Hfy`ZYF(&ms
zpiKH${5=!5nI!9WF;4LNd(547dRvb>7rqkYT=X19LZDqB8#c#g7f6`PvDs5UgU;-<
z3*;K+*z5wyc{w(_xHt!u>~Uw&4N&sLJHyrv8m3(IOF*|_-1q4FoXj^x(~nu`^EBP9
zmwm=BN5v57c#<8o(RT?W$2jf;o%C+7$k`+D+4Er}`yY|`(c;Y<Ly*hgCxAzKTGqV!
zsg#pFuU^r3{yP@3iQOM%C3~L#6?7xVJgh)KM#J9*eZ25;bBUz0)O0x;^m4?r!R?kr
zWZTH?YL=rfb0}W4U8m{1nszI)SO-NUhhJ2|hyBgKyEVpK(EDQCIgXbE@Fa&{<$$K#
zK7)!rk(<Sc&ucut4}$IAtLbaqZi#1m?$h)Q7W&sTonMu}_=h#UTjN>&A<$j0-=*nn
z)b}->-^0Lip3w9@jc5APpy%@IB1z9)59xh}x$tj-PI0^2t^8#>-yZ=#T8|gMKZWrZ
zgPu$NOpV{8<+J_sCB8H(k9V=g@3Yv`r0Mrt=*zVnz6j^}wpQc!TlD%h{h)>3rsbT_
z$NQijnA;_tM=4!?)lhLeWRbH+%Rg+Pe`y3c-_-aa3;q!3G*5)4v*sr>{y9zO_|uz_
zXdI7O<h-Qm$1U{NHT?w3?^7DyWcq%$B=C6sS=0GFFHHXkbcIy>ye~wSAK7!-BBzY#
zNCb7TUd=F3;?w7A_Um%cX<gzkcKlPfxmx}ii~Oq?ufL=QwfyTPovllk0WC)ya7*F}
z-SuOXF4*sSP|-D6bZPvG1B%XZ_&3CX?BuJ$%QZtE@KTP&-`$`Sp08(ITF{`Dzd_4q
zd%hy^TBxZwr16I|p6z)|(+^wdk4rhFS-%x{R^x{>-lcW?Leq~~<h(BBXppHmO?Zs2
zb$yF#Imfje9^a2acboXRIt~DuKd)Qx^b#wgpRmxUfj*g;O=Xj`GyD2;sg`qE%h9Jj
zK`&z?`TUY2j^|CF=UU%JN{$Ug@gCenTO0CZ(|-la7w`1pF?$5rxOl4F<!cXjwIF?<
z71=T|pFhzf0$rWm;b0sIsdMMdpO+g%f2u3w^GBopULQU4??uWOUfAnvO>}l5Z31Z*
zsTOi3Lh%f!&$n!4<8@1Yco(maQaCaI$j?}E^YX^)ny$|5QF$vqAO4n{Z>d*H@Ge;?
zeAhHzzqqm4cm1+utCp_zt!`Y5gca~2S2o}a#1hg}nbFqJz<)A~{y3N=V}X(}WcH8D
zf6%EI=lcU<DkVhbOk@gHnK3FACiS3RXVj>-6Z?WP9Z#j6=u{k)`j9C}rRQYQ(yu<Y
zW@n^vvohb7tWprt-Q>MWJCeRqCW+1?QW*;}FNG2|r1kWIWtq;9kr*PQE5}6sMYc@g
zG7VCt7G`$n*D_N!1zzB+Q<2i|cs8d*a1ICpoRI*eQeDy+OBsK~O}+0K;$vMtybZdQ
z(kUn@gwnKql-!Y-bfZ%yO>>2&qR#2ewA`G4la&R7ETmX0<qono5V%x6hfD*q2%|(8
znIj`(s`E-S61!5b(^h{bFO%Qr!=JL_!-xLHUCuvTr*lOzGJ!Jw%wDF@R0>n-1>PAc
zRcSE&y**JA>+M8V5z2T}l^tB8WVC3vsEHs4ucmQvQ+3?mu3PwFK@BBrwN~RL-Zj+f
z@W(pv>+s$P<f#%#wX$bZFe<Zp%ncv<qQS7A7<8>W92Yfm*lJL%Y41WaPH97Uxp*W5
zP?tQ`YJwelQR+apQmUm`)#{`;xDE~eP9#s&D0U36sKLtEiPesN&EAq|2htyD$1<v$
zp-5X7*ZnOm(cmU-QC1_@5hbot=B?({l1%4}|4o&?<{9|lazJi;pNC8Q<Sb(&ta-Zx
z4Z0=5{Cq!%OSc(}I!;R&TRmE4e!g$Sr6&WO&A$Zwc*S`a>))p<T(bT7S`MEdSuB0e
zrbqwGe^^(z+@~F;hm}-iem?)+2pUe=DSm!`GM8_m&%9aVM|mLhE@IZt_qDj3teM$<
z#&Nk3{q$~QZu9*wE*mtz)qYJDTTr1fVSc_(#^sO(vVN=oJ2n4ejicYm(#9pf@0s}N
z8IYC#^Po{o_+c==@0v?}Sqy~Q$X4^#kA|6_@6T~LWCo+o*NI&2wea(OJTBQ@7Q<~W
zzhdF%>tinetnUglKkMc8*DU-?^u*#)CmAt3>-hPm=I8O_`-fa6Q=F>8b~E{3Ed18L
zGaP52F0JwZk%gb{M{?P~hG~AQe)`^0iVqz_wDEl*{`<pLJh!?0C3q;r%+G%ZwfiEH
zmY#2_m2NZtZ_q(=kNNpNDF1!u)B1qr`N@1dj{Cq%5a#Fmr5*YfBlVj%=I4^`dztxD
z-=E!zi}kP!=HvEhREWm<-EOz+a;ZX`-X@!=ohGN+e?g7(*QF?`!u-=Lbi&fc<vCMb
zcIe%Fx2iPgHW$pt?Gg~IH0tYhE5r(2s#Z33{0RI%ZcrREI51j(RoA5>@SnX}F<iq7
z-w6COHNQ1}i<=a~0n31~elD*ZLI3Y=Q2bYwYsDkzZyJGrazHU0%Vf`rHQ8Y~L?y==
z&7Y)`Beu!-%)p1;sU#w{LVPSCmh=-{vJg?rg28^PUi$8un}4NVcy6*VSeoL00P(di
AKmY&$


From 9fd4c10b4caa3b49b7b7acc0fd2a78e591e8dbee Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Sat, 15 Jun 2024 22:45:28 -0700
Subject: [PATCH 10/30] Add CUTLASS example

---
 src/dsaX_cutlass_interface.cu | 315 ++++++++++++++++++++++++++++++++++
 1 file changed, 315 insertions(+)
 create mode 100644 src/dsaX_cutlass_interface.cu

diff --git a/src/dsaX_cutlass_interface.cu b/src/dsaX_cutlass_interface.cu
new file mode 100644
index 0000000..fc68d55
--- /dev/null
+++ b/src/dsaX_cutlass_interface.cu
@@ -0,0 +1,315 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#include "dsaX_cutlass_interface.h"
+
+DSA_FTD_ComplexGEMM_CUTLASS::DSA_FTD_ComplexGEMM_CUTLASS(Options const &options): 
+  problem_size(options.problem_size), batch_count(options.batch_count) {
+  // Constructor: copies the problem size and batch count from `options`, then sizes every buffer used by run().
+  // Allocate memory for batched planar complex GEMM: batch_count matrices per tensor, times 2 for the real and imaginary planes.
+  tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2);
+  tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2);
+  tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
+  tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
+  tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
+  // Pointer arrays: one slot per batch entry for the real and for the imaginary plane of A, B, C and D.
+  ptr_A_real.reset(batch_count);
+  ptr_A_imag.reset(batch_count);
+  ptr_B_real.reset(batch_count);
+  ptr_B_imag.reset(batch_count);
+  ptr_C_real.reset(batch_count);
+  ptr_C_imag.reset(batch_count);
+  ptr_D_real.reset(batch_count);
+  ptr_D_imag.reset(batch_count);      
+}
+
+// DMH: Replace this with data from DSA-FTD
+void DSA_FTD_ComplexGEMM_CUTLASS::initialize() {
+  // Fills A, B and C with random test data when `testing` is set, then caches base pointers, strides and leading dimensions.
+  if(testing) {
+    uint64_t seed = 1234;
+    
+    // Use small integers to simplify correctness checking
+    int scope_max = 6;
+    int scope_min = -6;
+    
+    BlockFillRandomUniform(tensor_A.get(), tensor_A.size(), seed, Element(scope_max), Element(scope_min), 0);
+    BlockFillRandomUniform(tensor_B.get(), tensor_B.size(), seed * 2019, Element(scope_max), Element(scope_min), 0);
+    BlockFillRandomUniform(tensor_C.get(), tensor_C.size(), seed * 2020, Element(scope_max), Element(scope_min), 0);
+  } else {
+    // DMH: construct DSA-FTD interface data transfer interface
+  }
+  // Cache the raw base pointer of each tensor allocation.
+  ptr_A = tensor_A.get();
+  ptr_B = tensor_B.get();
+  ptr_C = tensor_C.get();
+  ptr_D = tensor_D.get();
+  // Distance between consecutive batch entries: one real plane plus one imaginary plane, hence the factor of 2.
+  batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2;
+  batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2;
+  batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2;
+  batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2;
+  // Leading dimensions taken from the packed layout of each matrix shape (D reuses LayoutC).
+  lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
+  ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
+  ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+  ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+  // Offset from the start of a batch entry's real plane to its imaginary plane (one full matrix).
+  imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
+  imag_stride_B = int64_t(problem_size.k()) * problem_size.n();
+  imag_stride_C = int64_t(problem_size.m()) * problem_size.n();
+  imag_stride_D = int64_t(problem_size.m()) * problem_size.n();
+
+}
+
+Result DSA_FTD_ComplexGEMM_CUTLASS::run(Options const &options) {
+  // Launches the batched planar complex GEMM options.iterations times, times it, and optionally verifies D.
+  Result result;
+  // Event/GEMM failures return `result` early with error/status set; a failed pointer copy throws std::runtime_error.
+  initialize();
+
+  // Configure pointers in global memory
+  struct {
+    Element *base;
+    void **ptr_real;
+    void **ptr_imag;
+    int64_t batch_stride;
+    int64_t imag_stride;
+  } tensors[] = {{ tensor_A.get(), ptr_A_real.get(), ptr_A_imag.get(), batch_stride_A, imag_stride_A},
+                 { tensor_B.get(), ptr_B_real.get(), ptr_B_imag.get(), batch_stride_B, imag_stride_B},
+                 { tensor_C.get(), ptr_C_real.get(), ptr_C_imag.get(), batch_stride_C, imag_stride_C},
+                 { tensor_D.get(), ptr_D_real.get(), ptr_D_imag.get(), batch_stride_D, imag_stride_D}};
+
+  for (auto const &tensor : tensors) {
+    for (int idx = 0; idx < batch_count; ++idx) {
+      // Addresses of the real and imaginary planes of batch entry idx.
+      cudaError_t error;
+      void *ptr_real = tensor.base + idx * tensor.batch_stride;
+      void *ptr_imag = tensor.base + idx * tensor.batch_stride + tensor.imag_stride;
+      // Write each address into slot idx of the matching pointer array.
+      error = cudaMemcpy(tensor.ptr_real + idx, &ptr_real, sizeof(void *), cudaMemcpyHostToDevice);
+      if (error != cudaSuccess) throw std::runtime_error("Failed to copy pointer to device memory");
+
+      error = cudaMemcpy(tensor.ptr_imag + idx, &ptr_imag, sizeof(void *), cudaMemcpyHostToDevice);
+      if (error != cudaSuccess) throw std::runtime_error("Failed to copy pointer to device memory");
+
+    }
+  }
+
+  // Two events bracket the GEMM launches: events[0] is recorded before them and events[1] after.
+  cudaEvent_t events[2];
+  for (auto & event : events) {
+    result.error = cudaEventCreate(&event);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;  // was `return -1`, which discarded result.error; report it like every other failure path
+    }
+  }
+
+  // Record an event at the start of a series of GEMM operations
+  result.error = cudaEventRecord(events[0]);
+  if (result.error != cudaSuccess) {
+    std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+    return result;
+  }
+
+  // Run profiling loop
+  //-------------------
+  // Execute the planar complex array GEMM kernel via the CUTLASS Library's
+  // dispatch routines.
+  //
+  // Note, for planar complex array GEMM kernels, all numeric type arguments 
+  // specify the data type of the base real types. These are understood to
+  // apply to planar complex representations of matrices in memory and to complex<T>
+  // structures for scalars.
+  //
+  // See tools/library/include/cutlass/library/handle.h for more details.
+  //
+  for (int iter = 0; iter < options.iterations; ++iter) {
+
+    result.status = handle.gemm_planar_complex_array(
+        problem_size.m(),                                 // expected GEMM M dimension
+        problem_size.n(),                                 // expected GEMM N dimension
+        problem_size.k(),                                 // expected GEMM K dimension
+        batch_count,                                      // Number of batched elements
+        // NOTE(review): presumably optional per-batch M/N/K arrays, left unset here - confirm against handle.h.
+        nullptr,
+        nullptr,
+        nullptr,
+
+        cutlass::library::NumericTypeID::kF32,            // Base data type of complex-valued accumulation
+        cutlass::library::NumericTypeID::kF32,            // Base data type of complex-valued alpha/beta scalars
+
+        &options.alpha,                                   // Pointer to alpha scalar, of type complex<T>
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex-valued A matrix
+        cutlass::library::LayoutTypeID::kColumnMajor,     // Layout of A matrix
+        cutlass::library::ComplexTransform::kConjugate,   // Complex transformation on A matrix operand
+
+        ptr_A_real.get(),                                 // Pointer to array of pointers to real part of A matrix
+        ptr_A_imag.get(),                                 // Pointer to array of pointers to imaginary part of A matrix
+
+        lda,                                              // Leading dimension of real part of A matrix
+        lda,                                              // Leading dimension of imaginary part of A matrix
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex-valued B matrix
+        cutlass::library::LayoutTypeID::kColumnMajor,     // Layout of B matrix
+        cutlass::library::ComplexTransform::kNone,        // Complex transformation on B matrix operand
+
+        ptr_B_real.get(),                                 // Pointer to array of pointers to real part of B matrix
+        ptr_B_imag.get(),                                 // Pointer to array of pointers to imaginary part of B matrix
+
+        ldb,                                              // Leading dimension of real part of B matrix
+        ldb,                                              // Leading dimension of imaginary part of B matrix
+
+        &options.beta,                                    // Pointer to beta scalar, of type complex<T>
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex valued C and D matrices
+
+        ptr_C_real.get(),                                 // Pointer to array of pointers to real part of C matrix
+        ptr_C_imag.get(),                                 // Pointer to array of pointers to imaginary part of C matrix
+
+        ldc,                                              // Leading dimension of real part of C matrix
+        ldc,                                              // Leading dimension of imaginary part of C matrix
+
+        ptr_D_real.get(),                                 // Pointer to array of pointers to real part of D matrix
+        ptr_D_imag.get(),                                 // Pointer to array of pointers to imaginary part of D matrix
+
+        ldd,                                              // Leading dimension of real part of D matrix
+        ldd                                               // Leading dimension of imaginary part of D matrix
+    );
+
+    if (result.status != cutlass::Status::kSuccess) {
+      std::cerr << "CUTLASS internal error - configuration not supported" << std::endl;
+      return result;
+    }
+  }
+
+  // Record an event when the GEMM operations have been launched.
+  result.error = cudaEventRecord(events[1]);
+  if (result.error != cudaSuccess) {
+    std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+    return result;
+  }
+
+  // Wait for work on the device to complete.
+  result.error = cudaEventSynchronize(events[1]);
+  if (result.error != cudaSuccess) {
+    std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
+    return result;
+  }
+
+  // Measure elapsed runtime
+  float runtime_ms = 0;
+  result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+  if (result.error != cudaSuccess) {
+    std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
+    return result;
+  }
+
+  // Compute average runtime and GFLOPs.
+  result.runtime_ms = double(runtime_ms) / double(options.iterations);
+  result.gflops = options.gflops(result.runtime_ms / 1000.0);
+
+  // Cleanup
+  for (auto event : events) {
+    (void)cudaEventDestroy(event);
+  }
+
+  if (handle.get_last_operation()) {
+    std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl;
+  }
+
+  // Compute reference in device code
+  if (options.reference_check) {
+
+    result.passed = true;
+    // Check one batch entry at a time and stop at the first mismatch.
+    for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) {
+      // Define the GEMM through templates
+      GemmPlanarComplex<Element, LayoutA, Element, LayoutB, Element, LayoutC, ElementAccumulator>
+        (problem_size, options.alpha,
+         {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A},
+         cutlass::ComplexTransform::kConjugate,
+         {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B},
+         cutlass::ComplexTransform::kNone,
+         options.beta,
+         {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C},
+         {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D}
+         );
+
+      Element epsilon = 0.1_hf;
+      Element nonzero_floor = 0.1_hf;
+      // Compare the whole batch entry (batch_stride_D elements, real and imaginary planes) against the reference.
+      result.passed = BlockCompareRelativelyEqual
+        (
+         tensor_D.get() + idx * batch_stride_D,
+         tensor_D_ref.get() + idx * batch_stride_D,
+         batch_stride_D,
+         epsilon,
+         nonzero_floor
+         );
+    }
+
+    if (result.passed) std::cout << "Reference check passed." << std::endl;
+    else std::cerr << "Error - reference check failed." << std::endl;
+  }
+
+  std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
+  std::cout << " GFLOPs: " << result.gflops << std::endl;
+
+  return result;
+}
+
+ int main(int argc, char const **args) {
+  cudaDeviceProp props;  // queried only to confirm device 0 can be accessed; the properties are not read afterwards
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+  if (error != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+    return -1;
+  }
+  // Parse the command line; when options.help is set, print usage and exit with status 0.
+  Options options;  
+  options.parse(argc, args);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  // Compute GEMM
+  DSA_FTD_ComplexGEMM_CUTLASS gemm(options);
+  gemm.testing = true;  // makes initialize() fill A, B and C with random test data
+  Result result = gemm.run(options);
+  // Exit status is 0 only when result.passed is true, otherwise -1.
+  return result.passed ? 0 : -1;
+}
+

From 6c3532322a416fba2af5cd58562b70046cae63e9 Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Sat, 15 Jun 2024 22:46:15 -0700
Subject: [PATCH 11/30] Remove code from src

---
 src/Makefile                      |  208 ----
 src/cuda_correlator               |  Bin 34272 -> 0 bytes
 src/dsaX_beamformer.cu            | 1128 ---------------------
 src/dsaX_beamformer.cu.wrk1       | 1003 -------------------
 src/dsaX_beamformer_offline.cu    |  933 -----------------
 src/dsaX_beamformer_passon        |  Bin 178600 -> 0 bytes
 src/dsaX_beamformer_passon.cu     | 1057 --------------------
 src/dsaX_bigfake.c                |  320 ------
 src/dsaX_capture.c                | 1080 --------------------
 src/dsaX_capture_manythread.c     | 1115 ---------------------
 src/dsaX_capture_manythread.c.bak | 1053 --------------------
 src/dsaX_capture_pcap.c           |  852 ----------------
 src/dsaX_capture_thread.c         | 1107 ---------------------
 src/dsaX_copydb.c                 |  273 -----
 src/dsaX_cuda_correlator.cu       |  309 ------
 src/dsaX_dbnic.c                  |  435 --------
 src/dsaX_dbnic.c.bak              |  381 -------
 src/dsaX_fake.c                   |  320 ------
 src/dsaX_filTrigger.c             |  559 -----------
 src/dsaX_fluff.c                  |  415 --------
 src/dsaX_makeFil.c                |  276 -----
 src/dsaX_merge.c                  |  580 -----------
 src/dsaX_nicdb.c                  |  483 ---------
 src/dsaX_nicdb.c.bak              |  434 --------
 src/dsaX_reorder.c                |  515 ----------
 src/dsaX_reorder_raw.c            |  613 ------------
 src/dsaX_reorder_raw.c.bak        |  672 -------------
 src/dsaX_reorder_raw.c.bak2       |  608 ------------
 src/dsaX_simplesplit.c            |  362 -------
 src/dsaX_splice.c                 |  201 ----
 src/dsaX_split.c                  |  601 -----------
 src/dsaX_splitup.c                |  285 ------
 src/dsaX_store.c                  |  218 ----
 src/dsaX_testdada.c               |  161 ---
 src/dsaX_trigger.c                |  585 -----------
 src/dsaX_wrangle                  |  Bin 99600 -> 0 bytes
 src/dsaX_wrangle.c                |  378 -------
 src/dsaX_wrangleAndWrite.c        |  365 -------
 src/dsaX_writeFil.c               |  486 ---------
 src/dsaX_writevis.c               |  428 --------
 src/dsaX_xgpu.cu                  |  375 -------
 src/dumpfil.c                     |  294 ------
 src/fil2dada.c                    |  521 ----------
 src/flagger.c                     |  484 ---------
 src/gpu_flagger.cu                | 1547 -----------------------------
 src/spectrometer_header.txt       |   38 -
 src/splice_offline_beams          |  Bin 32432 -> 0 bytes
 src/splice_offline_beams.c        |  132 ---
 src/test_read.c                   |  279 ------
 src/test_write.c                  |  452 ---------
 50 files changed, 24921 deletions(-)
 delete mode 100644 src/Makefile
 delete mode 100755 src/cuda_correlator
 delete mode 100644 src/dsaX_beamformer.cu
 delete mode 100644 src/dsaX_beamformer.cu.wrk1
 delete mode 100644 src/dsaX_beamformer_offline.cu
 delete mode 100755 src/dsaX_beamformer_passon
 delete mode 100644 src/dsaX_beamformer_passon.cu
 delete mode 100644 src/dsaX_bigfake.c
 delete mode 100644 src/dsaX_capture.c
 delete mode 100644 src/dsaX_capture_manythread.c
 delete mode 100644 src/dsaX_capture_manythread.c.bak
 delete mode 100644 src/dsaX_capture_pcap.c
 delete mode 100644 src/dsaX_capture_thread.c
 delete mode 100644 src/dsaX_copydb.c
 delete mode 100644 src/dsaX_cuda_correlator.cu
 delete mode 100644 src/dsaX_dbnic.c
 delete mode 100644 src/dsaX_dbnic.c.bak
 delete mode 100644 src/dsaX_fake.c
 delete mode 100644 src/dsaX_filTrigger.c
 delete mode 100644 src/dsaX_fluff.c
 delete mode 100644 src/dsaX_makeFil.c
 delete mode 100644 src/dsaX_merge.c
 delete mode 100644 src/dsaX_nicdb.c
 delete mode 100644 src/dsaX_nicdb.c.bak
 delete mode 100644 src/dsaX_reorder.c
 delete mode 100644 src/dsaX_reorder_raw.c
 delete mode 100644 src/dsaX_reorder_raw.c.bak
 delete mode 100644 src/dsaX_reorder_raw.c.bak2
 delete mode 100644 src/dsaX_simplesplit.c
 delete mode 100644 src/dsaX_splice.c
 delete mode 100644 src/dsaX_split.c
 delete mode 100644 src/dsaX_splitup.c
 delete mode 100644 src/dsaX_store.c
 delete mode 100644 src/dsaX_testdada.c
 delete mode 100644 src/dsaX_trigger.c
 delete mode 100755 src/dsaX_wrangle
 delete mode 100644 src/dsaX_wrangle.c
 delete mode 100644 src/dsaX_wrangleAndWrite.c
 delete mode 100644 src/dsaX_writeFil.c
 delete mode 100644 src/dsaX_writevis.c
 delete mode 100644 src/dsaX_xgpu.cu
 delete mode 100644 src/dumpfil.c
 delete mode 100644 src/fil2dada.c
 delete mode 100644 src/flagger.c
 delete mode 100644 src/gpu_flagger.cu
 delete mode 100644 src/spectrometer_header.txt
 delete mode 100755 src/splice_offline_beams
 delete mode 100644 src/splice_offline_beams.c
 delete mode 100644 src/test_read.c
 delete mode 100644 src/test_write.c

diff --git a/src/Makefile b/src/Makefile
deleted file mode 100644
index 0de1991..0000000
--- a/src/Makefile
+++ /dev/null
@@ -1,208 +0,0 @@
-# This is set up for the CORR containers
-
-CC=gcc
-CFLAGS1 = -g -O3 -Wall -pthread -march=native -I/usr/local/include -I/usr/local/include/src -I/usr/local/cfitsio-3.47/include/ -I/home/ubuntu/proj/dsa110-shell/dsa110-sigproc
-CDEPS1=dsaX_def.h dsaX_capture_manythread.h
-CDEPS2=dsaX_def.h dsaX_capture.h
-LIBS = -L/usr/local/lib -lpsrdada -L/usr/lib/gcc/x86_64-linux-gnu/5 -lgfortran -L/usr/local/cuda/lib64 -lcudart -lcublas -lm -L/usr/local/cfitsio-3.47/lib -lcfitsio -lsigproc -lxgpu
-
-#LIBS2 = -L/home/ubuntu/PF_RING/userland/libpcap-1.9.1 -lpcap
-#CDEPS3=dsaX_def.h dsaX_capture_pcap.h
-
-CCU=/usr/local/cuda/bin/nvcc -D CUDA -ccbin=g++
-CFLAGS2 = -I/home/ubuntu/proj/dsa110-shell/dsa110-xengine/src -I/home/ubuntu/proj/dsa110-shell/dsa110-xGPU/src -I/usr/local/include -I/usr/local/include/src -I/usr/local/cfitsio-3.47/include -I/home/ubuntu/proj/dsa110-shell/dsa110-sigproc -arch=sm_75 -O3 -Xcompiler="-pthread" -DMATRIX_ORDER_TRIANGULAR -std=c++14
-
-
-.DEFAULT_GOAL := all
-
-test_write.o: test_write.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-test_write: test_write.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-test_read.o: test_read.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-test_read: test_read.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_trigger.o: dsaX_trigger.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_trigger: dsaX_trigger.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_filTrigger.o: dsaX_filTrigger.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_filTrigger: dsaX_filTrigger.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-splice_offline_beams.o: splice_offline_beams.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-splice_offline_beams: splice_offline_beams.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_store.o: dsaX_store.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_store: dsaX_store.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_fluff.o: dsaX_fluff.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_fluff: dsaX_fluff.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_reorder.o: dsaX_reorder.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_reorder: dsaX_reorder.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_dbnic.o: dsaX_dbnic.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_dbnic: dsaX_dbnic.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_nicdb.o: dsaX_nicdb.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_nicdb: dsaX_nicdb.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_capture.o: dsaX_capture.c $(CDEPS2)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_capture: dsaX_capture.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_capture_thread.o: dsaX_capture_thread.c $(CDEPS2)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_capture_thread: dsaX_capture_thread.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_capture_manythread.o: dsaX_capture_manythread.c $(CDEPS2)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_capture_manythread: dsaX_capture_manythread.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_split.o: dsaX_split.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_split: dsaX_split.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_merge.o: dsaX_merge.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_merge: dsaX_merge.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_simplesplit.o: dsaX_simplesplit.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_simplesplit: dsaX_simplesplit.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-
-dsaX_fake.o: dsaX_fake.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_fake: dsaX_fake.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_splitup.o: dsaX_splitup.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_splitup: dsaX_splitup.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_copydb.o: dsaX_copydb.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_copydb: dsaX_copydb.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_writevis.o: dsaX_writevis.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_writevis: dsaX_writevis.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_wrangle.o: dsaX_wrangle.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_wrangle: dsaX_wrangle.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_testdada.o: dsaX_testdada.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_testdada: dsaX_testdada.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_writeFil.o: dsaX_writeFil.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_splice.o: dsaX_splice.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_writeFil: dsaX_writeFil.o
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_reorder_raw.o: dsaX_reorder_raw.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dsaX_reorder_raw: dsaX_reorder_raw.o $(CDEPS1)
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-fil2dada.o: fil2dada.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-fil2dada: fil2dada.o $(CDEPS1)
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dumpfil.o: dumpfil.c $(CDEPS1)
-	$(CC) -c -o $@ $< $(CFLAGS1)
-
-dumpfil: dumpfil.o $(CDEPS1)
-	$(CC) -o $@ $^ $(CFLAGS1) $(LIBS)
-
-dsaX_xgpu: dsaX_xgpu.cu
-	$(CCU) -o $@ $^ $(CFLAGS2) $(LIBS)
-
-cuda_correlator: cuda_correlator.cu
-	$(CCU) -o $@ $^ $(CFLAGS2) $(LIBS)
-
-gpu_flagger: gpu_flagger.cu
-	$(CCU) -o $@ $^ $(CFLAGS2) $(LIBS)
-
-dsaX_beamformer: dsaX_beamformer.cu
-	$(CCU) -o $@ $^ $(CFLAGS2) $(LIBS)
-
-dsaX_bfCorr: dsaX_bfCorr.cu
-	$(CCU) -o $@ $^ $(CFLAGS2) $(LIBS)
-
-dsaX_beamformer_passon: dsaX_beamformer_passon.cu
-	$(CCU) -o $@ $^ $(CFLAGS2) $(LIBS)
-
-dsaX_beamformer_offline: dsaX_beamformer_offline.cu
-	$(CCU) -o $@ $^ $(CFLAGS2) $(LIBS)
-
-.PHONY: clean all
-
-clean:
-	rm -f *.o *~ dsaX_beamformer dsaX_beamformer_passon dsaX_xgpu dsaX_reorder_raw dsaX_writeFil dsaX_writevis dsaX_fake dsaX_capture dsaX_dbnic dsaX_nicdb dsaX_split dsaX_wrangle fil2dada gpu_flagger dumpfil dsaX_simplesplit dsaX_store dsaX_trigger dsaX_beamformer_offline dsaX_splice dsaX_filTrigger cuda_correlator dsaX_copydb dsaX_bfCorr dsaX_merge
-
-all: dsaX_beamformer dsaX_beamformer_passon dsaX_xgpu dsaX_reorder_raw dsaX_writeFil dsaX_writevis dsaX_fake dsaX_capture dsaX_capture_thread dsaX_capture_manythread dsaX_dbnic dsaX_nicdb dsaX_split dsaX_wrangle fil2dada gpu_flagger dumpfil dsaX_simplesplit dsaX_store dsaX_trigger dsaX_filTrigger dsaX_beamformer_offline dsaX_splice dsaX_splitup cuda_correlator dsaX_copydb dsaX_bfCorr dsaX_merge
-
-
-
-
diff --git a/src/cuda_correlator b/src/cuda_correlator
deleted file mode 100755
index a8b94c759c2da5b87ab4c1a740138d0ad7d75073..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 34272
zcmeHw4SZC^x%cdDLN)<6A)p39IcUJBWk~{r04f`jz(xZ^Aeu^Xv&n9f)y;0Y9|+dk
zXuujNx9HmsZr|4O>+J`(Z{MqJZ7)(^h{dn^wzp_&FSoTVwY7KS$1P&(N3G`m|8r(`
z&u(_9?Y;N+{(kpmAm^FqKhN{b%yVYuoHMg$&PQ6?*Vt_~p@UsqDaaXfJQ6aGLY(-J
zWEIvZN`*(%i1}g)kYfCKWD2Pn)AUVcT2rTzbAT4%uhDJ-jdqWWXgbG)g{HcMq|7f9
zJSu9^&t7S=nxZBWpdQOQwobC3-7=!-eijJf7z-tXA&)E`^-@YNrSvqdQ03QD*N<lS
zw^^lcwtK*l_mGGddCX9qZO03V0OR{sngaV)dSpb?fC&ptwca(*V>#bzs8E6%Repv1
z;!)+-RG)`M;ZRS*l11Ub!f+^(+`e#oW5dFRCGL3CT`%iR`;=X~zEkF;{b+z=nP=jU
zE@ygK{jyj8H2&n#uXg(Gf9=_O=Z@aA^^Z)u41fB(-ThddO%%6JCAJWMb@*G|`s~Z!
z`SrcGj(ziu%7c4ebv2&aJ?CFO*fr<HAN|f%|L7Z)-}u5Cm6x<W+YZr!p&kXR%#%95
z0G=#_qoDch+*3gQZwtu(Gs=?BPO<<#r+}T?3gF)=P@eGu_CH+!FDhVvpn&}^6tJ_Z
zfc)JB@Wul9_7%XdD`4jf1>`w=82HN#tp)51Lona@dai){PyzgJ3*i4$pq$?<fPb_A
z?kHe?V*z}90Xq*DsPC->?970?fxp}k29VEhPZhAU2=Yd*5%3!e;Qvs-&aZ%1h}ok1
zn(i!XznAzNv3FxOeiZn0T!juOc|G2COaDw2C(AuqcD|zIwS9d(NGbao*GgV#r(Aq(
zZnmAPm7Rxk^4+iSF@<aUFG)K>3|D5`5g4a?lL3D$!Lhln&Rr{fZJXEm0>N0YKNL>{
zW1H7Choh0;W`9pOD15&D!Dz%6Pe8%v6TYjq`8F!K=CD5=55`UPyvodHO)MA`jI8sA
z!_i(CB%^JtNk)1Tp=iXE2oClR?NAb(5p8mfKe0L#@yB#ZM|7wewwj}nzEFQM=9l`}
zX0P?fax>CKS|fo%NwN^_eiUio>R>Dq4AWQJ`-hV7lpt*$2=?BzF&IyV6T;UQii9M(
z(H{v!2b-gVL*d|d;RBqI_=druu3)4;6bVXJI)YZ(B7LU0jlpOPZeQn5#6sJpOq<r-
zbaUvITf)J%$WYQW)(od<YbcdEd!iREy~)=fOe8{sK^8Zj2=rcd8BcA4@Lj)tQ=%~x
zjr)51@!%2~Z>ekDmJe?x&X%Dxal(giE_}TMH^Ey&u}~z@Cz*H*wH1BQp<qNrhY}$$
zC?XmGn+S>m1z(>(6sC{2`+fdIa68<A0zw#ApT$%k3Lg)Ocp{dFCc|RTY#)&G$NGW7
zGcnP3b1al#D~wwdA;L7Dj|EmubI>1(h(5M8UohZL_>tNZkDG~rP|UUMZL6ERx_pb>
zOU$^h-d*ornvQ2sN+8(rk9QsXE0X_+nK1{bL#F7vPmMU_-JuXAN)O#r%7Or~sehSR
z2@?>Q@*Mgs?~@GtB`&!aUAaWOMFR*-KWa%&o);?RooF$Or_zy})<3cEXGkf6mKkiw
zeKu_A`@ET<&{-_|bVw01?Kz}9@`+irZx^$b{WmvsOGCV;tdTMEpvN!XgL6#Qp{k=>
zMp-{y1_Fi!S5=d8H5OcDDe+nhuE$y?G+J=${lCY8Tkn6p7F_$Bb~-G$9y^J5S#UTa
z9l9-eSsE2$z=BV;;6oOCng!o(!EqweVc3GZ(x?!3S@3cTzSn~5bIas?7JRxze!m5;
zwBQFU_zVmFfCcA0x(){|_}Ll+9kbx)Snww;c$Eb|X2EA!@Z%QzJPUrpg3q?#DGRQj
z5t)3_f}d}Zf760rV8MmDj<Su+wct(*ZdmXN3qH?+S6T22Ex2L9FS6h@7QEVm*IMue
z7QE4dUu?lW7W`5R?zP}@cvETKk(Zs`-No;$GK6>6SfV(!8@<vy@~HEuDKN406p|+9
z{~pJ3+>=bqCqKZ%@e>mW^Is&-E<S!t^3RiJ*B&2}{7=cVOOHPw`De(pE06D&{8QxF
zg~#_w{=4Mab;pM#e~3I!$M}%szd@c|b-Y{hUnS2jI^H4qFOX-~9QR26GvwJN$7>~j
zFL`#waYOQVlV=wkuaNv5<k|Jch2(D`&n`E9@?8M)W8~S@#!pE8M)K@p<Hsc5L%x#y
znB=b~KZE=OlHW+4U1@y3<gX&nE;PPZ@~g?S>x>Ueeg*k+$qz}so;<tIc(>#)CC{!i
z-XZx5$+OFhdnA7zd3KfYTFF<EXBQbaBwt3JU1PjL@^<nUkQb8w<CWmq6~<4#!}^~h
zZ;(GB`4`Ev>5m_i{PX15<j2P(|5Ng8>f;Yc{u%OY;^X@z{}lQ8<o8PcyX4u_$A=}~
zbA=)L_JWt8>AI$Cn~t!#HFY*^Zu&oLXq}rz-|>#V*7UH15A*1~{bfEG2E4nA=JRsp
z9o<=71E6l~iI!?P4ud)=`9zWIZsl`#f)V1-!)WB*(O-=IE>(x@N8X$$-`NSdkyP`@
zn~TdoT8;Qb^<#|-Jm?NBFuePUQ53lDnY!0g1{6kN{-r~GZfy_gA)PLe-Zc8eOaHTc
zF7>X_c<N8YAA4!U#FuOQn8Y6)AAs8}_Gany2arBmR6Xv^PJasyljYfq^j%|3qmP$=
zd_F|HyU)4}Nr%{$q?3;P2z5=}g0p;(o)|w|B<u5#*6)z|sFAb$<G+Cv3|v72Y%HdM
zAJD+^w1E%9K>6HZB(r|gA?w{e_0R+JJf?WvSnB6+fjoPuX2e=XCsO|lnRxd+Tzw2m
zDL0tipRN8e;w`%$u6_>m@RQXZFdvfwExUJCp8zOL)Qye)v}N?ImeKa=s#M@@7_OL)
z;#>v8UdWw<-0mgSw9VGzePXAih|2grMiAIC_LD+|cPiu$fKT<n;^<?9r8-JL74Pl`
zrDgBv6JBK8=!JVKQom&-b|<PUyeKurQ`HiA&@{d0)@rZU_Ihd&OnP^JLgn|Mlza$z
z!m5;bCbbn3ExS>nJs+|?{ZecB{oiXDeX^<i{wL)B<K8`&{~Y2XbvDiK8e{HN<vU{-
zUYkb#WGmlUjo1-kctq2l8zxe#D77b1-C?@?B#gK9KCW6kbq%I=%hcAoiPR4vkUoEj
zYH!Q(w_8S^*B)#tZ+$lPFhq8ZjXqYsYZJO&)5sIH*5&UepQfE5Gncy|;oaR*J><Qt
zrMkPX{C8vc`yT#&<_(RBr#`ci_`s>A@=O28>u9Ooe#%>Z>6kjN<@fKDC!N@Fx_&>M
z^T>&!Q{^`lFUKJ3Ex(_o-UXXACjkBmf6pQQebDm_2$bK|fpQ_YXI_JiV~6*lNyu^?
z<Vf*a`Ss77Dxdy=>RQMj!{6DzsSpCl=IWZ%ytgJMPL)?6diWT$5QUu7MFvkveMF9?
z-g*<`LQ%DCcTx3n_KjWthz_v(1J%ya<0C1Y0|(Bich7^>!^qHd$<2SHOLj_xweO%;
zbVJDX&LhtkQKrl5*jarSOi2HX9!Dd1{8ag+Pv&|GjxksV>9O9UaL{+HPD-JY<M{hI
ziv9!8^WVm$to*KSl=06x$$-3c(h=yuNi(Zc@4O*>G_yM6A|sEBegd(kk(X`Bt6R$N
zPgFabM*hu~T;+w6&|!X=`Y1f@MJun^y}5cC0Ct_K)FbfgOXp$WINoT8(MJvq;Uvgm
z2IIz;q{gj~dK*%@MW*gUKC<tmJ^_Kzlo!2}!+gNIc$ysd%XjUD3%$2L%B#VatxX?j
z+S+tOlkezmoH%-vCv4aMMK?L}5{ELE%trQO=CulWjsI_?(pBE4Q?g0DR^Ii@_!s3M
zbCw$=Jyeez%;B(+<IknAA#X({DSZO%B=t6myf<@w*mPugC6tmtZxj!mjh=g>{D<+S
zweEY~(WjaYV(6YYTzkF;L(@_3Xz^3a43T<)6(4!jc*uzhQES~8l^_gXHpv0y@SV8d
zOWpta#6;Ki57jEGkjs+UL7Cl<dH)R$U0p!t7RvNP#@E#KP@sU&E()!NP}BRHu7BwE
z0#Z9EHHT7Lx*oc(fY1nq-uMH`{m^$HB8&XX>s9qs-R_{wk066`A1`3%cFNoj8J7DG
z1%!qvbT@=#xhrv<Vn(@c`Us^0ltQ@|6%e|OLaQJIf43Kq`7mW>Lk8sz6_B}=GH?9e
zEcZv1koBygbca7gp=Tf@%l+xJlsfrl1rBeg)YmArWea-My-N*|dIjv3BL*7N_`MDc
z0egA>f8?W-wByVi>O$-=*(1Z1XmBHsI>%pxN~XQmz0x-N^bKBI6-It@vSV}G6JtZ@
zINm275t2NyA8M&{Ve0pN^8V;xKa6#H7j}bqN8WUKM_<@B`yhIZeB7A6^#~ItPC$4k
zL9z8HpE5G}9cFr`_x6921|E6W0R!IAlio*CtGu?yy-&ZJI6G}^s<wvvsI5oM{D-f&
zXNiI7jp?0GOhwR@wmx2b8z9?jGN&Uauz=?sStXLiFZ~bbrH#Exg^6QZ$8Uj7raW+D
z8??uN^BOOesQJjt1KvID)lOakaXGye_W+~M<L0sg1C9WlNou(d#mFviLaS7|p(t-G
zDbL|cq`m~F<8f>jFQi=<&5Gn`mb?mQWI&c%-fdv`tU4k`1`MV~3uJ4&qls$I=w{q+
z;O?Zh1=sk-2@JseYezA=Q{OqoHu}<+Q!&(_Z|^4W=(~sLdfYMZB^mu!OJU6`5XRrL
zq2+yOQBN?Q@FfNh4MFnAlTzH-H`?J$?Se7y=&$5m(a}Rp(IfXw&^(;F7@oqwe*wi%
zuO<kl9<inrsg#SE@)hQW`y^ieQ2Ci=<*&ss&LVDs18~Q5#n<aG_@0DinN{izSitRi
z;$kFy3P~L7{^ih%>GU3zULpt7mp(u8!->>kWRwcwkZynS`C@c}4;Srw=Kq7+K;zr9
z@G$76w$azyMxSlJ<OTW6{Kz|w)RB{{&MOI{?q|CFwvWEpKKeRtY?~%#KIa`dYV$6C
zDfuEhlI7}S8UJ59nrU0IJ?$xey3UZDW2Sd^McwP;H@<3Zf4s3DzYY+c&u4d7K8m;v
zb9K!tdRGKi^sN|(u874~Y+E6sNz9ZQz0qVOVGJhY38N=y3`OIiL}**EOmG%G6zMmb
zJ35W7g{|gPwFcK(LChpC?oIa8)wK=|C3cvT<89koTer!{<c7pR48|McJFt8(xVU#{
zXz`Fg*2;NimDI>Ne@y)M<R;YB4~6}`!GUNP)4lO}U!7GeH?jV7iHlE{xa4$+OHY^B
zaJs~0r%P-+UE=b>iHm)8wbs<d@!$uod_y3(Ez}!q#W`o_jQjh8D~$PZV+-fwqmlU5
zvJGZrMVVnN>^0ifZ*E=dYi??9-((>01jmg;)Q~G0nPmlI2u%g^{K0;6NyXSwx0ME(
zwSf(t>o?QD`s82_M5BFHX=HDT2aJ~1Yu9(KqofRMNM@0lX)+!(w$w_)ea4!$^-b-*
zP48ddyrBcKn|4Hc4XiAv{QIIY1B)#Fu+fVmYx_|nEpOV-qebzaWMZ<msbM|lXm9g}
zlfk&*j|J(x0-@HZiA0S!N_$CRp*o|65(bt_j6ia5=rp4BN^~HK9tt06Z(*q=-WQCW
zMsu;!yh$!sp`#CCF-9(k#i_M0mRke8?q<lm;HPLWmS_UTU~n)R+hJ_!^M~U>o}I~|
zCYl%s#%>Pb1o9*{ceXScxdvpWabr_!Ym3Ezxj<$_%yttGB0tqwwizF)y|wPv`db%&
z*p$CE80+Eqw8IDow*|vSO(58l>^Gv3@QzD#yBV;UmeEZE(VLCLKq!uC#MvD<`PUqc
z#e!k@0t4Lqq=D5b**T1xLx}+by#xLTmiTb4ae}PPK8!UxDPqp<Z^GIUZS;k<qufJL
zh#FjA;)z1jcFRtP16HKaUX??rfUy@R1MRR6=g?qdW{1lx3YzBSvY%yf&xi#RNvu5v
z4AWq5G!QiA2g*ci*gq5p%!NK<zPq8%7-TO__`~nh>>}Au{M*ohOdqgpn+LGU7)*qa
zW~{!>H|XCE@x^_{I`1v-gVSQsWd8umDZ#ZpLvdBE%;2rYb&NX+EbP_QVmwL)BZ-hd
zYz|Gz12a7`;{E@Wj?FOZzh6=9hv6Jv<9{g~C}2k@yrV#Rt-^V7hIhJ~<Vo^;+;g*-
zFHiNFP&mjzH)3$v77Z*EG327FZzzeeAkLP{CeiUq=OLS@5GV1}z+e|Pn+Nm(#S>#F
z!JWv}W>g-hTjJK}N6g<DE-q1^T^trsi-1$veUi7s2v-+hEj*$TtC$|FS^{*(2e@?X
z??Ft{9fKw>sV_bd32Ybc$ToL~OYV$dNO1ejUmX0$4VVQhaI6Q;HR0Tl_Kn@{tSDP_
zA<~_uj8oAHMXMAw6s=LTR?$XD;R!`eeVil3oBB3k>f3~=Zxg1zO_=(2rEeFezFq0t
zg{f~>`gWynSNe9PpIg7bl;`QqX+qM3M;U5TR1?Zu0~dDk>EEt|{+vD+X5fQK;Yo$m
zDoPpVEu9NmrEs2Slz|UKS>a8^9m_&9`q~Z`x{C3q+k1|<IG_McwI0htz4scddd%~^
z2Cb+2e;1|#8Fc?YU|qZh{-4$Vg|@4%N7nv7?XT<q_$^ApeVRuZJyQHRnfL(i4+DQr
zsy`g<@rUK8(~G<Egg=s~J*|$s)$#Sk{P-o09H_E@|LEX#x_bXVam;(a!I&g@<`L6;
zTK+G8#AJ_nPmR~NX=CeJ;m&HYyw+v`IZap&BLxrZkm3-k^IN1iKj+|xn-h+Lw*l8<
zBF9IDSMWz%|At57T>B-ic}?djTn|4Qzf$4)ensP36s}98@$CxN*CvgBQsMfyJsSUp
z!u1`4#($`AJq~O9WrfR1$~u~{Qe3E5Fa291Ex$nFdhFMDi^8?1HO}`GXa__FjrRj^
zq`j#UK|UpMUHT0Epv1M;GB~fBl@&Ty+?%Q-E_T#d;^c83bt`H4wA+V=YlL&+_Td_*
zjGBT>)7NRGnkM?fT*RWu;+aNtNsAUa#n~#U=pr*t<)TVUe7YsxVE(4AXjzfGUv<o$
z5Jhu}aA$0Ws}&XhP44f!sy=K7^wXK}-@^m;ThOD~{dhljtX!<yq3bp55HFUCwcs(k
zjWFU6W96bMoesJG1YYO%*HVs0dTh1Y@F?b8D~PxrzckG8I-fqrc?SN@-x_lD3q3bD
z)xe_P|DqZG*h`P!`nln0V7WG#UzHl5^|(chAy@Vi6V4SAcPaf^Gd*2LJ>Dr;Y$~9C
zQ0dRH=(CyR>i-THd^jop<4XUS($_@SU(;V8L0^-#4Cw)Nsi;sO&k^(1dKy0fj6TUZ
zP2!-^@3uHBU-^0cp@=FjVrCdtg&I~o^T)VXGz@ADp98}(O=?5xlBOP$(khnXNR0W)
zdlh}7k(M+gtRI7>%!`3`sHf{m%o@DPA(~qCWL;JHtUA`XE_X)HX7fToE=wxv0kz>t
zFMX{8Ki$B=687deW=5G!O!(cHoQz|U!dT|6D|7Rg{=tM1AM`C-TITi#0x`_r`r@Hm
zf=0uVva&L_`3z;a{ozo5#HcgeJ&nesr-4EflbB;(*(_#$Tt6)=5tW}$FGSUgb0+M$
z3ruwbqH<b;m|Nc<E}XlVhb25L<)MLxWjr+Uu)IMS8@N=nOjMSC&3<l=IceW1PTI=c
z+d}aWUY;@Bm?n+wIM;E~Rbch3rmXCevJc(v*pC_7m1Q4BmBrVad)DwASG~_;@h1E|
z+p#hLgyYusA5Tm?0P6hbi3u(vAOHEp!~kg52`oqA{NDvSjP$#HH8F7<bRXzR(5~kv
zCc2S6@au_*yFibF9s^B*ra&1uRx#La*(hw=D{Ql;l{oj>N-BuUF#)zOl@jJ5zq}CD
zN`CA|C<|@7D%QBFt}35;vvXLiI&bBri>oiB1?sop?*Q!TdsgDq{{a5>13yfl#WiG)
zmbsj*Q=_i!Ev{kP)voPLuAz1$?ZjW|*@=nA*uYo2Dn4#s?W+14hu3B7DO&BS`B?E<
zSM8lsnp}+|C0(wQc86<9nX9qMRompMS?w|)3!T+2XA^zPpB(-ee*@1=Ox(tDH@PZC
z?Cq|qwUmT>ZNIDHDykUQxvG%3R>rT-j>DjdZ;{5+ajSegGo_u)f4Q#Jjx~OoeAl^T
zm!`iTL|bwGVq#(e;_z$5RwU8i-v@s3mlG2upk}={xc1xZe^=(JXmUAQ=pK2levkCy
zI4?m08GpB?{q3c{z4Z4wmnW9b*YdnyPXD59lIXzl@!8`|*jLv$opgqt@xU1mobkXJ
z51jG984sNCz!?vm@xU1mobkXJ51jG984oBAoSfs4v7uQeJvrN?#}qCTao{69LoQvY
z0F86c39B6UsIc<fQ-VRy-RS*q^t7hl$7Yr(mfmNLFC;VQ`EdP?|8cyI!Jy}#^*rPs
z-kpdN-)Hxbq}O>*l$gAn+XSAno4lUpyjaac%E=3)>-=)pveMI)9o>-1%qt&I37nT;
z&?}jIiI%}T%_FJCs}!%xT`N;FbKN?Byb31kvwf<mkA)V=7b$tHy-41v(iQw)PV_!@
zxfv~}@JOgZ(KU)*t7xC1+Z4T1(N8J*WktWO=;Mn1MA6?W`nsZ1rknYlqv!%f8x&om
z=(URWDY{M3I~DzuqF+|@+loG}=uZ^=t)j0hI;B$Oujm3r8x&om=(URWDY{M3I~A2q
zvmDkoH?J^iI(zU+M$%a7UgEA@*pQT5-EDOZ?%E~py827Z1f#yTuA#P}cDZ;T44D1Q
zY*r`n{uYCrsFShmLl;`3*^r#JDkx6lVqX$6`4sIo=ckZtx0iegqV}Q^{LWL@oh9EQ
zu}#@0(~4I^+fh`EpFbn5_&k6`zlM0x$PGv+`Zf8Tj6(5xkYeW)=iOv&Q$9}vMMeC<
zh22?X04(C9ZSirMEV>MbDYf6ivE*VLj+ea39r&CQICUKm;e4BkZ*Y>=={(N(n~YaD
z%h5GU-eSDU`3kI;{E>0P`4!5%&3KLTFpQP_iSb(JcZt8lc%yR~@pl>blrE(mVI$)$
zeTfM+$#j%1rG{NHU8O%HQ)Igl0^OzD&aGsMjk~uElzyL#)5g*cmD*@+nq(5C95qT@
zHcpgoFI~nmR@#<;87^Hwu^F~DFn5$b!<3n}ZLl*^dMTMI+nr$UD&=rdGS4;yvbU6D
zZ^<RLJHYHK9c8AMN@joQUNV<S=0NGE$-GZ850u_V#%=pNbPkr5GSfw}Y-6Q=qP05P
z#~|>iC>v$cr%IjxSXJ^_R?^8W+)6&jsBjKa_&!$G>1<~F3#@I0nBrUxY>GTzQ+dK!
z>}elDgeXs?L_e$189=`P<}BGboW*Cu7MjchATDlI=a@^oj-NxM9J@Y&XS9MZUpwYL
zkK=Sv$rDvci!N@yXup@d!-eu!)Y96$Qsav<N)6$<gJNMR#&d0og)?GdQ_Rc!eKP;8
z6x)G=!`I8G%-?6`KgIb~7Jwo%P9kPIiz>N2%l6qvi?O<K5p3GdV%7|258_~}Aj>Xr
zPG3OSrqj?T!JS*n#7e3;?m2N=6>Ic+)zOh6Y%|M?gqU>+;9r29bpayh!u)x&FYqv3
zyo_t$Y+I-OW*j#=UbA)1wq0X)BD1Q0g#)w{Wo_pyrztxutb>JFZy-I1h^XRrkSmov
zHagp^yC}(3A`S$z9z*0i%(PPpm13x2rg3I!tCLwiIoq}orNBj;?mnO-xupq%ZQe69
z*@2{2Vd?x<M9w>`gsP-D(@?2{FZY>2S~{wP^u2>^;e~LPv~)hoWm@`0UQ2upkbxzO
zgI=I3c@Z*P0V}M7^GR_*k5XR*+^29^w^9dRVw7QSR0*ww&;_?zH0~*+F>2K~rI<BP
z|F+U7oe*>1Q20jRX21C{Qe|%%5yso7;*<mIg2o+i*_6MRQ5Re|<*SU&qn2|y&bKIa
zj9t?W3XliqTDH(qhu%AIUNOX-?Iw6YOMVrWN~fCO*EINJNS2<%A*<B!MGeL&S#^&D
zA5@_1<|Al<vbY<>yj~`s7Y9=6@NDb`htj-&L+NZ2JfSo%j06Tn>G@{H`*j}Yql%?+
zFer6ApuvZ!KF^dqsKATDknF)<nP9AX+I?mEHPs<;(e)^52mZ<gWAmFK!9Pdmf=#f+
zzp2mI7Pvt9H>JWh|KpJ3-;`Gta7~qeQ|H<*z6M(SLx;7^cg>jUaLsm2w^f!@oQtug
zT1=gKB_J49QCz2s&cUd(V(QANZ8LDSD~AY-l90#H?{H0>%48c@=UHjlMu?RvwzzEu
z*)nYjMX}l1W>90=Jkd5|5faZ@fC!53wym0F=6CU|wi(9MC9^12UMw6=U5E-W=@L!n
zrKNJ2RGyKo%zAmMKD}C6n3++sqpoK!N!uWcrlIYVJ=c;7`&H-L>Sk=Tb)MTb_5HG*
zvx-IC3<{n%dWQ|3tV0oJ-+c#OUah)kH5BGhf!S+?^QnMNro~yOxnP8;Yh?0UHS2Ob
zW6ZMxc38YHQ@~ZT2wdbA6pgq{S442qsvD3$T1R$@5DT&Zbj6D^0G!7W(=JZ2P1xRc
zorR+f2%o5Ii<0r!qEMtaoD2jPVJl=`!xFVGa(^Vbs3(LS6Qu}tFkIB%+dElgsn#~-
zlqrEd7lRROJsA$h-Mz?*yC>phv&HTKW{XV|L(zIOp4c%284U-5eTbtkOA$oTl96{P
zYDUfd8O^wS?FUhW%X^ZcaAIL7a;2G704k;sny6i0@f>H_itLUP;;JH>WB*6GWIQ3`
zm&>@*?O4zSe&<Ji1O89Kj3<g5j#5T%?qlroBHO#pnw`#?5pau~KS6@W*@gI+ly7(5
z=SUuMRNP_5yFWI*=!zX3LAX_<xf!JouMqp9eSPs@LIk2oxp!&X2H%>t_Er%H#p<w4
z=>}gMFUQHvAmankSc0$p3I5u|m(aUUlB4GYUF?hSr}%h}kBnF}5FHFIGH=xv4aK52
zE(*l`b#=81;{(BPIL(Cmk=DB?9>Z3r$WgsYmB4nDePMrpTtwn{1y3g7%|N*^r@%{j
zF}%Uaom4mYVnKgE7BR%VKjo>mob-&y5KQ)kqkh!D+$s}$w)PwHAgk3I3&gQ^C-(=%
zUYG&Aw=eJlcFfls4JQX9!iU$uh1!jidu8%Oha$f0jW*@poxV5<D)6RYFg{=@37VhU
zlT)aTJd=?)ex-vq$na_<cju%FLO4&`MW$$J8J3aNHLEW0t`<%RGs148{s7Jv&t=0B
zoSBRlxr?drB_nwKG*Hj8FLwcj5qeT?@wvX!Xs)Zvd3rJy4knV3;1&JBNH7-a1?!LX
z4qRbwP;M;jHx_QFHx|lPG&qEa+{knx&SE0in~261qKm<`>}hycR`?=#i$pSVt6CJc
z-YoNClPodGQeif+><at(2CWrhndKRdL<P1v#e2`FAH0XHOlAF5?R&7XYI4YIE9}2W
zm2C+b590OEOq-Anio}yWJ=~<#w+%1Dp%tr6K|jj&Z@MvSD^{GW+}&F(EyB7LEW8l(
z53<dnU&x&04zvjl5HetVmcb@Eb@b9Sv{^^>gKLq%Z7QElvchJ?bt`HU9Xk488rsB1
zL<Vikrhn6D!aT-Q^fpCd=wm9nF9T^qKgd8TmL>~SOqbs#(p9-kCFQTmWW`QxLo3?E
zb?JOmL?^9L{1kEOZQMvr3b}H5Ole;RmGWssy3-<QcW1OV<z}wE_D?FIOr&quC!NqA
zDVcxE$m$#NNwRo0kfBu33Pooqsy(XfuHZD$_zH!rQdIjgEvO<_D|(Hhxem*fe-Au&
z_ieO$CP_MQ1>T$&cZft@yeK{2X*Gsjdeifq+4vNp=R>pc5|M8n(ScvE>iNfP`BI_h
zF|+Zq^n7PFK6&4BraLezTv6e%ipsbHFZb=O%!j+g{=E3){m^O1feDC%*n=lGI5104
zRg({|6x?kqH#o!$d|x9UJ`*=V`S7#T^Vn8f4snjq^Viw<xq{ma<pu|4qw~$TI>aoY
z_gl8=I0R<N(|t3Of}Nd3hk6{!p!Cn2N&DI}-677milyTYaX~t6jX1J)!p)>Ogppw^
zF^BosX+g}WLtHosBj6Ah6~L<};k?5x60c6=CQJih6O;YSbq6~h?mFgpvQ#*SKzwfw
z?pAp2dPxg#8~iz-{LFGNa9p(IIq$asr~Q*T_V1MPliSaifaj~%bHF+4mHT_(e+5eW
z+{%lA^=IHTE6X{g%Bj~OE(A__y)Gnw+Y0$6DWB;Vcn*@`T4jg($T5JG;dX^{?>GjH
ze@@}tFV2epqr$mGoE3jo;X8Bi*A&iu;H>h~@$kZWaql-PzDVKR@6C#@Q}}QWev`tv
zshd@v)2X!2ecY@#zu!Qdd$(C}PJ|KXer;C#Hwxz#ZC3mb3g@P5R-99^vYpKKNL*ju
zIR8QW+>4C?Zjs@A!1J|}<puC``^gy(X)m80&W_md>)d@;ddv*fk#isU`WKV)+2O1v
zCP*iZt3%Sx<Z+Lat$foWcl^9t;rhEwlw$atv@>};{3h^9QzYy71K|8_hn`^4w;4|>
zTu*M{G|BKU1?;?5055^TeEC)Y&-Z<ibAexo{#dK}?_rg1J@9<r7vY3$KKqwq*3ZZ@
z{;w|}f1R|Sxo(1$p-<v-gr51+co>m<`QB3izhBwWb9UNJmGoQY`YwN~kDL!Gc|DJ)
z<&PkeFW+Amz+a{v`)t7t46Wg<0`i#F5|wtB7*_WWmnfy##H|KYd?^OfO1vzm*R!=q
z8{~Pu^mHV$lVL*vJA(!AeFgAu6u^Heaa><i`_WH9F9K(I^jxnl&#MLOxY2$pc^y&L
zDgAU(G$p?u_<mYG|I`$~mleRjDD$0s-?$0#EdQ7~kNWy`J#n1h?ws@6SHS+wzzuaS
ztYmiq`TGjsPZq#`3Y_IUp~};tGTC3i&o30P^Hu?TCj3MDy1v>EHNZKZ<lgV?LjS6O
zeD3#(mP+}I9qo%I>3_{-=rslK-U9et1@O;E+=ba%k2D@sS$wsC{KEzC7YpFUa0vaQ
zmlm}D%YhpzFDtnWINLdQD`n8*{7Q*SDC=-t0eo8l`@0I@pD$qNn+hM!ectJfCE^Ke
z8iolrpYN*XjlTA_O`GxQC2aYiKD>ktIDPnbP9(_B?gXMf^DQ}F0Fza5pFg=>a38pE
z5c|5hmn~h+38_3n^7}<Te=O$Tfw?ohriQQQV7k+Xz3K+}r6mg|KWUUDniw3~-k0#z
zA}sSa*2)N(g$?rSPMBmgH(!g!+}!!GFIeZsu8^Y7pXdoiTIA<>u5YhzFhAOq`O=g5
znWxS~D9n8&+v{sh0ek?eISEBCX0`Z9Dx@soj8zEJm04etY6_c*bxToxf1*Whl(?Dm
zb?@b+!S+czEoL^fY|q`eGW{aqq?GKJ3foV+rzKKPv#BLgPqV8fQctt3B~nkbuO(7X
zv#}*oPqVWnQctt*2vSe8w`F_o=9ZIu-k#px(zmvK!)j~~>BD@cuMHE$ocFwbQ=+}m
z=L=yQK73XxYcT{{ncjq*z>+;^ylXZ#t!wqQu5aNd%Va0gaeQ9V*XmUQ-j<ER*Yf`L
zP3zj4AxE3(X3npKSpdwB#?XvZnKrChv#E8nZ*$Y?_EzSQ`8buwBfm3+Z%gqjQe4?g
zPwb+_>ThagOPQ-L8H``0;$DIBi(&e6R)u$9RTJTundNu1CY#}RzvN7<oICb$_FT=&
zXZId_G|l6|3J-g%<?C={&2P8mOHI!<s|5l|TR)+ePbd8mGz?7fs1F-4%5H7_##g2m
zbffx;SaxyKpM`6U1oGu#F0{zf%kRqNlPO@afN1)AW|`9QRGCIHqgc9N`OIXREEh1i
ziY33OmTl2oP|3#RZmpK@&t-2Wi9%%0;AfPVFe9#$`F!}YpXwCU&ick&z7PA!C6SDw
zOkMRZqWbH7`6|k9v1RL87PQQ7y=5n7eabF7$z0*e#?W?f9bv86FD)N?la4|w)L+XZ
zpZ%phTnuu*i^uQZDf{XZaaeqd=jO=H;p>ejGvD~j`Szc@8s+;wVLlIKeGM;@6&rkJ
z_3`xIW=t1IevFW9onuD+%%rSzmmEto8Jg!c`!kRHBBZ|E$od|n`9Z!M<;ni!cLtS^
z@+Y=i)Vnk4-5S&TH)1g`7#qS?X;^Y|H?3}4nDF;24$I|kE=2_vh62J(ZorRqNOxdI
z1oljdl~9?Y_7=!ud`ROyuvAd7p>RTAM@S5cZbaSvQAD@{ns7@mx?@rK0YU72#hVD+
zcW@6kvr}&h_7CD_Nw|Xp>IP{bfDANenl&#WrZpYEu@_<}f`2g7i}FPi%o+%9h1~d1
zp};m<gQjV7%Pc=O|Lw;Op4vgn+`v*L_4LGo+jI&pd)O|{Ou;2UN3klWqsmg&-R)n`
zaNd-2NBaI*uX8jiU`~!{%38xkU~o&H*4OJHeCM0NkjKnHKfh|E4iT%qUN=c7{c0_!
zXs&(>aD1I0s>{F6?vaxF6u{r*G2k&y2Hd7+h4tX3;!o?R>>hCZ-MJ*<O_d=Y>!hEw
zetKRNI3Cr~`g*<Q1S^H0f3I$GrVSk@y#asxt+>u#uj@EjD1;kLk}kiN)AS~!^S9@k
z*XuvqmHvLE&+@Y>y8H?joYy9<_4PVYm70LARf4(x)Aiq}^j9l6{rh>Z($~MYr#@!)
z()nxszX8TJ#qu%e_YF>#n>=-_L7k2?G>Vv2U$0M9RG1>ex?)O0!FaYd!zVzj`g$E}
zubLFe^;a(cxg7odl^$t$ztYD~zp{ePSA&0th&6vbAN94lri5NL)$+OZ|9hpctEtz|
z4l4cboC4>t;_Es3dY!GR#>`QE=}u=3nydeCj=o-x8(wKD>GP~j<mz*gp7qz|*Xwrs
zlzu=fDw<pWpXKPM*Y_?r75KYiYsk(21teltL-t|H>3Qj~b>#4qH<P4~x%#gng-w-X
z0E3?IKCbk0<#YLWpu_rWeZ8)D;yvoyq0GLc_4WE=O6gaq`s@Cs^VjWgUrzpdo$}2M
zNI=l?CYRwgJ{9>}%b#A)>@YR>V@+$&dYbZcDpq|x57wd9?{zt~4XvknzPUh5*I&=C
zbt`@TgxneoD}(r4m49sxuJxDZ;M$y~HA+86Ong%3pOc{VG|ZO>axtboru09ct|MAL
zSC&8C&&4P^@BOAmSx)=UmA$5b{$*QC{T(?9Ijq>K^mG09#SfVJpU@}9WYcBUcKCZ-
z9akcfG<KU@+n*^vf4iJd|MnhJf4@o6$6VRzNXaRpl<ne$5rONY){w)>ymN{u#cO&^
jtu78O2p8jzr&PCJJ+5$k&zJx32a$kqTq~IDq`Lk)d9f~T

diff --git a/src/dsaX_beamformer.cu b/src/dsaX_beamformer.cu
deleted file mode 100644
index afdda70..0000000
--- a/src/dsaX_beamformer.cu
+++ /dev/null
@@ -1,1128 +0,0 @@
-// -*- c++ -*-       
-/* will implement the 64-input beamformer 
-
-does N beams of 256
-
-order is (taking time as 8x 8.192e-6) 
-[2048 time, 63 antennas, 768 channels, 2 pol, r/i]
-Load in 16 times at a time, so that we have (in units of what needs to be added)
-[16 time, 63 antennas, 96 channels, 8 chunnels, 2 pol, r/i]
-
-This should be reordered on the cpu to 
-[16 time, 96 channels, 63 antennas, 8 chunnels, 2 pol, r/i]
-
-The first kernel, launched with 1536 blocks of 64 threads, needs to
- - promote each measurement and store in shared mem, parallelizing over ants. need only 8 kB. 
- - each thread processes 4 beams, adding everything. for each beam,
-  + for each chunnel and pol, calculate weights using cal weights and ant positions, 
-  + add everything into output array
-Output array has order [beam, 96 frequency, 16 time]
-
-Shared mem requirement: 8 kB for promoted data, 512b for positions, nch*1024b for weights
-
-Initialy we start with 4-bit numbers. these are first rotated using 17-bit weights, yielding 22-bit numbers. 
-these are then added: (64 ant)^2 * (2 complex) * (32 chan) * (2 pol) * (16 time). 
-after adding by 64 ants, we have 28-bit numbers. Need to bit shift right by 19 after adding 64 ants. This will yield 29-bit numbers. Need to bit shift right by 21 to pick off lowest 8 bits. 
-
-Do everything in floating point until second kernel. 
-
-Second kernel will simply add times and adjacent channels and pick leading 8 bits
-Then copy back to specific locations in host to form final [beam, time, frequency] array, to be sent to corner turn.
-
- */
-
-#define THRUST_IGNORE_CUB_VERSION_CHECK
-
-#include <iostream>
-#include <algorithm>
-using std::cout;
-using std::cerr;
-using std::endl;
-#include <stdio.h>
-#include <stdlib.h>
-#include <cmath>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <netinet/in.h>
-#include <time.h>
-#include <syslog.h>
-#include <pthread.h>
-
-#include <mma.h>
-#include <cuda.h>
-#include "cuda_fp16.h"
-//#include "dada_cuda.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "multilog.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_def.h"
-#include <thrust/device_ptr.h>
-#include <thrust/fill.h>
-
-#include <cuda_runtime_api.h>
-using namespace nvcuda;
-
-// global variables
-int DEBUG = 0;
-const float sep = 1.0;
-
-// kernel for summing for online bp
-// input array has order [beam, 48 frequency, 2 pol, 16 time]
-// need to output to [beam, 48 frequency]
-// run with 256*48=12288 blocks and 32 threads
-__global__
-void badder(float *input, float *output) {
-
-  // get block and thread ids
-  int bidx = blockIdx.x; // assume 256*48=12288
-  int tidx = threadIdx.x; // assume 32
-  //int fidx = 2*(bidx % 24);
-  int beamidx = (int)(bidx / 48);
-  
-  // declare shared mem
-  volatile __shared__ float data[32]; // data block to be summed  
-
-  // transfer from input to shared mem
-  data[tidx] = input[bidx*32+tidx];
-
-  // sync
-  __syncthreads();
-
-  // complete sum
-  if (tidx<16) {
-    data[tidx] += data[tidx+16]; // over pols
-    data[tidx] += data[tidx+8];
-    data[tidx] += data[tidx+4];
-    data[tidx] += data[tidx+2];
-    data[tidx] += data[tidx+1];
-  }
-  // now tidx = 0, 4, 8, 12 are what we want! 
-
-  __syncthreads();
-  
-  // store
-  if (tidx == 0) 
-    output[bidx] += data[0];
-      
-}
-
-
-// kernel for summing and requantizing
-// input array has order [beam, 48 frequency, 2 pol, 16 time]
-// need to output to [4 time, beam, 48 frequency]
-// bp is scale factor for each beam 
-// run with 256*48=12288 blocks and 32 threads
-__global__
-void adder(float *input, unsigned char *output, float *bp) {
-
-  // get block and thread ids
-  int bidx = blockIdx.x; // assume 256*48=12288
-  int tidx = threadIdx.x; // assume 32
-  //int fidx = 2*(bidx % 24);
-  int beamidx = (int)(bidx / 48);
-  
-  // declare shared mem
-  volatile __shared__ float data[32]; // data block to be summed  
-
-  // transfer from input to shared mem
-  data[tidx] = input[bidx*32+tidx];
-
-  // sync
-  __syncthreads();
-
-  // complete sum
-  if (tidx<16) {
-    data[tidx] += data[tidx+16]; // over pols
-    data[tidx] += data[tidx+2];
-    data[tidx] += data[tidx+1];
-  }
-  // now tidx = 0, 4, 8, 12 are what we want! 
-
-  __syncthreads();
-  
-  // store
-  if (tidx == 0) 
-    output[bidx] = (unsigned char)(__float2int_rn(data[0]*bp[beamidx])/2);
-  if (tidx == 4) 
-    output[bidx + 12288] = (unsigned char)(__float2int_rn(data[4]*bp[beamidx])/2);
-  if (tidx == 8) 
-    output[bidx + 2*12288] = (unsigned char)(__float2int_rn(data[8]*bp[beamidx])/2);
-  if (tidx == 12) 
-    output[bidx + 3*12288] = (unsigned char)(__float2int_rn(data[12]*bp[beamidx])/2);
-      
-}
-
-// kernel for promotion
-/*
-orig input is [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i]
-input is [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i]
-output needs to be [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] 
-promoted to half precision  
-
-launch with 16*48*NANT blocks of 32 threads
-
- */
-__global__ void promoter(char *input, half *inr, half *ini) {
-
-  int bidx = blockIdx.x; // assume 16*48*NANT
-  int tidx = threadIdx.x; // assume 32
-  int iidx = bidx*32+tidx;
-  int pol = (int)(tidx % 2);
-  int chunnel = (int)(tidx / 2);
-  
-  /*int ant = (int)(bidx % NANT);
-  int time_chan = (int)(bidx / NANT);    
-  int oidx = time_chan*2048+pol*1024+ant*16+chunnel;*/
-
-  int chan = (int)(bidx % 48);
-  int time_ant = (int)(bidx / 48);
-  int tim = (int)(time_ant / NANT);
-  int ant = (int)(time_ant % NANT);
-  int oidx = tim*98304 + chan*2048 + pol*1024 + ant*16 + chunnel;
-
-  //inr[oidx] = __float2half((float)(((char)((input[iidx] & 15) << 4)) >> 4));
-  //ini[oidx] = __float2half((float)(((char)((input[iidx] & 240))) >> 4));
-  inr[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(15)) << 4) >> 4));
-  ini[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4));
-
-}
-
-// 16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels
-// for first time, launch with 3072, 32
-__global__ void printer(half *inr, half *ini) {
-
-  int idx = blockIdx.x*32+threadIdx.x;
-  float ir = __half2float(inr[idx]);
-  float ii = __half2float(ini[idx]);
-
-  int chunnel = (int)(threadIdx.x % 16);
-  int channel = (int)(blockIdx.x/64);
-  int tt = (int)(blockIdx.x % 64);
-  int pol = (int)(tt/32);
-  int ant = ((int)(tt % 32))*((int)(threadIdx.x / 16));
-  
-  if (ir!=0. || ii!=0.) {
-    printf("%d %d %d %d %f %f\n",channel,pol,ant,chunnel,ir,ii);
-  }
-  
-}
-
-// 16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels
-// launch with 4,32
-__global__ void rms_printer(half *inr, half *ini) {
-
-  int idx = blockIdx.x*32+threadIdx.x;
-  int pol = (int)(idx / 64);
-  int ant = (int)(idx % 64);
-
-  float rms = 0., val;
-  for (int i=0;i<16;i++) {
-
-    idx = 786432 + 49152 + pol*64*16 + ant*16 + i;
-    
-    val = __half2float(inr[idx]);
-    rms += val*val;
-    val = __half2float(ini[idx]);
-    rms += val*val;
-
-  }
-  rms = sqrt(rms/32.);
-
-  printf("ANTPOL_RMS %d %d %f\n",ant,pol,rms);
-  
-}
-
-
-
-// kernel for beamforming
-/*
-
-Assumes that up to NANT antennas (nominally 63) are populated. 
-
-Input is [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] (promoted)
-
-Arithmetic... for rotation, d2r = wr*dr-wi*di; d2i = wi*dr+wr*di
-
-Conventions for beamforming. beam 0 is furthest East, beam 127 is at meridian. antpos (D) is easting. 
-for bf weight calculation, where theta = s(127-n), ang = 2*pi*nu*theta*D/c; wr = cos(ang), wi = sin(ang)
-use __float2int_rn, cosf, sinf intrinsics. 
-
-Each warp (==block) has to deal with 256 beams for 64 ants, summing over 16 chunnels and pols. 
-Do it in tiles of 16 beams and 16 ants for 
-
-Output array has order [beam, 48 frequency, 2 pol, 16 time]
-
-inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag
-wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant]
-
-launch with 16time * 48freq * 2pol * 16beam_tile blocks of 32 threads for massive utilization
- = 24576 blocks
-
-*/
-__global__ void beamformer(half *inr, half *ini, half *wr, half *wi, float *output, int stuffants) {
-
-  // get block and thread ids
-  int bidx = blockIdx.x; // assume 24576
-  int tidx = threadIdx.x; // assume 32
-  int orig_bidx = (int)(bidx / 16);
-  int beam_tile = (int)(bidx % 16);
-  int stuff_tile = (int)(beam_tile % 4);
-  int data_offset = orig_bidx*1024; // offset for first part of data
-  int weight_offset = (int)(orig_bidx % 96); // offset for first part of weight
-  weight_offset *= 16384;
-  int idx1, idx2;
-  int f_idx = (int)(orig_bidx % 96);
-  int tim_idx = (int)(orig_bidx / 96);
-  int oidx = f_idx*16 + tim_idx;
-  
-  // shared memory for convenience
-  __shared__ half summr[16][16]; // beam, chunnel
-  __shared__ float summi[16][16]; // beam, chunnel
-  
-  // accumulate real and imag parts into [16 beam x 16 f] fragments
-  // Declare the fragments.
-  wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;
-  wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wr_inr_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wr_ini_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wi_inr_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wi_ini_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, float> ib_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, float> final_frag;
-  
-  
-  // zero out accumulators
-  wmma::fill_fragment(wr_inr_frag, 0.0f);
-  wmma::fill_fragment(wr_ini_frag, 0.0f);
-  wmma::fill_fragment(wi_inr_frag, 0.0f);
-  wmma::fill_fragment(wi_ini_frag, 0.0f);
-  wmma::fill_fragment(ib_frag, 0.0f);
-
-  // IB
-  if (stuffants==2) {
-
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> c_frag;
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> d_frag;
-    
-    for (int ant_tile=0; ant_tile<4; ant_tile++) {
-
-      wmma::load_matrix_sync(c_frag, inr + data_offset + ant_tile*256, 16);
-      wmma::load_matrix_sync(d_frag, inr + data_offset + ant_tile*256, 16);
-      wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag);
-      wmma::load_matrix_sync(c_frag, ini + data_offset + ant_tile*256, 16);
-      wmma::load_matrix_sync(d_frag, ini + data_offset + ant_tile*256, 16);
-      wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag);
-
-    }
-
-  }
-
-  // one ant per beam
-  if (stuffants==1) {        
-
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> c_frag;
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> d_frag;
-    wmma::load_matrix_sync(c_frag, inr + data_offset + stuff_tile*256, 16);
-    wmma::load_matrix_sync(d_frag, inr + data_offset + stuff_tile*256, 16);
-    wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag);
-    wmma::load_matrix_sync(c_frag, ini + data_offset + stuff_tile*256, 16);
-    wmma::load_matrix_sync(d_frag, ini + data_offset + stuff_tile*256, 16);
-    wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag);
-    
-  }
-  if (stuffants!=1) {
-  
-    // loop over ant tiles
-    for (int ant_tile=0; ant_tile<4; ant_tile++) {
-      
-      // copy weight and data to fragments, and multiply to accumulators
-      
-      wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16);
-      wmma::load_matrix_sync(b_frag, inr + data_offset + ant_tile*256, 16);
-      wmma::mma_sync(wr_inr_frag, a_frag, b_frag, wr_inr_frag);
-      
-      wmma::load_matrix_sync(a_frag, wi + weight_offset + beam_tile*1024 + ant_tile*256, 16);
-      wmma::mma_sync(wi_inr_frag, a_frag, b_frag, wi_inr_frag);
-      
-      wmma::load_matrix_sync(b_frag, ini + data_offset + ant_tile*256, 16);
-      wmma::mma_sync(wi_ini_frag, a_frag, b_frag, wi_ini_frag);
-      
-      wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16);
-      wmma::mma_sync(wr_ini_frag, a_frag, b_frag, wr_ini_frag);
-      
-    }
-
-    // form real and imaginary matrices
-    for(int i=0; i < wr_inr_frag.num_elements; i++) {
-      wr_inr_frag.x[i] = wr_inr_frag.x[i] - wi_ini_frag.x[i]; // output real
-      wi_inr_frag.x[i] = wi_inr_frag.x[i] + wr_ini_frag.x[i]; // output imag
-      wr_inr_frag.x[i] = wr_inr_frag.x[i]*wr_inr_frag.x[i] + wi_inr_frag.x[i]*wi_inr_frag.x[i]; // squared
-    }
-  }
-
-  // at this stage the matrices are [beam, chunnel], and need to be summed over columns
-
-  __syncthreads();
-    
-  // copy back to shared mem
-  half *p1;
-  float *p2, tmp;
-  p1 = &summr[0][0];
-  wmma::store_matrix_sync(p1, wr_inr_frag, 16, wmma::mem_row_major);
-
-  __syncthreads();
-  
-  if (stuffants!=1) {
-
-    // now do thread reduction using multiplication by unity
-    wmma::fill_fragment(final_frag, 0.0f);
-    wmma::fill_fragment(b_frag, 1.0f);
-    wmma::load_matrix_sync(a_frag, p1, 16);
-    wmma::mma_sync(final_frag, a_frag, b_frag, final_frag);
-    p2 = &summi[0][0];
-    wmma::store_matrix_sync(p2, final_frag, 16, wmma::mem_row_major);
-    
-    __syncthreads();
-
-    // store
-    if (tidx<16) {
-      output[(beam_tile*16+tidx)*1536 + oidx] = summi[tidx][tidx];
-    }
-
-
-  }
-
-  if (stuffants==1) {
-    if (tidx<16) {
-      output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][tidx];
-    }
-  }
-  if (stuffants==2) {
-
-    p2 = &summi[0][0];
-    wmma::store_matrix_sync(p2, ib_frag, 16, wmma::mem_row_major);      
-    tmp = 0.;
-    for (int i=0;i<16;i++) tmp += summi[i][i];
-    if (tidx==0 && beam_tile==0) 
-      output[(beam_tile*16+tidx)*1536 + oidx] = tmp;
-
-  }      
-  
-}
-
-// kernel to calculate weights - needed because weights are halfs
-// launch with 256 threads in 6144 blocks
-__global__
-void calc_weights(float *antpos, float *weights, float *freqs, half *wr, half *wi) {
-
-  // assume 256 threads in 6144 blocks
-  int bidx = blockIdx.x; // over 48f, 2pol, 16 beam_tile, 4 ant_tile
-  int tidx = threadIdx.x;
-  int f = (int)(bidx / 128);
-  int cc = (int)(bidx % 128);
-  int pol = (int)(cc / 64);
-  cc = (int)(cc % 64);
-  int beam_tile = (int)(cc / 4);
-  int ant_tile = (int)(cc % 4);
-  int beam_i = (int)(tidx / 16);
-  int ant_i = (int)(tidx % 16);
-
-  int beam = beam_tile*16+beam_i;
-  int ant = ant_tile*16+ant_i;
-  int i = bidx*256+tidx;
-  int widx = ant*NW*2*2 + f*2*2 + pol*2;
-  
-  float theta = sep*(127.-beam*1.)*PI/10800.; // radians
-  float afac = -2.*PI*freqs[f*8+4]*theta/CVAC; // factor for rotate
-  float twr = cos(afac*antpos[ant]);
-  float twi = sin(afac*antpos[ant]);
-
-  wr[i] = __float2half((twr*weights[widx] - twi*weights[widx+1]));
-  wi[i] = __float2half((twi*weights[widx] + twr*weights[widx+1]));
-  
-  
-}  
- 
-  
-// function prototypes
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
-int dada_bind_thread_to_core (int core);
-int init_weights(char *fnam, float *antpos, float *weights, char *flagants);
-void reorder_block(char *block);
-void calc_bp(float *data, float *bp, int pr);
-void calc_allbp(float *data, float *bp);
-void ret_med_bp(float *bp);
-void ret_many_bp(float *many_bp, float *bp);
-
-// performs massive summation to calculate bp
-// input array has order [beam, 96 frequency, 16 time]
-// bp has size 48 - no way to avoid strided memory access
-// returns factor to correct data
-void calc_bp(float *data, float *bp, int pr) {
-
-  int i=0;
-  
-  for (int b=0;b<256;b++) {
-    for (int f=0;f<48;f++) {
-      for (int a=0;a<32;a++) {
-	bp[b] += data[i];
-	if (pr && data[i]!=0.) printf("%d %d %d %f\n",b,f,a,data[i]);
-	i++;
-      }
-    }
-  }
-
-}
-
-void calc_allbp(float *data, float *bp) {
-
-  int i=0;
-
-  for (int st=0;st<NSTREAMS;st++) {
-    for (int b=0;b<256;b++) {
-      for (int f=0;f<48;f++) {
-	bp[b] += data[i];
-	i++;
-      }
-    }
-  }
-
-}
-
-
-// for finding median of bandpass
-
-int cmpfunc(const void* elem1, const void* elem2)
-{
-  if(*(const float*)elem1 < *(const float*)elem2)
-    return -1;
-  return *(const float*)elem1 > *(const float*)elem2;
-}
-
-void ret_med_bp(float *bp) {
-
-  qsort(bp, 256, sizeof(float), cmpfunc);
-  float medval = 0.5*(bp[127]+bp[128]);
-  for (int i=0;i<256;i++)
-    bp[i] = medval;  
-
-}
-
-void ret_many_bp(float *many_bp, float *bp, float medbp) {
-
-  for (int i=0;i<256;i++) {
-    bp[i] = 0.;
-    for (int j=0;j<NBP;j++)
-      bp[i] += many_bp[j*256+i];
-    bp[i] /= 1.*NBP;
-  }
-
-  for (int i=0;i<256;i++) {
-    if (fabs(bp[i]-medbp)/medbp>0.1)
-      bp[i] = medbp;
-  }
-
-}
-
-// performs cpu reorder of block to be loaded to GPU
-void reorder_block(char * block) {
-
-  // from [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i]
-  // to [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i]
-  // 24576*NANT in total. 1536*NANT per time
-  
-  char * output = (char *)malloc(sizeof(char)*24576*NANT);
-  
-  for (int i=0;i<16;i++) { // over time
-    for (int j=0;j<NANT;j++) { // over ants
-      for (int k=0;k<48;k++) { // over channels
-
-	// copy 32 bytes
-	memcpy(output + i*1536*NANT + k*NANT*32 + j*32, block + i*1536*NANT + j*1536 + k*32, 32); 
-	
-      }
-    }
-  }
-
-  memcpy(block,output,24576*NANT);
-  free(output);
-
-}
-
-
-// loads in weights
-int init_weights(char * fnam, float *antpos, float *weights, char *flagants) {
-
-  // assumes 64 antennas
-  // antpos: takes only easting
-  // weights: takes [ant, NW==48] 
-
-  FILE *fin;
-  FILE *fants;
-  
-  if (!(fin=fopen(fnam,"rb"))) {
-    syslog(LOG_ERR,"Couldn't open weights file %s",fnam);
-    return 1;
-  }
-  if (!(fants=fopen(flagants,"r"))) {
-    syslog(LOG_ERR,"Couldn't open flag ants file %s",flagants);
-    return 1;
-  }
-
-  fread(antpos,64*sizeof(float),1,fin);
-  fread(weights,64*NW*2*2*sizeof(float),1,fin);
-  float wnorm;
-  for (int i=0;i<64*NW*2;i++) {
-    wnorm = sqrt(weights[2*i]*weights[2*i] + weights[2*i+1]*weights[2*i+1]);
-    if (wnorm!=0.0) {
-      weights[2*i] /= wnorm*wnorm;
-      weights[2*i+1] /= wnorm*wnorm;
-    }
-  }
-	
-
-  int ant;
-  while (!feof(fants)) {
-    fscanf(fants,"%d\n",&ant);
-    for (int j=0;j<NW*2*2;j++) {
-      weights[ant*NW*2*2+j] = 0.0;
-    }
-  }
-      
-  fclose(fants);
-  fclose(fin);
-  if (DEBUG) syslog(LOG_INFO,"Loaded antenna positions and weights");
-  return 0;
-
-}
-
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
-{
-
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
-  dada_hdu_destroy (in);
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock write on hdu_out");
-    }
-  dada_hdu_destroy (out);
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_beamformer [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -f filename for antenna stuff [no default]\n"
-	   " -i input key [default REORDER_BLOCK_KEY2]\n"
-	   " -o output key [default BF_BLOCK_KEY]\n"
-	   " -z fch1 in MHz [default 1530]\n"
-	   " -a flagants file\n"
-	   " -s stuffants \n"
-	   " -q do incoherent beam \n"
-	   " -g skip AGC \n"
-	   " -t test pattern \n"
-	   " -h print usage\n");
-}
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_beamformer", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-
-  // device properties
-  int nDevices;
-
-  cudaGetDeviceCount(&nDevices);
-  for (int i = 0; i < nDevices; i++) {
-    cudaDeviceProp prop;
-    cudaGetDeviceProperties(&prop, i);
-    syslog(LOG_INFO,"Device Number: %d", i);
-    syslog(LOG_INFO,"  Device name: %s", prop.name);
-    syslog(LOG_INFO,"  Memory Clock Rate (KHz): %d",prop.memoryClockRate);
-  }
-  cudaSetDevice(1);
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-
-  // data block HDU keys
-  key_t in_key = REORDER_BLOCK_KEY2;
-  key_t out_key = BF_BLOCK_KEY;
-  
-  // command line arguments
-  int core = -1;
-  int arg = 0;
-  int stuffants=0;
-  int test_pattern = 0;
-  float fch1 = 1530.0;
-  char * fnam;
-  fnam=(char *)malloc(sizeof(char)*100);
-  sprintf(fnam,"nofile");  
-  char * flagants;
-  flagants=(char *)malloc(sizeof(char)*100);
-  sprintf(flagants,"nofile");
-  int AGC = 1;
-
-  while ((arg=getopt(argc,argv,"c:f:i:o:z:a:tsqdgh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'f':
-	  if (optarg)
-	    {
-	      strcpy(fnam,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-f flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'a':
-	  if (optarg)
-	    {
-	      strcpy(flagants,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-a flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'z':
-	  if (optarg)
-	    {
-	      fch1 = atof(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-z flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'g':
-	  AGC=0;
-	  break;
-	case 't':
-	  test_pattern=1;
-	  syslog (LOG_INFO, "Will execute test pattern");
-	  break;
-	case 's':
-	  stuffants=1;
-	  syslog (LOG_INFO, "Will place antennas in output");
-	  break;
-	case 'q':
-	  stuffants=2;
-	  syslog (LOG_INFO, "Will place IB in output");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // print stuff
-  syslog(LOG_INFO,"Forming 256 beams with sep %g arcmin, fch1 %g",sep,fch1);
-  syslog(LOG_INFO,"Using calibrations file %s",fnam);
-  syslog(LOG_INFO,"Using flagants file %s",flagants);
-
-  // load in weights and antpos
-  float * antpos = (float *)malloc(sizeof(float)*64); // easting
-  float * weights = (float *)malloc(sizeof(float)*64*NW*2*2); // complex weights [ant, NW, pol, r/i]
-  float * freqs = (float *)malloc(sizeof(float)*384); // freq
-  for (int i=0;i<384;i++) freqs[i] = (fch1 - i*250./8192.)*1e6;  
-  
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
-  uint64_t  bytes_read = 0;
-  int nints = NPACKETS / 16;
-  uint64_t nbytes_per_int = block_size / nints;
-  uint64_t nbytes_per_out = block_out / nints;
-  char * block;
-  unsigned char * output_buffer;
-  output_buffer = (unsigned char *)malloc(sizeof(unsigned char)*block_out);
-  memset(output_buffer,0,block_out);
-  uint64_t written, block_id;
-  
-  // allocate host and device memory for calculations
-  //inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag
-  //wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant]        
-  char *d_indata[NSTREAMS];
-  unsigned char *d_outdata[NSTREAMS];
-  float *d_transfer[NSTREAMS], *d_bp, *d_antpos, *d_weights, *d_freqs;
-  half *d_wr, *d_wi, *d_inr[NSTREAMS], *d_ini[NSTREAMS];
-  float *d_added[NSTREAMS], *h_added;
-  h_added = (float *)malloc(sizeof(float)*256*48*NSTREAMS);
-  cudaMalloc((void **)&d_antpos, 64*sizeof(float)); // ant positions
-  cudaMalloc((void **)&d_weights, 64*NW*2*2*sizeof(float)); // weights
-  cudaMalloc((void **)&d_freqs, 384*sizeof(float)); // freqs        
-  cudaMalloc((void **)&d_bp, 256*sizeof(float)); // bandpass
-  cudaMalloc((void **)&d_wr, 48*2*16*4*16*16*sizeof(half)); // real weight
-  cudaMalloc((void **)&d_wi, 48*2*16*4*16*16*sizeof(half)); // imag weight
-  cudaMemcpy(d_freqs, freqs, 384*sizeof(float), cudaMemcpyHostToDevice);
-  
-  float *h_transfer = (float *)malloc(sizeof(float)*256*96*16*NSTREAMS);
-  char *h_indata = (char *)malloc(sizeof(char)*16*NANT*96*8*2);
-  float *bp = (float *)malloc(sizeof(float)*256);
-  float *frozen_bp = (float *)malloc(sizeof(float)*256);
-  float *many_bp = (float *)malloc(sizeof(float)*256*NBP);
-  int bpctr = 0;
-  float medbp;
-  unsigned char *tmp_buf = (unsigned char *)malloc(sizeof(unsigned char)*256*48*4*NSTREAMS);  
-  
-  // streams and device  
-  cudaStream_t stream[NSTREAMS];
-  for (int st=0;st<NSTREAMS;st++) {
-    cudaStreamCreate(&stream[st]);
-    cudaMalloc((void **)&d_added[st], 256*48*sizeof(float)); // added data for each iteration
-    cudaMalloc((void **)&d_indata[st], 16*96*NANT*8*2*sizeof(char)); // data input to bf kernel
-    cudaMalloc((void **)&d_outdata[st], 256*48*4*sizeof(unsigned char)); // data output from adder
-    cudaMalloc((void **)&d_transfer[st], 256*96*16*sizeof(float)); // output from beamformer
-    cudaMalloc((void **)&d_inr[st], 16*48*2*64*16*sizeof(half)); // real data
-    cudaMalloc((void **)&d_ini[st], 16*48*2*64*16*sizeof(half)); // real data
-    thrust::device_ptr<half> d1(d_inr[st]);
-    thrust::fill(d1, d1+16*48*2*64*16, 0.0);
-    thrust::device_ptr<half> d2(d_ini[st]);
-    thrust::fill(d2, d2+16*48*2*64*16, 0.0);
-  }
-
-  
-  
-  // set up
-
-  int observation_complete=0;
-  int blocks = 0, started = 0;
-  int blockct = 0;
-  int slow_down = 0;
-  int prestart = 0;
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-    
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-    blockct ++;
-
-    // DO STUFF
-
-    // calc weights
-    init_weights(fnam,antpos,weights,flagants);
-    cudaMemcpy(d_antpos, antpos, 64*sizeof(float), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_weights, weights, 64*NW*2*2*sizeof(float), cudaMemcpyHostToDevice);  
-    calc_weights<<<6144, 256>>>(d_antpos, d_weights, d_freqs, d_wr, d_wi);
-    if (DEBUG) syslog(LOG_INFO,"Finished with weights");
-
-    // zero out d_added
-    for (int st=0;st<NSTREAMS;st++)
-      cudaMemset(d_added[st], 0,  256*48*sizeof(float));
-
-    // loop over ints
-    for (int bst=0;bst<nints/NSTREAMS;bst++) {
-
-      // loop over streams
-      for (int st=0;st<NSTREAMS;st++) {	
-	
-	// copy to device
-	cudaMemcpyAsync(d_indata[st], block+(bst*NSTREAMS+st)*nbytes_per_int, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
-
-	// do promotion
-	promoter<<<16*48*NANT, 32, 0, stream[st]>>>(d_indata[st], d_inr[st], d_ini[st]);
-
-	// do printing if needed
-	if (bst==0 && slow_down==0) 
-	  rms_printer<<<4, 32, 0, stream[st]>>>(d_inr[st], d_ini[st]);
-	  
-	// run beamformer kernel
-	beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants);
-
-	// run badder kernel
-	badder<<<12288, 32, 0, stream[st]>>>(d_transfer[st], d_added[st]);
-	       
-	// if sufficient bandpasses...
-	if (started>0) {
-
-	  // run adder kernel
-	  adder<<<12288, 32, 0, stream[st]>>>(d_transfer[st], d_outdata[st], d_bp);
-	  
-	  // copy to host
-	  cudaMemcpyAsync(tmp_buf + 256*48*4*st, d_outdata[st], 256*48*4*sizeof(unsigned char), cudaMemcpyDeviceToHost, stream[st]);
-
-	  // copy to output
-	  for (int j=0;j<12288*4;j++) {
-	    if (test_pattern) 
-	      output_buffer[(bst*NSTREAMS+st)*12288*4+j] = (unsigned char)((blockct % 128) + 32);
-	    else
-	      output_buffer[(bst*NSTREAMS+st)*12288*4+j] = tmp_buf[j+256*48*4*st];
-	  }
-	  if (DEBUG && bst*NSTREAMS+st==10) {
-	    for (int j=0;j<48;j++) syslog(LOG_DEBUG,"%hu",output_buffer[(bst*NSTREAMS+st)*12288+BEAM_OUT*48+j]);
-	  }
-
-	}
-		  
-      }
-    }
-
-    // now deal with bandpass
-
-    // copy to host
-    for (int st=0;st<NSTREAMS;st++)
-      cudaMemcpy(h_added + 256*48*st, d_added[st], 256*48*sizeof(float), cudaMemcpyDeviceToHost);
-
-    // calculate bp
-    for (int i=0;i<256;i++) bp[i] = 0.;
-    calc_allbp(h_added, bp);
-
-    // place in correct location
-    for (int i=0;i<256;i++)
-      many_bp[i + 256*(bpctr % NBP)] = bp[i];
-
-    // deal with bp for data correction
-
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-
-      // do median bp
-      ret_med_bp(bp);
-      medbp = bp[100];
-      for (int i=0;i<256;i++) frozen_bp[i] = medbp;
-      
-      // junk into output
-      memset(output_buffer,0,block_out);
-      
-    }
-
-    if (started>0 && bpctr<NBP) 
-      ret_med_bp(bp);
-      
-    
-    if (started>0 && bpctr>=NBP) {
-      
-      //syslog(LOG_INFO,"now using many BPs for requant");      
-      
-      // do average bp
-      ret_many_bp(many_bp,bp,medbp);	
-
-      started=2;
-      
-    }
-
-    
-
-    // finally deal with bp
-    for (int i=0;i<256;i++) {
-
-      if (AGC==0)
-	for (int i=0;i<256;i++) bp[i] = frozen_bp[i];
-      
-      if (bpctr<15) syslog(LOG_INFO,"coeff %d %d %g",bpctr,i,bp[i]);
-      if (bp[i]!=0.) {
-	bp[i] /= 48.*nints; 
-	bp[i] = 2.5*128./bp[i];
-      }
-    }
-    cudaMemcpy(d_bp, bp, sizeof(float)*256, cudaMemcpyHostToDevice);
-
-    bpctr++;
-    slow_down++;
-    if (slow_down>=20) slow_down=0;
-    
-    // write to output
-    written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out);
-    if (written < block_out)
-      {
-	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
-	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-	return EXIT_FAILURE;
-      }
-
-    if (DEBUG) {
-      syslog(LOG_DEBUG, "written block %d",blocks);      
-    }
-    blocks++;
-    
-
-    if (bytes_read < block_size)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  for (int st=0;st<NSTREAMS;st++) {
-    cudaStreamDestroy(stream[st]);
-    cudaFree(d_indata[st]);
-    cudaFree(d_outdata[st]);
-    cudaFree(d_transfer[st]);
-    cudaFree(d_inr[st]);
-    cudaFree(d_ini[st]);
-    cudaFree(d_added[st]);
-  }
-  free(fnam);
-  free(flagants);
-  free(h_indata);
-  free(output_buffer);
-  free(antpos);
-  free(weights);
-  free(freqs);
-  free(bp);
-  free(many_bp);
-  free(h_transfer);
-  free(h_added);
-  free(tmp_buf);
-  cudaFree(d_wr);
-  cudaFree(d_wi);
-  cudaFree(d_antpos);
-  cudaFree(d_freqs);
-  cudaFree(d_weights);
-  cudaFree(d_wr);
-  cudaFree(d_wi);
-  cudaFree(d_bp);
-  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-  
-}
-
-
diff --git a/src/dsaX_beamformer.cu.wrk1 b/src/dsaX_beamformer.cu.wrk1
deleted file mode 100644
index 5724b60..0000000
--- a/src/dsaX_beamformer.cu.wrk1
+++ /dev/null
@@ -1,1003 +0,0 @@
-// -*- c++ -*-       
-/* will implement the 64-input beamformer 
-
-does N beams of 256
-
-order is (taking time as 8x 8.192e-6) 
-[2048 time, 63 antennas, 768 channels, 2 pol, r/i]
-Load in 16 times at a time, so that we have (in units of what needs to be added)
-[16 time, 63 antennas, 96 channels, 8 chunnels, 2 pol, r/i]
-
-This should be reordered on the cpu to 
-[16 time, 96 channels, 63 antennas, 8 chunnels, 2 pol, r/i]
-
-The first kernel, launched with 1536 blocks of 64 threads, needs to
- - promote each measurement and store in shared mem, parallelizing over ants. need only 8 kB. 
- - each thread processes 4 beams, adding everything. for each beam,
-  + for each chunnel and pol, calculate weights using cal weights and ant positions, 
-  + add everything into output array
-Output array has order [beam, 96 frequency, 16 time]
-
-Shared mem requirement: 8 kB for promoted data, 512b for positions, nch*1024b for weights
-
-Initialy we start with 4-bit numbers. these are first rotated using 17-bit weights, yielding 22-bit numbers. 
-these are then added: (64 ant)^2 * (2 complex) * (32 chan) * (2 pol) * (16 time). 
-after adding by 64 ants, we have 28-bit numbers. Need to bit shift right by 19 after adding 64 ants. This will yield 29-bit numbers. Need to bit shift right by 21 to pick off lowest 8 bits. 
-
-Do everything in floating point until second kernel. 
-
-Second kernel will simply add times and adjacent channels and pick leading 8 bits
-Then copy back to specific locations in host to form final [beam, time, frequency] array, to be sent to corner turn.
-
- */
-#include <iostream>
-#include <algorithm>
-using std::cout;
-using std::cerr;
-using std::endl;
-#include <stdio.h>
-#include <stdlib.h>
-#include <cmath>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <netinet/in.h>
-#include <time.h>
-#include <syslog.h>
-#include <pthread.h>
-
-#include <mma.h>
-#include <cuda.h>
-#include "cuda_fp16.h"
-//#include "dada_cuda.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "multilog.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_def.h"
-#include <thrust/device_ptr.h>
-#include <thrust/fill.h>
-
-#include <cuda_runtime_api.h>
-using namespace nvcuda;
-
-// global variables
-int DEBUG = 0;
-
-
-// kernel for summing and requantizing
-// input array has order [beam, 48 frequency, 2 pol, 16 time]
-// need to output to [4 time, beam, 48 frequency]
-// bp is scale factor for each beam 
-// run with 256*48=12288 blocks and 32 threads
-__global__
-void adder(float *input, unsigned char *output, float *bp) {
-
-  // get block and thread ids
-  int bidx = blockIdx.x; // assume 256*48=12288
-  int tidx = threadIdx.x; // assume 32
-  //int fidx = 2*(bidx % 24);
-  int beamidx = (int)(bidx / 48);
-  
-  // declare shared mem
-  volatile __shared__ float data[32]; // data block to be summed  
-
-  // transfer from input to shared mem
-  data[tidx] = input[bidx*32+tidx];
-
-  // sync
-  __syncthreads();
-
-  // complete sum
-  if (tidx<16) {
-    data[tidx] += data[tidx+16]; // over pols
-    data[tidx] += data[tidx+2];
-    data[tidx] += data[tidx+1];
-  }
-  // now tidx = 0, 4, 8, 12 are what we want! 
-
-  __syncthreads();
-  
-  // store
-  if (tidx == 0) 
-    output[bidx] = (unsigned char)(__float2int_rn(data[0]*bp[beamidx])/2);
-  if (tidx == 4) 
-    output[bidx + 12288] = (unsigned char)(__float2int_rn(data[4]*bp[beamidx])/2);
-  if (tidx == 8) 
-    output[bidx + 2*12288] = (unsigned char)(__float2int_rn(data[8]*bp[beamidx])/2);
-  if (tidx == 12) 
-    output[bidx + 3*12288] = (unsigned char)(__float2int_rn(data[12]*bp[beamidx])/2);
-      
-}
-
-// kernel for promotion
-/*
-orig input is [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i]
-input is [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i]
-output needs to be [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] 
-promoted to half precision  
-
-launch with 16*48*NANT blocks of 32 threads
-
- */
-__global__ void promoter(char *input, half *inr, half *ini) {
-
-  int bidx = blockIdx.x; // assume 16*48*NANT
-  int tidx = threadIdx.x; // assume 32
-  int iidx = bidx*32+tidx;
-  int pol = (int)(tidx % 2);
-  int chunnel = (int)(tidx / 2);
-  
-  /*int ant = (int)(bidx % NANT);
-  int time_chan = (int)(bidx / NANT);    
-  int oidx = time_chan*2048+pol*1024+ant*16+chunnel;*/
-
-  int chan = (int)(bidx % 48);
-  int time_ant = (int)(bidx / 48);
-  int tim = (int)(time_ant / NANT);
-  int ant = (int)(time_ant % NANT);
-  int oidx = tim*98304 + chan*2048 + pol*1024 + ant*16 + chunnel;
-
-  //inr[oidx] = __float2half((float)(((char)((input[iidx] & 15) << 4)) >> 4));
-  //ini[oidx] = __float2half((float)(((char)((input[iidx] & 240))) >> 4));
-  inr[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(15)) << 4) >> 4));
-  ini[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4));
-
-}
-
-// 16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels
-// for first time, launch with 3072, 32
-__global__ void printer(half *inr, half *ini) {
-
-  int idx = blockIdx.x*32+threadIdx.x;
-  float ir = __half2float(inr[idx]);
-  float ii = __half2float(ini[idx]);
-
-  int chunnel = (int)(threadIdx.x % 16);
-  int channel = (int)(blockIdx.x/64);
-  int tt = (int)(blockIdx.x % 64);
-  int pol = (int)(tt/32);
-  int ant = ((int)(tt % 32))*((int)(threadIdx.x / 16));
-  
-  if (ir!=0. || ii!=0.) {
-    printf("%d %d %d %d %f %f\n",channel,pol,ant,chunnel,ir,ii);
-  }
-  
-}
-
-
-// kernel for beamforming
-/*
-
-Assumes that up to NANT antennas (nominally 63) are populated. 
-
-Input is [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] (promoted)
-
-Arithmetic... for rotation, d2r = wr*dr-wi*di; d2i = wi*dr+wr*di
-
-Conventions for beamforming. beam 0 is furthest East, beam 127 is at meridian. antpos (D) is easting. 
-for bf weight calculation, where theta = s(127-n), ang = 2*pi*nu*theta*D/c; wr = cos(ang), wi = sin(ang)
-use __float2int_rn, cosf, sinf intrinsics. 
-
-Each warp (==block) has to deal with 256 beams for 64 ants, summing over 16 chunnels and pols. 
-Do it in tiles of 16 beams and 16 ants for 
-
-Output array has order [beam, 48 frequency, 2 pol, 16 time]
-
-inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag
-wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant]
-
-launch with 16time * 48freq * 2pol * 16beam_tile blocks of 32 threads for massive utilization
- = 24576 blocks
-
-*/
-__global__ void beamformer(half *inr, half *ini, half *wr, half *wi, float *output, int stuffants) {
-
-  // get block and thread ids
-  int bidx = blockIdx.x; // assume 24576
-  int tidx = threadIdx.x; // assume 32
-  int orig_bidx = (int)(bidx / 16);
-  int beam_tile = (int)(bidx % 16);
-  int stuff_tile = (int)(beam_tile % 4);
-  int data_offset = orig_bidx*1024; // offset for first part of data
-  int weight_offset = (int)(orig_bidx % 96); // offset for first part of weight
-  weight_offset *= 16384;
-  int idx1, idx2;
-  int f_idx = (int)(orig_bidx % 96);
-  int tim_idx = (int)(orig_bidx / 96);
-  int oidx = f_idx*16 + tim_idx;
-  
-  // shared memory for convenience
-  __shared__ half summr[16][16]; // beam, chunnel
-  __shared__ float summi[16][16]; // beam, chunnel
-  
-  // accumulate real and imag parts into [16 beam x 16 f] fragments
-  // Declare the fragments.
-  wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;
-  wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wr_inr_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wr_ini_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wi_inr_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wi_ini_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, float> ib_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, float> final_frag;
-  
-  
-  // zero out accumulators
-  wmma::fill_fragment(wr_inr_frag, 0.0f);
-  wmma::fill_fragment(wr_ini_frag, 0.0f);
-  wmma::fill_fragment(wi_inr_frag, 0.0f);
-  wmma::fill_fragment(wi_ini_frag, 0.0f);
-  wmma::fill_fragment(ib_frag, 0.0f);
-
-  // IB
-  if (stuffants==2) {
-
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> c_frag;
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> d_frag;
-    
-    for (int ant_tile=0; ant_tile<4; ant_tile++) {
-
-      wmma::load_matrix_sync(c_frag, inr + data_offset + ant_tile*256, 16);
-      wmma::load_matrix_sync(d_frag, inr + data_offset + ant_tile*256, 16);
-      wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag);
-      wmma::load_matrix_sync(c_frag, ini + data_offset + ant_tile*256, 16);
-      wmma::load_matrix_sync(d_frag, ini + data_offset + ant_tile*256, 16);
-      wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag);
-
-    }
-
-  }
-
-  // one ant per beam
-  if (stuffants==1) {        
-
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> c_frag;
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> d_frag;
-    wmma::load_matrix_sync(c_frag, inr + data_offset + stuff_tile*256, 16);
-    wmma::load_matrix_sync(d_frag, inr + data_offset + stuff_tile*256, 16);
-    wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag);
-    wmma::load_matrix_sync(c_frag, ini + data_offset + stuff_tile*256, 16);
-    wmma::load_matrix_sync(d_frag, ini + data_offset + stuff_tile*256, 16);
-    wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag);
-    
-  }
-  if (stuffants!=1) {
-  
-    // loop over ant tiles
-    for (int ant_tile=0; ant_tile<4; ant_tile++) {
-      
-      // copy weight and data to fragments, and multiply to accumulators
-      
-      wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16);
-      wmma::load_matrix_sync(b_frag, inr + data_offset + ant_tile*256, 16);
-      wmma::mma_sync(wr_inr_frag, a_frag, b_frag, wr_inr_frag);
-      
-      wmma::load_matrix_sync(a_frag, wi + weight_offset + beam_tile*1024 + ant_tile*256, 16);
-      wmma::mma_sync(wi_inr_frag, a_frag, b_frag, wi_inr_frag);
-      
-      wmma::load_matrix_sync(b_frag, ini + data_offset + ant_tile*256, 16);
-      wmma::mma_sync(wi_ini_frag, a_frag, b_frag, wi_ini_frag);
-      
-      wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16);
-      wmma::mma_sync(wr_ini_frag, a_frag, b_frag, wr_ini_frag);
-      
-    }
-
-    // form real and imaginary matrices
-    for(int i=0; i < wr_inr_frag.num_elements; i++) {
-      wr_inr_frag.x[i] = wr_inr_frag.x[i] - wi_ini_frag.x[i]; // output real
-      wi_inr_frag.x[i] = wi_inr_frag.x[i] + wr_ini_frag.x[i]; // output imag
-      wr_inr_frag.x[i] = wr_inr_frag.x[i]*wr_inr_frag.x[i] + wi_inr_frag.x[i]*wi_inr_frag.x[i]; // squared
-    }
-  }
-
-  // at this stage the matrices are [beam, chunnel], and need to be summed over columns
-
-  __syncthreads();
-    
-  // copy back to shared mem
-  half *p1;
-  float *p2, tmp;
-  p1 = &summr[0][0];
-  wmma::store_matrix_sync(p1, wr_inr_frag, 16, wmma::mem_row_major);
-
-  __syncthreads();
-  
-  if (stuffants!=1) {
-
-    // now do thread reduction using multiplication by unity
-    wmma::fill_fragment(final_frag, 0.0f);
-    wmma::fill_fragment(b_frag, 1.0f);
-    wmma::load_matrix_sync(a_frag, p1, 16);
-    wmma::mma_sync(final_frag, a_frag, b_frag, final_frag);
-    p2 = &summi[0][0];
-    wmma::store_matrix_sync(p2, final_frag, 16, wmma::mem_row_major);
-    
-    __syncthreads();
-
-    // store
-    if (tidx<16) {
-      output[(beam_tile*16+tidx)*1536 + oidx] = summi[tidx][tidx];
-    }
-
-
-  }
-
-  if (stuffants==1) {
-    if (tidx<16) {
-      output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][tidx];
-    }
-  }
-  if (stuffants==2) {
-
-    p2 = &summi[0][0];
-    wmma::store_matrix_sync(p2, ib_frag, 16, wmma::mem_row_major);      
-    tmp = 0.;
-    for (int i=0;i<16;i++) tmp += summi[i][i];
-    if (tidx==0 && beam_tile==0) 
-      output[(beam_tile*16+tidx)*1536 + oidx] = tmp;
-
-  }      
-  
-}
-
-// kernel to calculate weights - needed because weights are halfs
-// launch with 256 threads in 6144 blocks
-__global__
-void calc_weights(float *antpos, float *weights, float *freqs, half *wr, half *wi) {
-
-  // assume 256 threads in 6144 blocks
-  int bidx = blockIdx.x; // over 48f, 2pol, 16 beam_tile, 4 ant_tile
-  int tidx = threadIdx.x;
-  int f = (int)(bidx / 128);
-  int cc = (int)(bidx % 128);
-  int pol = (int)(cc / 64);
-  cc = (int)(cc % 64);
-  int beam_tile = (int)(cc / 4);
-  int ant_tile = (int)(cc % 4);
-  int beam_i = (int)(tidx / 16);
-  int ant_i = (int)(tidx % 16);
-
-  int beam = beam_tile*16+beam_i;
-  int ant = ant_tile*16+ant_i;
-  int i = bidx*256+tidx;
-  int widx = ant*NW*2*2 + f*2*2 + pol*2;
-  
-  float theta = sep*(127.-beam*1.)*PI/10800.; // radians
-  float afac = -2.*PI*freqs[f*8+4]*theta/CVAC; // factor for rotate
-  float twr = cos(afac*antpos[ant]);
-  float twi = sin(afac*antpos[ant]);
-
-  wr[i] = __float2half((twr*weights[widx] - twi*weights[widx+1]));
-  wi[i] = __float2half((twi*weights[widx] + twr*weights[widx+1]));
-  
-  
-}  
- 
-  
-// function prototypes
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
-int dada_bind_thread_to_core (int core);
-int init_weights(char *fnam, float *antpos, float *weights, char *flagants);
-void reorder_block(char *block);
-void calc_bp(float *data, float *bp, int pr);
-
-
-// performs massive summation to calculate bp
-// input array has order [beam, 96 frequency, 16 time]
-// bp has size 48 - no way to avoid strided memory access
-// returns factor to correct data
-void calc_bp(float *data, float *bp, int pr) {
-
-  int i=0;
-  
-  for (int b=0;b<256;b++) {
-    for (int f=0;f<48;f++) {
-      for (int a=0;a<32;a++) {
-	bp[b] += data[i];
-	if (pr && data[i]!=0.) printf("%d %d %d %f\n",b,f,a,data[i]);
-	i++;
-      }
-    }
-  }
-
-}
-
-// for finding median of bandpass
-
-int cmpfunc(const void* elem1, const void* elem2)
-{
-  if(*(const float*)elem1 < *(const float*)elem2)
-    return -1;
-  return *(const float*)elem1 > *(const float*)elem2;
-}
-
-void ret_med_bp(float *bp) {
-
-  qsort(bp, 256, sizeof(float), cmpfunc);
-  float medval = 0.5*(bp[127]+bp[128]);
-  for (int i=0;i<256;i++)
-    bp[i] = medval;  
-
-}
-
-// performs cpu reorder of block to be loaded to GPU
-void reorder_block(char * block) {
-
-  // from [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i]
-  // to [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i]
-  // 24576*NANT in total. 1536*NANT per time
-  
-  char * output = (char *)malloc(sizeof(char)*24576*NANT);
-  
-  for (int i=0;i<16;i++) { // over time
-    for (int j=0;j<NANT;j++) { // over ants
-      for (int k=0;k<48;k++) { // over channels
-
-	// copy 32 bytes
-	memcpy(output + i*1536*NANT + k*NANT*32 + j*32, block + i*1536*NANT + j*1536 + k*32, 32); 
-	
-      }
-    }
-  }
-
-  memcpy(block,output,24576*NANT);
-  free(output);
-
-}
-
-
-// loads in weights
-int init_weights(char * fnam, float *antpos, float *weights, char *flagants) {
-
-  // assumes 64 antennas
-  // antpos: takes only easting
-  // weights: takes [ant, NW==48] 
-
-  FILE *fin;
-  FILE *fants;
-  
-  if (!(fin=fopen(fnam,"rb"))) {
-    syslog(LOG_ERR,"Couldn't open weights file %s",fnam);
-    return 1;
-  }
-  if (!(fants=fopen(flagants,"r"))) {
-    syslog(LOG_ERR,"Couldn't open flag ants file %s",flagants);
-    return 1;
-  }
-
-  fread(antpos,64*sizeof(float),1,fin);
-  fread(weights,64*NW*2*2*sizeof(float),1,fin);
-  float wnorm;
-  for (int i=0;i<64*NW*2;i++) {
-    wnorm = sqrt(weights[2*i]*weights[2*i] + weights[2*i+1]*weights[2*i+1]);
-    if (wnorm!=0.0) {
-      weights[2*i] /= wnorm*wnorm;
-      weights[2*i+1] /= wnorm*wnorm;
-    }
-  }
-	
-
-  int ant;
-  while (!feof(fants)) {
-    fscanf(fants,"%d\n",&ant);
-    for (int j=0;j<NW*2*2;j++) {
-      weights[ant*NW*2*2+j] = 0.0;
-    }
-  }
-      
-  fclose(fants);
-  fclose(fin);
-  if (DEBUG) syslog(LOG_INFO,"Loaded antenna positions and weights");
-  return 0;
-
-}
-
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
-{
-
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
-  dada_hdu_destroy (in);
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock write on hdu_out");
-    }
-  dada_hdu_destroy (out);
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_beamformer [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -f filename for antenna stuff [no default]\n"
-	   " -i input key [default REORDER_BLOCK_KEY2]\n"
-	   " -o output key [default BF_BLOCK_KEY]\n"
-	   " -z fch1 in MHz [default 1530]\n"
-	   " -a flagants file\n"
-	   " -s stuffants \n"
-	   " -q do incoherent beam \n"
-	   " -t test pattern \n"
-	   " -h print usage\n");
-}
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_beamformer", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-
-  // device properties
-  int nDevices;
-
-  cudaGetDeviceCount(&nDevices);
-  for (int i = 0; i < nDevices; i++) {
-    cudaDeviceProp prop;
-    cudaGetDeviceProperties(&prop, i);
-    syslog(LOG_INFO,"Device Number: %d", i);
-    syslog(LOG_INFO,"  Device name: %s", prop.name);
-    syslog(LOG_INFO,"  Memory Clock Rate (KHz): %d",prop.memoryClockRate);
-  }
-  cudaSetDevice(1);
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-
-  // data block HDU keys
-  key_t in_key = REORDER_BLOCK_KEY2;
-  key_t out_key = BF_BLOCK_KEY;
-  
-  // command line arguments
-  int core = -1;
-  int arg = 0;
-  int stuffants=0;
-  int test_pattern = 0;
-  float fch1 = 1530.0;
-  char * fnam;
-  fnam=(char *)malloc(sizeof(char)*100);
-  sprintf(fnam,"nofile");  
-  char * flagants;
-  flagants=(char *)malloc(sizeof(char)*100);
-  sprintf(flagants,"nofile");  
-
-  while ((arg=getopt(argc,argv,"c:f:i:o:z:a:tsqdh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'f':
-	  if (optarg)
-	    {
-	      strcpy(fnam,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-f flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'a':
-	  if (optarg)
-	    {
-	      strcpy(flagants,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-a flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'z':
-	  if (optarg)
-	    {
-	      fch1 = atof(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-z flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 't':
-	  test_pattern=1;
-	  syslog (LOG_INFO, "Will execute test pattern");
-	  break;
-	case 's':
-	  stuffants=1;
-	  syslog (LOG_INFO, "Will place antennas in output");
-	  break;
-	case 'q':
-	  stuffants=2;
-	  syslog (LOG_INFO, "Will place IB in output");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // print stuff
-  syslog(LOG_INFO,"Forming 256 beams with sep %g arcmin, fch1 %g",sep,fch1);
-  syslog(LOG_INFO,"Using calibrations file %s",fnam);
-  syslog(LOG_INFO,"Using flagants file %s",flagants);
-
-  // load in weights and antpos
-  float * antpos = (float *)malloc(sizeof(float)*64); // easting
-  float * weights = (float *)malloc(sizeof(float)*64*NW*2*2); // complex weights [ant, NW, pol, r/i]
-  float * freqs = (float *)malloc(sizeof(float)*384); // freq
-  for (int i=0;i<384;i++) freqs[i] = (fch1 - i*250./8192.)*1e6;  
-  
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create ();
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create ();
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
-  uint64_t  bytes_read = 0;
-  int nints = NPACKETS / 16;
-  uint64_t nbytes_per_int = block_size / nints;
-  uint64_t nbytes_per_out = block_out / nints;
-  char * block;
-  unsigned char * output_buffer;
-  output_buffer = (unsigned char *)malloc(sizeof(unsigned char)*block_out);
-  memset(output_buffer,0,block_out);
-  uint64_t written, block_id;
-  
-  // allocate host and device memory for calculations
-  //inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag
-  //wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant]        
-  char *d_indata[NSTREAMS];
-  unsigned char *d_outdata[NSTREAMS];
-  float *d_transfer[NSTREAMS], *d_bp, *d_antpos, *d_weights, *d_freqs;
-  half *d_wr, *d_wi, *d_inr[NSTREAMS], *d_ini[NSTREAMS];
-  cudaMalloc((void **)&d_antpos, 64*sizeof(float)); // ant positions
-  cudaMalloc((void **)&d_weights, 64*NW*2*2*sizeof(float)); // weights
-  cudaMalloc((void **)&d_freqs, 384*sizeof(float)); // freqs        
-  cudaMalloc((void **)&d_bp, 256*sizeof(float)); // bandpass
-  cudaMalloc((void **)&d_wr, 48*2*16*4*16*16*sizeof(half)); // real weight
-  cudaMalloc((void **)&d_wi, 48*2*16*4*16*16*sizeof(half)); // imag weight
-  cudaMemcpy(d_freqs, freqs, 384*sizeof(float), cudaMemcpyHostToDevice);
-  
-  float *h_transfer = (float *)malloc(sizeof(float)*256*96*16*NSTREAMS);
-  char *h_indata = (char *)malloc(sizeof(char)*16*NANT*96*8*2);
-  float *bp = (float *)malloc(sizeof(float)*256);
-  unsigned char *tmp_buf = (unsigned char *)malloc(sizeof(unsigned char)*256*48*4*NSTREAMS);  
-  
-  // streams and device  
-  cudaStream_t stream[NSTREAMS];
-  for (int st=0;st<NSTREAMS;st++) {
-    cudaStreamCreate(&stream[st]);
-    cudaMalloc((void **)&d_indata[st], 16*96*NANT*8*2*sizeof(char)); // data input to bf kernel
-    cudaMalloc((void **)&d_outdata[st], 256*48*4*sizeof(unsigned char)); // data output from adder
-    cudaMalloc((void **)&d_transfer[st], 256*96*16*sizeof(float)); // output from beamformer
-    cudaMalloc((void **)&d_inr[st], 16*48*2*64*16*sizeof(half)); // real data
-    cudaMalloc((void **)&d_ini[st], 16*48*2*64*16*sizeof(half)); // real data
-    thrust::device_ptr<half> d1(d_inr[st]);
-    thrust::fill(d1, d1+16*48*2*64*16, 0.0);
-    thrust::device_ptr<half> d2(d_ini[st]);
-    thrust::fill(d2, d2+16*48*2*64*16, 0.0);
-  }
-
-  
-  
-  // set up
-
-  int observation_complete=0;
-  int blocks = 0, started = 0;
-  int blockct = 0;
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-    blockct ++;
-
-    // DO STUFF
-
-    // calc weights
-    init_weights(fnam,antpos,weights,flagants);
-    cudaMemcpy(d_antpos, antpos, 64*sizeof(float), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_weights, weights, 64*NW*2*2*sizeof(float), cudaMemcpyHostToDevice);  
-    calc_weights<<<6144, 256>>>(d_antpos, d_weights, d_freqs, d_wr, d_wi);
-    if (DEBUG) syslog(LOG_INFO,"Finished with weights");
-    
-    if (started==1) {
-
-      // loop over ints
-      for (int bst=0;bst<nints/NSTREAMS;bst++) {
-
-	for (int st=0;st<NSTREAMS;st++) {
-
-
-	  
-	  // copy to h_indata
-	  //memcpy(h_indata,block+(bst*NSTREAMS+st)*nbytes_per_int,nbytes_per_int);
-
-	  // rotate h_indata in place
-	  //reorder_block(h_indata);
-	  
-	  // copy to device
-	  //cudaMemcpyAsync(d_indata, h_indata, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
-	  cudaMemcpyAsync(d_indata[st], block+(bst*NSTREAMS+st)*nbytes_per_int, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
-
-	  // do promotion
-	  promoter<<<16*48*NANT, 32, 0, stream[st]>>>(d_indata[st], d_inr[st], d_ini[st]);
-	  
-	  // run beamformer kernel
-	  beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants);
-	  	  
-	  // run adder kernel
-	  adder<<<12288, 32, 0, stream[st]>>>(d_transfer[st], d_outdata[st], d_bp);
-	  
-	  // copy to host
-	  cudaMemcpyAsync(tmp_buf + 256*48*4*st, d_outdata[st], 256*48*4*sizeof(unsigned char), cudaMemcpyDeviceToHost, stream[st]);
-
-	  // copy to output
-	  for (int j=0;j<12288*4;j++) {
-	    if (test_pattern) 
-	      output_buffer[(bst*NSTREAMS+st)*12288*4+j] = (unsigned char)((blockct % 128) + 32);
-	    else
-	      output_buffer[(bst*NSTREAMS+st)*12288*4+j] = tmp_buf[j+256*48*4*st];
-	  }
-	  if (DEBUG && bst*NSTREAMS+st==10) {
-	    for (int j=0;j<48;j++) syslog(LOG_DEBUG,"%hu",output_buffer[(bst*NSTREAMS+st)*12288+BEAM_OUT*48+j]);
-	  }        
-	  
-	}
-      }
-
-
-    }
-    
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-
-      // calculate bandpass
-
-      for (int i=0;i<256;i++) bp[i] = 0.;
-      
-      // do standard bf but calculate bandpass
-
-      // loop over ints
-      for (int bst=0;bst<nints/NSTREAMS;bst++) {
-
-	for (int st=0;st<NSTREAMS;st++) {
-	  
-	  // copy to h_indata
-	  //memcpy(h_indata,block+(bst*NSTREAMS+st)*nbytes_per_int,nbytes_per_int);
-
-	  // rotate h_indata in place - this is current
-	  //reorder_block(h_indata);
-
-	  // copy to device
-	  //cudaMemcpyAsync(d_indata, h_indata, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
-	  cudaMemcpyAsync(d_indata[st], block+(bst*NSTREAMS+st)*nbytes_per_int, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
-
-	  // do promotion
-	  promoter<<<16*48*NANT, 32, 0, stream[st]>>>(d_indata[st], d_inr[st], d_ini[st]);
-
-	  //if (bst==0 && st==0) 
-	  //  printer<<<3072, 32>>>(d_inr,d_ini);	  
-	  
-	  // run beamformer kernel
-	  beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants);
-	  
-	  // copy back to host
-	  cudaMemcpyAsync(h_transfer + st*256*96*16, d_transfer[st], sizeof(float)*393216, cudaMemcpyDeviceToHost, stream[st]);	
-
-	  // calculate bandpass
-	  //if (st==0 && bst==0) 
-	  //calc_bp(h_transfer,bp,1);
-	  calc_bp(h_transfer + st*256*96*16,bp,0);
-	  ret_med_bp(bp);
-
-	}
-      }
-
-      // adjust bandpass
-      syslog(LOG_INFO,"Final BP...");
-      for (int i=0;i<256;i++) {
-	syslog(LOG_INFO,"coeff %d %g",i,bp[i]);
-	if (bp[i]!=0.) {
-	  bp[i] /= 48.*nints; 
-	  bp[i] = 2.5*128./bp[i];
-	}
-      }
-      cudaMemcpy(d_bp, bp, sizeof(float)*256, cudaMemcpyHostToDevice);
-      
-      // junk into output
-      memset(output_buffer,0,block_out);
-      
-    }
-
-    // write output for debug
-    
-    // write to output
-    written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out);
-    if (written < block_out)
-      {
-	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
-	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-	return EXIT_FAILURE;
-      }
-
-    if (DEBUG) {
-      syslog(LOG_DEBUG, "written block %d",blocks);      
-    }
-    blocks++;
-    
-
-    if (bytes_read < block_size)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  for (int st=0;st<NSTREAMS;st++) {
-    cudaStreamDestroy(stream[st]);
-    cudaFree(d_indata[st]);
-    cudaFree(d_outdata[st]);
-    cudaFree(d_transfer[st]);
-    cudaFree(d_inr[st]);
-    cudaFree(d_ini[st]);
-  }
-  free(fnam);
-  free(flagants);
-  free(h_indata);
-  free(output_buffer);
-  free(antpos);
-  free(weights);
-  free(freqs);
-  free(bp);
-  free(h_transfer);
-  free(tmp_buf);
-  cudaFree(d_wr);
-  cudaFree(d_wi);
-  cudaFree(d_antpos);
-  cudaFree(d_freqs);
-  cudaFree(d_weights);
-  cudaFree(d_wr);
-  cudaFree(d_wi);
-  cudaFree(d_bp);
-  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-  
-}
-
-
diff --git a/src/dsaX_beamformer_offline.cu b/src/dsaX_beamformer_offline.cu
deleted file mode 100644
index c122d46..0000000
--- a/src/dsaX_beamformer_offline.cu
+++ /dev/null
@@ -1,933 +0,0 @@
-// -*- c++ -*-       
-/* will implement the 64-input beamformer 
-
-does N beams of 256
-
-order is (taking time as 8x 8.192e-6) 
-[2048 time, 63 antennas, 768 channels, 2 pol, r/i]
-Load in 16 times at a time, so that we have (in units of what needs to be added)
-[16 time, 63 antennas, 96 channels, 8 chunnels, 2 pol, r/i]
-
-This should be reordered on the cpu to 
-[16 time, 96 channels, 63 antennas, 8 chunnels, 2 pol, r/i]
-
-The first kernel, launched with 1536 blocks of 64 threads, needs to
- - promote each measurement and store in shared mem, parallelizing over ants. need only 8 kB. 
- - each thread processes 4 beams, adding everything. for each beam,
-  + for each chunnel and pol, calculate weights using cal weights and ant positions, 
-  + add everything into output array
-Output array has order [beam, 96 frequency, 16 time]
-
-Shared mem requirement: 8 kB for promoted data, 512b for positions, nch*1024b for weights
-
-Initialy we start with 4-bit numbers. these are first rotated using 17-bit weights, yielding 22-bit numbers. 
-these are then added: (64 ant)^2 * (2 complex) * (32 chan) * (2 pol) * (16 time). 
-after adding by 64 ants, we have 28-bit numbers. Need to bit shift right by 19 after adding 64 ants. This will yield 29-bit numbers. Need to bit shift right by 21 to pick off lowest 8 bits. 
-
-Do everything in floating point until second kernel. 
-
-Second kernel will simply add times and adjacent channels and pick leading 8 bits
-Then copy back to specific locations in host to form final [beam, time, frequency] array, to be sent to corner turn.
-
- */
-#define THRUST_IGNORE_CUB_VERSION_CHECK
-
-#include <iostream>
-#include <algorithm>
-using std::cout;
-using std::cerr;
-using std::endl;
-#include <stdio.h>
-#include <stdlib.h>
-#include <cmath>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <netinet/in.h>
-#include <time.h>
-#include <syslog.h>
-#include <pthread.h>
-
-#include <mma.h>
-#include <cuda.h>
-#include "cuda_fp16.h"
-//#include "dada_cuda.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "multilog.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_def.h"
-#include <thrust/device_ptr.h>
-#include <thrust/fill.h>
-
-#include <cuda_runtime_api.h>
-using namespace nvcuda;
-
-#define sep 1.0
-
-// global variables
-int DEBUG = 0;
-
-
-// kernel for summing and requantizing
-// input array has order [beam, 48 frequency, 2 pol, 16 time]
-// need to output to [4 time, beam, 48 frequency]
-// bp is scale factor for each beam 
-// run with 256*48=12288 blocks and 32 threads
-__global__
-void adder(float *input, unsigned char *output, float *bp) {
-
-  // get block and thread ids
-  int bidx = blockIdx.x; // assume 256*48=12288
-  int tidx = threadIdx.x; // assume 32
-  //int fidx = 2*(bidx % 24);
-  int beamidx = (int)(bidx / 48);
-  
-  // declare shared mem
-  volatile __shared__ float data[32]; // data block to be summed  
-
-  // transfer from input to shared mem
-  data[tidx] = input[bidx*32+tidx];
-  
-  // sync
-  __syncthreads();
-
-  // complete sum
-  if (tidx<16) {
-    data[tidx] += data[tidx+16]; // over pols
-    data[tidx] += data[tidx+2];
-    data[tidx] += data[tidx+1];
-  }
-  // now tidx = 0, 4, 8, 12 are what we want! 
-
-  __syncthreads();
-  
-  // store
-  if (tidx == 0) 
-    output[bidx] = (unsigned char)(__float2int_rn(data[0]*bp[beamidx])/2);
-  if (tidx == 4) 
-    output[bidx + 12288] = (unsigned char)(__float2int_rn(data[4]*bp[beamidx])/2);
-  if (tidx == 8) 
-    output[bidx + 2*12288] = (unsigned char)(__float2int_rn(data[8]*bp[beamidx])/2);
-  if (tidx == 12) 
-    output[bidx + 3*12288] = (unsigned char)(__float2int_rn(data[12]*bp[beamidx])/2);
-  
-  /*if (tidx == 0)
-    output[bidx] = (unsigned char)(__float2int_rn(data[0]));
-  if (tidx == 4)
-    output[bidx + 12288] = (unsigned char)(__float2int_rn(data[4]));
-  if (tidx == 8)
-    output[bidx + 2*12288] = (unsigned char)(__float2int_rn(data[8]));
-  if (tidx == 12)
-  output[bidx + 3*12288] = (unsigned char)(__float2int_rn(data[12]));*/
-  
-}
-
-// kernel for promotion
-/*
-orig input is [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i]
-input is [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i]
-output needs to be [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] 
-promoted to half precision  
-
-launch with 16*48*NANT blocks of 32 threads
-
- */
-__global__ void promoter(char *input, half *inr, half *ini) {
-
-  int bidx = blockIdx.x; // assume 16*48*NANT
-  int tidx = threadIdx.x; // assume 32
-  int iidx = bidx*32+tidx;
-  int pol = (int)(tidx % 2);
-  int chunnel = (int)(tidx / 2);
-  
-  /*int ant = (int)(bidx % NANT);
-  int time_chan = (int)(bidx / NANT);    
-  int oidx = time_chan*2048+pol*1024+ant*16+chunnel;*/
-
-  int chan = (int)(bidx % 48);
-  int time_ant = (int)(bidx / 48);
-  int tim = (int)(time_ant / NANT);
-  int ant = (int)(time_ant % NANT);
-  int oidx = tim*98304 + chan*2048 + pol*1024 + ant*16 + chunnel;
-
-  //inr[oidx] = __float2half((float)(((char)((input[iidx] & 15) << 4)) >> 4));
-  //ini[oidx] = __float2half((float)(((char)((input[iidx] & 240))) >> 4));
-  inr[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(15)) << 4) >> 4));
-  ini[oidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4));
-
-}
-
-// 16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels
-// for first time, launch with 3072, 32
-__global__ void printer(half *inr, half *ini) {
-
-  int idx = blockIdx.x*32+threadIdx.x;
-  float ir = __half2float(inr[idx]);
-  float ii = __half2float(ini[idx]);
-
-  int chunnel = (int)(threadIdx.x % 16);
-  int channel = (int)(blockIdx.x/64);
-  int tt = (int)(blockIdx.x % 64);
-  int pol = (int)(tt/32);
-  int ant = ((int)(tt % 32))*((int)(threadIdx.x / 16));
-  
-  if (ir!=0. || ii!=0.) {
-    printf("%d %d %d %d %f %f\n",channel,pol,ant,chunnel,ir,ii);
-  }
-  
-}
-
-
-// kernel for beamforming
-/*
-
-Assumes that up to NANT antennas (nominally 63) are populated. 
-
-Input is [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] (promoted)
-
-Arithmetic... for rotation, d2r = wr*dr-wi*di; d2i = wi*dr+wr*di
-
-Conventions for beamforming. beam 0 is furthest East, beam 127 is at meridian. antpos (D) is easting. 
-for bf weight calculation, where theta = s(127-n), ang = 2*pi*nu*theta*D/c; wr = cos(ang), wi = sin(ang)
-use __float2int_rn, cosf, sinf intrinsics. 
-
-Each warp (==block) has to deal with 256 beams for 64 ants, summing over 16 chunnels and pols. 
-Do it in tiles of 16 beams and 16 ants for 
-
-Output array has order [beam, 48 frequency, 2 pol, 16 time]
-
-inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag
-wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant]
-
-launch with 16time * 48freq * 2pol * 16beam_tile blocks of 32 threads for massive utilization
- = 24576 blocks
-
-*/
-__global__ void beamformer(half *inr, half *ini, half *wr, half *wi, float *output, int stuffants) {
-
-  // get block and thread ids
-  int bidx = blockIdx.x; // assume 24576
-  int tidx = threadIdx.x; // assume 32
-  int orig_bidx = (int)(bidx / 16);
-  int beam_tile = (int)(bidx % 16);
-  int stuff_tile = (int)(beam_tile % 4);
-  int data_offset = orig_bidx*1024; // offset for first part of data
-  int weight_offset = (int)(orig_bidx % 96); // offset for first part of weight
-  weight_offset *= 16384;
-  int idx1, idx2;
-  int f_idx = (int)(orig_bidx % 96);
-  int tim_idx = (int)(orig_bidx / 96);
-  int oidx = f_idx*16 + tim_idx;
-  
-  // shared memory for convenience
-  __shared__ half summr[16][16]; // beam, chunnel
-  __shared__ float summi[16][16]; // beam, chunnel
-  
-  // accumulate real and imag parts into [16 beam x 16 f] fragments
-  // Declare the fragments.
-  wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;
-  wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wr_inr_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wr_ini_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wi_inr_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, half> wi_ini_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, float> ib_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, float> final_frag;
-  
-  
-  // zero out accumulators
-  wmma::fill_fragment(wr_inr_frag, 0.0f);
-  wmma::fill_fragment(wr_ini_frag, 0.0f);
-  wmma::fill_fragment(wi_inr_frag, 0.0f);
-  wmma::fill_fragment(wi_ini_frag, 0.0f);
-  wmma::fill_fragment(ib_frag, 0.0f);
-
-  // IB
-  if (stuffants==2) {
-
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> c_frag;
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> d_frag;
-    
-    for (int ant_tile=0; ant_tile<4; ant_tile++) {
-
-      wmma::load_matrix_sync(c_frag, inr + data_offset + ant_tile*256, 16);
-      wmma::load_matrix_sync(d_frag, inr + data_offset + ant_tile*256, 16);
-      wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag);
-      wmma::load_matrix_sync(c_frag, ini + data_offset + ant_tile*256, 16);
-      wmma::load_matrix_sync(d_frag, ini + data_offset + ant_tile*256, 16);
-      wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag);
-
-    }
-
-  }
-
-  // one ant per beam
-  if (stuffants==1) {        
-
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> c_frag;
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> d_frag;
-    wmma::load_matrix_sync(c_frag, inr + data_offset + stuff_tile*256, 16);
-    wmma::load_matrix_sync(d_frag, inr + data_offset + stuff_tile*256, 16);
-    wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag);
-    wmma::load_matrix_sync(c_frag, ini + data_offset + stuff_tile*256, 16);
-    wmma::load_matrix_sync(d_frag, ini + data_offset + stuff_tile*256, 16);
-    wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag);
-    
-  }
-  if (stuffants!=1) {
-  
-    // loop over ant tiles
-    for (int ant_tile=0; ant_tile<4; ant_tile++) {
-      
-      // copy weight and data to fragments, and multiply to accumulators
-      
-      wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16);
-      wmma::load_matrix_sync(b_frag, inr + data_offset + ant_tile*256, 16);
-      wmma::mma_sync(wr_inr_frag, a_frag, b_frag, wr_inr_frag);
-      
-      wmma::load_matrix_sync(a_frag, wi + weight_offset + beam_tile*1024 + ant_tile*256, 16);
-      wmma::mma_sync(wi_inr_frag, a_frag, b_frag, wi_inr_frag);
-      
-      wmma::load_matrix_sync(b_frag, ini + data_offset + ant_tile*256, 16);
-      wmma::mma_sync(wi_ini_frag, a_frag, b_frag, wi_ini_frag);
-      
-      wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16);
-      wmma::mma_sync(wr_ini_frag, a_frag, b_frag, wr_ini_frag);
-      
-    }
-
-    // form real and imaginary matrices
-    for(int i=0; i < wr_inr_frag.num_elements; i++) {
-      wr_inr_frag.x[i] = wr_inr_frag.x[i] - wi_ini_frag.x[i]; // output real
-      wi_inr_frag.x[i] = wi_inr_frag.x[i] + wr_ini_frag.x[i]; // output imag
-      wr_inr_frag.x[i] = wr_inr_frag.x[i]*wr_inr_frag.x[i] + wi_inr_frag.x[i]*wi_inr_frag.x[i]; // squared
-    }
-  }
-
-  // at this stage the matrices are [beam, chunnel], and need to be summed over columns
-
-  __syncthreads();
-    
-  // copy back to shared mem
-  half *p1;
-  float *p2, tmp;
-  p1 = &summr[0][0];
-  wmma::store_matrix_sync(p1, wr_inr_frag, 16, wmma::mem_row_major);
-
-  __syncthreads();
-  
-  if (stuffants!=1) {
-
-      // now do thread reduction using multiplication by unity
-    wmma::fill_fragment(final_frag, 0.0f);
-    wmma::fill_fragment(b_frag, 1.0f);
-    wmma::load_matrix_sync(a_frag, p1, 16);
-    wmma::mma_sync(final_frag, a_frag, b_frag, final_frag);
-    p2 = &summi[0][0];
-    wmma::store_matrix_sync(p2, final_frag, 16, wmma::mem_row_major);
-    
-    __syncthreads();
-
-    // store
-    if (tidx<16) {
-      output[(beam_tile*16+tidx)*1536 + oidx] = summi[tidx][tidx];
-    }
-
-    
-    // do thread reduction for each beam    
-    /*    if (tidx<8) {
-      for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+8];
-      for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+4];
-      for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+2];
-      for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+1];
-    }
-    if (tidx>=8 && tidx<16) {
-      for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+8-8];
-      for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+4-8];
-      for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+2-8];
-      for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+1-8];
-    }
-    if (tidx>=16 && tidx<24) {
-      for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+8-16];
-      for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+4-16];
-      for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+2-16];
-      for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+1-16];
-    }
-    if (tidx>=24) {
-      for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+8-24];
-      for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+4-24];
-      for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+2-24];
-      for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+1-24];
-      }*/
-
-    /*if (tidx<16) 
-      for (int j=1;j<16;j++) summr[tidx][0] += summr[tidx][j];
-
-      __syncthreads();*/
-    
-    // now summr[beam][0] can go into output
-    /*if (tidx<16) {
-      output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][0];
-      }*/
-
-  }
-
-  if (stuffants==1) {
-    if (tidx<16) {
-      output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][tidx];
-    }
-  }
-  if (stuffants==2) {
-
-    p2 = &summi[0][0];
-    wmma::store_matrix_sync(p2, ib_frag, 16, wmma::mem_row_major);      
-    tmp = 0.;
-    for (int i=0;i<16;i++) tmp += summi[i][i];
-    if (tidx==0 && beam_tile==0) 
-      output[(beam_tile*16+tidx)*1536 + oidx] = tmp;
-
-  }      
-  
-}
-
-// kernel to calculate weights - needed because weights are halfs
-// launch with 256 threads in 6144 blocks
-__global__
-void calc_weights(float *antpos, float *weights, float *freqs, half *wr, half *wi) {
-
-  // assume 256 threads in 6144 blocks
-  int bidx = blockIdx.x; // over 48f, 2pol, 16 beam_tile, 4 ant_tile
-  int tidx = threadIdx.x;
-  int f = (int)(bidx / 128);
-  int cc = (int)(bidx % 128);
-  int pol = (int)(cc / 64);
-  cc = (int)(cc % 64);
-  int beam_tile = (int)(cc / 4);
-  int ant_tile = (int)(cc % 4);
-  int beam_i = (int)(tidx / 16);
-  int ant_i = (int)(tidx % 16);
-
-  int beam = beam_tile*16+beam_i;
-  int ant = ant_tile*16+ant_i;
-  int i = bidx*256+tidx;
-  int widx = ant*NW*2*2 + f*2*2 + pol*2;
-  
-  //float theta = sep*(127.-beam*1.)*PI/10800.; // radians
-  float theta = sep*(127.-beam*1.)*PI/10800.; // radians
-  float afac = -2.*PI*freqs[f*8+4]*sinf(theta)/CVAC; // factor for rotate
-  float twr = cos(afac*antpos[ant]);
-  float twi = sin(afac*antpos[ant]);
-
-  wr[i] = __float2half((twr*weights[widx] - twi*weights[widx+1]));
-  wi[i] = __float2half((twi*weights[widx] + twr*weights[widx+1]));
-  
-  
-}  
- 
-  
-// function prototypes
-int dada_bind_thread_to_core (int core);
-int init_weights(char *fnam, float *antpos, float *weights, char *flagants);
-void reorder_block(char *block);
-void calc_bp(float *data, float *bp, int pr);
-
-
-// performs massive summation to calculate bp
-// input array has order [beam, 96 frequency, 16 time]
-// bp has size 48 - no way to avoid strided memory access
-// returns factor to correct data
-void calc_bp(float *data, float *bp, int pr) {
-
-  int i=0;
-  
-  for (int b=0;b<256;b++) {
-    for (int f=0;f<48;f++) {
-      for (int a=0;a<32;a++) {
-	bp[b] += data[i];
-	if (pr && data[i]!=0.) printf("%d %d %d %f\n",b,f,a,data[i]);
-	i++;
-      }
-    }
-  }
-
-}
-
-// for finding median of bandpass
-
-int cmpfunc(const void* elem1, const void* elem2)
-{
-  if(*(const float*)elem1 < *(const float*)elem2)
-    return -1;
-  return *(const float*)elem1 > *(const float*)elem2;
-}
-
-void ret_med_bp(float *bp) {
-
-  qsort(bp, 256, sizeof(float), cmpfunc);
-  float medval = 0.5*(bp[127]+bp[128]);
-  for (int i=0;i<256;i++)
-    bp[i] = medval;  
-
-}
-
-// performs cpu reorder of block to be loaded to GPU
-void reorder_block(char * block) {
-
-  // from [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i]
-  // to [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i]
-  // 24576*NANT in total. 1536*NANT per time
-  
-  char * output = (char *)malloc(sizeof(char)*24576*NANT);
-  
-  for (int i=0;i<16;i++) { // over time
-    for (int j=0;j<NANT;j++) { // over ants
-      for (int k=0;k<48;k++) { // over channels
-
-	// copy 32 bytes
-	memcpy(output + i*1536*NANT + k*NANT*32 + j*32, block + i*1536*NANT + j*1536 + k*32, 32); 
-	
-      }
-    }
-  }
-
-  memcpy(block,output,24576*NANT);
-  free(output);
-
-}
-
-
-// loads in weights
-int init_weights(char * fnam, float *antpos, float *weights, char *flagants) {
-
-  // assumes 64 antennas
-  // antpos: takes only easting
-  // weights: takes [ant, NW==48] 
-
-  FILE *fin;
-  FILE *fants;
-  
-  if (!(fin=fopen(fnam,"rb"))) {
-    syslog(LOG_ERR,"Couldn't open weights file %s",fnam);
-    return 1;
-  }
-  if (!(fants=fopen(flagants,"r"))) {
-    syslog(LOG_ERR,"Couldn't open flag ants file %s",flagants);
-    return 1;
-  }
-
-  fread(antpos,64*sizeof(float),1,fin);
-  fread(weights,64*NW*2*2*sizeof(float),1,fin);
-  float wnorm;
-  for (int i=0;i<64*NW*2;i++) {
-    wnorm = sqrt(weights[2*i]*weights[2*i] + weights[2*i+1]*weights[2*i+1]);
-    if (wnorm!=0.0) {
-      weights[2*i] /= wnorm*wnorm;
-      weights[2*i+1] /= wnorm*wnorm;
-    }
-  }
-	
-
-  int ant;
-  while (!feof(fants)) {
-    fscanf(fants,"%d\n",&ant);
-    for (int j=0;j<NW*2*2;j++) {
-      weights[ant*NW*2*2+j] = 0.0;
-    }
-  }
-      
-  fclose(fants);
-  fclose(fin);
-  if (DEBUG) syslog(LOG_INFO,"Loaded antenna positions and weights");
-  return 0;
-
-}
-
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_beamformer [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -f filename for antenna stuff [no default]\n"
-	   " -i input data set [no default]\n"
-	   " -z fch1 in MHz [default 1530]\n"
-	   " -a flagants file\n"
-	   " -s stuffants \n"
-	   " -o out beam [default 1]\n"
-	   " -q do incoherent beam \n"
-	   " -t test pattern \n"
-	   " -p output total power time series \n"
-	   " -h print usage\n");
-}
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_beamformer_offline", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-
-  cudaSetDevice(0);
-  
-  // command line arguments
-  int core = -1;
-  int arg = 0;
-  int stuffants=0;
-  int test_pattern = 0;
-  float fch1 = 1530.0;
-  char * fnam;
-  fnam=(char *)malloc(sizeof(char)*100);
-  sprintf(fnam,"nofile");  
-  char * finnam;
-  finnam=(char *)malloc(sizeof(char)*100);
-  sprintf(finnam,"nofile");
-  char * flagants;
-  flagants=(char *)malloc(sizeof(char)*100);
-  sprintf(flagants,"nofile");
-  int outbm = 1;
-  int outpwr = 0;
-
-  while ((arg=getopt(argc,argv,"c:f:i:z:a:o:ptsqdh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      strcpy(finnam,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'f':
-	  if (optarg)
-	    {
-	      strcpy(fnam,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-f flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'p':
-	  outpwr=1;
-	  break;
-	case 'o':
-	  if (optarg)
-	    {
-	      outbm = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'a':
-	  if (optarg)
-	    {
-	      strcpy(flagants,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-a flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'z':
-	  if (optarg)
-	    {
-	      fch1 = atof(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-z flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 't':
-	  test_pattern=1;
-	  syslog (LOG_INFO, "Will execute test pattern");
-	  break;
-	case 's':
-	  stuffants=1;
-	  syslog (LOG_INFO, "Will place antennas in output");
-	  break;
-	case 'q':
-	  stuffants=2;
-	  syslog (LOG_INFO, "Will place IB in output");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // print stuff
-  syslog(LOG_INFO,"Forming 256 beams with sep %g arcmin, fch1 %g",sep,fch1);
-  syslog(LOG_INFO,"Using calibrations file %s",fnam);
-  syslog(LOG_INFO,"Using flagants file %s",flagants);
-  syslog(LOG_INFO,"Input file %s",finnam);
-  
-
-  // load in weights and antpos
-  float * antpos = (float *)malloc(sizeof(float)*64); // easting
-  float * weights = (float *)malloc(sizeof(float)*64*NW*2*2); // complex weights [ant, NW, pol, r/i]
-  float * freqs = (float *)malloc(sizeof(float)*384); // freq
-  for (int i=0;i<384;i++) freqs[i] = (fch1 - i*250./8192.)*1e6;  
-  
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-  
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = 198180864;
-  uint64_t block_out = 15*48*512*256;
-  char * block;
-  block = (char *)malloc(sizeof(char)*block_size);
-  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
-  int nints = NPACKETS / 16;
-  uint64_t nbytes_per_int = block_size / nints;
-  uint64_t nbytes_per_out = block_out / nints;  
-  unsigned char * output_buffer;
-  output_buffer = (unsigned char *)malloc(sizeof(unsigned char)*block_out);
-  memset(output_buffer,0,block_out);
-  
-  // allocate host and device memory for calculations
-  //inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag
-  //wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant]        
-  char *d_indata[NSTREAMS];
-  unsigned char *d_outdata[NSTREAMS];
-  float *d_transfer[NSTREAMS], *d_bp, *d_antpos, *d_weights, *d_freqs;
-  half *d_wr, *d_wi, *d_inr[NSTREAMS], *d_ini[NSTREAMS];
-  cudaMalloc((void **)&d_antpos, 64*sizeof(float)); // ant positions
-  cudaMalloc((void **)&d_weights, 64*NW*2*2*sizeof(float)); // weights
-  cudaMalloc((void **)&d_freqs, 384*sizeof(float)); // freqs        
-  cudaMalloc((void **)&d_bp, 256*sizeof(float)); // bandpass
-  cudaMalloc((void **)&d_wr, 48*2*16*4*16*16*sizeof(half)); // real weight
-  cudaMalloc((void **)&d_wi, 48*2*16*4*16*16*sizeof(half)); // imag weight
-  cudaMemcpy(d_freqs, freqs, 384*sizeof(float), cudaMemcpyHostToDevice);
-  
-  float *h_transfer = (float *)malloc(sizeof(float)*256*96*16*NSTREAMS);
-  char *h_indata = (char *)malloc(sizeof(char)*16*NANT*96*8*2);
-  float *bp = (float *)malloc(sizeof(float)*256);
-  unsigned char *tmp_buf = (unsigned char *)malloc(sizeof(unsigned char)*256*48*4*NSTREAMS);  
-  
-  // streams and device  
-  cudaStream_t stream[NSTREAMS];
-  for (int st=0;st<NSTREAMS;st++) {
-    cudaStreamCreate(&stream[st]);
-    cudaMalloc((void **)&d_indata[st], 16*96*NANT*8*2*sizeof(char)); // data input to bf kernel
-    cudaMalloc((void **)&d_outdata[st], 256*48*4*sizeof(unsigned char)); // data output from adder
-    cudaMalloc((void **)&d_transfer[st], 256*96*16*sizeof(float)); // output from beamformer
-    cudaMalloc((void **)&d_inr[st], 16*48*2*64*16*sizeof(half)); // real data
-    cudaMalloc((void **)&d_ini[st], 16*48*2*64*16*sizeof(half)); // real data
-    thrust::device_ptr<half> d1(d_inr[st]);
-    thrust::fill(d1, d1+16*48*2*64*16, 0.0);
-    thrust::device_ptr<half> d2(d_ini[st]);
-    thrust::fill(d2, d2+16*48*2*64*16, 0.0);
-  }
-
-    
-  // set up
-
-  int observation_complete=0;
-  int blocks = 0, started = 0;
-  int blockct = 0;
-  
-  syslog(LOG_INFO, "starting observation");
-
-  // init weights
-  init_weights(fnam,antpos,weights,flagants);
-  cudaMemcpy(d_antpos, antpos, 64*sizeof(float), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_weights, weights, 64*NW*2*2*sizeof(float), cudaMemcpyHostToDevice);  
-  calc_weights<<<6144, 256>>>(d_antpos, d_weights, d_freqs, d_wr, d_wi);
-  if (DEBUG) syslog(LOG_INFO,"Finished with weights");
-
-  // open data file and read first block
-  FILE *fin;
-  fin=fopen(finnam,"rb");
-  fread(block,sizeof(char),block_size,fin);
-  fclose(fin);
-  
-  // calculate bp
-  for (int i=0;i<256;i++) bp[i] = 0.;
-      
-  // loop over ints
-  for (int bst=0;bst<nints/NSTREAMS;bst++) {
-    
-    for (int st=0;st<NSTREAMS;st++) {
-
-      cudaMemcpyAsync(d_indata[st], block+(bst*NSTREAMS+st)*nbytes_per_int, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
-
-      // do promotion
-      promoter<<<16*48*NANT, 32, 0, stream[st]>>>(d_indata[st], d_inr[st], d_ini[st]);
-	  
-      // run beamformer kernel
-      beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants);
-	  
-      // copy back to host
-      cudaMemcpyAsync(h_transfer + st*256*96*16, d_transfer[st], sizeof(float)*393216, cudaMemcpyDeviceToHost, stream[st]);	
-
-      calc_bp(h_transfer + st*256*96*16,bp,0);
-      ret_med_bp(bp);
-
-    }
-  }
-
-
-  // adjust bandpass
-  syslog(LOG_INFO,"Final BP...");
-  for (int i=0;i<256;i++) {
-    //syslog(LOG_INFO,"coeff %d %g",i,bp[i]);
-    if (bp[i]!=0.) {
-      bp[i] /= 48.*nints; 
-      bp[i] = 2.5*128./bp[i];
-    }
-  }
-  cudaMemcpy(d_bp, bp, sizeof(float)*256, cudaMemcpyHostToDevice);
-
-  // open data file and read first block
-  fin=fopen(finnam,"rb");
-
-  // re-open file and loop over blocks
-  while (blocks<15) {
-
-    syslog(LOG_INFO,"read blocks %d",blocks);
-    fread(block,sizeof(char),block_size,fin);
-  
-    // loop over ints
-    for (int bst=0;bst<nints/NSTREAMS;bst++) {
-
-      for (int st=0;st<NSTREAMS;st++) {
-
-	// copy to device
-	cudaMemcpyAsync(d_indata[st], block+(bst*NSTREAMS+st)*nbytes_per_int, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
-
-	// do promotion
-	promoter<<<16*48*NANT, 32, 0, stream[st]>>>(d_indata[st], d_inr[st], d_ini[st]);
-	  
-	// run beamformer kernel
-	beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants);
-	  	  
-	// run adder kernel
-	adder<<<12288, 32, 0, stream[st]>>>(d_transfer[st], d_outdata[st], d_bp);
-	  
-	// copy to host
-	cudaMemcpyAsync(tmp_buf + 256*48*4*st, d_outdata[st], 256*48*4*sizeof(unsigned char), cudaMemcpyDeviceToHost, stream[st]);
-
-	// copy to output
-	for (int jj=0;jj<4;jj++) {
-	  for (int bmn=0;bmn<256;bmn++) {
-	    for (int j=0;j<48;j++) {
-	      output_buffer[blocks*512*48*256 + (bst*NSTREAMS+st)*48*4*256+ jj*48*256 + bmn*48 + j] = tmp_buf[256*48*4*st + jj*256*48 + bmn*48 + j];
-	    }
-	  }
-	}	
-	
-      }
-    }
-
-    blocks++;
-
-  }
-
-  syslog(LOG_INFO,"blocks %d",blocks);
-  
-  fclose(fin);
-
-  float pwrs = 0;
-  if (!outpwr) { 
-    fin=fopen("/home/ubuntu/data/tmp/output.dat","wb");  
-    for (int i=0;i<8192;i++) 
-      fwrite(output_buffer + i*48*256 + outbm*48,sizeof(unsigned char),48,fin);
-    fclose(fin);
-  }
-  else {
-    fin=fopen("/home/ubuntu/data/tmp/output.dat","w");
-    for (int i=0;i<15*512;i++) {
-      for (int j=0;j<256;j++) {
-	pwrs = 0.;
-	for (int k=0;k<48;k++) pwrs += (float)(output_buffer[i*256*48 + j*48 + k]);
-	fprintf(fin,"%f\n",pwrs);
-      }
-    }
-    fclose(fin);
-  }
-   
-  
-
-  for (int st=0;st<NSTREAMS;st++) {
-    cudaStreamDestroy(stream[st]);
-    cudaFree(d_indata[st]);
-    cudaFree(d_outdata[st]);
-    cudaFree(d_transfer[st]);
-    cudaFree(d_inr[st]);
-    cudaFree(d_ini[st]);
-  }  
-
-  
-  //  free(block);
-  
-  
-  free(fnam);
-  free(flagants);
-  free(h_indata);
-  free(output_buffer);
-  free(antpos);
-  free(weights);
-  free(freqs);
-  free(bp);
-  free(h_transfer);
-  free(tmp_buf);  
-  free(finnam);            
-  cudaFree(d_wr);
-  cudaFree(d_wi);
-  cudaFree(d_antpos);
-  cudaFree(d_freqs);
-  cudaFree(d_weights);
-  cudaFree(d_bp);
-  
-}
-
-
diff --git a/src/dsaX_beamformer_passon b/src/dsaX_beamformer_passon
deleted file mode 100755
index b08ed99873c198055c7e078c5d1cf0100e9af070..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 178600
zcmd3Pd0<r4_5TDCh=@*5w5V93MjKoc6(v>bL^Aq<K_iQ@nm`DPvczNp!GZ=SQKn<s
zXl;uH3s!6WTCGd14G|%%iF@2ZP@+<O&rpfr5){qv^SR5LJK>F6`^PVuym!vM=iGD7
zJ@?#mml<vf7F^OJDXFJV{q*qt!k2{i>X89~k$-)@nUC}kF#iO6Lw$$f?_l34zFvS+
z1db&v3&cG;$qYaGnJPga8%)OU&>j*ov`0X^=;uI*_UWf$BV=QHv7dl?mCq!_P(OXX
zbWu)v%KE@C+28}irV6}%+KTB+Euf-fZPK+Wx>iM}pXo||{nX^tjlW}5`IsI7K!gK)
z`0~Wx*pqenQ~2-DG`_bUEeTtXzF6S(bCQx%KQ-Nzprf4M_a{wN9HrVzWS4PDe*M(_
zaMtWuQ_nm1tl87fm_2K5Wz`u~L(e<oymQa2n0MycLT~bC-AgYYD_SBO%9oB`%G@8n
z^Z}MvrUfTt&g(ocby3#bZ7-&5dgk5QJ6JXczohGp-&HBT^t7bDy&mkB?5pnKOJZ3$
zek<{tdHihyH@#ndT+5O^udFC}dh<Vaob!6qKfXBP`ExJp$k`hmc=o0*j(GgfUmQ`D
zf8CRVgRd8WB;n5ou<sEbr9}|L3%?OcdEs{^!0$`I|CI#zH3|4^O@MELalPdE96EX7
zm%+GR`1J|!_oGw2>Mu*c|EK6kuloN;z<+51y=Euie|ZA_1qt|PC8&R9g8uzy0(rs-
z`0q^6k9Gn+gA?@E>ID3Y6YyD*06#AQpMDAC{AmI{7bmFSkiZ`fB&eU9p#Izh?H-??
zU!F_g54{uUJ1hbJ)CBlF3FMrgfd9P-<asy&|KB9Q`xCVL>je0pB=F~>67WAff&5ts
z@XsaS^UnnB-kQJ;rzha^%LID;0({c(+x_RO1b+1dDtg74-U<AtC4s(wNFe9Q3Gg!#
z*x_#p+Fg-=&ru2Fc`t!p?<8o~pMZa50{n~w?M_XApOOIoRRVbyB#3W|63Fvv0)H5p
z0AHK{|6&4pu1wJG-URrQ66l+fzz<JNQ2$8uV>*7j|Ex~HXJ7(8S0|`{W&-?Q65z)t
z(D$MQ_Ph~-dyTKl67VTY;Q#j|(CeZE_0K}P>D`S6cyIzaM<u8~IzhV=65#y_@JvSg
z`HuIk8a`D>rSttfVTZ$g#YwpWe*xs<Pc8I10rekH^~XuH?_-7c<w-soe<a2W>z66~
z09F5X)ox}`)<-q*Gfa)I^pgV$BOgIkzgY3nawb7e%70-uKEF`)0}8L}Z-bod_m0eX
z{_7P!>(qF>59@-G%4t)|LtG1FXP=qnE6N`;vS?aqdFhN<6`|7dF(Zf1o;SC2%#^9K
zOMOK}Gv>^jTT~GOK~a&fXvF-YQL5hX*;6ViN-HF}Co$U@9V#!KGAFOJB2+$ap^unL
z%1cYd%cZ5Ef+-cDV0rnxa)B5*W%lfOCE`_*4F_$ALXb|kdr9TolF+Pqb49(p()qJW
zO2H{N6e^!JwGy>ix3IMIy4-HuQAyU8ZI#X`DO;#mkDaUKzhp}2;#qU2lq*rul*zk7
z$`;4SWjlOc<=l{BSU9h2IFuPaZ|?M2Gb+ocxbmX5&=>m6nma?Va~Twt&nqh}56z-Y
z<21f%N_jVoD`s%+G~Hxn#ms}-jA!JMiH4;$MlYOOGP8W%+*vocy&@#Me99b&D3}5(
z%)G3$d~WG%!*sb7Afz{yg=V6grqP|?OGTyS<#Xp1p&zBvvn$XQtX2`4R&v^DtaYBR
zXu_DQF3-+_a8*@Prp}t5oqayu=}$$Wg=M8hv*u2p7o=WQQ;JGp$D+!)rB!96C81Ji
zJK^%tp`o+pRTNE~Qc-$t{@huiyzJn7)SgjQRRjZ8%$qx9_N>stqWKV~dqZcV(TZ6f
zZB3XqyO2!S<WzY0Tr?ainF$V~LnMNd^A;4%E}c6gG?O~cD=)mPMAa8=3r8&~ttu%k
z6MZeD%s!_aE-`Bk3P>>g?BJ-&AUt)gs>;sJJ{ua$D#75QFXWeu4rOPT%$!nQ6e^!G
zD^!tR5*$4Re}b?Cb?(M3?`)wo<t?2%&6W1@%MPk}?t+<9LacnS<%SE(O&F7Z5Rp=C
zM&*?9X+`CwvrA!Y$>E2JFMEe=|4>SJ$VjwZHhW4*X>iUQ7<|r;6CnqDrj(VzPC>~;
zbhGGEFof<oZU@hsfgb7BztRX(O0KV*RW70tY(5=Ptb2d|Q2i)Yi3oAQQFG>Skj#<e
zqvHDV&~(BxO5h$MQ2UC;&RsAI9X+bFq7ow$g(Ye*!GX#U%Zf^7UgxWrIcG*Gqfi-q
zDl}c<<hyS=!pmIWJUA$V1_lblF6vKV^;s43N^)|FDk@5*%$@G5KyVDr1{!g{<T^4b
znm%RLY}O{e0`5=}A~Y@JgZv_D!;7a?mVt(|0wxPd5GoJNo907VXx?n!^wN3LeH_mS
z@<3rQp*kHHW2b7CA;x?q2L+xzZ-#G9>6{8^1seR%M@dB)X)6|1pjapgt)Usqpdair
zzhbUZyQpZoX^hacd6gk@WcZ(6GJ78UdX6g-2EC8;Gv@fF-cVXTj{=I2G?gKZ#+y<;
z1Fc}XEcc<!%30HV)1}d-FDRdd=?9K61vu6Coa=-%v}C#NBLId#U`bvmmpxKc#IB{~
zVZ=F8X3b@9%q#Oj*Ah%8gxJIMKi{PV`4<l_I{VDC&m7{rbolV1a}a?AZrnJ$pX(bw
zZd~rg`9;}ho@1iVJUiQ^JlB0wRlx0h7nf}mgB^GKdSZ8w|Md`mQ?O@9s2Be0t={-d
z6?j=k;#n&dzr9h?Q<ReC5Jl5NU?iVZUmsu(_4O58`l5t8lB_LTP5}?r=q30ON10Nw
z%Pnh3N!T`9_W2IOU(y_|%4L0Wm7FD<gc`lUB}wo}!jlm9NRxcak)w0JG#$T{$Id#$
z2VO5AAL9?p|7MgIp0e;zwz<tHXR4{`+s@dIl}|5F$)6-hKOeao%LDig4ZZ`-ruyCm
zJ#nu-aviw$_Weh}*LVH_wNreZ2LCG*eLCt(`Fa}i_42hS`B3cZKWo-uKG?Q5S|R@<
z1$?NlpMrlP;6A?N6ny9HzlWI)@tv$-$v;`}Cw=G`+M|ar3+4RjxM-?)(zYS-CjWFA
z_!$OX=f4m!uD;ISiRWIj{?U0l@$)p2PnAa#UuNKSo=*Jr27W+5%1JZozF={>frnY$
zpG*Uf4s?IA47|?ySvb_dvu*toFz~dI{;>?a>RM4$XyBEJ1%8}?M+dk+#Rk5Qi^chn
zfro>-KV=5KuZ#8hsti0F#{H=_@Q1rtpKqyw_Z#?n15ba^Kg$e!KaIq5m4P?cvkw?}
zuGRI=qXwR57W${bz#pZNcs3dMqYZqEf&YntZ#D3Gt;gc62L2eMew%@>Fz~j4Ki0r^
z82IB1e5ZleXKbQ4H|Dp;8}(BS{7(&hnt>l=;0GA^bOWDm;7>5{nFjtu1D|E!PcraB
z4Sa@y4;c894ZLOGPciU?2L5LTew=~NH1Nd+ez1X`Y2Z&a@MQ-6Gy`8{;7>R3)dv0y
z1HaV3pK0Lh4LtWs^v^N_pQVv_t}^i12L1s9f3|^t)WDx(;2R7)XKww|WZ;KrB%Un>
z{yYQUYT(Z|@LLW1Py^p);BySTZQy@y;5!Wb1qQy;zz;L<zKdi2&pl}UlWO2E(nvhh
z41B=A4>0h#20q=uUu@tr4g7EepJm|l4E#_7A2jd*1AmEuw+#HH2ENe1TLyldfzLPa
z#Rh(afuCvMFEj9E2Hre)tupW<jr!FF{&EAq)WBb1;Oh;1p@Cm!;IB0Bs|-B%3iZze
z27a_g;`yk7A7kJf47@zymW53Q{wkw>i-BKY;9Cv+)dqg6f!BM3EN(OK<Bj^ZfuCsL
zI}H3J1K(-jCmVR*@R<K!W8hN_e35}qGw{U*et?0WV&KyaylhDpW*YbsqkfiwpJw2P
z8u(HJA29IK4ZLOGXBzlI1Anc7A7|jNGw{U*o@dhfXQqL#(nvh5@a|MAno>9LM4wgL
z6iTsqpk{?PrEZj!x`y10qOOx`@Yg>mfH%TILZ__@KPTTnm?@;wD&TU$JqR}m_*%j}
z2|p^}sf3v-I;#XcfiP1<r(VFL2tx>;Q!U^TgqadLWdgpKa4O+q0sovZQ$VLsz-JR?
z>gNOmd@5n4d`^~tPbAD#&q){Xv4ol8IcWkuf-qA%$0y)EgqhMg9p3<Pau32x<(xJF
ze|a2Wrf^QHfcFw+>gF^F_#?th*_=lO{2pPZYR)PFzeSiSno}>}R|qpTbE*aWEMcZ(
zPMLr=5N0ao6brbKFjFw6P{5B9W{Twm1pFXjrdCdtfd5RGDV38h;6D;(D&?dJ_zuEM
zp&Xxpe?^$7lhg4v^}m^LI^i|}-$0nDlG7^Sa>6GPZW8ddgij*;sDP&uW=iC&67U4V
zOog0!0gobl3gK!2k08v{$0-x=#e_2n7Yq33gqi9%g#tdCFjE{SAmCF8GqrKD1biZ4
zrZi5vfR822RK`gY@DYTW!Z<zw_aV&G#pyU8`kydU7N<?XUmgpXsfyDo;Jt*IqBu<g
z{)jMB6X#I@zekuUiL*+;ZxLoH;?xWH6~atGoN56-OPHyLQzqaIgqd<U#R6_5%v8fE
z6!7DOnPNBr0Y6BXsfCjz;6D?-kZ`(y|45iAhLa}XI|wtiaC`#(6=9|nPRCcG{|N^O
zw+Z+L!b~BYRsok2zL;>6fUhMyobaOpo=TXhg0o7%69_X!aOwp-itr_bs|7rQFjE4j
zOu!ctwg?vs_~(R~0yu>NKAZ3e!T|xFN|;l=lO^C233IA<(gl1hVNUT*nt+cW%&Fb+
z3AhhoPU%j^m!kg(b1HY*1pMVOfH{RbtpeUlm{YgYB;bz-bINue74Un6IaNEW1pF3Z
zPSH-ifL|fZsoAL(@Uw(D^*Ut&-awdBu2U@FM#7wGok9UWPIv<0fPfz)%qiE&67Zi1
zbE<XH1^h?CoMN3c0pCHGQ>)_>@UIASN_9H+i~c8EOt?+JHxT9&>a+^DoG_<Or%Axq
z5-uV9sDP&u=2Ypd67U4VoFbii0goa)op7~)M-ZMtxJ<wo6Xpcu6btz0gl7>h6!6)E
zuO%E1@Z(3F=$l>-Sp1A$*MJ*~_GjqYYkjNV?V0PlR{Up0nuFQTSdnLQ8=z#@+EqmX
zpB33?MN;Sw_Ujl5m=)7kxAw{Pp(2S;Sw40un&RSs&;FaP*>Gs$nr%~1bBV5L^-Wwe
zP>7mW>zZZXQnPv#YM!BM9)M;E<-7tld+M4;Bqy%fG#WLx!B#@|)4ru<*_EjIh_1Oz
z$l0W<WJPx7u4aa?_95(2*$X?ny2edd{c~_*g<y6QEE%h_igm`K&iHFqk4{kMLDtDe
zoub@vtIJd+lM9%x{XbUfi%PlUb0@5>OHk_p*81QJ)S5VM^@9m2{gsv0qtdw5&8XrE
zJ3(p4=I&>myHN*n?@GYuFRZf=btrdIiZnoXx$k48F_egM|1?3Zds*vD)`Hv@B&hUf
zRyqWgVDE7W>fFOR|NdOaJ>RXPdQ5iC+P|^Rdenj3R&-d`(I@)s6Jgnj>#8pg_?!i=
z`;Z(~bAaVuoqJUy-*U(1j>(UFI(GD0=3bGn*ZmQ^v$tDIc2^;}n`T9mPv#_UMQSpb
z?PWLFw}YYmJQ`_y5LF}V*V1LOo7OQmv_D0mRktR?CU7ePQ(p8=0R(Gbxp6K_>U*M@
zckExHVq{x(Q{!eZ5iHh$y)V2m$$nJuxJ}eVg*W3W*ms~bdRs;ZnnR%zUs#F4oE4(M
zo9<Xg!GhWUd5V$3{s_XXV?1lF$=Hp@TaDtigReWp>!<3~_9GzWHsD%;uh}<u-t<{p
z0=<0G^CE#{L2pI!GScjJw6xEkcE;!rV8hj{yU*Y6^b8gbkW`?sl1bW}m%(|=*VwyP
zuCE~SL1d5p!lzwb;m)p*KWtA4cOJ7~S69a7wtl(3>Ha;hG*%s!>)Y#3OYwnq%rXr<
zz#=a)JHy)NpU^*Jx<B*Tz5ae<GE(hoIF1!9$Vh`da(kQBv`badfW}F>06j%R+apl4
zj(f^>I#gNbYhowoWfa=|1c*8W$JspqT9FM_BsnAeo*#0dNo)N_J*+R@3cuIWU(*vg
zE2W+AZy&_Zd=@Hng}+SN>mT2;*FRxDtTNPJ^AZwWRk%I;Wskl7Nr#<{`l`}4R2rA;
zuXzXQy{ffq^<s#|dPm55>%puI^@>v);6T2$)hLd<nY)@Z-rzSPd)F=^A}=Eq48EN1
zpF2AvwIDhWX6oNHJ<mTc<xnAB+1qSj$l(A+)+Ilc-3923G*R@;@mqI^=1(TP&;P;3
z>HhIsQ2r9=GhYffC*_Bm(tz7M-9M=X`1RBMWzT&)J=g!uR$w=Q;u$<&obI2{gy##O
zn)D1xGl_j3*k}Alug~=l4s_(@yi~q2sblabIdAxD8X!MR29nH=V8)YAgAi8q!VJ70
z|1-XqwC?qv+7#Xoi>4j++54jQ=P=wR{BDGSY*22-I}OwU1D*Say27N;H1f1c_M&ia
zxT%Nel>M;#sT&N^4wP<Dq@5tW@L!?RC7o!~iY7x0(a=^j^eW!jyRV?3mr=WaeeL$D
z3nDx1@MhdkYFaoTvaPW~d)Yd65`x7pI6ClkE<Q!#p9jA2PcD`oiP}#BYgfP|(I><h
z&5vxsgm3QxrXcc#J#jS~YJ33W!@qPr)>=gV{%L`1708Exw9l3dII{C2&17(|V9;eB
zD;W5fo;QHBUG`UNN$X!)MzXK|CE8^a$d7Cy*=#}fq5VFdtkxx({LtQ{$Z`PZM{i8C
z>Ym8pNU{)G_~eiM*^NNdHX%7_{1^1}*Sv?tB5T{ft`-sp4yPK=3+zp4_6cR8zQ3jx
zTrEUR!7=g<1zEedt1CZxQ<??45N)q7ml9neBv^!a^xiy0e#yb)Hwkjoa_N^SdOs>R
zwm^_roIy97wU5o+m}>tzN|A3$6&Y4d#wzM{gTO695QuDMgqUg1Bqm~8=;zMvtahPk
zE3(y!>=7+k>+RmOQmV3&RrgQ^uhNrIrd78hqY!{S=o2}Kq@&ha>YE?gY=5~&A~OZD
z3Ds<c$r2d2ko|_j3=o)PU^XetP=WDf`RvCPCg5UL5feQ(!-}rSATN;vHRABoXQx1t
zbv|D)yaS4?<3TM&EwgtE5Qm}!Q|-3_7S!FDQKodMDyX|HqZJXmpspsP15XQZ8VU;{
zi!!nbBCR7MUlc^9Wel~#P3Z*)>Vc7w=gBI4WMp%G<i&!>n+1_E8O3&g2v89DaIJ6E
zPki~|O{w<2kBQi`)&~pZhc~C%?+L^gaLEBcY)Q3W27>nXL0gM8ngn8}LTpU6A0Z-I
z*(t_4rsO6Fh}m#}&mMvWki80R+Lz(a+E?JXaKA1ro2+PsNBtDmFJt}VjQTI3enAxC
z=Z80>7UYb{NcY#Cia8kbQwKQ4d~Iapn}R5;P!Oe6qA)}Df9%IPC>{I(^JW%2PS`9^
z5V<d782}j0THn@#`N&d$!aHI3H1r#XAsx%Ua5>u#WgrKdlu?Mm3~pJRAqQA!D6Jr}
z*NVIoZc3uP?H=h=ZcPTAP4zr^Aw@|Qez7Kl9;Zs0D9z{g$1otgconB)6yo(hyyA_y
zJfM0OAjaVJPPd?mgPJ64jWuMhO)AuTK-I(RGWBYo51ZCDRi050t;wLfTF86o$YexE
z<#48OT73JIQxL+2`3^hLSKAdjx-liyG&TnIRnRE>b&|g(4CfYU0J_A|we0hS!Wi1w
zO{=RZR$j)?bUgQ2e`AM~St-5&17Mn(jA~%5=nC;Hi2OU4{rQOKnse#!mq%;P<tR`F
zgWFn>kL{<S$p}>f4SlGX4i-pT0VT&&?|?Qw1av`-qE7)6CE_8f$shvzv0{E?J%>y&
z>ZQh5bSe^iKeR#Au00J1w$0EoB6{b!S$NV_*cBt9w+X35VhR|n*fVSrue$H>s{4e>
zKZ;@t<9ZdJkvlUU5YO8r3}0*$<+o+n0uPR&L3A--uwuJ-<E!=>yz2gE1IrYFXbA8s
z!1f5CPGvDWoif4)DAYQx{pc8En+HcBBP;SmMm3%gZm9ro%P3<pcva!q^@Na3Js7G)
z3j^A_x~?hu-rZFEzv-q?7)$J?TVNBmod#vuP3$hSn`F1ikpp{ucQ<VXLwK#lE^5L%
zeMpR+$ZZ)7grPUP$mkN-=K#G8oy!_V50OcWViMpJRdv6Td4e7kBp_`iiFOUuqijXs
z0Y9L6oH7d+Q?QT*<;;0)>pvnK9OMDqQSkDBCg}k!$^#x%9?+231CDmOx+aVt5q(1R
zf*#6N-BUjM!>-k!Q`5x9ki<fOO||MU_uE+UEx})|hkB{?rBzp7jq20=gKF?#K0OYT
zQB$&&(^|RD%4w;5$EsV(5Q#*xmlfWEg6Aq5IDc6^XEPO8#!!<V{svRWJ=m@&=y{)r
z_u<*-Lw{{GP>q9h;8Kw+d%Iov5d}l;Xh$Z2s^*r#Z4Lf+jz`!du-rCpF2p#RKG
zH}nYo9A!wvnc=3L+_;tNk84sM6we7wECbwzvXuJ;j+ya<Rwyv$Zs@i#$y)Li2X<vx
zng&s706-N8nT-4kHnK1;&dmNCQ@GvmVON*)G>%NLR6o$02RmJ%3$s7BSD_k3Q!Ac~
z3}*wvXX7VWJ;tU^MDwNSEv_x8m9vsn-gK+LUzkvrqKXwg?3<JmeT_Gxo}{-9$@SS2
zc6W6NgD{SBX+O<6XP{W?SB|lkY~t#F;^f>(xs!9R$t~L0cxdnV)^nd`vvx8Xa86>|
zgAb}ef3e?(GENUvun$4i)i;8Q4(M}!j+6=Ov`HDrf2L&=V+sjo?<$C_$!J9>mR1|_
zXM4(>ZxWbi`v)WLih@~qe<-5~@0Zo(4T0@uXH*5VkrREz)Bt0p9z~q4it{2NBv`;y
z1CtkN3r0T8#eyQbW=AU|z@)mP3}zDj!=TDqjm)!bKLVkpiZQT#sGW|&|G{j9kq$=S
znF^jXAYME1<g0DLKzhPr^CAL9@7y6HpPaBT&V}?pXu=u?_KY?bpNK|<;(42dbyQN(
zP*_k!AJ&6_oLTV3*YB9HL_;vRm`?6O&m(&&{=O4dI+}rSnRr+T(P?;c!Ww{Qf(dKJ
zF7AeX@183DArscO_SBy+a`l|W=F?~}^pxl^m0GZ;WY7J8Ijb6WhU<zsYbJZin6rur
zd(K&%=p6PhYZ#rxt}atd0(^3CsIDb*RDA%(#hWP7S%)A2X%$Jdcc@NfD*_K#&so)q
z^`nZlkOt*q&bsphZzo7JX9XY-+P3hZ6J+5@Cm4!n0w<XI9=A=uw-aRj&z#_{e@iDg
zgZje>Iw&)oAWhgorGMHPxbcZ{n4Axq9IpxT%X-OB%pUB+rG!;Sfa-e;M?ruE5}&Z1
z?sUNzu<q$)s$2}$k1&`pT&H#)t~@dD$}Wnb2ZWnRt<h73c(SL);hCVP8rra}Iw5x=
zS`fz45g3bAm0L6+3f=*;=0yuK3apYv8AF4$&-iN#z_TD}PoodzTunUkE*RA%|1{j$
zr!p-sn&S7J=nHZ@c45*IGclHVnM<&G5fn*lGOB$jShAU5?vhOcyF|_u;ywITvcL9y
z?D}Ggz?w8SyeT<<$rgUI1SdzXGp(da8O6Cv4v>+*_GVNq!2Hgsic?L13(c7Yk&mrt
z74}8mvM>1+3UW63Z{3P{KbkVP>jZ2dg^q<<S&iqrISHnIPSQ<KhwY@IJGh<|Gb<OM
zC;XaRR%EO~J+zXJ#ZH$0mP0U^iiPuiVnNKca{wFm(KQ3h;Cc2DJIQU$z*ZQ|PC=&<
z=pfMfz69u?cDn+y2)v_!Lb`4kI|MHf5&kN@@&U18w(k}qF3K2S{}yjdUZM(9XfIK3
z<Lm|E?ZOOuu0o~S)6`p;Jwd%y*@fb5jY!7r64|s^-`ZCxARU&l^A(Uq;6ept5;#i%
zLkXOyfB=D`6fl55Du9B>dj;5Jz&_!Z{b+3GaBO?$*A)!&Szl}mzw7tcJj5P?afRQ*
z(BZb5!cFPeZ(Z8UUsH|3+`2ItN#XrHtddu9m*yRXoxF((+atWcXAFCu!uAaBPmW>n
zJtKlca(I7A47(eKCf}6s{$4R`xx)4e@9!PMj#Sv*;r*#GY^K7dhW8&5!}e6zL&E#}
z#IWz6_t{#X@cu(%SdKYj4-N0{8^bat5!*Ms|F9T#p28j$-hX%uyBk(!T^H5(2{y|b
zcnL@BPvu5ha#wd9iVgY~S9hSKAgLL-+!~R{VLT(B0{BEm6YAroITu?pR=62^>{D0|
zd+bxN|2_*l>Qk`SzCC5x3#>N-L<dq<;N3s?(vDzGOZmp6mch^Fyy&m7u`0k2OTW18
z+;pqt^YFVjh1aL!EFyPl!4O|v8TLK4phjNO2RMR2xkC5w*YH48pwXQgjiZfa8jb$c
zXdF{4(P(t2MyL2|uGMJts7AwuMr$;>RHJ+QYcAAi^r=Rt`fE<sXmqMZAL6e$T%*ye
z8r{cV^92vss1CYSqYw4hv}rW@Ripd*Yo5_)bgV`n=C65Np_>j5e~F%yCH|V_0^MdU
zgPBr%&TRl2_p*D%ovVSkbJRTuHD>ws8Ub#|$o9bwa|Bj_vH>pRb0XMaZcN8sG0tO&
z5T^xp18U3iSuEej#&9C!a~{D3pT<uK_9eKA;AaHkT0ZA?f}a!YL-1yTUl2TmU<JWW
zf~f?j5!_FZ!Pgl}@JoWd2wp;P8RT+?V7sz$Cf;K5IcE^piE{=gh_}@@L2@bXJWN^8
zgo#jg*J?g;R~Lg{BVU~vC~f?TZSd^Fc@GErjRy!KGWeWV34TqGHgh%+{DvS#oX`0u
zLGD1yhVBQ59Xrlh7(8-}t;VNhe5;9aR-ib)uC{tt5>B^j3-RFW&lN|m|4D4>H69l?
z^%S5G3t69i(RNN}62Ku>Qo#k1s<OrTJ5T?j!zS-Pfy+~W&E?RDxkv}-Ai)qUIPF#2
zc$kUNb)1O4hh?EM8XFq77`5MeOZRRR6POhpa){vhc_>FrF^$tG$x*1}yoQXfaUAR{
zZ7*lZ=9H5=Pw=62pIy&NkFe6cXu>&EusiJ7FHSgUjXVEs+RHgs)EKr547kQVg$nSP
z$N4$>5!&6P3Vy-@o((%Eu&X2F{xEi<8^`kPUa>DdY?P{eP9xn7%k>jry~sxArw|BQ
zog<1r5B-!{4Uk%01OqwWVAg9qj*KRd<<opC<l8*HU4pmi_0!MP`z5^Zx$a>e=l1~n
z#;V~{eQOD>;|Jw=@6$OJOk?%4|C1ki`5G(oo)zBJQ8*^QxhV}dORVPgt9tO!{OB10
zpZ&8PU0t7IOC}Hg^apSpYn@S#hZXKT+=~2r{&5XZh-X%zev{X+psUT@znQ#|w@PE}
zt!4RGYstrg!TN7{f`Jw3!1>NaR?-&h<!?eqx!exb-28)llX0rb_N#|o`S}UhE9o~D
zg!WvRd-9f)UjUV~PqehI1z(V6g)j0|rnLVSbS|?`NYK?f*|`BUF?rC7{Xp&X-`dqh
zNpr`H#jS?XR^)3~GR?TgkjLF^tM>ECqpaFjLTT%YF(m94-{4*Z<ti8q3x4UhBClGJ
z{uv8!KJ!(Q6<zELO}A>dR|c8vrDCHbSzU^loVq1>kgswAdL)gzC6K*_M#R9|ll_Vn
z`M^F0&Wa;ED-uMen2h@Y!89a@xCLN4Xw<?HHQBe{ukU5}pG?kJ+S|qd+J5CtdIs)e
zFb$1Uq>49kE(A-l5eLDnIv#OgLdJ<?2qzwOi;&iEbA{XFGqF{a&hdk1hI%raBFq$v
z&vVstD4+IL*b>e#kV;p%N>`4=?hs`W{lu-V^yrw3%yq?Az%iNN=Ak;aT|ya!IKdic
zMG7(kd1&A23S|u7Zvmt%v;X!Qmu>C$+Iwkx>c=iN^phb7D%Mdm9Gl@!2%3TE05VOd
zqFD@Z)gWXoXkCDop?V=z$B_{a6;XlGvcdqU7*Hzasi(!KSR|=R>8Mnw3dgADI9+L(
zswCL10$~3G-2)>H!U|AC+L)aRVJGV3+9?1#S?hRHjCLxdonW3&Mj2G9qDsZI(@fZC
zD1W&x&%G?W`c+}7yY1%J>D<-8VKK9tvc%boY9_E341+yY9K_KR%9OD}Shv7VS=}9g
zb*i+zge!Eno3=Xe!hZI{=tNlI1X@A*sTv}cqhJL}s>VJ2B@1>Kx=yYu=^U_wMLQJ3
z4l`+oLfT;*Y>>%c+Mya8?5bCU9hTec;5%Zp8smS-?iSREvj>fzz#c!+fAT<=>}3r0
z6D>yP5f;HS&WUI-nt_#gu<3f#DuqmDjRyeOQ_#z><?*y7Rfu`i<*+5S+=%f4J4%0&
zJ1IZJpR!&Sc3f#MfMfmt=}(i<!#~iU<T=<6@u$pI<xlhASl#@I_v7*-pYwKHZshf`
zqp>%QE%|3KnIQNu7Gc4kn*AS48ZzDoYyVR@(W?7xHM(bCF3v%4>4Ec*aqa<hzh0Fe
zhc_o-b_su-Rry?QcoQxa;p_uj6{)@fNYj=aV2V?D59X5r#_s#{dhBqwABK4_U))3L
zN)KHnS_wuv_g(BCm;97qj@Pv0r~GHG$63Z3Jg<m2d6D<lK7vd@+=Xjk7$XOqS-_F)
zH(x-$8172P83@42hrwT|03jhE<n-5`h<Vl-g+C~bjkmF!TGw&6oSBOKHe9Np?ry>b
zbxYfLrV-hgANeT1ZZ&V0jf}KvKdbwf?}j8CqT@nYC)<nmu~J0+@CF2@rL1j5N7#7H
zl&>9l&62O3cpb{uoGq1Guu@FVU(&>D8u>Z<EAP)=vVolQm#pVwWX`LVH)9gUHj3QS
z|K&@d3vWurozRJsofpvcar)$5m5YSobEFMx*T7uVE<f^`6?qQor5jNNlL*!|*MMBy
z?Qt5{p|-!K5|&-t+8e1>WG64H4AT`h`)jU588#paB2FXiFD|qcMBX2XtwCn(c0bzA
zEd&PlJ?sNuz>}!<b7fxQKXU^*^;q=Oq5J&Tw4ej=OuH;u0hc9n+om9ApTFi-w1J|Y
zs(3sOuEswDHw4X<>-}do{qhN1UE`ruMqm3$G>v+AZU2*Ohm%Bq-X3`ydfBI7U?4j8
zuS*WBZ|h|(%dbnWvFfnJ)r>~a$IaE9T&D0f{9&gxxD#i3SjJ>Gt#^9z(g5~}N}vcv
z@?ZJ>uNSH59{UkIqbcKei%l`uv++@PU4q`k{bZCQa3jPGK=3<P9!i~o-b>eG0vEow
zHP(YFZUMOfbi&8mH`>ebZsgM<A3T6={R~cRABIp|+f+4`mVW3CI_n;L-CY7Vo$t#8
zZm0b>i913#P3|P88hB|Zv0`mZ$3dD{q`)V>#9-3(#<}$xx4NpSHSsH`*M3W5XLo!f
z@i*&uf<wFfj}BVUIDdgPMWp)n`Cr5+{89OmNDP0({GW=6>U+-ru=Wp(*9Xu4ZocT|
zLEYy6XZ8I5{6X>h|1*2sjn4dm_BeR{`YwCSd`8*hk>|YZA@eZz{x)(tj47E%<E|U>
zkOb=moSbse@Wa;&51?D1^p9RI1d#tB5j65YW4-Xp7NP9D_6B%?STB4_zL!Zo@ds-Y
zcfEksLAUiny2p9}^MkQo_|AMU4I|_`^F0ouZ^`$rNAJOGKR(}c^X4+<%~c1j9|Fz7
zbSv#!;V^n0{?_%@|H^(){5#iMG+|t-Y+6gF%%WRXac&rn?uWJhoAuUr*(+^}u-D!8
z)$loOugI5?j*YSW$BOpGWb$fP;Q)LZDQ9$Q=<rC(M58>X=S2UV>mw~#{8dA2u0t-k
zF*TIBPpyM;j`rW#WYlk%hWXZnXLo}u;$iS{_liWG`5iFQn3;5UjCl?UEc>vMeh;Qj
zV@`r+e?H?9opHFFhjd4kdu|=r|Ad9m8^>^qU|ejWG6Hk;On2RzhV|Znb>fp+SofxL
zy(h+|JKnSCw_Nk4@|XJq#ddU)=!E<2SI`@Y_dn|7DV~gLAUt^cgW-(94!U^ip!Y_y
zl%nx_Ag^KUK*98GdmvZF_CVBr50VwN4^kWu>-+Dy4|4BDH8?-o3_aaGb*_s8>z#?+
z;LGA+IOA7VbMKD+NI|1Fp@WBKSvk%4ocKwo$F#(`HjW@+{e7@I>f0A+y7zz(_UBkM
z97KjmPAPH=3YLer(8pA!0Y?jn{e|iNH#pf~PAL;x3+M??V*~FQE$l1c0_cR}xFVLb
zn%-R~(7Ow<eFc7IkNYd%e_tVegXn^L>={s1jn^gL_<!c}jgIWQadP#rciue#-}=-)
z^3J*z?#Q+wNyS(fdDyxpVy=+_z9S&3IpuK>Jb<pSPX=qbc}vQ5)kOiU@6qG!|4{8z
z54&D4_eZuiHlT{j_fhigCHcO;ju7~cK-~dHf$t0JHD3~~D<j`S$hSf99XmSmI$Z+m
zRin?)4cSfA!+t87JU#N!+O6b^-u@cNDUHUx&Aw`Hb8F>rxeUcX_N{BBJoDic=!h)>
zf;?}vo!~2`p2J*u$Z=gg<yi>b+n3u%K&<KYTu;lz^lMx?eieC>5h+eI#2foF(Bwrv
z#77Hq*Hj_m+4q65Rr{X;fAHT}aQSOK#V9SPo6YFej8BJ*tXquh-2D8*p-re5?!?Zm
z_*6ow|JHU?1qSC5{6`na1S`(kd-;wF_XD;hXZq-}0p1kHCnJEOcj9t>hm~Z9jvk3U
z;(l^__!00{cD2G^qlwcq{I{Me*z+=Vnq)8b9R0Ot6ORUlS`An<^XH~5$!Gbj)3_^2
zac()E^||rZh!=d=l6pSBmPataQ4nnXRY0wCATUKaa%62+<>6A43?WJYicyZ)8PGG=
z6=WsfsAq@2_8&rw>r%1Ap{8~_8n9MNfx2$8PP;4>w=n&;t`OBWU_Y8}a0x33DQFlO
zK;ENaflI@u4blp7_8_LbBv80t@2<|c+fjR+5M<L#y-35gK@1WVAE~e05kCPAA$G)J
zY%7|K{cE-r`G^<mUth<@@G&|H;rtVajLPn^&k-R~(_piw`6qYnEwEgET@bz?Hh(#H
zaNFHp2QN}BpkqbsLGV{il#3={IhLe}FH7J)1fo?c<=bg5(bPQ^?Qk>z>AtbgkCFCt
zRzkQ~yv6}rucc@NUFO`4Q$)hTA%F}xyMdAM0{Grbhof^loNK@UyCm(!{Bx<+GxA0I
z?T`nbPI4|oZ>{?Sy<_gvth%|qp9G5lwm)Mx2yrgR&-n&@ash!zd-vYU5Ov~uub-&7
z(_V<7<@OK9=C{U-FL#`Pjj1EnwJ{FN1u5^_9E~2wa`~(^XuGxvAGGk-{1!c9)m;Y-
zF^W&KLoC9u#HNT9NzPbsksOgYz4wj8mf2-iWO(Y9JRFQ36n*@)rP80&Mi`<{9`Yz2
zs<4-~(+9~Ey+j{kC+#U<KYd{ThUOwRdRlB><Ck!SGbTd|D{?_W^yJI%RuE}Z<7<~5
zUz;E=^QqW4|05L=ADZBT1H;Q-`)9sHuEW{SSw3fYmxyovC;MkC9i8T!29W;LFm~>B
zbLTCXn=UMdSoy~G6MSj-R&0D!eJ5U<Q{Kj3ewXH!l(+C!jw(;#$ph~#DbL^ycdnn(
zQQil_b^LB_`y}Y*ET!+seSpYLIjWu$y^3=t!ydvIH?s%Ckh;LRl(Gss_+fGe9ML%!
zDB)i*U#9+bw~WU5BIdtv=jHxex8NuppHQ8N^L^!~)x);E1rhx<>rvG3)hRgaX5^o!
z^WExUt7Z9QoRT(t=}~@{EPtBi`#s7R8uVXyl#e&)J3Y#CWcl0V|G7u`F^V2{2pT@~
zDBr(L=yyEJKk+EvCd<EO`96>G)w29<w!g=t{4QC(nDn1|lrNO!LjDeq^6|3#Y0~fY
zD9=&!EdSV}{1`=lB+CEeQNDkx(Eo1S=4p7|qkNkzUyM%>HoWaozFL+~hC?^3@hHDb
zmJd4$<^T35Unt9u$F;47O&;arW%<|GHg9;{qdZ5JzfJw0@F+h<mOoAX9gp(;Z^ra*
z@hIOW%Z2{iJjz$ga-n~NNBLc{T<HI<NBKfoF7$8mC?7A&h5oG`<vFT-`ro4-<;SS@
zk3f05NBRCYr2q5M&a)on+hqCI?7!_E<*Q}6uz#aR`CYPH*#CWx@`bWo_}>PP^6|1<
z*#A|J@*G({nfgEOQGSdpA4dH@^eEr|dQAT<9_8C)xzPVjkMh;BT<HI#NBLc{T<E{k
zqkN$(7y7UDC?7A&h5j#kl;_Cu<5QvE-#yBYk>y`=eDC%s-~XD>U&#NQNBK5cF67_g
zQNCK1-_7~qDUb5IWcgx_?_D0{3uU<&KN~&D$IEgteqQq^&ynRq|9^OtA0x|!{<cT?
z{#RrAH+z(CljTDHtsdp8Wx3G*UmoRm$#S88n@9OVSuXTn=TSahmJ9t~@+i+y?e~Ix
zAMq$ZMzzoJ{gFrc{#S(lV*EVgQNB%<i}CZ8NBL@5F6_VBqx>#eF6{rFNBKfoF8pu3
zNBMYJF6{q`M|qAczq=Rod(5N!7+JoU`hVb2zW?Qz{+m6@x5;v${~I3Vt7W;+|DPV^
zcgb?0|2rP#3uU>`|7nl%@v>a#|AI$(4$DKO4G)`TxOdf0($yl*#GV7UB_?i^tNUgD
z6j^=E8J}a_!Sy~;(#}@3!k_q<&P8J@{69R7T(s*mRofmX=tFgyc8#VT<Uu=;v_(ih
z<qpa~)%GlnLEE!IFLqIOKC0V}?V=n<`U_ZF>`d%<Ldt%-mIm|}>h@Q<^fB4DLnfz0
z*S1%9&>wh7>0abPzdW9PlA_Pn9MS$7O+Uzkp7wNl>e}`|5Bf!#e&-`v|Cs#8#nbQj
zht$8H=1BcD{Ynq|?U2cN|5I(h6^i~!gZ{vaT7M7v<)lAJ*S04q`j<6F>aXbsdC*TJ
z{k^)jJ<vn`MVfx+-#zq?mwyM`f%`+*nj`hs^ea8+w?ihUr><?UQ1sVp4CoKMp!N5l
zUrzery0$$@(SNc}S(N%~`avG_6G^{O*R}_G$iGO_?|j%p|9JU#z#XW6KP?gU*Yqnr
z=(j^A=Y6;R6^j04d^|#r-viHU{XOWHlm1>^+n%K8&GENJ(+~2XpGf-H_#3F`uiPJN
zf03r&`H+YHq(4d5ws*iC4$@!Kuk@hb4w+2-6@8rk&uRTV=$Dh;&|lHV>96SrdC*TJ
zy`jINkJDe%@BEvG{-ihbhdWUJ+xN!occG?V=|R68GC3u>w!K2pUunet1J7#xJ?NK{
z{v=)7o}}na|68N!2YJv>Bz?^P1}b{f{);sI&Idj8Cw<KRJKzon>96TmdeCo&Os4*d
zK2HB<wEiCS%SmtOuju3S*Ytxt=qHli&|lHV>96T`{-1~bq&M`3J5c{CJB<F<^ea8+
zw?ii9BwgEHq3BKd5433gJ?NK{J|_PpMgQ_AvG&(!`avG_6G?xsu5Axg^yc_kr0I7)
z;GsY1W8-HB+~FYoHT_Bt`t6X()L+rZ>EEpN_n==+dP9FjAE&>jALKzlk@SZCiat(%
zO~3Q69{Q8s&>!wV{rTlTI(}cM=~sHtZ--3Iy}Gu&LeZP?<G>cJzX$zt(#PV*Bt>uf
z-x^In$b)_&>0|ykP|=(AU!>`G-tVD5>0|ca0e3h^e@(yAgMK??GWA#Var$r8`g_nX
zC%vJ+qL0&G(+~2XpGbN`e?=dszoy^$7Z3eOZ|Dzqp#IsrjsDm4D?RA9Lnf!Eu5GVS
z^q=gB(I42P_4lA(PWp|ywmnJFKlYy({TfX_$b)_&>BG9VJy6kKuOkBTmqnU>=Y1af
zlYY3aZSR0P9HhUdU+F==9Wt5vEBZM7H){Po=$Dh;&|lHV>96SrdC*TJy`jINkJDe%
z@4VMTf6^QJ!yTyqV;{xrf1##d=|R68GC5&g+g_pQuh$WQ`R@j;zX$zt(ht|Q?MaH>
z^uINlevk+KMAFCnZ=j+#?Y~IV@BFie{-lrDe+S&*ApJG{N)P(&kjd0v(Z}h(UhD5c
zznt`j{)#?Me@#EggMK3E4gD2;oc@}A=RF?!lituD?m+#ow~hYS^ea8+w?igpxUOxl
zQ1qt!2b#409`wseACrHQqW{E+wZBHw5AvX&NcxSswmneMo8xDZrr){BLx0l8#?KD8
z!$JCM`jsB^+aZ&wzoL)Rf1TFfgMK;b4gD2;oc@}AkO%!l(i{3K`Z)bH{mzvh`jg(!
zAMQZ?KlvbL{|hz!N)P(&kjdGoYuhUny%|3atkwE^&@U%_EPhN<^rru<(e#5n=qHjs
z=6?efy=nhNnttb>JoG1h%>Fyz4hQM4=~sHtZ--2#{)#?M|EIP79`wseZ|JY+<Mh|`
zgFNUblHSl?(Z}hp>381kp+D)VzdX;a9yS=YeWBhB4cNHbSdZH^sp8g<y#bDbJ3E8*
z9im>SC+-kMT&^UMcZfDaQo(^Al`yUp)~ovgU-1Hcc=yn_i#wUPxHAyq-~*C_cx58f
zy$I;9eV9k(;Y~y3osf;|0)XwCR^W3dkxllcU3_llEu1tf{46i}@RC62DBdc28Rz+X
z|87NEtS?$|bvFg4{U2MnuYcke9{hj6tNZ7pQ7ek;XM%ya?shH|$Oo9!SdDvNYg>=x
zH!gpe-<+Jmt8uxJNf~J)acyQmKCX<U=i@#}CIs^3ZUKeQz7#%XPX!0w$VlOzBPDx&
zqB7@A+_HEbHe<_ALj&GI7y=4;o-XcB;1+r(*csNT5#L1(eQNQgV*PLx#b5I=YFKq^
zGOF>!XH?Z!*uQH3A6tJ74e;Jbnf>e^@v-8~_C_Fh=io!!6~aAAe0EP#uEn{I&;A|c
zK&3yaN~epPMd7X_T<NUI;LRJn@%1)5_>Nb8RE=0GdZ$3+gXH+s?jiD<-&eB)H=wIU
z1=W(`u@pG<dZ&7|e+Og1x$^DZkih;Be<B<0G)lAGUrSTiJpu6Eo(;h8iz6xNl1;3l
zTtnSwK;M<&E7ouot@~9%h}DGZ;^q{4d;q^U40Z8QrYZ{^i|bo887-*sgYOsap}@G#
zavuEgM_wdsMZ5N!ckv>jv*#PwVmo_gqn(TgY*DixxPrR-ga&dD+<?lsz+zu~Knwyp
zZCc}9DfmFiE%+m@&q#&stc`R!+<Mt#2f@yY;u~AhTz+^H?qhujue@kEuyw&Kt2yYS
zn@1+&`pujH05ee!Uqiqvk&$5fmSC0~5@a<2O2<oNFQY^o?MJ^dBpP)XBud2}Q=&g@
zP!jF5|G~BolIUSLuf;p4%{_%cybp981R8D%G(3IFa95z6_A}7U-UY&VsYWTOjy0w7
zQL2`{@lvJ6q&gl_Ii>7Q+?v8j-rRol!9u1|NDSf*(ZU{kCXK5{zDND7qQ~~wO``rz
zdnoF+|Ec{R-k0I+DS6*A%NYF#FBo0nn=*Z&Gp*>vbY5~|baby+x&bnaqreEb_TPkX
zxX-h>pspm{KK&1nEjl_gKZ4Nc@cRQlv^!BHIywy>W53sm4$tHV_cN_VD9xXntic=b
zN%o}q<72lBNmTNCL4>dK#d!J9iuUR%#4WrrybgD61}j{Q3T9CC*IW#C_I4=k4CPy{
zblmp7@-o^hs88!M47+jiVPPU0m+<US<OeIiCCE0~cR*@qABF>O=!G{V!GHGvWph06
zv;CM}vaHBrJ5ssJij3VF8QEq<$8NPEt>SBclUl75ao;i$?6ks*ZC~XX_-C=^K89;u
zQTX`!#zAoSwIaUN4Z9Veg|o}y3JtV~^Aw7#XwbImtmNoeu&wNj26sWk&*q^3pGag`
zq%d7Pqk(i7wDMGNPm6Is-r$~3?)^1)T;mHb-s-EwEi-T-bu@rm?!m1tTvsmymyHaS
z=t4fXT1mR?0MX58L`m8lu=w7<KhJEWc}8>kg?(Bgm75}Co0^08w+m3YIcW3Q9Bh-1
zU3@^I7HOc7kGcl>DGIcK?!N^pTnXz-15FmUfP*bU{0P+u+EHJ0?54FnKm<cwbti%v
zWo?NR!eN3f(LgG``F(_a_iy1<tdW5l7QbW>lrTlgYmbPAqa!!vM?S&#0f{gAvg(Eu
zpk+9KP|V%~=Y}HeBtclsX#tKdzUR5wuBPwHX3;$~FeOZ5_YAV?4jX(sLfj&%TQ}r*
zvaK67h<ZH(QJRCo5RSOSr|6<%cUkM->tWRm6nzCSnpY2p*t8bc5UE2Ml;Jfx*^I89
z)EdQS1{YpoKX!-Gbtvj-UBP5beKHh&|L@%tCR0c@5eKfW!rx#_D22npUHF0tSW75<
zL2#G<*3rri!h_bg;j1;8?L}e~3Dd&{nMyd0d0~T-F{?Lnl1EF&u%)_TXW?OQrikEf
zvdWHQm8Do^X;yFj2CRN9S-pr90<1p8!@eP&6~2MH+5X^nX3yZ;wS~zBXjZ==E9M)J
zDlAz&MOJ5lmGV^Q6&6dnOb*KR@AXtY!%iL`QRu9`qO+`MnHD&pi`X&fJ&1G0Z18e>
zE=#i$ql?@!qX;JJ-|cB70Yc3l-5W-90UkEJz<#b?_426OspEQdZR*3aYawObke<{>
zJMfvXgX+)r0pVa<5}Vd$&)ML=`B-#mG`K@%!eSg_7^h<xcSb5dMVLd}8QGQ>33fym
zcUY7%I@XS&BpU3*H)!D}7gxcdA;+PN1E4Pi{nkh&KBYXi1@ujzZ_SI0T!UcHfqD!I
z97qTXyPxch>~Z97ux*KgfH4Q$>b_*&vyc(=^oQI4aRNwmfCz@sU0HA#86d7p7M+Ed
z7Dy*A<%$|^0?h6aaCB^Iq@L|-M00g2#HTV^!1kMT8JKloGDx?YlbK1mGzy=Y0FV44
zuQk<!U}(1j5!7+7AG1d_CWkFPe9>621Ai8G^!DE}A2LN#R%7HQ1*LJIUv%tiku+_>
z_7kEhcg2@Ow9e?r*CGWN_TgYBd3E-lztz=ksMJkFc(%L+J&-asjz>VK$5@oKAsT!Q
z{wRcs20N?^f-U81UF$Rl0&z+<2ZnZo3r$$f8m{nHxQrk(%-pA&bwp>f;OImJ8O?e1
z(#|ne9~4}h1DS5qx?Oreb08*Cb6}jV(;O(K2f#UmI|jE#UZf|UxZm}}KmVWWiRqxy
zp7?$>Jn?=64(W;Sq8HgdCLsKq9tbm>^6@VmhyXrx7C*`@{CZ4`+gYr^YKUl0SA*L)
zm)ma@H~O^<#Qt>%1u^!54syOT)8+i-1FoEB8l3qJ5cW5iBBY!-<gCV_lNM_xjqW3+
zA%-uh0u8KHsFV$zj-h$GYKR1NLrTEkjXO^wNXd3y2CeGlgsiDMbU0SB9t_s!#j&<b
z))O`B_J!ZVmn_#JD3I@l8NxQ?`_5mbZKTLqCf}Ye-xI{Ji3VHY?4ABw-)Bk)lQ)Y+
zW&?P}A{`?o6T;@;fhW;qbnJmBvO@&e$jkV5D}+JpCfr3qr!!$IQq3bBJQ1b3(>rcL
z*OVZf3zM}MkpcWPlN_QW4-jmmb4r)|&F3yZe~jM=2EP{aJJRLnCxe92YbQXMP)Y0R
z`&|dhFu4AaT%Xofa;89EP1}CC5SfXk^d8J)a8u4$TwkbZKE4PYlLlt6OkenYDVf*b
z21|=^EJ@NiHL?>Hta6!vBm78l1dkPUqIX<-sWN5_ZbhW+&2~ksVipanDl#W<D-K}A
z5wVH`jEYR_+=}>2N3(rGtYVr`k?A=<mnmd~gEZTjzj5{a%Z<?Ufv78Ux+ZMqix5gh
zA`eH?=57x1OX8SCs~+q97q`buKN!cx4n^Z~DCT08qgWz0uEOLhUWK2k(Jk7vXQ3NZ
zw+tnR{lAjpHV0Dag{t-T1xm%V=-5y3)the2!1q@SOHg!hTVKNzs;%c2qpdpC7J5FO
zZ?l-7!@*BQQYzx1n8c&OPm#ymS_Z+idD?%eninh2V_m@TznHbbX+T`agZSqulK28s
zc>DE;EB9%Xo3N+r`fr^_(X_`JK7>VBv;DT#H#S|_LL{T2@X9A*6~Empqp^yvS4IQL
zPz!^TJ)Rt=NZ*(ui?LR*2@7eP1L?xaK+_yZX2-{SB!<A^PSos-`taew5OcNaKYO}i
zF8S8LsiYes_of?miWw3jiqNJU&^~&qh}8_1XUDj4!0KRm9J!sRxwYq!m2igu#$9D!
zg3bKrkbLcZ<U7`mqCHl^Z(+Etw_LP!A}cIRpcMP&3vj8CEA9L5aWy;Hkfr&TkmW(R
zungSzvTs8z#7B45yl9H6*2EaAvkg}FkyVk)iZL5{kcTOuVe+2%oc?5;vBz~r&k)VG
zZ<Q<ZAfufS+xY@MB1Km1&|YRc_-aD4dy@sndtw}i7#uGq$G>#vh*%TH5%VNeqHiN4
zB9CM)85KDwapSltgq;*_o`@0|_vf<<MR_}iv8KQi<&py53hG)Rz;%ZesRSP^0hsLR
z*3>S9ju2|~%_{WX9{j3L`I;=|tF2;-;IWm`f^|dwa06UOuc_X;ReSPlGN4kNp_ryx
zE6$ZdT0>#~3Pp{K64}vWlfyEX!yS?Xgl@K9XPyBr^Dhvh1G7<VMsbCAN*uG9CbP?2
zX2F;^Szy%SijxHnyZ$7V;o_)n$TcR1LtG9$!9m#u+eXrW2*bO~@GetA#sfWOr(PqR
z`!uw2qilsFbwdW5jof#Ow%VWI)M|5~BdTKfuAB`Eej|-%3{aS*uuzI2o*h=*2n=Bu
zG}2%7uZzzF9?L#~XNQPr&|m2QCSyCc)byz4c#31lrCl_cj%T;)-UBNnT&Rh!;{F7#
zM3;ms31`LUo?HD-reS+<DNcrhHfJN*+yy~<0T)Jq3ImeDLc4&&>)A$&>*6*`oB4fI
zN|r!67AOR@?PrRqJv12=ZI$ptrL^cs^u#mRD8Ra`MJ4BY2kh-hZiLytLUuK>Os0@)
zh0=Ayo~^<-{aSZJ`>|eqaH|;v)R4vYS@1x3@qr$pzWAFQ>V>~OD|<JPZF`C!a7SPk
zM<AB<C}MKhA)eBYIP9e*;n~=r?m}qgq;XkH+A~mHOItu`ub{N#;arK<Xf_Vl<jO1g
zDh7Ad61JATCG+Dw*pEj_LEd|oyOuB)S?T1xuU7N^3=M!Ggu^gqk`Nt{X>@;wQ;!Nt
zDUqDBX$*_q35ErG5;|Ts(e05LF*!^fTNXf$>$Dv0W#Qm%`ctPK6}vGi>ZPjgYMsva
z5uHz1z(8ZF^4DGkySl32pW77g5mAdmbCBSxaIupm(sOsiMs%Cx7|`Ka#H3HZcKhkx
zKf1Owb+~UnbogtH9^C7sEinj$E%A)AW#^V)yRW?zf0ML9nQXZR6<ZEicQ*C~pS9AE
z!U62pmatbVlgU8YsY7o~?q2Xz_2PD4<ze{BxRiCAj?T)%H~9h;`kkv))ZGDs6-Hu-
z8{2MsgW-4I&B`CTS%uSy_T<>U6~vQ+7nCm@aE)~1U9OQ%r3Px~`Dvt^!`evasHiXI
zo=SWnvvy_#4n!72y@2%kK6-l4^B8K15fC~8L^Lu3eXIYLfoMn@bgF4kBQG}sH^qYB
z|2Ix}C`ZcXWv<}nrok<h5d0Zhh5_HX0fescRp_RZ*2yMSyp!>daSwhqrX^Jtx{-P)
zoDc!mEeI4E(OJjSLQ8sYhnfPYYb<kfCx&JpEF?PuCeqtH;l-^!kqHVPo`QIwtc6$s
z|L^9KN<4^#5MV@iY2bh%NI;voNYW$-0<^9U%a>g327#%6a7{ee@Qu+Sn0Pw8TP9=(
z%OVK4Q=<q1!X1=jF4?_`I!B(Ki7qkD^n#uEUZh|(%3zRM7SdH@X8ytN-L}(FuKlt%
z+y1Cpckd^{-6UzCnEwnkUE{$RZMH$XuL87xcWM7=(qik&P3)Dg4ZF(qwec~AKQkD<
zMTV1IhSR{XIaq@V`r$l-_h_&Ndx4uDr9Ucd9h|dhmEk7XAgyL(=5}IqOlvABqM)#H
zHdIXCQhO5!!Eq@Ww;vM?E{%?<t&aw)b2e1=mZeGI;L;=%To9}&-&y^%AgIKKmoQ9%
z1|nlNKZ<tIRs`(A!D={itiBp5XhR;Ngo8D4<?4u~#((SahFZ4@b*rsq(Y)IFNbt58
zM@&Aqx$EuK{DM}wh+-O#asBcbVe1@wWtDV-Lqk7DN4|Tf>jdT)|5Z8c{_h*L6L9V(
zdntaTu-H&0D!if+U!>=)+vSj6m8q&+iOh<%dm)%>=ewBg&e831JHcgYMSmy!cT}<L
z$P65Wr6T%cwIb>?`%=KD{V*5o1@K$waSDv>c)jqG&FHlsCj<B78)v>>-O$ygMq0N7
zC`BYdy+i`kJ3#`Jrk72eQvL;Vs4S=Sx329o3>(~dJ!~**v9|ru7%U7!Fdo+H_*yCP
z8sX_$r*J&sC^;L-IWCWgu{6`X(PVkF%W|Wz49Ud-C0Sd`)QL4yNRCqxu$1roZj>9a
z9=pSp+033!WCQ<Pqytu|lu`|O!AcGJ5iwTNW30^B*PE=aby;zZrxv~9O>W7Xu`j$k
zpx1bjvCku2hK1cz8@;*%tqA=44yP)iK7xUB0NI2nk#i{wB5NIiMHW)0nqNxWW8=O!
z65JIHKA$-8#^x^D)iHkRm2POe@pji_ry1RRMH%$G`Ij&mkUr;nB?{wniof>Xs*BV@
zi>Do?yHZRsd1B2V{amICZgp`65-SmPd1#VuMf<nXkZRo!Uf;)<cHL@7hmtWL<B0D&
zbQXO}1>o9q3tgRmRqyI-CWg^@(D~OGOj75?Vt#-uay2P}GsYy^wVivwx!aN)|CFXW
zl!#FzHcP6s;8r*}k_k9_OnDm5lKt+Mxf+YQ?s6?);U5A`!ZhK(<xbTl7<5bkm_JBH
zDids<FIkG*rPKeUTw2yNg}5?_NsI}II>HIL!4QYX#3WgWqzMuuS&HT%S;9QTO)PZw
zArPYd6y~&h=sy#4WN?VBFCj36y`E!$0!D?V@N6Xd;%u3(__nMK6(_QJMJUS$i~Sav
zWEnOBOZ153hWGe#P+G8E+~U~AQnl6Ruc?+6+lm&^1!GN{jAG+vzki$VC(E|oz8We`
zcv=vdAmboBHI27=L_o~4;_#AGta5C3m6HZ6k2&46UWVvMWd|ZZeuJIn;_>%L7Qd|~
z!lfmbnX_XxIZs3?+rSH!3vdAiOYz;+0z)yG$4PDFy~S{7zu2?F;2@cR7<s+_5dcDE
zFmjy)=1*2T3GSj>DnvRA%x~Z|@V7&(<|f7{v8f>n26u?b$Sn}Fix5{108|p|pi@EB
z%>s(`z1>(BQ&z^XUd)b!*w7P)&wARH`Ijh?ykA7DZO1vBBulNt*K?9gxvi}8AnV9&
z1y>B16ed=yFVNT`NPikT0O@s`8-vo}h&shM|2mADJ~%~}%a_eE#M?q<k=wo)b?Htp
z=zd1JS1L7KouZ5HTxS1q2On~Qg}Qp>YFn3yDA`9ge@To@w;|%M`3h9RE|R6rU8$?l
z`&l|f_-mh)1UT30t==e|k|?|(b}mk1Yo;=S;+mP*7%<S00uhe7*VGvkgpgAZg!wkL
zsXKh|$`L~CF;+BX&aYepn>obCvtZ!sL(0JIXLVPdQM0?0n7kmR7(T9JWQlA*?e40Z
zJK%+Y)@+Y~yJ9GEh;P+B*KL;RT`6`5WjZP6sa%~VfWzmz#gO3%rrZ%;8Iju54VerD
z(1@V1TM&U|7xSOFqb_JqY(V}*0YiB<p%RguoC%f4&f~C{lDmDXk`#W`_U&6s9hN)$
z_EjQbv(H>iy*(2KPOH#zU57S#{fFY{-KDnJ!d+|-*=q6IZFfnc#qQE*PBdY;>1iXH
zb(FuUWOp8vzD+}Wi{A-ay<wak6My7V*FQ303O#Tyq4-ypYw;iYUfh-Uy?Uu@oISUC
zavwnM``k{u{JXfPQZoF~w<}}ZZ;x>|4~0IO0q(zZxp#o>TRT>8SF-;k#{C>`?oW{W
z`7ZZ6BzFv4;WFK>(8SQO3p5^XfUoLA&=s!Q90-7;^x@cLo=$yhMsHB#;|GZI!Ch)k
z9V_NtN#Vu2de8sRYkbBHK(RNh&XD!K8b{1`wk!zw!3VayJJ#}Uw7eTFD`w6n*CZW!
z8=x8SM61OeAhA0TGfAv_M3LTvM0`Z1j~QR@=E%`LVvg8fJ`%SUu9%L_c<VYH#%6ps
zBg6-Ikz|w<W7La`Zg&~|R9&}n$7%<64t|Cq6mMRSd65UbSQm5Jz`?eNIw0$El%v@0
z`R}1Z61y)N#cnyZd8`yd5Eikly+lvAI$7y<P%hG@wTjK=Yh5;XkPTtUW}xS>_=1Q2
zcbago7$fuIm<Z=n1j0W=6UFAMS*|AOWJCCWtx0dmaI|aTvsI6!BN@UZFM1EDp2s$I
z!E6zOajj3*0tFIY-jP`oZ&8^Q;_jbM+!fz-vl|M3N{?p1;k}z_P|*fYDbY#bDpI=M
zp+oiSA@N%s^=k4pj-q?TDknM^ish(LapjRSWTCNcGO|$~z>F%y5^ktS{o9Qq?2j{4
z$V73;r*5LEWC2ea5XqI8feX2oI$Qy?Q=nZXoUGQZ8I?`PaS8;yuwt^L{DLLgm~Uji
zIQ5XrMO6XorATY+FsZd&SlA_&Qh<?pO4538W{-Rq+p(1he!)&L6EVR#4qezTG7P4}
zTpV>u%V4tCSJa>nFY-!*i1bjeybo@KJaI;$uCymmAinpNBAH%x3S=GJ#9GE8wquAW
zKUTr}!r{fQ`KovdyA^zqM!aTbLGnn5bUJ>np8;OWoQLVj%+REbGF*Cfie!@fT9C9a
zRvY;kf1IA(du)1t{}Pu64*B$SaIgRY!A&jqI++m7PX#3$G}xiG@u`Wrxp_fw_kzE&
z*v($uLGfRk#hg|ioc_JcVv;NnB>#<BOm2e(xBnl^!oJns@>_iB>U(&LNm49G{u{HH
z+>Q|3{(mrwOg3c_vS`}(GK)!)FG&6yvzXkzDxm;B)+|Q6mZ!l#D&}zdO&oLN&bTz!
zNJ#7a4c$r#6>&~<sa3==2?l~7&QxOJjXy|k|CV@T@@k#h-BcAnF5ckkos4E?N0fuR
zXLK**1}06UrUWr_V$R|u{ZUf*z22Csxii5vyL;teTswC$nbwcvAW+ofViSc(IAWlx
zT%{{-sjk+q;DH63<gulCN$mAz{fd9@Z7JXJBl%R<OGtpt)%V3gs|*N~%M@7ex{ge%
znO|Y1It-rx(B3Jhb~{o}hG+>%wjig0I5)d6=z2~89W9_fC^edJgtIi**0WKaNg0>z
zy77a-C4cbyN3r4;lbGxiDHmf-ZT_22^Kg%)J3T(lvm+0m8#?$?LBYs+^^u@G&xu5u
zCxA1m+~o5_D6W&wmI+8cuY=pk<g=eJ7gV)z>!g9b5<gQSk>QpVl$L2uRjb=TuUG>a
zMgzZL1APu|;8yPjbo+0|7(cwgmE640aSa(ih92!E`I+C&*pFh}$Vsob!%aCrq+@CD
z)9S^ajtcz(f6GHd@pnP!Z2Y}Cbh>{nRZlry$x}DvP>2Bk6n3&NDN>?1=U_;)iTLqT
zMQe(d)mRd&`;fK2_8wgZ(+_%0*p%mskI#2iH*NaVc&Pp)oKdLmYy>fxRr1a^GM{Gq
zsn=Z=^I|McMm^o{tH`3lWl^o#_dhxI)1~zqBCFZBStAr|FG3rmDshS}+t~Mx+x3UW
z+BnH*V;bA=>o(d4k@DaM4%H2~o)$Oue}<24VC-`XQN)6?p&DQeOZ;MkzQc=56x;qM
zs~Mhf{HwP8B1%i@b^AW@k+C2uLKl`13wldcOGInYdWJChx|G39v?-4s4wDw<$U3aj
z?W8PJ(iZ;pIB4HbJ4JgM`D_sa#a+skAN7N3>hMCU+CdRd$LdO{N?oeK{K5sHbI3@6
z{<W<T;I@zypl--0v<XU~g#FJ+dTfjZNn0bxahtLd4x%HV(#a^2kx{EH#TCn3>C0EM
zZHgAbW|cHNKXaVPEnp6v*HFGyiwd_MBS*qklB~mE<00?Kz=<W8z*?dkc;<rLGiCrX
zt}Mh(0KsVWDoBkjl$T^-v;AwZQCAD^tZ)r~3TkP?SK_{py=<}`9miwwA&8@eRYnWT
zx4C=@VtjsP@EJ=!g)X1h4PAbPAfklu$B0iihyx@((j{J|h~?Z#i*B`|<CHwuH`$Ax
z&e^CgOvUEft<vlZs<A55PCQbv(q>-Zul*MqmMjD5SnP;oVN4t%j9ap?Lq$}=r{<%h
z@EmD}-$nAklg}~FrXq_FSKnf^y56V&jvs~!Jo@rV8J!uT{N-*ZWE%RH;jWQ=r1n5(
znUYY2MB$)LSRu70qbdt@1KF|Q^7rfA2F%{Mhz&eGQM)+9lCqW@UhSC>t~^F-emeg&
z($3ek_~32kL^ttp7lc}>tHs6p%owL*3^lvPg45}mQ@fZx-OYiw5!u}6<nonn3L~zK
z6{}k=s>!9nuIj~Iqe4gEZ+WOM{w@gh!r!aocY&nH`ZNC+Ik+XsOZVzL00wE%eeUYw
zuQ^2x2aQ_buMyw30`n=93@YcBNwe1t83G;Mpi(>zo+{5Zosaa0Fg&?xkMLxFt-6Iu
zrb}Qj-i%e@7oHb468pU&1``gTQZ~$U``R4Q+i{u7-ZDPca+z)!X8Z=pA;u!f!!h#H
zj857_@>>n^Y)x*AG-S9{pb_IJZj=UQatrrb?U}zv{E-n97B8GB&{0{3BKTq=m>>DL
zo+2b0BK$qBDbcDa0R!e4_yRrMQZ!KI-d%4)n{C+XXc0Rd?Bh0^!nXNB`~)D4tA(n<
zC*ymlzzWycVntrE>&JkptX3uXh)W7oMS(coC;P#09)3fFzxG&!dZ2L0fZx=lKkdRf
zx)Uzf0hdF{%?TGGA_r$o_-c;ZjpmTOZ8VI~b+yVToXN5mNJ$e5%JnfyGn(8$%FQn2
zNs3bYaCzuLilD(!g4O&OD>E<`k<|?@tF5vhph~lS$w%}gL7WpKz5q4!)G~_1!(8I!
znz*j8OxQq9UO(BXI}U6`Z)hDWIeBY2<v4q3wmZ(i+#P3~=qC{++r_mNir3n%*<uP#
zlQtLaq7Nh(b79e7x7Z`DTf{nw-a1M(-tA)w?YK2&Uxkb8dZeEp-&K{QJvv4^*f7yf
z(jIHj>SOg-?~r=yb*_t;9sM$?cZ`j-{IoW4G*HIfJqBayV8P%izg0|lo>VtvGiY7=
zHQ=CBnWzd++&*E_!~P%$2KouM*rMiaDSY5?^c|uJwv$ta#6&b5=zfZr6BDr=|2Qvb
zW1)oaB9w^!b31_}1|6r;JY%%HaIL2)=CYlawHr9=WEWtp$<cBFTE!QGtb%<Io;d1i
zV|>gGxr%%uE}sR8kM54Eg}>*=Xw5YiuMgTIUD^v2ZM2@vN&&bG$8AV)dJ<KO@{})U
zX{R0v4sNHMG)j;CPZT}<I`0MgHB2ZDO3ou&C7c`~54+4xGMT+`rDjGENM%LEnm%S2
z@ij5{poQvs#6rVlfo~;gwW1tT=rA#Wk-0BHE3%o`FjE~Mw_LC%u}W8AH2IKfvFSE7
zK3!Kx6TuT+6YB&nBsR@-9l`8`&4uWM7MiXLKWa}04QzznBunN11H9ZzooUh6Q>H8K
zbwdW?vZUK5zbT9jwgXyxV?sMCCX_j+rh$ij4usMnqDj%lRN^rIWQM0oJxKi-I#5${
zohJK3%|mqqMQ_nRPNpGqHqH;q{%E%Q3(Jb{5t8FL<yWd1d^<`${)x`IAuq#bu8I#`
z;WpHcd((6j)r#;8_HmUMtC-^O4+@%7oEorVZGv}kJL^}R5LL#}FQ#7(RM7LpgY2@S
z4E-LAjXL@fv!Oc_N(rO6_e8HM=1)v_jd!-eypGHpWVb<-E=R2hz@gdh=y1s^W8~)>
z<hPJ~p(MvLrpq~82_l1uyT7&yG&GhQW0+#YbaFSW8KtVrVv$dc5_vAFMv3zv>fyT?
zSiSuh1`B*qgFIVO15~r-w=LikcH#9&=I4WYgo6hJUm1I%!2?*WKPK5F81?wf5F1r~
zCB^>EK^-5nv)n<2ZyR$@s+c~h`-R%u%l$Pk#k#l}tzJ0FU$Z72CBxY}({yY+_X@*F
zdyjOT)VbGn4VhRn`LwaIi#d-GNqFJo$eP`iyo^z5Y7nz?m1+euySXLfT#n9PgwF`*
zfj3k)rGpy3sLWrp#I>7hl=IbK_4(iqh|z(`c&kW2X$ySNhzTf0yck?F<r5?_0VM%~
zp!hZy{|GcLCS;&wbz_2tc4BCDAOYQK6vH{>M9r)2dV08UR3-C1;rCa)>CW4WOWa{&
z?sy$d)tAJq+#V3LhWF}uyB!N+SRuqKu7dd)ZDj7RnTg@no>UKXCvLrNb>n%ua1$IT
zFhG+?yTz!G-vrAmkI*a!iS7$_wu|rP(Dkp5ufITjV;siyoqF4gljOql4X)CwmQIyQ
z^S%Q1fkqmh^)M_h+`3%|qv8!eXXPs3`gvsR4lQ9ib%C3Q7w<@_JXT}-2&|md@Bv1y
zJ=nK7!On^a;_Wo%Q7pIJb?_rt=ptbHBM(WhYJ>4)rIeSa=)v%2fg$pj`Jhm%OtnkE
z6;Efy9MJ}k?6otRP6RnNREYgCB#4~z$`yNVF4YZLVK(uHSQBD53Ld5=1#lMqE<n-$
zN1@;2Y(>|E-(w#Po}r8&;1UUgnJm9a!W;m$yeJLL={~$#?6Q>pBN}bC(@hdIaiN4I
z8EzIGI^DmgX`kgkwFy@#A{(riKMLbg;UOC|dc!{dxF)pG0h5c$&#-dKL6rsiSyX;z
zpZ}Wm-m*NnQ~H1qKN=WEv14cFg>d_ZZl8X`a<jpz3pS}Jg$;6F_fPj6f2m>3@4@4}
zUI+8oc8S4*-V5tj@<XDXXb{tlEg!r2*b@8@m=1~gln<!0wK>o~_L>%Z?HhYdjlCwj
zuY4f`_<pnfXG_{gY-QQW5DMS86UBnK<XIR3L+n}qlSf$owXax7_`lWvFQ5NkO(*-n
z80*ab{3ey^#UJ`AlX=$+VkWiYv$Y@k@Wo@jDvzQ&EJM&|9>l@A1@q_m;l-aG9xLL6
zsEY=MaVeBEC9MkUe^TTbIP#V&A;{ZfIKIIsxVt?$JV#vF^HpWrmj{*9_$>AJka{z2
z`uXsEL27gkPvPeQcMC}kae%^>BX%75{WxgE$@?Lup+UsQ`nQ_4#FsvAz$pI4g8d^e
ziHK%j557=Zv=y(`0^Q;dQA?c@%td#952gC+CcamH$CN^+CX1eGkv-J{tulpHYUh4A
zy2)Fsdr%@=>_=#|tmpUA>KD3+AFfqucdbTU%Dz%sZN4~>R?ng2$7;0!`<dUPuV(5d
zez;bByKA-R5<{y!xrwy;3?)BStKCS;zK2$K=q7%+R%zX}x<6=Wb&O!l>lJD0qgVKH
zRd1I(T+8qyTrwr0ng|_?QZ_CYBY<28Tbzbr-r4h}Z2RuO4~iv?7-U;9<oZ)#1XTIK
z3x3UK-H<2q44qFFd=XF)D4&t<m?Z|dkv}jn_L`2M%BTth^&d97x6gNDe+AXm--$Gk
zK%`LP!kx$8#G?V$t}Dch=@zoB$-gk>GMNfQ#4b@AORpFPZs8_;Pj$2XTS_Gk2*rYw
z3*FA)h6W`UqP#ApUErVyqbCzWCId42&Snh(HrxNcNW6;-XFNJlr2tEI*F#Tyb$V?T
zbnG_0HwSO4#`l2WOV~x}UjOM$kzhSjkG=j=gZ04Ik!r91|FQQjfK^pz+VI-@+;cmZ
z1V|9c4kQ{RgmZEuL@oh|LIp`hYO8e)NzMtRh9vHDf(OJJR9aJtP`~#7>r7|rICjQf
zzOPI>?I_<^v09-WzYg~6ZF)sMW}2#PQ*Ar8b(;Tq-?jGToFrgJ`~Uw;-AeXeYrX4T
z?|N_RU6;MjJ5Tk8>z@FYt_Q5r_rDZ=N7n<C^)TS*4^_U$D!vDqTafA0JVs#jr?d$@
zLJ9CiN*nRqA8tIg2ZIE?BYxDUeY6(eNBQ8Pt`Fj)z=!YA@n-1p$NV?196$AM&p+{-
z@c5}GUa4O4#qrbH2@U@acY6?vZv5gCujqKyaQur;JYTJye)g4dd}Z^4+UdI{>VRat
z>%)h-K79C|a`feR=GgAJ5>>!PHE`l{82FYvF@DO0ZX9i(GBpB$jdy+I(3TI+SbN#m
zv4=F?_4PwrK63n9V@tnD*bjS)o_Hk)6c)9IH|obf`H4?lMupG*#?Gcrpb<3vm2vt|
z<}4V-zq#b;fv1~bU*lh0^7QeY{r&yplf({vd{VK4df)ZTp)L0u|I%k4{D4hEv5SWB
zu2YA)P7R!x{Q1<BhS&4~JAThS_t>z$JlLlWbv<?SiKhs#>z?th6WE9X|LE)UCrEoo
zFURwE*CWtB{JWjr_+C|44_(6>-yvwZ9fHPRhK_u%K=bepX4m%+$u<C&oOD<L?+Fxb
z`QFP=9B3R@eK~wX*Y`+Qiq?_dyyJ<NR6R_hyRbrZ;!(0>-Vo<W3hh<led@V54+P?f
zFt0_}2n@z1Eb@4q087Xu?FCEv%=CBG16>$pu*kQeO`Ph!0k?_X4mL4b5k}01e2Ujj
zc3R;kJzM^To(&7;3*Y$;oAm@jU#_G`P^W-^g<g#4uE&@6)N!xz?gr=ZR0ILBT23YE
zHWPK15T?Vek95*)^*`V?@zhF19nPX*vXGUh#UQEu9@#2M?QuxWI>>b3#khu&@yEW6
z3DL=eXYdzT#mR%`@I=?igA?j$<H>`Qsu+*2IFABgR0+93=P}|uRyvR5Wfg3_^B8v?
z+tuUAgYb-eZ9M!TEN#j8?xS<4o$+tIyz!GK4`LCUuagJgZasA!qo<8`ad!lND|;QJ
zcD#kE9d13mmTs#*fZN3VXbB%TpnhW|g1JRY?y+rh`ifPST2!)@|8Ax#bD<@VPy;_+
ziV)@TT_~UU8tKD0=9dx^f3Vy_N*}jm8XoHe4fK@;&=8U|tR;kV@@R<Q0}UjAjkNdk
zA!0k0PJ(zOw&Rg<iS2l-#3QjCj|Luz?Rc!mBe5NiaXePa$96mt+mYLeM`Al3H{y}l
zj>m31f)ILqJ06Mec--;$!_>c*NTb~S13^~*h-uuv@1rxv(MRgvk~PfZJ-AJ@u0UlM
zIDP`)@IeyhaVbaaUq1Wt#*ZnGL*3Y;v2aJ^n%MIn=S85aT?M4Vueum#8=qA*ND$x2
zgJ4sA1PxEUkg;0>yX-2mwR7K|v9){ia4A!_t5R!&c@VDFW_smTEuCUT3LhFJYyJXw
zb$q8}-}6utsOV2|oA^~Lu@5nh)K7ln#O${9b1`aI`nh;D;P*fmWhZa3tAV&-*{cBB
zB@U<}c$AY9n<RT@D!QL#hsJIGDnd~0cBhQ{inUwEqvm@mo%;43*|##;w;1(pzU<p8
z?!NtBTW9RsRRsU1&_mfbC@}>>o!o&=<Y8@Hz4}A1xiiKuU|R`(;^f3jauHFL7f7SP
zson}69{(#zY9IE9A%>l}O$@g%(`-`7O}uc8O{&{XJVo$7Yq#9QJ%mh=MASU_eK)nV
z@<mAbW4TtZDhcl5Hr1kgFi|~z1hEi4i2!HdaAD~eZ3>wyUjmH7l^J`D+F!tdCT0?x
zS>d5jIy4Cd#cNKaptnN;$G<B%v#EnQvjw+_?ag+(><ehi>)9<fXX;VIQqZpv{5POC
zQi$u^<cm+b$-P7#%9MhXcG-ZX)eH2;ruF9#!SP>8TK^n7IFRCB;x_T|rs=fSIkdWs
z>6-+<#HMwtTZGl$BT}-~>r?D&Ui_|`Jj30!-+aPdG@~SJ%C9HN=g@F`xb^?EGvz<R
zZQ^H%>6G8R+2-;h7neUmln+CDB$pj~Fo_}1tg~N#?ezQzB0IiB(z8v{b3M_MoPnP2
zUTxFk=41=e^Fx@rq{m_nbM5orIS(muS^zAF7wz2|>o}BJ6y&9S)>Bf-<5x<G{}MZ7
zkp0JSoA~b-=^&MFD`Uep4(mth#RPwqgug|?|26gSb_YJP@i1m)QVpYz0gc=mnWV#e
z<V}D48LCbT<`@fisYcic!+&oZDmBb`cYeVpgwub0l6a84#9i-OpXYkCt!dZmJAsi)
zHMq`cFnsshq(ip>|MAOA;u%nlVS^@#&u%<C=PdWF^@nt`)^2Js+o2Dc(S8d$NY33|
z<<qjtr`%PpfTMsJDmoq|9?CkrN?!y-6F_u^-kxJrl)XnmA7W_g5x=R(HL$(FvtFVe
zevdc7Ofu7PTCd0YFU2{w)L6ndLp;atk`jBol_eYli4*_WNQ_n3I4E*qrZ1An$#QBr
zR1uVb@ci#n-3W=>Ls%Tjhn|drE!ZQp24J~A$ZH8(Pgz6)M0#S_mXiz8E&p79UaU|8
z@^koy`oD%U$gaDwI=R!aj^F+&_vV5d+^ZT0HTI?u{h%={u3{;9RT1w^*~xeENpQ4%
za^h1APg0=8Xo3x8*CWM+N2V1TS?Ouy%mqLQ?7lT|a;a>TvCxNH@1iHgj{`3nQPRf8
zUg0xwpPS^{hbH)Tco)=q^Z3CLzVZ44hCCy@q-!j^WXsX;lI;(Km)vwLyd-@hykz(3
z@RHFp;U%|Ey8Ok#ji`zr<2rdTg5!%U@skHD)#J&7B~~VaYijY(K}iOy;4zM|4;p~M
zOoDDAOOYvO9BzG;)DAp)_%iAqMgCksTW8$Z3s7r`?r`h>R<QrTuv`q3OcIUYGGm+(
z(3X{=0_Vh;b%dMgV%*fq$%FOyTZ#UhJlKFIn!Ij2+yyl{7)NrWjINPt=hR_>Mo7<G
zJ$$eesV~sC$%y;`G%?PMxa%=y$KJVcmrFtL#GgRG(z*RB;kO9J=BXn1<iu^usq@Uy
zjfW2tVl<;z$?Q1Xs>{B!f=>L|5_a|tK*mxa&6Bl3O5-vjef5tpOPe?gvgBZ~+lED0
z@gb^(DvVotibonbhYNt@#~_-%%OgQfJ^}fYc%-$Df6Mu8N&4=TC9?jLqDUSt-gxM9
zq%v42Xt5Rwt^!6|TO;Ii#vcnZ*3Zegmzs^ek=jZVHPY6T2N$R(^qpHf#FJ~~B});>
zgZ=eH&0UYt%^r_9);=`n>{T=%O9;uYS1x6SSea6%Q1j$L_&%jmh;-dL)sCd2Q=OcW
zv&KV*c_s=*Ql@mSf6p34bRS~sHy%2ySSo)24~s&-fF0yHMEEpp^3kVD+2OPO#eYjE
zF_zE-DN9i@7I|8q$AKBr+XQt(9nXSq0^lSdCaLIO*)frSL*Pn>;P83|D)H;!G@LN@
z0|cZz?&m-bE658aQs;bpvne!qAaI=m_yWlWO-ZGG8jK(B`UQ6I#AxltL%wgWmq8GX
zeurB-CBYK(w`l2wN-XYxOxmR+XDn0CDthcjJI8N-#^uTA1Eh$2z74icJ-!fCC;o{A
zDOb$++iJjhH9Hu^vr&9B9L3Be-#SIq;np7|T(rLn6@L6I(Bb=Z$8obi#qu1v9v(@z
zOrQ+11^Uerz2r1LxRs^u{%D=mU9{Eugd2Mab@y!>53AvmgOKkMz~R<!yWu_xJ*FUp
zzH<Iyg$^q(2!!p4!`}ja;v+<qVnQLoaVD5TkCPMUAQAK(gLeM#z90EEgX813zeuC|
zIK(u*<=f5{^Dl@G9J*{dLt8c6^_oj;A@AKZ@mICbZu&09Wp!u?>luf9XF8b;hei=W
zq2BfvH;w;#9Uob1b9eREAc1w^$7G>s1>$FQ0y45-(g;%AsnP8dtF2a^!o#L<j03)X
zvK6edo;fu7BE7MPpZvwU9&eScZ6+4C{yWA4TRZO|r1;TLso)hHCF2Wb;0~=cM(|*b
z#du)}EEzx5KR!CqKfe83KZ3Xd4GjN%7mB(jai`*!a6g0lFE$?Df-fAt$RE3u{OSS8
zg>$vSwc($Dr(bsc{N+*nnC9<}-wnRtCv}g~ZTv?r0`9J*i8Ow8IK3EMdUSbjI+IOi
zcOM@??&GHzJlKz=vWYIXjOR!$<4+!^5}tmd=GCC;3opjk0rL+h?nU#oI=YTjCW?ez
zl2s*_QpvxOi1D+z)Hue43LY0m--h3@B@%CZGF%Pcvihx8)QIDgz|2ied=VtjH-0Zg
zQ5%Y2yyO3T{H=bg{6ST&RwO=p*FQ~7tU;?2|BTl9!_~wDA+|q;;YQF;j6P12^NlNi
zw`u%0;rINUo)3E)Hy`%WDQdCtSvZ=KJKu8Ht7FAHgnt67<r?xngL@EZjkZ;akq~kv
zJ#@|Ccj~Xcye=Fmrjjx|MLj4EaTXy6H$JhagldQDPjM-ZUmo3rw}{?yc*C8qzPvsh
zskkrP_?1237oP8LwCLS5ergkbH1Dnn?<V+spy~7i&#e=iR(|Ey2<k+=_W^{^MPK;r
z#+Cmpn?*8YmLj?7@W}7N@0|+|x9RXcZS(jyREOUiMpl;k;r&yb;zZX!gT8$NY1OwU
z_WU0v9{sa`<nJ?IzwT}8#;53KtT!L7`&CDc*uL@bGXLc@V%w(iUu_!y#^xpT%cH@K
zPyD-Q;t0kt{1$cANPO4o&*t&xH;?~n129Zg;Ai}v#P=&-7=0er6qG+-yzWodC7(R%
z^#5g-GSlzFZanOJD;X@53_FIA#IJsQ160?6PkaqoS0RgpNOS=6lO$1fY4`&7&h${)
z9Ox67y_sx!x9A&9^@!1-n}>#P9Wv6UIc#!P7d>s4*`l{6GtigJWX*x0ogE!LBO~dd
zl;|)-UCL1Z_ZJDXS6neXI+z-|G;0h4!nid(uya>7WAqOUrj5D`po>KEhHIMpMtc*9
zuH7Tqy;rBrq4eOUJzZUU_Mm9R2pY(y&F;2ja#zn_|F(D%x$Pt7@a|#C?z0OLRM6U!
zN+Gwud)GEfUelJ{WsYXD?bM?r8fhNsF}qv^iKdakp1$<1;Xzc(G$j+~D{MMnVe|P4
zTh3S5dcMN8^A)Z*Ut#<C3Rliv*qltn-KEW$^etTieCrt;Ob;gehfSlu2jWYKRHo<c
z$=-C&ZshGwo1)ts-f8yiHZs{BGn-Bsy?c%AnY0Og5F65a2Kv&*HKV(G(`E<d7>3Fl
z>e-z}P6jz!(z}Pvy~Y)T!+kd!*Y;%7M*Y<r_bp-gzK;HmfsWyhogMo+dOEV1TT;8k
zvOc4KuxF=Xrf(S?Fw+^M$J{x(J3W*Ybve;Tif0TBXARJvNgFq(_Zt0B8#GlUmJQ5S
zbJ_5$B|B#=>7TWvXV#K^vz1&wFgR$WbLqZOFd&QZ2OY>l*M|g`u)-QWFp8m}o{TXt
zWDJjHM@F;m(oO4K`P(xvQKPSCaG=-h$qoz;Ip!yeh=;)EWJ~!P*<qu1U?@fRzF`yg
zni9Rkqxl(qW*RtwA9VDf2(%cxQll9c`vCztq;n|16Z8beKbRcqP-;ZzGkb^ncA3LN
z1N+i~wTCbc_d<F0Fva4^fuVuSF6h*)1KC}c7ZG*4MyaG{&{*Hy*w~0R)BXJ*)2Q1i
zZZ!wMv>~IHl?kN7nGuoKZoh^UG@G{kU%n$tZb|lmLs@gQFKY~qLK37wd%(Pus*vx{
zSpV{vw|N+yOj!alMusy3lIg@A<^Kqkqo1QgOe6DYc!=1S92lBjPCZH+3IKdo^i0E3
z2#2`=T?N)^+&DbKblp^Bz>ZkJhLks)YhOB(A;DbHy<HaFI5cde()~T7gITJOGBWV@
z$mt#3Y3v4Y&rX;hOENP!yb};G2{;KdZi8Hg(pRv46e23XHUPmugV{uHRIM1-c5S_O
zL)W#*^_#a|adq<Qu6Hz1!^2A3^I_Lt>3|Uwl!tACp_&StgXQ(@N{}*a*|^WJ8%(q|
z$EmV3SzD&a$w;y}3&~rI)G%uI4ev^ua5e_HKm+;NJZq(#T?P+d#whWpNXlV`QK^Nv
zpJ-0r%wt~?utV8R*I=kiv?P;m&zorLR(|-(VRM}WmPjVEX3tQD9C2dB=unTj*U3os
zZ|c5U4WHL+gLhA{Gfs|V%}v`@B)ht<+184dhK4bkr-%9m29w#IoybhK52QfIKz6SK
z)|BWOytQX<2G~c_$q_T18t8*9b#-+Oc6DvLDv7^)u8SwH<H2~^==*y6iB6F3SkXT^
z)R!GL!J_=$s7`iur6fJ;^65!5!3p*yl?`_HZ%b}Vy2A%cK8d(9sM+Q5B;<kryZZ+$
zIIDkcGHhx~rL!0q6LAcfqr|wLK{<9x_?_u&a(7QInc39?wk3D-SeG10n@Q=SIvlS{
z(aKfntQ;U^ZGjl=HP;8Ifu;=E6zr3SRWv2Np~R?$88~ow6^!i##w?&=yU7hVwYIT_
zXQc1Vq(|UhG1B&-v_V=D+^yT6!GR8?;XdIGQa!02qZcNRw&Bz0a{Lv0y_tdX;9U5W
zSu2#Y;8Q@dSyy|zd(4~F7^H@UzQHsJqW}-F&oy2t*c&C<n;1BczUg>LOECuIRY|3L
z2D6ff%oXVgmtlP2acT3WZP#_Vho;><0~jNB_3TM2@2G}Xa+FeIG8is1M&01xDF0FZ
zMb>kaNe%}r;GS%c6iT*7VsI09G}FmlJ36fHx2XGWbvM<0NZm)&eNf&bs(wV(kEr?)
zRX?KYM^ycYsvlAHyH)*eRli%+?^gA@RsC*NzgyMsR`pM-^3$sPv?@Psl`DLw6~5D|
zozpi<`bSjzBPxGH!H+2Thplqe&WLJfM71-b+L=`4ld61Dl}}pb3h$)CJE_{6RCL5u
zJ8_jCSMYHKA6NL}3V&STk1PDgRQWMgeoU1gv&t2|V+!9f)y^^1{!w+0sC&1%pMIC(
zlS)sjbX=v6sq|5G@3!DAICYPx`{aPc=ieglwaexGN1NrnTivbvxJv(}y5FJh#|z*r
z_}T8?yE?x;ZB2gq%xanLRrlHO6y~p1@S@-<7$p)dJfTj~u(=bn9h{q%*CSW-^23SP
zIxCCs1@+Y^)6}Hq?8!oOkqr`$D3^F93h?D2M`9NI2|1`uLoY1+J7&ROfG^SHMugs4
z^^d9gBNn}dgsF4^K6@%}4Yw<KZePd+ryWr-Gaurr{iDfFZmGLR-aA$L%%8~gq`Dg_
z|HMw2&z11!sKPU*%6BL{-70-r>3JcZqkZ{!PFr|X{bTAb4(G$QtNx5A{38l(ht*!M
zgfDE*NGUv>k8cq6HMy1^$L?;FOzQk7r$y(d)m=FUVaT+2q*GSiq3)+SUqdD>9(wT4
zTksD_AvzTv>u%$dc&+y7eh0wa|E5I#e@4MIFw#fU$0SDW4vdsMRWp!@;Xj$ZtVraS
zkMt!m6b|>vWi1*q73gf$^U5>n-GHl)=jw6|m}fQ^^hE0uSW=L98vDdRYS|h94y2MP
z8gOe?8*$E4VUV59?Ao2ga+T>oAZH*YlLd6yWJ@GB){Wl8jXL!6L>vn#uI@U01enzi
z7aVbjVjZ>}6+KLY<3gJ4m}y#Bp@?u(_Z926UzxoA+D+GWS^Z90cxQ*dwrkx6w4(TZ
zVfbxZH>|tjYMwpP)R-2M21kF7_PSnYeWDlh&nvcGbImj&T>O;5?(B@W`s;$5k?{&H
zHIPy9?sRQt8O;C(y-8*3rD`l8S<^Ku99avXta2<cnZtW$YOD|fO5K+ooK_L-Vx}=;
z5j9m%MnQj)W6};@T2Y-%V<A{xs#Q0f)TvZj>0p5Z=GT*pDOL--(nA)-%6cueq44L!
z6=>~DEvR}}(&!ybC)2|zNk4T3Mq%qE<i`LR3xHA-b@%P+8QPgP6uex6a?zvk+SQmv
zrUY9}%jEg(GaN6KCBgsIunc33aVtG1Y3f25ilG;*w~87uF}WNVa#cG|dwa8K)gYS7
z>{FFyXs1wa(xPvyOWg$8W`(zXjdA7n&6`y(Y<M^<?x7`NYq4q|!|Ej2=HhE1{Iq5*
z_ooU9;S2k4#pbScZat#OAFoQJp%AkY<LF%;d;xNDuF`8c{FwoMW(Fzx*>1>R&SG8`
z-(7H6I>!nvR#<7t0&~igF$x_T!4y878r*9MU^@N+>ak)+4*s>lDaY#c(?yTO>#P&>
z(;{@w9;~gfEh%bK=^T@$+Eex{dv2S(^x<weS39!bw7%==yVYH*A8vmv33J%*8X@d{
z=g~1!1*-u?x2<oI@<N_ECU8D@ha_oY)P{m*{ZZ>2q!NbAud&FO4K9mam7WxPHtAqz
z#8=Qkw*o78Y|FR<GnS0opQ-YKeiI^PY590&fOERcgkBKdwy?Z@<<Y?jj&&U5GvRgM
z9Ha^kt9+@XLMjJNqh1A0B+~|CTi11OTeo?M#edSv4mDO9^)zq}AimtTaZ8fczn$^A
z0FENYJg^Jn+kTD0nAiVV@mk=jUp(;55N2ELLZ5{w`qtuA|Ld7&R?8Ly9V+9?O23hJ
zLi|SHL;nVb?J!pLRz|7T5W&qTxPoxbTBBtBWZRN(Zug;ZI$1N|rCo9-7Bf?~nptDD
zacN!OrIx`;yb?~CpPcNlXXEkD)&2}{t_Xzy&ySCsfNWX9_3cU{exuEPIOQZqA+*$J
zsJ=N-j76@(&mo4>F^Mc|)YtVbQIHD%;BY40alPbB0amCUV9x{R8Y|f!lROLYQJ$+0
zJE;#$!FlkyW5t4gN&MI5)9V@o^67BTPeDl5D$QhvNAki+xMxWZ%kwqxO~K_UL>@cx
z@!4>*vLhcJ#8{!F47@D*NWbI;2AmxQ9e`Z6rWYY%h9|P;hwB?PfmQ)3BbNQ139qqi
zjUg=p0sW2UwG>5K`Y;nbQ4F0iXs@aP!&!FJ9HN1Rw{@}mB}F+)drtTPk0!SR<eE8C
zu)40#QUqC__2B%B$m`!M@QNX;v1ag^@Z0RT-U3&0n~wkbb=O|A>6)u*a4ds_b0cO5
zHxcJqV6xo|c0(zfaZT%2N4}EhYu5~NuvE8DA@ca(lA9ZT2E~{rCkH-HF)U8Z2H)2+
zL?ItFY+zpub2CiLXla2qmE<}OjAA-nuT>#ryQ3pm{7EZ?a45A*!{)v%W+k^->rK;)
z3$!y$GiHas)x=RM$qT6)%#keZZX37ah)HGyN2F*8q=wbQSW4Ww3G+?dEc<rro6}t|
za(ai<xerTWC0#Srw|x}ZaiO+Z`gXp0Gy68#o^n>sav^{*99hPVLCFo5M+(ia+1be+
z%nDaPvV_Z~2M6twFUX?4Q6AWz!mJtxF-A=~RDenKfE#}%okn<=rW0};n1lBN%95El
zi(Uyw+PsFhfO~uTZoU<vDD4x<6@j{@l&EXLzl^c{x+_H8JC@b$URIY{R=2UEZc9hq
zHlywaWubJNT)Xkp|L~_TeQO`yt#z#vQ^Ix0=<bb)!(3?j@r{b_Xwx-`vo%<u#a|wC
z7{8TI=YDA0x&xcxq#b*4@P#vIX$m!#t&zu<)OI~D<8n5w^13rs2W-4fyf@>N+=g>S
z4Ko4Sa88_;%wUo}6D*Nn!?P>fGmyS@W=I=On$nDbuI?K&HVG(`*tp@^<hD&Wbh(-1
zuvKxvMTd-~9lBC-<<SOI4-P$&{j-)ke9Z%~>ocE7l6l2W`!0UFA@b|9FJnGiFm|~+
zDzpl5x65t5y9(Xq4!;>7uO4!YP?1<%BsQ$OZXG*<o=o4sKvL--&#9$X$_()tBb>Ia
zTZYXxqi$JKJcB(FMv6bP*0ZW9N1O*os}gj)z;;zwz~m7VUIPPxdu`x?^?1JpgCRT+
zoe4$oF}*)CwMU|J>d3R3bfyd)9ZWgecfs}<kEN^<jpU;3U*(n%&%P<xtH)8%)wIJy
z9DpOn1e=ceA!~`X=>opOdAsWMpc-%AU~Pk;T*W>8IK4?z`wJj161M{Hm2@<kd9BWH
z;GCM&!jaLz3?eu1pD6GC$#%R75fPKL>4r-en%0T?F_oq~x&lHISIbBK8+ARqAMfbe
ze5I~w&G;M@UpijX8Fm2#72Z>{*@x?D-_@d1v}4b&Q~1z^Xw2-wRsxQe;K{o0-7V96
z$I+*uR8BMw?P(ks>L2C@Gd<YY(?<sp-v9)Xuvz{#A>YHhllcHPZ*)BvDO$b=r9lZ1
zRQHIwSE{?A?)B;(SNC?lgA?j5>qAB=E$eGS*4KoruL)US6S6+`Q5jwrvc9hB>q6Go
zRefF6*HwL8)z9zU@1G*^7N`&-=~SrJsk?O}c=uI|#7y-MD4+|~F9#fWFzq^}a^mVv
zFw{0P3-?NuPjUtgBPwa%UQth{?Hu`F)wk|`Wc%>X()R*z@vzFU?p8gbh3dW5%UzG!
ze65#N&$9oc&&isW{eSx}@-e+p_8(Vx;~$jx-S%JqAG7}~BI(BSy#8T%iVMNWbB-5+
z+BrDf+cU^sM?S&}4P-fuNDm8CL)rLwaaiOQzEtvhkF_TL|9E`Y&HF!^4m*nUN7Lmz
z)&Jw@^&iTPU)S;UKcv0AZo>X>cJaDN`-AH5>!VKT*@a#^a7q5PfZDQnEx;CcnlP|k
z5T0W3--BigHw~H>#BJfd(5n@0X1o?yha0a2I*;TFR!}t0^a`qr%-M=<%4P&`@p`uM
z9!nTA0$>v5?#+xassppUE$iyTc_8d=oCn5D$ZTMWmb@2qT@*~qx7(do;%+U!kY7hj
z_TwGYv}hy_7iRS2=YWT=YyNq@&{g$!)>oTdeLpXZ&4=?sso2(ex9mdiBsq}wf$BHk
zjh63#|K9ILlPKLYidrmxh$LP&Ptmcufu`Y+fn@({pJybuUE6h)>m7AlK>s!88J3~^
zam1c#?ThA_e>5F-T-YB?m)&F3ABUj%J>}hh2)Ffy1mSPE5MCcue?T|zx(WNknd|E&
z?GI{JuZz0hV+wSEx6oaI;RR<^P$OGfuLaoR-)or;CX>T`JLShPMC}E;S1aS4WM+3$
zJeeBUlk9J6N#auwx8mC+yOMjFWIV3M`rn7t*D|{Vb+2_YcVR%sG+z^}!~YAB02U>0
zxFDrnt(no?yUk>>xv33&f9+KUs7m{^F!O?)nE4w^7n*z01|EW#8K<3Yt(*y9kKB3A
zQ;C`2Xma+tC&9CREXwNPY!g;T56%N&OaDADmeG+W&jMEP;VO%?{MoHT(6oG;jJy)5
zJ?^Zo7Cx+!Y>iLT6E~R1pC<qQGwJ*tsQlCxBt2q5ekM}%?Ld#Ho0k71ax~=A{u*7@
z{U`ifM|*vIS^0iE5WKY)XXP(e`PTlKmEWoItvyRCpMGGT@RStb->34eJv$5j|5o{-
z0{FjE`6}C{|5=r<j$hdMKUMkk9xYu+@>PrtPol@#pSSXtsC;YB)XHCp43DT)wVm50
zhOf2vOKHEAu4`1jOga0hNycMAS*1ek0mD4vVkbjAZ3JUHqGDS9H$b>YES#4AQ83IS
z=1j{^f-xRZJuUx-oR44IDY!0zA}j*@WK0$uRSNJv<a@-t0%#$=&-uo*{3Zz8BQBYi
z|1{@YyUb2;X(_?iI2r2cGn`*FE&penzi3+ib!CKS{<Qq}asJ|I`G3#(mrl$7Dd%4_
zEx$NS_${q+iZADUR4TY=A3YXv>~u_(-wS<+MO<=MhSLT4#v+b<$5j4m$SW3c<T$2S
zcpUtXMGS=%XA4{8<G+E=v53RZF|<e50QjavD>KyJZ&H55#ZT4lZRk=g;_z)ub)C)=
z)~mYKxkd10KRt&_4vdY}i{R9OvHBoCN<b>J&K=}6j)Zm4z)SeK4T;3!G}V1Q?OFJ(
zb|)1-rnUF5gm}H}ov|8H_YuX<Y3-52#`SvJ>$V)0L?_NDA<On6$Yp*S-bW-ysl9du
zOV4zX{O*C|3dI+s%(>(E6Y^PpoT^eC%aTv1_AS3Lru+uMydJsLD>*4r#fZ}HX?zSq
zffY@!r@apswwI?T)O*Fp{oLH^mRsTZm|6Amfj6eToYF&y4R)stY#Kl5^Xa&}%Y%Y!
zmOPtxv3%zInCm&}dItP*0FcHJJU!Fp1-$(Xc>5Xf%TP{wMDX-X7s1oBLC=7<p8?;1
zyelam@bpX<!Q0P(x1RyO6y@ZB3EmnR2;P1My!{Lqf~SjM=-HrWz}U~gc^UFbC?D|j
zOc%l1&w#g|0gqP?@!@(2PtSA_y!{M#`x)@0!z3EQZ|O6^+s}ZvpWD2MX#l&z%V!Ne
z7flK80<V-oL7BHG$Wg)$%@Cd{LwM&=FDNF01SV_jG-pos2#KpiOrn8K*upXV^O6@p
zn%d-c7fL1a&d?S`3_G++v{h;2rt_B5+gfM0b(D~z9kPXKZOzaIjnX@`dbCk*x8W5=
zUK^f*{^2}eUb#jm;HLGzM$zPDf9gF?KWW6?p*4Vp2Ac*SNoX=^r{5@=G$^8!OGI=h
zAQSwHAkc`zH=d71a0E0)Kx4$F>Ad|8dqpGacozi4B<^vN$9Z_Li1Fvk`uFAYQEFob
z8PklWP>+5;+(B^yzONgqYV`pqoin}vMb53BkmfA#Gfo`mJ)qL<Dm|$tTvqvMmA1+a
z@Sm;@{3F?0aRtHCnddm!CN6@fbVR{ZIr(VP5x&%&;N1Bn1G=a<j(^>*G>r*oR6V*;
zy%ATs0qIItn*8yH6+M(q>3Ub1{O=?9M;Dcof89t4T$Hx_Cuh9*=S>d0$$>XH@CI?<
z`Nj>`{kA65DeqM$fc7o9AN=hTRm&*<*YtZSZ(eV5;7ty^$$>XH@FoY|<iLLs4(PP8
zMGHM){5u^MTlQKV8}o?g!{REWCuvc1>MoBs5f+tpIpE%abnPB(irSwN_Xeo_s=aE$
zL=))xGNjeQYz@-AW%!8~U8l(q!+*6}MD}@g@h3<>_i{j-3X75)mZ4BS62Tj?=fWuW
zdqfiD|1u>wZ3>+*PMrb#x#gn76n>NlQ4aX0!=l#o@COY%{(FTu8y0~aRvHyaR6k(q
z5}xWHZMWAg+pEHYB~9r(I{s2Q+GG6NP!4>xIVNAy|4s0BT!>DpASAydIc#AfJ#}U(
z0C?~A4A(|E)_+hR_<Xc_HL`>81-l@R2ax9aD6h)-rmB(hR@m+Hy4=W)mzW=@KgAsF
zV|s&DI?{*ouY;Zt=pl59o=5kgwYMWlI)eTj*z41I_3U33mWT(-1ef2S%E7<)?3MiW
z{!64pytGcNv-L&xZ%Y+m`I70G@q^)vf8i|nb;0=0Zq3)%$*_b!FTGemn1NnD(L1pf
z{R^Ofs{Mz{xIY1;xxBEwH^4qFtp5-QT{ZYOqyGDK@y<N`_Y?V!{tJ(|N$Ee?hlyvx
zNBWL*Am?Xz%98r6%2gWrE%A|lyX%uae|x(A0)L2>XjngCUsbsv>!Y9aQP!Wr$|Zgf
zLe`JmDRk-~+avThatIrWB%dHwTNagxNG=qz?Sbs`PG6pVM&Bp(b1HZz+kf<Z(C&BO
zo!pWm?9bkFm+%53^oi^zO8s}}n}MD!irtp}N6e6-@8_^b_?z2+-=YuxsTTG`{Pg&-
z;{f`j`v4C8n+lFediHrjM2{$egN1*4Pe$5@q^CMxf1yu#_!l*z{{g4}us??$@?T+Z
z#0==vzo|2zkNuZL5Bsl(Ll5`^`f5#o0li?4MKAD2ZF<2!r03>>zQ`W+zV*3k?_BsN
zO7q3=z5w;lPyGWwsedeA;`aon*<Hx~{=YxITxh(SZhvZluMRzg{ss4fyOm$+<P9I#
zUu`zX=ELI;>1$5Y&d~5E{WZ!%Qv`377y9qotbYoYApJk;lm9*gd$sg!X*LM+nOw_X
z0qAXn{Rp-n57`gKyQOB3i<rmIk0|V!D4uFR!ul4&_%cTHoWOVreOPMx{RGR)rst|L
z{&+pqzFs8D?eVq-nY9>i-Qx%388u6IyqSty_<bcOEc{=g@vPKK_<V#9>8hNU=`B_E
z9yQCnr6`|@pg#1)!mm^L(@akpDTv~iH)@7`#EvOL!pCxDW%vo#F!u-bV;GM)9#Q?P
z&6Sl>`(@NV*`FD~E&%GE<@(-OE*!Su(VtjG?lXx62FtfL7p9R~gjM||S&a85lASz=
z<6J}iz97(z0Rk7LO)m*ZUW*or;+zOIXKwU}FJk=sD)^l%@<RS6SU!MXY8Lrv=p>p%
zDuVv|F|gyJ{==Wt=0q7RMTB)pKk<j~mryzQxi}YKeJZ9p@Tavo9~?gTP4JK}+7E2s
z4xh9h{*>%H=M_7wa?;OzKHf56{UrYY{xN?k-U7We-gg6@`w0J2YX;aq1gRYL^;}Tw
zAO(YYOaD>A<t#7Aqc&UW=sWz|2I%uqf~DOH;42*eqZn^kK9v%GZ8k8)X=)GhsoR6`
z2;*6R`UiXmGG&t8KgN#LgJh3kfpw{MY>zZP;IlHsUlED(_*k2bkbS;ocKeG?QGKhN
z>~D+|fZ`qESEfYbt2BTQ{x=iky<|b@a~sg#GRJ<O*(#Q1BXInaYx!#*LDrX`(8que
z*T#Dx|44-7OaBCwn<YWD#ewohDrflv31TePr&3#=o?Z@ke+k#6a?+=LP(SY&@8xnD
z_-*@+Erh=Kxqr0RA-+;g`r)_rgT`yfdyLBHuS?z~Zh1qU9C?RW-lbH|@-A`stK@I<
z(~>_Kypn&d8R7axR(&(d>0(L)zED4=>jSuj@yXGLQprB050Q*t@|NTe{V4ReB;N@2
zN6FWKKV<SmC&sI-!T^8n`t_o<g3^e$iC<$^ef27~r>}&CFGF>4B@FhD{)`F`Dkx3-
z4u-xH{_^9jq^GJmy{qQI&$9hPpBLwfLSJh6;@@zZ_+?6d3Y#B6hd<Cinh(&PAJh9&
zwr`*wyF+{p{)OasKl)>pll*ef2k5V*&v)jyvxGp|tBeOUY7hAD%t>={%B6g)`mtO&
zkvcZ@s)Vo27Ek?#{Jm9wzgK`+bOqTTygSJA36zIfKi(Z?|HbLAQGd(0f9y}#o=aHO
z1<Mb8$rTrq`LTZje<HacIb5Xwjr9%jS}`fy$hGJ{$2a)JUZlhRdo_Xatj;Vcp;L-X
z-&eOHK8muvOr7~9`;*B2NGanxcSnf)%X=YDwv8aH>Ku0VbJ91<@w-}FRRv9;{?dFG
z{HO7k&II`9g35E@NL8-Pofh*^pJfXe>i>L<SGEMWKg3L26dycA{y6GGg)ytYgrDIh
zeu_`a?Eb3yB(bUIP#<Z~AB|KZgXsTP4f2=tN0q<4Yp<e)$}xYrs|wEe6|jub6G-pH
zNP+>C(~Ce)Wl5Rg!e13D4U&LZh)-=5Rk83C8C;m@I})slm2kRP(yLX)%8G4xl22u@
zgydOD^nl*l%HlAk%P0+ds>+4wNUI3afnC^JE#%MgA^k%As$7)g2W}7c1b#$qen5VX
zKKz2?=~zrm(Qq@>tn@V|^{tr3bI7aqK9yGbi~jP^&in^GwTt9<CWt=RM~vn>B(f8d
zo?7&u+PC~8+J~-@KK_d4)81Gmr2xO`Ztm}b4^tB5lfTdyAI1|__yuu}HxRD{N&h$=
zdIs$`N&6Hk{;1zqQUVV|D6k>EkjEo-c;w&UZ-QnJ0AdWKK8hb`ea1inS*)*y8D!D1
zKO_FD`ALZSKZC!G<iZX=(Er+O$<+V+#AT~3doT2-wOKK>ZQYhft@2n-%9r^Cc^YO!
ze!xolQ}A;tO!A&O@gC+c`Qy0DAG_q~A^ULBuy>}Xn5<`w)m|`%gG6F%El~PMzk<2a
zl5+=xtC%I64i`WCiOaqqv@!O7i_KsZn$I$0|4|F``qV$JV0`4CYenQIfzKaj8ByAl
z!wHWOt~7@<oR-~VA42JNN{>CxS^#=^J%<!p$%DodJi~uG^JTaHuF4fT{^Tt5HHUZy
zreXEB(4SZ3Vs7|p{(OF}2q&i0Mf_9pxBP!BH;20F_;<6M_1kJMk}EB;{T1^+bOHZg
z<oJKUSLG@=T}u9e>c2Z$Fdn$#0Ui%H{1MK05DKBDYdk1IKuu}*bI2F^fe9AbANa}R
zfs7}j{xUTll$Ft`a_$e02c=~^KG1p}<Q3UpF6Rg5und7TjR!32*~SA5_wMl^mW#!z
z2|xJ<&=c7g&hzIOPb%F08uZ5U=vDsOiuciexZLvRcnRRHy(+%2{C5c2o9Dj~?YjMU
z&=)g^9vUB>BL9W>ndD3Kp?_$f*DFV^{ROZ8Rq17+7{L?Nxvjw}J;L&eSpHr2ak|Lz
z?|NyG4Nv?IhA^Q}{v7?U4HiWx&HkSKd$fq`J!JcLB|q{%?B8ST-=l<I(x><Xdk4O#
z8Its=`6B2qQvAbw5$!FK@l%NWCE6qZ&;Hc*|4P4X|L@W_=wqakmsh2Jk^e`33BT?C
zE15I!=UBf1J&r#gVSip5lltbP{=k37GMIx<ee&<{&-MGt%JThpne_j*|1JUm`){x1
zzq!8czl$9IJ)^%a<M}p^U+}kedoW+cdKAaU;Ah<)pm(JYWH|hh_Jw%hKqktrj_vJi
z8I5<+-W+{V<D2xqh?k3aex>|5<e|#deA7K1Kt91-K>1_vgvPgu@_c_=A^hZT$tKA^
z1Tp?ge~mQi!~c?I3Z5f#c>v9mzaR6GJTRVw<ah%4pxlC&^J7Z;18jeU1pKhd86V`0
z<Eglmzb5_7VSfOss9W(0=A(?C<jefj{j|Se@z*e;?D~0pJ#_^5OG^nQjY<OZL2ss1
z`4fZZ3$z}_{)N}Ws=@!T!+(|sFRw`XKps5*-QjL87?%FLljkcK&v^9Yyl0M}y^xGQ
zgLZp<_8`38IemRGmMbE1I@NfM@ty*h$)x4~W4S6+AG^aYH|OS!4_FT-b?8+452zn=
z=EIN&;stUCxS-WYTk*ezKaiUTZ#P*D0O&uhe`(^VHC`eO{D{EE)-ZnXzut_L*l+z{
zz8}bio%*nkV5VHg^W;Nee@nAPXn1mMjHuxCmkQQj)?uu_)S5-yp8(01*27H17Xan>
z2<<I3D`FK$lg|^UX#QPRBn5^vruy|}tc=R}V*F(hg2m{g+KW{XJl9A0Wy=5ad~)GJ
zjt@?df|0#HkN6h-otnC1%}A%RCpBNP_=Wz@Wn5GR>vOLDvweY{m`&eCqOVL1sDS77
zhL}@dt~c-mjDrNk*XVBv;pcRuk?4(}f1OM(;?JNNE$7&W+6TXaW|`9-><94;>;WvN
z^)HsUjF+RtU)tY-G#{}03w?mTV21m4tAD6ZrVCxL`tPa_e6d`l>~|Cal7B*bfm}Jr
z<=?Ml{i!u$dHPA~g(M1|zrx<J-opJO^2wg-%_1JpI1PC&HDlQLQT><vD6`WjC;8>`
zBkb@O@R3}M=Qnac5%YmuK&B<V-bgNj4&kEsh4~}Luc-fCwhzcZK=h8N{X?wxV}QYR
z?heE=&~Gyqw&nj6rOTr#Lg4vk5bHmVe5gDcP!5XAmzrgcd|AJXcz!J9&HXEbZMftu
z^_ls@@(xGYS5pS)r}@TdE|l{PTfa|3ehbT3zaOy5q2IwwSo%Y<PuOpsKfw5a`aIti
z>!ClekIOztOIwpNUMBs!FQ66-P@m|zucXxOFU13Dekbjj(ysX$@aNfYuVDJ2t++t{
zq%0?S^L4GjdK5{6F0xP3R}MF2JPCPN{-ccj$6DKeggG4+-Frpwfgh0opY(_qCBG&q
zIm`Ze;r{!RdzJL+Kmz!s@53E*^LiZQ2YnG_P8|P29{1<GYsr~LD>THj)#3xD*Gm}I
z8b`vU@Av0?YstN0KR8JGbe~#3UwcBX*W91;`^Y!0^<x-8xxQb=#0nQwiR69M!uK4e
z*}!)x_TNAGiBDWMMdKy>?-w%xA_o_|5d72qxggUMm}}7!4A5A$_AKZD{`+$QF2{Z#
zxA#!a7kmKR!FqO>^=n+n^=9J#uVjDOK4|?1^!Vh@zrnzf%V{|}I%(w%?Q$huN`qeV
zM`svB%C~B-+V7z*fZrDV%rD^A%@rK)SnHvW?gO{p4p?FumFLQ3{6|WO^arp%L;PMF
zk^H|8>sb`8Grz&Vqh`<xipSQTk^Dq^JfUR%bGh8FS$pQ^%#T>E&BOaU<WI0)@&Pjx
zq*!Kc8Nvdzhw}y;uXxma3-~DVw&i($jw5jq(j~v^fR8xVM*52SM@@f_<xlN#`+*QS
z<hAGSVEIJ%d)qjkirmTN(fz(QYJWw=5pMs{eJG{<P(p(Gbvb`I*_W5_!#)pWDtL6~
zc$4FuxpD)B*8e8FD4)aeo}B-cF+G&Q{pb4g<N`0jpY@_V5aRyZ^hFBji#qhd|JUVW
zOi#q7XMr?-!dFl9=@Ay7oxWJQTRZKE+2xB!|HzTyvh{0zLE0!tFLKi0&nnYfLGeQ+
zQ8BU=>B<V4uNs`@^0^f6csYGUNBLY@f0XcCzJT8ilkgstFQB(_UFA3+W7RK4{fJKb
z!{xoj?Ej9D!yx<m4)n)5Z$nw+Z_LsHeTTMK`u-HH9~ZIzA#k$ikC{Q<-wG=G)pPUm
z?BV`gG3#SM>EEOKV2>6)=wB>X!t}}Uz1r4io%EUISHk+r>u+Dh{BXZsLi#&p>!YWH
z#=}z5N7(m)%rfbZNPZ6sq#O9P8#&(*&<F24Vx6w^|IvNGX6bKXf6)Gc%w>-LF7lv#
zztcYJ=Tb+Xxjf*|gZ|y0D{|}+^dHD9arC|3gLJ*4->82y2XjOZEc?fKC#KiW`dbbC
z53P0E=TWna_mdoZxzy?ZBE;5aI79(I+cVR9iNg<+!@hZZmHb?ew6BC*8S@kLKe`Xj
z(&7)(U+3_H>95V}KkT3BW&1~c_`4vvTc(fwODIJ69eNf!{9yjM<O_dyAhXEDAEMVK
zZ^BRf0!56U>8o-2kC$r>WQ@H2=F5Ni4E&5Z{A7A#4*$^JflRg2f29wkPs~527cZPq
z2I-&D2XZKmepWgCW%?@f`j;o~*Tlbar@t)EV1a(d^7<d$AIj6uqgXGb8Mnhfp2<r3
znZDwn(;s|^B3Hrkwf2XPn#BeB7%ZX@*Cp>_TmP7UUO9bY{w#((=NHghRN=Ho_I8m&
zFWa|YI$ZY8QOF}S-+?Fk%9y_S%D$n0Jb!c3ldpfe3>cUm(ys~^J<EX~>TAoB%I6f&
z<8k`K_JHY=4UaFI<mNIx6+{oyACS{orl0KJH`fiXn<aVv*O!yz$?>Lont%4?Wb?9o
zImRb7TUO&!hq+)G#38SVt)j!6?@EV}zNo<}7WMZcz1&IfDnq*QGP`^ttaX?dFLk9+
zzN{eKu*5EhyjPhe^J%=|Nyp$tfM4L?t1ZL$R55=!$}5Oxsii2t>{7e_1kpF&!8Z}s
zL2tcX&h#yJ(hn28OE0m@W%~;nk-_cvSEGChMb`3S{Pnd~lHwuI!wh4328#i|z@g_{
z81$CRCk8RT)N+(x?6g-;?O(ds#)t7FzkgSw{E|g>x$K{t9_pV%5BIOG#)en@AqG1A
zsWt5K=Oz6l5EuP!dYFD<sEdAw9;SaWM+b~k(N7GP@*#fU(2I3R`W^bDd<?5tF#R=%
zxYecTCx$xo8!DherlOx1>Z0FG57SQ!bJ6e6!}M3%(^ZO3i2uY;hyI23behhmLBAaO
zB>fIOOn;^H_rM|bqhtXw)S<tMB42qa`iY?~`rY&}{lqXyKk2)QHwgpNUqi#Yyjb5V
z<{L;k^p`vIF#U!--A8|*Kl9yb;HNL-sH?Y(>~R4x)S*A-(9i8hF&d~#*#j|{+wUiS
z6;rq+FK(ZY6S>mP_LJfl)6eaVEJyz<=2QP2{>*pUllE0f43qpJeY?nMkLAJV6J&dm
zKb0g<$sh7BSkJXCrVrzvb=8-FKa~aLc)8rV9%TM710{dRzG(wSUEDq&x0D%@Kl6#f
z4u9nOECaAUl*CA2vj5f8e`dJT{}@eY<fZyg43YiciuTKWR<URx`YI}kAzXiCImwed
zDDhte{6TzmL6yI=xG=ugr_W~rZN=kHQGAa%fYl)E2l2a)<2SWFK=D4MIo{{;A}=YB
zwSI7au7v0J#cF<!_@IdRV~xKbFiT{7K=W6wALjEPPW=d{OK83T{$l>ClfbO`0nfkx
z1R2yQ%B}gYGhcAcZ{>WUjrif3FSHT=ocSwCao&vafu5sgDF{>aa=IkU{X6q=Sk7nb
zBztK5eh}*}`zzWgzG3{>k3szi_m}n;L0@!#WgEpqQ<1xAK9BNPgxcr$7v%>swN89?
z7W0x^3x@-|-hlR_`>PoLii&Uu^(%P(WX0o*e@+|eztx|QnI$Zr5;Y&I%PnXj1?KtA
zDVh(@<uu15Tz{dW?@p%gK*sOHGnY{ORjt~W^ZmNqe6C-q;;RFh8XDeJyy!!J_;iXZ
ze)Lv4@r?$3DtG2XXy2EMb9HAv-ISNc{7CwD6`$Roi#hYjr)Yix?ppc_e3<Xy(1fA~
z@o-&k5!1Vn=TB5_EU?S#iC%X+M)QTJE54+3`64FBj@M#~oca{M=}x>?UyiiLiRU<e
z_T|N=h4I?a9JZ&SQEVT~4_Ew#`NUDPVlBzn(wD;Z2dv-ldbLwulJD?Svd>A==!%Qt
zL(B)G`#YTV6X@rCIX@P)(B7-}^7@l6N70jl2Ypz7sj%aDwvRb{zL?wRw4c`l){?!j
z{$B2^$8i0*j{ZY_z*k@|oo&2-PxE`6uZiw&SM=S@`svF_aajF{?qAnN9Ao`I7p{oz
zU#Z$-dx3tyn_Bk5^p){}Qj1@8xizdGok~Bz?+S9r4!<v>-b;Rg{<_>MhHqE;kND7u
z-|DY{zpkZlLteY?MY_fvA0d9OueXY2{L$*f+gK0lFb%^lUqtas+)oA;C!HRt4#NNQ
z*V!s4jrAOi4^Di6@!OY^=BW6C@eVuW)`j(1tQYx-K)D`7^99vB>BCpIqI^Mw%6XOS
zUBxI@-I4l&@uk7vit+|(fAR?6YyH%rT1q2chX3fqleGVbbil`TVgS>BE*yltt+C5L
z<wyMrFO_qDCaJ%bx`8y)HyI9r9xtVtp67`kO5w8Usnv-A-2No-$A}O;2J?&XHR!|u
zruRjnPw%wtx7Gvt3@_DZ__KsxiHGn*-mvF2Hhg^m>GBSM%l<tm^yvQ1HX09|^^<jN
z<Um;;dA_&46G;ir@~n>#|LSdiMcVE1dXEnOv&u;m{q<g=-+}jaIP_?M_qID}@<;7X
z{}wHWzIL#_c96bee;`laPaOe#tJ9weIv=pokq^@O`bGL3ThH{-pWZT!^pp6<@@4*4
zL@3Sj1V1}WpA)}aM)<)$>O1+<S|9PB_fHMVn_Le0)%vNwjy}a4{<FL){e)2J2g!Q@
zAMa)TVEyy0vhmkz(EoPHU(})X&7--Ba(lf9{;$JqC2`2>>=F2*IgY=T{-T1$Z}t}$
zKaZMo=h*%a`T_Z{0Hu5`3U-*0l@>a2PO!ZY0jz(pC-P^GzJ#GaD;@uZ^L+Q^%EC+^
z`Co2tURls;U->UP9^rJQ<4-v~cbY$?^<&mg_NN`H{k!2`x5B@!YpX;#>+4fA9$c>I
zf&T=5^7Q5GaxdhA1#=~zbEH2mdw-DDSEYZH`bPHSviFB|q+R+wO}}V;!R4<YFY*s0
zAjdx}V0&eGPllm?1}~Mf{;+*V9DP0)1pi(3^WsHl-{;s1?3esK#oN|+K>mu_Q~nC@
zUPoWg21!5W*yF=jq`xA4XMJS-D|hT!%71qMGa+EVF8^{?2md?Se@K5)jr2Mm53Tsl
zk_V4JUtJFRVx4GS+7sLF<&OQ=hqVKl4#%G1f9i7U*&kTr9oxHQkC&0WTBUt+yhG{s
z^^QG}y}0bT_8Q3NJofnFA=G!-<HI!n${(-t<h_gh&E?LwMkdOj4;@mT)JM>NG>7^S
z7|Uy#{gS@8{E-?j9e+gj8*}U%`hL_bDbKg>(pbKIpT|FqNc$&(+5RVwz#m0)B3Syv
zF!pB}{8Y~N-%I>k>G+3w>aWW`^%MQItbcMmllny!l)poHgKqniNuEzbL6v>H>uTsr
zhZE1ABL8x^JAOxehvj|ZpNj9uen{Whe*U!>{E+L%%wP5=MJ2YsfxpWeFCHv|{e%kq
zPjP|&S?|oh*uSi5BYZp`Vt;gnGauvlZFL*XpLo8*{-=}mb-f+G`yKr}Oa4a2Z>#Kb
zJ#YMk|LFuemwmwAtTgnA`~&fm<;VV`-LdyYVf4>mp6?$Pcu5}YfBK1E0iM7+`x#!B
zz0mmVbL<iAMfb0F;%$t_(fwEC#m9N$(K)iOILBKNve6e0A$}@%@IS=sr>rmRA5+D!
z_lPt8P0;>Vo;~M}hgc8F^Vht85p~Afvs5nQ2j)M<qXU^1hrjG^RyzKc(=n%g)F=Ny
z{FVAk<NE^EU*=ySzb6X)1H~Jz@p%{dgO(Pclh=dylKeRSllE7Qw88vfeLlMs<FRWz
z!Fo)G>2d6j*Pm;dzie+QDrfQ}{z0O@f#VytU$*y(2oWI1qpcLLcxiot>yK0yL0(=e
zSN4zk3pl<};}y{tA$X>b@ip)Q8so=$at+2K6>-z~E{tE59RD%@V1G3jf9g=5FRX7+
z{GlVk{l|Qu#+*C%5>-s|uNu>c8`$jSEKb)X5O2HE^X6U3h4{ne3m2*xAEyn|!~4}j
zbP76OY?%6-ISh|9)~jmF1q<v64A;N-;$>WpKb)?vruBWkIK8N8k*j^f4Df!pg}=sJ
z+`QOTUSrOmj~UpQRUhvc%%gaQ;el_l=`{?bh^N%vV$)Yc^9kY!r46$jr;6z6Wcq8&
z%1T7|W0WK5tEyU(m%iwtrLHvk&-?9Gd+7hgwBPQeE2`xJbtmJ;`O`({pFF?H^aK{8
zKRkp=`eO6vBTZwHOh+m!@#xSWt*S!WqJObje$hop)0~3#kLY|3FYwCqIeOJzE|K2{
zK)h6yYilE1Q{=RsqVp+psOR$h5cDVS{0`D|o(MA?`F<7XjY7ZLke;IVGm*Y0Bk*-g
zT;#ORdeFZ5nmDx&|NO-azmAGRM70kKf|mz62!6uD^&<wJRsE=`^M=`!LFYrN89wGJ
z*YW)fZjUzUK%aeHh|d#k#rc<jpX0AD%JVfy;{|k#X9nWsXNW)ZB=3nn#Ao>420q>(
z#5_8$MC)IBklq1)tM`5AeY(>$;Lv%9pE5sq{y#$J(Gc$zzW)M#RS;<->wiP%k-YRi
z%Ond6^+7M*Pp-gmee1lUm*7dhBfYQ4?{DDyBoxo9^Y~QG@hiVi1p2DJ?UnBjko<s;
z&MS>s_(9*@dwo*iZ%6qArLkUT;F;2hC+^-Gs9=HH<=Bthp~`nrd9;GH-!8}T#s^gS
z!&F{gAx@|?<DXk0&RA(GuaxUu+#a<*zk**Nw$lqL#8CxbPvr|M_`o2QFCu^B^HJCM
zg8c8^Yg9l|F8UW$@JBBhz6|hz01bV7;eFV<_tsX3dyt}X?$0F^;*?5bJgdR^Y<t1u
z>8*gTuV95Dc<$e26=H*ee~{qI%hie-*29hfzM(?gt;#We5xtd2^7V9ByL)eAg?I`n
zrjPm)2#`JV1$&|P)e4OAFXiRZ{{Vu^eLh1%BVEn)7uji9zt&1WwH)Q;<(F7#RlnX&
zZ{_;%kMdIO+x{5yd)Q+I&6k;{XEnxW`*-w*$~i*X2$%_t;iF>;UZu-Vs5IV}EPP+L
z&_2i>9<cP4^p~8gmB#qX`g;cWh<_6#e_C&b2cmSn(qBWRS^uEFohr@phW_qwmqUNY
zRGQ@r{e8e)4*fl0rIr3ps`gnwp}%oe&S~f`UiFal5WUdfV=6tH{!Uo<2tV`}>%LMy
zX3^hWs+{`+{e4)aXVc$v3P1M``dh2uXVc%5DxXb%A5!JqpZ^5<dj|a{{hUpI4fMyY
zzdQ2N<)jZ({vXlbl{4tCtxrmS&nSM@EB&R>oAsOY2l{KMG|L<M+vzTc{_ap|mM`>o
z%v}!seZWd9{XK29NBE(?6aljRP#XH%?M_2~kE--+`g_X4r}Q^y!Ox<<8&x^?2l{)D
zO3$XhPbvJ|Kj<$-yhQJ8`ny4u&!)e3tMdOW`b+w9KK&Kwk6V8`^V8*|5C2j9{qhX@
zYwMHJ-xG?T^-6!wsI=vODLN&2vAm(bhDvie^tV%`S-#NU9qw}I@0gWV`g_c3PwDTZ
zI}QDfD|q(D(BBc2o=tzTt|;|i>F<O}FPcSvaZbzW5A=7JO3$Xh4=H?>{+?6m+4Q$o
z;hRl=Q||u!C-A?dFXz)=(igY>l76_-<)ja@=<mh1L9eWf;)8!y@55Ch6I;LU`QPe$
zSwBV;i0^mN`_*F%52XUUGk?MRC-^=_;rF|oc%90Zi|Cy`p3n09<23YtT&-ua|HSv+
z?#geE@S}I~eTy0J$M#QEcu_v}>KOJXc|L>nH{|gAhvO-{aGjTK3mSRpIYm|)<69y9
zckiX+fPg1{u>E<b#0HgSee_L<dsKRY^dGPLKBLlXKlpwS?T|1(sT|+0y3R^3L>lk6
zeAr4;IbOH@o;!`tZy1Q!?fUrsOG>5LzVQ7Y>*Dwv^ggW0*`DP0pBUunt-ufZUsUC6
zKc#%WM3x^xIr<aFjE>r4`zcf3YeAahZ}cy#%Gn>3253KpFU&_V|CthxxXa7bFYN$=
z=Nst%t5!Mrhfv6_e~RMoqN0#ONAtxRv$&W<$(Kx*@N|(fWZM2MI(lEQ2Jg!_<rJU$
zltL5y5v2XRf2HtgG=Bg$<i+>`{C)>#KtJjIQBaHZOVa;0{`+auzke2#wAA4S`POFT
z`z6-<dU@YF!+fKPz9&Lg4gO(&8T0L+fyZK=Z)3l4k5{Mtcb;#fe?0$wyOYNL3X~Ps
zzhIUGea8MQnVU>Q{`h{U`hFGO-$(!B_?{RP?g8mfwjy0Z-`AAsXO4(s>wPkM9}Vpx
z%^xfhUmp?dAK*XaCH+SP?{BmGD-G^1egBE}1NeJS=zlEd305P?=a10-o%p_%eIZTX
zZ=f9h9txMkcC3rbL;OB{P`;lNsoewpgn~`^?`3&b;roare^pNJmleTV3Vy!|_3?cT
z`Uy9?{n~eXt?y;=`HcDaJ|dzEc>%um-SYi?$cxMIeF3cms>27$UX0TB4sftar4bM7
z*7;#-1oEWyBEBe%^`<K@pOe=KeqZ&C{yyw<{ZrOQ`UL*xeIFF|ZNINY`pNp{_ZWB#
zk-p)3S5^348p(^lHwpR=><#kw#>8{-d!?7=d>;G#HSjBn?@3ULQ$g$dc3$zjskM^d
zq~AH6@%hyE3Zk%QQqZaA0Z;rk^}P2_@%;^c|BD9=_`6GULC-su?)dNgemUSHrtanT
z)c1U<av?ftfCWDMWqjT|e17ZuW)+6C$DGG^8p}&+{mNUFD`xsE{kb&f_1$Fz@AMKs
zz^|xT93%^!`h=pF*2AE%%rD|6+9!T_sekA%t<Q1)-j4d54oP~yqv~h9&inBwhrJim
z{M&wi9{Tu%e4mf`<INQFP$+_+ALBWF-+}Sd`XAQk;15FNADCXqi~JpX6zhHYz4Ci;
zmi*vPLUdk_;lVHZeiGTA{J!W`{vH!sC^drhKz{~zx`=<!7kWRRFQNzUi=#dMKH+TN
z<NW{h`^V6K`rah<hy5?}+pB|enWpm`^!*3;Q=G?C?-wY4O8d7DmO=mg&ik5dKl%O|
z@eTVIUhl;BJ@I`9^}Q|lPw*G(L;QUv8P9DM)8e)J)$j)zcs=rdPqFC4Go|YhPuw3U
zw%7M|Q5vroQirG<^wr#_*1nOX_=(PQvi*`if?g`;@sG<9znLCB;LG->%E_MiI=7Yh
z;UWECen9^B{5M4($D`qC^dGoOx(Dn1L*fUK$KM;n`flF$jlqw?^{%_eJyYCvCH|s6
zwEktVL1TOYJmDjM%j>;Q$ux~Gyxxm)<4(Uvj{lwf{RrP(ep;@!zc&}X)59Nhk^Tw%
zxhKH>g5URqK3M$X?`e|#jLUez`y5_tM?85|l@Mcuk0xP=m$4s8-IM#F4^IES^4%0~
z(s)7rd8iEgCl>!ZC4H7XB@DKws41zp>4X2J?(_H3h#q_oI)GnJVtQWd`U~naz2Fb8
zA6n_xO22^$Zz%mcW~G12N+u3}EdHu+PuX9d{y<-uzwvzfWW>-poxiCv=)0T0%OP(!
ze-R%te~(J~z^~tTd|r3-idFZ2sQ0@kzTfi8XRp8X)g^N_Tr+gnuOF;S8P5OxMqR%h
z1Y7<6kKIW(D84DYI#tFK1?{jYxq!)g%~W4gZtX1;x45pt`n2yxLqp-!trqb=EUklK
zE(MT^hFP7c&YhsI5GUnNs6~EkdF5S&uuEj=NFERprx(ebhg1a%jUvYe-zJ}?)z8+-
z0&DMq>eF*!{eP19r`12I>JLgZ%wyW;EUdp)D(kfRud4bfSAAkhVf~fTnNO>~gW@?{
zcgq{M&vA)LTemBvK&I6{r|LiCs&C-Au>GIRQvaSyWW^ow#_iY4Q2%e`U^A`#2~{6M
z4`0N85@})k%~HRo)!#tMhHIC+ar=w$Tv-1zs1H@2R{xl)e_GYIZkGO9_b0%Ad%U!O
za38CeiB6SA^b`NBdJ&bs43%f9e?rxdyXq7F3hUGQMq!_*%H##=lRKyD2>L%$`^FMk
zepKF=f7DK4`;Q=xOrX#Q98>joNc&(cGuh8)!I+uqN0xFG^(|rfJd^$4bG0+pKdLtE
za<2AgvY!t?elxWnY?N&p^2UE>vY(&KQoml+zsJ@7O!XfCLuYFL*mBwM`(5>kmO@z%
zL3lIOUzL#cY3t3pYVc54|KDb*|EVTf%gw);>@ThSccJ`iTc+2a$^NJrB2kXYvJU^H
zLld~h_^%wJpweC|t+K^)c$lgFn5xf;3mCFLsASO??(Xa3V91R1RU9%#6>-t+GR*qN
zDB-%IO8(R6UoC;gl%Y|5sPuv{S6zC%Sjk_$wC)s+ZI{p1@3~WbsPvU%4hTOzYu7I*
zc%Mv<DZx;EoFl3nD*(Gs$-j{PvG*%Ls)&njP|3&`C0y4+CI4ykACW*~O5vzJd|BC;
zt1dl0jDN&`)eL3H>x1&9+<=ukO@IF!C1Am{`b$-PH9NKIQ(NxKMCF<4H>mnts45t^
z7uwI&=>PfZkEyzJTa15&^@;Bk0>u9Ue`dHIR(aN)+M<j4LG>&@OL^|A`&}|+-QD$A
z)>i-AmGG?GruYX{!KtTd8&Mu#FDop?y%UcbW|LRTEdCocTnRk>=5zV!@*++A^4i8d
zX)`l0JY=kBOcXU{d(55btdZHBT+v$8*po_`=}ab>8Q7OL+FFW=iW<|oY}y<$8v93w
z`i%O<ksh;Wx6#<!++<Kr(oARf^bE$A7<)#{fuU@FQGHRZsBLSRE27DAu{IfRxL5?c
z6GcmkE`JCgHg4QGINaMaXf*Z=4(uE<62R1MByYH;ExXGc&1Bm$do$Ve?zU7q+cPkj
zh-cEbjHZXO13iPyc<j0{ant2jh|(Xn_l>4{l6|AS%^;@<A0^cjO(TOned%4pgDLc`
zDVb;!i>jOGPc!{#p+Bwkr;Yxspg-;OXJwl(wl*gd@f9K#&gmsnMYWl%d1GSbO~z{D
zw&oQLMj{?>Fq)ehj8gngIjqPlv~~E!=PSjQ1|xp^3wo@j(Gy9Y)t@!9$$_Tfk%46Y
zjZH0gh)G{tD{8m2G#G6_(*`Vw*5;U)46bZ2R<<`7O$j_CRswTVQ=-9W4T(ul3#v55
zk>8Fg&A2zs6Gls-Q5*vX)LDUli6g>DG~>m*mJOn_t?Bm5Z}$GOY#=kxJD4^a!7Otx
z7}^R+rp@mD?p@oG^`+j+N84H!h_Zw?T#MOkVmseo4mLq7w-q&->77R7h?&MTX=GK~
zn#*J2L;a9%UH_`41U?wl8xvQWtD07Tc28T2QD>%BC7Rb<j=hD(%&wk|!t;>Vvq!7C
z;f#1m>uO2$WP5H*G_~IZfWg!%pY}1a4^_~g8zY+c7h>5>moF35t;=9RKk=5_AgbH&
z_<<&X?;hUMIQj$evIcI{^+5qrjk(K<GP?$UDHgRhED(RydU;Xbo@`HJcKE2z`>2St
z*NEp*D^PKCu<_P`RJt+KEsWI4D)AYxvE}k2@Sri%q=_os*CwJX|A)~3+~>JfTL=6%
zUMfO=F5U^|*Jb}qMB3gW#0T}@Wn%e?hMeZ{926RIn^%g+3ZK~41}d_TY1#oX8M(2}
zY`zJ#`V*HI_4b&JnY}QJH~`Ywk;a|bbs`?B8)?uY=BVc-9RzpQjU<d-v&UG!KAvp)
zj;5FC+FM1UuD|JPBG!CT!xukd&GW_jrEM)YeN^ZkEz&ZE#R^f=-(+Z^79oDCb)w0o
zUAnkl)0+`6YdF-quoX`?>Y`Z`_2YMzDXV&gPxG&92iLzPsyqH&^J*cjI<){_{QP^Z
z$xO8^(2ZRKYqU7zw@;il!Gjfv*7l|q?d{DYdb#;tU2D^?g?cq7<_dj_uD9u8p3r|K
zbet&rIezk7%wPFwt>pcBH1Tn<pyjYG(wg_LMNJ}cRu|Z_epp082X*a)C{JM3^MtNt
zMKrZlG^Y|xReE&jmwEzxNeH8<p-JxoQ<^IE>K6Q<bads1J>kV(p<kvK&Gkg%OZ7<V
zN?m*pO>5dWMWnw;YwAzLzktoAp?*!=rZ<J^%o-uSBm6yDH1UvtRf;}MT%qe*kjC$)
zV9>7>p(dgILI?w5)#hYzEV2E1-AJ`8_ZVj4ETRE@QA<aI$5*at*NbS=K8;9fF4wBt
z%HiBUBEl7#XYJBN+dNIv*LzClY2ujBHjAwgdvm3xi*I@({eLF{m6|B=X!AvJr50U*
zuXHwHi765H`aY@05?|KEodIpV$MYG@TdDOfO*A#Nw=dMZn>-8J7HZ;?;;kmM4>@yM
zkW7e@-+Cg6S3Ti{nkUlyjIXp+7r*h9q3ATV423%_!jOr6E#%_$y(ns$;zC=n5&*vp
z=-lUEw-#;MrK8gT)O(gTwY084Q<0`NpID_YKvUYNCU*IvP1V|*)@n`sOc!$lp^JRt
zs1{zMdm}B&eWivb?)2cV_Po{Ml}JRI{<nTn|4)VfW$of-L(`uW=<7<esm0KuZ7rx0
z0s{To=C*b?2u&>YVBPTXM7*O2!^7o8w-;>*Y5(qTA2Ek_4`Vdx?(1$#CU^A=_HRpE
z9|~V0Jl@^GlGUMbqbMr9nQy_-&{%Jx4a%BXwI-y$H>7{fi*G^dcLdk0YF?QQMVhi9
zyf&#_RpuQJijH9TE>RMTZ}5gz2=UQCcm;l<>TTFd+UKQ5@j*|xCFIeT>9N$GdBfZA
zBUCN=@>J6r;nP;?^{K?K@Gg`;{221_LkY)=gBXb4uX{F!{J-;Q_X%yesBHe2uDwr`
zenW>75Lbze3DfQwN-ZrC@4#o`+e%A1SGF|A7X*ya-m3%Nmjhap@Vw2dXZ>Qp3s#9|
zL~v11|9T+2IOy{{<QHwF;zqCU8lk=9gYbgi72;99?_%M3msmVFe7}e$eoY}ueM|e_
z3eE6|j|Gv^vjN|UU^F4L)3DxUiRSjUrg&TPiXc9&S)~00l|q{55)r&c)3$rHgkSXe
z#ch5apXA(Gs)H||4ER5!X@|U?n|w<X?eD>>#6JDF`1rD>M54KQWm{YPy~2OMCoUEG
zh|pgs)}HWchA6rR_Q6(Fv`J`w-E%Sg)q90+9dxZ!>u!RlZR^l|FKMwRpQil+xhn$R
zYfECdiC=rQ$2FWd)yj0Sx=f4|`wo;uoADAeJ}tbdL`!Spon@YNdVSN%R!zJm?8O&I
zn_9I?dQ$%bZ5+j~;G8L|X^#I)ENO<Q@M^ktsVHy0E)<9rY2Ogud+@?)k@i)%8SJY1
zOZ{p8j?s1~O!MCb#AL}^F`)GJUs@#QlKilI6pepN==fcO&xOPfL^R|tjRW)*UrXpV
zV<86Awnaj;`^3*Yad`HCKN1QTmEKxf)7&zzRJ&T&|3>KBJkiuYYD-cpO@B?I?XTea
z;zRz%OyWyIUtd!4C87OBh|fjzw-<Z=M);7q)hoUd4C4W74~zPnz9fPl7vlTEyVw{0
zxbVc_Mwd5TUKR*NR~)FQOPj@ki{r)64<W+AfG@lhvE9QJJ25&B4);A&5o`NcMKn2A
z97v|pBg5II8$TA(f9j9)en?xotvf6#Hb;c_gORx{ulmD@fNyT=V)zDKzb~Tq!-e*X
z`u^1R0KVhvzapUhE{p^FS8way-0*3kJ*&lfzAe0UBHC~nG))g}2<W#3!nIl;oC^fP
zEudmF5MC4TXn*bz)dB5sq5VpT79Ah2720Jmt$@f;!UHCUZw>gh`CjeTK=F?w+I61N
zTRf4rCq154p??TJrck2)qbA@hR(eHg%wu>W{g0LkY@+qQPn2iQdIA@Fv@vfqg@uAW
z;lOUMSRsHztcg~y*e-PK{QxpO+Ov`PP<r&3$Mdy_*dv<b{c%nBzFX!6+uAk!CRK2;
zs2w&t%f$zURvC?qe5!bE&GV&^ra94DFbFq}4h>*iFtI(d4k2Pk%%}f1zt*UUPiPuG
ze0>r5QSIkqJ*=zke|bI2%Hco6y?Rv~Ltv+0gGcZ5*JK;_d%!4Bu0>b2c*VPer5M4N
z?kv_m=J)3C8nN(vzRdqzQ1pmJW&*E6-RkpQ@7HhgX)QwkjJJ5dC|`<09>;`lc}PDQ
z6hA15wjS}7v_PGjK8sf=f+j+o#{RaJPifKqcSn}>x6BjTUuxoDz>85}4?<dPryg0k
zE{wSHwo7Szy}i>Traa=`eLW2Z1+Ux1oSsjTor<IfBcXVq!rMMaJWzZE0_%|`UmVfi
zLZN@7LVwPyE%xf~*J!!<-QJqejm2V7eCcj~B=rMPipXPLu&hN?BwED0_GZzHfG{(^
z2N77$;9$RAj?Y89bxDI!v?t*2?Z>^o=r*ya!3c>2V*T|V4J<=Um57S65_&0g>F2$j
z2pDg_9kTG2hxeBTKc0y9rh9hx51YG9xD~OgyMN&ObHi~_>Uo<MSagy2tsZU{b4&lU
zG&~|I7Bq=6BrB?$+ePiXmUgkcxoJeqt8RT_o|u~`4tTbdUDDG2jd|kJy4YHFNqc)D
zp#MORuWDP-T~;0n%=PYC;CpIb*rSym>uSx6?%s_UKi<?kC%j24n=`OU_&%Z|(h_%s
zv>!&oM?|T&Ll3`DhNZkncpY5GSipa$(E0=617iN-)|)naOA)_Z<1Jq~=GCy_TP^}8
z0^X|9;4MA?>v-wmQ(g@#Za*y6E)M7$gy<466Ens=-jd3)SmIEnf6OO7>(}sk$z{Rv
z!2c`}NueW<{%MK-x24*KfEXwXKQ<Sk7Pi~2D=T_Osi&k&uP+XKM(98AN83ikg4XN&
z+E%~VTozgRr+(jG2>tVTqt2)OKkU7CbX3(IKl<JKlsPk#%$bs2$RvbN1DTdgs0k%>
zr3)yAkYpf|gph=yq9Q0*Kv6-kqbOp%*u7S4=(SwAR&4ij?Ojw<Y{~Dl&zu>Oklc6Q
zAMdTV-nwV4b3S{2zu(<^JxLC|HyS!rq&2TOlnIIa8lq*YtEiYLJ%NTY`XYtR!a+Ln
zE?bF>Pp7H_O2S`SSdo@jyBs~HFL10RmA=L%p7d~_GVqnfgLf)aLqk?Rkhwx*h`AZx
zEb#$M5?#G3i2uTzp-jg@^pRIJG;Ft(SJe!zx*?sXd5^H#n!(k}>IXOYiW_~!OMS)9
zsOLt|rk~lTIW(EmM_Il;&BgDd15rb+;@|PU&B&2G1H|I9##xyUWd~O-Q^uke8r8gP
zRSV{+W@V#-cV`QO=a8%9C5Nj-VDuea5^N!Q-^QPpk?*xuwEv7`M%QLo|HLn8h@D`=
zK<<deS#pTHkrIjHvrK%><j%+^tf)_<<T^0I*HKv{>^rMK(~DTP{vGAG&wK~}Xrknd
zvkEvpfwoJg0?v|{p2V}6li@dg$Gy4D;k#IV*ncBBhFh%=EO3PnGiUfx=53CBoXt~n
z5R&kZlAw=jE-k|Pz5W}^bcPpmS7a5c#^YaqBwYF%!ykpR^f%G4{Dv?xeG5;f-<72L
zyDWT+2Y2+_wr4^ivO(yH49iTJS9s)^a!zR>&v%t#%R)jf+%4VDoTbm9Uk!!Jc(?Q3
zrQt`|5Pwl2A1DM_PM)ae&n5PCu28Hex9=G@8ou)>-&gdgoNxx|L7wR<waKVAU6JEo
zf26b|$k`yQ{S6{Lo4NG&sX+e+&$eltl`}RXk&3vk@qAeN3B%`3^nz`@>E=WVFjSzg
zCpvvz4Q}(BS(5LcirfFNTOT18xP3v3+m~-~`Z5hJR*D(g)Ni=Qmz|R2e^^d9lXMr)
zqH1{SQts~N9gSf{D3$AjSibYjw}`)n5>HwB<M11*_kjvuB-2ZFTNbgGm^4<Vy;=Hl
zHP7ukJ=N`-oa*$2#NjM9Tz2|COo58(G0gJ$|B#Xbs_ODznWqou{df_Cd*Rh!psx_)
z4a^<5km$8!N+#ub*$#gW=b4yhm_zP}9X>6Qy1ru~=%J0|`e4?Z0N<c5IFi%ZO#hN+
z+WH|Us#1-t_v3wOZeN;^z~{)cpf7fUv7J2E7rdCW2m1<_AH`7PHzg<7st(Na1^<Qq
zz7_e_){ov!;1`lBG+J@`zBeQ<q~y@Q46pS=6r!(sXCD0n-uqbY`UWCC{~b2?ET8%r
zKGWageQX0beNIxN><oRGuHCCqK$4$J9O4f(%6(j+YBY7B6DZFYdWTU(BK*l0%G{Oc
z_PG=L_=<<9`tJ#Y+`f}Qn2UE_VxF%U$(sgsI^GNjm3(1%t^%HWJc}me)3u1VI;HC?
zJoih~a~08O103PG@E?1V$xX68fEVy9P$O@3IDO0ULjqzi%Xz+%3hpj>obi6bsccnu
zCBi=qaJv!jvP75f!Gzpq-$UZ_u=xBnA=K;(e`WY-0E$BpTO}jG1>&<*d`=gijpDOS
ze0JfF3NC4fYNRK;GKr9+xGkRR2jRC(E`DQQXJA!2d)cMj*q5E8y28^F)fG<lC$grH
zjA7>79CeUe8p0AVPMJpz^l-115-}%O&mDtM<X@3c-Yh5d=P#iznWT-(rGz0APSfu#
zV6-Y-xjvWU8og_=d+V7#>Oa$2HA6+Q&{teiQsN))up}qZ!z7iE$A1NG)c06^>ysUx
z>*2R2*{9fep9Bfbnv$>m%fYIMEl;-}a<FqqI@d;5V}QXp?z<ar)6+aQTdJF%5MWEP
zoM&E>OJ7TTjg4QG%g?a!mAQO^l?QFkz;-K5%5?jjHro!>p61an_E`A}8~y2}kGu?j
zj@mD~0xcSU#o`Js*POu*Eq2uJN0Ajg&fwiHJsm%DXd~ODWJ&k1t5sLzD!e=JZpV8i
z-n|}I<U0u!e3nJ`u^jy{<#^6~mvjqd+0s-tPgQa~IsVbOvpk3Cp=8AsyoB@RHfQiA
z$=TfILme4RQ27We?+;pqh275hTAQAYx;*qN<8?N^#739<Ar?9Z^^=AFLvx1KICbnw
zOvH~2Kf-@dx#|kOrXtM`iO;ucUL@#Cb_Hz~=+Z3ES-C5ChlW9a4$V&1KcGx~C+eSU
ziT-VM%ur~ZNL@N;7vDA}z8A{kdxKD``xKR7DzGxdylA9@L!a$W7fJfAjMN<e$Cwp)
zP0~kGswWWfOEYBB?Ai=jrA&T(8Xd6d3oyGgI#a(PUE+S*sG8znE-CzCWk^l&d^EiO
zVmL>}i!(G<z#ml-=cn_R6=}3e?-f{<C-4syc{^55$g)-CgPe|*Wd4GJ03<;1fg;zF
zt*9^mQBms2o=pr_M$e+e^IY7k%Cjk><SCMLM%VGgY)mt$d^RO4P2{KO(rii!_Sg8g
zjJL}Wg!L?iv7aq2->}r2<`XmFPsQsCH|8m8GhM#LsjlXQCF~cBZ86fy%xkVWRVnp_
z!>Gp_74&^E*YjR3#+lOf#LibxJJ|+me1{_EacA)CfHFvP`WB?=g<65jmyv>Y=DTE;
z9v0<)1tyVtrJzvfh>ur%a#QFQ#T~rM-rpBqn`*}u5VtA%CKa{iWrmS28Rl&NCKomP
ziykq)cgQ}dFCw=;Io0LgkVI=Ro?V@+t|2{7<+iiPlCQDng;}aatvbvTdvhbcH&K&C
z33S+w4oGh{6xqqDe~SniuF<oyk$Z=0{56ZbUdHJw{hK7!7G(DMHukG05ALSA@iN>r
za-N+($++Dv^e+Fe9=eBlf<yW!g<QXfW#eyA?#L()ocst4&M5ey^GhFC_4}zeM61-T
zOdltux&3K2w|^q=)YM$RZ-wIY56={V7=e}=beF#t<1Xe2TvDKKWqsI{GAv&JSLP;(
z7X$0iemE7?353)D1S}tx_k|nw>vHsdl1*Ce;NPaW{aI;F|C@;-O+HaP{+)^Z3Mt?3
zZ?GrMRv?zePPDrtxv;tOn-{_CHqT9hM}+rjDCz5XZ@ARA&$4Epg}uXB*pjeU<0})C
zfW>;Vs=vh2`I%O?e<|KMm{`iSiZJ!IBTSiAm*1xboBe_35^_UD;Y9UjmA&HC-;>gL
z4#dlJxBrOd^8X=2_(jd_-+{M5^ZJ9kGueAmW(X-ho@cQVn>XUiwj;{fPj>!Ivhbu1
zbVs5u{x}v(YOlpe>Pa4U9n=4A$+BK)Wp~TUF1udH`|=9i<xf{~oAGfq4=$%sI`>-m
zQeunB;c0oDD<||1Z4m#PgMSChkG*hE*WJEs9due1kC_{IZ*~u(hb7^Xwn;%kr1Qm2
z;RXU!=XCprrkk2c1ww(SDBk6E2OdsSpOPS*#WGXe5%FrN;)OfcrSPa)=xII^QI8<?
zMuM;hxZx0zX10>fZ|m#wM`Q#hf+;lrXF57W2n{nxk9mYsUW-WVtt^Xuoan}Mfh#gB
zkvzlc8e}6Gl<13$Rhp39A;IBd%qhgqG7p12Yjl3TN6xk6gr+C;artK?!`*X*<8wX2
z2jbHzKI_EiTshS2FUhs=l_}~%Nzb+Pq9G}?p6Enj;>9$49u=RX>G;%1_#|b(L^#)?
zZ%G)$l5}?@Nk`@{C~!wUgg?c5z#-mX#9!2aQ7(FN6SxsA?@O#s(qFY?*>WxHOHD3!
zh?I7RKaki438^s3<@7)0q+8O+&|ZW@IisMDzZ5|UzNQV}6P@nJL?@ODPVMWCWEqac
z97E0TOG-g8$VVw2$WAbv(KagxP&)tA<?_GA;Xqt*6Da)(@|puTx%vc(5c+i?k;<Eu
zOw{u3$RMXT;=gwUQi+zLP)EZfun5lF+^b_!j}tECYDni?OPV{-h8)x8mfsqVRLN})
zJOEb%=eyi^4R;cZG|r|e=)f6{49m{z(<LZy!~b}W^Gu|{R1ZSbi~erpEH+UVac9l7
zI74Tm+*=YRr0{_@1u1b+u2oNTTHS$^Uar75!p(G#sTiia1CPT4kHZ5#m^s%5k7m0A
z%<T$9`an`B-o$y9*axIrocRI2i1f=62J%CQk3%BT*Y$EovJvT1k>t@AksheCB8Bs5
zT|9;VG<+#ic)3UTtq}dr$8+3)UT|lmaOc?PTL&Dw*qI!7HYY1|X{!D!3fX=*x8I3L
z#9vbQR=6bIb)c<t!ZqXqGl>TvD)OS~i&#&|bOmxzGY39%nF<-f-i2WAawP{o@#X{<
zB@g8Pbh#t{bRlKaGDOObwYnp<usQm|_Rs?w3Sd8xRh?%(Vqr&;;YemQY14T-Qnuag
z4FrQ40%5o|Qg;RZA;Pc$uDt3sYioDtQ(`A4BE~9E7CypzoGZwt)K4-Td)4CQ`F(g<
z4$pEauPSs;7S;AbtIqr`Wti*<T{K=s@1Z}F@~Nd5i*J#Js-$sjzLx@uQbp1d>R;?T
z8*RC+RH&Zd{9bYuW`MJldu7^|Ku?c?Ve)V#p9+fuXA`fGq$-k8=f65x{e#%$PUUfR
zKrr+N@p7CmLC=ZXAoV_eRq16GN<5orwH9^8S-hZ^1$t+)62#1MrgjG72N^p@rgOLz
z9;kL0j+CBFNJNefbG3^?WQEka1o;k&CwOg+w1BZsbtP4GhTbSu&tU8iU6?8H!^wx3
z=`mZ5@K~u+$-{q<CR(6A(WzVlBaR$6&$o++p#2l=D>?19MZM-JxpbKPFXB(Y`cyhu
z5kV+sY-1{YEZN|J?Tp<nCUFrQo+R<}l4ZnNNhv(=JpLhLWy85A^psb+hS&#AT4Uje
zvIqJLU;OA4{tF|1X^|dvt9RJ{Sns}}p>w>hz+En7#2nhymtWJ*6`0T5t+VDNygHJ`
zOZwE_@W&_><CJ97_kqip=$-19jkNH0lS^xg{f~i>K?8FwuOcOsA(ivjk8()vGU+es
z87-~LnrjEw{AHL;Nw7RWjn19J4yDqM8T@x9og+({EL|o^8IrV%OM^JSwtyD&XKa)_
zf3hpER+V2JtNbw271*N6y@)+GMZ?r4?l1XEo;&yeSL9KyKwT35iRtPn?hYg-vVK_j
z-K8iOjdBMjr($tW{wfbM*L*!wyd-@$j{ke0EAXA7T#l^(A={giS+UArrQW_!i`5xg
zSZ2R%ywp!@GncYeY1>@(6}fB9!*twPD$6TWz8y*D$-EIWdlwCopPS_kRIVFkwMv+8
zln!&t6nHwD_abZK6ha>Qj<Ne0`@3pgJeO@Jh9&Hh)#M4%On;y<oHN=I=DB?$Pb7`T
zl63I^$(}`StnqsX210!;Ud0_=pt2v6Xbh$oi17)t+{Jt5CR{no=|4%wyqb4l=^3P*
zO#Dw7>xEeJ$ti}Ir*t^hdrwY0g{ZtYJB5b&ii;LtVZ+07i53u_G~b3r)s!SYf}Ev4
zkz)bTv(xDyR(1WS%9cln)i@!Ey(;<$xcSah3<~X65xvPc+_(y>{=zHMrVcM27=Dwu
zpRs8k)I5#?#{;DJSw`>y;>!)Upm=_&)y<rNA$>68i2k)}w)z9{EiU1c5!5Sq2C+L`
zYy>3?Bf6DyZ0{RJ`e%&TW;Yv#W%~=6wu)FGV{mym<%FS;3)ui)U?Jy!cMDq_lv4z6
z%7fVef6-FLONHNIcaDDq5uNX150br(u`69s%U!`ICR*ySf<dkYS(M{DiMYIqXBJOn
z><|Vw#vzc!TH;)6`xO(gIlf(tQKp;Drhx-}pGkbM;V5D@W2^(;^CjW4U(|#%xHHf<
zpD!T$7ZNWueMW0kjL@(+juo&-AgHkCWZPJc{@~IuMcwA3X$q!3oQ3yeU#p`0)lZM0
zUs5tj6|2~xY-NbAsFy<QZ8w%{__tExzc7#Oh8HtKOE60&BG^nK3Qxi)k$#w`dl3;w
z*ua4Q0?sB23CxBD0z(~Fle|>+2127GhnM7?%D_PQcFr;kZ!REx6L!NCyV)<~3jR$-
zc92I)uHe{tmd7NvNXSIp!SaHeEg&pN|3d6C!|Vvs@50pV3_Fr=U_dO*eQRSz^y@Z>
z(OMU>SV9DGq`?-EC?yZUhb?keaJ<CFBnnXr=`FD0pDuO=Wg*Sj_h#nMvq0{jAZ)0S
z_;YsC-Xp~5kQ44@1_~|wM>DncU6?fw*7D3?qJ`(hY`9rrSb2iQF8}iuOCN06FgTM*
z&%}|O%Q1CKNFOe*<Awl|>LxqQL2wgC1%mzyu@ZW@#QI_oo0#}7PHU{}5y}euEU}<n
z1P2lKsltT_vcPnwh_|1mtl-a*I~26AbM2#$<X<U<I}1p8tR$=G@e(RS|6ZYcl?tSB
zG0u74p6IxoluuDX7)di(&J13o@v9979HdW19F*DF6gDK}+aq&&-VQ$}JgU*<!lPKn
zr+?cMF=sQzE(&wW4T?zLbxKz8dm2B`%Z%Tpuw-@~lFn7U$4K51mR0<e#vgE*0^5;R
z*9n2l;OEuuz*K|YM*77tcwsT(Ix93^;t$8f1$l^(v_mDn)u1A21E^EzYEhW8iZdiy
zr6av5H4!tqq8gEXEISCoy<9{cCY-;An?E~5qK7btKhVW+$0hQ@5hTZ8IV*%@e8y>d
z?;Iqf&q&7N9Lcg&=7UW)_0Lf_N94gtSMXMgHbN8=GBTR}tt9iG2O|l>B7vu4(E|cd
zPG|guL}#GFo%mV=f1H!{$T{IZVE;{tB5^}V+!eXPhY0<XiX^>IMT1GVbE1245kFbQ
zKTGsMqKJAZPIl{$Ak0P{ahASir4ri!Y?-prD3Zc9_K==bzk9JX+$MH{ts6E5DPFQ?
zF&c}Jb~>xr@}ON5&RGtA2eHqjM2oCUcVv}ZuDMGSL?FzvEw9sEB`+k1%=ID5D!CWz
z4;f#TH6&1)!?9Jx0dKp)p9=wcbUMyZW`(zD>~13r$e`!p&riX54!(e}4<KOyB?sp?
zGQ$sOuHe;P5lm#Uy%I#Y*T^12?dm(2%6dK^cvq8tvrH+$xej{^>Vu<(0hIWkISeG_
zBIz>2GbJ~y7wH7XDd=(pk51$x$R3b#gT8RGnShIBI6TjhU5t#dJjIm$ia{D%RL-CQ
zCBam?J6IO82Sr@Tg*OW@V|ZfBRa^*=9Apkx7+K&B#-`*5FZLrN3(WDdhmg7VSuDt5
zrzav^=><#}1_Qcok(s;XDS1fHk0=nSj8agt&&&!UuhfvfA5FJWiuH36v2p8&h>OfH
z%5m^9ql!X29yH3aDCp<QiNhsI>y0S2|BPtqZ+HZ;-{x@4)}`JNxLaN2w!JH-5B+Ps
z<q6DwS+oI)a-g2h#4sNNpXs9PhX*Kh{z7dW_LQhHzL$jgzGPNwMZdxJSTg%dhWc@m
zzF%Ur+d;piIE!yeRVUQZb7^!#kt;aG6%6`IJ|yhfcvRsJ&6Ad-(GH7lOUq;fRprBE
zHcaB<NqT*jTA!SUU5Tn~ITjFYIhD3FhqU=5_6}JebFu;%LVQ+AA8gQM`ToF{ST6ih
zb%wup%4ae<fs}oWrpr`bo*a4#t2SF3k#+r&c5@1SYf;9N<!MF{75=q9&3G?P0}|<;
za{8285c>~GyqK)-rtlkV3X<fcEXN<5Z&SU2k}Riw6y5KyQ}_!66?Wey=o?vo$`PKC
zjxzcF2Fpe?<-Uy!E0|Ilv!qKj)2?S_q~z3PWfU+?(f3vKr4!QKMQ;}Hze@C(9a9+2
zKwBzHlay)?HK=qGOA5T+*vA(bYh_;McoQv=UDsHuekXqX5S-_x7nbU?79{e;nYncX
zhboB|5?kZeFQindGR1*7`9jjalpOMAv>Sa?5XkP-#}nq*EkZNS22GkP;52mgQC)Db
zCQP0!=~*nr75v@cE%O+Xp<e|LvkpR+Q@q=WBDPPk(%G}t#7l_2@#xR-bXN&}lsq0?
zj`5h@ec1-3z8@o}@CZGoEZuH5l>>c6Rwt&<lPF2cDBU~8@~hPud{tHY6TLoK%O=Vc
zV-~u(mrx&8p_X$_qf0b89la5p&wD!w8=V3}(G^RQ^RXSLrk8{X$}$TC(*xdocd$Xw
zWqq2s$#=OhpB^~1w>KEtVujihn3E$Ie$CMITPntJd1%9OXfL<DWT;=vnG^{IeNo0M
zIVcsYlP#kded?hd#4(eL)xY2u9=Jlg`shcoSiLcay=EKY51dhL|4H^^gF<!Xs@l56
zOIll|MP^pctUOZZXm8SoV6WaiR>$gUYzcEaRz+cD6jm7E4i2xj-gBDt%_42@X}Q6Z
zo0IsY(^%_ReeY>{ivPfJ{ln#IYRS3PY`s=I+!y-%G-sqvY^`~vnyo=Ew3@J!ZBqii
z?5oAL`-sgQN52hbzb)qG_L>c{^t9dBpLjBxujo#m#P?{{a$>rUT?WaNFhPAXUs{K;
zrlPMSk2E9I2F(vDm>&9SLISoRY>P@ZI`vm4NIbhJT!XH$oxPu?Bo(M~KD$R|*g&&d
zQTLw6E|A&hlRf?{g}3!)>t%+`%lYValazt<l|_~0QIvz#v@s34!?2(1$}IKS6WJ<R
z#-8%&y0VSf4w?P5(6$X57t&B%sY<6$Wqu~nV+Haw>R0UBMjSstghdsJ5T`|HT<mE0
zd7`=%GkH3{nlOBVnszF1&5kWOzo3>)liyM$5Sta@qs>*;voN&8LIySYi^IV{D0p{<
zii>ORNTfP!du+ZfQGGLkdDEn?5>$*OzDz*R*LD^0$qASQ!d#|`K7LUepMn+UNBgVT
zbFx#|MT0B~7$^2K_E;Ruu-s1UcakQN^+E^BNaG)o_8aQ+-;8yJNy3vX!6mOHn%Q4@
z-9eL)HVk`+l)}CYwZpk+DSu${seReoscbWM`U{wniV=;4>0BCN;k%hD{3EeteVolr
ziPjej===WcG~#oI`<la{7Kv$va^WQ2zO;99xIvP93bn4Ot_c=}D#kq7fc;^77?bjd
zjnmi{laeCOHHrkfjhL5y)QTWhRO<Itj1C1+b&nfYKFn5AxAGiZ7ESaxF`P6SNM&O}
zzA)B7a5?Dh<O-~+cUoT_B#vs|)@Z>F*0(qf)1={63F>#T2Vvw&Hj(s_CAi?!LlIdj
z9jyI0g<a@jV@bZB(O*cZorgl|3SxA8>jXMW^+eXVy*2(_F8ksJ_5>o`%?1pn8Q58N
z>$yCGPA;k$Q{)@O>}5n-u&paBTZD>>U<_`xoyz#I6e>OURfNSi+BbGwNjO+u$t<i2
z!{YDquw%;VV$R}0*aFDyF2)u!`Kwd0KB<mr;;HBo1!<+n=^xuf7;z!eujZ0ylUAtd
zM0e>JX{{}a1S<UgiqbbZzn@z#I-5PEI3xGUV#;DIqd)sQtuuSmUD)=OGDMj@hjR1o
zD2j}(7*kqQF}D1=LDFxz40*RdvA>*456S#4B5&TEkO-7xUw&~IL)K@-Ze}z!Xxgmk
zETc1Wae2uYe;_y}gnr3^K{Dz_5a}+}yQl@T7YMh92>;H!!IM*e>T;?HmW&w}EO}hw
zPb9f~cP3D8rd@zrq!e0CY)i8IcTZ8U(sxM`yT#?jfd`UStVKt6bWx<F;xddGNPf}n
z4OZrHb=`7j@UrDj{FN-vSM-isDrWYniMR=n?_sQc7F~kjGo$$f)fp#P9w)7a*dDRM
zB;B3K*EUdM7SCl$6LyJbv2;oc?{H%hj528}Hh>08g5f~%!6D8S7fN)?H1>{_f&I23
zr4pNHVHq~bt4P>n{BMi+;e1~$A#EdH%j7GWv*<31G>{!t14C^%LQu81p~{=rxNJpZ
z^RoK-ypa;_uXdKcZbiR{=FP~fs&So<;_Iqu<(L4M_Alj22cZh#ZxG!=V%lVtLwR92
zA@^*>SZK@$XU!@L9U<w7LD<ZVqxUxS(m%owpG4xn!uWfr>Nx$dG2urK^;3Bs@$p>#
z9;?G2EVW$Sm)bach*@vK_Sn_P)v^|$gV^GwNIM+-8--O4WVs~WWF?6))Knj5p}_Iq
zRMxb?@~U0EWJXSr7gvM^0t5Yn(`Yi&Heu_dEMJV>Q-0-1PN!tkL`|t|ph?*xTO#gN
zNt>^*fXoWWR?ldYSiYq&vA1S8>u(y&{z~j!a{3<WL(icx%Sz0`GOmm6Y48p#jM(LW
zNv@WEN!YtgANA&!GI>YZfMDfS>2%g87UJrQSlm}xny3#8gGwznVAt}MREkYJZ5lsV
zE4MD9xpswH+0!IW>h4yt6{TomZ|lcs!?W4PnC>b3j3{8^i5Nvi&;fQ}k@`cDaK{i7
zJ5!aB#I-)I;P~}o_U$zVGg`DzRqHjrriQ;l*6Y)F`-uuJlP}_2xeVK@vaz7!@`p%!
zme_dChT|}~q-79mV=Ncj!HHcfi4FV(5-JjBQELKasLJXLY&WKhdhvfUPw3QCWa)1+
z0*%XB^L6g4#m0HRE7(}O_*7yR4;$8G3vdL{6`Dh3Ew%Mc=ZOOZ%}uo~--<QfK20|y
zSl(y+JjT{(O$ch~Fx%>(Q~7hqW&POqlI<(WGDG#0_Ep>^(^cswCWXm1N8vwWL^szL
ztSp^>0Y(ba&8UDL=}%LbJ2*gt@A=y}R8bOK6sdcADw|cJw9KJ@_9y&;&Mxr{8a-kq
zr;QD@(^D!AUhspGgqxub6tZlZQ&bZeUGbLHIrf`@@_16-wy|bh`%l<hDUHNbAoiEd
zC)>qL3UXea&NeY;1f1y>KAG5@0nVnIC2KZEg-X57br&tck5&|3xY*U4mPn|t7okJF
z!Qv{;LO&tP-4xEGi!7-_zhtz$7gaKM@s0$gHIdzEXJ4HoKdC9$s6Ig!ZE`D`8Iq78
zUb4{hI1Faxt2LU-sh6e=#dX)%ZAhMyst~5($falv^Q>o<2m_c;Ta3Q$Pj)5ChJA?3
zQ4^L1#|A@IF}YM)D)sgSzRQv#sO1tbkBlxKU0hT$ZtTTIWxtg*MFJ&bMwgEXSgmVu
zECUrrGIs5;jj(W9e|93v3Y^!Uha^WX^9F;r(z$`L{_^tj(W4V+;h3UuI5e(aayD(V
zT9GMVktB3CTPQu~=Tv8%4u6cRV8j!UM&v9nB>hCfd;phbi!6^#&I7Dy7gp<pvtDo#
zXK&Xiii`70>zJF<L5KBf&Z@YmBOaEOWxbI5Mtd#TWE-4gOCEid4IM6$kSo^XHW9|J
zwe<@PJ-HP^cO@triLK-O`Yio78!5MBfjGr&-EO0~3Qwnup?f%g4|mne?0nKf#Lzto
zVQ1tanZIJA;ST8)oBX-T&nqWXS-rB=M626Z<%L1J!|L%ZS5b~+r&S*3bosXTW?|BL
z+3A)<DVwZUC@O{*Id*<B(TxdeuAO#d=wa;Dy~hpU)CWT)NuFfS@rCV)sU>0i5MQ{|
z?qOfnp}BV!4@gcxRvjo~X+ZuH8GZK%c9^8$cJ{Ch8@K5XC2_djTkId7ha-g2PK#7R
ztV+WG%XYx(EVgmfwsJaQc7f(fLOnonH6C0~mvMSmLKQBhN$Ov$>=(Z}%FYMa=pAN#
zn#onk(n(f+3ldzWN11*Q(>TE+Hc#L-#&09q)`#BF6qk*cEk*b#4^@dp`@lx`%BcF~
zpNONgyOApwQcmz08%|)P(4#fd^@Y+?#8K-iGJU|X73|+Oi-oa!RVtC#Osg#-vw<4D
zh7G_t)AEVJw)Up)b?G#XUP;C75cL}@SoA?31v_x1Le94kzbA!nm@AGGXOq$*(Tx%m
z6jIoE=UZO2vZI+w8DWVe;Uzm=?3Ta61v+-=Obc?Dz7hv6alqmVLR+Filc}g><SFj0
z(2hLDCFxC)uHtm_=8U4(uw{_gYh<(7SyL9OO?nVNmSb$WAA9g{+*(~`m(G$gEsgno
z`~g~8AyXelr*ida;$bevN9f}AvGPjJu;lO+ODV>b$LSV}&CV<IQeViIVrAvo&gNSz
z_W8t;8BUjAJ32kCIak~+;S#Vpvfb|V{i>?C-S1rtq8nKzb}4>LmIOQdIGa5oSs#|H
zLsa?J1dsm@r*sRKhmm!+q@soTMJ@CPF`#(HLRS()7X}wX{=iuALLTJw_f!j5|FFo&
zEBV-7Y1gD;MS60Z?6TAIqQ3=_8*>C+oHi(=>k@dSl~tgwTM@8dj$OkXbLa9E80=Xj
zeKm$b68ch20mY6!6t0Ooo7s}>IHEv{IgVMX$#&M7B%dYv3C^Z}*m)i6TRij|jc;J+
z^E!QFNo%*!DKg5lZ7<p?g_meHW@lKM<aYJ|Ntp9K;IT?phW6y*E%FSqe{ExzCRx9>
zCHV*TLm9qLvNT}D=Ep?+8ylyBOn?Pz@^DW^7qBjq_hnh(Z*6?8LOyhJ1~BDGMz5#p
z-$JZLu^vT!cFSiF`c%|wj5dvUsZIaRCMQnAo=}f8hootg@X}bC>zDfz4js~VxB7#P
zzwg2s8v0STZdRp$DExevoZ?@YO#hHA8yHQo>OaE%%VcqnQh(xy8LMaBK+yq3xj>~~
zRX#qOc9VL8%mzDfT*h6JBC~!T>76ubkCisyAeTjYPqKan8!)YMuI&>GJ5y3hDK}X9
zfr^Tfzkw40S@Z!`4)5(x-W1WjE93lmJKYyie^vP7^JpoSYq%X%UY^XqX|<yUIzy44
zCU>aLZMzU%hyr>q3H$CXP@f{p3m8q)m0QQr1(`BRxF=1YyIIRC4s6{dUu;GFEuF-u
zw~UjE($sMp&M&^(i|u6cVhbN>rR@?r<-9?mwd2*F^ZYA(#V?|{-)Xa*y;-VcsiiAf
z!rpXxJeys?Y&OR7v!yE--8J5d;}@HI$$fG34$bO#u6vk77qd|~GT~u1@hiQlUg<X`
z{e+5~)9A_k56qM~l=li1)G!-Q&|ln~Ab)wIa_fyhBm}Fmtcd+{OL4%tw!X4zF;34;
zJYi;0r7!qY@B8ZOLBsI`U-2|tg;lYvxx6p^=)%FZNL_t>i8%a>RY{2B@NiA-%DU>>
z%BI%li6gchmB)>%88@ze+_;(LmH4ilRa7}^T0UE~F3%hOa<g*jP5Ld=%oJAEP+4^H
zv+I<F7t+&QKY^{05QSqey=-tp)oPr1uJO93PQ_6a->w@_+-#@jr-WYF$P{~@;l(bJ
zUe+vb>_pFJ&)>iv9b`SBH-EpM-oVe0au_u#%A>udgBbBVk+0uP>zo6RCMYN+E79G*
zLf=ap*m9?WUxOZ&l?U@_zREsS_|ta2WR=HXR>QXE@vmiCrOMY`N?)iL^Pu|llonF3
zp{}tIgM5+$=+pnGNewpImBQz3q}{n}SB>REhkCy+K|xclNMrp{DA5_JUZ!52;0ax@
zF%YT@*;4qmm`t`w<;W;`Y>kF9rIaFi?eY_OXb@P~$_7cOBj}e@I|dzn$=ULyN{jQ@
zO<VPe7Ku-=Ico;n9S)1McWru0ejd}EH1lTKKh>@>euGu|$}Od3^S|lxH(uMnag^+%
z1m2tM|HiouqHi0qK1eSn>i@=gdlUf@-Nj>ZoZ?csLX*C;<oS!gvt$I(P`#)7LzO<&
zA)Rq0-Hx?&lDFHui)$}q(jPhUGpNRH0>K^Hn@yOh-^y}pd=F>p-!X}Q<3=lCb1;^f
zu6&1+s$Si}47VICf!&!*(+Ofsnz@qx)Lgbl2v>a;tqAxhNJ-WqIdXvw>zuTXQD2zj
zjGe&<*CRY*S1>N##>KANt@)NqcJk91d&njYVyM@ox3=4ca(;I*%Og7|D@hv4nVO0n
z?Hnh90Ev8<m3^F^IFzUOu-SVM&bRg|#9kM5<j;!FS6SKGT-DejPT=@TRy0;MpYEcW
zm63_lCXccv%&{JJ%`B-LH|~U)h>l?6vPvAxZ>+AXuWYSa3}R)tuI7O|3R+L^ONacv
zs`^z`r?<eq6}6R~?ThNFt6FQvZMEi%-)gPW|D<!}e9d|C?_T|D>r|bw3-qzJ63c8G
zcgVTJeR-E#Ru09<E8ZT|tEJcE2-n}Y&O=_lz<Rk|`p`x!9WnBr4ZRsf)3MONrrGGw
zRg7u0JDrb_s70e+^BF44VP18ZJ;^tOEW_=Xo>xNXg7(u#+L;_CeYH)JUe|I;yrfEf
zHSKIW-EWch&#^<1K^e{8y66S$u(Sz#{%xoE`RE?={Z^5r*ldY@@ikaS)-0Peez7K(
zS@|bc**PGi_)EJ7uM@8t^w2UF-)5%)ZX7>$G~0Q&FD<txhKcSjLC+_5V8~yJ#dt9(
zMu*YQvbl<H#&J#357-ns*=oJn%BC@CIOAWDESLvuiPsTnI$cNlRTxCn+6jHPa&ixS
zlG#tP$xkx<B%6cZnxW6NiPqOX58XDFh{o@`zOF!+>3_u%8pdQZ7`11JDWA`+_NM~4
zHNoc_*U;2@`s7s4xf55Sf?C<n_HF0@E_*Of>)O!!63;zj7NN(Di%UW%vK=0lDGfVC
zArucW3*Xkh2kNz+w)VL=Fm))Ut^JS=eQ{g+1y1mRtHIw}($?N&A=(GL0JtUG)*jYy
z`BQ0IyZGta&Y^AXWx&0|AO}1=ysdpFkVe8@^!dG`;12LG@Br{Yd0RWiN3?cyTl+NN
zPT&&Y-Z5?MJAnJfwza<?=;PYjj{*->w6*tx|C|$G9{%wHKL<_&VzV|i0lmPrz%XzN
za2jwYa0zfPa4m2ja0l=Ja4&Gn#J2W>z_pWLj|dmg3)~3|1NQ=_0S^K90iBZ(KR}v-
za3lQtfJ=bWrna>|3_Lsy{uAMz2|m(cZ&O=4j#-j(Il>DpYDWAaev4Kjeh|MaSHXSY
zw$%tHaKRe5?;^SmSO(m9Zd?08;J9`0Kk)E+m;<ic0RI3_+6eyue+7!(mwo}v1FL{#
z!25s;fyo!bJTS5e=7HOR`+z500`tI{OJN?fUVmN&^T5U1U>>*wxDfd3l`sz+b`8t}
zYk~WKmjVv~6Rw5%B$xwwf!_klfR-Cz9(e0bFb`C3hI!xt;6C8hx4=9wZ#T>*WAg>j
z3)~AV13GVodEk8DT3{1!C-76?KHz{oFb|vz#MwPs2lN7W0Ly@P0~Z2c0j>o;cN@$D
zKL+jtzJEK+1Ahl%OFjK~2h0QAcfvf-b{EV8^MGrC<-nc5(A_W(TnIb_d<rO*1`Ytd
zzy*JSdEnoH3xNao!aQ&(a3}CP;6C8Adte@T8&C`}Uj=%B^X`Lr;1=LQ;Dq~O9%y|4
z=7DcLgz^C#{4mM^a4HZ}Aao(HAMnpdkZ*yvKZf!M9P&h4`xfBPeaQE~&A<b|Lr)>!
z0Y7>M`4bJgZ9noo@Fn0};4jak`~xSwfN}$T3-~aQ{SEmRI1YFecp)$it=#MbFb{kO
zI2SnYb(jbK0Nepw@dnHTuLd3j9swQ&Rvm<SjO50?1M|S|fOCPLybJR{`8}8iJ^*|e
zxa@tH2et!`0x$Xi=CPn)IRx{-1;Dw$`+%!~PXTuT8~zRRz|4<f9@zK^%mbaD!aQ~%
zoevBHKm82mfxmtZ^T4%Vz&vovmq<6@fv?)y4+G1-hP)T)2%H8K53;lIurp>5TkT|-
z2^MV&v*70|Vq}Q2?`vzv05<+~FE3VmqVi;co2aA6;jD0^P15bFw6!!UW7vQ|J|+vJ
z?u`Y8QHKh#*dyFJ1#c1P{RMHX!?~VMw>h-Q_A&UAuW)w_-ZJn}1mcgNZ^b*Ii+zHA
zCte}n(H%j57H?Pg1^s=z!hH^mJ%WA&Z@+Hn323mqUFaen@<C4n{Rv^oSVt3IhG2}d
zFLSIO>sZSsJ64xFng)S21#){K_o<K@@7Til76?v+;0`v{u@jOzSUJdoE&Q<tswT8K
zXiDObpl=0zNjLO6L0{N~F5>xF(C30)AZ#fY@x0LZSF+Fw`#uIgT-4T%CLsO<q51>#
zBGCUXsG}XuO}yNZwm}-}@UEA~JBoO%&EW+FHjWm-Fw6?~Fr23}xHni(4EL7A-4pRJ
z3;cfIe<Aqe5pcc(@lcL<*n@c3%T556gDlu0e$Rva=aB!aAdDCOpAY%*b|brtkGd=D
zy%TcN;NO2n<q*G{%!D{Teuez|LLP1TB|;wl+#tfYUYh7AI>%-NtK6ZDjaqM%4I#$P
zm^Q^IITq<~zcSoiW0W6+2|VG~EwB%xPm+h1j23ZrDIe`fyF?o8@NSkzJNj)>#yW~N
zsO66EddoxvQF@En9N}_DQMscZWFQMOh|Ni+v&Ow$=fS?Y2v>g*BjX+W7=ObI*9(ZR
z1BkB|5MTR@_!8l|U&x~zv#31c_jV(`x{F^S|1spl{<ikgsQdxOKZX1lq{l(ne+c#;
zg#8EN_G8|LCItRF`-j8+v5-$2k8+0i7ICs!#K|V9+~M7TBMJT1D-#`Me6!8b4-}+8
zGzKc7G4M59JrCwTpU~ERJL=MC`Y(t7CfMix@B9CY7?_1njJ40kx7;r3Biw95durO+
zXG4uU3Y@&mp^1DX%E4jKGm$6cv!ms!o4m9Txua87Do0kD0`o7!-km5H;vS|BWpi9N
zWplD)8ctw#S2D*$^Q9>3S?Fswq1-$Jb%*+6io?tAaHOF@6ZHpJxO4%x{Is=W%a7si
zB_jWCmf-Ftk^eU+6^^3y>Qu)x=_1qyqAEZ^h*t%|_o!3xZh}2~gBYs_0!m58`uNOf
zjfJXXGLGxTD&@&Wd_D_v{X&R0Va|Mv`VaIX(Er<dA^~FvC&pa=tvt>Y(qYK|xAk#7
z<ac5W_utk-*Ft`8_`j?d5%(`c-mA2=&qF?nwo6kSY2!syqwXzQ>~KyJm5+Cvp*`vU
zdF@F?GL1JJo!?oF)Q!?lHtNH;8m&}Z{zTDI2oF#Hzi+mqJss6>cdF4=i}vh@C@+iJ
z+W$a&$=iD<Z)m@{G#L4|Q;|dCyw|aZqc!Jf&C#yz6m>tIoAJ)V*z>5`*1i>eRFRL(
zHWDduTpKytu{Tlr!0vcB5$#^$3BYne`I>d5Sb4XlwcIY9&%4Tgcbw_1G0KUM9Ss+Z
ze$gNB=ixqPzX`>80b)j!Ls1ScsBLQ(6BzOv(ReY-A_}hP8%<l;v1>FLC0O)}rU0*u
zw6$Xz)vPbZB&t#dLl?s6JJf@}Z8649X8zoO-RSDL^^)|2Bt(oy*s~RKp`{pu35wBg
zf!vu<5y8J7{Mq1tiFPy2mp+k&gy6pceom9wUyYgPe>E%+{3GB$4*ngT{JGokr{E_g
zp#8^O!Q~=NW9FsJJZ66o{6|_*-+>+H^8<1FXMw+LMO*t#o%q+q`B>7Y@4+|v{W1He
z8*Ib>TfzSf{286hPmP<uAN+L8Vf4g*1N_P0FY07|LEQWi@Hc}$y<<8^Q;b9~!jCzh
zCr)o`_jNR1f6RYD@LvW0p-%C4Z#@2Hfgd>&;p@bA#rdt^pR%T{9ZP=4;!oNdkKe7}
zzj=0B`)Qs0R~z@={ov=U>)C&AfL{cDRVVjPiMxLU{N?N0+DCWFKO^G#Ckb<1S8O!L
ziLvyl?GS$Oulo=DS>WFa{#hO4kFSo$e=GR%1#RsocjC{9^S6RO3jB?o`0L{Q`@!E0
zetM_)ONz(e8{pq?5#}m7#osmY_&WlA&LwT_m7U^eems7}kF8I|oYae*+<z|aeh_?Z
zOV9jS;NJ`W1)a>VkDG4=KjX5V^XFFZPXT{WNArsf<3{<oAN(cYFYL(YC&%r71N=SU
zPwK=U7v~=V|26P?Dt}3s3w#0mm7UBlkDCvI-|O<WcG35Vm;ZB)`EM5ZqruPZ$Uo&6
zzZLv3;FomdHy`6~1%E#HJ^Al`@TdI;{u|&|fG?V#xc}xKv;PSA<G?@Kk<WiL1dQ^J
zInC@VdNv;fKMVYx%+CV<$bT^33jUA(!TeV6U)tKU`}c$YBKSSI{|5LsZNr>W$MDVU
z5PtBl2EV8HNy41$Rp1}#Xr6x;kDnm;4Oh0c5A4W~9J7BG_^ZLM>d5EO3?SmS75u+~
zk8F4>eI^|<zZLvF;BW58=NH7?zaRYM?QQMLJMp6xw6OmT@FU<a=)|8JH-7~DUEr5>
z<acOalQ5_J4EX1D<nw3-7xrVdhWhU4IefFgUjqKTPUdID-ERf|M(|_vHAX+6szdm}
zzZLw8I@M3-$Lpv2!FOHVbNP7#{C?m++{yj>;_e>-e;@cyb>cr}@bf_vKL)MB-1;jW
z>sQfFdB&(>g(;&R8V))2ddyV{Lad$O(TpbI=@jrsg5Og<TLb<G@P~IazbYCJ!XMXy
zKM(xYuKqZ!vp=4N+$}rX+VAQVUwh*5^)dMVo7&pX@5Eml=gXMG-@dD@9UG&Lg+o%~
z?MXiP>uzalpV`r${DgRXjRpS!!S86lh_8u;6T*}cU-ghXw7U)etOFqyUs7A#ADh5`
zW=~st-%k9zxIeHEOfO=spu2Pv@%=LROR<J9sG~omzH$4%27eXS7P<?EFrR=5_jd3%
zK^^xW-x$v~gTPP08bnX!ehT=x;P=!n)Pq0nKkzq!UkAPz$H(25o>zqjg#YdYKl%Qi
z%jL`9pAP=oPVskUJpR50|D6ZX@9Gr3t?}?BU@c|FLv8KrJMqts^9O<d0Qh}6@)sIx
z!+%r2e+2yb9r=8A+<ZOwN5F6B)Gt{Q@0V->zx>g*_K!N5|7YC%o#4w@Q@NuPe|Mb!
zGWeH*|8pn)_i_H$;MYFhb9qcagHZ$iJ)O+o88<%&{GY&oq!a&vIDZQGmp*~Do=*IW
z<NSK?+rhuM6MsXTzX|-)o<zT~Q~Kd%{aE^g|2)===5#V29XJ^IAN)W61OIFA15dZL
zqx*L(e@!%;H_|U59r+jhJ3I3Et#SVi0{<@X@9yNk+v5J40{*b)y3_;G3Gwn*4}RnR
zp2tO-z&{NBican~#ofOX{Bh6s+^>Hb{0%Rd^I!4$1zSkW`1>0C)3H{zrjz~A3Rl#(
z31UqSe2ZCstmh|+$@^Wl=uDSg7!!TcAmm<nv8{bryq>|*5URUG%o2z>v2uI0fkIx?
z1vQXA_)=SYtsoqm4>&e2H$lvqPY`qFQypc^-Ob7w<MlgX550W+dd{<;ABFwKI+Jk2
zeA?UD$O4r4$C$ERr5m{N50#$h%C#!pA}LR+bdfCRU*l*r-L44o<EkLPtO~))Eg-{&
z<Q(9IO!*OKdYJMp4g|0ggApG6F3RzHvhs=rN1K!vEOZZ!R?|0%a-D^aD#{KEJ*|SU
z-lF_wp-U{vQ42j|3BhmeQh%1!G70<L|HiQ0NBWg1P~XOt`?&HL$D{<}^{jJZq4*#E
zF9!Y>1OJPG|HZ)nmt$bs31x=&rVTK0Y7~o1y1~RpG@0V*iYJObMF%?e+ji1lgO)@*
zF>`lx(Z68o6QW|Yuy>iE!c?E}Op(eBL>It#_Ka9$U~J6@T_fWWO{{qKnS9Z&5|24O
zO+U4@FBAG5yi5R@&eAkTR4=;j;yKF81Q1#?;}L5`;xRfAz?gd&l5~1R*VuT<qW&|Z
zfi%-14CRamOIyZ+F0Jv*O)oPrM&Fwj)yMq3$KW3OXJX9#w)SXv)}}_~@yl)_e6esG
zHualQqI}bu|JUGvnGRhYywXhH-6lS0;(im~GVxOre>PFgkA@?~!~zqGO{_3+j)@Ty
zSDLuN#4AnQZQ_F_?l<u*6F)WaXA{*vru!xqm{@FLg^6=ajF`C6#0@51Y2t1ZA2e~l
ziEo+ssfj<Es1}&+n^<6Cv56HX&M`4!;z|=Yn0Td$yG?x1#Qi3|W#XqM{%oS!*L2^+
z0uzf(tT1toi4hZ5nz+HlD^1*O;)5pcH}NeKKQ-}Z6V-mE`z98cSZrd2iE~Vhn7Go!
z4JKY`;%*ZkG;zO)Z<+Y1i9egD_BY)(vB1P)6Dv%dV`9X_l_qX5@k$eSoA{uK`%Qey
z#7|B9*+g}K>Ar~tCKj7W$34uT935P4;P^3PhI;$WUbLdIb%i%HI5@bduy}=`^PTN0
z9$XY0?DO}JGH~-4?m#OpDm4a8(TYqnESW}8%Z!26aSlcYSvVzZiB-D`v2$BI#(=9E
z`MHpk4Z6ngUo1+%2>=qK$nG5COuG_noYK1mq8xXx37uwnKmb#B7(8V(OiQwY-|~T{
zWPm7t1aW!&sbI(-3H^CO34`+iil%B82`y7^5EjTX{wxHh>;(~jAA!U{VWm6}AGPQK
zd@X(PIcWJFM>I(@2tS+z3TeL!<}c?7rFKxLj|#O@)6q6qeiiC8?Hkx``Aw+3+O0z7
zccJd5Jps!se+YGv_Ozh?Db!&tLD1WTy3869_7D@A3D&O!gBhA>){rp44b5EZt3o5Q
z(;%?WDz=j04>e7ICDxaPMq|RaO*qvDO@g6mwTjls;$XFa)z%W>M>h*Xv({Q5#5~wZ
zkEulKYl4%+L>s@}IzVXB*am1Wu!?5MlFOO^TdeO3PJgxznjO{)g`)!u%}(nUp&4js
z_E@(I%^*Xw*LuCs3}!dL%){b#K(Y)m{I<_3cKca;Y!d{YB^%D_ljTaw^B|^Kt`&jQ
z)(E552_<O_Lil<STumDz)HjLHI!V<^K~s(NvWo;4-p2nD#G`5=DGlniF{%gDo1jTF
zGKZ$%z@f-8djXD(_((azUg-mf=#`Z%&<myWVSJ>V5Ak)8TO^cQ*yvb+55HXKB?tWP
zEE2X|ZkV{)Pzn>|SSQ5l4Kb0{QL*}CV)aq63BvtK!~F$9>~wsj%4(rB+{cd@h5M>@
zyYPSz@f?PVB?=>5OBTEKSp_ray<sOy6i$mLxd9*M6k1UPQX;rk!6hu(23=~AV7i4d
z>9Rv-<B}w?%di~8=!2_(Wvp}e2U%>eN$&+kDy+{q(-itz#a3cAo3Fxmmh>Z=oyn$i
z4UVOK29L+2%3zkI;Kxb$l_6}H3LDaY0e=M)l=c8r*O>N+OvlnM5RwN#78ODIb5MwL
zrx_2JLRM5Y!qFDtDDxRkzL3dgz!#Y07SXMll@%sljECjECajzW*7vX}dmI!Qy-cAr
z!@8(NR_Sn2xoOC-skbQ<>!Y&5JVcgZQ#Sk+wP|FxHceNfHVL1#z|LN=K=y`%Lt%>u
zgZ2XTdfE)v5YQ`4x)C<3^mWWPIj@>R!ywe_{SGEB>&e6y9Zk$Iy;uR`51J;d?UZwp
zNuL3FwA{P_u2GuSllON7QQaeopm!ajOufxeIuM2G?LwI=OlhS^Z?Z~d;}!uy6d!H8
z$e~tgc#Nn(T$>Ul_KOj3hfP*{lsG6xd;^l!6j5barTiGNMM$PyW)OoW(J1EYkpm6y
zo=CaXf}J6L>ur_x#Y}!8h?!C1LX()+Skpk(?5M-um}EABXf-MWt5g&tJ|>LkMkT{0
zv3ETr7vZ&$P~|6FZyU0xwxzYQwV{c6&xcp1;kDsPY20&z6J8`<X?B5F^boK8HC7-d
zfW@miS)WTGCtmfNf+Ya)+H-hcu{XrnA0mPv6OQ>JV;gPjR&Z8;*tJnU9VYp8U^4ZM
zZrJQvL6w$*{T>t(VWHG$YDB9gS2Z_7`yqI2x>83>{Me5~Nmc8>Fm!B|a+;|ugG2aB
zLX`d&p=qc??P7@iDzsHNaV}h(hQepLS$NCPwn9r<w0YK|jlNYAa;TYDABV>-IWir%
z7t!KOMR4-TZXXE}{Aw!Efss>C{SLJcvrqKE9UUU@qX3~t8z(vJcEM)QYKd`Kas7bR
z)G8A_LTiiJgg9qxqDPoW$fb#%Az&sJK!J1`%tob0-Rqk^(c`rT(}kF>kfg;t;e;mc
zBbR7H-6e)$PiMm-TxDt7M;kUI9Wx{@VMz|g?GRd&ftdY5o7#a3`_r<S&ohI~PMvF?
zXM{5yC%rvFFyq2?3^D107c(zfhdU?IE*lL4S;ByDK4u_W7(i_j6Bl91frg@Hy~o@e
zl|CGU_S}vHoOq<XV;&rZUuNO21A|G3HaenF(TGILFNXUl2Aw%X1)T_}bNU`5^pF$&
zLqtl)l0w7)rqkmw2gj9T{uErbgBD!Nv0w?Vb}Yl;a<FKQGFmF!)FV8`es?6|%TUzB
zOb8pYqNRCAU1N3qikjLXtHZ^W#lgb*y2cf&3l}%87_z9YwS^3k`np9!7FSmvFA|EG
zj>?%<Vf$KBZKF7>UE4Ca8g3E(jaS#WI52pLaJHqjrf!)(s%|~K2{JKqO>G2flvzUr
zN|ZaeUAHW%#NV=)HMK_7O)KhKYN5n4e9?-!`qsj_#*tB{YG5QPgc8GJJq;|c$Xq%^
zlVm3CJZG+<ZZ*`y4Yf8{DwqrXdFOl#{T~!nx5|=a70Ok(txK{Fmsy+E?>w#Fdgz8|
zZ-G&!&4qfOAwNaCURv>p<Xp#b2QaQ)K^1@>g>@70gnM;rt6QUyBYYXF4b$>xc`tm5
z_x4PjeA(Tf9K1{tr<G+^p?xAi9j<DV=Pe#K&o^}5JV6*DKEuRkvb6eoY3cAseNxt3
zp_i(LYm?7|2E7)>5N&mhHAoim&vA-~<e^IZu)A_uB!b^3Q_Zp!_=h!BPCT))V&ar>
zR8!aNGjuIgD{CujVYFq*vgTH=7|)0wlvlP=1Aa!htlH3tL?n^gm5o&mwb;AZDl{}?
z$+CvpA<@y@5d47lv>`PuRX$%)Vat-*`ue!0ZZUY(Lt2`v4Hs86*R>kw`KdBeU$wY}
z8e4EE-(W3T5vg2M)dGJuR5jz~fEI(=Tw7IRc(JarX+^7%>pJAWV<NaBfFkwFst|%!
z{Ghv;>X$Vx_8K1}Sk=unEla5qfx;L6`fO<>{O7#6693U#(SYb`TGm35=%3cXH}YIv
zV<mdsQCem7lBHC;x(=pV;8DU~)oWXpbnpoh+Z3fpb^S5~V?|>N{={Be<E>s&)eJjD
z6x1P|R(J4PTt@g&q)#;Ha6}}CaHz7fsj3F4CDO7uh?Egh(O8T|){4dzEwwd%k@_Oo
zf-PcFr0Vz+W_!o@d|d|I-ol2~6^*qca4AD=b6qvGRn65)MxcTy#6N}$7kdj&^m_}9
zT-4A6MN4Z{_0mG5Vk`3WvgSgRc0{cx^wo&%%0~Rh-q19i9`&q03V|pDqYyH)Smyv&
zMjAQ>Nch=E;l^czY|@Og?ntC1Rdq-o5q>lDi{PIXP0_p}suFOGyab0EmeoY_xsh#*
z2x@Fuv1k$gC`^_3!#RpUtWb!EGV*^k8Y|7jHxe3&B?2wfm658tdN|)9N3=FoH=Q1r
zgG&t}&!JoxkuP!>JQ}Y8q6!lK2o)8W@t`>~9$W$)Qx1reX2q1T2@shud=X>TPR}e;
zx6njf7!p(7A1BQeQwQRS*~N@*UX*5jn@v31L|C-RRPH!Nidpo^F_Ni@LxHJ^`JYid
zC<9Ga_dz+{;Ov+kF@uawio0VfVyp^NuhI{{ql10i(^ak;t736kYa0F!A`9ap@#sEg
zYGzl*W3l#klVPLyFz&bnechD#_LyvJpmv;W%pKiSAS1<;xL(tEkkT=wvDF&HXza&y
z#z7NMAsAD}rqH@irl`H>U78A{Z%o<I;H=|(8_Q0Y#tqJnDb=`daa&51oe(D(TUlbR
z3OwGTSuu-tn{xPxUrc#loHQe*d?HSo8&e*0ZidM`4q~C?<hvoJ7IQV?VKgTep+k?6
zqP%0@*dRTYtu<T-w-R?v_PHTO@teXZeKF;aCh3^)F$<%ruJOS!vC;J@HVHK*GFnt;
zDpwsNtu>V~8yQu_O8N(Jsp^4d2*dG^R>hRD9Kh*VHD?+$XJ63~G6_x-Jth{I7%Qo!
zHV!fRP?Ivs#EQ5?XT@YwJKe;t#k;fg|Ah`6EDH0o<D?|?{$_Pcm#DQ{x*T5@>}Z*U
zRl=j)%B%5p!_M*++~8K$kz}YPtU|`t={hncM0;$#uQMGN?WLD>BpGVly%=8~?Z}jH
zS;pGl-N`3XSvTtg5+)I1>%E=r(J8jx+L`X8g;IG(lA)F`%b4ctj_#(1Q$}|q@1X<T
z(vzqtt2_B*633OhJ`$x+Y#q2GPomWL`fq1?8oe-LQAd)YmMERNTc?z8JI>)z-N~b}
zFxPiGS|(u^UelQF<g;mKxAa~ejmOpa+k9NzQ6Z7Hqc*M<e?I8$k#K`gcaMa-*SdR*
zU#54%ki><{T^<I#Oy9S6A&cu;P#TYqXR&V2k>B=CE$d`NRJ($HxC?!-N$<M<U@Yhi
z*A`wdHX6^uAL5ybmvxiwXMis3kL_;|vf|l*mvxhWZUMc!aD5~UiF_U^H&n5Y=n4q?
z*Pc*jkVN>!qr+pupF2!{Mi<U-u^s3_ev2s|>vkB+R^8AwZu;LWzsA{6*xwg)M#st*
z<|2$|xIxdRJG;csVw1kJ3;kS^etQ@C4JQ4zF7(Gu`kpTI_e}b&UFg4>^xa+PshC+1
z@pDTTx|o3%^qafTXPWd~UFgeA`b}Nvmzwk&yU_18>34LYKMA^#Pt1H6Yw&Pupy8jp
zy2yV7dUyHca1V4br`etSM@Fh0Uw=xN=j={C0d)KWg!be{^P$~L_dW(ai`I@=WKf_r
zo<I-w3^VQ7XWFyJl&|PP{=^>WF?)6njQS^L&-|GEg;9E}L(yW=514e9>Fy?zez*($
z7L)EY*RPXI`6o<zkx7qrRNgk}(@c7@DSz0cH<|R<J|e3Ujn6G6J+|s2R%%2%@9kp$
zFq3}3q{sX-%cOs9(jBI|xZ|<gbUdpEI&RK}yIFLBnU1k~V|x$ss7|`I|3<@}Bno$F
zH~(VLkFTGFce~4Pk3t>`hsU+oe>dzozTJG=kWV?T{T26(VOILM_WF0jp5xn7mx@Vn
zVWi75(4af0aDSP>S#I)6dXS$D`i-bh4w~gR7N6&t_MG-r)E==3Ks?((cR@b3u!C%4
zJXe|WCHIvXdNH>sp4(0NSo<Ec{|Q4rD%0`%dJlBmHi_^$XxQ^*hWL4=Au(RgMLs-~
z6fH;nOnNV@o(cKbyk)F&-^ZYvLLFcz=ptOPMKM&%#xvTqXZ<f_hCY^WPd4c<Jr$)N
zHMp1<HtCl=7p2DrTjztGC;U?xrNi6CBkp$SE*<}A+OyNNXR|5)MGx|Sn)1cF$_z=L
zDK9Pq=+1upEz*de>z<6t$NXPv(hvO+rLQpUIT3Ux%H0;T+?iyW1G)&;PBUDwG3j!X
ze(2PwJ+biq#k9ZIEm3-G<hkFZ|LuB{ZaQ$#q|bXbN;l4mf%ma#|K6%7eXuF7VT2~q
zF}4gE%bz;vB0pU7K~#Q_s2K2MfG+a?-Y)rnf+_#jBV~r%i^dl#_$K|0@1yj4OnQSs
zH#HsLX;c0Qvs@Twn!$e;U0ZKA{bBs@tK0PYr3ZSVEgG)3A1E_i-HCQoJQ*haw$I88
z{aQ0a=Noi`)aipCj3M5RjbCE%Hr}9TQ)~q}MxSchfBpWb{R>RnPVT|}wWj=*lcVzg
zH0AH>LH-}6eA-J<`KL^I+zH-o{QPdp+wYCa4>0AO=m3axA8F?2n@oBx=px@XnfW$W
z-cK>)O`#63tOxpf)Bb0kE;FQKBNN>9Y2?G$T6Zkot_NMXJI&0usNRj|b(6lOy3EjD
z5+zeII?Uab-=PNGL9-r@$`3N-$C>oKg;Bb3M+bP5da!4iDZlUPsJw9}1mw@|LH=^k
zMR_^UrMx_2+VjZ`QG1NFb=dPJ=%PHv_7%k9;eAtn?Tn~=Z2nGMVkXjS@4+bDI7<n8
z@XtRZ-qxD&RsfoK@YC;Z<0oR!9rW(|WrkcV|19r8ep?Up+dy~WQv20i>b(a|``4S{
zU1Yd|<4Y!e;fqmsA2#V9gD%p2OPBnXjs~T>^bHzx2hIL7YEKo5S~L}O(cXp4_Abq|
zf1@d1aY<C(xT6HTD?k_NMdtPjuPOhDA#Z9rz^|YSe=b~DW>7yhdH+U3Cj9B`6Q!GE
z8i<Z-cm5x3&=LR7MCD`toYRB+${y$!_dwrm+W)&*|HOQCUk~#5b20qspvAkQk`I~j
zi9Z+{^dBCJ(qrlPc@OsdVao5D6P16{v?s*{d$@zVQaOpIzwyN;CP9b0zVFHm%10*M
z2f9edX=XZ3FzK}hJ)8EJ`E9OA-)`7rYC6DEJ<#9mf&Oz3^xl1iKaZc++K#U0|7q=7
zfaEBT^B_UMPI5p%E}KVu1PcSPHamMSoiXvA?pD&FlXSj^EM2fiv%7P5D{dd&$LWp)
zM=3)k_z@sZQX*A0Bvdd|pe*umAdW$R6(Ch$Ikt;9#-IuzmDp7&Oenxq3g_#`zdQ5q
z%t??;-R;cu^#AU^yZ`>Dd*a6T0|3&zHm~iNd5r+a+}^9-$Z{>&ml9W30Zw%Ipv5;9
z6u)+@;2S=0gyEb;<mVa1zxZ2%{}FxqoWk#WRN#gW|ChpDou^ZZPw!*kQP;5na2l6e
z&y#U9bi1ALq33)(%W3{mQ|?jxqrW8hmni%<6Y{?!0sa)=as1&e#_w_d>R+VX52~P|
zS5b+s&i$sqU)6FiW4PXgPQwZCoq$uhy~>x&`tCNy?{QvGea!URT?#*-^X2!njISo-
z|G4IVyp&~3jsb|Pv$61u>;DfkyazFL1)pQ&0Dp(VC-(`Q>|%7xCgf8EoW{}7aZC%k
zb5{cX6M$1cWVIiRzKSJTY&|zHyvMmh`TUUPzbOHKo8rIuaF#I*ofH!AZ&mz9UKae<
zH2*(WxOr>D?1~;q$ma)&zftwT8#SMo0H^tD-XS#emwf1nuD+)A98-LHdxzlP`fsVv
z=M?@Pg`aUu;J>QyL4}{M_5TZnk1PDGX9VBO(^-Xo^Tz@&ua^Y#fYZ3tRPGyn=}#5^
zu^$QkX9V4OK;c=f|FXiLWVqghPR}X+p}8zWPdiN>o{x!7?X4-j8TeIz6J4b({!mr?
z+g_J^?vres-vONJKWWj`LyWITq0{rfEae_jJDbsCzpMFw`Vq<B^xI1b`TSJNwe{P6
z7<Q;WhkjS`nbopC4mh=Izqaew6#iL-AGg|jKj73q-d{>SHz@u?4A+~`>7NqdKhk`@
z_Lk&xp7#H%3HTRWK;?EHzn%b}1DxBd^kekK7tf@8y7!4b&H3P?9!iHBwWwPZKA`P2
z<8lw+G>@{XU!A4+U)6G*FJu`)vF7M_mhhoZ+Uw{SIe)<k|GtamZ;!KhUzWkX=5vw4
zd!H2eK25X<aBA<-1%a<qc~JwrS8>9p8#SMU4+%cU%mTt6Gdzqb*Y_sCzmov}p9FZ%
zMN-cbN{1I}yRv{&f9}`*oYX4p0G#OUmB+ITG4wyc{7R8Rr@Z2?YX6vVuPOZ7FGxOy
zzpX0#IUV2M(7Zm!@KDBjd57X>CnX=+x1i(E1pN13EcO4{ce9Mgv1o8L$Z!Tle)a->
z3HrfNeQB>g{Xzo%y^MbjTogtab5QYLWPI!&&1M<$xWZpoc>mo3zg_9%-O7KC&j{Si
z#|sqxQRV-)YCfA7t~a66NCJFO^LgY6$=}yB%L@PCmj(W33jZwNOb?dc_+`a^>VCoJ
zSa(Q6@z<Gee@x(~>9{<h@FAT?uI5j1+R3hE?@PYK_=*%doejdH{NMeo<Ug&?HYz-=
za?rHv#WN`Xi=A_}=0myQolxM@Jnbz?J`{7Bj+;0ixeotQe0VDNZk3yc9_~uO|69es
zL+PJmO(Da_6n^L%S<Zex|2obO0cSqx30%p+Ik#8J{hOZ%9B6!O>?L^j@nMGdIRE-s
zmLI;szj)32l6d^7URf8Mu*UZez^T1ck7PLqOmjYt^a}j@Y8T=dR(SHL!vE-K7I%2p
znt$=(>R!RWb8nU*_bB{Lg_pl5aE`T!XTO+G&tAZ39Bsb+A;t%Pw(5CxLO%Hf_}3HQ
zPiy{HsGf7X*8lqo-*-al&oPzo?1aMaTsOj)f#Sc+aJ>ng-v2Aw&tJ+i^n3cuRrrxl
z2;A_uWx$C}j^_pbkm5h3_!qn&@Xss!Ifm;^=(O%qDfjyC2>wl{$;0<4{NslO{;;OG
zoZ)&CI_**Xn_dz8ztd++fRjFRMD-R!54R}%tPRpX+ZF$*g#2GhfWMgl-|zvU=O-<C
z9tE7}`NwZcJ*Tz3GYm&upvwiW)p4F#FMRJ-&F6G|(9(QPd_(ZBQTS~NfBM@3zhCqJ
zio!Q3oy$|ixl;HO&kJN)^Z5tBX`O!5s{d<>|J2U}|F;yMUVVt;1Lrfm$N7ZelUkom
zfD=DIYRUJZgnV`bPV{!dmvaA8%Y8xN`xSnM!q0=CkE_pR4DWG*TO=Qj(Tg-L;JunQ
zeA=0iPoVky_0cS&8~%0!;Box(frNaHCBWauc9<ULob@A||2nPI6%5y#&}lpYey!#+
z_iC20Z_tbmB;bD<aH{{ypA`Hnw2UV-pZ&W9{=C9}n2^s~fYZGD54DFtBjMxH%Z2`*
zeJ#uPdj#wZGhA;%r`ZJfqULkA@)@(<{Ott%KLDKSbJ()89ZSIfw*>f^=n(23=jJTe
zl45GpaWmk(d=>pU40s$JKFaVO=R%biLz?W_gnWLIkk1)cN`G!Rk>wo5G@o-Bt~a66
zM#bOsdX}N4|F2T`l@ANt$k~5?Z+!iyfKPg9P5A@R_-JXl=_@541v`JD`ER&g;2e`0
z5ARd>(P@F7ru@Mv)SHcFYknTy2%dM%=#)1xK0V`k4m=`)g%UnASBEE{2mgdh0QU;7
zSoIc`s(F9OE8@-mhUd3doI<r+TMC*%F*P(WOmFYUAmJtbO38yyp1%r5gJylznXkh)
z#4EPSW%2?D-Fd)lMo{63xH8}L(m48q!2!OYn@xirJV^F>dAv|hpUy*FYks3qt)vRA
zqCX$FsY0#h%=^uJsWRp_eaD-d$l$Ay^mMjj+P9@d-`ksQmX?+oAuR}7{pM&33Ae)&
zfLvkbc1<^j@MvYlb=?eH07?bC^G|L}<Av#_>ypm~eZ>z>Qsaf(w2z;hoAGwSU!(%O
zR-rjQ3DEM4hi~~!H#2F}_@G}edg8ZGMPh#;Rch4zRA2i4r??$j-sY2(hmVI-&-9ZM
zLkS)%+(A=1{3A=!FYW+F$#0JF_cdq8g#%qO2(Ok_qdC}E1*Mb+i$N2<4{o{<e5!>}
zF8NCneYnnrP&ZTOqk#NGU(K(FpSYRYl3xfG;f4fPo{5a-uK6^x=F`44pZ2f$bYRV=
zgKIt=TJ!1VHJ=Wz@w5-0HVr>E{km;p_m+_f3|uSkjeBFcEwkIu+0)I5&7N1P(oC{E
zj*Oof-1@FdVdm$ZQmp{@AD;h{Q@NR$ecsmDU86JOyLWkBsz1GX*tsrP#iT6MgE9t{
z@Zo@0aK<M`;Se*vYkbCOmRp!UfE9fV?qac(lw1Gs5GH#}F#~DPVC(~zT$$ohxPu7~
zGMGNGDKf;I;<D-^52V-!NoDjWmC>J6#(*om<IfW-Y_6h<{6?Wv@)qH*7SwTja1pGa
zQNtXdKGq)fysc9sJ9A!c*BH36uxN7)Z!IskT@kjAO*!7!zFi|b$47w|VWMzl3IpJ(
zRLAsH+S$Ez>vV3$n;F?Mk)ski?rYiXwu$jAqh2P}m+F^s_r%N3^WZs#50S!k$#0gb
zl@4YmMa;zJ4DO)%j(NZj7OM5t+;VOZ4sop|I2$U_u#a86y6TWeyHKUqglC{fu373Z
z6)lK7${IDRbO7PqSPr77Otl_p80C8ht9AJF&CTqw7?_(8Mn<ixf;p)tmb&R#8LGP(
z*B)}B=#hsXTo0R7u>3lYA-UDTN!4pqi{u<u;)ZC;2wr4nO5AZrl#{l(q+zMU%r?1N
z?Jn#m@5&s67TC6NFzeK@Gi1yK&@1@sJf?Bdnv@#i9$0t8p*9wYs#}?FVKk%_VcL~8
z!VT2%f}+UTUEz<L@;9C3^QJLvb^D{u%JjkQ*9T$9D3JUtmWar5B~#<;gL;+S2&Li%
ztPO!kCP=BOPrKw)bePA|=tK2|lw^${AoAF@r;CiyDw1B^G^}08&Z2F4X2(A9ylm70
zC|K`AhZu$m;n(1xTg*-FD3o<@_U`pCz^ROD+D>kuP}2gx9uys_<+ea&bv6tCXM)t5
zsYW&Bx~VjIUvj6%4mGAFT!705ikVik(1fdEP{!y-QP~%gJSjc$B~>p~>AhlLcmx${
z)QCMkh_AaM-W<M1t@8k63@PT#4KW7_>XT5_7X77pd<8so$&K5|MZ{&OBPbjW+L{mT
zZ~@iCm=zYY+0ngwN4AW6ZmQ3891}Lw8R0S!vMvVj<Fskul&0ob1~i3!tJb33hJIfP
zEPkb=L-h!lPr$?3IH1~vD?7+!3_FcZ9VX>&ZemioVeZzZ;-PZTXyAkETLxIS#Vn(7
zn+ApW<uQ0YgCi$&G_g0ZHn@JXzb$n(j|;KbV5Ja%4U=f-?sIacCPt4ow@Srm7sbxI
zwZw<^Ur=FRU8;||T6=SDy0bJ0kq`)qXlkDM3D^XCUwiq^bnPB>i)FaPcA;ivuvkE8
z)q@^}b)mT^EDQ5g+a%Lexe>`S?%N3;U#wr;!E7<EYs*0yGtlZ5EOn}w4!jr%kqp}F
z;cM_13N59u7(mnzzW^WP;prk_Lk~$*M8pfBlB5Vk<_M83M*u)DCP>tAOQN3AY{^m8
z8>^K92@>SeB@Rzg;Vuf!xo)d+U8Q=kLIVOv@)!ozo?7iH_LJCl`#eMvs02$MrbMMV
zz6+G%Vr7KI3TPW(EZf|{Nxj=zt&eojVm{;E`K~*E&0RSfzS-`kMfiINLoj}_ue;M)
zs=3-hi+<(329__MdC%pRa=Gbk9)6c+(%y^<eeRveoH5XMgn~)XEH^4b5mCYn(IS@A
zq{@<=fOx?|i<Y-_tT+&(A;Jd0halDGqZxS~*6u+a5gV|Ej6OmcG)<8`{QC>a`;BfI
z(qxDjDY`&ZEdW<_%GY(Pgy0VuR+A3~rG>?2W3n(wvS!+IPz(ibXbbXUiRxQHGy|e>
zTWdTCb<(m8X<V`^tgQI?(z5H4Az=y4^(a&ZRP*_2F4957TioF`Z>cW8*ubp5i?uWw
z1;4u|sr%>aK@uB|uJB>$5F1FY0$$uLqb1-*WY&{{5ToTMsz=^jKNh4)@hav|#++jr
zHi@R4K^fr`;%X{%0yU5WnuI7*6mTQ*F`R8#0YTKPc8Q=aK4!)^UW76OBWc*Gum*?$
zpSBpGt>xy7l9r>T#x7{`aSfv8j`1R4ThJV7!rYW^HDfW~Hj^h2CjDivN@Nz3-d#~;
z9|DPc+)O#QpLlAaX-(5^&BJcBn}%hmwCqD#Hgi)un7O7^he05Yj)t$&oTjc0V-5V+
zsWEPpIdqr0_F%2GzDSrDaYLikVx@t5(3=}*ksc)rvbsk6=+u=KWHE$cDw`mTg-%2p
z8)XI<SDEBKNXXW#9ciL0RTnz7RBj4(%aWb+XwolA<yKkQx-iGC6lSeUeF54*KvyLm
z>C=wAwasqg8q#LDGLu9p??jI>lSbymWF1fH%~H^a%L+4F7cd&_D8vssmFzHi(E^uC
z!9hcE)B}tOP2R5|zD^(w>!j(7zDy;F<I`?LM`&2wpsjQ0=(}<2d)DSVdGX+ft0pT;
zSRMj|Lz*itO-5xlV0~RocQc~$lo6Mw4q<1OSFRViLAp#=x&glk$v!zhxi}5QnSP0u
zZEf$Qv>Xv4u!E7#XxlzaCh@G+5H1W_jcQ?d*s}%TP`HcOVxi%cq7;jA-+{@66eFr~
zG^uOLJ?a}<0Ic0CRub&)j4nFbLWb87m{qJh<jDrtX&S{)f_8-|P2q+*a5mo=;~lAl
zZeJZ!6lOBfD`>&0*t%vR(F36~;WwHrqT;3)S)RgCkHY13_fo{T#w+ADaYLVg+PxYX
zI<qDxQBC$4id5E3Ezv&eKe*^OdG#{0b5w1n+7NC8_$XFIaZOR?5N10vr6amu60Ih1
z*{+3&<6X1t8I#B;X@J;O)1t9|MA|Z`Rvb#J)0u2jjcm$n#R5_m*WKz`M`xJEk_wB{
z3M_MRO(q4=Dx^)>Z3gA`%lfv;ODPydnNx9RLKk&HG1jyhhG}9mj@Tcy4#LE&_=zbK
z)tN@KeU}*-nM-0c3O56Nzu*}aeSRAHd5CinMRXDKXohu*x4m7Li_8EuuA%ZQe<J1q
zUf<9{1R7tk0^1&TKP?5`Qu)bJ&?ucm>o+`YxtpoQaFq5KttCW=v0$zT)jIgM*f=IJ
zY`o{gvEwLw+7e87u)A?j=WAF6d8Lkw<JPlSln{SWk|AoFguQfRZB8LnVC10|uaixO
zpM&s4Ggmw!DPGpGFjnJzSR$GBK=UV*pbjobhG=3>w9^3@(?ul8Dk4L!#M&`0^PrpD
ziAh1`u*@fWOE)t*r4i|73ecr}%iJUV)j)hpveh=LmZmL_X=F_0sE<S;$gjfuwlYth
z(4NlG4cbh+Xl3EnX35-485#|AmpC?JwlwM<+s&>03?uwb!Y6Yj%W@MY0CYL?N=!)V
z7qQxfn~%7qFn^=f?q+^1pqO8%wSX4*l~!$XvuT-F|4hR0ju054Z!ZPCJXYyKMxn}3
zU9PA8oo;R}RT~^xuzwuedZV`GM3CP0ivwv`{p$g5i08`XSi}E(P_-;59L>^eECm5m
zO$N4m&m#fh6&fwx03;QjCTy#*I127H3L{Kup@p!CY^d#)CEDZ=Enp>&##k%)wRtr0
zlP;!h5y&acoV8YNG9$1YC}3&n(MDBttuRU}k~k?S3Bx8}qJlPg7_*P`TeWsd-DXHl
z?E&q5s4k^2s=cVCD5_kZQqLeIizHZmV|F^wAd1ZzoxgaSBgw!UW7!$dt%I1YuZT_$
zHiW7ytgOHSUB{a^{t^tA-f}u3977XHSe7gonP<sxgd}ccG;jk8py2RMIqdH9#NcE0
z>ei?-GVpkJ#vxr~!ECBwqr0>GZPg$_g2=8wC#0?PHfu(amUTF8d)WT3R%NWxNJ0<I
zlgd}E!yp~C9qTGCmn(|(cf_*(Mybg#;sgmmTcLRpe^FZL3>X5I&g{hWH!n{?&8Anw
zW>`u0W-AA2BYX<``n>->I*MJ&w3$mR#+gUAMTNDHI!Wz#Q;mbjMytns`0cV-+E*3k
zpPMQh3g~RBh8QQkuc7lxOxk_{Myxt?=g`hp<5Tsf#g5xtfaNsAc(G+|2cqH@A6(O5
zR#J=E09(OGz1EF!On{h8Q)YnEYi$F^EaB}6Aog?n^^Cj+D*#p~v}hqQvXvJQ-BJVk
zOfgoUAqr?~P*77G#Ow}2t5ja-BpcY{iI#XD3ptk3&}mE71}jxnY{nzB$rL99V&*Mg
zM+<HF@IFu<-q6dD&GK!|$>k|FbPfh90ln6PrBtoDR4S~F&v?0<*{AUq%6K4c@T7Y&
z<F|#(zOh!W&eRlBm70TSmd4J}`I!9zT9naNKdsB=nIlK%SDOMB!xRuIImE?5iZkO~
z%C_dGrUjm_QN5{a$H_2_$nrbhdgVB&#%dX%nsJ7%%;!aOg)rGcz2>AUFnXo1^}h*Y
zsCS5IpQ0em#ZCCsLyGPeeMIO^6;~_BPtJ%D&QF#h%`kA08xPO0yX8{?y{h4LrWEZd
zIVoJG7OJ?wUJ`zpiKOb-;KzHYsme0FONqCJ=u!-#(~v_@U7S>~sBgnA7Eyq?L&*i~
z0r9Rn-V`O92p1$T%1Zv``al5*Q7OM%Dj;38NtFSFi-tbyr0~kF6jluKtp%j>D+`#^
zDY>QnD0A)S^L3cC^$Fe@M7e^69yXUyF3hFoWZp-9r{@cikX}(X@8fS?m&F}EJ*9w)
z_^$I~US~FtOL1`~{!RLsby?gwh`-9Gu6M{!7jXO6#Z@?cb6u97H8nllGa`qbp570C
z4c|I6<sV*`W&FbmxR&qmIR`zTrDt>6g^QoZze)ebx-9PKJCJ;G_R>QfKZk#6yGd`}
z52yFd>6Lkt-h5AT60g+LVZI+Jcan#>rt>H9Pv34d=}r9D&+7~8uPCyqzri!->+qbu
z1!?Y0oY_P0|E7aJHAM&2pZ-n#2}Z~D_@_QK=}kP^Big`w2nmNN-=sI~zY)(U{T4#P
zVdB>Ag`YDW=KGhFo@mWZ|NDSZ6{&7??7uXtXFeekYxJP9!pF_H38z1#=?@WM;y7wD
zmRopC$YIWZh}&>_6E8Q5_xR~B^_9Es`!C@^IK7FhJE`e^`p+h%oK3j~dY6^n#NVA*
zPq}siGxH68?DpRa7-5?6+rNH<QTJ>52RHF;`=_^kB^M9i6i#pA`M%ec^6z{pn!xUl
zzqitxxW5m+M`~o~#N=Y9f7(iK;sYPo^w%1~(+uqP|G-Lb;s~cNlJb9nh%R*4<x}&*
z?Kkm;KQbigkA>3GrCt8(fYDSZ9zchQOMK7Or`CQNrn8aGq&MHMJ*w&L`r7wrBR$pM
zq&IPnpFG9%6o{Uth)HkaAs^TDz0!>K{50hoIzEiF;qpz~<d;7x2@M|5_%P|s`C_a7
z?fA-BNnbM+OGtm2mEQO}*G46Muck96Q?9xH5YiLO6l3B!-m2;8i)G=XH++L@nkM*j
z(iYsLAF$wrONTjAWO{QgPvrNQmcKs?#I=n(o{)aym{dGxHP}u+n~?s%grqnA^QIbh
z`a(kb^}8kg=~n;Q>8qOF?!PlXCJFB~1EUo%>CO3iO>gRF&PP-MdEJ_@CY`|}|6ChJ
z&%Uxx%D>Yf>e<FU7jUbJjQPF$CH)tz1lBE=ZFLc+-##bMCoh+R>2v`9W;hLBeHUPH
S<)8BzNq<b`kmS<c*8c@<49|W5

diff --git a/src/dsaX_beamformer_passon.cu b/src/dsaX_beamformer_passon.cu
deleted file mode 100644
index 818c28a..0000000
--- a/src/dsaX_beamformer_passon.cu
+++ /dev/null
@@ -1,1057 +0,0 @@
-// -*- c++ -*-       
-/* will implement the 64-input beamformer 
-
-does N beams of 256
-
-order is (taking time as 8x 8.192e-6) 
-[2048 time, 63 antennas, 768 channels, 2 pol, r/i]
-Load in 16 times at a time, so that we have (in units of what needs to be added)
-[16 time, 63 antennas, 96 channels, 8 chunnels, 2 pol, r/i]
-
-This should be reordered on the cpu to 
-[16 time, 96 channels, 63 antennas, 8 chunnels, 2 pol, r/i]
-
-The first kernel, launched with 1536 blocks of 64 threads, needs to
- - promote each measurement and store in shared mem, parallelizing over ants. need only 8 kB. 
- - each thread processes 4 beams, adding everything. for each beam,
-  + for each chunnel and pol, calculate weights using cal weights and ant positions, 
-  + add everything into output array
-Output array has order [beam, 96 frequency, 16 time]
-
-Shared mem requirement: 8 kB for promoted data, 512b for positions, nch*1024b for weights
-
-Initialy we start with 4-bit numbers. these are first rotated using 17-bit weights, yielding 22-bit numbers. 
-these are then added: (64 ant)^2 * (2 complex) * (32 chan) * (2 pol) * (16 time). 
-after adding by 64 ants, we have 28-bit numbers. Need to bit shift right by 19 after adding 64 ants. This will yield 29-bit numbers. Need to bit shift right by 21 to pick off lowest 8 bits. 
-
-Do everything in floating point until second kernel. 
-
-Second kernel will simply add times and adjacent channels and pick leading 8 bits
-Then copy back to specific locations in host to form final [beam, time, frequency] array, to be sent to corner turn.
-
- */
-#define THRUST_IGNORE_CUB_VERSION_CHECK
-
-#include <iostream>
-#include <algorithm>
-using std::cout;
-using std::cerr;
-using std::endl;
-#include <stdio.h>
-#include <stdlib.h>
-#include <cmath>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <netinet/in.h>
-#include <time.h>
-#include <syslog.h>
-#include <pthread.h>
-
-#include <mma.h>
-#include <cuda.h>
-#include "cuda_fp16.h"
-//#include "dada_cuda.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "multilog.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_def.h"
-#include <thrust/device_ptr.h>
-#include <thrust/fill.h>
-
-#include <cuda_runtime_api.h>
-using namespace nvcuda;
-
-#define sep 1.0
-
-// global variables
-int DEBUG = 0;
-
-
-// kernel for summing and requantizing
-// input array has order [beam, 48 frequency, 2 pol, 16 time]
-// need to output to [4 time, beam, 48 frequency]
-// bp is scale factor for each beam 
-// run with 256*48=12288 blocks and 32 threads
-__global__
-void adder(float *input, unsigned char *output, float *bp) {
-
-  // get block and thread ids
-  int bidx = blockIdx.x; // assume 256*48=12288
-  int tidx = threadIdx.x; // assume 32
-  //int fidx = 2*(bidx % 24);
-  int beamidx = (int)(bidx / 48);
-  
-  // declare shared mem
-  __shared__ float data[32]; // data block to be summed  
-
-  // transfer from input to shared mem
-  data[tidx] = input[bidx*32];
-
-  // sync
-  __syncthreads();
-
-  // complete sum
-  if (tidx<16) {
-    data[tidx] += data[tidx+16]; // over pols
-
-    data[tidx] += data[tidx+2];
-    data[tidx] += data[tidx+1];
-  }
-  // now tidx = 0, 4, 8, 12 are what we want! 
-
-  __syncthreads();
-  
-  // store
-  if (tidx == 0) 
-    output[bidx] = (unsigned char)(__float2int_rn(data[0]*bp[beamidx])/2);
-  if (tidx == 4) 
-    output[bidx + 12288] = (unsigned char)(__float2int_rn(data[4]*bp[beamidx])/2);
-  if (tidx == 8) 
-    output[bidx + 2*12288] = (unsigned char)(__float2int_rn(data[8]*bp[beamidx])/2);
-  if (tidx == 12) 
-    output[bidx + 3*12288] = (unsigned char)(__float2int_rn(data[12]*bp[beamidx])/2);
-      
-}
-
-// kernel for promotion
-/*
-orig input is [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i]
-input is [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i]
-output needs to be [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] 
-promoted to half precision  
-
-launch with 16*48*NANT blocks of 32 threads
-
- */
-__global__ void promoter(char *input, half *inr, half *ini) {
-
-  int bidx = blockIdx.x; // assume 16*48*NANT
-  int tidx = threadIdx.x; // assume 32
-  int iidx = bidx*32+tidx;
-  int pol = (int)(tidx % 2);
-  int chunnel = (int)(tidx / 2);
-  
-  /*int ant = (int)(bidx % NANT);
-  int time_chan = (int)(bidx / NANT);    
-  int oidx = time_chan*2048+pol*1024+ant*16+chunnel;*/
-
-  int chan = (int)(bidx % 48);
-  int time_ant = (int)(bidx / 48);
-  int tim = (int)(time_ant / NANT);
-  int ant = (int)(time_ant % NANT);
-  int oidx = tim*98304 + chan*2048 + pol*1024 + ant*16 + chunnel;
-
-  inr[oidx] = __float2half((float)(((char)((input[iidx] & 15) << 4)) >> 4));
-  ini[oidx] = __float2half((float)(((char)((input[iidx] & 240))) >> 4));
-
-}
-
-// 16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels
-// for first time, launch with 3072, 32
-__global__ void printer(half *inr, half *ini) {
-
-  int idx = blockIdx.x*32+threadIdx.x;
-  float ir = __half2float(inr[idx]);
-  float ii = __half2float(ini[idx]);
-
-  int chunnel = (int)(threadIdx.x % 16);
-  int channel = (int)(blockIdx.x/64);
-  int tt = (int)(blockIdx.x % 64);
-  int pol = (int)(tt/32);
-  int ant = ((int)(tt % 32))*((int)(threadIdx.x / 16));
-  
-  if (ir!=0. || ii!=0.) {
-    printf("%d %d %d %d %f %f\n",channel,pol,ant,chunnel,ir,ii);
-  }
-  
-}
-
-
-// kernel for beamforming
-/*
-
-Assumes that up to NANT antennas (nominally 63) are populated. 
-
-Input is [16 time, 48 channels, 2 pol, 64 antennas, 16 chunnels, r/i] (promoted)
-
-Arithmetic... for rotation, d2r = wr*dr-wi*di; d2i = wi*dr+wr*di
-
-Conventions for beamforming. beam 0 is furthest East, beam 127 is at meridian. antpos (D) is easting. 
-for bf weight calculation, where theta = s(127-n), ang = 2*pi*nu*theta*D/c; wr = cos(ang), wi = sin(ang)
-use __float2int_rn, cosf, sinf intrinsics. 
-
-Each warp (==block) has to deal with 256 beams for 64 ants, summing over 16 chunnels and pols. 
-Do it in tiles of 16 beams and 16 ants for 
-
-Output array has order [beam, 48 frequency, 2 pol, 16 time]
-
-inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag
-wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant]
-
-launch with 16time * 48freq * 2pol * 16beam_tile blocks of 32 threads for massive utilization
- = 24576 blocks
-
-*/
-__global__ void beamformer(half *inr, half *ini, half *wr, half *wi, float *output, int stuffants) {
-
-  // get block and thread ids
-  int bidx = blockIdx.x; // assume 24576
-  int tidx = threadIdx.x; // assume 32
-  int orig_bidx = (int)(bidx / 16);
-  int beam_tile = (int)(bidx % 16);
-  int stuff_tile = (int)(beam_tile % 4);
-  int data_offset = orig_bidx*1024; // offset for first part of data
-  int weight_offset = (int)(orig_bidx % 96); // offset for first part of weight
-  weight_offset *= 16384;
-  int idx1, idx2;
-  int f_idx = (int)(orig_bidx % 96);
-  int tim_idx = (int)(orig_bidx / 96);
-  int oidx = f_idx*16 + tim_idx;
-  
-  // shared memory for convenience
-  __shared__ float summr[16][16]; // beam, chunnel
-  __shared__ float summi[16][16]; // beam, chunnel
-  
-  // accumulate real and imag parts into [16 beam x 16 f] fragments
-  // Declare the fragments.
-  wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;
-  wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, float> wr_inr_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, float> wr_ini_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, float> wi_inr_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, float> wi_ini_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, float> ib_frag;
-  
-  // zero out accumulators
-  wmma::fill_fragment(wr_inr_frag, 0.0f);
-  wmma::fill_fragment(wr_ini_frag, 0.0f);
-  wmma::fill_fragment(wi_inr_frag, 0.0f);
-  wmma::fill_fragment(wi_ini_frag, 0.0f);
-  wmma::fill_fragment(ib_frag, 0.0f);
-
-  // IB
-  if (stuffants==2) {
-
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> c_frag;
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> d_frag;
-    
-    for (int ant_tile=0; ant_tile<4; ant_tile++) {
-
-      wmma::load_matrix_sync(c_frag, inr + data_offset + ant_tile*256, 16);
-      wmma::load_matrix_sync(d_frag, inr + data_offset + ant_tile*256, 16);
-      wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag);
-      wmma::load_matrix_sync(c_frag, ini + data_offset + ant_tile*256, 16);
-      wmma::load_matrix_sync(d_frag, ini + data_offset + ant_tile*256, 16);
-      wmma::mma_sync(ib_frag, c_frag, d_frag, ib_frag);
-
-    }
-
-  }
-
-  // one ant per beam
-  if (stuffants==1) {        
-
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> c_frag;
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> d_frag;
-    wmma::load_matrix_sync(c_frag, inr + data_offset + stuff_tile*256, 16);
-    wmma::load_matrix_sync(d_frag, inr + data_offset + stuff_tile*256, 16);
-    wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag);
-    wmma::load_matrix_sync(c_frag, ini + data_offset + stuff_tile*256, 16);
-    wmma::load_matrix_sync(d_frag, ini + data_offset + stuff_tile*256, 16);
-    wmma::mma_sync(wr_inr_frag, c_frag, d_frag, wr_inr_frag);
-    
-  }
-  if (stuffants!=1) {
-  
-    // loop over ant tiles
-    for (int ant_tile=0; ant_tile<4; ant_tile++) {
-      
-      // copy weight and data to fragments, and multiply to accumulators
-      
-      wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16);
-      wmma::load_matrix_sync(b_frag, inr + data_offset + ant_tile*256, 16);
-      wmma::mma_sync(wr_inr_frag, a_frag, b_frag, wr_inr_frag);
-      
-      wmma::load_matrix_sync(a_frag, wi + weight_offset + beam_tile*1024 + ant_tile*256, 16);
-      wmma::mma_sync(wi_inr_frag, a_frag, b_frag, wi_inr_frag);
-      
-      wmma::load_matrix_sync(b_frag, ini + data_offset + ant_tile*256, 16);
-      wmma::mma_sync(wi_ini_frag, a_frag, b_frag, wi_ini_frag);
-      
-      wmma::load_matrix_sync(a_frag, wr + weight_offset + beam_tile*1024 + ant_tile*256, 16);
-      wmma::mma_sync(wr_ini_frag, a_frag, b_frag, wr_ini_frag);
-      
-    }
-
-    // form real and imaginary matrices
-    for(int i=0; i < wr_inr_frag.num_elements; i++) {
-      wr_inr_frag.x[i] = wr_inr_frag.x[i] - wi_ini_frag.x[i]; // output real
-      wi_inr_frag.x[i] = wi_inr_frag.x[i] + wr_ini_frag.x[i]; // output imag
-      wr_inr_frag.x[i] = wr_inr_frag.x[i]*wr_inr_frag.x[i] + wi_inr_frag.x[i]*wi_inr_frag.x[i]; // squared
-    }
-  }
-
-  // at this stage the matrices are [beam, chunnel], and need to be summed over columns
-    
-  // copy back to shared mem
-  float *p1, *p2, tmp;
-  p1 = &summr[0][0];
-  wmma::store_matrix_sync(p1, wr_inr_frag, 16, wmma::mem_row_major);
-
-  if (stuffants!=1) {
-  
-    // do thread reduction for each beam
-    if (tidx<8) {
-      for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+8];
-      for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+4];
-      for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+2];
-      for (int i=0;i<4;i++) summr[i][tidx] += summr[i][tidx+1];
-    }
-    if (tidx>=8 && tidx<16) {
-      for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+8-8];
-      for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+4-8];
-      for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+2-8];
-      for (int i=4;i<8;i++) summr[i][tidx-8] += summr[i][tidx+1-8];  
-    }
-    if (tidx>=16 && tidx<24) {
-      for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+8-16];
-      for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+4-16];
-      for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+2-16];
-      for (int i=8;i<12;i++) summr[i][tidx-16] += summr[i][tidx+1-16];  
-    }
-    if (tidx>=24) {
-      for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+8-24];
-      for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+4-24];
-      for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+2-24];
-      for (int i=12;i<16;i++) summr[i][tidx-24] += summr[i][tidx+1-24];  
-    }
-
-    __syncthreads();
-    
-    // now summr[beam][0] can go into output
-    if (tidx<16) {
-      output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][0];
-    }
-
-  }
-
-  if (stuffants==1) {
-    if (tidx<16) {
-      output[(beam_tile*16+tidx)*1536 + oidx] = summr[tidx][tidx];
-    }
-  }
-  if (stuffants==2) {
-
-    p2 = &summi[0][0];
-    wmma::store_matrix_sync(p2, ib_frag, 16, wmma::mem_row_major);      
-    tmp = 0.;
-    for (int i=0;i<16;i++) tmp += summi[i][i];
-    if (tidx==0 && beam_tile==0) 
-      output[(beam_tile*16+tidx)*1536 + oidx] = tmp;
-
-  }      
-  
-}
-
-// kernel to calculate weights - needed because weights are halfs
-// launch with 256 threads in 6144 blocks
-__global__
-void calc_weights(float *antpos, float *weights, float *freqs, half *wr, half *wi) {
-
-  // assume 256 threads in 6144 blocks
-  int bidx = blockIdx.x; // over 48f, 2pol, 16 beam_tile, 4 ant_tile
-  int tidx = threadIdx.x;
-  int f = (int)(bidx / 128);
-  int cc = (int)(bidx % 128);
-  int pol = (int)(cc / 64);
-  cc = (int)(cc % 64);
-  int beam_tile = (int)(cc / 4);
-  int ant_tile = (int)(cc % 4);
-  int beam_i = (int)(tidx / 16);
-  int ant_i = (int)(tidx % 16);
-
-  int beam = beam_tile*16+beam_i;
-  int ant = ant_tile*16+ant_i;
-  int i = bidx*256+tidx;
-  int widx = ant*NW*2*2 + f*2*2 + pol*2;
-  
-  float theta = sep*(127.-beam*1.)*PI/10800.; // radians
-  float afac = -2.*PI*freqs[f*8+4]*theta/CVAC; // factor for rotate
-  float twr = cos(afac*antpos[ant]);
-  float twi = sin(afac*antpos[ant]);
-
-  wr[i] = __float2half((twr*weights[widx] - twi*weights[widx+1]));
-  wi[i] = __float2half((twi*weights[widx] + twr*weights[widx+1]));
-  
-  
-}  
- 
-  
-// function prototypes
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out, dada_hdu_t * out2);
-int dada_bind_thread_to_core (int core);
-int init_weights(char *fnam, float *antpos, float *weights, char *flagants);
-void reorder_block(char *block);
-void calc_bp(float *data, float *bp, int pr);
-
-
-// performs massive summation to calculate bp
-// input array has order [beam, 96 frequency, 16 time]
-// bp has size 48 - no way to avoid strided memory access
-// returns factor to correct data
-void calc_bp(float *data, float *bp, int pr) {
-
-  int i=0;
-  
-  for (int b=0;b<256;b++) {
-    for (int f=0;f<48;f++) {
-      for (int a=0;a<32;a++) {
-	bp[b] += data[i];
-	if (pr && data[i]!=0.) printf("%d %d %d %f\n",b,f,a,data[i]);
-	i++;
-      }
-    }
-  }
-
-}
-
-// performs cpu reorder of block to be loaded to GPU
-void reorder_block(char * block) {
-
-  // from [16 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i]
-  // to [16 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i]
-  // 24576*NANT in total. 1536*NANT per time
-  
-  char * output = (char *)malloc(sizeof(char)*24576*NANT);
-  
-  for (int i=0;i<16;i++) { // over time
-    for (int j=0;j<NANT;j++) { // over ants
-      for (int k=0;k<48;k++) { // over channels
-
-	// copy 32 bytes
-	memcpy(output + i*1536*NANT + k*NANT*32 + j*32, block + i*1536*NANT + j*1536 + k*32, 32); 
-	
-      }
-    }
-  }
-
-  memcpy(block,output,24576*NANT);
-  free(output);
-
-}
-
-
-// loads in weights
-int init_weights(char * fnam, float *antpos, float *weights, char *flagants) {
-
-  // assumes 64 antennas
-  // antpos: takes only easting
-  // weights: takes [ant, NW==48] 
-
-  FILE *fin;
-  FILE *fants;
-  
-  if (!(fin=fopen(fnam,"rb"))) {
-    syslog(LOG_ERR,"Couldn't open weights file %s",fnam);
-    return 1;
-  }
-  if (!(fants=fopen(flagants,"r"))) {
-    syslog(LOG_ERR,"Couldn't open flag ants file %s",flagants);
-    return 1;
-  }
-
-  fread(antpos,64*sizeof(float),1,fin);
-  fread(weights,64*NW*2*2*sizeof(float),1,fin);
-  float wnorm;
-  for (int i=0;i<64*NW*2;i++) {
-    wnorm = sqrt(weights[2*i]*weights[2*i] + weights[2*i+1]*weights[2*i+1]);
-    if (wnorm!=0.0) {
-      weights[2*i] /= wnorm;
-      weights[2*i+1] /= wnorm;
-    }
-  }
-	
-
-  int ant;
-  while (!feof(fants)) {
-    fscanf(fants,"%d\n",&ant);
-    for (int j=0;j<NW*2*2;j++) {
-      weights[ant*NW*2*2+j] = 0.0;
-    }
-  }
-      
-  fclose(fants);
-  fclose(fin);
-  if (DEBUG) syslog(LOG_INFO,"Loaded antenna positions and weights");
-  return 0;
-
-}
-
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out, dada_hdu_t * out2)
-{
-
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
-  dada_hdu_destroy (in);
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock write on hdu_out");
-    }
-  dada_hdu_destroy (out);
-
-  if (dada_hdu_unlock_write (out2) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock write on hdu_out2");
-    }
-  dada_hdu_destroy (out2);
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_beamformer [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -f filename for antenna stuff [no default]\n"
-	   " -i input key [default REORDER_BLOCK_KEY2]\n"
-	   " -o output key [default BF_BLOCK_KEY]\n"
-	   " -g output key 2 [no default]\n"	   
-	   " -z fch1 in MHz [default 1530]\n"
-	   " -a flagants file\n"
-	   " -s stuffants \n"
-	   " -q do incoherent beam \n"
-	   " -t test pattern \n"
-	   " -h print usage\n");
-}
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_beamformer", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-
-  // device properties
-  int nDevices;
-
-  cudaGetDeviceCount(&nDevices);
-  for (int i = 0; i < nDevices; i++) {
-    cudaDeviceProp prop;
-    cudaGetDeviceProperties(&prop, i);
-    syslog(LOG_INFO,"Device Number: %d", i);
-    syslog(LOG_INFO,"  Device name: %s", prop.name);
-    syslog(LOG_INFO,"  Memory Clock Rate (KHz): %d",prop.memoryClockRate);
-  }
-  cudaSetDevice(1);
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-  dada_hdu_t* hdu_out2 = 0;
-
-  // data block HDU keys
-  key_t in_key = REORDER_BLOCK_KEY2;
-  key_t out_key = BF_BLOCK_KEY, out_key2 = BF_BLOCK_KEY;
-  
-  // command line arguments
-  int core = -1;
-  int arg = 0;
-  int stuffants=0;
-  int test_pattern = 0;
-  float fch1 = 1530.0;
-  char * fnam;
-  fnam=(char *)malloc(sizeof(char)*100);
-  sprintf(fnam,"nofile");  
-  char * flagants;
-  flagants=(char *)malloc(sizeof(char)*100);
-  sprintf(flagants,"nofile");  
-
-  while ((arg=getopt(argc,argv,"c:f:i:o:g:z:a:tsqdh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'g':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key2) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-g flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'f':
-	  if (optarg)
-	    {
-	      strcpy(fnam,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-f flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'a':
-	  if (optarg)
-	    {
-	      strcpy(flagants,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-a flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'z':
-	  if (optarg)
-	    {
-	      fch1 = atof(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-z flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 't':
-	  test_pattern=1;
-	  syslog (LOG_INFO, "Will execute test pattern");
-	  break;
-	case 's':
-	  stuffants=1;
-	  syslog (LOG_INFO, "Will place antennas in output");
-	  break;
-	case 'q':
-	  stuffants=2;
-	  syslog (LOG_INFO, "Will place IB in output");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // print stuff
-  syslog(LOG_INFO,"Forming 256 beams with sep %g arcmin, fch1 %g",sep,fch1);
-  syslog(LOG_INFO,"Using calibrations file %s",fnam);
-  syslog(LOG_INFO,"Using flagants file %s",flagants);
-
-  // load in weights and antpos
-  float * antpos = (float *)malloc(sizeof(float)*64); // easting
-  float * weights = (float *)malloc(sizeof(float)*64*NW*2*2); // complex weights [ant, NW, pol, r/i]
-  float * freqs = (float *)malloc(sizeof(float)*384); // freq
-  for (int i=0;i<384;i++) freqs[i] = (fch1 - i*250./8192.)*1e6;  
-  
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out2  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out2, out_key2);
-  if (dada_hdu_connect (hdu_out2) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out2) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-      return EXIT_FAILURE;
-    }
-
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-      return EXIT_FAILURE;
-    }
-
-  header_out = ipcbuf_get_next_write (hdu_out2->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-      return EXIT_FAILURE;
-    }
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  uint64_t block_out2 = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out2->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
-  uint64_t  bytes_read = 0;
-  int nints = NPACKETS / 16;
-  uint64_t nbytes_per_int = block_size / nints;
-  uint64_t nbytes_per_out = block_out / nints;
-  char * block;
-  unsigned char * output_buffer;
-  output_buffer = (unsigned char *)malloc(sizeof(unsigned char)*block_out);
-  memset(output_buffer,0,block_out);
-  uint64_t written, block_id;
-  
-  // allocate host and device memory for calculations
-  //inr and ini are data, in [16 time, 48 freq, 2 pol, 64 ant, 16 chunnels] for real and imag
-  //wr and wi are weights, in [48 freq, 2 pol, 16 beam_tile, 4 ant_tile, 16 beam, 16 ant]        
-  char *d_indata[NSTREAMS];
-  unsigned char *d_outdata[NSTREAMS];
-  float *d_transfer[NSTREAMS], *d_bp, *d_antpos, *d_weights, *d_freqs;
-  half *d_wr, *d_wi, *d_inr[NSTREAMS], *d_ini[NSTREAMS];
-  cudaMalloc((void **)&d_antpos, 64*sizeof(float)); // ant positions
-  cudaMalloc((void **)&d_weights, 64*NW*2*2*sizeof(float)); // weights
-  cudaMalloc((void **)&d_freqs, 384*sizeof(float)); // freqs        
-  cudaMalloc((void **)&d_bp, 256*sizeof(float)); // bandpass
-  cudaMalloc((void **)&d_wr, 48*2*16*4*16*16*sizeof(half)); // real weight
-  cudaMalloc((void **)&d_wi, 48*2*16*4*16*16*sizeof(half)); // imag weight
-  cudaMemcpy(d_freqs, freqs, 384*sizeof(float), cudaMemcpyHostToDevice);
-  
-  float *h_transfer = (float *)malloc(sizeof(float)*256*96*16*NSTREAMS);
-  char *h_indata = (char *)malloc(sizeof(char)*16*NANT*96*8*2);
-  float *bp = (float *)malloc(sizeof(float)*256);
-  unsigned char *tmp_buf = (unsigned char *)malloc(sizeof(unsigned char)*256*48*4*NSTREAMS);  
-  
-  // streams and device  
-  cudaStream_t stream[NSTREAMS];
-  for (int st=0;st<NSTREAMS;st++) {
-    cudaStreamCreate(&stream[st]);
-    cudaMalloc((void **)&d_indata[st], 16*96*NANT*8*2*sizeof(char)); // data input to bf kernel
-    cudaMalloc((void **)&d_outdata[st], 256*48*4*sizeof(unsigned char)); // data output from adder
-    cudaMalloc((void **)&d_transfer[st], 256*96*16*sizeof(float)); // output from beamformer
-    cudaMalloc((void **)&d_inr[st], 16*48*2*64*16*sizeof(half)); // real data
-    cudaMalloc((void **)&d_ini[st], 16*48*2*64*16*sizeof(half)); // real data
-    thrust::device_ptr<half> d1(d_inr[st]);
-    thrust::fill(d1, d1+16*48*2*64*16, 0.0);
-    thrust::device_ptr<half> d2(d_ini[st]);
-    thrust::fill(d2, d2+16*48*2*64*16, 0.0);
-  }
-
-  
-  
-  // set up
-
-  int observation_complete=0;
-  int blocks = 0, started = 0;
-  int blockct = 0;
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-    blockct ++;
-
-    // write to output
-    /*    written = ipcio_write (hdu_out2->data_block, block, block_out2);
-    if (written < block_out2)
-      {
-	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
-	dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-	return EXIT_FAILURE;
-	}*/
-    
-    // DO STUFF
-
-    // calc weights
-    init_weights(fnam,antpos,weights,flagants);
-    cudaMemcpy(d_antpos, antpos, 64*sizeof(float), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_weights, weights, 64*NW*2*2*sizeof(float), cudaMemcpyHostToDevice);  
-    calc_weights<<<6144, 256>>>(d_antpos, d_weights, d_freqs, d_wr, d_wi);
-    if (DEBUG) syslog(LOG_INFO,"Finished with weights");
-    
-    if (started==1) {
-
-      // loop over ints
-      for (int bst=0;bst<nints/NSTREAMS;bst++) {
-
-	for (int st=0;st<NSTREAMS;st++) {
-
-
-	  
-	  // copy to h_indata
-	  //memcpy(h_indata,block+(bst*NSTREAMS+st)*nbytes_per_int,nbytes_per_int);
-
-	  // rotate h_indata in place
-	  //reorder_block(h_indata);
-	  
-	  // copy to device
-	  //cudaMemcpyAsync(d_indata, h_indata, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
-	  cudaMemcpyAsync(d_indata[st], block+(bst*NSTREAMS+st)*nbytes_per_int, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
-
-	  // do promotion
-	  promoter<<<16*48*NANT, 32, 0, stream[st]>>>(d_indata[st], d_inr[st], d_ini[st]);
-	  
-	  // run beamformer kernel
-	  beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants);
-	  	  
-	  // run adder kernel
-	  adder<<<12288, 32, 0, stream[st]>>>(d_transfer[st], d_outdata[st], d_bp);
-	  
-	  // copy to host
-	  cudaMemcpyAsync(tmp_buf + 256*48*4*st, d_outdata[st], 256*48*4*sizeof(unsigned char), cudaMemcpyDeviceToHost, stream[st]);
-
-	  // copy to output
-	  for (int j=0;j<12288*4;j++) {
-	    if (test_pattern) 
-	      output_buffer[(bst*NSTREAMS+st)*12288*4+j] = (unsigned char)((blockct % 128) + 32);
-	    else
-	      output_buffer[(bst*NSTREAMS+st)*12288*4+j] = tmp_buf[j+256*48*4*st];
-	  }
-	  if (DEBUG && bst*NSTREAMS+st==10) {
-	    for (int j=0;j<48;j++) syslog(LOG_DEBUG,"%hu",output_buffer[(bst*NSTREAMS+st)*12288+BEAM_OUT*48+j]);
-	  }        
-	  
-	}
-      }
-
-
-    }
-    
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-
-      // calculate bandpass
-
-      for (int i=0;i<256;i++) bp[i] = 0.;
-      
-      // do standard bf but calculate bandpass
-
-      // loop over ints
-      for (int bst=0;bst<nints/NSTREAMS;bst++) {
-
-	for (int st=0;st<NSTREAMS;st++) {
-	  
-	  // copy to h_indata
-	  //memcpy(h_indata,block+(bst*NSTREAMS+st)*nbytes_per_int,nbytes_per_int);
-
-	  // rotate h_indata in place - this is current
-	  //reorder_block(h_indata);
-
-	  // copy to device
-	  //cudaMemcpyAsync(d_indata, h_indata, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
-	  cudaMemcpyAsync(d_indata[st], block+(bst*NSTREAMS+st)*nbytes_per_int, 24576*NANT*sizeof(char), cudaMemcpyHostToDevice, stream[st]);
-
-	  // do promotion
-	  promoter<<<16*48*NANT, 32, 0, stream[st]>>>(d_indata[st], d_inr[st], d_ini[st]);
-
-	  //if (bst==0 && st==0) 
-	  //  printer<<<3072, 32>>>(d_inr,d_ini);	  
-	  
-	  // run beamformer kernel
-	  beamformer<<<24576, 32, 0, stream[st]>>>(d_inr[st], d_ini[st], d_wr, d_wi, d_transfer[st], stuffants);
-	  
-	  // copy back to host
-	  cudaMemcpyAsync(h_transfer + st*256*96*16, d_transfer[st], sizeof(float)*393216, cudaMemcpyDeviceToHost, stream[st]);	
-
-	  // calculate bandpass
-	  //if (st==0 && bst==0) 
-	  //calc_bp(h_transfer,bp,1);
-	  calc_bp(h_transfer + st*256*96*16,bp,0);
-
-	}
-      }
-
-      // adjust bandpass
-      syslog(LOG_INFO,"Final BP...");
-      for (int i=0;i<256;i++) {
-	syslog(LOG_INFO,"coeff %d %g",i,bp[i]);
-	if (bp[i]!=0.) {
-	  bp[i] /= 48.*nints; 
-	  bp[i] = 128./bp[i]/4.;
-	}
-      }
-      cudaMemcpy(d_bp, bp, sizeof(float)*256, cudaMemcpyHostToDevice);
-      
-      // junk into output
-      memset(output_buffer,0,block_out);
-      
-    }
-
-    // write output for debug
-    
-    // write to output
-    written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out);
-    if (written < block_out)
-      {
-	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
-	dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-	return EXIT_FAILURE;
-      }
-
-    if (DEBUG) {
-      syslog(LOG_DEBUG, "written block %d",blocks);      
-    }
-    blocks++;
-    
-
-    if (bytes_read < block_size)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  for (int st=0;st<NSTREAMS;st++) {
-    cudaStreamDestroy(stream[st]);
-    cudaFree(d_indata[st]);
-    cudaFree(d_outdata[st]);
-    cudaFree(d_transfer[st]);
-    cudaFree(d_inr[st]);
-    cudaFree(d_ini[st]);
-  }
-  free(fnam);
-  free(flagants);
-  free(h_indata);
-  free(output_buffer);
-  free(antpos);
-  free(weights);
-  free(freqs);
-  free(bp);
-  free(h_transfer);
-  free(tmp_buf);
-  cudaFree(d_wr);
-  cudaFree(d_wi);
-  cudaFree(d_antpos);
-  cudaFree(d_freqs);
-  cudaFree(d_weights);
-  cudaFree(d_wr);
-  cudaFree(d_wi);
-  cudaFree(d_bp);
-  dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-  
-}
-
-
diff --git a/src/dsaX_bigfake.c b/src/dsaX_bigfake.c
deleted file mode 100644
index f5e1354..0000000
--- a/src/dsaX_bigfake.c
+++ /dev/null
@@ -1,320 +0,0 @@
-/* will reorder raw data for input to xgpu */
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-// global variables
-int DEBUG = 0;
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
-{
-
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
-  dada_hdu_destroy (in);
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock write on hdu_out");
-    }
-  dada_hdu_destroy (out);
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_fake [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -f file to read packet from [default none]\n"
-	   " -i in_key [default TEST_BLOCK_KEY]\n"
-	   " -o out_key [default REORDER_BLOCK_KEY2]\n"
-	   " -h print usage\n");
-}
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_fake", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-
-  // data block HDU keys
-  key_t in_key = TEST_BLOCK_KEY;
-  key_t out_key = REORDER_BLOCK_KEY2;
-  
-  // command line arguments
-  int core = -1;
-  int useZ = 1;
-  char fnam[100];
-  int arg = 0;
-  
-  while ((arg=getopt(argc,argv,"c:f:i:o:dh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'f':
-	  if (optarg)
-	    {
-	      useZ = 0;
-	      strcpy(fnam,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-f flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create ();
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create ();
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
-  uint64_t  bytes_read = 0;
-  uint64_t npackets = block_out / 4194304;
-  char * block, * output_buffer;
-  char * packet;
-  packet = (char *)malloc(sizeof(char)*4194304);
-  output_buffer = (char *)malloc(sizeof(char)*block_out);
-  memset(output_buffer,0,block_out);
-  uint64_t written, block_id;
-
-  // fill output buffer if file exists
-  FILE *fin;
-  if (!useZ) {
-
-    if (!(fin=fopen(fnam,"rb"))) {
-      syslog(LOG_ERR, "cannot open file - will write zeros");
-    }
-    else {
-
-      fread(packet,4194304,1,fin);
-      fclose(fin);
-
-      syslog(LOG_INFO,"Read packet, npackets %llu",npackets);
-      
-      for (int i=0;i<npackets;i++)
-	memcpy(output_buffer+i*4194304,packet,4194304);
-
-      syslog(LOG_INFO, "Using input packet");
-      
-    }
-
-    
-  }
-
-  // set up
-
-  int observation_complete=0;
-  int blocks = 0, started = 0;
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-    }
-
-    // DO STUFF
-    // no need to do anything here - output_buffer is ready to go
-
-    // write to output
-    written = ipcio_write (hdu_out->data_block, output_buffer, block_out);
-    if (written < block_out)
-      {
-	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
-	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-	return EXIT_FAILURE;
-      }
-
-    if (DEBUG) {
-      syslog(LOG_DEBUG, "written block %d",blocks);      
-    }
-    blocks++;
-    
-
-    if (bytes_read < block_size)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  free(packet);
-  free(output_buffer);
-  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-  
-}
-
-
diff --git a/src/dsaX_capture.c b/src/dsaX_capture.c
deleted file mode 100644
index 054e45d..0000000
--- a/src/dsaX_capture.c
+++ /dev/null
@@ -1,1080 +0,0 @@
-/* dsaX_capture.c: Code to capture packets over a socket and write to a dada buffer.
-
-1;95;0cmain: runs capture loop, and interfaces dada buffer
-control_thread: deals with control commands
-
-*/
-
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-//#include "multilog.h"
-
-#define unhappies 3000
-#define skips 6
-#define sleeps 1.5
-
-/* global variables */
-int quit_threads = 0;
-char STATE[20];
-uint64_t UTC_START = 10000;
-uint64_t UTC_STOP = 40000000000;
-int MONITOR = 0;
-char iP[100];
-int DEBUG = 0;
-int HISTOGRAM[16];
-int cPort = CAPTURE_CONTROL_PORT;
-int dPort = CAPTURE_PORT;
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * out);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * out)
-{
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_out");
-    }
-  dada_hdu_destroy (out);
-
-  
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_capture [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -j IP to listen on for data packets [no default]\n"
-	   " -p PORT to listen to for data packets [default 4011]\n"
-	   " -q PORT to listen to for control commands [default CAPTURE_CONTROL_PORT]\n"
-	   " -i IP to listen on for control commands [no default]\n"	
-	   " -f filename of template dada header [no default]\n"
-	   " -o out_key [default CAPTURE_BLOCK_KEY]\n"	   
-	   " -d send debug messages to syslog\n"
-	   " -h print usage\n");
-}
-
-/*
- * create a socket with the specified number of buffers
- */
-dsaX_sock_t * dsaX_init_sock ()
-{
-  dsaX_sock_t * b = (dsaX_sock_t *) malloc(sizeof(dsaX_sock_t));
-  assert(b != NULL);
-
-  b->bufsz = sizeof(char) * UDP_PAYLOAD;
-
-  b->buf = (char *) malloc (b->bufsz);
-  assert(b->buf != NULL);
-
-  b->have_packet = 0;
-  b->fd = 0;
-
-  return b;
-}
-
-void dsaX_free_sock(dsaX_sock_t* b)
-{
-  b->fd = 0;
-  b->bufsz = 0;
-  b->have_packet =0;
-  if (b->buf)
-    free (b->buf);
-  b->buf = 0;
-}
-
-/* 
- *  intialize UDP receiver resources
- */
-int dsaX_udpdb_init_receiver (udpdb_t * ctx)
-{
-  syslog(LOG_INFO,"dsax_udpdb_init_receiver()");
-
-  // create a dsaX socket which can hold variable num of UDP packet
-  ctx->sock = dsaX_init_sock();
-
-  ctx->ooo_packets = 0;
-  ctx->recv_core = -1;
-  ctx->n_sleeps = 0;
-  ctx->mb_rcv_ps = 0;
-  ctx->mb_drp_ps = 0;
-  ctx->block_open = 0;
-  ctx->block_count = 0;
-  ctx->capture_started = 0;
-  ctx->last_seq = 0;
-  ctx->last_byte = 0;
-  ctx->block_start_byte = 0;
-
-  // allocate required memory strucutres
-  ctx->packets = init_stats_t();
-  ctx->bytes   = init_stats_t();
-
-  syslog(LOG_INFO,"receiver inited");
-  
-  return 0;
-}
-
-/* 
-prepare socket and writer
-*/
-
-int dsaX_udpdb_prepare (udpdb_t * ctx)
-{
-  syslog(LOG_INFO, "dsaX_udpdb_prepare()");
-
-  // open socket
-  syslog(LOG_INFO, "prepare: creating udp socket on %s:%d", ctx->interface, ctx->port);
-  ctx->sock->fd = dada_udp_sock_in(ctx->log, ctx->interface, ctx->port, ctx->verbose);
-  if (ctx->sock->fd < 0) {
-    syslog (LOG_ERR, "Error, Failed to create udp socket");
-    return -1;
-  }
-
-  
-  // set the socket size to 256 MB
-  int sock_buf_size = 4*1024*1024;
-  syslog(LOG_INFO, "prepare: setting buffer size to %d", sock_buf_size);
-  dada_udp_sock_set_buffer_size (ctx->log, ctx->sock->fd, ctx->verbose, sock_buf_size);
-
-  // set the socket to non-blocking
-  syslog(LOG_INFO, "prepare: setting non_block");
-  sock_nonblock(ctx->sock->fd);
-
-  // clear any packets buffered by the kernel
-  syslog(LOG_INFO, "prepare: clearing packets at socket");
-  size_t cleared = dada_sock_clear_buffered_packets(ctx->sock->fd, UDP_PAYLOAD);
-
-  // setup the next_seq to the initial value
-  //ctx->last_seq = 0;
-  //ctx->last_byte = 0;
-  //ctx->n_sleeps = 0;
-
-  return 0;
-}
-
-/*
- *  reset receiver before an observation commences
- */
-void dsaX_udpdb_reset_receiver (udpdb_t * ctx) 
-{
-  syslog (LOG_INFO, "dsaX_udpdb_reset_receiver()");
-
-  ctx->capture_started = 0;
-  ctx->last_seq = 0;
-  ctx->last_byte = 0;
-  ctx->n_sleeps = 0;
-
-  reset_stats_t(ctx->packets);
-  reset_stats_t(ctx->bytes);
-}
-
-/* 
- *  open a data block buffer ready for direct access
- */
-int dsaX_udpdb_open_buffer (udpdb_t * ctx)
-{
-
-  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_open_buffer()");
-
-  if (ctx->block_open)
-  {
-    syslog (LOG_ERR, "open_buffer: buffer already opened");
-    return -1;
-  }
-
-  if (DEBUG) syslog (LOG_DEBUG, "open_buffer: ipcio_open_block_write");
-
-  uint64_t block_id = 0;
-
-  ctx->block = ipcio_open_block_write (ctx->hdu->data_block, &block_id);
-  if (!ctx->block)
-  { 
-    syslog (LOG_ERR, "open_buffer: ipcio_open_block_write failed");
-    return -1;
-  }
-
-  ctx->block_open = 1;
-  ctx->block_count = 0;
-
-  return 0;
-}
-
-/*
- *  close a data buffer, assuming a full block has been written
- */
-int dsaX_udpdb_close_buffer (udpdb_t * ctx, uint64_t bytes_written, unsigned eod)
-{
-
-  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_close_buffer(%"PRIu64", %d)", bytes_written, eod);
-
-  if (!ctx->block_open)
-  { 
-    syslog (LOG_ERR, "close_buffer: buffer already closed");
-    return -1;
-  }
-
-  // log any buffers that are not full, except for the 1 byte "EOD" buffer
-  if ((bytes_written != 1) && (bytes_written != ctx->hdu_bufsz))
-    syslog ((eod ? LOG_INFO : LOG_WARNING), "close_buffer: "
-              "bytes_written[%"PRIu64"] != hdu_bufsz[%"PRIu64"]", 
-              bytes_written, ctx->hdu_bufsz);
-
-  if (eod)
-  {
-    if (ipcio_update_block_write (ctx->hdu->data_block, bytes_written) < 0)
-    {
-      syslog (LOG_ERR, "close_buffer: ipcio_update_block_write failed");
-      return -1;
-    }
-  }
-  else 
-  {
-    if (ipcio_close_block_write (ctx->hdu->data_block, bytes_written) < 0)
-    {
-      syslog (LOG_ERR, "close_buffer: ipcio_close_block_write failed");
-      return -1;
-    }
-  }
-
-  ctx->block = 0;
-  ctx->block_open = 0;
-
-  return 0;
-}
-
-/* 
- *  move to the next ring buffer element. return pointer to base address of new buffer
- */
-int dsaX_udpdb_new_buffer (udpdb_t * ctx)
-{
-
-  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_new_buffer()");
-
-  if (dsaX_udpdb_close_buffer (ctx, ctx->hdu_bufsz, 0) < 0)
-  {
-    syslog (LOG_ERR, "new_buffer: dsaX_udpdb_close_buffer failed");
-    return -1;
-  }
-
-  if (dsaX_udpdb_open_buffer (ctx) < 0) 
-  {
-    syslog (LOG_ERR, "new_buffer: dsaX_udpdb_open_buffer failed");
-    return -1;
-  }
-
-  // increment buffer byte markers
-  ctx->block_start_byte = ctx->block_end_byte + UDP_DATA;
-  ctx->block_end_byte = ctx->block_start_byte + ( ctx->packets_per_buffer - 1) * UDP_DATA;
-
-  // set block to 0
-  //memset(ctx->block,0,ctx->block_end_byte-ctx->block_start_byte);
-  
-  if (DEBUG) syslog(LOG_DEBUG, "new_buffer: buffer_bytes [%"PRIu64" - %"PRIu64"]", 
-             ctx->block_start_byte, ctx->block_end_byte);
-
-  return 0;
-
-}
-
-/* 
- *  destroy UDP receiver resources 
- */
-int dsaX_udpdb_destroy_receiver (udpdb_t * ctx)
-{
-  if (ctx->sock)
-    dsaX_free_sock(ctx->sock);
-  ctx->sock = 0;
-}
-
-/*
- * Close the udp socket and file
- */
-
-int udpdb_stop_function (udpdb_t* ctx)
-{
-
-  syslog(LOG_INFO, "stop: dada_hdu_unlock_write()");
-  if (dada_hdu_unlock_write (ctx->hdu) < 0)
-  {
-    syslog (LOG_ERR, "stop: could not unlock write on");
-    return -1;
-  }
-
-  // close the UDP socket
-  close(ctx->sock->fd);
-
-  if (ctx->packets->dropped)
-  {
-    double percent = (double) ctx->bytes->dropped / (double) ctx->last_byte;
-    percent *= 100;
-
-    syslog(LOG_INFO, "bytes dropped %"PRIu64" / %"PRIu64 " = %8.6f %",
-             ctx->bytes->dropped, ctx->last_byte, percent);
-  }
-
-  return 0;
-}
-
-
-
-
-/* --------- THREADS -------- */
-
-// STATS THREAD
-
-/* 
- *  Thread to print simple capture statistics
- */
-void stats_thread(void * arg) {
-
-  /*  // set affinity
-  const pthread_t pid = pthread_self();
-  const int core_id = 4;
-  cpu_set_t cpuset;
-  CPU_ZERO(&cpuset);
-  CPU_SET(core_id, &cpuset);
-  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (set_result != 0)
-    syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id);
-  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (get_affinity != 0) 
-    syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id);
-  if (CPU_ISSET(core_id, &cpuset))
-    syslog(LOG_INFO,"thread %d: successfully set thread",core_id);
-  */
-  
-  udpdb_t * ctx = (udpdb_t *) arg;
-  uint64_t b_rcv_total = 0;
-  uint64_t b_rcv_1sec = 0;
-  uint64_t b_rcv_curr = 0;
-
-  uint64_t b_drp_total = 0;
-  uint64_t b_drp_1sec = 0;
-  uint64_t b_drp_curr = 0;
-
-  uint64_t s_rcv_total = 0;
-  uint64_t s_rcv_1sec = 0;
-  uint64_t s_rcv_curr = 0;
-
-  uint64_t ooo_pkts = 0;
-  float gb_rcv_ps = 0;
-  float mb_rcv_ps = 0;
-  float mb_drp_ps = 0;
-
-  syslog(LOG_INFO,"stats_thread: starting loop");
-  
-  while (!quit_threads)
-  {
-
-    /* get a snapshot of the data as quickly as possible */
-    b_rcv_curr = ctx->bytes->received;
-    b_drp_curr = ctx->bytes->dropped;
-    s_rcv_curr = ctx->n_sleeps;
-    
-    /* calc the values for the last second */
-    b_rcv_1sec = b_rcv_curr - b_rcv_total;
-    b_drp_1sec = b_drp_curr - b_drp_total;
-    s_rcv_1sec = s_rcv_curr - s_rcv_total;
-
-    /* update the totals */
-    b_rcv_total = b_rcv_curr;
-    b_drp_total = b_drp_curr;
-    s_rcv_total = s_rcv_curr;
-
-    mb_rcv_ps = (double) b_rcv_1sec / 1000000;
-    mb_drp_ps = (double) b_drp_1sec / 1000000;
-    gb_rcv_ps = b_rcv_1sec * 8;
-    gb_rcv_ps /= 1000000000;
-
-    /* determine how much memory is free in the receivers */
-    syslog (LOG_NOTICE,"CAPSTATS %6.3f [Gb/s], D %4.1f [MB/s], D %"PRIu64" pkts, %"PRIu64" skipped 0", gb_rcv_ps, mb_drp_ps, ctx->packets->dropped, ctx->last_seq);
-
-    sleep(1);
-  }
-
-}
-
-
-
-
-
-
-
-// CONTROL THREAD
-
-void control_thread (void * arg) {
-
-  udpdb_t * ctx = (udpdb_t *) arg;
-  syslog(LOG_INFO, "control_thread: starting");
-
-  // port on which to listen for control commands
-  int port = cPort;
-  char sport[10];
-  sprintf(sport,"%d",port);
-
-  // buffer for incoming command strings, and setup of socket
-  int bufsize = 1024;
-  char* buffer = (char *) malloc (sizeof(char) * bufsize);
-  memset(buffer, '\0', bufsize);
-  const char* whitespace = " ";
-  char * command = 0;
-  char * args = 0;
-
-  struct addrinfo hints;
-  struct addrinfo* res=0;
-  memset(&hints,0,sizeof(hints));
-  struct sockaddr_storage src_addr;
-  socklen_t src_addr_len=sizeof(src_addr);
-  hints.ai_family=AF_INET;
-  hints.ai_socktype=SOCK_DGRAM;
-  getaddrinfo(iP,sport,&hints,&res);
-  int fd;
-  ssize_t ct;
-  char tmpstr;
-  char cmpstr = 'p';
-  char *endptr;
-  uint64_t tmps;
-  char * token;
-  
-  syslog(LOG_INFO, "control_thread: created socket on port %d", port);
-  
-  while (!quit_threads) {
-    
-    fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol);
-    bind(fd,res->ai_addr,res->ai_addrlen);
-    memset(buffer,'\0',sizeof(buffer));
-    syslog(LOG_INFO, "control_thread: waiting for packet");
-    ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len);
-    
-    syslog(LOG_INFO, "control_thread: received buffer string %s",buffer);
-
-    // INTERPRET BUFFER STRING
-    // receive either UTC_START, UTC_STOP, MONITOR
-
-    // interpret buffer string
-    char * rest = buffer;
-    char *cmd, *val;
-    cmd = strtok_r(rest, "-", &rest);
-    val = strtok_r(rest, "-", &rest);
-    syslog(LOG_INFO, "control_thread: split into COMMAND %s, VALUE %s",cmd,val);
-
-    if (strcmp(cmd,"UTC_START")==0)
-      UTC_START = strtoull(val,&endptr,0);
-
-    if (strcmp(cmd,"UTC_STOP")==0)
-      UTC_STOP = strtoull(val,&endptr,0);    
-    
-    close(fd);
-    
-  }
-
-  free (buffer);
-
-  syslog(LOG_INFO, "control_thread: exiting");
-
-  /* return 0 */
-  int thread_result = 0;
-  pthread_exit((void *) &thread_result);
-
-}
-	    
-// MAIN of program
-	
-int main (int argc, char *argv[]) {
-
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_capture", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA Header plus Data Unit for writing */
-  dada_hdu_t* hdu_out = 0;
-
-  /* actual struct with info */
-  udpdb_t udpdb;
-  
-  // input data block HDU key
-  key_t out_key = CAPTURE_BLOCK_KEY;
-
-  // command line arguments
-  int core = -1;
-  int arg=0;
-  char dada_fnam[200]; // filename for dada header
-  char iface[100]; // IP for data packets
-  
-  while ((arg=getopt(argc,argv,"c:j:i:f:o:g:p:q:dh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {	      
-	      strcpy(iP,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'p':
-	  if (optarg)
-	    {	      
-	      dPort = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-p flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'q':
-	  if (optarg)
-	    {	      
-	      cPort = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-q flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'j':
-	  if (optarg)
-	    {	      
-	      strcpy(iface,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-j flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }      	
-	case 'f':
-	  if (optarg)
-	    {	      
-	      strcpy(dada_fnam,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-f flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	 
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // record STATE info
-  sprintf(STATE,"NOBUFFER");
-
-  // START THREADS
-  
-  // start control thread
-  int rval = 0;
-  pthread_t control_thread_id, stats_thread_id;
-  if (DEBUG)
-    syslog (LOG_DEBUG, "Creating threads");
-  rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb);
-  if (rval != 0) {
-    syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval));
-    return -1;
-  }
-  syslog(LOG_NOTICE, "Created control thread, listening on %s:%d",iP,cPort);
-
-  // start the stats thread
-  rval = pthread_create (&stats_thread_id, 0, (void *) stats_thread, (void *) &udpdb);
-  if (rval != 0) {
-    syslog(LOG_INFO, "Error creating stats_thread: %s", strerror(rval));
-    return -1;
-  }
-  syslog(LOG_NOTICE, "started stats_thread()");
-
-  
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  // initialize the data structure
-  syslog (LOG_INFO, "main: dsaX_udpdb_init_receiver()");
-  if (dsaX_udpdb_init_receiver (&udpdb) < 0)
-  {
-    syslog (LOG_ERR, "could not initialize receiver");
-    return EXIT_FAILURE;
-  }
-  
-  
-  // OPEN CONNECTION TO DADA DB FOR WRITING
-
-  if (DEBUG) syslog(LOG_INFO,"Creating HDU");
-  
-  hdu_out  = dada_hdu_create (0);
-  if (DEBUG) syslog(LOG_INFO,"Created hdu");
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog(LOG_ERR,"could not connect to output dada buffer");
-    return EXIT_FAILURE;
-  }
-  if (DEBUG) syslog(LOG_INFO,"Connected HDU");
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    dsaX_dbgpu_cleanup (hdu_out);
-    syslog(LOG_ERR,"could not lock to output dada buffer");
-    return EXIT_FAILURE;
-  }
-
-  syslog(LOG_INFO,"opened connection to output DB");
-
-  // DEAL WITH DADA HEADER
-  char *hout;
-  hout = (char *)malloc(sizeof(char)*4096);
-  if (DEBUG) syslog(LOG_INFO,"read header2");
-
-  if (fileread (dada_fnam, hout, 4096) < 0)
-    {
-      free (hout);
-      syslog (LOG_ERR, "could not read ASCII header from %s", dada_fnam);
-      return (EXIT_FAILURE);
-    }
-
-  
-  if (DEBUG) syslog(LOG_INFO,"read header3");
-
-  
-  
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_out);
-      return EXIT_FAILURE;
-    }
-
-
-  
-  // copy the in header to the out header
-  memcpy (header_out, hout, 4096);
-
-  // mark the output header buffer as filled
-  if (ipcbuf_mark_filled (hdu_out->header_block, 4096) < 0)
-    {
-      syslog(LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  // record STATE info
-  sprintf(STATE,"LISTEN");
-  syslog(LOG_INFO,"marked output header block as filled - now in LISTEN state");
-
-
-  /* time to start up receiver. 
-     data are captured on iface:CAPTURE_PORT 
-  */
-
-  
-  // put information in udpdb struct
-  udpdb.hdu = hdu_out;
-  udpdb.port = dPort;
-  udpdb.interface = strdup(iface);
-  udpdb.hdu_bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  // determine number of packets per block, must 
-  if (udpdb.hdu_bufsz % UDP_DATA != 0)
-  {
-    syslog(LOG_ERR, "data block size for [%"PRIu64"] was not a multiple of the UDP_DATA size [%d]\n", udpdb.hdu_bufsz, UDP_DATA);
-    return EXIT_FAILURE;
-  }
-  udpdb.packets_per_buffer = udpdb.hdu_bufsz / UDP_DATA;  
-  udpdb.bytes_to_acquire = 0;
-  udpdb.num_inputs = NSNAPS;
-
-  // prepare the socket
-  syslog(LOG_INFO, "main: dsaX_udpdb_prepare()");
-  if (dsaX_udpdb_prepare (&udpdb) < 0)
-  {
-    syslog(LOG_ERR, "could allocate required resources (prepare)");
-    return EXIT_FAILURE;
-  }
-  
-  // reset the receiver
-  syslog(LOG_INFO, "main: dsaX_udpdb_reset_receiver()");
-  dsaX_udpdb_reset_receiver (&udpdb);
-
-  // open a block of the data block, ready for writing
-  if (dsaX_udpdb_open_buffer (&udpdb) < 0)
-  {
-    syslog (LOG_ERR, "start: dsaX_udpdb_open_buffer failed");
-    return -1;
-  }
-  
-  /* START WHAT WAS in RECV THREAD */
-
-  // DEFINITIONS
-
-  // lookup table for ant order
-  uint64_t ant_lookup[100], vv;
-  for (int i=0;i<100;i++) ant_lookup[i] = 0;
-  for (int i=0;i<NSNAPS/2;i++) {
-    for (int j=0;j<2;j++) {
-      vv = (i*2+j)*3;
-      ant_lookup[vv] = (uint64_t)(i);
-    }
-  }
-  
-  int unhappies_ct = 0;
-  int unhappy = 0;
-  uint64_t act_seq_no = 0;
-  uint64_t block_seq_no = 0;
-  uint64_t seq_no = 0;
-  uint64_t ch_id = 0;
-  uint64_t ant_id = 0, aid;
-  unsigned char * b = (unsigned char *) udpdb.sock->buf;
-  size_t got = 0; // data received from a recv_from call
-  int errsv; // determine the sequence number boundaries for curr and next buffers
-  int64_t byte_offset = 0; // offset of current packet in bytes from start of block
-  uint64_t seq_byte = 0; // offset of current packet in bytes from start of obs
-  // for "saving" out of order packets near edges of blocks
-  unsigned int temp_idx = 0;
-  unsigned int temp_max = 1000;
-  char ** temp_buffers; //[temp_max][UDP_DATA];
-  uint64_t * temp_seq_byte;
-  temp_buffers = (char **)malloc(sizeof(char *)*temp_max);
-  for (int i=0;i<temp_max;i++) temp_buffers[i] = (char *)malloc(sizeof(char)*UDP_DATA);
-  temp_seq_byte = (uint64_t *)malloc(sizeof(uint64_t)*temp_max);
-  unsigned i = 0;
-  uint64_t timeouts = 0;
-  uint64_t timeout_max = 1000000000;
-  int canWrite = 0;
-  int ct_snaps=0;
-
-  // infinite loop to receive packets
-  // use stats thread to monitor STATE at this stage, to save resources here
-
-  while (1)
-    {
-
-      udpdb.sock->have_packet = 0; 
-
-      // incredibly tight loop to try and get a packet
-      while (!udpdb.sock->have_packet)
-	{
-	 
-	  // receive 1 packet into the socket buffer
-	  got = recvfrom ( udpdb.sock->fd, udpdb.sock->buf, UDP_PAYLOAD, 0, NULL, NULL );
-
-	  if (got == UDP_PAYLOAD) 
-	    {
-	      udpdb.sock->have_packet = 1;
-	    } 
-	  else if (got == -1) 
-	    {
-	      errsv = errno;
-	      if (errsv == EAGAIN) 
-		{
-		  udpdb.n_sleeps++;
-		  if (udpdb.capture_started)
-		    timeouts++;
-		  if (timeouts > timeout_max)
-		    syslog(LOG_INFO, "timeouts[%"PRIu64"] > timeout_max[%"PRIu64"]\n",timeouts, timeout_max);		  
-		}
-	      else 
-		{
-		  syslog (LOG_ERR, "receive_obs: recvfrom failed %s", strerror(errsv));
-		  return EXIT_FAILURE;
-		}
-	    } 
-	  else // we received a packet of the WRONG size, ignore it
-	    {
-	      syslog (LOG_NOTICE, "receive_obs: received %lu bytes, expected %d", got, UDP_PAYLOAD);
-	    }
-	}
-      timeouts = 0;
-
-      // we have a valid packet within the timeout
-      if (udpdb.sock->have_packet) 
-	{
-
-	  // decode packet header (64 bits)
-	  // 35 bits seq_no (for first spectrum in packet); 13 bits ch_id (for first channel in packet); 16 bits ant ID (for first antenna in packet)
-	  seq_no = 0;
-	  seq_no |=  (((uint64_t)(udpdb.sock->buf[4]) & 224) >> 5) & 7;
-	  //seq_no &= 7;
-	  seq_no |=  (((uint64_t)(udpdb.sock->buf[3])) << 3) & 2040;
-	  //seq_no &= 2047;
-	  seq_no |=  (((uint64_t)(udpdb.sock->buf[2])) << 11) & 522240;
-	  //seq_no &= 524287;
-	  seq_no |=  (((uint64_t)(udpdb.sock->buf[1])) << 19) & 133693440;
-	  //seq_no &= 134217727;
-	  seq_no |=  (((uint64_t)(udpdb.sock->buf[0])) << 27) & 34225520640;
-	  //seq_no &= 34359738367;
-	  /*seq_no = 0;
-	  seq_no |= 224 >> 5;
-	  seq_no |= 255 << 3;
-	  seq_no |= 255 << 11;
-	  seq_no |= 255 << 19;*/
-	  
-	  /*ch_id = 0;
-	  ch_id |= ((unsigned char) (udpdb.sock->buf[4]) & 31) << 8;
-	  ch_id |= (unsigned char) (udpdb.sock->buf[5]);*/
-
-	  ant_id = 0;
-	  ant_id |= (unsigned char) (udpdb.sock->buf[6]) << 8;
-	  ant_id |= (unsigned char) (udpdb.sock->buf[7]);
-	  aid = ant_lookup[(int)(ant_id)];
-
-	  if (UTC_START==0) UTC_START = seq_no + 10000;
-	  
-	  //act_seq_no = seq_no*NCHANG*NSNAPS/2 + ant_id*NCHANG/3 + (ch_id-CHOFF)/384; // actual seq no
-	  act_seq_no = seq_no*NSNAPS/4 + aid; // actual seq no
-	  block_seq_no = UTC_START*NSNAPS/4; // seq no corresponding to ant 0 and start of block
-
-	  // check for starting or stopping condition, using continue
-	  //if (DEBUG) printf("%"PRIu64" %"PRIu64" %d\n",seq_no,act_seq_no,ch_id);//syslog(LOG_DEBUG, "seq_byte=%"PRIu64", num_inputs=%d, seq_no=%"PRIu64", ant_id =%"PRIu64", ch_id =%"PRIu64"",seq_byte,udpdb.num_inputs,seq_no,ant_id, ch_id);
-	  //if (seq_no == UTC_START && UTC_START != 10000 && ant_id == 0) canWrite=1;
-	  if (canWrite==0) {
-	    if (seq_no >= UTC_START-5 && UTC_START != 10000) ct_snaps++;
-	    if (ct_snaps >= 32) canWrite=1;
-	  }
-	  //if (seq_no > UTC_START && UTC_START != 10000) canWrite=1;	  
-	  udpdb.last_seq = seq_no;
-	  //syslog(LOG_INFO,"SEQ_NO_DBG %"PRIu64"",seq_no);
-	  if (canWrite == 0) continue;
-	  //if (seq_no == UTC_STOP) canWrite=0;
-	  //if (udpdb.packets->received<100) syslog(LOG_INFO, "seq_byte=%"PRIu64", num_inputs=%d, seq_no=%"PRIu64", ant_id =%"PRIu64", ch_id =%"PRIu64"",seq_byte,udpdb.num_inputs,seq_no,ant_id, ch_id);
-	  
-	  // if first packet
-	  if (!udpdb.capture_started)
-	    {
-	      //udpdb.block_start_byte = act_seq_no * UDP_DATA;
-	      udpdb.block_start_byte = block_seq_no * UDP_DATA;
-	      udpdb.block_end_byte   = (udpdb.block_start_byte + udpdb.hdu_bufsz) - UDP_DATA;
-	      udpdb.capture_started = 1;
-
-	      syslog (LOG_INFO, "receive_obs: START [%"PRIu64" - %"PRIu64"]", udpdb.block_start_byte, udpdb.block_end_byte);
-	    }
-
-	  // if capture running
-	  if (udpdb.capture_started)
-	    {
-	      seq_byte = (act_seq_no * UDP_DATA);	      
-
-	      udpdb.last_byte = seq_byte;
-	      
-	      // if packet arrived too late, ignore
-	      if (seq_byte < udpdb.block_start_byte)
-		{
-		  //syslog (LOG_INFO, "receive_obs: seq_byte < block_start_byte: %"PRIu64", %"PRIu64"", seq_no, ant_id);
-		  udpdb.packets->dropped++;
-		  udpdb.bytes->dropped += UDP_DATA;
-		}
-	      else
-		{
-		  // packet belongs in this block
-		  if (seq_byte <= udpdb.block_end_byte)
-		    {
-		      byte_offset = seq_byte - udpdb.block_start_byte;
-		      memcpy (udpdb.block + byte_offset, udpdb.sock->buf + UDP_HEADER, UDP_DATA);
-		      udpdb.packets->received++;
-		      udpdb.bytes->received += UDP_DATA;
-		      udpdb.block_count++;
-		    }
-		  // packet belongs in subsequent block
-		  else
-		    {
-		      //syslog (LOG_INFO, "receive_obs: received packet for subsequent buffer: temp_idx=%d, ant_id=%d, seq_no=%"PRIu64"",temp_idx,ant_id,seq_no);
-		      
-		      if (temp_idx < temp_max)
-			{
-			  // save packet to temp buffer
-			  memcpy (temp_buffers[temp_idx], udpdb.sock->buf + UDP_HEADER, UDP_DATA);
-			  temp_seq_byte[temp_idx] = seq_byte;
-			  temp_idx++;
-			}
-		      else
-			{
-			  udpdb.packets->dropped++;
-			  udpdb.bytes->dropped += UDP_DATA;
-			}
-		    }
-		}
-	    }
-
-	  // now check for a full buffer or full temp queue
-	  if ((udpdb.block_count >= udpdb.packets_per_buffer) || (temp_idx >= temp_max))
-	    {
-	      syslog (LOG_INFO, "BLOCK COMPLETE seq_no=%"PRIu64", "
-		      "ant_id=%"PRIu16", block_count=%"PRIu64", "
-		      "temp_idx=%d\n", seq_no, ant_id,  udpdb.block_count, 
-		      temp_idx);
-	      
-	      uint64_t dropped = udpdb.packets_per_buffer - udpdb.block_count;
-	      if (dropped)
-		{
-		  udpdb.packets->dropped += dropped;
-		  udpdb.bytes->dropped += (dropped * UDP_DATA);
-		}
-
-	      if (dropped>1000) unhappies_ct++;
-
-	      // get a new buffer and write any temp packets saved 
-	      if (dsaX_udpdb_new_buffer (&udpdb) < 0)
-		{
-		  syslog(LOG_ERR, "receive_obs: dsaX_udpdb_new_buffer failed");
-		  return EXIT_FAILURE;
-		}
-
-	      if (DEBUG) syslog(LOG_INFO, "block bytes: %"PRIu64" - %"PRIu64"\n", udpdb.block_start_byte, udpdb.block_end_byte);
-  
-	      // include any futuristic packets we saved
-	      for (i=0; i < temp_idx; i++)
-		{
-		  seq_byte = temp_seq_byte[i];
-		  byte_offset = seq_byte - udpdb.block_start_byte;
-		  if (byte_offset < udpdb.hdu_bufsz)
-		    {
-		      memcpy (udpdb.block + byte_offset, temp_buffers[i], UDP_DATA);
-		      udpdb.block_count++;
-		      udpdb.packets->received++;
-		      udpdb.bytes->received += UDP_DATA;
-		    }
-		  else
-		    {
-		      udpdb.packets->dropped++;
-		      udpdb.bytes->dropped += UDP_DATA;
-		    }
-		}
-	      temp_idx = 0;
-	    }
-	}
-
-      // packet has been inserted or saved by this point
-      udpdb.sock->have_packet = 0;
-
-      // deal with unhappy receiver
-      if (unhappies_ct > unhappies) {
-
-	syslog(LOG_INFO, "Skipping some blocks...");
-
-	close(udpdb.sock->fd);
-
-	for (int i=0;i<skips;i++) {
-
-	  udpdb.packets->dropped += udpdb.packets_per_buffer;
-	  udpdb.bytes->dropped += (udpdb.packets_per_buffer * UDP_DATA);
-
-	  if (dsaX_udpdb_new_buffer (&udpdb) < 0)
-	    {
-	      syslog(LOG_ERR, "receive_obs: dsaX_udpdb_new_buffer failed");
-	      return EXIT_FAILURE;
-	    }
-
-	}
-
-	sleep(sleeps);
-
-	// prepare the socket
-	syslog(LOG_INFO, "re-preparing the socket dsaX_udpdb_prepare()");
-	if (dsaX_udpdb_prepare (&udpdb) < 0)
-	  {
-	    syslog(LOG_ERR, "could allocate required resources (prepare)");
-	    return EXIT_FAILURE;
-	  }	
-	
-	unhappies_ct = 0;
-
-      }
-      
-    }
-
-  /* END WHAT WAS IN RECV THREAD */
-  
-
-  // close threads
-  syslog(LOG_INFO, "joining control_thread and stats_thread");
-  quit_threads = 1;
-  void* result=0;
-  pthread_join (control_thread_id, &result);
-  pthread_join (stats_thread_id, &result);
-
-  free(temp_seq_byte);
-  free(temp_buffers);
-  
-  dsaX_dbgpu_cleanup (hdu_out);
-
-}
diff --git a/src/dsaX_capture_manythread.c b/src/dsaX_capture_manythread.c
deleted file mode 100644
index b9f14bd..0000000
--- a/src/dsaX_capture_manythread.c
+++ /dev/null
@@ -1,1115 +0,0 @@
-/* dsaX_capture.c: Code to capture packets over a socket and write to a dada buffer.
-
-main: runs capture loop, and interfaces dada buffer
-control_thread: deals with control commands
-
-*/
-
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture_manythread.h"
-#include "dsaX_def.h"
-
-/* global variables */
-int dPort, cPort;
-int quit_threads = 0;
-char STATE[20];
-uint64_t UTC_START = 10000;
-uint64_t UTC_STOP = 40000000000;
-int MONITOR = 0;
-char iP[100];
-int DEBUG = 0;
-int HISTOGRAM[16];
-int writeBlock = 0;
-const int nth = 4;
-const int nwth = 2;
-int cores[16] = {10,12,11,13,30,31,32,33};
-int write_cores[8] = {14,15,34,35};
-pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
-volatile int blockStatus[64];
-volatile int skipBlock = 0;
-volatile int skipping = 0;
-volatile int lWriteBlock = 0;
-volatile int write_ct = 0;
-volatile uint64_t last_seq = 0;
-volatile int skipct = 0;
-volatile uint64_t block_count = 0;
-volatile uint64_t block_start_byte=0, block_end_byte=0;
-volatile  unsigned capture_started = 0;
-volatile char * wblock;
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * out);
-int dada_bind_thread_to_core (int core);
-void usage();
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * out)
-{
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_out");
-    }
-  dada_hdu_destroy (out);
-
-  
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_capture [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -j IP to listen on for data packets [no default]\n"
-	   " -i IP to listen on for control commands [no default]\n"
-	   " -p PORT for data\n"
-	   " -q PORT for control\n"
-	   " -f filename of template dada header [no default]\n"
-	   " -o out_key [default CAPTURE_BLOCK_KEY]\n"
-	   " -d send debug messages to syslog\n"
-	   " -g chgroup [default 0]\n"
-	   " -h print usage\n");
-}
-
-// open a socket
-dsaX_sock_t * dsaX_make_sock (udpdb_t * ctx);
-dsaX_sock_t * dsaX_make_sock (udpdb_t * ctx)
-{
-
-  // prepare structure
-  syslog(LOG_INFO, "dsaX_make_sock(): preparing sock structure");
-  dsaX_sock_t * b = (dsaX_sock_t *) malloc(sizeof(dsaX_sock_t));
-  assert(b != NULL);
-  b->bufsz = sizeof(char) * UDP_PAYLOAD;
-  b->buf = (char *) malloc (b->bufsz);
-  assert(b->buf != NULL);
-  b->have_packet = 0;
-  b->fd = 0;
-
-  // connect to socket
-  syslog(LOG_INFO, "dsaX_make_sock(): connecting to socket %s:%d", ctx->interface, dPort);
-
-  // open socket
-  syslog(LOG_INFO, "prepare: creating udp socket on %s:%d", ctx->interface, dPort);
-  b->fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP);
-  assert(b->fd>=0);
-
-  // for multiple connections
-  int one = 1;
-  setsockopt(b->fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &one, sizeof(one));
-  
-  struct sockaddr_in udp_sock;
-  bzero(&(udp_sock.sin_zero), 8);                     // clear the struct
-  udp_sock.sin_family = AF_INET;                      // internet/IP
-  udp_sock.sin_port = htons(dPort);                    // set the port number
-  udp_sock.sin_addr.s_addr = inet_addr(ctx->interface);  // from a specific IP address 
-
-  if (bind(b->fd, (struct sockaddr *)&udp_sock, sizeof(udp_sock)) == -1) {
-    syslog(LOG_ERR, "prepare: failed to bind to socket");
-    return -1;
-  }
-  
-  // set the socket size to 64 MB
-  int sock_buf_size = 64*1024*1024;
-  syslog(LOG_INFO, "prepare: setting buffer size to %d", sock_buf_size);
-  dada_udp_sock_set_buffer_size (ctx->log, b->fd, ctx->verbose, sock_buf_size);
-
-  // set the socket to non-blocking
-  syslog(LOG_INFO, "prepare: setting non_block");
-  sock_nonblock(b->fd);
-
-  // clear any packets buffered by the kernel
-  syslog(LOG_INFO, "prepare: clearing packets at socket");
-  size_t cleared = dada_sock_clear_buffered_packets(b->fd, UDP_PAYLOAD);
-
-  // clear blockStatus
-  for (int i=0;i<64;i++) blockStatus[i] = 0;
-
-  return b;
-}
-
-
-
-// close a socket
-void dsaX_free_sock(dsaX_sock_t* b);
-void dsaX_free_sock(dsaX_sock_t* b)
-{
-  b->fd = 0;
-  b->bufsz = 0;
-  b->have_packet =0;
-  if (b->buf)
-    free (b->buf);
-  b->buf = 0;
-}
-
-/* 
- *  open a data block buffer ready for direct access
- */
-int dsaX_udpdb_open_buffer (dsaX_write_t * ctx);
-int dsaX_udpdb_open_buffer (dsaX_write_t * ctx)
-{
-
-  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_open_buffer()");
-
-  if (ctx->block_open)
-  {
-    syslog (LOG_ERR, "open_buffer: buffer already opened");
-    return -1;
-  }
-
-  if (DEBUG) syslog (LOG_DEBUG, "open_buffer: ipcio_open_block_write");
-
-  uint64_t block_id = 0;
-
-  wblock = ipcio_open_block_write (ctx->hdu->data_block, &block_id);
-  if (!wblock)
-  { 
-    syslog (LOG_ERR, "open_buffer: ipcio_open_block_write failed");
-    return -1;
-  }
-
-  ctx->block_open = 1;
-
-  return 0;
-}
-
-/*
- *  close a data buffer, assuming a full block has been written
- */
-int dsaX_udpdb_close_buffer (dsaX_write_t * ctx, uint64_t bytes_written, unsigned eod);
-int dsaX_udpdb_close_buffer (dsaX_write_t * ctx, uint64_t bytes_written, unsigned eod)
-{
-
-  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_close_buffer(%"PRIu64", %d)", bytes_written, eod);
-
-  if (!ctx->block_open)
-  { 
-    syslog (LOG_ERR, "close_buffer: buffer already closed");
-    return -1;
-  }
-
-  // log any buffers that are not full, except for the 1 byte "EOD" buffer
-  if ((bytes_written != 1) && (bytes_written != ctx->hdu_bufsz))
-    syslog ((eod ? LOG_INFO : LOG_WARNING), "close_buffer: "
-              "bytes_written[%"PRIu64"] != hdu_bufsz[%"PRIu64"]", 
-              bytes_written, ctx->hdu_bufsz);
-
-  if (eod)
-  {
-    if (ipcio_update_block_write (ctx->hdu->data_block, bytes_written) < 0)
-    {
-      syslog (LOG_ERR, "close_buffer: ipcio_update_block_write failed");
-      return -1;
-    }
-  }
-  else 
-  {
-    if (ipcio_close_block_write (ctx->hdu->data_block, bytes_written) < 0)
-    {
-      syslog (LOG_ERR, "close_buffer: ipcio_close_block_write failed");
-      return -1;
-    }
-  }
-
-  wblock = 0;
-  ctx->block_open = 0;
-
-  return 0;
-}
-
-/* 
- *  move to the next ring buffer element. return pointer to base address of new buffer
- */
-int dsaX_udpdb_new_buffer (dsaX_write_t * ctx);
-int dsaX_udpdb_new_buffer (dsaX_write_t * ctx)
-{
-
-  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_new_buffer()");
-
-  if (dsaX_udpdb_close_buffer (ctx, ctx->hdu_bufsz, 0) < 0)
-  {
-    syslog (LOG_ERR, "new_buffer: dsaX_udpdb_close_buffer failed");
-    return -1;
-  }
-
-  if (dsaX_udpdb_open_buffer (ctx) < 0) 
-  {
-    syslog (LOG_ERR, "new_buffer: dsaX_udpdb_open_buffer failed");
-    return -1;
-  }
-
-  return 0;
-
-}
-
-// increment counters when block is full
-void dsaX_udpdb_increment (udpdb_t * ctx);
-void dsaX_udpdb_increment (udpdb_t * ctx)
-{
-
-  // increment buffer byte markers
-  writeBlock++;
-  block_start_byte = block_end_byte + UDP_DATA;
-  block_end_byte = block_start_byte + ( ctx->packets_per_buffer - 1) * UDP_DATA;
-  block_count = 0;
-
-}
-
-
-
-/* --------- THREADS -------- */
-
-// STATS THREAD
-
-/* 
- *  Thread to print simple capture statistics
- */
-void stats_thread(void * arg) {
-  
-  dsaX_stats_t * ctx = (dsaX_stats_t *) arg;
-  uint64_t b_rcv_total = 0;
-  uint64_t b_rcv_1sec = 0;
-  uint64_t b_rcv_curr = 0;
-
-  uint64_t b_drp_total = 0;
-  uint64_t b_drp_1sec = 0;
-  uint64_t b_drp_curr = 0;
-
-  uint64_t s_rcv_total = 0;
-  uint64_t s_rcv_1sec = 0;
-  uint64_t s_rcv_curr = 0;
-
-  uint64_t ooo_pkts = 0;
-  float gb_rcv_ps = 0;
-  float mb_rcv_ps = 0;
-  float mb_drp_ps = 0;
-
-  syslog(LOG_INFO,"starting stats thread...");
-  sleep(2);
-  syslog(LOG_INFO,"started stats thread...");
-  
-  while (!quit_threads)
-  {
-
-    /* get a snapshot of the data as quickly as possible */
-    b_rcv_curr = ctx->bytes->received;
-    b_drp_curr = ctx->bytes->dropped;
-    
-    /* calc the values for the last second */
-    b_rcv_1sec = b_rcv_curr - b_rcv_total;
-    b_drp_1sec = b_drp_curr - b_drp_total;
-
-    /* update the totals */
-    b_rcv_total = b_rcv_curr;
-    b_drp_total = b_drp_curr;
-
-    mb_rcv_ps = (double) b_rcv_1sec / 1000000;
-    mb_drp_ps = (double) b_drp_1sec / 1000000;
-    gb_rcv_ps = b_rcv_1sec * 8;
-    gb_rcv_ps /= 1000000000;    
-
-    /* determine how much memory is free in the receivers */
-    syslog (LOG_NOTICE,"CAPSTATS %6.3f [Gb/s], D %4.1f [MB/s], D %"PRIu64" pkts, %"PRIu64" skipped %d", gb_rcv_ps, mb_drp_ps, ctx->packets->dropped, last_seq, skipct);
-
-    sleep(1);
-  }
-
-}
-
-// CONTROL THREAD
-
-void control_thread (void * arg) {
-
-  syslog(LOG_INFO, "control_thread: starting");
-
-  // port on which to listen for control commands
-  int port = cPort;
-  char sport[10];
-  sprintf(sport,"%d",port);
-
-  // buffer for incoming command strings, and setup of socket
-  int bufsize = 1024;
-  char* buffer = (char *) malloc (sizeof(char) * bufsize);
-  memset(buffer, '\0', bufsize);
-  const char* whitespace = " ";
-  char * command = 0;
-  char * args = 0;
-
-  struct addrinfo hints;
-  struct addrinfo* res=0;
-  memset(&hints,0,sizeof(hints));
-  struct sockaddr_storage src_addr;
-  socklen_t src_addr_len=sizeof(src_addr);
-  hints.ai_family=AF_INET;
-  hints.ai_socktype=SOCK_DGRAM;
-  getaddrinfo(iP,sport,&hints,&res);
-  int fd;
-  ssize_t ct;
-  char tmpstr;
-  char cmpstr = 'p';
-  char *endptr;
-  uint64_t tmps;
-  char * token;
-  
-  syslog(LOG_INFO, "control_thread: created socket on port %d", port);
-  
-  while (!quit_threads) {
-    
-    fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol);
-    bind(fd,res->ai_addr,res->ai_addrlen);
-    memset(buffer,'\0',sizeof(buffer));
-    syslog(LOG_INFO, "control_thread: waiting for packet");
-    ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len);
-    
-    syslog(LOG_INFO, "control_thread: received buffer string %s",buffer);
-
-    // INTERPRET BUFFER STRING
-    // receive either UTC_START, UTC_STOP, MONITOR
-
-    // interpret buffer string
-    char * rest = buffer;
-    char *cmd, *val;
-    cmd = strtok_r(rest, "-", &rest);
-    val = strtok_r(rest, "-", &rest);
-    syslog(LOG_INFO, "control_thread: split into COMMAND %s, VALUE %s",cmd,val);
-
-    if (strcmp(cmd,"UTC_START")==0)
-      UTC_START = strtoull(val,&endptr,0);
-
-    if (strcmp(cmd,"UTC_STOP")==0)
-      UTC_STOP = strtoull(val,&endptr,0);    
-    
-    close(fd);
-    
-  }
-
-  free (buffer);
-
-  syslog(LOG_INFO, "control_thread: exiting");
-
-  /* return 0 */
-  int thread_result = 0;
-  pthread_exit((void *) &thread_result);
-
-}
-
-
-/* 
- *  Thread to capture data
- */
-int recv_thread(void * arg) {
-
-  udpdb_t * udpdb = (udpdb_t *) arg;
-  int thread_id = udpdb->thread_id;
-    
-  // set affinity
-  const pthread_t pid = pthread_self();
-  int core_id;
-  if (dPort==4011)
-    core_id = cores[thread_id];
-  else
-    core_id = cores[thread_id+nth];
-  cpu_set_t cpuset;
-  CPU_ZERO(&cpuset);
-  CPU_SET(core_id, &cpuset);
-  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (set_result != 0)
-    syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id);
-  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (get_affinity != 0) 
-    syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id);
-  if (CPU_ISSET(core_id, &cpuset))
-    syslog(LOG_INFO,"thread %d: successfully set thread",core_id);
-
-  // set up socket
-  dsaX_sock_t * sock = dsaX_make_sock(udpdb);
-
-    // lookup table for ant order
-  uint64_t ant_lookup[100], vv;
-  for (int i=0;i<100;i++) ant_lookup[i] = 0;
-  for (int i=0;i<NSNAPS/2;i++) {
-    for (int j=0;j<2;j++) {
-      vv = (i*2+j)*3;
-      ant_lookup[vv] = (uint64_t)(i);
-    }
-  }
-
-  
-  // DEFINITIONS
-  uint64_t tpack = 0;
-  uint64_t act_seq_no = 0;
-  uint64_t block_seq_no = 0;
-  uint64_t seq_no = 0;
-  uint64_t ant_id = 0, aid;
-  unsigned char * b = (unsigned char *) sock->buf;
-  size_t got = 0; // data received from a recv_from call
-  int errsv; // determine the sequence number boundaries for curr and next buffers
-  int64_t byte_offset = 0; // offset of current packet in bytes from start of block
-  uint64_t seq_byte = 0; // offset of current packet in bytes from start of obs
-  // for "saving" out of order packets near edges of blocks
-  unsigned int temp_idx = 0;
-  unsigned int temp_max = 500;
-  char ** temp_buffers;
-  uint64_t * temp_seq_byte;
-  temp_buffers = (char **)malloc(sizeof(char *)*temp_max);
-  for (int i=0;i<temp_max;i++) temp_buffers[i] = (char *)malloc(sizeof(char)*UDP_DATA);
-  temp_seq_byte = (uint64_t *)malloc(sizeof(uint64_t)*temp_max);
-  unsigned i = 0;
-  uint64_t timeouts = 0;
-  uint64_t timeout_max = 1000000000;
-  int canWrite = 0;
-  int ct_snaps=0;
-  int mod_WB;
-  int ctAnts = 0;
-
-  // infinite loop to receive packets
-
-  while (!quit_threads)
-    {
-
-      sock->have_packet = 0; 
-
-      // incredibly tight loop to try and get a packet
-      while (!sock->have_packet)
-	{
-	 
-	  // receive 1 packet into the socket buffer
-	  got = recvfrom ( sock->fd, sock->buf, UDP_PAYLOAD, 0, NULL, NULL );
-
-	  if (got == UDP_PAYLOAD) 
-	    {
-	      sock->have_packet = 1;
-	    } 
-	  else if (got == -1) 
-	    {
-	      errsv = errno;
-	      if (errsv == EAGAIN) 
-		{
-		  if (capture_started)
-		    timeouts++;
-		  //if (timeouts > timeout_max)
-		  //syslog(LOG_INFO, "timeouts[%"PRIu64"] > timeout_max[%"PRIu64"]\n",timeouts, timeout_max);		  
-		}
-	      else 
-		{
-		  //syslog (LOG_ERR, "receive_obs: recvfrom failed %s", strerror(errsv));
-		  return EXIT_FAILURE;
-		}
-	    } 
-	  else // we received a packet of the WRONG size, ignore it
-	    {
-	      syslog (LOG_NOTICE, "receive_obs: received %lu bytes, expected %d", got, UDP_PAYLOAD);
-	    }
-	}
-      timeouts = 0;
-
-      // we have a valid packet within the timeout
-      if (sock->have_packet) 
-	{
-
-	  // decode packet header (64 bits)
-	  // 35 bits seq_no (for first spectrum in packet); 13 bits ch_id (for first channel in packet); 16 bits ant ID (for first antenna in packet)
-	  seq_no = 0;
-	  seq_no |=  (((uint64_t)(sock->buf[4]) & 224) >> 5) & 7;
-	  seq_no |=  (((uint64_t)(sock->buf[3])) << 3) & 2040;
-	  seq_no |=  (((uint64_t)(sock->buf[2])) << 11) & 522240;
-	  seq_no |=  (((uint64_t)(sock->buf[1])) << 19) & 133693440;
-	  seq_no |=  (((uint64_t)(sock->buf[0])) << 27) & 34225520640;
-	  ant_id = 0;
-	  ant_id |= (unsigned char) (sock->buf[6]) << 8;
-	  ant_id |= (unsigned char) (sock->buf[7]);
-	  aid = ant_lookup[(int)(ant_id)];
-	  //aid = ant_id/3;
-	  
-	  if (UTC_START==0) UTC_START = seq_no+30000;
-	  
-	  act_seq_no = seq_no*NSNAPS/4 + aid; // actual seq no
-	  block_seq_no = UTC_START*NSNAPS/4; // seq no corresponding to ant 0 and start of block
-
-	  // set shared last_seq
-	  pthread_mutex_lock(&mutex);
-	  last_seq = seq_no;
-	  //syslog(LOG_INFO,"last_seq %"PRIu64"",last_seq);
-	  pthread_mutex_unlock(&mutex);
-	  
-	  // check for starting or stopping condition, using continue
-	  if (canWrite==0) {
-	    if (seq_no >= UTC_START-50 && UTC_START != 10000) {
-	      canWrite=1;	      
-	    }
-	  }
-	  if (canWrite == 0) continue;
-
-	  // threadsafe start of capture
-	  pthread_mutex_lock(&mutex);
-	  if (!(capture_started))
-	    {
-	      block_start_byte = block_seq_no * UDP_DATA;
-	      block_end_byte   = (block_start_byte + udpdb->hdu_bufsz) - UDP_DATA;
-	      capture_started = 1;
-
-	      syslog (LOG_INFO, "receive_obs: START [%"PRIu64" - %"PRIu64"]", block_start_byte, block_end_byte);
-	    }
-	  pthread_mutex_unlock(&mutex);
-
-	  // if capture running
-	  if (capture_started)
-	    {
-	      seq_byte = (act_seq_no * UDP_DATA);
-	      tpack++;
-	      
-	      // packet belongs in this block
-	      if ((seq_byte <= block_end_byte) && (seq_byte >= block_start_byte))
-		{
-		  byte_offset = seq_byte - (block_start_byte);
-		  mod_WB = writeBlock % 64;
-		  memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, sock->buf + UDP_HEADER, UDP_DATA);		  
-		  pthread_mutex_lock(&mutex);		  
-		  block_count++;
-		  //syslog(LOG_INFO,"block count %"PRIu64"",block_count);
-		  pthread_mutex_unlock(&mutex);
-		  
-		}
-	      // packet belongs in subsequent block
-	      else if (seq_byte > block_end_byte)
-		{
-		      
-		  if (temp_idx < temp_max)
-		    {
-		      // save packet to temp buffer
-		      memcpy (temp_buffers[temp_idx], sock->buf + UDP_HEADER, UDP_DATA);
-		      temp_seq_byte[temp_idx] = seq_byte;
-		      temp_idx++;
-		    }
-		}
-	      // packet is too late
-	      /*else
-		{
-		  if (ctAnts<100) {
-		    syslog (LOG_INFO, "receive_obs: TOO LATE %"PRIu64"  %"PRIu64"", seq_no, ant_id);
-		    ctAnts++;
-		  }
-		  }*/
-	    }
-	  
-	  // threadsafe end of block
-	  pthread_mutex_lock(&mutex);
-	  if ((block_count >= udpdb->packets_per_buffer) || (temp_idx >= temp_max))
-	    {
-	      syslog (LOG_INFO, "BLOCK COMPLETE thread_id=%d, seq_no=%"PRIu64", "
-		      "ant_id=%"PRIu16", block_count=%"PRIu64", "
-		      "temp_idx=%d, writeBlock=%d", thread_id, seq_no, ant_id,  block_count, 
-		      temp_idx,writeBlock);
-
-	      // write block
-	      // check whether doWrite has been released. If not, skip this block
-	      if (blockStatus[writeBlock % 64] > 0)
-		blockStatus[writeBlock % 64] += 1;
-	      else
-		blockStatus[writeBlock % 64] = 1;
-	      
-	      uint64_t dropped = udpdb->packets_per_buffer - (block_count);
-	      udpdb->packets->received += (block_count);
-	      udpdb->bytes->received += (block_count) * UDP_DATA;	      
-	      if (dropped)
-		{
-		  udpdb->packets->dropped += dropped;
-		  udpdb->bytes->dropped += (dropped * UDP_DATA);
-		}
-
-	      // increment counters
-	      dsaX_udpdb_increment(udpdb);
-	      ctAnts = 0;
-
-	      // write temp queue for this thread
-	      //syslog(LOG_INFO,"thread %d: packets in this block %"PRIu64", temp_idx %d",thread_id,tpack,temp_idx);
-	      tpack = 0;
-	
-	      for (i=0; i < temp_idx; i++)
-		{
-		  seq_byte = temp_seq_byte[i];
-		  byte_offset = seq_byte - (block_start_byte);
-		  if (byte_offset < udpdb->hdu_bufsz && byte_offset >= 0)
-		    {
-		      mod_WB = writeBlock % 64;
-		      memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, temp_buffers[i], UDP_DATA);
-		      //pthread_mutex_lock(&mutex);
-		      block_count++;		      
-		      //pthread_mutex_unlock(&mutex);
-		    }
-		}
-	      temp_idx = 0;
-       
-	    }
-	  pthread_mutex_unlock(&mutex);
-
-	  // at this stage, can try and write temp queue safely for other threads
-	  if (temp_seq_byte[0] >= block_start_byte && temp_seq_byte[0] <= block_end_byte && temp_idx > 0)
-	    {
-	      //syslog(LOG_INFO,"thread %d: packets in this block %"PRIu64", temp_idx %d",thread_id,tpack,temp_idx);
-	      tpack = 0;
-	
-	      for (i=0; i < temp_idx; i++)
-		{
-		  seq_byte = temp_seq_byte[i];
-		  byte_offset = seq_byte - (block_start_byte);
-		  if (byte_offset < udpdb->hdu_bufsz && byte_offset >= 0)
-		    {
-		      mod_WB = writeBlock % 64;
-		      memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, temp_buffers[i], UDP_DATA);
-		      pthread_mutex_lock(&mutex);
-		      block_count++;		      
-		      pthread_mutex_unlock(&mutex);
-		    }
-		}
-	      temp_idx = 0;
-
-	    }
-
-	}
-
-      // packet has been inserted or saved by this point
-      sock->have_packet = 0;
-	
-    }
-
-  dsaX_free_sock(sock);
-  free(temp_buffers);
-  free(temp_seq_byte);
-  
-}
-
-/* 
- *  Thread to write data
- */
-void write_thread(void * arg) {
-
-  dsaX_write_t * udpdb = (dsaX_write_t *) arg;
-  int thread_id = udpdb->thread_id;
-
-  // set affinity
-  const pthread_t pid = pthread_self();
-  int core_id;
-  if (dPort==4011)
-    core_id = write_cores[thread_id];
-  else
-    core_id = write_cores[thread_id+nwth];
-  cpu_set_t cpuset;
-  CPU_ZERO(&cpuset);
-  CPU_SET(core_id, &cpuset);
-  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (set_result != 0)
-    syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id);
-  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (get_affinity != 0) 
-    syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id);
-  if (CPU_ISSET(core_id, &cpuset))
-    syslog(LOG_INFO,"thread %d: successfully set thread",core_id);
-    
-  int mod_WB = 0;
-  int a;
-  
-  while (!quit_threads)
-  {
-
-    mod_WB = lWriteBlock % 64;
-    
-    while (blockStatus[mod_WB]==0) {
-      a=1;
-    }    
-
-    // assume everything is set up
-    // wblock is assigned, write_ct=0
-        
-    memcpy(wblock + thread_id*udpdb->hdu_bufsz/nwth, udpdb->tblock + mod_WB*udpdb->hdu_bufsz  + thread_id*udpdb->hdu_bufsz/nwth, udpdb->hdu_bufsz/nwth);
-
-    pthread_mutex_lock(&mutex);
-    write_ct++;
-    pthread_mutex_unlock(&mutex);
-
-    //syslog(LOG_INFO,"write thread %d: successfully memcpied",thread_id);
-
-    // now wait until thread 0 has finished getting a new block before moving on
-    if (thread_id>0) {
-      while (write_ct!=0) a=1;
-    }
-    else {
-
-      // wait for all sub-blocks to be written
-      while (write_ct<nwth) a=1;
-
-      // get new block
-      if (dsaX_udpdb_new_buffer (udpdb) < 0)
-	{
-	  syslog(LOG_ERR, "receive_obs: dsaX_udpdb_new_buffer failed");
-	  return EXIT_FAILURE;
-	}
-
-      syslog(LOG_INFO,"write thread %d: written block... %d",thread_id,lWriteBlock);
-      lWriteBlock++;
-      
-      // update doWrite and skipBlock
-      skipct = 0;
-      for (int i=0;i<64;i++) skipct += blockStatus[i];
-      blockStatus[mod_WB] -= 1;
-      write_ct = 0;
-
-    }
-     
-  }
-
-}
-
-
-	    
-// MAIN of program
-	
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_capture_manythread", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA Header plus Data Unit for writing */
-  dada_hdu_t* hdu_out = 0;
-  
-  // input data block HDU key
-  key_t out_key = CAPTURE_BLOCK_KEY;
-
-  // command line arguments
-  int core = -1;
-  int chgroup = 0;
-  int arg=0;
-  char dada_fnam[200]; // filename for dada header
-  char iface[100]; // IP for data packets
-  
-  while ((arg=getopt(argc,argv,"c:j:i:f:o:g:p:q:dh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {	      
-	      strcpy(iP,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'g':
-	  if (optarg)
-	    {	      
-	      chgroup = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-g flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'j':
-	  if (optarg)
-	    {	      
-	      strcpy(iface,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-j flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }      	
-	case 'p':
-	  if (optarg)
-	    {
-	      dPort = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-p flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }      	
-	case 'q':
-	  if (optarg)
-	    {
-	      cPort = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-q flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }      	
-	case 'f':
-	  if (optarg)
-	    {	      
-	      strcpy(dada_fnam,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-f flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	 
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // START THREADS
-  
-  // start control thread
-  int rval = 0;
-  pthread_t control_thread_id;
-  udpdb_t temp_str;
-  rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &temp_str);
-  if (rval != 0) {
-    syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval));
-    return -1;
-  }
-  syslog(LOG_NOTICE, "Created control thread, listening on %s:%d",iP,cPort);
-  
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-  
-  
-  // OPEN CONNECTION TO DADA DB FOR WRITING
-
-  if (DEBUG) syslog(LOG_DEBUG,"Creating HDU");
-  
-  hdu_out  = dada_hdu_create (0);
-  if (DEBUG) syslog(DEBUG,"Created hdu");
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog(LOG_ERR,"could not connect to output dada buffer");
-    return EXIT_FAILURE;
-  }
-  if (DEBUG) syslog(LOG_DEBUG,"Connected HDU");
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    dsaX_dbgpu_cleanup (hdu_out);
-    syslog(LOG_ERR,"could not lock to output dada buffer");
-    return EXIT_FAILURE;
-  }
-
-  syslog(LOG_INFO,"opened connection to output DB");
-
-  // DEAL WITH DADA HEADER
-  char *hout;
-  hout = (char *)malloc(sizeof(char)*4096);
-  if (DEBUG) syslog(DEBUG,"read header2");
-
-  if (fileread (dada_fnam, hout, 4096) < 0)
-    {
-      free (hout);
-      syslog (LOG_ERR, "could not read ASCII header from %s", dada_fnam);
-      return (EXIT_FAILURE);
-    }
-
-  
-  if (DEBUG) syslog(DEBUG,"read header3");
-  
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_out);
-      return EXIT_FAILURE;
-    }
-  
-  // copy the in header to the out header
-  memcpy (header_out, hout, 4096);
-
-  // mark the output header buffer as filled
-  if (ipcbuf_mark_filled (hdu_out->header_block, 4096) < 0)
-    {
-      syslog(LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  // record STATE info
-  sprintf(STATE,"LISTEN");
-  syslog(LOG_INFO,"marked output header block as filled - now in LISTEN state");
-
-
-  /* time to start up receiver. 
-     data are captured on iface:CAPTURE_PORT 
-  */
-
-  // make recv, write, and stats structs  
-  udpdb_t udpdb[nth];
-  dsaX_stats_t stats;
-  dsaX_write_t writey[nwth];
-
-  // shared variables and memory
-  uint64_t bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  char * tblock = (char *)malloc(sizeof(char)*bufsz*64);
-  stats_t * packets = init_stats_t();
-  stats_t * bytes = init_stats_t();
-  reset_stats_t(packets);
-  reset_stats_t(bytes);
-
-  // initialise stats struct
-  stats.packets = packets;
-  stats.bytes = bytes;
-
-  // initialise writey struct and open buffer
-  for (int i=0;i<nwth;i++) {
-    writey[i].hdu = hdu_out;
-    writey[i].hdu_bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-    writey[i].block_open = 0;
-    writey[i].tblock = tblock;
-    writey[i].thread_id = i;    
-  }
-  dsaX_udpdb_open_buffer (&writey[0]);
-
-  // initialise all udpdb structs
-  for (int i=0;i<nth;i++) {
-
-    // shared stuff
-    udpdb[i].packets = packets;
-    udpdb[i].bytes = bytes;
-    udpdb[i].tblock = tblock;
-
-    // the rest
-    udpdb[i].port = dPort;
-    udpdb[i].interface = strdup(iface);
-    udpdb[i].hdu_bufsz = bufsz;
-    udpdb[i].packets_per_buffer = udpdb[i].hdu_bufsz / UDP_DATA;
-    udpdb[i].num_inputs = NSNAPS;
-    udpdb[i].verbose = 0;
-    udpdb[i].rcv_sleeps = 0;
-    
-    udpdb[i].thread_id = i;    
-    
-  }
-
-
-  /* start threads */
-    
-  // start the stats thread
-  pthread_t stats_thread_id;
-  rval = pthread_create (&stats_thread_id, 0, (void *) stats_thread, (void *) &stats);
-  if (rval != 0) {
-    syslog(LOG_INFO, "Error creating stats_thread: %s", strerror(rval));
-    return -1;
-  }
-  syslog(LOG_NOTICE, "started stats_thread()");
-
-  // start the receive threads
-  pthread_t recv_thread_id[nth];  
-  rval = 0;
-  for (int i=0;i<nth;i++) {
-    rval = pthread_create (&recv_thread_id[i], 0, (void *) recv_thread, (void *) (&udpdb[i]));
-    if (rval != 0) {
-      syslog(LOG_ERR, "Error creating recv_thread %d: %s", i,strerror(rval));
-      return -1;
-    }
-  }
-  syslog(LOG_NOTICE, "Created recv threads");
-
-  // start the write thread
-  pthread_t write_thread_id[nwth];
-  rval = 0;
-  for (int i=0;i<nwth;i++) {
-    rval = pthread_create (&write_thread_id[i], 0, (void *) write_thread, (void *) (&writey[i]));
-    if (rval != 0) {
-      syslog(LOG_INFO, "Error creating write_thread: %s", strerror(rval));
-      return -1;
-    }
-  }
-  syslog(LOG_NOTICE, "started write threads");  
-
-  while (!quit_threads) {
-    sleep(1);
-  }
-  
-  // close threads
-  syslog(LOG_INFO, "joining all threads");
-  quit_threads = 1;
-  void* result=0;
-  pthread_join (control_thread_id, &result);
-  pthread_join (stats_thread_id, &result);
-  for (int i=0;i<nth;i++) pthread_join(recv_thread_id[i], &result);
-  for (int i=0;i<nwth;i++) pthread_join(write_thread_id[i], &result);
-  
-  free(tblock);
-  dsaX_dbgpu_cleanup (hdu_out);
-
-}
diff --git a/src/dsaX_capture_manythread.c.bak b/src/dsaX_capture_manythread.c.bak
deleted file mode 100644
index e3fd2b6..0000000
--- a/src/dsaX_capture_manythread.c.bak
+++ /dev/null
@@ -1,1053 +0,0 @@
-/* dsaX_capture.c: Code to capture packets over a socket and write to a dada buffer.
-
-main: runs capture loop, and interfaces dada buffer
-control_thread: deals with control commands
-
-*/
-
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture_manythread.h"
-#include "dsaX_def.h"
-
-/* global variables */
-int quit_threads = 0;
-char STATE[20];
-uint64_t UTC_START = 10000;
-uint64_t UTC_STOP = 40000000000;
-int MONITOR = 0;
-char iP[100];
-int DEBUG = 0;
-int HISTOGRAM[16];
-int writeBlock = 0;
-const int nth = 8;
-const int nwth = 4;
-int cores[8] = {30,31,32,33,34,35,36,37};
-int write_cores[4] = {17,18,19,39};
-pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
-volatile int doWrite = 0;
-volatile int skipBlock = 0;
-volatile int skipping = 0;
-volatile int lWriteBlock = 0;
-volatile int write_ct = 0;
-volatile uint64_t last_seq = 0;
-volatile int skipct = 0;
-volatile uint64_t block_count = 0;
-volatile uint64_t block_start_byte=0, block_end_byte=0;
-volatile  unsigned capture_started = 0;
-volatile char * wblock;
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * out);
-int dada_bind_thread_to_core (int core);
-void usage();
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * out)
-{
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_out");
-    }
-  dada_hdu_destroy (out);
-
-  
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_capture [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -j IP to listen on for data packets [no default]\n"
-	   " -i IP to listen on for control commands [no default]\n"	
-	   " -f filename of template dada header [no default]\n"
-	   " -o out_key [default CAPTURE_BLOCK_KEY]\n"
-	   " -d send debug messages to syslog\n"
-	   " -g chgroup [default 0]\n"
-	   " -h print usage\n");
-}
-
-// open a socket
-dsaX_sock_t * dsaX_make_sock (udpdb_t * ctx);
-dsaX_sock_t * dsaX_make_sock (udpdb_t * ctx)
-{
-
-  // prepare structure
-  syslog(LOG_INFO, "dsaX_make_sock(): preparing sock structure");
-  dsaX_sock_t * b = (dsaX_sock_t *) malloc(sizeof(dsaX_sock_t));
-  assert(b != NULL);
-  b->bufsz = sizeof(char) * UDP_PAYLOAD;
-  b->buf = (char *) malloc (b->bufsz);
-  assert(b->buf != NULL);
-  b->have_packet = 0;
-  b->fd = 0;
-
-  // connect to socket
-  syslog(LOG_INFO, "dsaX_make_sock(): connecting to socket %s:%d", ctx->interface, ctx->port);
-
-  // open socket
-  syslog(LOG_INFO, "prepare: creating udp socket on %s:%d", ctx->interface, ctx->port);
-  b->fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP);
-  assert(b->fd>=0);
-
-  // for multiple connections
-  int one = 1;
-  setsockopt(b->fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &one, sizeof(one));
-  
-  struct sockaddr_in udp_sock;
-  bzero(&(udp_sock.sin_zero), 8);                     // clear the struct
-  udp_sock.sin_family = AF_INET;                      // internet/IP
-  udp_sock.sin_port = htons(ctx->port);                    // set the port number
-  udp_sock.sin_addr.s_addr = inet_addr(ctx->interface);  // from a specific IP address 
-
-  if (bind(b->fd, (struct sockaddr *)&udp_sock, sizeof(udp_sock)) == -1) {
-    syslog(LOG_ERR, "prepare: failed to bind to socket");
-    return -1;
-  }
-  
-  // set the socket size to 256 MB
-  int sock_buf_size = 256*1024*1024;
-  syslog(LOG_INFO, "prepare: setting buffer size to %d", sock_buf_size);
-  dada_udp_sock_set_buffer_size (ctx->log, b->fd, ctx->verbose, sock_buf_size);
-
-  // set the socket to non-blocking
-  syslog(LOG_INFO, "prepare: setting non_block");
-  sock_nonblock(b->fd);
-
-  // clear any packets buffered by the kernel
-  syslog(LOG_INFO, "prepare: clearing packets at socket");
-  size_t cleared = dada_sock_clear_buffered_packets(b->fd, UDP_PAYLOAD);
-
-  return b;
-}
-
-
-
-// close a socket
-void dsaX_free_sock(dsaX_sock_t* b);
-void dsaX_free_sock(dsaX_sock_t* b)
-{
-  b->fd = 0;
-  b->bufsz = 0;
-  b->have_packet =0;
-  if (b->buf)
-    free (b->buf);
-  b->buf = 0;
-}
-
-/* 
- *  open a data block buffer ready for direct access
- */
-int dsaX_udpdb_open_buffer (dsaX_write_t * ctx);
-int dsaX_udpdb_open_buffer (dsaX_write_t * ctx)
-{
-
-  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_open_buffer()");
-
-  if (ctx->block_open)
-  {
-    syslog (LOG_ERR, "open_buffer: buffer already opened");
-    return -1;
-  }
-
-  if (DEBUG) syslog (LOG_DEBUG, "open_buffer: ipcio_open_block_write");
-
-  uint64_t block_id = 0;
-
-  wblock = ipcio_open_block_write (ctx->hdu->data_block, &block_id);
-  if (!wblock)
-  { 
-    syslog (LOG_ERR, "open_buffer: ipcio_open_block_write failed");
-    return -1;
-  }
-
-  ctx->block_open = 1;
-
-  return 0;
-}
-
-/*
- *  close a data buffer, assuming a full block has been written
- */
-int dsaX_udpdb_close_buffer (dsaX_write_t * ctx, uint64_t bytes_written, unsigned eod);
-int dsaX_udpdb_close_buffer (dsaX_write_t * ctx, uint64_t bytes_written, unsigned eod)
-{
-
-  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_close_buffer(%"PRIu64", %d)", bytes_written, eod);
-
-  if (!ctx->block_open)
-  { 
-    syslog (LOG_ERR, "close_buffer: buffer already closed");
-    return -1;
-  }
-
-  // log any buffers that are not full, except for the 1 byte "EOD" buffer
-  if ((bytes_written != 1) && (bytes_written != ctx->hdu_bufsz))
-    syslog ((eod ? LOG_INFO : LOG_WARNING), "close_buffer: "
-              "bytes_written[%"PRIu64"] != hdu_bufsz[%"PRIu64"]", 
-              bytes_written, ctx->hdu_bufsz);
-
-  if (eod)
-  {
-    if (ipcio_update_block_write (ctx->hdu->data_block, bytes_written) < 0)
-    {
-      syslog (LOG_ERR, "close_buffer: ipcio_update_block_write failed");
-      return -1;
-    }
-  }
-  else 
-  {
-    if (ipcio_close_block_write (ctx->hdu->data_block, bytes_written) < 0)
-    {
-      syslog (LOG_ERR, "close_buffer: ipcio_close_block_write failed");
-      return -1;
-    }
-  }
-
-  wblock = 0;
-  ctx->block_open = 0;
-
-  return 0;
-}
-
-/* 
- *  move to the next ring buffer element. return pointer to base address of new buffer
- */
-int dsaX_udpdb_new_buffer (dsaX_write_t * ctx);
-int dsaX_udpdb_new_buffer (dsaX_write_t * ctx)
-{
-
-  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_new_buffer()");
-
-  if (dsaX_udpdb_close_buffer (ctx, ctx->hdu_bufsz, 0) < 0)
-  {
-    syslog (LOG_ERR, "new_buffer: dsaX_udpdb_close_buffer failed");
-    return -1;
-  }
-
-  if (dsaX_udpdb_open_buffer (ctx) < 0) 
-  {
-    syslog (LOG_ERR, "new_buffer: dsaX_udpdb_open_buffer failed");
-    return -1;
-  }
-
-  return 0;
-
-}
-
-// increment counters when block is full
-void dsaX_udpdb_increment (udpdb_t * ctx);
-void dsaX_udpdb_increment (udpdb_t * ctx)
-{
-
-  // increment buffer byte markers
-  writeBlock++;
-  block_start_byte = block_end_byte + UDP_DATA;
-  block_end_byte = block_start_byte + ( ctx->packets_per_buffer - 1) * UDP_DATA;
-  block_count = 0;
-
-}
-
-
-
-/* --------- THREADS -------- */
-
-// STATS THREAD
-
-/* 
- *  Thread to print simple capture statistics
- */
-void stats_thread(void * arg) {
-  
-  dsaX_stats_t * ctx = (dsaX_stats_t *) arg;
-  uint64_t b_rcv_total = 0;
-  uint64_t b_rcv_1sec = 0;
-  uint64_t b_rcv_curr = 0;
-
-  uint64_t b_drp_total = 0;
-  uint64_t b_drp_1sec = 0;
-  uint64_t b_drp_curr = 0;
-
-  uint64_t s_rcv_total = 0;
-  uint64_t s_rcv_1sec = 0;
-  uint64_t s_rcv_curr = 0;
-
-  uint64_t ooo_pkts = 0;
-  float gb_rcv_ps = 0;
-  float mb_rcv_ps = 0;
-  float mb_drp_ps = 0;
-
-  syslog(LOG_INFO,"starting stats thread...");
-  sleep(2);
-  syslog(LOG_INFO,"started stats thread...");
-  
-  while (!quit_threads)
-  {
-
-    /* get a snapshot of the data as quickly as possible */
-    b_rcv_curr = ctx->bytes->received;
-    b_drp_curr = ctx->bytes->dropped;
-    
-    /* calc the values for the last second */
-    b_rcv_1sec = b_rcv_curr - b_rcv_total;
-    b_drp_1sec = b_drp_curr - b_drp_total;
-
-    /* update the totals */
-    b_rcv_total = b_rcv_curr;
-    b_drp_total = b_drp_curr;
-
-    mb_rcv_ps = (double) b_rcv_1sec / 1000000;
-    mb_drp_ps = (double) b_drp_1sec / 1000000;
-    gb_rcv_ps = b_rcv_1sec * 8;
-    gb_rcv_ps /= 1000000000;    
-
-    /* determine how much memory is free in the receivers */
-    syslog (LOG_NOTICE,"CAPSTATS %6.3f [Gb/s], D %4.1f [MB/s], D %"PRIu64" pkts, %"PRIu64" skipped %d", gb_rcv_ps, mb_drp_ps, ctx->packets->dropped, last_seq, skipct);
-
-    sleep(1);
-  }
-
-}
-
-// CONTROL THREAD
-
-void control_thread (void * arg) {
-
-  syslog(LOG_INFO, "control_thread: starting");
-
-  // port on which to listen for control commands
-  int port = CAPTURE_CONTROL_PORT;
-  char sport[10];
-  sprintf(sport,"%d",port);
-
-  // buffer for incoming command strings, and setup of socket
-  int bufsize = 1024;
-  char* buffer = (char *) malloc (sizeof(char) * bufsize);
-  memset(buffer, '\0', bufsize);
-  const char* whitespace = " ";
-  char * command = 0;
-  char * args = 0;
-
-  struct addrinfo hints;
-  struct addrinfo* res=0;
-  memset(&hints,0,sizeof(hints));
-  struct sockaddr_storage src_addr;
-  socklen_t src_addr_len=sizeof(src_addr);
-  hints.ai_family=AF_INET;
-  hints.ai_socktype=SOCK_DGRAM;
-  getaddrinfo(iP,sport,&hints,&res);
-  int fd;
-  ssize_t ct;
-  char tmpstr;
-  char cmpstr = 'p';
-  char *endptr;
-  uint64_t tmps;
-  char * token;
-  
-  syslog(LOG_INFO, "control_thread: created socket on port %d", port);
-  
-  while (!quit_threads) {
-    
-    fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol);
-    bind(fd,res->ai_addr,res->ai_addrlen);
-    memset(buffer,'\0',sizeof(buffer));
-    syslog(LOG_INFO, "control_thread: waiting for packet");
-    ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len);
-    
-    syslog(LOG_INFO, "control_thread: received buffer string %s",buffer);
-
-    // INTERPRET BUFFER STRING
-    // receive either UTC_START, UTC_STOP, MONITOR
-
-    // interpret buffer string
-    char * rest = buffer;
-    char *cmd, *val;
-    cmd = strtok_r(rest, "-", &rest);
-    val = strtok_r(rest, "-", &rest);
-    syslog(LOG_INFO, "control_thread: split into COMMAND %s, VALUE %s",cmd,val);
-
-    if (strcmp(cmd,"UTC_START")==0)
-      UTC_START = strtoull(val,&endptr,0);
-
-    if (strcmp(cmd,"UTC_STOP")==0)
-      UTC_STOP = strtoull(val,&endptr,0);    
-    
-    close(fd);
-    
-  }
-
-  free (buffer);
-
-  syslog(LOG_INFO, "control_thread: exiting");
-
-  /* return 0 */
-  int thread_result = 0;
-  pthread_exit((void *) &thread_result);
-
-}
-
-
-/* 
- *  Thread to capture data
- */
-void recv_thread(void * arg) {
-
-  udpdb_t * udpdb = (udpdb_t *) arg;
-  int thread_id = udpdb->thread_id;
-    
-  // set affinity
-  const pthread_t pid = pthread_self();
-  const int core_id = cores[thread_id];
-  cpu_set_t cpuset;
-  CPU_ZERO(&cpuset);
-  CPU_SET(core_id, &cpuset);
-  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (set_result != 0)
-    syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id);
-  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (get_affinity != 0) 
-    syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id);
-  if (CPU_ISSET(core_id, &cpuset))
-    syslog(LOG_INFO,"thread %d: successfully set thread",core_id);
-
-  // set up socket
-  dsaX_sock_t * sock = dsaX_make_sock(udpdb);
-  
-  // DEFINITIONS
-  uint64_t tpack = 0;
-  uint64_t act_seq_no = 0;
-  uint64_t block_seq_no = 0;
-  uint64_t seq_no = 0;
-  uint64_t ant_id = 0;
-  unsigned char * b = (unsigned char *) sock->buf;
-  size_t got = 0; // data received from a recv_from call
-  int errsv; // determine the sequence number boundaries for curr and next buffers
-  int64_t byte_offset = 0; // offset of current packet in bytes from start of block
-  uint64_t seq_byte = 0; // offset of current packet in bytes from start of obs
-  // for "saving" out of order packets near edges of blocks
-  unsigned int temp_idx = 0;
-  unsigned int temp_max = 500;
-  char ** temp_buffers;
-  uint64_t * temp_seq_byte;
-  temp_buffers = (char **)malloc(sizeof(char *)*temp_max);
-  for (int i=0;i<temp_max;i++) temp_buffers[i] = (char *)malloc(sizeof(char)*UDP_DATA);
-  temp_seq_byte = (uint64_t *)malloc(sizeof(uint64_t)*temp_max);
-  unsigned i = 0;
-  uint64_t timeouts = 0;
-  uint64_t timeout_max = 1000000000;
-  int canWrite = 0;
-  int ct_snaps=0;
-  int mod_WB;
-
-  // infinite loop to receive packets
-
-  while (!quit_threads)
-    {
-
-      sock->have_packet = 0; 
-
-      // incredibly tight loop to try and get a packet
-      while (!sock->have_packet)
-	{
-	 
-	  // receive 1 packet into the socket buffer
-	  got = recvfrom ( sock->fd, sock->buf, UDP_PAYLOAD, 0, NULL, NULL );
-
-	  if (got == UDP_PAYLOAD) 
-	    {
-	      sock->have_packet = 1;
-	    } 
-	  else if (got == -1) 
-	    {
-	      errsv = errno;
-	      if (errsv == EAGAIN) 
-		{
-		  if (capture_started)
-		    timeouts++;
-		  //if (timeouts > timeout_max)
-		  //syslog(LOG_INFO, "timeouts[%"PRIu64"] > timeout_max[%"PRIu64"]\n",timeouts, timeout_max);		  
-		}
-	      else 
-		{
-		  //syslog (LOG_ERR, "receive_obs: recvfrom failed %s", strerror(errsv));
-		  return EXIT_FAILURE;
-		}
-	    } 
-	  else // we received a packet of the WRONG size, ignore it
-	    {
-	      syslog (LOG_NOTICE, "receive_obs: received %d bytes, expected %d", got, UDP_PAYLOAD);
-	    }
-	}
-      timeouts = 0;
-
-      // we have a valid packet within the timeout
-      if (sock->have_packet) 
-	{
-
-	  // decode packet header (64 bits)
-	  // 35 bits seq_no (for first spectrum in packet); 13 bits ch_id (for first channel in packet); 16 bits ant ID (for first antenna in packet)
-	  seq_no = 0;
-	  seq_no |=  (((uint64_t)(sock->buf[4]) & 224) >> 5) & 7;
-	  seq_no |=  (((uint64_t)(sock->buf[3])) << 3) & 2040;
-	  seq_no |=  (((uint64_t)(sock->buf[2])) << 11) & 522240;
-	  seq_no |=  (((uint64_t)(sock->buf[1])) << 19) & 133693440;
-	  seq_no |=  (((uint64_t)(sock->buf[0])) << 27) & 34225520640;
-	  ant_id = 0;
-	  ant_id |= (unsigned char) (sock->buf[6]) << 8;
-	  ant_id |= (unsigned char) (sock->buf[7]);
-	  
-	  act_seq_no = seq_no*NCHANG*NSNAPS/2 + ant_id*NCHANG/3; // actual seq no
-	  block_seq_no = UTC_START*NCHANG*NSNAPS/2; // seq no corresponding to ant 0 and start of block
-
-	  // set shared last_seq
-	  pthread_mutex_lock(&mutex);
-	  last_seq = seq_no;
-	  //syslog(LOG_INFO,"last_seq %"PRIu64"",last_seq);
-	  pthread_mutex_unlock(&mutex);
-	  
-	  // check for starting or stopping condition, using continue
-	  if (canWrite==0) {
-	    if (seq_no >= UTC_START-50 && UTC_START != 10000) {
-	      canWrite=1;	      
-	    }
-	  }
-	  if (canWrite == 0) continue;
-
-	  // threadsafe start of capture
-	  pthread_mutex_lock(&mutex);
-	  if (!(capture_started))
-	    {
-	      block_start_byte = block_seq_no * UDP_DATA;
-	      block_end_byte   = (block_start_byte + udpdb->hdu_bufsz) - UDP_DATA;
-	      capture_started = 1;
-
-	      syslog (LOG_INFO, "receive_obs: START [%"PRIu64" - %"PRIu64"]", block_start_byte, block_end_byte);
-	    }
-	  pthread_mutex_unlock(&mutex);
-
-	  // if capture running
-	  if (capture_started)
-	    {
-	      seq_byte = (act_seq_no * UDP_DATA);
-	      tpack++;
-	      
-	      // packet belongs in this block
-	      if ((seq_byte <= block_end_byte) && (seq_byte >= block_start_byte))
-		{
-		  byte_offset = seq_byte - (block_start_byte);
-		  mod_WB = writeBlock % 64;
-		  memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, sock->buf + UDP_HEADER, UDP_DATA);		  
-		  pthread_mutex_lock(&mutex);		  
-		  block_count++;
-		  //syslog(LOG_INFO,"block count %"PRIu64"",block_count);
-		  pthread_mutex_unlock(&mutex);
-		  
-		}
-	      // packet belongs in subsequent block
-	      else if (seq_byte > block_end_byte)
-		{
-		      
-		  if (temp_idx < temp_max)
-		    {
-		      // save packet to temp buffer
-		      memcpy (temp_buffers[temp_idx], sock->buf + UDP_HEADER, UDP_DATA);
-		      temp_seq_byte[temp_idx] = seq_byte;
-		      temp_idx++;
-		    }
-		}	    
-	    }
-	  
-	  // threadsafe end of block
-	  pthread_mutex_lock(&mutex);
-	  if ((block_count >= udpdb->packets_per_buffer) || (temp_idx >= temp_max))
-	    {
-	      syslog (LOG_INFO, "BLOCK COMPLETE thread_id=%d, seq_no=%"PRIu64", "
-		      "ant_id=%"PRIu16", block_count=%"PRIu64", "
-		      "temp_idx=%d, writeBlock=%d", thread_id, seq_no, ant_id,  block_count, 
-		      temp_idx,writeBlock);
-
-	      // write block
-	      // check whether doWrite has been released. If not, skip this block
-	      if (doWrite==1) skipBlock=1;
-	      else doWrite=1;
-	      
-	      uint64_t dropped = udpdb->packets_per_buffer - (block_count);
-	      udpdb->packets->received += (block_count);
-	      udpdb->bytes->received += (block_count) * UDP_DATA;	      
-	      if (dropped)
-		{
-		  udpdb->packets->dropped += dropped;
-		  udpdb->bytes->dropped += (dropped * UDP_DATA);
-		}
-
-	      // increment counters
-	      dsaX_udpdb_increment(udpdb);	      	
-
-	      // write temp queue for this thread
-	      //syslog(LOG_INFO,"thread %d: packets in this block %"PRIu64", temp_idx %d",thread_id,tpack,temp_idx);
-	      tpack = 0;
-	
-	      for (i=0; i < temp_idx; i++)
-		{
-		  seq_byte = temp_seq_byte[i];
-		  byte_offset = seq_byte - (block_start_byte);
-		  if (byte_offset < udpdb->hdu_bufsz && byte_offset >= 0)
-		    {
-		      mod_WB = writeBlock % 64;
-		      memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, temp_buffers[i], UDP_DATA);
-		      //pthread_mutex_lock(&mutex);
-		      block_count++;		      
-		      //pthread_mutex_unlock(&mutex);
-		    }
-		}
-	      temp_idx = 0;
-       
-	    }
-	  pthread_mutex_unlock(&mutex);
-
-	  // at this stage, can try and write temp queue safely for other threads
-	  if (temp_seq_byte[0] >= block_start_byte && temp_seq_byte[0] <= block_end_byte && temp_idx > 0)
-	    {
-	      //syslog(LOG_INFO,"thread %d: packets in this block %"PRIu64", temp_idx %d",thread_id,tpack,temp_idx);
-	      tpack = 0;
-	
-	      for (i=0; i < temp_idx; i++)
-		{
-		  seq_byte = temp_seq_byte[i];
-		  byte_offset = seq_byte - (block_start_byte);
-		  if (byte_offset < udpdb->hdu_bufsz && byte_offset >= 0)
-		    {
-		      mod_WB = writeBlock % 64;
-		      memcpy (udpdb->tblock + byte_offset + mod_WB*udpdb->hdu_bufsz, temp_buffers[i], UDP_DATA);
-		      pthread_mutex_lock(&mutex);
-		      block_count++;		      
-		      pthread_mutex_unlock(&mutex);
-		    }
-		}
-	      temp_idx = 0;
-
-	    }
-
-	}
-
-      // packet has been inserted or saved by this point
-      sock->have_packet = 0;
-	
-    }
-
-  dsaX_free_sock(sock);
-  free(temp_buffers);
-  free(temp_seq_byte);
-  
-}
-
-/* 
- *  Thread to write data
- */
-void write_thread(void * arg) {
-
-  dsaX_write_t * udpdb = (dsaX_write_t *) arg;
-  int thread_id = udpdb->thread_id;
-
-  // set affinity
-  const pthread_t pid = pthread_self();
-  const int core_id = write_cores[thread_id];
-  cpu_set_t cpuset;
-  CPU_ZERO(&cpuset);
-  CPU_SET(core_id, &cpuset);
-  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (set_result != 0)
-    syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id);
-  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (get_affinity != 0) 
-    syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id);
-  if (CPU_ISSET(core_id, &cpuset))
-    syslog(LOG_INFO,"thread %d: successfully set thread",core_id);
-    
-  int mod_WB = 0;
-  int a;
-  
-  while (!quit_threads)
-  {
-
-    while (!doWrite) {
-      a=1;
-    }    
-
-    // assume everything is set up
-    // wblock is assigned, write_ct=0
-    
-    mod_WB = lWriteBlock % 64;
-    memcpy(wblock + thread_id*udpdb->hdu_bufsz/nwth, udpdb->tblock + mod_WB*udpdb->hdu_bufsz  + thread_id*udpdb->hdu_bufsz/nwth, udpdb->hdu_bufsz/nwth);
-
-    pthread_mutex_lock(&mutex);
-    write_ct++;
-    pthread_mutex_unlock(&mutex);
-
-    //syslog(LOG_INFO,"write thread %d: successfully memcpied",thread_id);
-
-    // now wait until thread 0 has finished getting a new block before moving on
-    if (thread_id>0) {
-      while (write_ct!=0) a=1;
-    }
-    else {
-
-      // wait for all sub-blocks to be written
-      while (write_ct<nwth) a=1;
-
-      // get new block
-      if (dsaX_udpdb_new_buffer (udpdb) < 0)
-	{
-	  syslog(LOG_ERR, "receive_obs: dsaX_udpdb_new_buffer failed");
-	  return EXIT_FAILURE;
-	}
-
-      syslog(LOG_INFO,"write thread %d: written block... %d",thread_id,lWriteBlock);
-      lWriteBlock++;
-
-      // check for skipBlock - only log existence
-      if (skipBlock) {
-	skipct++;	
-      }
-      
-      // update doWrite and skipBlock
-      doWrite=0;
-      skipBlock=0;
-      write_ct = 0;
-
-    }
-     
-  }
-
-}
-
-
-	    
-// MAIN of program
-	
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_capture_manythread", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA Header plus Data Unit for writing */
-  dada_hdu_t* hdu_out = 0;
-  
-  // input data block HDU key
-  key_t out_key = CAPTURE_BLOCK_KEY;
-
-  // command line arguments
-  int core = -1;
-  int chgroup = 0;
-  int arg=0;
-  char dada_fnam[200]; // filename for dada header
-  char iface[100]; // IP for data packets
-  
-  while ((arg=getopt(argc,argv,"c:j:i:f:o:g:dh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {	      
-	      strcpy(iP,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'g':
-	  if (optarg)
-	    {	      
-	      chgroup = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-g flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'j':
-	  if (optarg)
-	    {	      
-	      strcpy(iface,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-j flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }      	
-	case 'f':
-	  if (optarg)
-	    {	      
-	      strcpy(dada_fnam,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-f flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	 
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // START THREADS
-  
-  // start control thread
-  int rval = 0;
-  pthread_t control_thread_id;
-  udpdb_t temp_str;
-  rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &temp_str);
-  if (rval != 0) {
-    syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval));
-    return -1;
-  }
-  syslog(LOG_NOTICE, "Created control thread, listening on %s:%d",iP,CAPTURE_CONTROL_PORT);
-  
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-  
-  
-  // OPEN CONNECTION TO DADA DB FOR WRITING
-
-  if (DEBUG) syslog(LOG_DEBUG,"Creating HDU");
-  
-  hdu_out  = dada_hdu_create ();
-  if (DEBUG) syslog(DEBUG,"Created hdu");
-  dada_hdu_set_key (hdu_out, CAPTURE_BLOCK_KEY);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog(LOG_ERR,"could not connect to output dada buffer");
-    return EXIT_FAILURE;
-  }
-  if (DEBUG) syslog(LOG_DEBUG,"Connected HDU");
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    dsaX_dbgpu_cleanup (hdu_out);
-    syslog(LOG_ERR,"could not lock to output dada buffer");
-    return EXIT_FAILURE;
-  }
-
-  syslog(LOG_INFO,"opened connection to output DB");
-
-  // DEAL WITH DADA HEADER
-  char *hout;
-  hout = (char *)malloc(sizeof(char)*4096);
-  if (DEBUG) syslog(DEBUG,"read header2");
-
-  if (fileread (dada_fnam, hout, 4096) < 0)
-    {
-      free (hout);
-      syslog (LOG_ERR, "could not read ASCII header from %s", dada_fnam);
-      return (EXIT_FAILURE);
-    }
-
-  
-  if (DEBUG) syslog(DEBUG,"read header3");
-  
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_out);
-      return EXIT_FAILURE;
-    }
-  
-  // copy the in header to the out header
-  memcpy (header_out, hout, 4096);
-
-  // mark the output header buffer as filled
-  if (ipcbuf_mark_filled (hdu_out->header_block, 4096) < 0)
-    {
-      syslog(LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  // record STATE info
-  sprintf(STATE,"LISTEN");
-  syslog(LOG_INFO,"marked output header block as filled - now in LISTEN state");
-
-
-  /* time to start up receiver. 
-     data are captured on iface:CAPTURE_PORT 
-  */
-
-  // make recv, write, and stats structs  
-  udpdb_t udpdb[nth];
-  dsaX_stats_t stats;
-  dsaX_write_t writey[nwth];
-
-  // shared variables and memory
-  uint64_t bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  char * tblock = (char *)malloc(sizeof(char)*bufsz*64);
-  stats_t * packets = init_stats_t();
-  stats_t * bytes = init_stats_t();
-  reset_stats_t(packets);
-  reset_stats_t(bytes);
-
-  // initialise stats struct
-  stats.packets = packets;
-  stats.bytes = bytes;
-
-  // initialise writey struct and open buffer
-  for (int i=0;i<nwth;i++) {
-    writey[i].hdu = hdu_out;
-    writey[i].hdu_bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-    writey[i].block_open = 0;
-    writey[i].tblock = tblock;
-    writey[i].thread_id = i;    
-  }
-  dsaX_udpdb_open_buffer (&writey[0]);
-
-  // initialise all udpdb structs
-  for (int i=0;i<nth;i++) {
-
-    // shared stuff
-    udpdb[i].packets = packets;
-    udpdb[i].bytes = bytes;
-    udpdb[i].tblock = tblock;
-
-    // the rest
-    udpdb[i].port = CAPTURE_PORT;
-    udpdb[i].interface = strdup(iface);
-    udpdb[i].hdu_bufsz = bufsz;
-    udpdb[i].packets_per_buffer = udpdb[i].hdu_bufsz / UDP_DATA;
-    udpdb[i].num_inputs = NSNAPS;
-    udpdb[i].verbose = 0;
-    udpdb[i].rcv_sleeps = 0;
-    
-    udpdb[i].thread_id = i;    
-    
-  }
-
-
-  /* start threads */
-    
-  // start the stats thread
-  pthread_t stats_thread_id;
-  rval = pthread_create (&stats_thread_id, 0, (void *) stats_thread, (void *) &stats);
-  if (rval != 0) {
-    syslog(LOG_INFO, "Error creating stats_thread: %s", strerror(rval));
-    return -1;
-  }
-  syslog(LOG_NOTICE, "started stats_thread()");
-
-  // start the receive threads
-  pthread_t recv_thread_id[nth];  
-  rval = 0;
-  for (int i=0;i<nth;i++) {
-    rval = pthread_create (&recv_thread_id[i], 0, (void *) recv_thread, (void *) (&udpdb[i]));
-    if (rval != 0) {
-      syslog(LOG_ERR, "Error creating recv_thread %d: %s", i,strerror(rval));
-      return -1;
-    }
-  }
-  syslog(LOG_NOTICE, "Created recv threads");
-
-  // start the write thread
-  pthread_t write_thread_id[nwth];
-  rval = 0;
-  for (int i=0;i<nwth;i++) {
-    rval = pthread_create (&write_thread_id[i], 0, (void *) write_thread, (void *) (&writey[i]));
-    if (rval != 0) {
-      syslog(LOG_INFO, "Error creating write_thread: %s", strerror(rval));
-      return -1;
-    }
-  }
-  syslog(LOG_NOTICE, "started write threads");  
-
-  while (!quit_threads) {
-    sleep(1);
-  }
-  
-  // close threads
-  syslog(LOG_INFO, "joining all threads");
-  quit_threads = 1;
-  void* result=0;
-  pthread_join (control_thread_id, &result);
-  pthread_join (stats_thread_id, &result);
-  for (int i=0;i<nth;i++) pthread_join(recv_thread_id[i], &result);
-  for (int i=0;i<nwth;i++) pthread_join(write_thread_id[i], &result);
-  
-  free(tblock);
-  dsaX_dbgpu_cleanup (hdu_out);
-
-}
diff --git a/src/dsaX_capture_pcap.c b/src/dsaX_capture_pcap.c
deleted file mode 100644
index 4921c68..0000000
--- a/src/dsaX_capture_pcap.c
+++ /dev/null
@@ -1,852 +0,0 @@
-/* dsaX_capture_pcap.c: Code to capture packets using pf_ring aware pcap and write to a dada buffer.
-
-control and stats threads: standard threads
-recv thread: simply runs pcap_loop, passing packets to callback function
-packet_callback: places packets directly into dada buffer, or temp buffer. gets new buffer if needed
-
-everything is in the dsaX_t structure
-
-
-*/
-
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture_pcap.h"
-#include "dsaX_def.h"
-#include "pcap.h"
-
-/* global variables */
-int quit_threads = 0;
-char STATE[20];
-uint64_t UTC_START = 10000;
-uint64_t UTC_STOP = 40000000000;
-int MONITOR = 0;
-char iP[100];
-int DEBUG = 0;
-int HISTOGRAM[16];
-int cores[2] = {17,19};
-pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
-volatile int canWrite = 0;
-volatile  unsigned capture_started = 0;
-volatile char * wblock;
-volatile uint64_t last_seq;
-const int nth = 1;
-const int nwth = 1;
-const int TEMP_MAXY = 1000;
-volatile int skipped = 0;
-const int NBLOCKS = 8;
-volatile uint64_t writeBlock[8] = {0, 0, 0, 0, 0, 0, 0, 0};
-volatile int delayBlock = 0;
-volatile int behindBlock = 0;
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * out);
-int dada_bind_thread_to_core (int core);
-void usage();
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * out)
-{
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_out");
-    }
-  dada_hdu_destroy (out);  
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_capture [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -i IP to listen on for control commands [no default]\n"	
-	   " -f filename of template dada header [no default]\n"
-	   " -o out_key [default CAPTURE_BLOCK_KEY]\n"
-	   " -d send debug messages to syslog\n"
-	   " -h print usage\n");
-}
-
-/* 
- *  open a data block buffer ready for direct access
- */
-int dsaX_udpdb_open_buffer (dsaX_t * ctx);
-int dsaX_udpdb_open_buffer (dsaX_t * ctx)
-{
-
-  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_open_buffer()");
-
-  if (ctx->block_open)
-  {
-    syslog (LOG_ERR, "open_buffer: buffer already opened");
-    return -1;
-  }
-
-  if (DEBUG) syslog (LOG_DEBUG, "open_buffer: ipcio_open_block_write");
-
-  uint64_t block_id = 0;
-
-  wblock = ipcio_open_block_write (ctx->hdu->data_block, &block_id);
-  if (!wblock)
-  { 
-    syslog (LOG_ERR, "open_buffer: ipcio_open_block_write failed");
-    return -1;
-  }
-
-  ctx->block_open = 1;
-
-  return 0;
-}
-
-/*
- *  close a data buffer, assuming a full block has been written
- */
-int dsaX_udpdb_close_buffer (dsaX_t * ctx, uint64_t bytes_written, unsigned eod);
-int dsaX_udpdb_close_buffer (dsaX_t * ctx, uint64_t bytes_written, unsigned eod)
-{
-
-  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_close_buffer(%"PRIu64", %d)", bytes_written, eod);
-
-  if (!ctx->block_open)
-  { 
-    syslog (LOG_ERR, "close_buffer: buffer already closed");
-    return -1;
-  }
-
-  // log any buffers that are not full, except for the 1 byte "EOD" buffer
-  if ((bytes_written != 1) && (bytes_written != ctx->hdu_bufsz))
-    syslog ((eod ? LOG_INFO : LOG_WARNING), "close_buffer: "
-              "bytes_written[%"PRIu64"] != hdu_bufsz[%"PRIu64"]", 
-              bytes_written, ctx->hdu_bufsz);
-
-  if (eod)
-  {
-    if (ipcio_update_block_write (ctx->hdu->data_block, bytes_written) < 0)
-    {
-      syslog (LOG_ERR, "close_buffer: ipcio_update_block_write failed");
-      return -1;
-    }
-  }
-  else 
-  {
-    if (ipcio_close_block_write (ctx->hdu->data_block, bytes_written) < 0)
-    {
-      syslog (LOG_ERR, "close_buffer: ipcio_close_block_write failed");
-      return -1;
-    }
-  }
-
-  wblock = 0;
-  ctx->block_open = 0;
-
-  return 0;
-}
-
-/* 
- *  move to the next ring buffer element. return pointer to base address of new buffer
- */
-int dsaX_udpdb_new_buffer (dsaX_t * ctx);
-int dsaX_udpdb_new_buffer (dsaX_t * ctx)
-{
-
-  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_new_buffer()");
-
-  if (dsaX_udpdb_close_buffer (ctx, ctx->hdu_bufsz, 0) < 0)
-  {
-    syslog (LOG_ERR, "new_buffer: dsaX_udpdb_close_buffer failed");
-    return -1;
-  }
-
-  if (dsaX_udpdb_open_buffer (ctx) < 0) 
-  {
-    syslog (LOG_ERR, "new_buffer: dsaX_udpdb_open_buffer failed");
-    return -1;
-  }
-
-  return 0;
-
-}
-
-// increment counters when block is full
-void dsaX_udpdb_increment (dsaX_t * ctx);
-void dsaX_udpdb_increment (dsaX_t * ctx)
-{
-
-  // increment buffer byte markers
-  ctx->block_start_byte = ctx->block_end_byte + UDP_DATA;
-  ctx->block_end_byte = ctx->block_start_byte + ( ctx->packets_per_buffer - 1) * UDP_DATA;
-  ctx->block_count = 0;
-
-}
-
-
-
-/* --------- THREADS -------- */
-
-// STATS THREAD
-
-/* 
- *  Thread to print simple capture statistics
- */
-void stats_thread(void * arg) {
-  
-  dsaX_stats_t * ctx = (dsaX_stats_t *) arg;
-  uint64_t b_rcv_total = 0;
-  uint64_t b_rcv_1sec = 0;
-  uint64_t b_rcv_curr = 0;
-
-  uint64_t b_drp_total = 0;
-  uint64_t b_drp_1sec = 0;
-  uint64_t b_drp_curr = 0;
-
-  uint64_t s_rcv_total = 0;
-  uint64_t s_rcv_1sec = 0;
-  uint64_t s_rcv_curr = 0;
-
-  uint64_t ooo_pkts = 0;
-  float gb_rcv_ps = 0;
-  float mb_rcv_ps = 0;
-  float mb_drp_ps = 0;
-
-  syslog(LOG_INFO,"starting stats thread...");
-  sleep(2);
-  syslog(LOG_INFO,"started stats thread...");
-  
-  while (!quit_threads)
-  {
-
-    /* get a snapshot of the data as quickly as possible */
-    b_rcv_curr = ctx->bytes->received;
-    b_drp_curr = ctx->bytes->dropped;
-    
-    /* calc the values for the last second */
-    b_rcv_1sec = b_rcv_curr - b_rcv_total;
-    b_drp_1sec = b_drp_curr - b_drp_total;
-
-    /* update the totals */
-    b_rcv_total = b_rcv_curr;
-    b_drp_total = b_drp_curr;
-
-    mb_rcv_ps = (double) b_rcv_1sec / 1000000;
-    mb_drp_ps = (double) b_drp_1sec / 1000000;
-    gb_rcv_ps = b_rcv_1sec * 8;
-    gb_rcv_ps /= 1000000000;    
-
-    /* determine how much memory is free in the receivers */
-    syslog (LOG_NOTICE,"CAPSTATS %6.3f [Gb/s], D %4.1f [MB/s], D %"PRIu64" pkts, %"PRIu64" skipped %d %d", gb_rcv_ps, mb_drp_ps, ctx->packets->dropped, last_seq, behindBlock, skipped);
-
-    sleep(1);
-  }
-
-}
-
-// CONTROL THREAD
-
-void control_thread (void * arg) {
-
-  syslog(LOG_INFO, "control_thread: starting");
-
-  // port on which to listen for control commands
-  int port = CAPTURE_CONTROL_PORT;
-  char sport[10];
-  sprintf(sport,"%d",port);
-
-  // buffer for incoming command strings, and setup of socket
-  int bufsize = 1024;
-  char* buffer = (char *) malloc (sizeof(char) * bufsize);
-  memset(buffer, '\0', bufsize);
-  const char* whitespace = " ";
-  char * command = 0;
-  char * args = 0;
-
-  struct addrinfo hints;
-  struct addrinfo* res=0;
-  memset(&hints,0,sizeof(hints));
-  struct sockaddr_storage src_addr;
-  socklen_t src_addr_len=sizeof(src_addr);
-  hints.ai_family=AF_INET;
-  hints.ai_socktype=SOCK_DGRAM;
-  getaddrinfo(iP,sport,&hints,&res);
-  int fd;
-  ssize_t ct;
-  char tmpstr;
-  char cmpstr = 'p';
-  char *endptr;
-  uint64_t tmps;
-  char * token;
-  
-  syslog(LOG_INFO, "control_thread: created socket on port %d", port);
-  
-  while (!quit_threads) {
-    
-    fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol);
-    bind(fd,res->ai_addr,res->ai_addrlen);
-    memset(buffer,'\0',sizeof(buffer));
-    syslog(LOG_INFO, "control_thread: waiting for packet");
-    ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len);
-    
-    syslog(LOG_INFO, "control_thread: received buffer string %s",buffer);
-
-    // INTERPRET BUFFER STRING
-    // receive either UTC_START, UTC_STOP, MONITOR
-
-    // interpret buffer string
-    char * rest = buffer;
-    char *cmd, *val;
-    cmd = strtok_r(rest, "-", &rest);
-    val = strtok_r(rest, "-", &rest);
-    syslog(LOG_INFO, "control_thread: split into COMMAND %s, VALUE %s",cmd,val);
-
-    if (strcmp(cmd,"UTC_START")==0)
-      UTC_START = strtoull(val,&endptr,0);
-
-    if (strcmp(cmd,"UTC_STOP")==0)
-      UTC_STOP = strtoull(val,&endptr,0);    
-    
-    close(fd);
-    
-  }
-
-  free (buffer);
-
-  syslog(LOG_INFO, "control_thread: exiting");
-
-  /* return 0 */
-  int thread_result = 0;
-  pthread_exit((void *) &thread_result);
-
-}
-
-/*
-This is important - packet callback function to place packets in buffer
-called upon single packet being received
-*/
-void packet_callback(u_char *args, const struct pcap_pkthdr* header, const u_char* packet) {
-
-  dsaX_t * udpdb = (dsaX_t *) args;
-
-  // make sure packet has right length and get payload
-  if (header->len != UDP_PAYLOAD + 42) {
-    syslog(LOG_INFO,"received packet with length %d, total available %d",header->len,header->caplen);
-    return;
-  }
-  char *buf = (char *)(packet + 42);
-  
-  // process packet header
-  uint64_t seq_no=0, ant_id=0;
-  seq_no |=  (((uint64_t)(buf[4]) & 224) >> 5) & 7;
-  seq_no |=  (((uint64_t)(buf[3])) << 3) & 2040;
-  seq_no |=  (((uint64_t)(buf[2])) << 11) & 522240;
-  seq_no |=  (((uint64_t)(buf[1])) << 19) & 133693440;
-  seq_no |=  (((uint64_t)(buf[0])) << 27) & 34225520640;
-  ant_id |= (unsigned char) (buf[6]) << 8;
-  ant_id |= (unsigned char) (buf[7]);	  
-  uint64_t act_seq_no = seq_no*NCHANG*NSNAPS/2 + ant_id*NCHANG/3; // actual seq no
-  uint64_t block_seq_no = UTC_START*NCHANG*NSNAPS/2; // seq no corresponding to ant 0 and start of block
-  last_seq = seq_no;
-    
-  // check for starting condition
-  if (canWrite==0) {
-    if (seq_no >= UTC_START-500 && UTC_START != 10000) {
-      canWrite=1;	      
-    }
-  }
-  if (canWrite == 0) return;
-
-  // deal with start of capture
-  if (!(capture_started))
-    {
-      udpdb->block_start_byte = block_seq_no * UDP_DATA;
-      udpdb->block_end_byte   = (udpdb->block_start_byte + udpdb->hdu_bufsz) - UDP_DATA;
-      capture_started = 1;      
-      syslog (LOG_INFO, "receive_obs: START [%"PRIu64" - %"PRIu64"]", udpdb->block_start_byte, udpdb->block_end_byte);
-    }
-
-  // if capture has started, do good stuff
-  uint64_t byte_offset, seq_byte;
-  if (capture_started) {
-
-    seq_byte = (act_seq_no * UDP_DATA);
-
-    // packet belongs in this block
-    if ((seq_byte <= udpdb->block_end_byte) && (seq_byte >= udpdb->block_start_byte))
-      {
-	byte_offset = seq_byte - (udpdb->block_start_byte);
-	memcpy(udpdb->tblock + udpdb->tblock_idx*NPACKETS_PER_BLOCK*NSNAPS*UDP_DATA + byte_offset, buf + UDP_HEADER, UDP_DATA);	
-	//memcpy(wblock + byte_offset, buf + UDP_HEADER, UDP_DATA);
-	udpdb->block_count++;
-      }
-    // packet belongs in subsequent block
-    else if (seq_byte > udpdb->block_end_byte)
-      {
-	if (udpdb->temp_idx < TEMP_MAXY)
-	  {
-	    // save packet to temp buffer
-	    memcpy (udpdb->temp_buffers + udpdb->temp_idx*UDP_DATA, buf + UDP_HEADER, UDP_DATA);
-	    udpdb->temp_seq_byte[udpdb->temp_idx] = seq_byte;
-	    udpdb->temp_idx++;
-	  }
-      }
-  }
-
-  // end of block
-  if ((udpdb->block_count >= udpdb->packets_per_buffer) || (udpdb->temp_idx >= TEMP_MAXY))
-    {
-      syslog (LOG_INFO, "BLOCK COMPLETE seq_no=%"PRIu64", "
-	      "ant_id=%"PRIu16", block_count=%"PRIu64", "
-	      "temp_idx=%d", seq_no, ant_id,
-	      udpdb->block_count, udpdb->temp_idx);
-
-      // set write block on this block
-      if (writeBlock[udpdb->tblock_idx]==1)
-	skipped++;
-      writeBlock[udpdb->tblock_idx] = 1;
-      
-      // increment tblock_idx
-      udpdb->tblock_idx+=1;
-      if (udpdb->tblock_idx==NBLOCKS)
-	udpdb->tblock_idx = 0;
-
-      // get delay_block
-      udpdb->nblocks_written++;
-      behindBlock = udpdb->nblocks_written - delayBlock;
-      
-      // deal with counters
-      uint64_t dropped = udpdb->packets_per_buffer - (udpdb->block_count);
-      udpdb->packets->received += (udpdb->block_count);
-      udpdb->bytes->received += (udpdb->block_count) * UDP_DATA;
-      if (dropped)
-	{
-	  udpdb->packets->dropped += dropped;
-	  udpdb->bytes->dropped += (dropped * UDP_DATA);
-	}
-      dsaX_udpdb_increment(udpdb);
-
-      // write temp queue
-      for (int i=0; i < udpdb->temp_idx; i++) {
-	seq_byte = udpdb->temp_seq_byte[i];
-	byte_offset = seq_byte - udpdb->block_start_byte;
-	if (byte_offset < udpdb->hdu_bufsz && byte_offset >= 0) {
-	  memcpy(udpdb->tblock + udpdb->tblock_idx*NPACKETS_PER_BLOCK*NSNAPS*UDP_DATA + byte_offset, udpdb->temp_buffers + i*UDP_DATA, UDP_DATA);
-	  udpdb->block_count++;
-	}
-      }
-      udpdb->temp_idx = 0;
-
-    }	  
- 
-}
-
-// Thread to do writing
-
-void write_thread(void * arg) {
-
-  dsaX_t * udpdb = (dsaX_t *) arg;
-  int thread_id = 2;
-
-  // set affinity
-  const pthread_t pid = pthread_self();
-  const int core_id = cores[1];
-  cpu_set_t cpuset;
-  CPU_ZERO(&cpuset);
-  CPU_SET(core_id, &cpuset);
-  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (set_result != 0)
-    syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id);
-  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (get_affinity != 0) 
-    syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id);
-  if (CPU_ISSET(core_id, &cpuset))
-    syslog(LOG_INFO,"thread %d: successfully set thread",core_id);
-
-  int a, lWriteBlock=0;
-  while (!quit_threads) {
-
-    // busywait
-    while (writeBlock[lWriteBlock]==0)
-      a=1;
-
-    // write block
-    memcpy(wblock, udpdb->tblock + lWriteBlock*UDP_DATA*NSNAPS*NPACKETS_PER_BLOCK, UDP_DATA*NSNAPS*NPACKETS_PER_BLOCK);
-
-    // get new block
-    if (dsaX_udpdb_new_buffer (udpdb) < 0)
-      {
-	syslog(LOG_ERR, "receive_obs: dsaX_udpdb_new_buffer failed");
-	return EXIT_FAILURE;
-      }
-
-    // increment counters    
-    writeBlock[lWriteBlock] = 0;
-    lWriteBlock++;
-    if (lWriteBlock==NBLOCKS)
-      lWriteBlock = 0;
-    delayBlock++;
-    
-  }
-}
-
-/*
-Thread to run pcap, passing to callback function
-*/
-
-void pcap_thread(void * arg) {
-
-  dsaX_t * udpdb = (dsaX_t *) arg;
-  int thread_id = 1;//udpdb->thread_id;
-    
-  // set affinity
-  const pthread_t pid = pthread_self();
-  const int core_id = cores[0];
-  cpu_set_t cpuset;
-  CPU_ZERO(&cpuset);
-  CPU_SET(core_id, &cpuset);
-  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (set_result != 0)
-    syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id);
-  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (get_affinity != 0) 
-    syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id);
-  if (CPU_ISSET(core_id, &cpuset))
-    syslog(LOG_INFO,"thread %d: successfully set thread",core_id);
-
-  // set up pcap from port CAPTURE_PORT
-  char dev[] = "eth0";
-  pcap_t *handle;
-  char error_buffer[PCAP_ERRBUF_SIZE];
-  struct bpf_program filter;
-  char filter_exp[] = "port 4011";
-  bpf_u_int32 subnet_mask, ip;
-
-  if (pcap_lookupnet(dev, &ip, &subnet_mask, error_buffer) == -1) {
-    syslog(LOG_ERR,"Could not get information for device: %s", dev);
-    ip = 0;
-    subnet_mask = 0;
-  }
-  handle = pcap_open_live(dev, 4659, 0, 1, error_buffer);
-  if (handle == NULL) {
-    syslog(LOG_ERR,"Could not open %s - %s", dev, error_buffer);
-    return 2;
-  }
-  
-  if (pcap_compile(handle, &filter, filter_exp, 1, ip) == -1) {
-    syslog(LOG_ERR,"Bad filter - %s", pcap_geterr(handle));
-    return 2;
-  }
-  if (pcap_setfilter(handle, &filter) == -1) {
-    syslog(LOG_ERR,"Error setting filter - %s\n", pcap_geterr(handle));
-    return 2;
-  }
-
-  /*  if((pcap_set_buffer_size(handle, 2*1024*1024))!=0)
-    {
-      syslog(LOG_ERR, "Could not set buffer size");
-      return 2;
-      }*/
-
-  
-  syslog(LOG_INFO,"thread %d: successfully set up pcap",thread_id);
-
-  // start up RX!
-  while (!quit_threads)
-    pcap_loop(handle, 0, packet_callback, (u_char*)udpdb);
-
-  // finish
-  pcap_close(handle);
-  
-}
-
-
-	    
-// MAIN of program
-	
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_capture_pcap", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA Header plus Data Unit for writing */
-  dada_hdu_t* hdu_out = 0;
-  
-  // input data block HDU key
-  key_t out_key = CAPTURE_BLOCK_KEY;
-
-  // command line arguments
-  int core = -1;
-  int arg=0;
-  char dada_fnam[200]; // filename for dada header
-  
-  while ((arg=getopt(argc,argv,"c:i:f:o:dh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {	      
-	      strcpy(iP,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }      	
-	case 'f':
-	  if (optarg)
-	    {	      
-	      strcpy(dada_fnam,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-f flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	 
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // START THREADS
-  
-  // start control thread
-  int rval = 0;
-  pthread_t control_thread_id;
-  dsaX_t temp_str;
-  rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &temp_str);
-  if (rval != 0) {
-    syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval));
-    return -1;
-  }
-  syslog(LOG_NOTICE, "Created control thread, listening on %s:%d",iP,CAPTURE_CONTROL_PORT);
-  
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-  
-  
-  // OPEN CONNECTION TO DADA DB FOR WRITING
-
-  if (DEBUG) syslog(LOG_DEBUG,"Creating HDU");
-  
-  hdu_out  = dada_hdu_create ();
-  if (DEBUG) syslog(DEBUG,"Created hdu");
-  dada_hdu_set_key (hdu_out, CAPTURE_BLOCK_KEY);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog(LOG_ERR,"could not connect to output dada buffer");
-    return EXIT_FAILURE;
-  }
-  if (DEBUG) syslog(LOG_DEBUG,"Connected HDU");
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    dsaX_dbgpu_cleanup (hdu_out);
-    syslog(LOG_ERR,"could not lock to output dada buffer");
-    return EXIT_FAILURE;
-  }
-
-  syslog(LOG_INFO,"opened connection to output DB");
-
-  // DEAL WITH DADA HEADER
-  char *hout;
-  hout = (char *)malloc(sizeof(char)*4096);
-  if (DEBUG) syslog(DEBUG,"read header2");
-
-  if (fileread (dada_fnam, hout, 4096) < 0)
-    {
-      free (hout);
-      syslog (LOG_ERR, "could not read ASCII header from %s", dada_fnam);
-      return (EXIT_FAILURE);
-    }
-
-  
-  if (DEBUG) syslog(DEBUG,"read header3");
-  
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_out);
-      return EXIT_FAILURE;
-    }
-  
-  // copy the in header to the out header
-  memcpy (header_out, hout, 4096);
-
-  // mark the output header buffer as filled
-  if (ipcbuf_mark_filled (hdu_out->header_block, 4096) < 0)
-    {
-      syslog(LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  // record STATE info
-  sprintf(STATE,"LISTEN");
-  syslog(LOG_INFO,"marked output header block as filled - now in LISTEN state");
-
-
-  /* time to start up receiver. 
-  */
-
-  // make recv, write, and stats structs  
-  dsaX_t udpdb[nth];
-  dsaX_stats_t stats;
-
-  // shared variables and memory
-  uint64_t bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);  
-  stats_t * packets = init_stats_t();
-  stats_t * bytes = init_stats_t();
-  reset_stats_t(packets);
-  reset_stats_t(bytes);
-  char * tblock = (char *)malloc(sizeof(char)*NBLOCKS*(ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block)));
-  char * temp_buffers = (char *)malloc(sizeof(char)*TEMP_MAXY*UDP_DATA);
-  char * temp_seq_byte = (uint64_t *)malloc(sizeof(uint64_t)*TEMP_MAXY);
-  
-  // initialise stats struct
-  stats.packets = packets;
-  stats.bytes = bytes;
-
-  for (int i=0;i<nth;i++) {
-
-    udpdb[i].hdu = hdu_out;
-    udpdb[i].hdu_bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-    udpdb[i].block_open = 0;
-    udpdb[i].block_count = 0;
-    udpdb[i].tblock = tblock;
-    udpdb[i].tblock_idx = 0;
-    udpdb[i].temp_buffers = temp_buffers;
-    udpdb[i].temp_seq_byte = temp_seq_byte;
-    udpdb[i].temp_idx = 0;
-    udpdb[i].thread_id = 1;
-    udpdb[i].verbose = 0;
-    udpdb[i].packets_per_buffer = udpdb[i].hdu_bufsz / UDP_DATA;
-    udpdb[i].packets = packets;
-    udpdb[i].bytes = bytes;
-    udpdb[i].nblocks_written = 0;
-
-  }    
-  dsaX_udpdb_open_buffer (&udpdb[0]);
-
-  /* start threads */
-    
-  // start the stats thread
-  pthread_t stats_thread_id;
-  rval = pthread_create (&stats_thread_id, 0, (void *) stats_thread, (void *) &stats);
-  if (rval != 0) {
-    syslog(LOG_INFO, "Error creating stats_thread: %s", strerror(rval));
-    return -1;
-  }
-  syslog(LOG_NOTICE, "started stats_thread()");
-
-  // start the receive threads
-  pthread_t recv_thread_id[nth];  
-  rval = 0;
-  for (int i=0;i<nth;i++) {
-    rval = pthread_create (&recv_thread_id[i], 0, (void *) pcap_thread, (void *) (&udpdb[i]));
-    if (rval != 0) {
-      syslog(LOG_ERR, "Error creating recv_thread %d: %s", i,strerror(rval));
-      return -1;
-    }
-  }
-  syslog(LOG_NOTICE, "Created recv threads");
-
-  // start the write threads
-  pthread_t write_thread_id[nwth];  
-  rval = 0;
-  for (int i=0;i<nwth;i++) {
-    rval = pthread_create (&write_thread_id[i], 0, (void *) write_thread, (void *) (&udpdb[i]));
-    if (rval != 0) {
-      syslog(LOG_ERR, "Error creating write_thread %d: %s", i,strerror(rval));
-      return -1;
-    }
-  }
-  syslog(LOG_NOTICE, "Created write threads");
-
-  
-  while (!quit_threads) {
-    sleep(1);
-  }
-  
-  // close threads
-  syslog(LOG_INFO, "joining all threads");
-  quit_threads = 1;
-  void* result=0;
-  pthread_join (control_thread_id, &result);
-  pthread_join (stats_thread_id, &result);
-  for (int i=0;i<nth;i++) pthread_join(recv_thread_id[i], &result);
-  for (int i=0;i<nwth;i++) pthread_join(write_thread_id[i], &result);
-  
-  free(tblock);
-  free(temp_buffers);
-  free(temp_seq_byte);
-  dsaX_dbgpu_cleanup (hdu_out);
-
-}
diff --git a/src/dsaX_capture_thread.c b/src/dsaX_capture_thread.c
deleted file mode 100644
index 49019be..0000000
--- a/src/dsaX_capture_thread.c
+++ /dev/null
@@ -1,1107 +0,0 @@
-/* dsaX_capture.c: Code to capture packets over a socket and write to a dada buffer.
-
-main: runs capture loop, and interfaces dada buffer
-control_thread: deals with control commands
-
-*/
-
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-/* global variables */
-int quit_threads = 0;
-char STATE[20];
-uint64_t UTC_START = 10000;
-uint64_t UTC_STOP = 40000000000;
-int MONITOR = 0;
-char iP[100];
-int DEBUG = 0;
-int HISTOGRAM[16];
-int writeBlock = 0;
-volatile int doWrite = 0;
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * out);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * out)
-{
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_out");
-    }
-  dada_hdu_destroy (out);
-
-  
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_capture [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -j IP to listen on for data packets [no default]\n"
-	   " -i IP to listen on for control commands [no default]\n"	
-	   " -f filename of template dada header [no default]\n"
-	   " -o out_key [default CAPTURE_BLOCK_KEY]\n"
-	   " -d send debug messages to syslog\n"
-	   " -g chgroup [default 0]\n"
-	   " -h print usage\n");
-}
-
-/*
- * create a socket with the specified number of buffers
- */
-dsaX_sock_t * dsaX_init_sock ()
-{
-  dsaX_sock_t * b = (dsaX_sock_t *) malloc(sizeof(dsaX_sock_t));
-  assert(b != NULL);
-
-  b->bufsz = sizeof(char) * UDP_PAYLOAD;
-
-  b->buf = (char *) malloc (b->bufsz);
-  assert(b->buf != NULL);
-
-  b->have_packet = 0;
-  b->fd = 0;
-
-  return b;
-}
-
-void dsaX_free_sock(dsaX_sock_t* b)
-{
-  b->fd = 0;
-  b->bufsz = 0;
-  b->have_packet =0;
-  if (b->buf)
-    free (b->buf);
-  b->buf = 0;
-}
-
-/* 
- *  intialize UDP receiver resources
- */
-int dsaX_udpdb_init_receiver (udpdb_t * ctx)
-{
-  syslog(LOG_INFO,"dsax_udpdb_init_receiver()");
-
-  // create a dsaX socket which can hold variable num of UDP packet
-  ctx->sock = dsaX_init_sock();
-
-  ctx->ooo_packets = 0;
-  ctx->recv_core = -1;
-  ctx->n_sleeps = 0;
-  ctx->mb_rcv_ps = 0;
-  ctx->mb_drp_ps = 0;
-  ctx->block_open = 0;
-  ctx->block_count = 0;
-  ctx->capture_started = 0;
-  ctx->last_seq = 0;
-  ctx->last_byte = 0;
-  ctx->block_start_byte = 0;
-
-  // allocate required memory strucutres
-  ctx->packets = init_stats_t();
-  ctx->bytes   = init_stats_t();
-  return 0;
-}
-
-/* 
-prepare socket and writer
-*/
-
-int dsaX_udpdb_prepare (udpdb_t * ctx)
-{
-  syslog(LOG_INFO, "dsaX_udpdb_prepare()");
-
-  // open socket
-  syslog(LOG_INFO, "prepare: creating udp socket on %s:%d", ctx->interface, ctx->port);
-  ctx->sock->fd = dada_udp_sock_in(ctx->log, ctx->interface, ctx->port, ctx->verbose);
-  if (ctx->sock->fd < 0) {
-    syslog (LOG_ERR, "Error, Failed to create udp socket");
-    return -1;
-  }
-
-  
-  // set the socket size to 256 MB
-  int sock_buf_size = 256*1024*1024;
-  syslog(LOG_INFO, "prepare: setting buffer size to %d", sock_buf_size);
-  dada_udp_sock_set_buffer_size (ctx->log, ctx->sock->fd, ctx->verbose, sock_buf_size);
-
-  // set the socket to non-blocking
-  syslog(LOG_INFO, "prepare: setting non_block");
-  sock_nonblock(ctx->sock->fd);
-
-  // clear any packets buffered by the kernel
-  syslog(LOG_INFO, "prepare: clearing packets at socket");
-  size_t cleared = dada_sock_clear_buffered_packets(ctx->sock->fd, UDP_PAYLOAD);
-
-  // setup the next_seq to the initial value
-  //ctx->last_seq = 0;
-  //ctx->last_byte = 0;
-  //ctx->n_sleeps = 0;
-
-  return 0;
-}
-
-/*
- *  reset receiver before an observation commences
- */
-void dsaX_udpdb_reset_receiver (udpdb_t * ctx) 
-{
-  syslog (LOG_INFO, "dsaX_udpdb_reset_receiver()");
-
-  ctx->capture_started = 0;
-  ctx->last_seq = 0;
-  ctx->last_byte = 0;
-  ctx->n_sleeps = 0;
-
-  reset_stats_t(ctx->packets);
-  reset_stats_t(ctx->bytes);
-}
-
-/* 
- *  open a data block buffer ready for direct access
- */
-int dsaX_udpdb_open_buffer (udpdb_t * ctx)
-{
-
-  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_open_buffer()");
-
-  if (ctx->block_open)
-  {
-    syslog (LOG_ERR, "open_buffer: buffer already opened");
-    return -1;
-  }
-
-  if (DEBUG) syslog (LOG_DEBUG, "open_buffer: ipcio_open_block_write");
-
-  uint64_t block_id = 0;
-
-  ctx->block = ipcio_open_block_write (ctx->hdu->data_block, &block_id);
-  if (!ctx->block)
-  { 
-    syslog (LOG_ERR, "open_buffer: ipcio_open_block_write failed");
-    return -1;
-  }
-
-  ctx->block_open = 1;
-
-  return 0;
-}
-
-/*
- *  close a data buffer, assuming a full block has been written
- */
-int dsaX_udpdb_close_buffer (udpdb_t * ctx, uint64_t bytes_written, unsigned eod)
-{
-
-  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_close_buffer(%"PRIu64", %d)", bytes_written, eod);
-
-  if (!ctx->block_open)
-  { 
-    syslog (LOG_ERR, "close_buffer: buffer already closed");
-    return -1;
-  }
-
-  // log any buffers that are not full, except for the 1 byte "EOD" buffer
-  if ((bytes_written != 1) && (bytes_written != ctx->hdu_bufsz))
-    syslog ((eod ? LOG_INFO : LOG_WARNING), "close_buffer: "
-              "bytes_written[%"PRIu64"] != hdu_bufsz[%"PRIu64"]", 
-              bytes_written, ctx->hdu_bufsz);
-
-  if (eod)
-  {
-    if (ipcio_update_block_write (ctx->hdu->data_block, bytes_written) < 0)
-    {
-      syslog (LOG_ERR, "close_buffer: ipcio_update_block_write failed");
-      return -1;
-    }
-  }
-  else 
-  {
-    if (ipcio_close_block_write (ctx->hdu->data_block, bytes_written) < 0)
-    {
-      syslog (LOG_ERR, "close_buffer: ipcio_close_block_write failed");
-      return -1;
-    }
-  }
-
-  ctx->block = 0;
-  ctx->block_open = 0;
-
-  return 0;
-}
-
-// increment counters when block is full
-int dsaX_udpdb_increment (udpdb_t * ctx)
-{
-
-  // increment buffer byte markers
-  ctx->block_start_byte = ctx->block_end_byte + UDP_DATA;
-  ctx->block_end_byte = ctx->block_start_byte + ( ctx->packets_per_buffer - 1) * UDP_DATA;
-  ctx->block_count = 0;
-  if (writeBlock==0) writeBlock=1;
-  else writeBlock=0;
-
-}
-
-/* 
- *  move to the next ring buffer element. return pointer to base address of new buffer
- */
-int dsaX_udpdb_new_buffer (udpdb_t * ctx)
-{
-
-  if (DEBUG) syslog (LOG_DEBUG, "dsaX_udpdb_new_buffer()");
-
-  if (dsaX_udpdb_close_buffer (ctx, ctx->hdu_bufsz, 0) < 0)
-  {
-    syslog (LOG_ERR, "new_buffer: dsaX_udpdb_close_buffer failed");
-    return -1;
-  }
-
-  if (dsaX_udpdb_open_buffer (ctx) < 0) 
-  {
-    syslog (LOG_ERR, "new_buffer: dsaX_udpdb_open_buffer failed");
-    return -1;
-  }
-
-
-  // set block to 0
-  //memset(ctx->block,0,ctx->block_end_byte-ctx->block_start_byte);
-  
-  if (DEBUG) syslog(LOG_DEBUG, "new_buffer: buffer_bytes [%"PRIu64" - %"PRIu64"]", 
-             ctx->block_start_byte, ctx->block_end_byte);
-
-  return 0;
-
-}
-
-/* 
- *  destroy UDP receiver resources 
- */
-int dsaX_udpdb_destroy_receiver (udpdb_t * ctx)
-{
-  if (ctx->sock)
-    dsaX_free_sock(ctx->sock);
-  ctx->sock = 0;
-}
-
-/*
- * Close the udp socket and file
- */
-
-int udpdb_stop_function (udpdb_t* ctx)
-{
-
-  syslog(LOG_INFO, "stop: dada_hdu_unlock_write()");
-  if (dada_hdu_unlock_write (ctx->hdu) < 0)
-  {
-    syslog (LOG_ERR, "stop: could not unlock write on");
-    return -1;
-  }
-
-  // close the UDP socket
-  close(ctx->sock->fd);
-
-  if (ctx->packets->dropped)
-  {
-    double percent = (double) ctx->bytes->dropped / (double) ctx->last_byte;
-    percent *= 100;
-
-    syslog(LOG_INFO, "bytes dropped %"PRIu64" / %"PRIu64 " = %8.6f %",
-             ctx->bytes->dropped, ctx->last_byte, percent);
-  }
-
-  return 0;
-}
-
-
-
-
-/* --------- THREADS -------- */
-
-// STATS THREAD
-
-/* 
- *  Thread to print simple capture statistics
- */
-void stats_thread(void * arg) {
-
-  /*  // set affinity
-  const pthread_t pid = pthread_self();
-  const int core_id = 4;
-  cpu_set_t cpuset;
-  CPU_ZERO(&cpuset);
-  CPU_SET(core_id, &cpuset);
-  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (set_result != 0)
-    syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id);
-  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (get_affinity != 0) 
-    syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id);
-  if (CPU_ISSET(core_id, &cpuset))
-    syslog(LOG_INFO,"thread %d: successfully set thread",core_id);
-  */
-  
-  udpdb_t * ctx = (udpdb_t *) arg;
-  uint64_t b_rcv_total = 0;
-  uint64_t b_rcv_1sec = 0;
-  uint64_t b_rcv_curr = 0;
-
-  uint64_t b_drp_total = 0;
-  uint64_t b_drp_1sec = 0;
-  uint64_t b_drp_curr = 0;
-
-  uint64_t s_rcv_total = 0;
-  uint64_t s_rcv_1sec = 0;
-  uint64_t s_rcv_curr = 0;
-
-  uint64_t ooo_pkts = 0;
-  float gb_rcv_ps = 0;
-  float mb_rcv_ps = 0;
-  float mb_drp_ps = 0;
-
-  syslog(LOG_INFO,"starting stats thread...");
-  sleep(2);
-  syslog(LOG_INFO,"started stats thread...");
-  
-  while (!quit_threads)
-  {
-
-    /* get a snapshot of the data as quickly as possible */
-    b_rcv_curr = ctx->bytes->received;
-    b_drp_curr = ctx->bytes->dropped;
-    s_rcv_curr = ctx->n_sleeps;
-    
-    /* calc the values for the last second */
-    b_rcv_1sec = b_rcv_curr - b_rcv_total;
-    b_drp_1sec = b_drp_curr - b_drp_total;
-    s_rcv_1sec = s_rcv_curr - s_rcv_total;
-
-    /* update the totals */
-    b_rcv_total = b_rcv_curr;
-    b_drp_total = b_drp_curr;
-    s_rcv_total = s_rcv_curr;
-
-    mb_rcv_ps = (double) b_rcv_1sec / 1000000;
-    mb_drp_ps = (double) b_drp_1sec / 1000000;
-    gb_rcv_ps = b_rcv_1sec * 8;
-    gb_rcv_ps /= 1000000000;    
-
-    /* determine how much memory is free in the receivers */
-    syslog (LOG_NOTICE,"CAPSTATS %6.3f [Gb/s], D %4.1f [MB/s], D %"PRIu64" pkts, %"PRIu64"", gb_rcv_ps, mb_drp_ps, ctx->packets->dropped, ctx->last_seq);
-
-    sleep(1);
-  }
-
-}
-
-
-
-
-
-
-
-// CONTROL THREAD
-
-void control_thread (void * arg) {
-
-  udpdb_t * ctx = (udpdb_t *) arg;
-  syslog(LOG_INFO, "control_thread: starting");
-
-  // port on which to listen for control commands
-  int port = CAPTURE_CONTROL_PORT;
-  char sport[10];
-  sprintf(sport,"%d",port);
-
-  // buffer for incoming command strings, and setup of socket
-  int bufsize = 1024;
-  char* buffer = (char *) malloc (sizeof(char) * bufsize);
-  memset(buffer, '\0', bufsize);
-  const char* whitespace = " ";
-  char * command = 0;
-  char * args = 0;
-
-  struct addrinfo hints;
-  struct addrinfo* res=0;
-  memset(&hints,0,sizeof(hints));
-  struct sockaddr_storage src_addr;
-  socklen_t src_addr_len=sizeof(src_addr);
-  hints.ai_family=AF_INET;
-  hints.ai_socktype=SOCK_DGRAM;
-  getaddrinfo(iP,sport,&hints,&res);
-  int fd;
-  ssize_t ct;
-  char tmpstr;
-  char cmpstr = 'p';
-  char *endptr;
-  uint64_t tmps;
-  char * token;
-  
-  syslog(LOG_INFO, "control_thread: created socket on port %d", port);
-  
-  while (!quit_threads) {
-    
-    fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol);
-    bind(fd,res->ai_addr,res->ai_addrlen);
-    memset(buffer,'\0',sizeof(buffer));
-    syslog(LOG_INFO, "control_thread: waiting for packet");
-    ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len);
-    
-    syslog(LOG_INFO, "control_thread: received buffer string %s",buffer);
-
-    // INTERPRET BUFFER STRING
-    // receive either UTC_START, UTC_STOP, MONITOR
-
-    // interpret buffer string
-    char * rest = buffer;
-    char *cmd, *val;
-    cmd = strtok_r(rest, "-", &rest);
-    val = strtok_r(rest, "-", &rest);
-    syslog(LOG_INFO, "control_thread: split into COMMAND %s, VALUE %s",cmd,val);
-
-    if (strcmp(cmd,"UTC_START")==0)
-      UTC_START = strtoull(val,&endptr,0);
-
-    if (strcmp(cmd,"UTC_STOP")==0)
-      UTC_STOP = strtoull(val,&endptr,0);    
-    
-    close(fd);
-    
-  }
-
-  free (buffer);
-
-  syslog(LOG_INFO, "control_thread: exiting");
-
-  /* return 0 */
-  int thread_result = 0;
-  pthread_exit((void *) &thread_result);
-
-}
-
-
-/* 
- *  Thread to capture data
- */
-int recv_thread(void * arg) {
-
-  // set affinity
-  const pthread_t pid = pthread_self();
-  const int core_id = 34;
-  cpu_set_t cpuset;
-  CPU_ZERO(&cpuset);
-  CPU_SET(core_id, &cpuset);
-  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (set_result != 0)
-    syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id);
-  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (get_affinity != 0) 
-    syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id);
-  if (CPU_ISSET(core_id, &cpuset))
-    syslog(LOG_INFO,"thread %d: successfully set thread",core_id);
-  
-  
-  udpdb_t * udpdb = (udpdb_t *) arg;
-
-    /* START WHAT WAS in RECV THREAD */
-
-  // DEFINITIONS
-
-  uint64_t act_seq_no = 0;
-  uint64_t block_seq_no = 0;
-  uint64_t seq_no = 0;
-  uint64_t ch_id = 0;
-  uint64_t ant_id = 0;
-  unsigned char * b = (unsigned char *) udpdb->sock->buf;
-  size_t got = 0; // data received from a recv_from call
-  int errsv; // determine the sequence number boundaries for curr and next buffers
-  int64_t byte_offset = 0; // offset of current packet in bytes from start of block
-  uint64_t seq_byte = 0; // offset of current packet in bytes from start of obs
-  // for "saving" out of order packets near edges of blocks
-  unsigned int temp_idx = 0;
-  unsigned int temp_max = 1000;
-  char ** temp_buffers; //[temp_max][UDP_DATA];
-  uint64_t * temp_seq_byte;
-  temp_buffers = (char **)malloc(sizeof(char *)*temp_max);
-  for (int i=0;i<temp_max;i++) temp_buffers[i] = (char *)malloc(sizeof(char)*UDP_DATA);
-  temp_seq_byte = (uint64_t *)malloc(sizeof(uint64_t)*temp_max);
-  unsigned i = 0;
-  uint64_t timeouts = 0;
-  uint64_t timeout_max = 1000000000;
-  int canWrite = 0;
-  int ct_snaps=0;
-
-  // infinite loop to receive packets
-  // use stats thread to monitor STATE at this stage, to save resources here
-
-  while (!quit_threads)
-    {
-
-      udpdb->sock->have_packet = 0; 
-
-      // incredibly tight loop to try and get a packet
-      while (!udpdb->sock->have_packet)
-	{
-	 
-	  // receive 1 packet into the socket buffer
-	  got = recvfrom ( udpdb->sock->fd, udpdb->sock->buf, UDP_PAYLOAD, 0, NULL, NULL );
-
-	  if (got == UDP_PAYLOAD) 
-	    {
-	      udpdb->sock->have_packet = 1;
-	    } 
-	  else if (got == -1) 
-	    {
-	      errsv = errno;
-	      if (errsv == EAGAIN) 
-		{
-		  udpdb->n_sleeps++;
-		  if (udpdb->capture_started)
-		    timeouts++;
-		  if (timeouts > timeout_max)
-		    syslog(LOG_INFO, "timeouts[%"PRIu64"] > timeout_max[%"PRIu64"]\n",timeouts, timeout_max);		  
-		}
-	      else 
-		{
-		  syslog (LOG_ERR, "receive_obs: recvfrom failed %s", strerror(errsv));
-		  return EXIT_FAILURE;
-		}
-	    } 
-	  else // we received a packet of the WRONG size, ignore it
-	    {
-	      syslog (LOG_NOTICE, "receive_obs: received %lu bytes, expected %d", got, UDP_PAYLOAD);
-	    }
-	}
-      timeouts = 0;
-
-      // we have a valid packet within the timeout
-      if (udpdb->sock->have_packet) 
-	{
-
-	  // decode packet header (64 bits)
-	  // 35 bits seq_no (for first spectrum in packet); 13 bits ch_id (for first channel in packet); 16 bits ant ID (for first antenna in packet)
-	  seq_no = 0;
-	  seq_no |=  (((uint64_t)(udpdb->sock->buf[4]) & 224) >> 5) & 7;
-	  seq_no |=  (((uint64_t)(udpdb->sock->buf[3])) << 3) & 2040;
-	  seq_no |=  (((uint64_t)(udpdb->sock->buf[2])) << 11) & 522240;
-	  seq_no |=  (((uint64_t)(udpdb->sock->buf[1])) << 19) & 133693440;
-	  seq_no |=  (((uint64_t)(udpdb->sock->buf[0])) << 27) & 34225520640;
-	  ant_id = 0;
-	  ant_id |= (unsigned char) (udpdb->sock->buf[6]) << 8;
-	  ant_id |= (unsigned char) (udpdb->sock->buf[7]);
-	  
-	  act_seq_no = seq_no*NCHANG*NSNAPS/2 + ant_id*NCHANG/3; // actual seq no
-	  block_seq_no = UTC_START*NCHANG*NSNAPS/2; // seq no corresponding to ant 0 and start of block
-
-	  // check for starting or stopping condition, using continue
-	  if (canWrite==0) {
-	    if (seq_no >= UTC_START-50 && UTC_START != 10000) ct_snaps++;
-	    if (ct_snaps >= 10) canWrite=1;
-	  }
-	  udpdb->last_seq = seq_no;
-	  if (canWrite == 0) continue;
-	  
-	  // if first packet
-	  if (!udpdb->capture_started)
-	    {
-	      udpdb->block_start_byte = block_seq_no * UDP_DATA;
-	      udpdb->block_end_byte   = (udpdb->block_start_byte + udpdb->hdu_bufsz) - UDP_DATA;
-	      udpdb->capture_started = 1;
-
-	      syslog (LOG_INFO, "receive_obs: START [%"PRIu64" - %"PRIu64"]", udpdb->block_start_byte, udpdb->block_end_byte);
-	    }
-
-	  // if capture running
-	  if (udpdb->capture_started)
-	    {
-	      seq_byte = (act_seq_no * UDP_DATA);	      
-
-	      udpdb->last_byte = seq_byte;
-	      
-	      // if packet arrived too late, ignore
-	      if (seq_byte < udpdb->block_start_byte)
-		{
-		  udpdb->packets->dropped++;
-		  udpdb->bytes->dropped += UDP_DATA;
-		}
-	      else
-		{
-		  // packet belongs in this block
-		  if (seq_byte <= udpdb->block_end_byte)
-		    {
-		      byte_offset = seq_byte - udpdb->block_start_byte;
-		      memcpy (udpdb->tblock + byte_offset + writeBlock*udpdb->hdu_bufsz, udpdb->sock->buf + UDP_HEADER, UDP_DATA);
-		      udpdb->packets->received++;
-		      udpdb->bytes->received += UDP_DATA;
-		      udpdb->block_count++;
-		    }
-		  // packet belongs in subsequent block
-		  else
-		    {
-		      
-		      if (temp_idx < temp_max)
-			{
-			  // save packet to temp buffer
-			  memcpy (temp_buffers[temp_idx], udpdb->sock->buf + UDP_HEADER, UDP_DATA);
-			  temp_seq_byte[temp_idx] = seq_byte;
-			  temp_idx++;
-			}
-		      else
-			{
-			  udpdb->packets->dropped++;
-			  udpdb->bytes->dropped += UDP_DATA;
-			}
-		    }
-		}
-	    }
-
-	  // now check for a full buffer or full temp queue
-	  if ((udpdb->block_count >= udpdb->packets_per_buffer) || (temp_idx >= temp_max))
-	    {
-	      syslog (LOG_INFO, "BLOCK COMPLETE seq_no=%"PRIu64", "
-		      "ant_id=%"PRIu16", block_count=%"PRIu64", "
-		      "temp_idx=%d\n", seq_no, ant_id,  udpdb->block_count, 
-		      temp_idx);
-
-	      // write block
-	      doWrite=1;
-	      
-	      uint64_t dropped = udpdb->packets_per_buffer - udpdb->block_count;
-	      if (dropped)
-		{
-		  udpdb->packets->dropped += dropped;
-		  udpdb->bytes->dropped += (dropped * UDP_DATA);
-		}
-
-	      // increment counters
-	      dsaX_udpdb_increment(udpdb);
-
-	      // write any temp packets saved
-
-	      if (DEBUG) syslog(LOG_INFO, "block bytes: %"PRIu64" - %"PRIu64"\n", udpdb->block_start_byte, udpdb->block_end_byte);
-  
-	      // include any futuristic packets we saved
-	      for (i=0; i < temp_idx; i++)
-		{
-		  seq_byte = temp_seq_byte[i];
-		  byte_offset = seq_byte - udpdb->block_start_byte;
-		  if (byte_offset < udpdb->hdu_bufsz)
-		    {
-		      memcpy (udpdb->tblock + byte_offset + writeBlock*udpdb->hdu_bufsz, temp_buffers[i], UDP_DATA);
-		      udpdb->block_count++;
-		      udpdb->packets->received++;
-		      udpdb->bytes->received += UDP_DATA;
-		    }
-		  else
-		    {
-		      udpdb->packets->dropped++;
-		      udpdb->bytes->dropped += UDP_DATA;
-		    }
-		}
-	      temp_idx = 0;
-	    }	     
-
-	}
-
-      // packet has been inserted or saved by this point
-      udpdb->sock->have_packet = 0;
-      
-	
-    }
-
-
-  free(temp_buffers);
-  free(temp_seq_byte);
-  
-}
-
-/* 
- *  Thread to write data
- */
-int write_thread(void * arg) {
-
-  // set affinity
-  const pthread_t pid = pthread_self();
-  const int core_id = 36;
-  cpu_set_t cpuset;
-  CPU_ZERO(&cpuset);
-  CPU_SET(core_id, &cpuset);
-  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (set_result != 0)
-    syslog(LOG_ERR,"thread %d: setaffinity_np fail",core_id);
-  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (get_affinity != 0) 
-    syslog(LOG_ERR,"thread %d: getaffinity_np fail",core_id);
-  if (CPU_ISSET(core_id, &cpuset))
-    syslog(LOG_INFO,"thread %d: successfully set thread",core_id);
-  
-  
-  udpdb_t * udpdb = (udpdb_t *) arg;
-  int lWriteBlock = 0;
-  int a;
-  
-  while (!quit_threads)
-  {
-
-    while (!doWrite) {
-      a=1;
-    }
-    
-    syslog(LOG_INFO,"writing block...");
-    
-    memcpy(udpdb->block, udpdb->tblock + lWriteBlock*udpdb->hdu_bufsz, udpdb->hdu_bufsz);
-    
-    if (dsaX_udpdb_new_buffer (udpdb) < 0)
-      {
-	syslog(LOG_ERR, "receive_obs: dsaX_udpdb_new_buffer failed");
-	return EXIT_FAILURE;
-      }
-    
-    doWrite=0;
-    if (lWriteBlock==0) lWriteBlock=1;
-    else lWriteBlock=0;
-     
-  }
-
-}
-
-
-	    
-// MAIN of program
-	
-int main (int argc, char *argv[]) {
-
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_capture_thread", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA Header plus Data Unit for writing */
-  dada_hdu_t* hdu_out = 0;
-
-  /* actual struct with info */
-  udpdb_t udpdb;
-  
-  // input data block HDU key
-  key_t out_key = CAPTURE_BLOCK_KEY;
-
-  // command line arguments
-  int core = -1;
-  int chgroup = 0;
-  int arg=0;
-  char dada_fnam[200]; // filename for dada header
-  char iface[100]; // IP for data packets
-  
-  while ((arg=getopt(argc,argv,"c:j:i:f:o:g:dh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {	      
-	      strcpy(iP,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'g':
-	  if (optarg)
-	    {	      
-	      chgroup = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-g flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'j':
-	  if (optarg)
-	    {	      
-	      strcpy(iface,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-j flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }      	
-	case 'f':
-	  if (optarg)
-	    {	      
-	      strcpy(dada_fnam,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-f flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	 
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // record STATE info
-  sprintf(STATE,"NOBUFFER");
-
-  // START THREADS
-  
-  // start control thread
-  int rval = 0;
-  pthread_t control_thread_id, stats_thread_id;
-  if (DEBUG)
-    syslog (LOG_DEBUG, "Creating threads");
-  rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb);
-  if (rval != 0) {
-    syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval));
-    return -1;
-  }
-  syslog(LOG_NOTICE, "Created control thread, listening on %s:%d",iP,CAPTURE_CONTROL_PORT);
-
-  // start the stats thread
-  rval = pthread_create (&stats_thread_id, 0, (void *) stats_thread, (void *) &udpdb);
-  if (rval != 0) {
-    syslog(LOG_INFO, "Error creating stats_thread: %s", strerror(rval));
-    return -1;
-  }
-  syslog(LOG_NOTICE, "started stats_thread()");
-
-  
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  // initialize the data structure
-  syslog (LOG_INFO, "main: dsaX_udpdb_init_receiver()");
-  if (dsaX_udpdb_init_receiver (&udpdb) < 0)
-  {
-    syslog (LOG_ERR, "could not initialize receiver");
-    return EXIT_FAILURE;
-  }
-  
-  
-  // OPEN CONNECTION TO DADA DB FOR WRITING
-
-  if (DEBUG) syslog(LOG_DEBUG,"Creating HDU");
-  
-  hdu_out  = dada_hdu_create (0);
-  if (DEBUG) syslog(DEBUG,"Created hdu");
-  dada_hdu_set_key (hdu_out, CAPTURE_BLOCK_KEY);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog(LOG_ERR,"could not connect to output dada buffer");
-    return EXIT_FAILURE;
-  }
-  if (DEBUG) syslog(LOG_DEBUG,"Connected HDU");
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    dsaX_dbgpu_cleanup (hdu_out);
-    syslog(LOG_ERR,"could not lock to output dada buffer");
-    return EXIT_FAILURE;
-  }
-
-  syslog(LOG_INFO,"opened connection to output DB");
-
-  // DEAL WITH DADA HEADER
-  char *hout;
-  hout = (char *)malloc(sizeof(char)*4096);
-  if (DEBUG) syslog(DEBUG,"read header2");
-
-  if (fileread (dada_fnam, hout, 4096) < 0)
-    {
-      free (hout);
-      syslog (LOG_ERR, "could not read ASCII header from %s", dada_fnam);
-      return (EXIT_FAILURE);
-    }
-
-  
-  if (DEBUG) syslog(DEBUG,"read header3");
-
-  
-  
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_out);
-      return EXIT_FAILURE;
-    }
-
-
-  
-  // copy the in header to the out header
-  memcpy (header_out, hout, 4096);
-
-  // mark the output header buffer as filled
-  if (ipcbuf_mark_filled (hdu_out->header_block, 4096) < 0)
-    {
-      syslog(LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  // record STATE info
-  sprintf(STATE,"LISTEN");
-  syslog(LOG_INFO,"marked output header block as filled - now in LISTEN state");
-
-
-  /* time to start up receiver. 
-     data are captured on iface:CAPTURE_PORT 
-  */
-
-  printf("here\n");
-  
-  
-  // put information in udpdb struct
-  udpdb.hdu = hdu_out;
-  udpdb.port = CAPTURE_PORT;
-  udpdb.interface = strdup(iface);
-  udpdb.hdu_bufsz = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);  
-  char * tblock = (char *)malloc(sizeof(char)*udpdb.hdu_bufsz);
-  udpdb.tblock = tblock;
-  // determine number of packets per block, must 
-  if (udpdb.hdu_bufsz % UDP_DATA != 0)
-  {
-    syslog(LOG_ERR, "data block size for [%"PRIu64"] was not a multiple of the UDP_DATA size [%d]\n", udpdb.hdu_bufsz, UDP_DATA);
-    return EXIT_FAILURE;
-  }
-  udpdb.packets_per_buffer = udpdb.hdu_bufsz / UDP_DATA;  
-  udpdb.bytes_to_acquire = 0;
-  udpdb.num_inputs = NSNAPS;
-
-  // prepare the socket
-  syslog(LOG_INFO, "main: dsaX_udpdb_prepare()");
-  if (dsaX_udpdb_prepare (&udpdb) < 0)
-  {
-    syslog(LOG_ERR, "could allocate required resources (prepare)");
-    return EXIT_FAILURE;
-  }
-  
-  // reset the receiver
-  syslog(LOG_INFO, "main: dsaX_udpdb_reset_receiver()");
-  dsaX_udpdb_reset_receiver (&udpdb);
-
-  // open a block of the data block, ready for writing
-  if (dsaX_udpdb_open_buffer (&udpdb) < 0)
-  {
-    syslog (LOG_ERR, "start: dsaX_udpdb_open_buffer failed");
-    return -1;
-  }
-
-
-  // start threads
-
-  // start recv thread
-  rval = 0;
-  pthread_t recv_thread_id, write_thread_id;
-  rval = pthread_create (&recv_thread_id, 0, (void *) recv_thread, (void *) &udpdb);
-  if (rval != 0) {
-    syslog(LOG_ERR, "Error creating recv_thread: %s", strerror(rval));
-    return -1;
-  }
-  syslog(LOG_NOTICE, "Created recv thread");
-
-  // start the write thread
-  rval = pthread_create (&write_thread_id, 0, (void *) write_thread, (void *) &udpdb);
-  if (rval != 0) {
-    syslog(LOG_INFO, "Error creating write_thread: %s", strerror(rval));
-    return -1;
-  }
-  syslog(LOG_NOTICE, "started write_thread()");  
-
-  while (!quit_threads) {
-    sleep(1);
-  }
-  
-  // close threads
-  syslog(LOG_INFO, "joining all threads");
-  quit_threads = 1;
-  void* result=0;
-  pthread_join (control_thread_id, &result);
-  pthread_join (stats_thread_id, &result);
-  pthread_join (recv_thread_id, &result);
-  pthread_join (write_thread_id, &result);
-  
-  free(tblock);
-  
-  dsaX_dbgpu_cleanup (hdu_out);
-
-}
diff --git a/src/dsaX_copydb.c b/src/dsaX_copydb.c
deleted file mode 100644
index 7714038..0000000
--- a/src/dsaX_copydb.c
+++ /dev/null
@@ -1,273 +0,0 @@
-/* will reorder raw data for input to xgpu */
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-// global variables
-int DEBUG = 0;
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
-{
-
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
-  dada_hdu_destroy (in);
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock write on hdu_out");
-    }
-  dada_hdu_destroy (out);
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_fake [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -i in_key [default TEST_BLOCK_KEY]\n"
-	   " -o out_key [default REORDER_BLOCK_KEY2]\n"
-	   " -h print usage\n");
-}
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_copydb", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-
-  // data block HDU keys
-  key_t in_key = TEST_BLOCK_KEY;
-  key_t out_key = REORDER_BLOCK_KEY2;
-  
-  // command line arguments
-  int core = -1;
-  int useZ = 1;
-  char fnam[100];
-  int arg = 0;
-  
-  while ((arg=getopt(argc,argv,"c:f:i:o:dh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
-  uint64_t  bytes_read = 0;
-  char * block;
-  uint64_t written, block_id;
-
-
-  // set up
-  int observation_complete=0;
-  int blocks = 0, started = 0;
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-    }
-
-    // DO STUFF
-
-    written = ipcio_write (hdu_out->data_block, block, block_out);
-    if (written < block_out)
-      {
-	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
-	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-	return EXIT_FAILURE;
-      }
-    
-    if (DEBUG) {
-      syslog(LOG_DEBUG, "written block %d",blocks);      
-    }
-    blocks++;
-
-    
-
-    if (bytes_read < block_size)
-      observation_complete = 1;
-    
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-  
-}
-
-
diff --git a/src/dsaX_cuda_correlator.cu b/src/dsaX_cuda_correlator.cu
deleted file mode 100644
index 3bebd09..0000000
--- a/src/dsaX_cuda_correlator.cu
+++ /dev/null
@@ -1,309 +0,0 @@
-// -*- c++ -*-
-/* will run xgpu */
-/* assumes input block size is appropriate */
-#define THRUST_IGNORE_CUB_VERSION_CHECK
-
-#include <iostream>
-#include <algorithm>
-using std::cout;
-using std::cerr;
-using std::endl;
-#include <stdio.h>
-#include <stdlib.h>
-#include <cmath>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <netinet/in.h>
-#include <time.h>
-#include <syslog.h>
-#include <pthread.h>
-
-#include <thrust/fill.h>
-#include <thrust/device_vector.h>
-#include <thrust/sequence.h>
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-#include <thrust/scatter.h>
-
-//#include "dada_cuda.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "multilog.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_def.h"
-//#include "cube/cube.h"
-#include "xgpu.h"
- 
-
-#ifdef __MACH__
-#include <mach/mach_time.h>
-#define CLOCK_REALTIME 0
-#define CLOCK_MONOTONIC 0
-int clock_gettime(int clk_id, struct timespec *t){
-    mach_timebase_info_data_t timebase;
-    mach_timebase_info(&timebase);
-    uint64_t time;
-    time = mach_absolute_time();
-    double nseconds = ((double)time * (double)timebase.numer)/((double)timebase.denom);
-    double seconds = ((double)time * (double)timebase.numer)/((double)timebase.denom * 1e9);
-    t->tv_sec = seconds;
-    t->tv_nsec = nseconds;
-    return 0;
-}
-#else
-#include <time.h>
-#endif
-
-/*
-  Data ordering for input vectors is (running from slowest to fastest)
-  [time][channel][station][polarization][complexity]
-
-  Output matrix has ordering
-  [channel][station][station][polarization][polarization][complexity]
-*/
-
-int main(int argc, char** argv) {
-
-  int opt;
-  int i, j;
-  int device = 0;
-  unsigned int seed = 1;
-  int outer_count = 1;
-  int count = 1;
-  int syncOp = SYNCOP_SYNC_TRANSFER;
-  int finalSyncOp = SYNCOP_DUMP;
-  int verbose = 0;
-  int hostAlloc = 0;
-  XGPUInfo xgpu_info;
-  unsigned int npol, nstation, nfrequency;
-  int xgpu_error = 0;
-  Complex *omp_matrix_h = NULL;
-  struct timespec outer_start, start, stop, outer_stop;
-  double total, per_call, max_bw, gbps;
-#ifdef RUNTIME_STATS
-  struct timespec tic, toc;
-#endif
-
-  while ((opt = getopt(argc, argv, "C:c:d:f:ho:rs:v:")) != -1) {
-    switch (opt) {
-      case 'c':
-        // Set number of time to call xgpuCudaXengine
-        count = strtoul(optarg, NULL, 0);
-        if(count < 1) {
-          fprintf(stderr, "count must be positive\n");
-          return 1;
-        }
-        break;
-      case 'C':
-        // Set number of time to call xgpuCudaXengine
-        outer_count = strtoul(optarg, NULL, 0);
-        if(outer_count < 1) {
-          fprintf(stderr, "outer count must be positive\n");
-          return 1;
-        }
-        break;
-      case 'd':
-        // Set CUDA device number
-        device = strtoul(optarg, NULL, 0);
-        break;
-      case 'f':
-        // Set syncOp for final call
-        finalSyncOp = strtoul(optarg, NULL, 0);
-        break;
-      case 'o':
-        // Set syncOp
-        syncOp = strtoul(optarg, NULL, 0);
-        break;
-      case 'r':
-        // Register host allocated memory
-        hostAlloc = 1;
-        break;
-      case 's':
-        // Set seed for random data
-        seed = strtoul(optarg, NULL, 0);
-        break;
-      case 'v':
-        // Set verbosity level
-        verbose = strtoul(optarg, NULL, 0);
-        break;
-      default: /* '?' */
-        fprintf(stderr,
-            "Usage: %s [options]\n"
-            "Options:\n"
-            "  -c INTEG_CALLS    Calls to xgpuCudaXengine per integration [1]\n"
-            "  -C INTEG_COUNT    Number of integrations [1]\n"
-            "  -d DEVNUM         GPU device to use [0]\n"
-            "  -f FINAL_SYNCOP   Sync operation for final call [1]\n"
-            "  -o SYNCOP         Sync operation for all but final call [1]\n"
-            "                    Sync operation values are:\n"
-            "                         0 (no sync)\n"
-            "                         1 (sync and dump)\n"
-            "                         2 (sync host to device transfer)\n"
-            "                         3 (sync kernel computations)\n"
-            "  -r                Register host allocated memory [false]\n"
-            "                    (otherwise use CUDA allocated memory)\n"
-            "  -s SEED           Random number seed [1]\n"
-            "  -v {0|1|2|3}      Verbosity level (debug only) [0]\n"
-            "  -h                Show this message\n",
-            argv[0]);
-        exit(EXIT_FAILURE);
-    }
-  }
-
-  srand(seed);
-
-  // Get sizing info from library
-  xgpuInfo(&xgpu_info);
-  npol = xgpu_info.npol;
-  nstation = xgpu_info.nstation;
-  nfrequency = xgpu_info.nfrequency;
-
-  printf("Correlating %u stations with %u channels and integration length %u\n",
-	 xgpu_info.nstation, xgpu_info.nfrequency, xgpu_info.ntime);
-#ifndef FIXED_POINT
-  printf("Sending floating point data to GPU.\n");
-#else
-  printf("Sending fixed point data to GPU.\n");
-#endif
-
-  // perform host memory allocation
-
-  // allocate the GPU X-engine memory
-  XGPUContext context;
-  context.array_len = xgpu_info.vecLength;
-  context.matrix_len = xgpu_info.matLength;
-  context.array_h = NULL;
-  context.matrix_h = NULL;
-
-  xgpu_error = xgpuInit(&context, device);
-
-  ComplexInput *array_h = context.array_h; // this is pinned memory
-  Complex *cuda_matrix_h = context.matrix_h;
-
-  // create an array of complex noise
-  xgpuRandomComplex(array_h, xgpu_info.vecLength);
-
-  xgpuSwizzleInput(context.array_h, array_h);
-
-  // try copying to GPU
-  ComplexInput *array_hd;
-  cudaMalloc((void **)&array_hd, context.array_len*sizeof(ComplexInput));
-  cudaMemcpy(array_hd,context.array_h,context.array_len*sizeof(ComplexInput),cudaMemcpyHostToDevice);
-
-  // ompXengine always uses TRIANGULAR_ORDER
-  unsigned int ompMatLength = nfrequency * ((nstation+1)*(nstation/2)*npol*npol);
-  omp_matrix_h = (Complex *) malloc(ompMatLength*sizeof(Complex));
-  if(!omp_matrix_h) {
-    fprintf(stderr, "error allocating output buffer for xgpuOmpXengine\n");
-    goto cleanup;
-  }
-
-#if (CUBE_MODE == CUBE_DEFAULT && !defined(POWER_LOOP) )
-  // Only call CPU X engine if dumping GPU X engine exactly once
-  if(finalSyncOp == SYNCOP_DUMP && count*outer_count == 1) {
-    printf("Calling CPU X-Engine\n");
-    xgpuOmpXengine(omp_matrix_h, array_h);
-  }
-#endif
-
-#define ELAPSED_MS(start,stop) \
-  ((((int64_t)stop.tv_sec-start.tv_sec)*1000*1000*1000+(stop.tv_nsec-start.tv_nsec))/1e6)
-
-  printf("Calling GPU X-Engine\n");
-  clock_gettime(CLOCK_MONOTONIC, &outer_start);
-  for(j=0; j<outer_count; j++) {
-    clock_gettime(CLOCK_MONOTONIC, &start);
-    for(i=0; i<count; i++) {
-#ifdef RUNTIME_STATS
-      clock_gettime(CLOCK_MONOTONIC, &tic);
-#endif
-      //xgpu_error = xgpuCudaXengine(&context, array_hd, i==count-1 ? finalSyncOp : syncOp);
-      xgpu_error = xgpuCudaXengine(&context, i==count-1 ? finalSyncOp : syncOp);
-#ifdef RUNTIME_STATS
-      clock_gettime(CLOCK_MONOTONIC, &toc);
-#endif
-      if(xgpu_error) {
-        fprintf(stderr, "xgpuCudaXengine returned error code %d\n", xgpu_error);
-        goto cleanup;
-      }
-#ifdef RUNTIME_STATS
-      fprintf(stderr, "%11.6f  %11.6f ms%s\n",
-          ELAPSED_MS(start,tic), ELAPSED_MS(tic,toc),
-          i==count-1 ? " final" : "");
-#endif
-    }
-    clock_gettime(CLOCK_MONOTONIC, &stop);
-    total = ELAPSED_MS(start,stop);
-    per_call = total/count;
-    // per_spectrum = per_call / NTIME
-    // per_channel = per_spectrum / NFREQUENCY
-    //             = per_call / (NTIME * NFREQUENCY)
-    // max_bw (kHz)  = 1 / per_channel = (NTIME * NFREQUENCY) / per_call
-    max_bw = xgpu_info.ntime*xgpu_info.nfrequency/per_call/1000; // MHz
-    gbps = ((float)(8 * context.array_len * sizeof(ComplexInput) * count)) / total / 1e6; // Gbps
-    printf("Elapsed time %.6f ms total, %.6f ms/call average\n",
-        total, per_call);
-    printf("Theoretical BW_max %.3f MHz, throughput %.3f Gbps\n",
-        max_bw, gbps);
-  }
-  if(outer_count > 1) {
-    clock_gettime(CLOCK_MONOTONIC, &outer_stop);
-    total = ELAPSED_MS(outer_start,outer_stop);
-    per_call = total/(count*outer_count);
-    // per_spectrum = per_call / NTIME
-    // per_channel = per_spectrum / NFREQUENCY
-    //             = per_call / (NTIME * NFREQUENCY)
-    // max_bw (kHz)  = 1 / per_channel = (NTIME * NFREQUENCY) / per_call
-    max_bw = xgpu_info.ntime*xgpu_info.nfrequency/per_call/1000; // MHz
-    gbps = ((float)(8 * context.array_len * sizeof(ComplexInput) * count * outer_count)) / total / 1e6; // Gbps
-    printf("Elapsed time %.6f ms total, %.6f ms/call average\n",
-        total, per_call);
-    printf("Theoretical BW_max %.3f MHz, throughput %.3f Gbps\n",
-        max_bw, gbps);
-  }
-
-#if (CUBE_MODE == CUBE_DEFAULT)
-  
-  // Only compare CPU and GPU X engines if dumping GPU X engine exactly once
-  if(finalSyncOp == SYNCOP_DUMP && count*outer_count == 1) {
-    xgpuReorderMatrix(cuda_matrix_h);
-    xgpuCheckResult(cuda_matrix_h, omp_matrix_h, verbose, array_h);
-  }
-
-#if 0
-  int fullMatLength = nfrequency * nstation*nstation*npol*npol;
-  Complex *full_matrix_h = (Complex *) malloc(fullMatLength*sizeof(Complex));
-
-  // convert from packed triangular to full matrix
-  xgpuExtractMatrix(full_matrix_h, cuda_matrix_h);
-
-  free(full_matrix_h);
-#endif
-#endif
-
-cleanup:
-  //free host memory
-  free(omp_matrix_h);
-
-  // free gpu memory
-  xgpuFree(&context);
-  cudaFree(array_hd);
-
-#ifdef DP4A
-  free(array_h);
-#endif
-
-  /*  if(hostAlloc) {
-    free(context.array_h);
-    free(context.matrix_h);
-    }*/
-
-  return xgpu_error;
-}
diff --git a/src/dsaX_dbnic.c b/src/dsaX_dbnic.c
deleted file mode 100644
index 83e3e4a..0000000
--- a/src/dsaX_dbnic.c
+++ /dev/null
@@ -1,435 +0,0 @@
-/* simple nicdb
-
-will work on NBMS/NBEAMS_PER_BLOCK writers, ip addresses set in code for now  
-
-*/
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-
-// data to pass to threads
-struct data {
-  char * out;
-  int sockfd;
-  struct sockaddr_in si_other;
-  int thread_id;
-  int chgroup;
-  int tseq;
-};
-
-/* global variables */
-int DEBUG = 0;
-int TEST = 0;
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in)
-{
-
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
-  dada_hdu_destroy (in);
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_dbnic [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -g chgroup [default 0]\n"
-	   " -d send debug messages to syslog\n"
-	   " -t TEST\n"
-	   " -i in_key [default BF_BLOCK_KEY]\n"
-	   " -w -x -y -z four ip addresses for corner turn\n"
-	   " -h print usage\n");
-}
-
-/* thread for data transmission */
-void * transmit(void *args) {
-
-  // basic stuff
-  struct data *d = args;
-  int thread_id = d->thread_id;
-  int sockfd = d->sockfd;
-  struct sockaddr_in si_other = d->si_other;
-  char * output = (char *)(d->out);
-  int chgroup = d->chgroup;
-  int tseq = d->tseq;
-  char * packet = (char *)malloc(sizeof(char)*P_SIZE);
-  int * ipacket = (int *)(packet);
-
-
-  // for test packet
-  if (tseq==-1) {
-
-    ipacket[0] = chgroup;
-    sendto(sockfd,packet,P_SIZE,0,(struct sockaddr *)&si_other,sizeof(si_other));
-
-  }
-  else {
-  
-    // fill op, doing transpose
-    char * op = (char *)malloc(sizeof(char)*(NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW));
-    //iop[0] = chgroup;
-    //iop[1] = tseq;
-    for (int i=0;i<NSAMPS_PER_TRANSMIT;i++) {
-      for (int j=0;j<NBEAMS_PER_BLOCK;j++) {
-	for (int k=0;k<NW;k++) 
-	  // op[8+i*NBEAMS_PER_BLOCK*NW+j*NW+k] = output[i*NBMS*NW + thread_id*NBEAMS_PER_BLOCK*NW + j*NW+k]; // no transpose
-	  op[j*NSAMPS_PER_TRANSMIT*NW+i*NW+k] = output[i*NBMS*NW + thread_id*NBEAMS_PER_BLOCK*NW + j*NW+k]; // yes transpose
-      }
-    }
-
-    if (DEBUG) syslog(LOG_INFO,"sending with chgroup %d tseq %d",chgroup,tseq);
-
-    // do transmit
-    // each packet is 12 bytes of header plus 8192 bytes of data
-    int val;
-    for (int i=0;i<NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW/(P_SIZE-12);i++) {
-
-      ipacket[0] = chgroup;
-      ipacket[1] = tseq;
-      ipacket[2] = i;
-      memcpy(packet+12,op+i*(P_SIZE-12),P_SIZE-12);
-      sendto(sockfd,packet,P_SIZE,0,(struct sockaddr *)&si_other,sizeof(si_other));
-
-      //for (int ti=0;ti<NWAIT;ti++) val = ti*ti;
-      usleep(180);
-      
-    }
-    
-    if (DEBUG) syslog(LOG_INFO,"thread %d: written output",thread_id);
-
-    free(op);
-
-  }
-  
-  /* return 0 */
-  free(packet);
-  int thread_result = 0;
-  pthread_exit((void *) &thread_result);
-  
-}
-
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-  
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_dbnic", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-
-  // threads
-  struct data args[4];
-  pthread_t threads[4];
-  pthread_attr_t attr;
-  pthread_attr_init(&attr);
-  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-  void* result=0;
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  
-  // command line arguments
-  int core = -1;
-  int chgroup = 0;
-  int arg = 0;
-  char iP[4][20] = {"10.41.0.114", "10.41.0.87", "10.41.0.66", "10.41.0.60"}; 
-  // data block HDU keys
-  key_t in_key;
-  in_key = BF_BLOCK_KEY;
-
-  
-  while ((arg=getopt(argc,argv,"c:g:ti:w:x:y:z:dh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'g':
-	  if (optarg)
-	    {
-	      chgroup = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'w':
-	  strcpy(iP[0],optarg);
-	  break;
-	case 'x':
-	  strcpy(iP[1],optarg);
-	  break;
-	case 'y':
-	  strcpy(iP[2],optarg);
-	  break;
-	case 'z':
-	  strcpy(iP[3],optarg);
-	  break;
-	case 't':
-	  TEST=1;
-	  syslog (LOG_INFO, "Will use test pattern");
-	  break;
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-  	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-  
-  
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in);
-      return EXIT_FAILURE;
-    }
-
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %lu",block_size);
-  uint64_t  bytes_read = 0;
-  char *block;
-  uint64_t written, block_id;
-
-  
-  // set up
-  int observation_complete=0;
-  int blocks = 0;
-  int started = 0;
-  int nthreads = NBMS / NBEAMS_PER_BLOCK;
-  
-  
-  // create socket connections
-  int sockfd[nthreads];
-  struct sockaddr_in servaddr[nthreads];
-
-  for (int i=0;i<nthreads;i++) sockfd[i] = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
-  for (int i=0;i<nthreads;i++) {
-    memset((char *) &servaddr[i], 0, sizeof(servaddr[i]));
-    servaddr[i].sin_family = AF_INET;
-    servaddr[i].sin_addr.s_addr = inet_addr(iP[i]);
-    servaddr[i].sin_port = htons(FIL_PORT0+(uint16_t)(chgroup));
-  }
-  if (DEBUG) syslog(LOG_INFO,"sockets created");  
-
-  // send test packets
-
-  // put together args
-  for (int i=0; i<nthreads; i++) {
-    args[i].sockfd = sockfd[i];
-    args[i].si_other = servaddr[i];
-    args[i].thread_id = i;
-    args[i].chgroup = chgroup;
-    args[i].tseq = -1;
-  }
-  
-  for(int i=0; i<nthreads; i++){
-    if (pthread_create(&threads[i], &attr, &transmit, (void *)(&args[i]))) {
-      syslog(LOG_ERR,"Failed to create massage thread %d", i);
-    }
-  }
-  
-  pthread_attr_destroy(&attr);
-  
-  for(int i=0; i<nthreads; i++){
-    pthread_join(threads[i], &result);
-  }
-  
-  syslog(LOG_INFO,"Sent test packets");
-  
-  /*
-  for (int i=0;i<nthreads;i++) sockfd[i] = socket(AF_INET, SOCK_STREAM, 0);
-  if (DEBUG) syslog(LOG_DEBUG,"sockets created");
-  for (int i=0;i<nthreads;i++) {
-    bzero(&servaddr, sizeof(servaddr));
-    servaddr.sin_family = AF_INET;
-    servaddr.sin_addr.s_addr = inet_addr(iP[i]);
-    servaddr.sin_port = htons(FIL_PORT0+(uint16_t)(chgroup));
-    if (connect(sockfd[i], (struct sockaddr *)&servaddr, sizeof(servaddr)) != 0) {
-      syslog(LOG_ERR,"connection with the server failed %d",i);
-      exit(0);
-    }
-    if (DEBUG) syslog(LOG_DEBUG,"connected %d",i);
-    }*/
-  
-  syslog(LOG_INFO, "starting observation");
-
-  /*
-  block has size/shape [NSAMPS_PER_TRANSMIT, NBMS, NW]
-  want to transmit [NBEAMS_PER_BLOCK, NSAMPS_PER_TRANSMIT, NW]
-  for test tone, populate with chgroup*10 + beam*NBMS/NBEAMS_PER_BLOCK + time*2/NSAMPS_PER_TRANSMIT
-  */
-  unsigned char * testblock = (unsigned char *)malloc(sizeof(unsigned char)*block_size);
-  for (int i=0;i<NSAMPS_PER_TRANSMIT;i++) {
-    for (int j=0;j<NBMS;j++) {
-      for (int k=0;k<NW;k++)
-	testblock[i*NBMS*NW + j*NW + k] = (unsigned char)(i/2);
-    }
-  }
-  
-  
-  while (!observation_complete) {
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-    }
-
-    // DO STUFF
-
-    //if (DEBUG) syslog(LOG_DEBUG,"creating %d threads",nthreads);
-
-    // put together args
-    for (int i=0; i<nthreads; i++) {
-      if (TEST) args[i].out = testblock;
-      else args[i].out = block;
-      args[i].sockfd = sockfd[i];
-      args[i].si_other = servaddr[i];
-      args[i].thread_id = i;
-      args[i].chgroup = chgroup;
-      args[i].tseq = blocks;
-    }
-    
-    for(int i=0; i<nthreads; i++){
-      if (pthread_create(&threads[i], &attr, &transmit, (void *)(&args[i]))) {
-	syslog(LOG_ERR,"Failed to create massage thread %d", i);
-      }
-    }
-
-    pthread_attr_destroy(&attr);
-    //if (DEBUG) syslog(LOG_DEBUG,"threads kinda running");
-    
-    for(int i=0; i<nthreads; i++){
-      pthread_join(threads[i], &result);
-      //if (DEBUG) syslog(LOG_DEBUG,"joined thread %d",i);
-    }
-
-    if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks);      
-    blocks++;
-    
-
-    if (bytes_read < block_size)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  for (int i=0;i<nthreads;i++) close(sockfd[i]);
-  free(testblock);
-  dsaX_dbgpu_cleanup (hdu_in);
-  
-}
-
-
diff --git a/src/dsaX_dbnic.c.bak b/src/dsaX_dbnic.c.bak
deleted file mode 100644
index 366f4c8..0000000
--- a/src/dsaX_dbnic.c.bak
+++ /dev/null
@@ -1,381 +0,0 @@
-/* simple nicdb
-
-will work on NBMS/NBEAMS_PER_BLOCK writers, ip addresses set in code for now  
-
-*/
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-
-// data to pass to threads
-struct data {
-  char * out;
-  int sockfd; 
-  int thread_id;
-  int chgroup;
-  int tseq;
-};
-
-/* global variables */
-int DEBUG = 0;
-int TEST = 0;
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in)
-{
-
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
-  dada_hdu_destroy (in);
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_dbnic [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -g chgroup [default 0]\n"
-	   " -d send debug messages to syslog\n"
-	   " -t TEST\n"
-	   " -i in_key [default BF_BLOCK_KEY]\n"
-	   " -w -x -y -z four ip addresses for corner turn\n"
-	   " -h print usage\n");
-}
-
-/* thread for data transmission */
-void * transmit(void *args) {
-
-  // basic stuff
-  struct data *d = args;
-  int thread_id = d->thread_id;
-  int sockfd = d->sockfd; 
-  char * output = (char *)(d->out);
-  char * op = (char *)malloc(sizeof(char)*(8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW));
-  int * iop = (int *)(op);
-  int chgroup = d->chgroup;
-  int tseq = d->tseq;
-
-  // fill op, doing transpose
-  iop[0] = chgroup;
-  iop[1] = tseq;
-  for (int i=0;i<NSAMPS_PER_TRANSMIT;i++) {
-    for (int j=0;j<NBEAMS_PER_BLOCK;j++) {
-      for (int k=0;k<NW;k++) 
-	// op[8+i*NBEAMS_PER_BLOCK*NW+j*NW+k] = output[i*NBMS*NW + thread_id*NBEAMS_PER_BLOCK*NW + j*NW+k]; // no transpose
-	op[8+j*NSAMPS_PER_TRANSMIT*NW+i*NW+k] = output[i*NBMS*NW + thread_id*NBEAMS_PER_BLOCK*NW + j*NW+k]; // yes transpose
-    }
-  }
-
-  if (DEBUG) syslog(LOG_DEBUG,"sending with chgroup %d tseq %d",iop[0],iop[1]);
-  
-  // do transmit
-  int remain_data = (int)((8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW));
-  int sent_bytes = 0, sbytes;
-  /*while (((sbytes = send(sockfd, op + sent_bytes, remain_data, 0))>0) && (remain_data > 0)) {
-    remain_data -= sbytes;
-    sent_bytes += sbytes;
-    }*/
-  sbytes = send(sockfd, op, remain_data, 0);
-  if (sbytes<remain_data)
-    syslog(LOG_ERR,"thread %d: only sent %d of %d",thread_id,sbytes,remain_data);
-
-  
-
-
-  //  write(sockfd, op, sizeof(op));
-
-  if (DEBUG) syslog(LOG_DEBUG,"thread %d: written output",thread_id);
-  
-  /* return 0 */
-  free(op);
-  int thread_result = 0;
-  pthread_exit((void *) &thread_result);
-  
-}
-
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-  
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_dbnic", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-
-  // threads
-  struct data args[4];
-  pthread_t threads[4];
-  pthread_attr_t attr;
-  pthread_attr_init(&attr);
-  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-  void* result=0;
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  
-  // command line arguments
-  int core = -1;
-  int chgroup = 0;
-  int arg = 0;
-  char iP[4][20] = {"10.41.0.22", "10.41.0.98", "10.41.0.105", "10.41.0.63"}; 
-  // data block HDU keys
-  key_t in_key;
-  in_key = BF_BLOCK_KEY;
-
-  
-  while ((arg=getopt(argc,argv,"c:g:ti:w:x:y:z:dh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'g':
-	  if (optarg)
-	    {
-	      chgroup = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'w':
-	  strcpy(iP[0],optarg);
-	  break;
-	case 'x':
-	  strcpy(iP[1],optarg);
-	  break;
-	case 'y':
-	  strcpy(iP[2],optarg);
-	  break;
-	case 'z':
-	  strcpy(iP[3],optarg);
-	  break;
-	case 't':
-	  TEST=1;
-	  syslog (LOG_INFO, "Will use test pattern");
-	  break;
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-  	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-  
-  
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create ();
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in);
-      return EXIT_FAILURE;
-    }
-
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu",block_size);
-  uint64_t  bytes_read = 0;
-  char *block;
-  uint64_t written, block_id;
-
-  
-  // set up
-  int observation_complete=0;
-  int blocks = 0;
-  int started = 0;
-  int nthreads = NBMS / NBEAMS_PER_BLOCK;
-  
-  
-  // create socket connections
-  int sockfd[nthreads];
-  struct sockaddr_in servaddr;
-  for (int i=0;i<nthreads;i++) sockfd[i] = socket(AF_INET, SOCK_STREAM, 0);
-  if (DEBUG) syslog(LOG_DEBUG,"sockets created");
-  for (int i=0;i<nthreads;i++) {
-    bzero(&servaddr, sizeof(servaddr));
-    servaddr.sin_family = AF_INET;
-    servaddr.sin_addr.s_addr = inet_addr(iP[i]);
-    servaddr.sin_port = htons(FIL_PORT0+(uint16_t)(chgroup));
-    if (connect(sockfd[i], (struct sockaddr *)&servaddr, sizeof(servaddr)) != 0) {
-      syslog(LOG_ERR,"connection with the server failed %d",i);
-      exit(0);
-    }
-    if (DEBUG) syslog(LOG_DEBUG,"connected %d",i);
-  }
-  
-  syslog(LOG_INFO, "starting observation");
-
-  /*
-  block has size/shape [NSAMPS_PER_TRANSMIT, NBMS, NW]
-  want to transmit [NBEAMS_PER_BLOCK, NSAMPS_PER_TRANSMIT, NW]
-  for test tone, populate with chgroup*10 + beam*NBMS/NBEAMS_PER_BLOCK + time*2/NSAMPS_PER_TRANSMIT
-  */
-  unsigned char * testblock = (unsigned char *)malloc(sizeof(unsigned char)*block_size);
-  for (int i=0;i<NSAMPS_PER_TRANSMIT;i++) {
-    for (int j=0;j<NBMS;j++) {
-      for (int k=0;k<NW;k++)
-	testblock[i*NBMS*NW + j*NW + k] = (unsigned char)(i/2);
-    }
-  }
-  
-  
-  while (!observation_complete) {
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-    }
-
-    // DO STUFF
-
-    //if (DEBUG) syslog(LOG_DEBUG,"creating %d threads",nthreads);
-
-    // put together args
-    for (int i=0; i<nthreads; i++) {
-      if (TEST) args[i].out = testblock;
-      else args[i].out = block;
-      args[i].sockfd = sockfd[i];
-      args[i].thread_id = i;
-      args[i].chgroup = chgroup;
-      args[i].tseq = blocks;
-    }
-    
-    for(int i=0; i<nthreads; i++){
-      if (pthread_create(&threads[i], &attr, &transmit, (void *)(&args[i]))) {
-	syslog(LOG_ERR,"Failed to create massage thread %d\n", i);
-      }
-    }
-
-    pthread_attr_destroy(&attr);
-    //if (DEBUG) syslog(LOG_DEBUG,"threads kinda running");
-    
-    for(int i=0; i<nthreads; i++){
-      pthread_join(threads[i], &result);
-      //if (DEBUG) syslog(LOG_DEBUG,"joined thread %d",i);
-    }
-
-    if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks);      
-    blocks++;
-    
-
-    if (bytes_read < block_size)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  for (int i=0;i<nthreads;i++) close(sockfd[i]);
-  free(testblock);
-  dsaX_dbgpu_cleanup (hdu_in);
-  
-}
-
-
diff --git a/src/dsaX_fake.c b/src/dsaX_fake.c
deleted file mode 100644
index 662ea37..0000000
--- a/src/dsaX_fake.c
+++ /dev/null
@@ -1,320 +0,0 @@
-/* will reorder raw data for input to xgpu */
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-// global variables
-int DEBUG = 0;
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
-{
-
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
-  dada_hdu_destroy (in);
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock write on hdu_out");
-    }
-  dada_hdu_destroy (out);
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_fake [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -f file to read packet from [default none]\n"
-	   " -i in_key [default TEST_BLOCK_KEY]\n"
-	   " -o out_key [default REORDER_BLOCK_KEY2]\n"
-	   " -h print usage\n");
-}
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_fake", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-
-  // data block HDU keys
-  key_t in_key = TEST_BLOCK_KEY;
-  key_t out_key = REORDER_BLOCK_KEY2;
-  
-  // command line arguments
-  int core = -1;
-  int useZ = 1;
-  char fnam[100];
-  int arg = 0;
-  
-  while ((arg=getopt(argc,argv,"c:f:i:o:dh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'f':
-	  if (optarg)
-	    {
-	      useZ = 0;
-	      strcpy(fnam,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-f flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
-  uint64_t  bytes_read = 0;
-  uint64_t npackets = block_out / 4608;
-  char * block, * output_buffer;
-  char * packet;
-  packet = (char *)malloc(sizeof(char)*4608);
-  output_buffer = (char *)malloc(sizeof(char)*block_out);
-  memset(output_buffer,0,block_out);
-  uint64_t written, block_id;
-
-  // fill output buffer if file exists
-  FILE *fin;
-  if (!useZ) {
-
-    if (!(fin=fopen(fnam,"rb"))) {
-      syslog(LOG_ERR, "cannot open file - will write zeros");
-    }
-    else {
-
-      fread(packet,4608,1,fin);
-      fclose(fin);
-
-      syslog(LOG_INFO,"Read packet, npackets %lu",npackets);
-      
-      for (int i=0;i<npackets;i++)
-	memcpy(output_buffer+i*4608,packet,4608);
-
-      syslog(LOG_INFO, "Using input packet");
-      
-    }
-
-    
-  }
-
-  // set up
-
-  int observation_complete=0;
-  int blocks = 0, started = 0;
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-    }
-
-    // DO STUFF
-    // no need to do anything here - output_buffer is ready to go
-
-    // write to output
-    written = ipcio_write (hdu_out->data_block, output_buffer, block_out);
-    if (written < block_out)
-      {
-	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
-	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-	return EXIT_FAILURE;
-      }
-
-    if (DEBUG) {
-      syslog(LOG_DEBUG, "written block %d",blocks);      
-    }
-    blocks++;
-    
-
-    if (bytes_read < block_size)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  free(packet);
-  free(output_buffer);
-  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-  
-}
-
-
diff --git a/src/dsaX_filTrigger.c b/src/dsaX_filTrigger.c
deleted file mode 100644
index 55f95fd..0000000
--- a/src/dsaX_filTrigger.c
+++ /dev/null
@@ -1,559 +0,0 @@
-/* Code to read from a single dada buffer, and write to disk upon receiving
-a trigger. Uses pthread threads and shared memory to listen. 
-Sequence of events:
- - starts null-reading dump buffer, while listening for socket command
-   + for N second dump, assume N-second dada blocks
- - receives time-since-start, which is converted into a block_start, byte_start, and block_end and byte_end. Sets dump pending, during which time no commands can be accepted. 
- - Upon seeing dump_pending, read code copies data to output dada buffer, which is plugged into dbdisk. Unsets dump_pending.
-*/
-
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-#include "dsaX_capture.h"
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_def.h"
-
-/* global variables */
-int quit_threads = 0;
-int dump_pending = 0;
-uint64_t specnum = 0;
-uint64_t next_specnum = 0;
-uint64_t procnum = 0;
-int trignum = 0;
-int dumpnum = 0;
-char iP[100];
-char footer_buf[1024];
-char next_footer_buf[1024];
-int DEBUG = 0;
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in);
-int dada_bind_thread_to_core (int core);
-
-FILE *output;
-
-void send_string(char *string) /* includefile */
-{
-  int len;
-  len=strlen(string);
-  fwrite(&len, sizeof(int), 1, output);
-  fwrite(string, sizeof(char), len, output);
-}
-
-void send_float(char *name,float floating_point) /* includefile */
-{
-  send_string(name);
-  fwrite(&floating_point,sizeof(float),1,output);
-}
-
-void send_double (char *name, double double_precision) /* includefile */
-{
-  send_string(name);
-  fwrite(&double_precision,sizeof(double),1,output);
-}
-
-void send_int(char *name, int integer) /* includefile */
-{
-  send_string(name);
-  fwrite(&integer,sizeof(int),1,output);
-}
-
-void send_char(char *name, char integer) /* includefile */
-{
-  send_string(name);
-  fwrite(&integer,sizeof(char),1,output);
-}
-
-
-void send_long(char *name, long integer) /* includefile */
-{
-  send_string(name);
-  fwrite(&integer,sizeof(long),1,output);
-}
-
-void send_coords(double raj, double dej, double az, double za) /*includefile*/
-{
-  if ((raj != 0.0) || (raj != -1.0)) send_double("src_raj",raj);
-  if ((dej != 0.0) || (dej != -1.0)) send_double("src_dej",dej);
-  if ((az != 0.0)  || (az != -1.0))  send_double("az_start",az);
-  if ((za != 0.0)  || (za != -1.0))  send_double("za_start",za);
-}
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in)
-{
-  
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
-  dada_hdu_destroy (in);
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_filTrigger [options]\n"
-	   " -c core   bind process to CPU core\n"
-	   " -i IP to listen to [no default]\n"
-	   " -j in_key [default eaea]\n"
-	   " -d debug\n"
-	   " -n output file name base [no default]\n"
-	   " -b beam number of first beam [default 0]\n"
-	   " -z respond to zero specnum\n"
-	   " -h print usage\n");
-}
-
-
-// Thread to control the dumping of data
-
-void control_thread (void * arg) {
-
-  udpdb_t * ctx = (udpdb_t *) arg;
-  syslog(LOG_INFO, "control_thread: starting");
-
-  // port on which to listen for control commands
-  int port = ctx->control_port;
-
-  // buffer for incoming command strings, and setup of socket
-  int bufsize = 1024;
-  char* buffer = (char *) malloc (sizeof(char) * bufsize);
-  char* tbuf = (char *) malloc (sizeof(char) * bufsize);
-  memset(buffer, '\0', bufsize);
-  const char* whitespace = " ";
-  char * command = 0;
-  char * args = 0;
-
-  struct addrinfo hints;
-  struct addrinfo* res=0;
-  memset(&hints,0,sizeof(hints));
-  struct sockaddr_storage src_addr;
-  socklen_t src_addr_len=sizeof(src_addr);
-  hints.ai_family=AF_INET;
-  hints.ai_socktype=SOCK_DGRAM;
-  getaddrinfo(iP,"11227",&hints,&res);
-  int fd;
-  ssize_t ct;
-  char tmpstr;
-  char cmpstr = 'p';
-  char *endptr;
-  uint64_t tmps;
-  char * token;
-  
-  syslog(LOG_INFO, "control_thread: created socket on port %d", port);
-  
-  while (!quit_threads) {
-    
-    fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol);
-    bind(fd,res->ai_addr,res->ai_addrlen);
-    memset(buffer,'\0',sizeof(buffer));
-    syslog(LOG_INFO, "control_thread: waiting for packet");
-    ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len);
-    
-    syslog(LOG_INFO, "control_thread: received buffer string %s",buffer);
-    strcpy(tbuf,buffer);
-    trignum++;
-
-    // interpret buffer string
-    char * rest = buffer;
-    char tnam[100];
-    tmps = (uint64_t)(strtoull(strtok_r(rest, "-", &rest),&endptr,0));
-    strcpy(tnam,strtok_r(rest, "-", &rest));
-    
-    if (!dump_pending) {
-      //specnum = (uint64_t)(strtoull(buffer,&endptr,0)*16);
-      specnum = tmps/4;
-      strcpy(footer_buf,tnam);
-      syslog(LOG_INFO, "control_thread: received command to dump at %lu src %s",specnum,footer_buf);
-    }
-	
-    if (dump_pending) {
-      syslog(LOG_ERR, "control_thread: BACKED UP - using %lu src %s as next specnum",tmps,tnam);
-      next_specnum = tmps/4;
-      strcpy(next_footer_buf,tnam);
-    }
-  
-    if (!dump_pending) dump_pending = 1;
-    
-    close(fd);
-    
-  }
-
-  free (buffer);
-  free (tbuf);
-
-  if (ctx->verbose)
-    syslog(LOG_INFO, "control_thread: exiting");
-
-  /* return 0 */
-  int thread_result = 0;
-  pthread_exit((void *) &thread_result);
-
-}
-	    
-
-	
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_filTrigger", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-
-  /* port for control commands */
-  int control_port = TRIGGER_CONTROL_PORT;
-
-  /* actual struct with info */
-  udpdb_t udpdb;
-  
-  // input data block HDU key
-  key_t in_key = 0x0000eaea;
-
-  // command line arguments
-  int core = -1;
-  int beamn = 0;
-  char of[200];
-  char foutnam[300];
-  char dirnam[300];
-  int rz=0;
-  int arg=0;
-
-  while ((arg=getopt(argc,argv,"i:c:j:db:n:hz")) != -1)
-    {
-      switch (arg)
-	{
-	case 'i':
-	  strcpy(iP,optarg);
-	  break;
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog (LOG_ERR,"ERROR: -c flag requires argument\n");
-	      return EXIT_FAILURE;
-	    }
-	case 'b':
-	  if (optarg)
-	    {
-	      beamn = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog (LOG_ERR,"ERROR: -b flag requires argument\n");
-	      return EXIT_FAILURE;
-	    }
-	case 'n':
-	  if (optarg)
-	    {
-	      strcpy(of,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog (LOG_ERR,"ERROR: -n flag requires argument\n");
-	      return EXIT_FAILURE;
-	    }
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_INFO, "Will excrete all debug messages");
-	  break;
-	case 'z':
-	  rz=1;
-	  syslog (LOG_INFO, "Will respond to zero trigger");
-	  break;
-	case 'j':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-j flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // DADA stuff
-  
-  udpdb.verbose = DEBUG;
-  udpdb.control_port = control_port;
-  
-  // start control thread
-  int rval = 0;
-  pthread_t control_thread_id;
-  syslog(LOG_INFO, "starting control_thread()");
-  rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb);
-  if (rval != 0) {
-    syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval));
-    return -1;
-  }
-
-  
-  syslog (LOG_INFO, "creating hdus");
-
-  // open connection to the in/read DBs
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer");
-    return EXIT_FAILURE;
-  }
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      syslog(LOG_INFO,"binding to core %d", core);
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-    }
-
-  int observation_complete=0;
-  
-  // more DADA stuff - deal with headers
-  
-  uint64_t header_size = 0;
-
-  // read the header from the input HDU
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "main: could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in);
-      return EXIT_FAILURE;
-    }
-
-  // mark the input header as cleared
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared [input]");
-      dsaX_dbgpu_cleanup (hdu_in);
-      return EXIT_FAILURE;
-    }
-
-  
-  // stuff for writing data
-  /*
-    Data will have [64 beam, time, freq] for each block.
-    Need to extract 
-   */
-
-
-  
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  unsigned char * extData = (unsigned char *)malloc(sizeof(unsigned char)*NSAMPS_PER_BLOCK*NCHAN_FIL*NBEAMS_PER_BLOCK);
-  uint64_t specs_per_block = NSAMPS_PER_BLOCK;
-  uint64_t current_specnum = 0; // updates with each dada block read
-  uint64_t start_byte, bytes_to_copy, bytes_copied=0;
-  char * in_data;
-  uint64_t written=0;
-  uint64_t block_id, bytes_read=0;
-  int dumping = 0;
-  FILE *ofile;
-  ofile = fopen("/home/ubuntu/data/dumps.dat","a");
-  fprintf(ofile,"starting...\n");
-  fclose(ofile);
-
-
-  // main reading loop
-  float pc_full = 0.;
-  
-  syslog(LOG_INFO, "main: starting observation");
-
-  while (!observation_complete) {
-    
-    // read a DADA block
-    in_data = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-    
-    // add delay
-    // only proceed if input data block is 80% full
-    while (pc_full < 0.8) {
-      pc_full = ipcio_percent_full(hdu_in->data_block);
-      usleep(100);
-    }
-    pc_full = 0.;
-    
-    
-    // check for dump_pending
-    if (dump_pending) {
-      
-      // look after hand trigger
-      if (specnum==0 && rz==1) {
-	
-	specnum = current_specnum + 40000;
-	
-      }
-      
-      // if this is the first block to dump
-      if (specnum > current_specnum && specnum < current_specnum+specs_per_block) {
-	
-	dumping = 1;
-	syslog(LOG_INFO,"dumping is 1 -- first block");
-	
-	// loop over beams
-	bytes_to_copy = (NSAMPS_PER_BLOCK-(specnum-current_specnum))*NCHAN_FIL;
-	bytes_copied = bytes_to_copy;
-	for (int i=0;i<NBEAMS_PER_BLOCK;i++) {
-	  
-	  start_byte = i*NSAMPS_PER_BLOCK*NCHAN_FIL + (specnum-current_specnum)*NCHAN_FIL;
-	  memcpy(extData + i*NSAMPS_PER_BLOCK*NCHAN_FIL, in_data + start_byte, bytes_to_copy);
-	  
-	}
-	
-      }
-      
-      // if this is the last block to dump from
-      if (specnum + NSAMPS_PER_BLOCK > current_specnum && specnum + NSAMPS_PER_BLOCK <= current_specnum + specs_per_block && dumping==1) {	  
-
-	syslog(LOG_INFO,"in second block");
-	
-	// loop over beams
-	bytes_to_copy = NSAMPS_PER_BLOCK*NCHAN_FIL-bytes_copied;
-	for (int i=0;i<NBEAMS_PER_BLOCK;i++) {
-	  
-	  start_byte = i*NSAMPS_PER_BLOCK*NCHAN_FIL;
-	  memcpy(extData + i*NSAMPS_PER_BLOCK*NCHAN_FIL + bytes_copied, in_data + start_byte, bytes_to_copy);
-	  
-	}
-
-	syslog(LOG_INFO,"finished copying");
-	
-	// DO THE WRITING
-
-	sprintf(dirnam,"mkdir -p %s/%s",of,footer_buf);
-	system(dirnam);
-	
-	for (int i=0;i<NBEAMS_PER_BLOCK;i++) {
-	  
-	  sprintf(foutnam,"%s/%s/%s_%d.fil",of,footer_buf,footer_buf,beamn+i);
-	  output = fopen(foutnam,"wb");
-	  
-	  send_string("HEADER_START");
-	  send_string("source_name");
-	  send_string(footer_buf);
-	  send_int("machine_id",1);
-	  send_int("telescope_id",82);
-	  send_int("data_type",1); // filterbank data
-	  send_double("fch1",1530.0); // THIS IS CHANNEL 0 :)
-	  send_double("foff",-0.244140625);
-	  send_int("nchans",1024);
-	  send_int("nbits",8);
-	  send_double("tstart",55000.0);
-	  send_double("tsamp",8.192e-6*8.*4.);
-	  send_int("nifs",1);
-	  send_string("HEADER_END");
-	  
-	  fwrite(extData + i*NSAMPS_PER_BLOCK*NCHAN_FIL,sizeof(unsigned char),NSAMPS_PER_BLOCK*NCHAN_FIL,output);
-	  
-	  fclose(output);
-	  
-	}
-	
-	syslog(LOG_INFO, "written trigger from specnum %lu TRIGNUM%d DUMPNUM%d %s", specnum, trignum-1, dumpnum, footer_buf);
-	ofile = fopen("/home/ubuntu/data/dumps.dat","a");
-	fprintf(ofile,"written trigger from specnum %lu TRIGNUM%d DUMPNUM%d %s\n", specnum, trignum-1, dumpnum, footer_buf);
-	fclose(ofile);
-	
-	dumpnum++;
-	
-	// reset
-	bytes_copied = 0;
-	dump_pending = 0;
-	dumping=0;
-
-	// deal with next specnum
-	if (next_specnum != 0) {
-	  specnum = next_specnum;
-	  strcpy(footer_buf,next_footer_buf);
-	  next_specnum = 0;
-	  dump_pending = 1;
-	}
-	
-      }
-      
-      // if trigger arrived too late
-      if (specnum < current_specnum-specs_per_block && dumping==0 && dump_pending==1) {
-	syslog(LOG_INFO, "trigger arrived too late: specnum %lu, current_specnum %lu",specnum,current_specnum);
-	
-	bytes_copied=0;
-	dump_pending=0;
-	
-      }
-      
-      
-    }
-    
-    // update current spec
-    if (DEBUG) syslog(LOG_INFO,"current_specnum %lu",current_specnum);
-    current_specnum += specs_per_block;
-    
-    
-    // for exiting
-    if (bytes_read < block_size) {
-      observation_complete = 1;
-      syslog(LOG_INFO, "main: finished, with bytes_read %lu < expected %lu\n", bytes_read, block_size);
-    }
-    
-    // close block for reading
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-    
-
-  }
-
-
-  // close control thread
-  syslog(LOG_INFO, "joining control_thread");
-  quit_threads = 1;
-  void* result=0;
-  pthread_join (control_thread_id, &result);
-
-  free(extData);
-  dsaX_dbgpu_cleanup (hdu_in);
-
-}
diff --git a/src/dsaX_fluff.c b/src/dsaX_fluff.c
deleted file mode 100644
index 3e3f2d1..0000000
--- a/src/dsaX_fluff.c
+++ /dev/null
@@ -1,415 +0,0 @@
-/* will reorder raw data for input to xgpu */
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-#include <x86intrin.h>
-#include <smmintrin.h>
-#include <immintrin.h>
-
-// data to pass to threads
-struct data {
-  char * in;
-  char * out;
-  int n_threads;
-  int thread_id;
-  int debug;
-};
-
-/* global variables */
-int DEBUG = 0;
-int cores[8] = {22, 23, 24, 25, 26, 27, 28, 29};
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write)
-{
-
-  if (write==0) {
-  
-    if (dada_hdu_unlock_read (in) < 0)
-      {
-	syslog(LOG_ERR, "could not unlock read on hdu_in");
-      }
-    dada_hdu_destroy (in);
-
-  }
-
-  if (write==1) {
-
-    if (dada_hdu_unlock_write (in) < 0)
-      {
-	syslog(LOG_ERR, "could not unlock write on hdu_in");
-      }
-    dada_hdu_destroy (in);
-
-  }
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_reorder_raw [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -t number of threads [default 4]\n"
-	   " -i input key [default CAPTURED_BLOCK_KEY]\n"
-	   " -o output key [default REORDER_BLOCK_KEY]\n"
-	   " -q quitting after testing\n"
-	   " -h print usage\n");
-}
-
-/* thread for data massaging */
-void * massage(void *args) {
-
-  // basic stuff
-  struct data *d = args;
-  int thread_id = d->thread_id;
-  int dbg = d->debug;
-  int na = 64;
-  
-  // set affinity
-  const pthread_t pid = pthread_self();
-  const int core_id = cores[thread_id];
-  cpu_set_t cpuset;
-  CPU_ZERO(&cpuset);
-  CPU_SET(core_id, &cpuset);
-  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (set_result != 0)
-    syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
-  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (get_affinity != 0) 
-    syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
-  if (CPU_ISSET(core_id, &cpuset))
-    if (DEBUG || dbg) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id);
-
-  // extract from input data structure
-  char *in = (char *)d->in;
-  char *out = (char *)d->out;
-  int nthreads = d->n_threads;  
-
-  // local array
-  int * fluffed_int = (int *)(in);
-  int * out_int = (int *)(out);
-  
-  // do block transpose - https://codereview.stackexchange.com/questions/229876/fast-matrix-transpose
-  int tile_size = 4; // set by benchmarking
-  for (int i_packet=NPACKETS*thread_id/nthreads;i_packet<NPACKETS*(thread_id+1)/nthreads;i_packet++) {
-
-    for (int i=0;i<NANTS;i+=tile_size) {
-      for (int j=0;j<384*2;j++) {
-	for (int b=0;b<tile_size;b++) out_int[i_packet*na*768 + j*na+i+b] = fluffed_int[i_packet*NANTS*768 + (i+b)*384*2+j];
-      }
-    }
-
-  }
-
-  if (dbg || DEBUG) syslog(LOG_DEBUG,"thread %d: transposed",thread_id);
-
-   
-  /* return 0 */
-  int thread_result = 0;
-  pthread_exit((void *) &thread_result);
-  
-}
-
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-  
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_reorder_raw", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-
-  // TESTING and initialization
-  // threads
-  struct data args[16];
-  pthread_t threads[16];
-  pthread_attr_t attr;
-  pthread_attr_init(&attr);
-  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-  void* result=0;
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-
-  // data block HDU keys
-  key_t in_key = CAPTURED_BLOCK_KEY;
-  key_t out_key = REORDER_BLOCK_KEY;
-  
-  // command line arguments
-  int core = -1;
-  int nthreads = 1;
-  int bf = 0;
-  int arg = 0;
-  
-  while ((arg=getopt(argc,argv,"c:t:i:o:dqh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 't':
-	  if (optarg)
-	    {
-	      nthreads = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-t flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-
-	case 'q':
-	  syslog (LOG_INFO, "Quit here");
-	  return EXIT_SUCCESS;
-	  
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      return EXIT_FAILURE;
-    }
-
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      return EXIT_FAILURE;
-    }
-
-
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
-  uint64_t  bytes_read = 0;
-  char * block, * output_buffer, * blockie;
-  output_buffer = (char *)malloc(sizeof(char)*block_out);
-  memset(output_buffer,0,block_out);
-  uint64_t written, block_id;
-
-  // set up
-
-  int observation_complete=0;
-  int blocks = 0;
-  int started = 0;
-
-
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-    }
-
-    // DO STUFF
-
-    // set up data structure
-    for (int i=0; i<nthreads; i++) {
-      args[i].in = block;
-      args[i].out = output_buffer;
-      args[i].n_threads = nthreads;
-      args[i].thread_id = i;
-      args[i].debug = 0;
-    }
-
-    if (DEBUG) syslog(LOG_DEBUG,"creating %d threads",nthreads);
-    
-    for(int i=0; i<nthreads; i++){
-      if (pthread_create(&threads[i], &attr, &massage, (void *)(&args[i]))) {
- 	syslog(LOG_ERR,"Failed to create massage thread %d\n", i);
-      }
-    }
-
-    pthread_attr_destroy(&attr);
-    if (DEBUG) syslog(LOG_DEBUG,"threads kinda running");
-    
-    for(int i=0; i<nthreads; i++){
-      pthread_join(threads[i], &result);
-      if (DEBUG) syslog(LOG_DEBUG,"joined thread %d",i);
-    }
-    
-    // write to output
-    blockie = ipcio_open_block_write (hdu_out->data_block, &block_id);
-    memcpy(blockie, output_buffer, block_out);
-    ipcio_close_block_write(hdu_out->data_block, block_out);
-    
-    //written = ipcio_write (hdu_out->data_block, output_buffer, block_out);
-    	
-    
-    if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks);      
-    blocks++;
-    
-
-    if (bytes_read < block_size)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  free(output_buffer);
-
-  dsaX_dbgpu_cleanup (hdu_in,0);
-  dsaX_dbgpu_cleanup (hdu_out,1);
-  
-}
-
-
diff --git a/src/dsaX_makeFil.c b/src/dsaX_makeFil.c
deleted file mode 100644
index e9d6e3c..0000000
--- a/src/dsaX_makeFil.c
+++ /dev/null
@@ -1,276 +0,0 @@
-/* will reorder raw data for input to xgpu */
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-// global variables
-int DEBUG = 0;
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
-{
-
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
-  dada_hdu_destroy (in);
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock write on hdu_out");
-    }
-  dada_hdu_destroy (out);
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_fake [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -i in_key [default TEST_BLOCK_KEY]\n"
-	   " -o out_key [default REORDER_BLOCK_KEY2]\n"
-	   " -h print usage\n");
-}
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_copydb", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-
-  // data block HDU keys
-  key_t in_key = TEST_BLOCK_KEY;
-  key_t out_key = REORDER_BLOCK_KEY2;
-  
-  // command line arguments
-  int core = -1;
-  int useZ = 1;
-  char fnam[100];
-  int arg = 0;
-  
-  while ((arg=getopt(argc,argv,"c:f:i:o:dh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create ();
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create ();
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
-  uint64_t  bytes_read = 0;
-  char * block;
-  uint64_t written, block_id;
-
-
-  // set up
-  int observation_complete=0;
-  int blocks = 0, started = 0;
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-    }
-
-    // DO STUFF
-    // here is where we convert input voltage data to output filterbank data
-
-    
-    // write to output dada block
-    written = ipcio_write (hdu_out->data_block, block, block_out);
-    if (written < block_out)
-      {
-	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
-	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-	return EXIT_FAILURE;
-      }
-    
-    if (DEBUG) {
-      syslog(LOG_DEBUG, "written block %d",blocks);      
-    }
-    blocks++;
-
-    
-
-    if (bytes_read < block_size)
-      observation_complete = 1;
-    
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-  
-}
-
-
diff --git a/src/dsaX_merge.c b/src/dsaX_merge.c
deleted file mode 100644
index 7866d5f..0000000
--- a/src/dsaX_merge.c
+++ /dev/null
@@ -1,580 +0,0 @@
-/* will reorder raw data for input to xgpu */
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-/* global variables */
-int DEBUG = 0;
-int STATS = 0;
-const int nth = 4;
-
-// data to pass to threads
-struct data {
-  char * in;
-  char * in2;
-  char * out;
-  int * ant_order1;
-  int * ant_order2;
-  int n_threads;
-  int thread_id;
-};
-int cores[4] = {17, 18, 37, 38};
-
-
-void * massage (void *args) {
-
-  struct data *d = args;
-  int thread_id = d->thread_id;
-
-  // set affinity
-  const pthread_t pid = pthread_self();
-  const int core_id = cores[thread_id];
-  cpu_set_t cpuset;
-  CPU_ZERO(&cpuset);
-  CPU_SET(core_id, &cpuset);
-  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (set_result != 0)
-    syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
-  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (get_affinity != 0) 
-    syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
-  if (CPU_ISSET(core_id, &cpuset))
-    if (DEBUG) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id);
-
-  // extract from input
-  char *in = (char *)d->in;
-  char *in2 = (char *)d->in2;
-  char *out = (char *)d->out;
-  int n_threads = d->n_threads;
-  int * ao1 = d->ant_order1;
-  int * ao2 = d->ant_order2;
-
-  uint64_t oidx, iidx, ncpy = 1536;
-
-  for (int i=thread_id*(2048/n_threads);i<(thread_id+1)*(2048/n_threads);i++) {
-    for (int j=0;j<3*NSNAPS/2;j++) {
-      iidx = i*(NSNAPS/2)*4608 + j*1536;
-      oidx = i*NSNAPS*4608 + ao1[j]*1536;
-      memcpy(out + oidx, in + iidx, ncpy);
-      oidx = i*NSNAPS*4608 + ao2[j]*1536;
-      memcpy(out + oidx, in2 + iidx, ncpy); 
-    }
-  }
-
-  /* return 0 */
-  int thread_result = 0;
-  pthread_exit((void *) &thread_result);
-  
-}
-
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write);
-int dada_bind_thread_to_core (int core);
-
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write)
-{
-
-  if (write==0) {
-  
-    if (dada_hdu_unlock_read (in) < 0)
-      {
-	syslog(LOG_ERR, "could not unlock read on hdu_in");
-      }
-    dada_hdu_destroy (in);
-
-  }
-
-  if (write==1) {
-
-    if (dada_hdu_unlock_write (in) < 0)
-      {
-	syslog(LOG_ERR, "could not unlock write on hdu_in");
-      }
-    dada_hdu_destroy (in);
-
-  }
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_split [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -m multithread write\n"
-	   " -i in_key\n"
-	   " -o out_key\n"
-	   " -j in_key2\n"
-	   " -h print usage\n");
-}
-
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-  
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_merge", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-  dada_hdu_t* hdu_in2 = 0;
-
-  // data block HDU keys
-  key_t in_key = CAPTURE_BLOCK_KEY;
-  key_t out_key = CAPTURED_BLOCK_KEY;
-  key_t in_key2 = REORDER_BLOCK_KEY2;
-  
-  // command line arguments
-  int core = -1;
-  int arg = 0;
-  int mwrite = 0;
-  
-  while ((arg=getopt(argc,argv,"c:i:o:j:dmh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'j':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key2) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-j flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'm':
-	  mwrite=1;
-	  syslog (LOG_INFO, "Will do multithread write");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-
-  hdu_in2  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in2, in_key2);
-  if (dada_hdu_connect (hdu_in2) < 0) {
-    syslog (LOG_ERR,"could not connect to input  buffer2");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read(hdu_in2) < 0) {
-    syslog (LOG_ERR, "could not lock to input buffer2");
-    return EXIT_FAILURE;
-  }
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_in2,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_in2,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      return EXIT_FAILURE;
-    }
-  header_in = ipcbuf_get_next_read (hdu_in2->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_in2,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in2->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_in2,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      return EXIT_FAILURE;
-    }
-
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_in2,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_in2,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      return EXIT_FAILURE;
-    }
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-
-  // sort out ant order
-  int * ao1, * ao2;
-  ao1 = (int *)malloc(sizeof(int)*48);
-  ao2 = (int *)malloc(sizeof(int)*48);
-  ao1[0] = 19;
-  ao1[1] = 20;
-  ao1[2] = 21;
-  ao1[3] = 25;
-  ao1[4] = 26;
-  ao1[5] = 27;
-  ao1[6] = 18;
-  ao1[7] = 17;
-  ao1[8] = 16;
-  ao1[9] = 12;
-  ao1[10] = 11;
-  ao1[11] = 45;
-  ao1[12] = 83;
-  ao1[13] = 10;
-  ao1[14] = 9;
-  ao1[15] = 6;
-  ao1[16] = 5;
-  ao1[17] = 4;
-  ao1[18] = 0;
-  ao1[19] = 84;
-  ao1[20] = 85;
-  ao1[21] = 89;
-  ao1[22] = 90;
-  ao1[23] = 91;
-  ao1[24] = 39;
-  ao1[25] = 40;
-  ao1[26] = 41;
-  ao1[27] = 33;
-  ao1[28] = 34;
-  ao1[29] = 35;
-  ao1[30] = 42;
-  ao1[31] = 43;
-  ao1[32] = 44;
-  ao1[33] = 51;
-  ao1[34] = 52;
-  ao1[35] = 53;
-  ao1[36] = 57;
-  ao1[37] = 58;
-  ao1[38] = 59;
-  ao1[39] = 63;
-  ao1[40] = 64;
-  ao1[41] = 65;
-  ao1[42] = 69;
-  ao1[43] = 70;
-  ao1[44] = 71;
-  ao1[45] = 75;
-  ao1[46] = 76;
-  ao1[47] = 77;
-  ao2[0] = 22;
-  ao2[1] = 23;
-  ao2[2] = 24;
-  ao2[3] = 28;
-  ao2[4] = 29;
-  ao2[5] = 30;
-  ao2[6] = 15;
-  ao2[7] = 14;
-  ao2[8] = 13;
-  ao2[9] = 46;
-  ao2[10] = 47;
-  ao2[11] = 48;
-  ao2[12] = 82;
-  ao2[13] = 8;
-  ao2[14] = 7;
-  ao2[15] = 3;
-  ao2[16] = 2;
-  ao2[17] = 1;
-  ao2[18] = 86;
-  ao2[19] = 87;
-  ao2[20] = 88;
-  ao2[21] = 92;
-  ao2[22] = 93;
-  ao2[23] = 94;
-  ao2[24] = 95;
-  ao2[25] = 31;
-  ao2[26] = 32;
-  ao2[27] = 36;
-  ao2[28] = 37;
-  ao2[29] = 38;
-  ao2[30] = 81;
-  ao2[31] = 49;
-  ao2[32] = 50;
-  ao2[33] = 54;
-  ao2[34] = 55;
-  ao2[35] = 56;
-  ao2[36] = 60;
-  ao2[37] = 61;
-  ao2[38] = 62;
-  ao2[39] = 66;
-  ao2[40] = 67;
-  ao2[41] = 68;
-  ao2[42] = 72;
-  ao2[43] = 73;
-  ao2[44] = 74;
-  ao2[45] = 78;
-  ao2[46] = 79;
-  ao2[47] = 80;
-
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
-  uint64_t  bytes_read = 0;
-  char * block1, * block2, * o1, * o2;
-  char * output = (char *)malloc(sizeof(char)*block_out);
-  uint64_t written, block_id;
-
-  // set up threads
-  struct data args[8];
-  pthread_t threads[8];
-  pthread_attr_t attr;
-  pthread_attr_init(&attr);
-  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-  void* result=0;
-  
-  // send through fake blocks
-
-  /*  if (fake>0) {
-    syslog(LOG_INFO,"sending %d fake blocks",fake);
-    for (int i=0;i<fake;i++) {
-      o1 = ipcio_open_block_write (hdu_out->data_block, &block_id);
-      memcpy(o1, output, block_out);
-      ipcio_close_block_write (hdu_out->data_block, block_out);
-      usleep(10000);
-    }
-    syslog(LOG_INFO,"Finished with fake blocks");
-    }*/
-  
-  
-  
-  // set up
-
-  int observation_complete=0;
-  int blocks = 0;
-  int started = 0;
-
-
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // open block
-    
-    block1 = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-    block2 = ipcio_open_block_read (hdu_in2->data_block, &bytes_read, &block_id);
-
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-    }
-
-    
-    // DO STUFF
-
-    // copy to output buffer
-    
-    if (mwrite) {
-      o1 = ipcio_open_block_write (hdu_out->data_block, &block_id);
-    }
-    
-    // set up data structure
-    for (int i=0; i<nth; i++) {
-      args[i].in = block1;
-      args[i].in2 = block2;
-      args[i].ant_order1 = ao1;
-      args[i].ant_order2 = ao2;
-      
-      if (mwrite) 
-	args[i].out = o1;	
-      else
-	args[i].out = output;
-
-      args[i].n_threads = nth;
-      args[i].thread_id = i;
-    }
-    
-    //syslog(LOG_INFO, "creating threads");
-    
-    for(int i=0; i<nth; i++){
-      if (pthread_create(&threads[i], &attr, &massage, (void *)(&args[i]))) {
-	syslog(LOG_ERR,"Failed to create massage thread %d\n", i);
-      }
-    }
-    
-    pthread_attr_destroy(&attr);
-    if (DEBUG) syslog(LOG_DEBUG,"threads kinda running");
-    
-    for(int i=0; i<nth; i++){
-      pthread_join(threads[i], &result);
-      if (DEBUG) syslog(LOG_DEBUG,"joined thread %d",i);
-    }
-    
-    
-    if (!mwrite) {
-      written = ipcio_write (hdu_out->data_block, output, block_out);
-    }
-    else {
-      ipcio_close_block_write (hdu_out->data_block, block_out);
-    }
-
-    if (blocks % 10 == 0)
-      syslog(LOG_INFO, "written block %d",blocks);      
-    blocks++;
-    
-    
-    if (bytes_read < block_size)
-      observation_complete = 1;            
-    
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-    ipcio_close_block_read (hdu_in2->data_block, bytes_read);
-
-  }
-
-  free(output);
-  free(ao1);
-  free(ao2);
-  dsaX_dbgpu_cleanup (hdu_in,0);
-  dsaX_dbgpu_cleanup (hdu_in2,0);
-  dsaX_dbgpu_cleanup (hdu_out,1);
-  
-}
-
-
diff --git a/src/dsaX_nicdb.c b/src/dsaX_nicdb.c
deleted file mode 100644
index df47ebe..0000000
--- a/src/dsaX_nicdb.c
+++ /dev/null
@@ -1,483 +0,0 @@
-/*
-https://dzone.com/articles/parallel-tcpip-socket-server-with-multi-threading
-
-gcc -o test_ipcbuf test_ipcbuf.c -I/usr/local/psrdada/src -I/usr/local/include -L/usr/local/lib -lpsrdada -lm -pthread -g -O2 -L/usr/lib/gcc/x86_64-linux-gnu/5 -lgfortran
-
-the plan is to have NCLIENTS threads listening on different threads. 
-each time data comes over the first 8 bytes consist of the channel group and time sequence as two ints
-the rest is a NSAMPS_PER_BLOCK*NBEAMS_PER_TRANSMIT*NW char array that needs to be arranged correctly
-The output must be [NBEAMS_PER_BLOCK, NSAMPS_PER_BLOCK, NCHAN_FIL]. 
-
-After a block is full, the data need to be written out (data rate 525 Mb/s)
-The number of receives before switching blocks is NCLIENTS*NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT. 
-switch block when one block is being written out
-
-*/
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-#define bdepth 16
-#define MAX_FULLBLOCK 4
-
-// global variables
-int DEBUG = 0;
-volatile int blockct[bdepth]; // to count how many writes to block. max is NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NW
-volatile int flush_flag = 0; // set to flush output2
-volatile int writing = 0;
-volatile int global_tseq = 0; // global count of full buffers
-int cores[16] = {3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28}; // to bind threads to
-char iP[100];
-pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;	  
-
-// structure to pass to threads
-struct data
-{
-  char * output1;
-  char * output2;
-  uint16_t tport;
-  int thread_id;
-};
-
-// function prototypes
-void dsaX_dbgpu_cleanup (dada_hdu_t * out);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * out)
-{
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock write on hdu_out");
-    }
-  dada_hdu_destroy (out);
-  
-}
-
-
-// receive process - runs infinite loop
-void * process(void * ptr)
-{
-
-  // arguments from structure
-  struct data *d = ptr;
-  int thread_id = d->thread_id;
-  char *output1 = (char *)d->output1;
-  char *output2 = (char *)d->output2;
-  uint16_t tport = d->tport;
-  
-  // set affinity
-  const pthread_t pid = pthread_self();
-  const int core_id = cores[thread_id];
-  cpu_set_t cpuset;
-  CPU_ZERO(&cpuset);
-  CPU_SET(core_id, &cpuset);
-  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (set_result != 0)
-    syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
-  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (get_affinity != 0) 
-    syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
-  if (CPU_ISSET(core_id, &cpuset))
-    if (DEBUG) syslog(LOG_INFO,"thread %d: successfully set thread",thread_id);
-
-  // set up socket
-  struct sockaddr_in si_other, si_me;
-  int clientSocket, slen=sizeof(si_other);
-  clientSocket=socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
-  if (DEBUG) syslog(LOG_INFO,"thread %d: Made socket",thread_id);
-  memset((char *) &si_me, 0, sizeof(si_me));
-  si_me.sin_family = AF_INET;
-  si_me.sin_port = htons(tport);
-  si_me.sin_addr.s_addr = inet_addr(iP);
-  if (bind(clientSocket, (struct sockaddr *)&si_me, sizeof(si_me)) < 0) {
-    syslog(LOG_ERR,"thread %d: cannot bind to port",thread_id);
-    exit(1);
-  }
-  if (DEBUG) syslog(LOG_INFO,"thread %d: socket bound - waiting for header packet",thread_id);
-
-  char * packet = (char *)malloc(sizeof(char)*P_SIZE);
-  int * ibuf;
-  recvfrom(clientSocket, packet, P_SIZE, 0,(struct sockaddr *)&si_other,&slen);
-  ibuf = (int *)(packet);
-  int chgroup = ibuf[0];
-  syslog(LOG_INFO,"thread %d: accepted connection from chgroup %d",thread_id,chgroup);
-
-  // data buffer and other variables
-  char * buffer = (char *)malloc((NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW)*sizeof(char));
-  int tseq, pseq;
-  int pct = 0;
-  int full_blocks = 0;
-  int fullBlock;
-  int i0, aa;
-  int lastPacket, nextBuf, current_tseq = 0, act_tseq; 
-  uint64_t shifty = (bdepth-1)*NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NCHAN_FIL;
-  uint64_t oidx_offset, oidx;
-  
-  // infinite loop 
-  while (1) {
-  
-    /* read message */
-    // fill up local buffer
-    lastPacket = 0;
-    nextBuf = 0;
-    while ((lastPacket==0) && (nextBuf==0)) {
-
-      recvfrom(clientSocket, packet, P_SIZE, 0,(struct sockaddr *)&si_other,&slen);
-      ibuf = (int *)(packet);
-      pseq = ibuf[2];
-      if (chgroup != ibuf[0]) 
-	syslog(LOG_ERR,"thread %d: received chgroup %d is not recorded %d",thread_id,ibuf[0],chgroup);
-      tseq = ibuf[1];
-
-      if (tseq>current_tseq) {
-	nextBuf=1;
-      }
-      else if (tseq==current_tseq) {
-	memcpy(buffer+pseq*(P_SIZE-12),packet+12,P_SIZE-12);
-	pct++;
-      }
-
-      if (pseq==NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW/(P_SIZE-12)-1)
-	lastPacket=1;
-
-    }
-    
-    if (pct != NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW/(P_SIZE-12))
-      syslog(LOG_ERR,"thread %d: only received %d of %d",thread_id,pct,NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW/(P_SIZE-12));
-    
-    act_tseq = (current_tseq * NSAMPS_PER_TRANSMIT) % NSAMPS_PER_BLOCK; // place within output buffer
-
-    // at this stage we have a full local buffer
-    // this needs to be placed in the global buffer
-      
-    // output order is [beam, time, freq]. input order is [beam, time, freq], but only a subset of freqs
-    i0 = 0;
-    aa = ((current_tseq / (NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT)) % bdepth);
-    oidx_offset = ((uint64_t)(aa))*NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NCHAN_FIL;
-    //syslog(LOG_INFO,"thread %d: read message with chgroup %d tseq %d current_tseq %d global_tseq %d position %d %"PRIu64"",thread_id,chgroup,tseq,current_tseq,global_tseq,aa,oidx_offset);
-    for (int i=0;i<NBEAMS_PER_BLOCK;i++) {
-      for (int j=0;j<NSAMPS_PER_TRANSMIT;j++) {	
-	for (int k=0;k<NW;k++) {
-	  
-	  oidx = oidx_offset + i*NSAMPS_PER_BLOCK*NCHAN_FIL + (act_tseq+j)*NCHAN_FIL + CHOFF/8 + chgroup*NW + k;
-	  
-	  output1[oidx] = buffer[i0];
-
-	  i0++;
-	    
-	}
-      }
-    }
-    //syslog(LOG_INFO,"thread %d: entering mutex",thread_id);
-
-    // at this stage we have dealt with this capture round, and must address blockct within mutex
-    pthread_mutex_lock(&mutex);
-
-    // increment appropriate blockct
-    aa = ((current_tseq / (NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT)) % bdepth);
-    blockct[aa] += 1;
-    //syslog(LOG_INFO,"thread %d: incrementing blockct %d %d %d (total %d)",thread_id,current_tseq,aa,blockct[aa],NCLIENTS*NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT);
-
-    // deal with full block anywhere
-    full_blocks=0;
-    for (int i=0;i<bdepth;i++) {
-      if (blockct[i]!=0) full_blocks++;
-    }	
-    for (int i=0;i<bdepth;i++) {
-      if ((blockct[i] == NCLIENTS*NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT) || (full_blocks>=MAX_FULLBLOCK && blockct[i] >= (NCLIENTS-1)*NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT)) {
-
-	// need to write this block and reset blockct
-	while (flush_flag==1)
-	  aa==1;
-	flush_flag = 1;
-	blockct[i] = 0;
-	// log - hardcoded bdepth
-	full_blocks -= 1;
-	syslog(LOG_INFO,"thread %d: Writing global_tseq %d. Blockcts_full %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d",thread_id,global_tseq,full_blocks,blockct[0],blockct[1],blockct[2],blockct[3],blockct[4],blockct[5],blockct[6],blockct[7],blockct[8],blockct[9],blockct[10],blockct[11],blockct[12],blockct[13],blockct[14],blockct[15]);
-
-	
-      }	
-
-    }
-        
-    pthread_mutex_unlock(&mutex);
-
-    // advance local tseq and deal with packet capture
-    if (lastPacket==1) {
-      current_tseq++;
-      lastPacket=0;
-      nextBuf=0;
-      pct=0;
-    }
-    if (nextBuf==1) {
-      current_tseq++;
-      memcpy(buffer+pseq*(P_SIZE-12),packet+12,P_SIZE-12);
-      pct=1;
-      lastPacket=0;
-    }
-
-    
-
-  }
-
-  /* close socket and clean up */
-  close(clientSocket);
-  free(packet);
-  free(buffer);
-  pthread_exit(0);
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_nicdb [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -f header file [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -o out_key [default BEAMCAPTURE_BLOCK_KEY]\n"
-	   " -i IP address\n"
-	   " -h print usage\n");
-}
-
-
-// main part of program 
-int main(int argc, char ** argv)
-{
-    
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_nicdb", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-
-  // threads
-  struct data args[16];
-  pthread_t threads[16];
-  pthread_attr_t attr;
-  pthread_attr_init(&attr);
-  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-  void* result=0;
-  for (int i=0;i<bdepth;i++) blockct[i] = 0;
-
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_out = 0;
-
-  // data block HDU keys
-  key_t out_key = BEAMCAPTURE_BLOCK_KEY;
-  
-  // command line arguments
-  int core = -1;
-  int arg = 0;
-  char fnam[200];
-  
-  while ((arg=getopt(argc,argv,"c:f:o:i:dh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'f':
-	  if (optarg)
-	    {
-	      strcpy(fnam,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-f flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_INFO, "Will excrete all debug messages");
-	  break;
-	case 'i':
-	  strcpy(iP,optarg);
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  // DADA stuff
-
-  hdu_out  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-
-  // deal with headers
-  uint64_t header_size = 4096;
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  FILE *fin;
-  if (!(fin=fopen(fnam,"rb"))) {
-    syslog(LOG_ERR,"cannot open dada header file %s",fnam);
-    return EXIT_FAILURE;
-  }
-  fread(header_out, 4096, 1, fin);
-  fclose(fin);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_out);
-      return EXIT_FAILURE;
-    }  
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_out);
-      return EXIT_FAILURE;
-    }
-    
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have output block sizes %lu\n",block_out);
-  uint64_t  bytes_read = 0;
-  char *output1, *output2;
-  output1 = (char *)malloc(sizeof(char)*block_out*bdepth);
-  output2 = (char *)malloc(sizeof(char)*block_out);
-  memset(output1,0,block_out*bdepth);
-  memset(output2,0,block_out);
-  uint64_t written, block_id;
-
-  // set up threads
-  
-  // set up data structure
-  for (int i=0; i<NCLIENTS; i++) {
-    args[i].output1 = output1;
-    args[i].output2 = output2;
-    args[i].thread_id = i;
-    args[i].tport = FIL_PORT0 + (uint16_t)(i);
-  }
-
-  if (DEBUG) syslog(LOG_INFO,"creating %d threads (one per client)",NCLIENTS);
-    
-  for(int i=0; i<NCLIENTS; i++){
-    if (pthread_create(&threads[i], &attr, &process, (void *)(&args[i]))) {
-      syslog(LOG_ERR,"Failed to create thread %d\n", i);
-    }
-  }
-  pthread_attr_destroy(&attr);
-  if (DEBUG) syslog(LOG_INFO,"threads kinda running");
-  
-  int observation_complete=0;
-  int blocks = 0;
-  int aa;
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // look for complete block
-
-    //if (DEBUG) syslog(LOG_INFO,"here with %d",blockct);
-    while (flush_flag==0)
-      aa=1;
-
-    // write to output
-    writing=1;
-    written = ipcio_write (hdu_out->data_block, output1 + (global_tseq % bdepth)*block_out, block_out);
-    global_tseq += 1;
-    writing=0;
-    if (written < block_out)
-      {
-	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");	
-	dsaX_dbgpu_cleanup (hdu_out);
-	return EXIT_FAILURE;
-      }
-    
-    syslog(LOG_INFO, "written block %d",blocks);      
-    blocks++;
-
-    flush_flag = 0;
-
-  }
-      
-  
-  // free stuff
-  for(int i=0; i<NCLIENTS; i++){
-    pthread_join(threads[i], &result);
-    if (DEBUG) syslog(LOG_INFO,"joined thread %d",i);
-  }
-  free(output1);
-  free(output2);
-  dsaX_dbgpu_cleanup(hdu_out);
-  
-}
diff --git a/src/dsaX_nicdb.c.bak b/src/dsaX_nicdb.c.bak
deleted file mode 100644
index b309424..0000000
--- a/src/dsaX_nicdb.c.bak
+++ /dev/null
@@ -1,434 +0,0 @@
-/*
-https://dzone.com/articles/parallel-tcpip-socket-server-with-multi-threading
-
-gcc -o test_ipcbuf test_ipcbuf.c -I/usr/local/psrdada/src -I/usr/local/include -L/usr/local/lib -lpsrdada -lm -pthread -g -O2 -L/usr/lib/gcc/x86_64-linux-gnu/5 -lgfortran
-
-the plan is to have NCLIENTS threads listening on different threads. 
-each time data comes over the first 8 bytes consist of the channel group and time sequence as two ints
-the rest is a NSAMPS_PER_BLOCK*NBEAMS_PER_TRANSMIT*NW char array that needs to be arranged correctly
-The output must be [NBEAMS_PER_BLOCK, NSAMPS_PER_BLOCK, NCHAN_FIL]. 
-
-After a block is full, the data need to be written out (data rate 525 Mb/s)
-The number of receives before switching blocks is NCLIENTS*NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT. 
-switch block when one block is being written out
-
-*/
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-// global variables
-int DEBUG = 0;
-int blockct = 0; // to count how many writes to block. max is NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NW
-int block_switch = 0; // 0 means write to output1, write out output2.
-int cores[16] = {3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28}; // to bind threads to
-char iP[100];
-
-// structure to pass to threads
-struct data
-{
-  char * output1;
-  char * output2;
-  uint16_t tport;
-  int thread_id;
-};
-
-// function prototypes
-void dsaX_dbgpu_cleanup (dada_hdu_t * out);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * out)
-{
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock write on hdu_out");
-    }
-  dada_hdu_destroy (out);
-  
-}
-
-
-// receive process - runs infinite loop
-void * process(void * ptr)
-{
-
-  // arguments from structure
-  struct data *d = ptr;
-  int thread_id = d->thread_id;
-  char *output1 = (char *)d->output1;
-  char *output2 = (char *)d->output2;
-  uint16_t tport = d->tport;
-  
-  // set affinity
-  const pthread_t pid = pthread_self();
-  const int core_id = cores[thread_id];
-  cpu_set_t cpuset;
-  CPU_ZERO(&cpuset);
-  CPU_SET(core_id, &cpuset);
-  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (set_result != 0)
-    syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
-  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (get_affinity != 0) 
-    syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
-  if (CPU_ISSET(core_id, &cpuset))
-    if (DEBUG) syslog(LOG_INFO,"thread %d: successfully set thread",thread_id);
-
-  // set up socket
-  int sock = -1, conn = -1;
-  struct sockaddr_in address, cli;
-
-  /* create socket */
-  sock = socket(AF_INET, SOCK_STREAM, 0);
-  if (DEBUG) syslog(LOG_INFO,"thread %d: opened socket",thread_id);
-  memset(&address, 0, sizeof(struct sockaddr_in));
-  address.sin_family = AF_INET;
-  inet_pton(AF_INET, iP, &(address.sin_addr));
-  //address.sin_addr.s_addr = inet_addr("127.0.0.1");
-  address.sin_port = htons(tport);
-  if (DEBUG) syslog(LOG_INFO,"thread %d: socket ready",thread_id);
-  if (bind(sock, (struct sockaddr *)&address, sizeof(struct sockaddr_in)) < 0) {
-    syslog(LOG_ERR,"thread %d: cannot bind to port",thread_id);
-    exit(1);
-  }
-  if (DEBUG) syslog(LOG_INFO,"thread %d: socket bound",thread_id);
-  listen(sock, 5);
-  if (DEBUG) syslog(LOG_INFO,"thread %d: socket listening on port %d",thread_id,tport);
-  
-  // accept connection
-  socklen_t cli_len=sizeof(struct sockaddr);
-  conn = accept(sock, (struct sockaddr *) &cli, &cli_len);
-  if (conn<0) {
-    syslog(LOG_ERR,"thread %d: error accepting connection",thread_id);
-    exit(1);
-  }
-  syslog(LOG_INFO,"thread %d: accepted connection",thread_id);
-
-  // data buffer and other variables
-  char * buffer = (char *)malloc((8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW)*sizeof(char));
-  char * dblock = (char *)malloc((8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW)*sizeof(char));
-  int *ibuf, chgroup, tseq, oidx, iidx;
-  int remain_data, outptr, len;
-  int i0;
-  
-  // infinite loop 
-  while (1) {
-  
-    /* read message */
-    // read to buffer until all is read
-    remain_data =(int)(8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW);
-    outptr=0;
-
-    /*
-    while (((len = recv(conn, dblock, remain_data, 0)) > 0) && (remain_data > 0)) {
-    memcpy(buffer+outptr, dblock, len);
-      remain_data -= len;
-      outptr += len;
-      //syslog(LOG_INFO,"Received %d of %d bytes",outptr,8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW);
-      }*/
-    //recvlen = read(sock, buffer, sizeof(buffer));
-    ibuf = (int *)(buffer);
-    len = recv(conn, dblock, remain_data, MSG_WAITALL);
-    memcpy(buffer, dblock, len);
-    remain_data -= len;
-    if (remain_data != 0)
-      syslog(LOG_ERR,"thread %d: only received %d of %d",thread_id,len,(int)(8+NSAMPS_PER_TRANSMIT*NBEAMS_PER_BLOCK*NW));
-    
-    if (remain_data==0) {
-    
-      // get channel group and time sequence
-      chgroup = ibuf[0]; // from 0-15
-      tseq = ibuf[1]; // continuous iterate over transmits
-      if (DEBUG) syslog(LOG_INFO,"thread %d: read message with chgroup %d tseq %d blockct %d",thread_id,chgroup,tseq,blockct);
-      tseq = (tseq * 128) % 4096; // place within output
-      
-      // output order is [beam, time, freq]. input order is [beam, time, freq], but only a subset of freqs
-      i0 = 8;
-      for (int i=0;i<NBEAMS_PER_BLOCK;i++) {
-	for (int j=0;j<NSAMPS_PER_TRANSMIT;j++) {	
-	  for (int k=0;k<NW;k++) {
-	    
-	    oidx = i*NSAMPS_PER_BLOCK*NCHAN_FIL + (tseq+j)*NCHAN_FIL + CHOFF/8 + chgroup*NW + k;
-	    //iidx = 8 + i0;
-	    
-	    if (block_switch==0) output1[oidx] = buffer[i0];
-	    if (block_switch==1) output2[oidx] = buffer[i0];
-
-	    i0++;
-	    
-	  }
-	}
-      }
-      
-      // iterate blockct
-      blockct++;
-
-    }
-
-  }
-
-  /* close socket and clean up */
-  close(sock);
-  free(buffer);
-  free(dblock);
-  pthread_exit(0);
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_nicdb [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -f header file [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -o out_key [default BEAMCAPTURE_BLOCK_KEY]\n"
-	   " -i IP address\n"
-	   " -h print usage\n");
-}
-
-
-// main part of program 
-int main(int argc, char ** argv)
-{
-    
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_nicdb", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-
-  // threads
-  struct data args[16];
-  pthread_t threads[16];
-  pthread_attr_t attr;
-  pthread_attr_init(&attr);
-  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-  void* result=0;
-
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_out = 0;
-
-  // data block HDU keys
-  key_t out_key = BEAMCAPTURE_BLOCK_KEY;
-  
-  // command line arguments
-  int core = -1;
-  int arg = 0;
-  char fnam[200];
-  
-  while ((arg=getopt(argc,argv,"c:f:o:i:dh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'f':
-	  if (optarg)
-	    {
-	      strcpy(fnam,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-f flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_INFO, "Will excrete all debug messages");
-	  break;
-	case 'i':
-	  strcpy(iP,optarg);
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  // DADA stuff
-
-  hdu_out  = dada_hdu_create ();
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-
-  // deal with headers
-  uint64_t header_size = 4096;
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  FILE *fin;
-  if (!(fin=fopen(fnam,"rb"))) {
-    syslog(LOG_ERR,"cannot open dada header file %s",fnam);
-    return EXIT_FAILURE;
-  }
-  fread(header_out, 4096, 1, fin);
-  fclose(fin);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_out);
-      return EXIT_FAILURE;
-    }  
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_out);
-      return EXIT_FAILURE;
-    }
-    
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have output block sizes %llu\n",block_out);
-  uint64_t  bytes_read = 0;
-  char *output1, *output2;
-  output1 = (char *)malloc(sizeof(char)*block_out);
-  output2 = (char *)malloc(sizeof(char)*block_out);
-  memset(output1,0,block_out);
-  memset(output2,0,block_out);
-  uint64_t written, block_id;
-
-  // set up threads
-  
-  // set up data structure
-  for (int i=0; i<NCLIENTS; i++) {
-    args[i].output1 = output1;
-    args[i].output2 = output2;
-    args[i].thread_id = i;
-    args[i].tport = FIL_PORT0 + (uint16_t)(i);
-  }
-
-  if (DEBUG) syslog(LOG_INFO,"creating %d threads (one per client)",NCLIENTS);
-    
-  for(int i=0; i<NCLIENTS; i++){
-    if (pthread_create(&threads[i], &attr, &process, (void *)(&args[i]))) {
-      syslog(LOG_ERR,"Failed to create thread %d\n", i);
-    }
-  }
-  pthread_attr_destroy(&attr);
-  if (DEBUG) syslog(LOG_INFO,"threads kinda running");
-  
-  int observation_complete=0;
-  int blocks = 0;
-  int ctt;
-  int bswitch;
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // look for complete block
-
-    //if (DEBUG) syslog(LOG_INFO,"here with %d",blockct);
-    usleep(10);
-
-    if (blockct>=NCLIENTS*NSAMPS_PER_BLOCK/NSAMPS_PER_TRANSMIT) {      
-      
-      // change output
-      bswitch= block_switch;
-      blockct=0;
-      if (bswitch==0) block_switch=1;
-      if (bswitch==1) block_switch=0;
-
-      // write to output
-      if (bswitch==0) written = ipcio_write (hdu_out->data_block, output1, block_out);
-      if (bswitch==1) written = ipcio_write (hdu_out->data_block, output2, block_out);
-      if (written < block_out)
-	{
-	  syslog(LOG_ERR, "main: failed to write all data to datablock [output]");	
-	  dsaX_dbgpu_cleanup (hdu_out);
-	  return EXIT_FAILURE;
-	}
-
-      if (DEBUG) syslog(LOG_INFO, "written block %d",blocks);      
-      blocks++;
-      ctt=0;
-    }
-      
-  }
-  
-  // free stuff
-  for(int i=0; i<NCLIENTS; i++){
-    pthread_join(threads[i], &result);
-    if (DEBUG) syslog(LOG_INFO,"joined thread %d",i);
-  }
-  free(output1);
-  free(output2);
-  dsaX_dbgpu_cleanup(hdu_out);
-  
-}
diff --git a/src/dsaX_reorder.c b/src/dsaX_reorder.c
deleted file mode 100644
index 04955da..0000000
--- a/src/dsaX_reorder.c
+++ /dev/null
@@ -1,515 +0,0 @@
-/* will reorder raw data for input to xgpu */
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-#include <x86intrin.h>
-#include <smmintrin.h>
-#include <immintrin.h>
-
-// data to pass to threads
-struct data {
-  char * in;
-  char * out;
-  int n_threads;
-  int thread_id;
-  int debug;
-};
-
-/* global variables */
-int DEBUG = 0;
-int cores[16] = {4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29};
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write)
-{
-
-  if (write==0) {
-  
-    if (dada_hdu_unlock_read (in) < 0)
-      {
-	syslog(LOG_ERR, "could not unlock read on hdu_in");
-      }
-    dada_hdu_destroy (in);
-
-  }
-
-  if (write==1) {
-
-    if (dada_hdu_unlock_write (in) < 0)
-      {
-	syslog(LOG_ERR, "could not unlock write on hdu_in");
-      }
-    dada_hdu_destroy (in);
-
-  }
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_reorder_raw [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -t number of threads [default 4]\n"
-	   " -i input key [default CAPTURED_BLOCK_KEY]\n"
-	   " -o output key [default REORDER_BLOCK_KEY]\n"
-	   " -q quitting after testing\n"
-	   " -h print usage\n");
-}
-
-/* thread for data massaging */
-void * massage(void *args) {
-
-  // basic stuff
-  struct data *d = args;
-  int thread_id = d->thread_id;
-  int dbg = d->debug;
-   
-  // masks for fluffing
-  __m512i masks[4];
-  masks[0] = _mm512_set_epi64(0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL);
-  masks[1] = _mm512_set_epi64(0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL);
-  masks[2] = _mm512_set_epi64(0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL);
-  masks[3] = _mm512_set_epi64(0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL);
-
-  
-  // set affinity
-  const pthread_t pid = pthread_self();
-  const int core_id = cores[thread_id];
-  cpu_set_t cpuset;
-  CPU_ZERO(&cpuset);
-  CPU_SET(core_id, &cpuset);
-  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (set_result != 0)
-    syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
-  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (get_affinity != 0) 
-    syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
-  if (CPU_ISSET(core_id, &cpuset))
-    if (DEBUG || dbg) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id);
-
-  // extract from input data structure
-  char *in = (char *)d->in;
-  char *out = (char *)d->out;
-  int nthreads = d->n_threads;  
-
-  /* DO ALL PROCESSING
-   
-     "in" is input block: NPACKETS * NANTS * (384*2) * 2 pol * r/i. (384*2 is for the two times)
-     "out" needs to be in order NPACKETS * (384*2) * 64 * 2 pol * r/i
-     parallelize by splitting on NPACKETS axis. 
-
-   */
-
-  // input and output index and extracted data
-  int idx = thread_id; // PACKET idx for input and output
-  char * proc_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2); // for 4-bit data
-  char * fluffed_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2*2); // for 8-bit data
-  
-  // extract data
-  memcpy(proc_data,in+idx*(NPACKETS/nthreads)*NANTS*(384*2)*2,(NPACKETS/nthreads)*NANTS*(384*2)*2);
-  if (DEBUG || dbg) syslog(LOG_DEBUG,"thread %d: extracted data",thread_id);
-  
-  // do fluffing
-
-  /* 
-     technique is to use nybble masks to 
-     (a) unmask every fourth nybble
-     (b) bit shift to left using mm512_slli_epi16
-     (c) sign extend by 4 bits using mm512_srai_epi16
-     (d) bit shift to right
-
-     Will produce m512 for lower and upper bytes. Then just need to copy into fluffed_data
-
-   */
-
-  // variables
-  char * low = (char *)malloc(sizeof(char)*64); // m512
-  char * hi = (char *)malloc(sizeof(char)*64); // m512
-  __m512i low_m, hi_m;
-  unsigned short * low_u = (unsigned short *)(low);
-  unsigned short * hi_u = (unsigned short *)(hi);
-  __m512i v[4]; // for 4 packed 4-bit numbers
-
-  // input and output
-  __m512i proc_m;
-  unsigned short * fluffed_u = (unsigned short *)(fluffed_data);
-
-  // numbers to iterate over
-  int n_512 = (NPACKETS/nthreads)*NANTS*(384*2)*2/64;
-
-  if (dbg || DEBUG) syslog(LOG_DEBUG,"thread %d: ready to fluff",thread_id);
-  
-  // let's do it!
-  for (int i=0;i<n_512;i++) { // loop over lots of 512 bits
-
-    if (dbg) syslog(LOG_DEBUG,"thread %d: beginning fluff %d",thread_id,i);
-
-    // get input data
-    proc_m = _mm512_loadu_si512((proc_data+i*64));
-    if (dbg) syslog(LOG_DEBUG,"thread %d: copied data %d",thread_id,i);
-    
-    // retrieve masks
-    for (int j=0;j<4;j++) {
-      v[j] = _mm512_and_si512(proc_m, masks[j]);
-    }
-
-    if (dbg) syslog(LOG_DEBUG,"thread %d: masked %d",thread_id,i);
-    
-    // do in place fluffing
-    v[0] = _mm512_slli_epi16(v[0], 12);
-    v[0] = _mm512_srai_epi16(v[0], 4);
-    v[0] = _mm512_srli_epi16(v[0], 8);
-
-    v[1] = _mm512_slli_epi16(v[1], 8);
-    v[1] = _mm512_srai_epi16(v[1], 4);
-
-    v[2] = _mm512_slli_epi16(v[2], 4);
-    v[2] = _mm512_srai_epi16(v[2], 4);
-    v[2] = _mm512_srli_epi16(v[2], 8);
-
-    v[3] = _mm512_srai_epi16(v[3], 4);
-
-    if (dbg) syslog(LOG_DEBUG,"thread %d: in place %d",thread_id,i);
-
-    // make lower and upper 
-    low_m = _mm512_or_si512(v[0], v[1]);
-    hi_m = _mm512_or_si512(v[2], v[3]);
-
-    if (dbg) syslog(LOG_DEBUG,"thread %d: lower and upper %d",thread_id,i);
-
-    // copy back to bytes
-    _mm512_storeu_si512((__m512i *) &low[0], low_m);
-    _mm512_storeu_si512((__m512i *) &hi[0], hi_m);
-
-    if (dbg) syslog(LOG_DEBUG,"thread %d: copied lower and upper %d",thread_id,i);
-    
-    // extract from lower and upper into fluffed
-    // there are 32 2-byte unsigned shorts in each of low and hi
-    for (int j=0;j<32;j++) {
-      fluffed_u[i*64+j*2] = low_u[j];
-      fluffed_u[i*64+j*2+1] = hi_u[j];
-    }
-
-    if (dbg) syslog(LOG_DEBUG,"thread %d: extracted %d",thread_id,i);
-    
-  }
-
-  if (dbg || DEBUG) syslog(LOG_DEBUG,"thread %d: fluffed",thread_id);
-
-  memcpy(out + idx*(NPACKETS/nthreads)*(384*2)*NANTS*2*2,fluffed_data,(NPACKETS/nthreads)*(384*2)*NANTS*2*2);
-  
-  if (dbg || DEBUG) syslog(LOG_DEBUG,"thread %d: done - freeing",thread_id);
-  
-  // free stuff
-  free(proc_data);
-  free(fluffed_data);
-  free(low);
-  free(hi);
-  
-  /* return 0 */
-  int thread_result = 0;
-  pthread_exit((void *) &thread_result);
-  
-}
-
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-  
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_reorder_raw", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-
-  // TESTING and initialization
-  // threads
-  struct data args[16];
-  pthread_t threads[16];
-  pthread_attr_t attr;
-  pthread_attr_init(&attr);
-  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-  void* result=0;
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-
-  // data block HDU keys
-  key_t in_key = CAPTURED_BLOCK_KEY;
-  key_t out_key = REORDER_BLOCK_KEY;
-  
-  // command line arguments
-  int core = -1;
-  int nthreads = 1;
-  int bf = 0;
-  int arg = 0;
-  
-  while ((arg=getopt(argc,argv,"c:t:i:o:dqh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 't':
-	  if (optarg)
-	    {
-	      nthreads = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-t flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-
-	case 'q':
-	  syslog (LOG_INFO, "Quit here");
-	  return EXIT_SUCCESS;
-	  
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      return EXIT_FAILURE;
-    }
-
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      return EXIT_FAILURE;
-    }
-
-
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
-  uint64_t  bytes_read = 0;
-  char * block, * output_buffer;
-  output_buffer = (char *)malloc(sizeof(char)*block_out);
-  memset(output_buffer,0,block_out);
-  uint64_t written, block_id;
-
-  // set up
-
-  int observation_complete=0;
-  int blocks = 0;
-  int started = 0;
-
-
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-    }
-
-    // DO STUFF
-
-    // set up data structure
-    for (int i=0; i<nthreads; i++) {
-      args[i].in = block;
-      args[i].out = output_buffer;
-      args[i].n_threads = nthreads;
-      args[i].thread_id = i;
-      args[i].debug = 0;
-    }
-
-    if (DEBUG) syslog(LOG_DEBUG,"creating %d threads",nthreads);
-    
-    for(int i=0; i<nthreads; i++){
-      if (pthread_create(&threads[i], &attr, &massage, (void *)(&args[i]))) {
- 	syslog(LOG_ERR,"Failed to create massage thread %d\n", i);
-      }
-    }
-
-    pthread_attr_destroy(&attr);
-    if (DEBUG) syslog(LOG_DEBUG,"threads kinda running");
-    
-    for(int i=0; i<nthreads; i++){
-      pthread_join(threads[i], &result);
-      if (DEBUG) syslog(LOG_DEBUG,"joined thread %d",i);
-    }
-    
-    // write to output
-
-    written = ipcio_write (hdu_out->data_block, output_buffer, block_out);
-    	
-    
-    if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks);      
-    blocks++;
-    
-
-    if (bytes_read < block_size)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  free(output_buffer);
-
-  dsaX_dbgpu_cleanup (hdu_in,0);
-  dsaX_dbgpu_cleanup (hdu_out,1);
-  
-}
-
-
diff --git a/src/dsaX_reorder_raw.c b/src/dsaX_reorder_raw.c
deleted file mode 100644
index c0f6b0c..0000000
--- a/src/dsaX_reorder_raw.c
+++ /dev/null
@@ -1,613 +0,0 @@
-/* will reorder raw data for input to xgpu */
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-// Forward declaration to keep compiler happy
-// Possible minor bug in PSRDada
-int ipcio_check_pending_sod (ipcio_t* );
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-#include <x86intrin.h>
-#include <smmintrin.h>
-#include <immintrin.h>
-
-// data to pass to threads
-struct data {
-  char * in;
-  char * out;
-  int n_threads;
-  int thread_id;
-  int debug;
-  int write;
-  ipcio_t * ipc;
-};
-
-/* global variables */
-int DEBUG = 0;
-int cores[16] = {4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29};
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write)
-{
-
-  if (write==0) {
-  
-    if (dada_hdu_unlock_read (in) < 0)
-      {
-	syslog(LOG_ERR, "could not unlock read on hdu_in");
-      }
-    dada_hdu_destroy (in);
-
-  }
-
-  if (write==1) {
-
-    if (dada_hdu_unlock_write (in) < 0)
-      {
-	syslog(LOG_ERR, "could not unlock write on hdu_in");
-      }
-    dada_hdu_destroy (in);
-
-  }
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_reorder_raw [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -t number of threads [default 4]\n"
-	   " -b connect to bf hdu\n"
-	   " -i input key [default CAPTURED_BLOCK_KEY]\n"
-	   " -o output key [default REORDER_BLOCK_KEY]\n"
-	   " -q quitting after testing\n"
-	   " -h print usage\n");
-}
-
-/* thread for data massaging */
-void * massage(void *args) {
-
-  // basic stuff
-  struct data *d = args;
-  int thread_id = d->thread_id;
-  int na = 64; // output ants
-  int dbg = d->debug;
-     
-  // set affinity
-  const pthread_t pid = pthread_self();
-  const int core_id = cores[thread_id];
-  cpu_set_t cpuset;
-  CPU_ZERO(&cpuset);
-  CPU_SET(core_id, &cpuset);
-  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (set_result != 0)
-    syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
-  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (get_affinity != 0) 
-    syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
-  if (CPU_ISSET(core_id, &cpuset))
-    if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: successfully set thread",thread_id);
-
-  // extract from input data structure
-  char *in = (char *)d->in;
-  char *out = (char *)d->out;
-  int nthreads = d->n_threads;  
-
-  /* DO ALL PROCESSING
-   
-     "in" is input block: NPACKETS * NANTS * (384*2) * 2 pol * r/i. (384*2 is for the two times)
-     "out" needs to be in order NPACKETS * (384*2) * 64 * 2 pol * r/i
-     parallelize by splitting on NPACKETS axis. 
-
-   */
-
-  // input and output index and extracted data
-  int idx = thread_id; // PACKET idx for input and output
-  //char * proc_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2); // for 4-bit data
-  //char * fluffed_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2*2); // for 8-bit data
-  //char * out_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*(384*2)*na*2*2); // for output 8-bit data
-  
-  // extract data
-  //memcpy(proc_data,in+idx*(NPACKETS/nthreads)*NANTS*(384*2)*2,(NPACKETS/nthreads)*NANTS*(384*2)*2);
-  if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: extracted data",thread_id);
-  
-  // do fluffing in dumbest possible way
-
-  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to fluff",thread_id);
-  
-  // let's do it!
-  int in_idx, out_idx, a1, a2, a3, a4, a5, a6;
-  int in_offset = idx*(NPACKETS/nthreads)*NANTS*(384*2)*2;
-  int out_offset = idx*(NPACKETS/nthreads)*(384*2)*na*2;
-  for (int i=0;i<(NPACKETS/nthreads);i++) {
-    a1 = i*NANTS*1536;
-    a2 = i*na*1536;
-    for (int j=0;j<NANTS;j++) {
-      for (int k=0;k<768;k++) {
-	for (int l=0;l<2;l++) {
-
-	  in_idx = a1+j*1536+k*2+l;
-	  out_idx = a2+k*na*2+j*2+l;
-
-	  d->ipc->curbuf[out_offset+out_idx] = in[in_offset+in_idx];
-	  //d->ipc->curbuf[out_offset+2*out_idx+1] = in[in_offset+in_idx] >> 4;
-
-	}
-      }
-    }
-  }
-  
-  /*for (int i=0;i<(NPACKETS/nthreads)*NANTS*(384*2)*2;i++) { // loop over chars in proc_data
-
-    fluffed_data[2*i] = ((proc_data[i]<<4) & 240) >> 4;
-    fluffed_data[2*i+1] = proc_data[i] >> 4;
-    
-    }*/
-
-  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: fluffed",thread_id);
-  
-  // transpose antennas and frequencies by ints
-  // from fluffed_data to out_data
-  /* int * fluffed_int = (int *)(fluffed_data);
-  memset(out_data,0,(NPACKETS/nthreads)*(384*2)*na*2*2);
-  int * out_int = (int *)out_data;*/
-
-  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to transpose",thread_id);
-
-  // do block transpose - https://codereview.stackexchange.com/questions/229876/fast-matrix-transpose
-  /*  int tile_size = 3; // set by benchmarking
-  for (int i_packet=0;i_packet<NPACKETS/nthreads;i_packet++) {
-
-    for (int i=0;i<NANTS;i+=tile_size) {
-      for (int j=0;j<384*2;j++) {
-	for (int b=0;b<tile_size;b++) out_int[i_packet*na*768 + j*na+i+b] = fluffed_int[i_packet*NANTS*768 + (i+b)*384*2+j];
-      }
-    }
-
-    }*/
-
-  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: transposed",thread_id);
-  
-  // place in out
-  /*  if (d->write)
-    memcpy (d->ipc->curbuf + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2);
-  else
-    memcpy(out + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2);
-  */
-  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: done - freeing",thread_id);
-  
-  // free stuff
-  //free(proc_data);
-  //free(fluffed_data);
-  //free(out_data);
-  
-  /* return 0 */
-  int thread_result = 0;
-  pthread_exit((void *) &thread_result);
-  
-}
-
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-  
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_reorder_raw", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-
-  // TESTING and initialization
-  // threads
-  struct data args[16];
-  pthread_t threads[16];
-  pthread_attr_t attr;
-  pthread_attr_init(&attr);
-  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-  void* result=0;
-
-  // run test with single thread
-
-  /*syslog(LOG_INFO,"Running TEST...\n");
-  
-  // set up data structure
-  char * test_block = (char *)malloc(sizeof(char)*NPACKETS*NANTS*(384*2)*2);
-  char * test_output = (char *)malloc(sizeof(char)*NPACKETS*64*(384*2)*2*2);
-  memset(test_block,0,sizeof(test_block));
-  
-   TEST CODE 
-  FILE *fin;
-  fin=fopen("../utils/packet.out","rb");
-  fread(test_block, 96768, 1, fin);
-  fclose(fin);
-   END TEST CODE 
-  
-  args[0].in = test_block;
-  args[0].out = test_output;
-  args[0].n_threads = 1;
-  args[0].thread_id = 0;
-  args[0].debug = 0;
-  args[0].write = 0;
-
-  // run test thread
-  if (pthread_create(&threads[0], &attr, &massage, (void *)(&args[0]))) {
-    syslog(LOG_ERR,"Failed to create TEST massage thread 0\n");
-  }
-  else
-    syslog(LOG_INFO,"Created TEST thread\n");
-  pthread_attr_destroy(&attr);    
-  pthread_join(threads[0], &result);
-  syslog(LOG_INFO,"joined TEST thread");
-
-   TEST CODE 
-  fin=fopen("../utils/test.out","wb");
-  fwrite(test_output, 1, 196608, fin);
-  fclose(fin);
-  END TEST CODE 
-  
-  // clean up
-  free(test_block);
-  free(test_output);
-
-  syslog(LOG_INFO,"TEST COMPLETE");*/
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-  dada_hdu_t* hdu_out2 = 0;
-
-  // data block HDU keys
-  key_t in_key = CAPTURED_BLOCK_KEY;
-  key_t out_key = REORDER_BLOCK_KEY;
-  key_t out_key2 = REORDER_BLOCK_KEY2;
-  
-  // command line arguments
-  int core = -1;
-  int nthreads = 1;
-  int bf = 0;
-  int arg = 0;
-  
-  while ((arg=getopt(argc,argv,"c:t:i:o:dbqh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 't':
-	  if (optarg)
-	    {
-	      nthreads = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-t flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_INFO, "Will excrete all debug messages");
-	  break;
-
-	case 'q':
-	  syslog (LOG_INFO, "Quit here");
-	  return EXIT_SUCCESS;
-	  
-	case 'b':
-	  bf=1;
-	  syslog (LOG_INFO, "Will write to bf dada hdu");
-	  break;
-
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-
-  if (bf) {
-    hdu_out2  = dada_hdu_create (0);
-    dada_hdu_set_key (hdu_out2, out_key2);
-    if (dada_hdu_connect (hdu_out2) < 0) {
-      syslog (LOG_ERR,"could not connect to output  buffer2");
-      return EXIT_FAILURE;
-    }
-    if (dada_hdu_lock_write(hdu_out2) < 0) {
-      syslog (LOG_ERR, "could not lock to output buffer2");
-      return EXIT_FAILURE;
-    }
-  }
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
-      
-      
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-      //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-      return EXIT_FAILURE;
-    }
-
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-      //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-      //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-      return EXIT_FAILURE;
-    }
-
-  if (bf) {
-    header_out = ipcbuf_get_next_write (hdu_out2->header_block);
-    if (!header_out)
-      {
-	syslog(LOG_ERR, "could not get next header2 block [output]");
-	dsaX_dbgpu_cleanup (hdu_in,0);
-	dsaX_dbgpu_cleanup (hdu_out,1);
-	if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-	//dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-	return EXIT_FAILURE;
-      }
-    memcpy (header_out, header_in, header_size);
-    if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0)
-      {
-	syslog (LOG_ERR, "could not mark header block2 filled [output]");
-	dsaX_dbgpu_cleanup (hdu_in,0);
-	dsaX_dbgpu_cleanup (hdu_out,1);
-	if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
-	//dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-	return EXIT_FAILURE;
-      }
-  }
-
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
-  uint64_t  bytes_read = 0;
-  char * block, * output_buffer, * blockie;
-  output_buffer = (char *)malloc(sizeof(char)*block_out);
-  memset(output_buffer,0,block_out);
-  uint64_t written, block_id;
-
-  // set up
-
-  int observation_complete=0;
-  int blocks = 0;
-  int started = 0;
-
-
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-    }
-
-    // DO STUFF
-
-    // sort out write
-    hdu_out->data_block->curbuf = ipcbuf_get_next_write ((ipcbuf_t*)hdu_out->data_block);
-    hdu_out->data_block->marked_filled = 0;      
-    //blockie = ipcio_open_block_write(hdu_out->data_block, &block_id);
-    
-    // set up data structure
-    for (int i=0; i<nthreads; i++) {
-      args[i].in = block;
-      args[i].out = output_buffer;
-      args[i].n_threads = nthreads;
-      args[i].thread_id = i;
-      args[i].debug = 0;
-      args[i].ipc = hdu_out->data_block;
-      args[i].write = 1;
-    }
-
-    if (DEBUG) syslog(LOG_INFO,"creating %d threads",nthreads);
-    
-    for(int i=0; i<nthreads; i++){
-      if (pthread_create(&threads[i], &attr, &massage, (void *)(&args[i]))) {
- 	syslog(LOG_ERR,"Failed to create massage thread %d\n", i);
-      }
-    }
-
-    pthread_attr_destroy(&attr);
-    if (DEBUG) syslog(LOG_INFO,"threads kinda running");
-    
-    for(int i=0; i<nthreads; i++){
-      pthread_join(threads[i], &result);
-      if (DEBUG) syslog(LOG_INFO,"joined thread %d",i);
-    }
-    
-    // write to output
-
-    //written = ipcio_write (hdu_out->data_block, output_buffer, block_out);
-    
-    if (bf) {
-
-      written = ipcio_write (hdu_out2->data_block, output_buffer, block_out);
-      if (written < block_out)
-	{
-	  syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
-	  dsaX_dbgpu_cleanup (hdu_in,0);
-	  dsaX_dbgpu_cleanup (hdu_out,1);
-	  if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
-	  //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-	  return EXIT_FAILURE;
-	}
-
-    }
-
-    // finish write
-    ipcbuf_mark_filled ((ipcbuf_t*)hdu_out->data_block, block_out);
-    ipcio_check_pending_sod (hdu_out->data_block);
-    hdu_out->data_block->marked_filled = 1;      
-    //ipcio_close_block_write(hdu_out->data_block, block_out);
-    
-    if (DEBUG) syslog(LOG_INFO, "written block %d",blocks);      
-    blocks++;
-    
-
-    if (bytes_read < block_size)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  free(output_buffer);
-
-  dsaX_dbgpu_cleanup (hdu_in,0);
-  dsaX_dbgpu_cleanup (hdu_out,1);
-  if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);	  
-  //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-  
-}
-
-
diff --git a/src/dsaX_reorder_raw.c.bak b/src/dsaX_reorder_raw.c.bak
deleted file mode 100644
index 0914823..0000000
--- a/src/dsaX_reorder_raw.c.bak
+++ /dev/null
@@ -1,672 +0,0 @@
-/* will reorder raw data for input to xgpu */
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-#include <x86intrin.h>
-#include <smmintrin.h>
-#include <immintrin.h>
-
-// data to pass to threads
-struct data {
-  char * in;
-  char * out;
-  int n_threads;
-  int thread_id;
-  int debug;
-  int write;
-  ipcio_t * ipc;
-};
-
-/* global variables */
-int DEBUG = 0;
-int cores[16] = {4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29};
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write)
-{
-
-  if (write==0) {
-  
-    if (dada_hdu_unlock_read (in) < 0)
-      {
-	syslog(LOG_ERR, "could not unlock read on hdu_in");
-      }
-    dada_hdu_destroy (in);
-
-  }
-
-  if (write==1) {
-
-    if (dada_hdu_unlock_write (in) < 0)
-      {
-	syslog(LOG_ERR, "could not unlock write on hdu_in");
-      }
-    dada_hdu_destroy (in);
-
-  }
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_reorder_raw [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -t number of threads [default 4]\n"
-	   " -b connect to bf hdu\n"
-	   " -i input key [default CAPTURED_BLOCK_KEY]\n"
-	   " -o output key [default REORDER_BLOCK_KEY]\n"
-	   " -q quitting after testing\n"
-	   " -h print usage\n");
-}
-
-/* thread for data massaging */
-void * massage(void *args) {
-
-  // basic stuff
-  struct data *d = args;
-  int thread_id = d->thread_id;
-  int na = 64; // output ants
-  int dbg = d->debug;
-   
-  // masks for fluffing
-  __m512i masks[4];
-  masks[0] = _mm512_set_epi64(0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL);
-  masks[1] = _mm512_set_epi64(0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL, 0x00f000f000f000f0ULL);
-  masks[2] = _mm512_set_epi64(0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL, 0x0f000f000f000f00ULL);
-  masks[3] = _mm512_set_epi64(0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL, 0xf000f000f000f000ULL);
-
-  
-  // set affinity
-  const pthread_t pid = pthread_self();
-  const int core_id = cores[thread_id];
-  cpu_set_t cpuset;
-  CPU_ZERO(&cpuset);
-  CPU_SET(core_id, &cpuset);
-  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (set_result != 0)
-    syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
-  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (get_affinity != 0) 
-    syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
-  if (CPU_ISSET(core_id, &cpuset))
-    if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: successfully set thread",thread_id);
-
-  // extract from input data structure
-  char *in = (char *)d->in;
-  char *out = (char *)d->out;
-  int nthreads = d->n_threads;  
-
-  /* DO ALL PROCESSING
-   
-     "in" is input block: NPACKETS * NANTS * (384*2) * 2 pol * r/i. (384*2 is for the two times)
-     "out" needs to be in order NPACKETS * (384*2) * 64 * 2 pol * r/i
-     parallelize by splitting on NPACKETS axis. 
-
-   */
-
-  // input and output index and extracted data
-  int idx = thread_id; // PACKET idx for input and output
-  char * proc_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2); // for 4-bit data
-  char * fluffed_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2*2); // for 8-bit data
-  char * out_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*(384*2)*na*2*2); // for output 8-bit data
-  
-  // extract data
-  memcpy(proc_data,in+idx*(NPACKETS/nthreads)*NANTS*(384*2)*2,(NPACKETS/nthreads)*NANTS*(384*2)*2);
-  if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: extracted data",thread_id);
-  
-  // do fluffing
-
-  /* 
-     technique is to use nybble masks to 
-     (a) unmask every fourth nybble
-     (b) bit shift to left using mm512_slli_epi16
-     (c) sign extend by 4 bits using mm512_srai_epi16
-     (d) bit shift to right
-
-     Will produce m512 for lower and upper bytes. Then just need to copy into fluffed_data
-
-   */
-
-  // variables
-  char * low = (char *)malloc(sizeof(char)*64); // m512
-  char * hi = (char *)malloc(sizeof(char)*64); // m512
-  __m512i low_m, hi_m;
-  unsigned short * low_u = (unsigned short *)(low);
-  unsigned short * hi_u = (unsigned short *)(hi);
-  __m512i v[4]; // for 4 packed 4-bit numbers
-
-  // input and output
-  __m512i proc_m;
-  unsigned short * fluffed_u = (unsigned short *)(fluffed_data);
-
-  // numbers to iterate over
-  int n_512 = (NPACKETS/nthreads)*NANTS*(384*2)*2/64;
-
-  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to fluff",thread_id);
-  
-  // let's do it!
-  for (int i=0;i<n_512;i++) { // loop over lots of 512 bits
-
-    if (dbg) syslog(LOG_INFO,"thread %d: beginning fluff %d",thread_id,i);
-
-    // get input data
-    proc_m = _mm512_loadu_si512((proc_data+i*64));
-    if (dbg) syslog(LOG_INFO,"thread %d: copied data %d",thread_id,i);
-    
-    // retrieve masks
-    for (int j=0;j<4;j++) {
-      v[j] = _mm512_and_si512(proc_m, masks[j]);
-    }
-
-    if (dbg) syslog(LOG_INFO,"thread %d: masked %d",thread_id,i);
-    
-    // do in place fluffing
-    v[0] = _mm512_slli_epi16(v[0], 12);
-    v[0] = _mm512_srai_epi16(v[0], 4);
-    v[0] = _mm512_srli_epi16(v[0], 8);
-
-    v[1] = _mm512_slli_epi16(v[1], 8);
-    v[1] = _mm512_srai_epi16(v[1], 4);
-
-    v[2] = _mm512_slli_epi16(v[2], 4);
-    v[2] = _mm512_srai_epi16(v[2], 4);
-    v[2] = _mm512_srli_epi16(v[2], 8);
-
-    v[3] = _mm512_srai_epi16(v[3], 4);
-
-    if (dbg) syslog(LOG_INFO,"thread %d: in place %d",thread_id,i);
-
-    // make lower and upper 
-    low_m = _mm512_or_si512(v[0], v[1]);
-    hi_m = _mm512_or_si512(v[2], v[3]);
-
-    if (dbg) syslog(LOG_INFO,"thread %d: lower and upper %d",thread_id,i);
-
-    // copy back to bytes
-    _mm512_storeu_si512((__m512i *) &low[0], low_m);
-    _mm512_storeu_si512((__m512i *) &hi[0], hi_m);
-
-    if (dbg) syslog(LOG_INFO,"thread %d: copied lower and upper %d",thread_id,i);
-    
-    // extract from lower and upper into fluffed
-    // there are 32 2-byte unsigned shorts in each of low and hi
-    for (int j=0;j<32;j++) {
-      fluffed_u[i*64+j*2] = low_u[j];
-      fluffed_u[i*64+j*2+1] = hi_u[j];
-    }
-
-    if (dbg) syslog(LOG_INFO,"thread %d: extracted %d",thread_id,i);
-    
-  }
-
-  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: fluffed",thread_id);
-  
-  // transpose antennas and frequencies by ints
-  // from fluffed_data to out_data
-  int * fluffed_int = (int *)(fluffed_data);
-  memset(out_data,0,(NPACKETS/nthreads)*(384*2)*na*2*2);
-  int * out_int = (int *)out_data;
-
-  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to transpose",thread_id);
-
-  // do block transpose - https://codereview.stackexchange.com/questions/229876/fast-matrix-transpose
-  int tile_size = 7; // set by benchmarking
-  for (int i_packet=0;i_packet<NPACKETS/nthreads;i_packet++) {
-
-    for (int i=0;i<NANTS;i+=tile_size) {
-      for (int j=0;j<384*2;j++) {
-	for (int b=0;b<tile_size;b++) out_int[i_packet*na*768 + j*na+i+b] = fluffed_int[i_packet*NANTS*768 + (i+b)*384*2+j];
-      }
-    }
-
-  }
-
-  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: transposed",thread_id);
-  
-  // place in out
-  if (d->write)
-    memcpy (d->ipc->curbuf + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2);
-  else
-    memcpy(out + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2);
-  
-  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: done - freeing",thread_id);
-  
-  // free stuff
-  free(proc_data);
-  free(fluffed_data);
-  free(out_data);
-  free(low);
-  free(hi);
-  
-  /* return 0 */
-  int thread_result = 0;
-  pthread_exit((void *) &thread_result);
-  
-}
-
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-  
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_reorder_raw", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-
-  // TESTING and initialization
-  // threads
-  struct data args[16];
-  pthread_t threads[16];
-  pthread_attr_t attr;
-  pthread_attr_init(&attr);
-  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-  void* result=0;
-
-  // run test with single thread
-
-  syslog(LOG_INFO,"Running TEST...\n");
-  
-  // set up data structure
-  char * test_block = (char *)malloc(sizeof(char)*NPACKETS*NANTS*(384*2)*2);
-  char * test_output = (char *)malloc(sizeof(char)*NPACKETS*64*(384*2)*2*2);
-  memset(test_block,0,sizeof(test_block));
-  
-  /* TEST CODE 
-  FILE *fin;
-  fin=fopen("../utils/packet.out","rb");
-  fread(test_block, 96768, 1, fin);
-  fclose(fin);
-   END TEST CODE */
-  
-  args[0].in = test_block;
-  args[0].out = test_output;
-  args[0].n_threads = 1;
-  args[0].thread_id = 0;
-  args[0].debug = 0;
-  args[0].write = 0;
-
-  // run test thread
-  if (pthread_create(&threads[0], &attr, &massage, (void *)(&args[0]))) {
-    syslog(LOG_ERR,"Failed to create TEST massage thread 0\n");
-  }
-  else
-    syslog(LOG_INFO,"Created TEST thread\n");
-  pthread_attr_destroy(&attr);    
-  pthread_join(threads[0], &result);
-  syslog(LOG_INFO,"joined TEST thread");
-
-  /* TEST CODE 
-  fin=fopen("../utils/test.out","wb");
-  fwrite(test_output, 1, 196608, fin);
-  fclose(fin);
-  END TEST CODE */
-  
-  // clean up
-  free(test_block);
-  free(test_output);
-
-  syslog(LOG_INFO,"TEST COMPLETE");
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-  dada_hdu_t* hdu_out2 = 0;
-
-  // data block HDU keys
-  key_t in_key = CAPTURED_BLOCK_KEY;
-  key_t out_key = REORDER_BLOCK_KEY;
-  key_t out_key2 = REORDER_BLOCK_KEY2;
-  
-  // command line arguments
-  int core = -1;
-  int nthreads = 1;
-  int bf = 0;
-  int arg = 0;
-  
-  while ((arg=getopt(argc,argv,"c:t:i:o:dbqh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 't':
-	  if (optarg)
-	    {
-	      nthreads = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-t flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_INFO, "Will excrete all debug messages");
-	  break;
-
-	case 'q':
-	  syslog (LOG_INFO, "Quit here");
-	  return EXIT_SUCCESS;
-	  
-	case 'b':
-	  bf=1;
-	  syslog (LOG_INFO, "Will write to bf dada hdu");
-	  break;
-
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create ();
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create ();
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-
-  if (bf) {
-    hdu_out2  = dada_hdu_create ();
-    dada_hdu_set_key (hdu_out2, out_key2);
-    if (dada_hdu_connect (hdu_out2) < 0) {
-      syslog (LOG_ERR,"could not connect to output  buffer2");
-      return EXIT_FAILURE;
-    }
-    if (dada_hdu_lock_write(hdu_out2) < 0) {
-      syslog (LOG_ERR, "could not lock to output buffer2");
-      return EXIT_FAILURE;
-    }
-  }
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
-      
-      
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-      //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-      return EXIT_FAILURE;
-    }
-
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-      //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-      //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-      return EXIT_FAILURE;
-    }
-
-  if (bf) {
-    header_out = ipcbuf_get_next_write (hdu_out2->header_block);
-    if (!header_out)
-      {
-	syslog(LOG_ERR, "could not get next header2 block [output]");
-	dsaX_dbgpu_cleanup (hdu_in,0);
-	dsaX_dbgpu_cleanup (hdu_out,1);
-	if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-	//dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-	return EXIT_FAILURE;
-      }
-    memcpy (header_out, header_in, header_size);
-    if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0)
-      {
-	syslog (LOG_ERR, "could not mark header block2 filled [output]");
-	dsaX_dbgpu_cleanup (hdu_in,0);
-	dsaX_dbgpu_cleanup (hdu_out,1);
-	if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
-	//dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-	return EXIT_FAILURE;
-      }
-  }
-
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
-  uint64_t  bytes_read = 0;
-  char * block, * output_buffer, * blockie;
-  output_buffer = (char *)malloc(sizeof(char)*block_out);
-  memset(output_buffer,0,block_out);
-  uint64_t written, block_id;
-
-  // set up
-
-  int observation_complete=0;
-  int blocks = 0;
-  int started = 0;
-
-
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-    }
-
-    // DO STUFF
-
-    // sort out write
-    hdu_out->data_block->curbuf = ipcbuf_get_next_write ((ipcbuf_t*)hdu_out->data_block);
-    hdu_out->data_block->marked_filled = 0;      
-    //blockie = ipcio_open_block_write(hdu_out->data_block, &block_id);
-    
-    // set up data structure
-    for (int i=0; i<nthreads; i++) {
-      args[i].in = block;
-      args[i].out = output_buffer;
-      args[i].n_threads = nthreads;
-      args[i].thread_id = i;
-      args[i].debug = 0;
-      args[i].ipc = hdu_out->data_block;
-      args[i].write = 1;
-    }
-
-    if (DEBUG) syslog(LOG_INFO,"creating %d threads",nthreads);
-    
-    for(int i=0; i<nthreads; i++){
-      if (pthread_create(&threads[i], &attr, &massage, (void *)(&args[i]))) {
- 	syslog(LOG_ERR,"Failed to create massage thread %d\n", i);
-      }
-    }
-
-    pthread_attr_destroy(&attr);
-    if (DEBUG) syslog(LOG_INFO,"threads kinda running");
-    
-    for(int i=0; i<nthreads; i++){
-      pthread_join(threads[i], &result);
-      if (DEBUG) syslog(LOG_INFO,"joined thread %d",i);
-    }
-    
-    // write to output
-
-    //written = ipcio_write (hdu_out->data_block, output_buffer, block_out);
-    
-    if (bf) {
-
-      written = ipcio_write (hdu_out2->data_block, output_buffer, block_out);
-      if (written < block_out)
-	{
-	  syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
-	  dsaX_dbgpu_cleanup (hdu_in,0);
-	  dsaX_dbgpu_cleanup (hdu_out,1);
-	  if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
-	  //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-	  return EXIT_FAILURE;
-	}
-
-    }
-
-    // finish write
-    ipcbuf_mark_filled ((ipcbuf_t*)hdu_out->data_block, block_out);
-    ipcio_check_pending_sod (hdu_out->data_block);
-    hdu_out->data_block->marked_filled = 1;      
-    //ipcio_close_block_write(hdu_out->data_block, block_out);
-    
-    if (DEBUG) syslog(LOG_INFO, "written block %d",blocks);      
-    blocks++;
-    
-
-    if (bytes_read < block_size)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  free(output_buffer);
-
-  dsaX_dbgpu_cleanup (hdu_in,0);
-  dsaX_dbgpu_cleanup (hdu_out,1);
-  if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);	  
-  //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-  
-}
-
-
diff --git a/src/dsaX_reorder_raw.c.bak2 b/src/dsaX_reorder_raw.c.bak2
deleted file mode 100644
index 54ad886..0000000
--- a/src/dsaX_reorder_raw.c.bak2
+++ /dev/null
@@ -1,608 +0,0 @@
-/* will reorder raw data for input to xgpu */
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-#include <x86intrin.h>
-#include <smmintrin.h>
-#include <immintrin.h>
-
-// data to pass to threads
-struct data {
-  char * in;
-  char * out;
-  int n_threads;
-  int thread_id;
-  int debug;
-  int write;
-  ipcio_t * ipc;
-};
-
-/* global variables */
-int DEBUG = 0;
-int cores[16] = {4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29};
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write)
-{
-
-  if (write==0) {
-  
-    if (dada_hdu_unlock_read (in) < 0)
-      {
-	syslog(LOG_ERR, "could not unlock read on hdu_in");
-      }
-    dada_hdu_destroy (in);
-
-  }
-
-  if (write==1) {
-
-    if (dada_hdu_unlock_write (in) < 0)
-      {
-	syslog(LOG_ERR, "could not unlock write on hdu_in");
-      }
-    dada_hdu_destroy (in);
-
-  }
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_reorder_raw [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -t number of threads [default 4]\n"
-	   " -b connect to bf hdu\n"
-	   " -i input key [default CAPTURED_BLOCK_KEY]\n"
-	   " -o output key [default REORDER_BLOCK_KEY]\n"
-	   " -q quitting after testing\n"
-	   " -h print usage\n");
-}
-
-/* thread for data massaging */
-void * massage(void *args) {
-
-  // basic stuff
-  struct data *d = args;
-  int thread_id = d->thread_id;
-  int na = 64; // output ants
-  int dbg = d->debug;
-     
-  // set affinity
-  const pthread_t pid = pthread_self();
-  const int core_id = cores[thread_id];
-  cpu_set_t cpuset;
-  CPU_ZERO(&cpuset);
-  CPU_SET(core_id, &cpuset);
-  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (set_result != 0)
-    syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
-  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (get_affinity != 0) 
-    syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
-  if (CPU_ISSET(core_id, &cpuset))
-    if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: successfully set thread",thread_id);
-
-  // extract from input data structure
-  char *in = (char *)d->in;
-  char *out = (char *)d->out;
-  int nthreads = d->n_threads;  
-
-  /* DO ALL PROCESSING
-   
-     "in" is input block: NPACKETS * NANTS * (384*2) * 2 pol * r/i. (384*2 is for the two times)
-     "out" needs to be in order NPACKETS * (384*2) * 64 * 2 pol * r/i
-     parallelize by splitting on NPACKETS axis. 
-
-   */
-
-  // input and output index and extracted data
-  int idx = thread_id; // PACKET idx for input and output
-  char * proc_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2); // for 4-bit data
-  //char * fluffed_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*NANTS*(384*2)*2*2); // for 8-bit data
-  char * out_data = (char *)malloc(sizeof(char)*(NPACKETS/nthreads)*(384*2)*na*2*2); // for output 8-bit data
-  
-  // extract data
-  memcpy(proc_data,in+idx*(NPACKETS/nthreads)*NANTS*(384*2)*2,(NPACKETS/nthreads)*NANTS*(384*2)*2);
-  if (DEBUG || dbg) syslog(LOG_INFO,"thread %d: extracted data",thread_id);
-  
-  // do fluffing in dumbest possible way
-
-  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to fluff",thread_id);
-  
-  // let's do it!
-  int in_idx, out_idx, a1, a2, a3, a4, a5, a6;
-  for (int i=0;i<(NPACKETS/nthreads);i++) {
-    a1 = i*NANTS*1536;
-    a2 = i*na*1536;
-    for (int j=0;j<NANTS;j++) {
-      for (int k=0;k<768;k++) {
-	for (int l=0;l<2;l++) {
-
-	  in_idx = a1+j*1536+k*2+l;
-	  out_idx = a2+k*na*2+j*2+l;
-
-	  out_data[2*out_idx] = ((proc_data[in_idx]<<4) & 240) >> 4;
-	  out_data[2*out_idx+1] = proc_data[in_idx] >> 4;
-
-	}
-      }
-    }
-  }
-  
-  /*for (int i=0;i<(NPACKETS/nthreads)*NANTS*(384*2)*2;i++) { // loop over chars in proc_data
-
-    fluffed_data[2*i] = ((proc_data[i]<<4) & 240) >> 4;
-    fluffed_data[2*i+1] = proc_data[i] >> 4;
-    
-    }*/
-
-  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: fluffed",thread_id);
-  
-  // transpose antennas and frequencies by ints
-  // from fluffed_data to out_data
-  /* int * fluffed_int = (int *)(fluffed_data);
-  memset(out_data,0,(NPACKETS/nthreads)*(384*2)*na*2*2);
-  int * out_int = (int *)out_data;*/
-
-  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: ready to transpose",thread_id);
-
-  // do block transpose - https://codereview.stackexchange.com/questions/229876/fast-matrix-transpose
-  /*  int tile_size = 3; // set by benchmarking
-  for (int i_packet=0;i_packet<NPACKETS/nthreads;i_packet++) {
-
-    for (int i=0;i<NANTS;i+=tile_size) {
-      for (int j=0;j<384*2;j++) {
-	for (int b=0;b<tile_size;b++) out_int[i_packet*na*768 + j*na+i+b] = fluffed_int[i_packet*NANTS*768 + (i+b)*384*2+j];
-      }
-    }
-
-    }*/
-
-  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: transposed",thread_id);
-  
-  // place in out
-  if (d->write)
-    memcpy (d->ipc->curbuf + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2);
-  else
-    memcpy(out + idx*(NPACKETS/nthreads)*(384*2)*na*2*2,out_data,(NPACKETS/nthreads)*(384*2)*na*2*2);
-  
-  if (dbg || DEBUG) syslog(LOG_INFO,"thread %d: done - freeing",thread_id);
-  
-  // free stuff
-  free(proc_data);
-  //free(fluffed_data);
-  free(out_data);
-  
-  /* return 0 */
-  int thread_result = 0;
-  pthread_exit((void *) &thread_result);
-  
-}
-
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-  
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_reorder_raw", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-
-  // TESTING and initialization
-  // threads
-  struct data args[16];
-  pthread_t threads[16];
-  pthread_attr_t attr;
-  pthread_attr_init(&attr);
-  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-  void* result=0;
-
-  // run test with single thread
-
-  syslog(LOG_INFO,"Running TEST...\n");
-  
-  // set up data structure
-  char * test_block = (char *)malloc(sizeof(char)*NPACKETS*NANTS*(384*2)*2);
-  char * test_output = (char *)malloc(sizeof(char)*NPACKETS*64*(384*2)*2*2);
-  memset(test_block,0,sizeof(test_block));
-  
-  /* TEST CODE 
-  FILE *fin;
-  fin=fopen("../utils/packet.out","rb");
-  fread(test_block, 96768, 1, fin);
-  fclose(fin);
-   END TEST CODE */
-  
-  args[0].in = test_block;
-  args[0].out = test_output;
-  args[0].n_threads = 1;
-  args[0].thread_id = 0;
-  args[0].debug = 0;
-  args[0].write = 0;
-
-  // run test thread
-  if (pthread_create(&threads[0], &attr, &massage, (void *)(&args[0]))) {
-    syslog(LOG_ERR,"Failed to create TEST massage thread 0\n");
-  }
-  else
-    syslog(LOG_INFO,"Created TEST thread\n");
-  pthread_attr_destroy(&attr);    
-  pthread_join(threads[0], &result);
-  syslog(LOG_INFO,"joined TEST thread");
-
-  /* TEST CODE 
-  fin=fopen("../utils/test.out","wb");
-  fwrite(test_output, 1, 196608, fin);
-  fclose(fin);
-  END TEST CODE */
-  
-  // clean up
-  free(test_block);
-  free(test_output);
-
-  syslog(LOG_INFO,"TEST COMPLETE");
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-  dada_hdu_t* hdu_out2 = 0;
-
-  // data block HDU keys
-  key_t in_key = CAPTURED_BLOCK_KEY;
-  key_t out_key = REORDER_BLOCK_KEY;
-  key_t out_key2 = REORDER_BLOCK_KEY2;
-  
-  // command line arguments
-  int core = -1;
-  int nthreads = 1;
-  int bf = 0;
-  int arg = 0;
-  
-  while ((arg=getopt(argc,argv,"c:t:i:o:dbqh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 't':
-	  if (optarg)
-	    {
-	      nthreads = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-t flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_INFO, "Will excrete all debug messages");
-	  break;
-
-	case 'q':
-	  syslog (LOG_INFO, "Quit here");
-	  return EXIT_SUCCESS;
-	  
-	case 'b':
-	  bf=1;
-	  syslog (LOG_INFO, "Will write to bf dada hdu");
-	  break;
-
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create ();
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create ();
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-
-  if (bf) {
-    hdu_out2  = dada_hdu_create ();
-    dada_hdu_set_key (hdu_out2, out_key2);
-    if (dada_hdu_connect (hdu_out2) < 0) {
-      syslog (LOG_ERR,"could not connect to output  buffer2");
-      return EXIT_FAILURE;
-    }
-    if (dada_hdu_lock_write(hdu_out2) < 0) {
-      syslog (LOG_ERR, "could not lock to output buffer2");
-      return EXIT_FAILURE;
-    }
-  }
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
-      
-      
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-      //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-      return EXIT_FAILURE;
-    }
-
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-      //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-      //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-      return EXIT_FAILURE;
-    }
-
-  if (bf) {
-    header_out = ipcbuf_get_next_write (hdu_out2->header_block);
-    if (!header_out)
-      {
-	syslog(LOG_ERR, "could not get next header2 block [output]");
-	dsaX_dbgpu_cleanup (hdu_in,0);
-	dsaX_dbgpu_cleanup (hdu_out,1);
-	if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-	//dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-	return EXIT_FAILURE;
-      }
-    memcpy (header_out, header_in, header_size);
-    if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0)
-      {
-	syslog (LOG_ERR, "could not mark header block2 filled [output]");
-	dsaX_dbgpu_cleanup (hdu_in,0);
-	dsaX_dbgpu_cleanup (hdu_out,1);
-	if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
-	//dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-	return EXIT_FAILURE;
-      }
-  }
-
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
-  uint64_t  bytes_read = 0;
-  char * block, * output_buffer, * blockie;
-  output_buffer = (char *)malloc(sizeof(char)*block_out);
-  memset(output_buffer,0,block_out);
-  uint64_t written, block_id;
-
-  // set up
-
-  int observation_complete=0;
-  int blocks = 0;
-  int started = 0;
-
-
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-    }
-
-    // DO STUFF
-
-    // sort out write
-    hdu_out->data_block->curbuf = ipcbuf_get_next_write ((ipcbuf_t*)hdu_out->data_block);
-    hdu_out->data_block->marked_filled = 0;      
-    //blockie = ipcio_open_block_write(hdu_out->data_block, &block_id);
-    
-    // set up data structure
-    for (int i=0; i<nthreads; i++) {
-      args[i].in = block;
-      args[i].out = output_buffer;
-      args[i].n_threads = nthreads;
-      args[i].thread_id = i;
-      args[i].debug = 0;
-      args[i].ipc = hdu_out->data_block;
-      args[i].write = 1;
-    }
-
-    if (DEBUG) syslog(LOG_INFO,"creating %d threads",nthreads);
-    
-    for(int i=0; i<nthreads; i++){
-      if (pthread_create(&threads[i], &attr, &massage, (void *)(&args[i]))) {
- 	syslog(LOG_ERR,"Failed to create massage thread %d\n", i);
-      }
-    }
-
-    pthread_attr_destroy(&attr);
-    if (DEBUG) syslog(LOG_INFO,"threads kinda running");
-    
-    for(int i=0; i<nthreads; i++){
-      pthread_join(threads[i], &result);
-      if (DEBUG) syslog(LOG_INFO,"joined thread %d",i);
-    }
-    
-    // write to output
-
-    //written = ipcio_write (hdu_out->data_block, output_buffer, block_out);
-    
-    if (bf) {
-
-      written = ipcio_write (hdu_out2->data_block, output_buffer, block_out);
-      if (written < block_out)
-	{
-	  syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
-	  dsaX_dbgpu_cleanup (hdu_in,0);
-	  dsaX_dbgpu_cleanup (hdu_out,1);
-	  if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
-	  //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-	  return EXIT_FAILURE;
-	}
-
-    }
-
-    // finish write
-    ipcbuf_mark_filled ((ipcbuf_t*)hdu_out->data_block, block_out);
-    ipcio_check_pending_sod (hdu_out->data_block);
-    hdu_out->data_block->marked_filled = 1;      
-    //ipcio_close_block_write(hdu_out->data_block, block_out);
-    
-    if (DEBUG) syslog(LOG_INFO, "written block %d",blocks);      
-    blocks++;
-    
-
-    if (bytes_read < block_size)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  free(output_buffer);
-
-  dsaX_dbgpu_cleanup (hdu_in,0);
-  dsaX_dbgpu_cleanup (hdu_out,1);
-  if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);	  
-  //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-  
-}
-
-
diff --git a/src/dsaX_simplesplit.c b/src/dsaX_simplesplit.c
deleted file mode 100644
index 7a80c7e..0000000
--- a/src/dsaX_simplesplit.c
+++ /dev/null
@@ -1,362 +0,0 @@
-/* will reorder raw data for input to xgpu */
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-/* global variables */
-int DEBUG = 0;
-
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write)
-{
-
-  if (write==0) {
-  
-    if (dada_hdu_unlock_read (in) < 0)
-      {
-	syslog(LOG_ERR, "could not unlock read on hdu_in");
-      }
-    dada_hdu_destroy (in);
-
-  }
-
-  if (write==1) {
-
-    if (dada_hdu_unlock_write (in) < 0)
-      {
-	syslog(LOG_ERR, "could not unlock write on hdu_in");
-      }
-    dada_hdu_destroy (in);
-
-  }
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_split [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -b connect to bf hdu\n"
-	   " -i in_key [default CAPTURE_BLOCK_KEY]\n"
-	   " -o out_key [default CAPTURED_BLOCK_KEY]\n"
-	   " -j out_key2 [default REORDER_BLOCK_KEY2]\n"
-	   " -h print usage\n");
-}
-
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-  
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_simplesplit", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-  dada_hdu_t* hdu_out2 = 0;
-
-  // data block HDU keys
-  key_t in_key = CAPTURE_BLOCK_KEY;
-  key_t out_key = CAPTURED_BLOCK_KEY;
-  key_t out_key2 = REORDER_BLOCK_KEY2;
-  
-  // command line arguments
-  int core = -1;
-  int bf = 0;
-  int arg = 0;
-  
-  while ((arg=getopt(argc,argv,"c:i:o:j:dbh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'j':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key2) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-j flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'b':
-	  bf=1;
-	  syslog (LOG_INFO, "Will write to bf dada hdu");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-
-  if (bf) {
-    hdu_out2  = dada_hdu_create (0);
-    dada_hdu_set_key (hdu_out2, out_key2);
-    if (dada_hdu_connect (hdu_out2) < 0) {
-      syslog (LOG_ERR,"could not connect to output  buffer2");
-      return EXIT_FAILURE;
-    }
-    if (dada_hdu_lock_write(hdu_out2) < 0) {
-      syslog (LOG_ERR, "could not lock to output buffer2");
-      return EXIT_FAILURE;
-    }
-  }
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
-      
-      
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-      return EXIT_FAILURE;
-    }
-
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-      return EXIT_FAILURE;
-    }
-
-  if (bf) {
-    header_out = ipcbuf_get_next_write (hdu_out2->header_block);
-    if (!header_out)
-      {
-	syslog(LOG_ERR, "could not get next header2 block [output]");
-	dsaX_dbgpu_cleanup (hdu_in,0);
-	dsaX_dbgpu_cleanup (hdu_out,1);
-	if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-	return EXIT_FAILURE;
-      }
-    memcpy (header_out, header_in, header_size);
-    if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0)
-      {
-	syslog (LOG_ERR, "could not mark header block2 filled [output]");
-	dsaX_dbgpu_cleanup (hdu_in,0);
-	dsaX_dbgpu_cleanup (hdu_out,1);
-	if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
-	return EXIT_FAILURE;
-      }
-  }
-
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
-  uint64_t  bytes_read = 0;
-  char * block, * output_buffer, * o1, * o2;
-  output_buffer = (char *)malloc(sizeof(char)*block_out);
-  char * output = (char *)malloc(sizeof(char)*block_out);
-  memset(output_buffer,0,block_out);
-  uint64_t written, block_id;
-
-  
-  
-  // set up
-
-  int observation_complete=0;
-  int blocks = 0;
-  int started = 0;
-
-
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-    }
-
-    
-    // DO STUFF
-
-        
-    // copy to output buffer
-    memcpy(output_buffer, block, block_size);      
-
-    // do write
-    written = ipcio_write (hdu_out->data_block, output_buffer, block_out);
-    if (bf) 
-      written = ipcio_write (hdu_out2->data_block, output_buffer, block_out);
-    
-    if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks);      
-    blocks++;
-    
-
-    if (bytes_read < block_size)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  free(output_buffer);
-  free(output);
-  dsaX_dbgpu_cleanup (hdu_in,0);
-  dsaX_dbgpu_cleanup (hdu_out,1);
-  if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);	  
-  
-}
-
-
diff --git a/src/dsaX_splice.c b/src/dsaX_splice.c
deleted file mode 100644
index b91e665..0000000
--- a/src/dsaX_splice.c
+++ /dev/null
@@ -1,201 +0,0 @@
-/* This works pretty much like the trigger code. receives a control UDP message 
-to store some data for a fixed amount of time.
-Message format: length(s)-NAME
-Will ignore messages until data recording is over
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <netinet/in.h>
-#include <time.h>
-#include <arpa/inet.h>
-#include <sys/syscall.h>
-#include <syslog.h>
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "multilog.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-#include <src/sigproc.h>
-#include <src/header.h>
-
-
-FILE *output;
-
-void send_string(char *string) /* includefile */
-{
-  int len;
-  len=strlen(string);
-  fwrite(&len, sizeof(int), 1, output);
-  fwrite(string, sizeof(char), len, output);
-}
-
-void send_float(char *name,float floating_point) /* includefile */
-{
-  send_string(name);
-  fwrite(&floating_point,sizeof(float),1,output);
-}
-
-void send_double (char *name, double double_precision) /* includefile */
-{
-  send_string(name);
-  fwrite(&double_precision,sizeof(double),1,output);
-}
-
-void send_int(char *name, int integer) /* includefile */
-{
-  send_string(name);
-  fwrite(&integer,sizeof(int),1,output);
-}
-
-void send_char(char *name, char integer) /* includefile */
-{
-  send_string(name);
-  fwrite(&integer,sizeof(char),1,output);
-}
-
-
-void send_long(char *name, long integer) /* includefile */
-{
-  send_string(name);
-  fwrite(&integer,sizeof(long),1,output);
-}
-
-void send_coords(double raj, double dej, double az, double za) /*includefile*/
-{
-  if ((raj != 0.0) || (raj != -1.0)) send_double("src_raj",raj);
-  if ((dej != 0.0) || (dej != -1.0)) send_double("src_dej",dej);
-  if ((az != 0.0)  || (az != -1.0))  send_double("az_start",az);
-  if ((za != 0.0)  || (za != -1.0))  send_double("za_start",za);
-}
-
-
-/* global variables */
-int quit_threads = 0;
-int dump_pending = 0;
-int trignum = 0;
-int dumpnum = 0;
-char iP[100];
-char srcnam[1024];
-float reclen;
-int DEBUG = 0;
-
-void usage()
-{
-  fprintf (stdout, "dsaX_splice [16 files]\n");
-}
-
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_splice", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-
-  // set up input array
-  // 16 corrs, 3840 times, 256 beams, 48 chans
-  char * bigarr = (char *)malloc(sizeof(char)*16*3840*256*48);
-  char foutnam[200];
-
-  // read into input array
-  FILE *fin;
-  for (int i=1;i<17;i++) {
-    fin=fopen(argv[i],"rb");
-    fread(bigarr+(i-1)*3840*256*48,3840*256*48,1,fin);
-    fclose(fin);
-  }
-
-  // reorder bigarr
-  char * tarr = (char *)malloc(sizeof(char)*16*3840*256*48);
-  int oidx, iidx;
-  // order is beam, time, freq
-  for (int i=0;i<16;i++) {
-    for (int j=0;j<3840;j++) {
-      for (int k=0;k<256;k++) {
-
-	iidx = i*3840*256*48 + j*256*48 + k*48;
-	oidx = k*3840*768 + j*768 + i*48;
-	memcpy(tarr + oidx, bigarr + iidx, 48);
-
-      }
-    }
-  }
-  free(bigarr);
-
-  // loop over beams and write out all filterbanks
-  for (int i=0;i<256;i++) {
-    
-    sprintf(foutnam,"/home/ubuntu/data/fb_%d.fil",i);    
-    
-    if (!(output = fopen(foutnam,"wb"))) {
-      printf("Couldn't open output file\n");
-      return 0;
-    }
-    
-    send_string("HEADER_START");
-    send_string("source_name");
-    sprintf(srcnam,"fb_%d",i);
-    send_string(srcnam);
-    send_int("machine_id",1);
-    send_int("telescope_id",82);
-    send_int("data_type",1); // filterbank data
-    send_double("fch1",1498.75); // THIS IS CHANNEL 0 :)
-    send_double("foff",-0.244140625);
-    send_int("nchans",768);
-    send_int("nbits",8);
-    send_double("tstart",55000.0);
-    send_double("tsamp",8.192e-6*8.*16.);
-    send_int("nifs",1);
-    send_string("HEADER_END");
-
-    fwrite(tarr + i*2949120,2949120,1,output);
-    fclose(output);
-
-  }
-
-  // write out full filterbank
-  sprintf(foutnam,"/home/ubuntu/data/fb_all.fil");    
-  
-  if (!(output = fopen(foutnam,"wb"))) {
-    printf("Couldn't open output file\n");
-    return 0;
-  }
-    
-  send_string("HEADER_START");
-  send_string("source_name");
-  sprintf(srcnam,"fb_all");
-  send_string(srcnam);
-  send_int("machine_id",1);
-  send_int("telescope_id",82);
-  send_int("data_type",1); // filterbank data
-  send_double("fch1",1498.75); // THIS IS CHANNEL 0 :)
-  send_double("foff",-0.244140625);
-  send_int("nchans",768);
-  send_int("nbits",8);
-  send_double("tstart",55000.0);
-  send_double("tsamp",8.192e-6*8.*16.);
-  send_int("nifs",1);
-  send_string("HEADER_END");
-  
-  fwrite(tarr,16*3840*256*48,1,output);
-  fclose(output);
-
-  
-  free(tarr);
-  
-}
diff --git a/src/dsaX_split.c b/src/dsaX_split.c
deleted file mode 100644
index 1361e86..0000000
--- a/src/dsaX_split.c
+++ /dev/null
@@ -1,601 +0,0 @@
-/* will reorder raw data for input to xgpu */
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-/* global variables */
-int DEBUG = 0;
-int STATS = 0;
-const int nth = 4;
-
-// data to pass to threads
-struct data {
-  char * in;
-  char * out;
-  char * out2;
-  int bf;
-  int reorder;
-  int n_threads;
-  int thread_id;
-};
-int cores[8] = {10, 11, 12, 13, 14, 15, 16, 17};
-
-
-void * massage (void *args) {
-
-  struct data *d = args;
-  int thread_id = d->thread_id;
-
-  // set affinity
-  const pthread_t pid = pthread_self();
-  const int core_id = cores[thread_id];
-  cpu_set_t cpuset;
-  CPU_ZERO(&cpuset);
-  CPU_SET(core_id, &cpuset);
-  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (set_result != 0)
-    syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
-  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (get_affinity != 0) 
-    syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
-  if (CPU_ISSET(core_id, &cpuset))
-    if (DEBUG) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id);
-
-  // extract from input
-  char *in = (char *)d->in;
-  int bf = d->bf;
-  int reorder = d->reorder;
-  int n_threads = d->n_threads;  
-  
-  if (!reorder) {
-    memcpy(d->out + thread_id*(2048/n_threads)*1536*NANT, in + thread_id*(2048/n_threads)*1536*NANT, (2048/n_threads)*1536*NANT);
-    if (bf)
-      memcpy(d->out2 + thread_id*(2048/n_threads)*1536*NANT, in + thread_id*(2048/n_threads)*1536*NANT, (2048/n_threads)*1536*NANT);
-  }
-  else {
-  
-    // block for transpose
-    int block = 16;
-  
-    for (int i=(int)(thread_id*(2048/n_threads));i<(int)((thread_id + 1)*2048/n_threads);i++) { // over time
-      for (int i1 = 0; i1 < 48; i1 += block) {
-	for(int j = 0; j < NANT; j++) {
-	  for(int b = 0; b < block && i1 + b < 48; b++) {
-	    memcpy(d->out + i*1536*NANT + (i1+b)*NANT*32 + j*32, in + i*1536*NANT + j*1536 + (i1+b)*32, 32);
-	    if (bf) memcpy(d->out2 + i*1536*NANT + (i1+b)*NANT*32 + j*32, in + i*1536*NANT + j*1536 + (i1+b)*32, 32);
-	  }
-	}
-      }
-    }    
-
-  }
-    
-  /* return 0 */
-  int thread_result = 0;
-  pthread_exit((void *) &thread_result);
-  
-}
-
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write);
-int dada_bind_thread_to_core (int core);
-void reorder_block(char *block, char *output);
-void calc_stats(char *block);
-
-// calculates rms for each pol from the first packet in each block. 
-// block has shape [2048 time, NANT antennas, 768 channels, 2 pol, r/i]
-void calc_stats(char *input) {
-
-  float rmss[NANT*2];
-  int iidx;
-  for (int i=0;i<NANT*2;i++) rmss[i] = 0.;
-
-  for (int ant=0;ant<NANT;ant++) {
-    for (int chan=0;chan<768;chan++) {
-      for (int pol=0;pol<2;pol++) {
-
-	iidx = ant*1536+chan*2+pol;
-	
-	rmss[ant*2+pol] += pow((float)(((char)((input[iidx] & 15) << 4)) >> 4),2.);
-	rmss[ant*2+pol] += pow((float)(((char)((input[iidx] & 240))) >> 4),2.);
-
-      }
-    }
-  }
-
-  for (int i=0;i<NANT;i++) {
-    if (STATS) syslog(LOG_INFO,"RMS_ant_2pol %d %g %g",i,sqrt(rmss[2*i]/768.0),sqrt(rmss[2*i+1]/768.0));
-  }
-
-}
-
-// performs cpu reorder of block to be loaded to GPU
-void reorder_block(char * block, char * output) {
-
-  // from [2048 time, NANT antennas, 48 channels, 16 chunnels, 2 pol, r/i]
-  // to [2048 time, 48 channels, NANT antennas, 16 chunnels, 2 pol, r/i]
-  // 24576*NANT in total. 1536*NANT per time
-  
-  for (int i=0;i<2048;i++) { // over time
-    for (int k=0;k<48;k++) { // over channels
-      for (int j=0;j<NANT;j++) { // over ants
-	// copy 32 bytes
-	memcpy(output + i*1536*NANT + k*NANT*32 + j*32, block + i*1536*NANT + j*1536 + k*32, 32); 
-	
-      }
-    }
-  }
-
-  //memcpy(block,output,24576*NANT);
-
-}
-
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write)
-{
-
-  if (write==0) {
-  
-    if (dada_hdu_unlock_read (in) < 0)
-      {
-	syslog(LOG_ERR, "could not unlock read on hdu_in");
-      }
-    dada_hdu_destroy (in);
-
-  }
-
-  if (write==1) {
-
-    if (dada_hdu_unlock_write (in) < 0)
-      {
-	syslog(LOG_ERR, "could not unlock write on hdu_in");
-      }
-    dada_hdu_destroy (in);
-
-  }
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_split [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -m multithread write\n"
-	   " -b connect to bf hdu\n"
-	   " -r reorder\n"
-	   " -i in_key [default CAPTURE_BLOCK_KEY]\n"
-	   " -o out_key [default CAPTURED_BLOCK_KEY]\n"
-	   " -j out_key2 [default REORDER_BLOCK_KEY2]\n"
-	   " -s stats\n"
-	   " -f send fake blocks through [default 0]\n"
-	   " -h print usage\n");
-}
-
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-  
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_split", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-  dada_hdu_t* hdu_out2 = 0;
-
-  // data block HDU keys
-  key_t in_key = CAPTURE_BLOCK_KEY;
-  key_t out_key = CAPTURED_BLOCK_KEY;
-  key_t out_key2 = REORDER_BLOCK_KEY2;
-  
-  // command line arguments
-  int core = -1;
-  int bf = 0;
-  int arg = 0;
-  int reorder = 0;
-  int mwrite = 0;
-  int fake = 0;
-  
-  while ((arg=getopt(argc,argv,"c:i:o:j:f:smdbrh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'f':
-	  if (optarg)
-	    {
-	      fake = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-f flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'j':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key2) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-j flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'r':
-	  reorder=1;
-	  syslog (LOG_INFO, "Will do reorder");
-	  break;
-	case 'm':
-	  mwrite=1;
-	  syslog (LOG_INFO, "Will do multithread write");
-	  break;
-	case 's':
-	  STATS=1;
-	  syslog (LOG_INFO, "Will print stats");
-	  break;
-	case 'b':
-	  bf=1;
-	  syslog (LOG_INFO, "Will write to bf dada hdu");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-
-  if (bf) {
-    hdu_out2  = dada_hdu_create (0);
-    dada_hdu_set_key (hdu_out2, out_key2);
-    if (dada_hdu_connect (hdu_out2) < 0) {
-      syslog (LOG_ERR,"could not connect to output  buffer2");
-      return EXIT_FAILURE;
-    }
-    if (dada_hdu_lock_write(hdu_out2) < 0) {
-      syslog (LOG_ERR, "could not lock to output buffer2");
-      return EXIT_FAILURE;
-    }
-  }
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
-      
-      
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-      return EXIT_FAILURE;
-    }
-
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-      return EXIT_FAILURE;
-    }
-
-  if (bf) {
-    header_out = ipcbuf_get_next_write (hdu_out2->header_block);
-    if (!header_out)
-      {
-	syslog(LOG_ERR, "could not get next header2 block [output]");
-	dsaX_dbgpu_cleanup (hdu_in,0);
-	dsaX_dbgpu_cleanup (hdu_out,1);
-	if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-	return EXIT_FAILURE;
-      }
-    memcpy (header_out, header_in, header_size);
-    if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0)
-      {
-	syslog (LOG_ERR, "could not mark header block2 filled [output]");
-	dsaX_dbgpu_cleanup (hdu_in,0);
-	dsaX_dbgpu_cleanup (hdu_out,1);
-	if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
-	return EXIT_FAILURE;
-      }
-  }
-
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  uint64_t nints = block_size / block_out;
-  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
-  uint64_t  bytes_read = 0;
-  char * block, * output_buffer, * o1, * o2;
-  output_buffer = (char *)malloc(sizeof(char)*block_out);
-  char * output = (char *)malloc(sizeof(char)*block_out);
-  memset(output_buffer,0,block_out);
-  uint64_t written, block_id;
-
-  // set up threads
-  struct data args[8];
-  pthread_t threads[8];
-  pthread_attr_t attr;
-  pthread_attr_init(&attr);
-  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-  void* result=0;
-  
-  // send through fake blocks
-
-  if (fake>0) {
-    syslog(LOG_INFO,"sending %d fake blocks",fake);
-    for (int i=0;i<fake;i++) {
-      o1 = ipcio_open_block_write (hdu_out->data_block, &block_id);
-      memcpy(o1, output, block_out);
-      ipcio_close_block_write (hdu_out->data_block, block_out);
-      usleep(10000);
-    }
-    syslog(LOG_INFO,"Finished with fake blocks");
-  }
-  
-  
-  
-  // set up
-
-  int observation_complete=0;
-  int blocks = 0;
-  int started = 0;
-
-
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // open block
-    
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-    }
-
-    
-    // DO STUFF
-
-    for (int myint=0;myint<nints;myint++) {
-        
-      // copy to output buffer
-                  
-      memcpy(output_buffer, block + myint*block_out, block_out);      
-
-      if (mwrite) {
-	o1 = ipcio_open_block_write (hdu_out->data_block, &block_id);
-	if (bf) o2 = ipcio_open_block_write (hdu_out2->data_block, &block_id);
-      }
-      
-      // stats
-      if (STATS) calc_stats(output_buffer);
-      
-      //if (reorder) {
-      
-      // set up data structure
-      for (int i=0; i<nth; i++) {
-	args[i].in = output_buffer;
-	args[i].reorder = reorder;
-	args[i].bf = 0;
-	if (mwrite) {
-	  args[i].out = o1;	
-	  if (bf) {
-	    args[i].out2 = o2;
-	    args[i].bf = 1;
-	  }
-	}
-	else
-	  args[i].out = output;
-	args[i].n_threads = nth;
-	args[i].thread_id = i;
-      }
-      
-      //if (DEBUG) syslog(LOG_DEBUG,"creating %d threads",nth);
-      syslog(LOG_INFO, "creating threads");
-      
-      for(int i=0; i<nth; i++){
-	if (pthread_create(&threads[i], &attr, &massage, (void *)(&args[i]))) {
-	  syslog(LOG_ERR,"Failed to create massage thread %d\n", i);
-	}
-      }
-      
-      pthread_attr_destroy(&attr);
-      if (DEBUG) syslog(LOG_DEBUG,"threads kinda running");
-      
-      for(int i=0; i<nth; i++){
-	pthread_join(threads[i], &result);
-	if (DEBUG) syslog(LOG_DEBUG,"joined thread %d",i);
-      }
-      
-      
-      if (!mwrite) {
-	if (reorder && (!bf))
-	  written = ipcio_write (hdu_out->data_block, output, block_out);
-	else 
-	  written = ipcio_write (hdu_out->data_block, output_buffer, block_out);
-	
-	if (bf) {
-	  written = ipcio_write (hdu_out->data_block, output_buffer, block_out);
-	  if (reorder)
-	    written = ipcio_write (hdu_out2->data_block, output, block_out);
-	  else
-	    written = ipcio_write (hdu_out2->data_block, output_buffer, block_out);
-	}
-      }
-      else {
-	ipcio_close_block_write (hdu_out->data_block, block_out);
-	if (bf) ipcio_close_block_write (hdu_out2->data_block, block_out);
-      }
-      
-      if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks);      
-      blocks++;
-      
-      
-      if (bytes_read < block_size)
-	observation_complete = 1;            
-      
-    }
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  free(output_buffer);
-  free(output);
-  dsaX_dbgpu_cleanup (hdu_in,0);
-  dsaX_dbgpu_cleanup (hdu_out,1);
-  if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);	  
-  
-}
-
-
diff --git a/src/dsaX_splitup.c b/src/dsaX_splitup.c
deleted file mode 100644
index 32f055d..0000000
--- a/src/dsaX_splitup.c
+++ /dev/null
@@ -1,285 +0,0 @@
-/* will reorder raw data for input to xgpu */
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-// global variables
-int DEBUG = 0;
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
-{
-
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
-  dada_hdu_destroy (in);
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock write on hdu_out");
-    }
-  dada_hdu_destroy (out);
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_fake [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -i in_key [default TEST_BLOCK_KEY]\n"
-	   " -o out_key [default REORDER_BLOCK_KEY2]\n"
-	   " -h print usage\n");
-}
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_splitup", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-
-  // data block HDU keys
-  key_t in_key = TEST_BLOCK_KEY;
-  key_t out_key = REORDER_BLOCK_KEY2;
-  
-  // command line arguments
-  int core = -1;
-  int useZ = 1;
-  char fnam[100];
-  int arg = 0;
-  
-  while ((arg=getopt(argc,argv,"c:f:i:o:dh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
-  uint64_t  bytes_read = 0;
-  uint64_t nsplits = block_size/block_out;
-  char * block, * output_buffer;
-  output_buffer = (char *)malloc(sizeof(char)*block_out);
-  memset(output_buffer,0,block_out);
-  uint64_t written, block_id;
-
-
-  // set up
-
-  int observation_complete=0;
-  int blocks = 0, started = 0;
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-    }
-
-    // DO STUFF
-
-    // do multiple writes
-
-    for (uint64_t i=0;i<nsplits;i++) {
-
-      memcpy(output_buffer,block+i*block_out,block_out);
-
-      // write to output
-      written = ipcio_write (hdu_out->data_block, output_buffer, block_out);
-      if (written < block_out)
-	{
-	  syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
-	  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-	  return EXIT_FAILURE;
-	}
-      
-      if (DEBUG) {
-	syslog(LOG_DEBUG, "written block %d",blocks);      
-      }
-      blocks++;
-
-    }
-
-    if (bytes_read < block_size)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  free(output_buffer);
-  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-  
-}
-
-
diff --git a/src/dsaX_store.c b/src/dsaX_store.c
deleted file mode 100644
index 849c27c..0000000
--- a/src/dsaX_store.c
+++ /dev/null
@@ -1,218 +0,0 @@
-/* Code to read from a raw data buffer and write to disk */
-
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <syslog.h>
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in)
-{
-  
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
-  dada_hdu_destroy (in);
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_dbdisk [options]\n"
-	   " -c core   bind process to CPU core\n"
-	   " -k in_key [default fafa]\n"
-	   " -h print usage\n");
-}
-
-
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_store", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  
-  // input data block HDU key
-  key_t in_key = 0x0000fafa;
-
-  // command line arguments
-  uint64_t blocksize;
-  uint64_t bout = 32*NSNAPS*4608; // output block size - assume input is a multiple.
-  int core = -1;
-  int arg=0;
-
-  while ((arg=getopt(argc,argv,"c:k:h")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      printf ("ERROR: -c flag requires argument\n");
-	      return EXIT_FAILURE;
-	    }
-	case 'k':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-k flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // DADA stuff
-
-  // open connection to the in/read DB
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to input buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"dsaX_correlator_copy: could not lock to input buffer");
-    return EXIT_FAILURE;
-  }
-  
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      syslog(LOG_INFO,"binding to core %d", core);
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"dsaX_correlator_copy: failed to bind to core %d",core);
-    }
-  
-  // more DADA stuff - deal with headers
-  
-  uint64_t header_size = 0;
-
-  // read the header from the input HDU
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "main: could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in);
-      return EXIT_FAILURE;
-    }
-  
-  // mark the input header as cleared
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared [input]");
-      dsaX_dbgpu_cleanup (hdu_in);
-      return EXIT_FAILURE;
-    }
-
-  int observation_complete=0;
-
-  // stuff for writing data
-  blocksize = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  char * cpbuf = (char *)malloc(sizeof(char)*blocksize);
-  char * outbuf = (char *)malloc(sizeof(char)*bout);
-  int ngulps = (int)(blocksize/bout);
-  int gulp = 0, wseq = 0;;
-  char *in_data;
-  uint64_t written=0, written2=0;
-  uint64_t block_id, bytes_read=0;
-  FILE *fout;
-  char fnam[100];
-  
-
-  syslog(LOG_INFO, "have ngulps %d, blocksize %lu, bout %lu",ngulps,blocksize,bout);
-
-  
-  // main reading loop
-
-  syslog(LOG_INFO, "main: starting read");
-
-  while (!observation_complete) {
-
-    // read a DADA block
-    in_data = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-    // copy
-    memcpy(cpbuf, in_data, blocksize);
-    syslog(LOG_INFO, "starting new write (seq %d)",wseq);
-
-    // open file for writing
-    sprintf(fnam,"/home/ubuntu/data/fl_%d.out",wseq);
-    fout = fopen(fnam,"wb");
-    for (gulp=0;gulp<ngulps;gulp++) {
-
-      // copy to outbuf
-      memcpy(outbuf, cpbuf+gulp*bout, bout);
-
-      // write
-      usleep(40000);
-      fwrite(outbuf, 1, bout, fout);
-
-    }
-    fclose(fout);
-    wseq++;
-    syslog(LOG_INFO, "main: finished new write to file %s",fnam);
-    
-    // for exiting
-    if (bytes_read < blocksize) {
-      observation_complete = 1;
-      syslog(LOG_INFO, "main: finished, with bytes_read %lu < expected %lu", bytes_read, blocksize);
-    }
-
-    // close block for reading
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-  
-  free(cpbuf);
-  free(outbuf);
-  dsaX_dbgpu_cleanup (hdu_in);
-  
-}
-  
diff --git a/src/dsaX_testdada.c b/src/dsaX_testdada.c
deleted file mode 100644
index bbe7640..0000000
--- a/src/dsaX_testdada.c
+++ /dev/null
@@ -1,161 +0,0 @@
-/* will reorder raw data for input to xgpu */
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-#include "xgpu.h"
-
-// print fn
-void print_arr(char *ptr, int len) {
-  printf("\n[");
-  for (int i = 0; i < len; i++) {
-    printf(" %08x,", ptr[i]);
-  }
-  printf(" ]\n");
-}
-
-// read and write functions
-
-int write_block(dada_hdu_t* hdu_in) {
-
-  dada_hdu_lock_write(hdu_in);
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  char * data = (char *)malloc(sizeof(char)*block_size);
-  memset(data, 0, block_size);
-  ipcio_write (hdu_in->data_block, data, block_size);
-  free(data);
-  dada_hdu_unlock_write (hdu_in);
-  
-}
-
-int read_block(dada_hdu_t* hdu_in) {
-
-  dada_hdu_lock_read(hdu_in);
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  char * data = (char *)malloc(sizeof(char)*block_size);
-  char * block;
-  uint64_t  bytes_read, block_id;
-  
-  block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-  memcpy(data, block, bytes_read);
-  print_arr(data, (int)(bytes_read));
-  
-  free(data);
-  ipcio_close_block_read (hdu_in->data_block, bytes_read);
-  dada_hdu_unlock_read (hdu_in);
-  
-}
-
-
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-
-  // data block HDU keys
-  key_t in_key = TEST_BLOCK_KEY;
-  
-  // command line arguments
-  int arg = 0;
-  char *hout;
-  hout = (char *)malloc(sizeof(char)*4096);
-
-  
-  while ((arg=getopt(argc,argv,"i:h:")) != -1)
-    {
-      switch (arg)
-	{
-	case 'i':
-	  if (optarg)
-	    {
-	      sscanf (optarg, "%x", &in_key);
-	      break;
-	    }
-	case 'h':
-	  if (optarg)
-	    {
-	      fileread (optarg, hout, 4096);
-	      break;
-	    }	 
-	}
-    }
-  
-  // DADA stuff  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  dada_hdu_connect (hdu_in);
-
-  /*
-  // deal with header
-  dada_hdu_lock_write(hdu_in);
-  char * header_out = ipcbuf_get_next_write (hdu_in->header_block);
-  memcpy (header_out, hout, 4096);
-  ipcbuf_mark_filled (hdu_in->header_block, 4096);
-  dada_hdu_unlock_write(hdu_in);
-  free(hout);
-
-  dada_hdu_lock_read(hdu_in);
-  uint64_t header_size = 0;
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  ipcbuf_mark_cleared (hdu_in->header_block);
-  dada_hdu_unlock_read(hdu_in);
-  */
-
-  // do four reads and four writes
-
-  while (1) {
-  
-    printf("writing four blocks... ");
-    for (int i=0;i<4;i++) {
-      write_block(hdu_in);
-      sleep(0.5);
-    }
-    printf("written\n");
-    
-    sleep(2);
-    
-    printf("reading four blocks... ");
-    for (int i=0;i<4;i++) {
-      read_block(hdu_in);
-      sleep(0.5);
-    }
-    printf("read\n");
-    
-  }
-  
-}
-
-
diff --git a/src/dsaX_trigger.c b/src/dsaX_trigger.c
deleted file mode 100644
index 9592389..0000000
--- a/src/dsaX_trigger.c
+++ /dev/null
@@ -1,585 +0,0 @@
-/* Code to read from a single dada buffer, and write to disk upon receiving
-a trigger. Uses pthread threads and shared memory to listen. 
-Sequence of events:
- - starts null-reading dump buffer, while listening for socket command
-   + for N second dump, assume N-second dada blocks
- - receives time-since-start, which is converted into a block_start, byte_start, and block_end and byte_end. Sets dump pending, during which time no commands can be accepted. 
- - Upon seeing dump_pending, read code copies data to output dada buffer, which is plugged into dbdisk. Unsets dump_pending.
-*/
-
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-#include "dsaX_capture.h"
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_def.h"
-
-// data to pass to threads
-struct cdata {
-  char * in;
-  dada_hdu_t * hdu_out;
-};
-
-
-/* global variables */
-int quit_threads = 0;
-int dump_pending = 0;
-uint64_t specnum = 0;
-uint64_t procnum = 0;
-int trignum = 0;
-int dumpnum = 0;
-char iP[100];
-char footer_buf[1024];
-int DEBUG = 0;
-volatile int docopy = 0;
-volatile int dumping = 0;
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
-{
-  
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
-  dada_hdu_destroy (in);
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_out");
-    }
-  dada_hdu_destroy (out);
-
-  
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_correlator_trigger [options]\n"
-	   " -c core   bind process to CPU core\n"
-	   " -i IP to listen to [no default]\n"
-	   " -j in_key [default eaea]\n"
-	   " -o out_key [default fafa]\n"
-	   " -d debug\n"
-	   " -f full_pct [default 0.8]\n"
-	   " -n output file name [no default]\n"
-	   " -s skip N blocks [default 0]\n"
-	   " -h print usage\n");
-}
-
-// thread to control writing of data to buffer 
-
-void copy_thread (void * arg) {
-
-  struct cdata *d = arg;
-  char *in = (char *)d->in;
-  dada_hdu_t * hdu_out = (dada_hdu_t *)d->hdu_out;
-
-  uint64_t written = 0;
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO,"in thread... blocksize %"PRIu64"",block_size);
-  
-  while (1) {
-
-    while (docopy==0) usleep(100);
-  
-    written = ipcio_write (hdu_out->data_block, in, block_size);
-
-    dumping = 0;
-    dump_pending = 0;
-    docopy=0;
-
-    syslog(LOG_INFO,"Finished writing trigger");
-
-  }
-
-  /* return 0 */
-  int thread_result = 0;
-  pthread_exit((void *) &thread_result);
-
-  
-}
-
-// Thread to control the dumping of data
-
-void control_thread (void * arg) {
-
-  udpdb_t * ctx = (udpdb_t *) arg;
-  syslog(LOG_INFO, "control_thread: starting");
-
-  // port on which to listen for control commands
-  int port = ctx->control_port;
-
-  // buffer for incoming command strings, and setup of socket
-  int bufsize = 1024;
-  char* buffer = (char *) malloc (sizeof(char) * bufsize);
-  char* tbuf = (char *) malloc (sizeof(char) * bufsize);
-  memset(buffer, '\0', bufsize);
-  const char* whitespace = " ";
-  char * command = 0;
-  char * args = 0;
-
-  struct addrinfo hints;
-  struct addrinfo* res=0;
-  memset(&hints,0,sizeof(hints));
-  struct sockaddr_storage src_addr;
-  socklen_t src_addr_len=sizeof(src_addr);
-  hints.ai_family=AF_INET;
-  hints.ai_socktype=SOCK_DGRAM;
-  getaddrinfo(iP,"11227",&hints,&res);
-  int fd;
-  ssize_t ct;
-  char tmpstr;
-  char cmpstr = 'p';
-  char *endptr;
-  uint64_t tmps;
-  char * token;
-  
-  syslog(LOG_INFO, "control_thread: created socket on port %d", port);
-  
-  while (!quit_threads) {
-    
-    fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol);
-    bind(fd,res->ai_addr,res->ai_addrlen);
-    memset(buffer,'\0',sizeof(buffer));
-    syslog(LOG_INFO, "control_thread: waiting for packet");
-    ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len);
-    
-    syslog(LOG_INFO, "control_thread: received buffer string %s",buffer);
-    memset(tbuf,0,bufsize);
-    strcpy(tbuf,buffer);
-    trignum++;
-
-    // interpret buffer string
-    char * rest = buffer;
-    tmps = (uint64_t)(strtoull(strtok_r(rest, "-", &rest),&endptr,0));
-    
-    if (!dump_pending) {
-      //specnum = (uint64_t)(strtoull(buffer,&endptr,0)*16);
-      specnum = tmps;
-      strcpy(footer_buf,tbuf);
-      syslog(LOG_INFO, "control_thread: received command to dump at %lu",specnum);
-    }
-	
-    if (dump_pending)
-      syslog(LOG_ERR, "control_thread: BACKED UP - CANNOT dump at %lu",tmps);
-  
-    if (!dump_pending) dump_pending = 1;
-    
-    close(fd);
-    
-  }
-
-  free (buffer);
-  free (tbuf);
-
-  if (ctx->verbose)
-    syslog(LOG_INFO, "control_thread: exiting");
-
-  /* return 0 */
-  int thread_result = 0;
-  pthread_exit((void *) &thread_result);
-
-}
-	    
-
-	
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_trigger", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-
-  /* port for control commands */
-  int control_port = TRIGGER_CONTROL_PORT;
-
-  /* actual struct with info */
-  udpdb_t udpdb;
-  
-  // input data block HDU key
-  key_t in_key = 0x0000eaea;
-  key_t out_key = 0x0000fafa;
-
-  // command line arguments
-  int core = -1;
-  float full_pct = 0.8;
-  int arg=0;
-  int skips = 0;
-
-  while ((arg=getopt(argc,argv,"i:c:j:o:f:d:s:h")) != -1)
-    {
-      switch (arg)
-	{
-	case 'i':
-	  strcpy(iP,optarg);
-	  break;
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog (LOG_ERR,"ERROR: -c flag requires argument\n");
-	      return EXIT_FAILURE;
-	    }
-	case 'f':
-	  if (optarg)
-	    {
-	      full_pct = atof(optarg);
-	      syslog(LOG_INFO,"Using full_pct %f",full_pct);
-	      break;
-	    }
-	  else
-	    {
-	      syslog (LOG_ERR,"ERROR: -f flag requires argument\n");
-	      return EXIT_FAILURE;
-	    }
-	case 's':
-	  if (optarg)
-	    {
-	      skips = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog (LOG_ERR,"ERROR: -s flag requires argument\n");
-	      return EXIT_FAILURE;
-	    }
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_INFO, "Will excrete all debug messages");
-	  break;
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'j':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-j flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // DADA stuff
-  
-  udpdb.verbose = DEBUG;
-  udpdb.control_port = control_port;
-  
-  // start control thread
-  int rval = 0;
-  pthread_t control_thread_id;
-  syslog(LOG_INFO, "starting control_thread()");
-  rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb);
-  if (rval != 0) {
-    syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval));
-    return -1;
-  }
-
-  
-  syslog (LOG_INFO, "creating hdus");
-
-  // open connection to the in/read DBs
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output dada buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-    syslog (LOG_ERR,"could not lock4 to eada buffer");
-    return EXIT_FAILURE;
-  }
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      syslog(LOG_INFO,"binding to core %d", core);
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-    }
-
-  int observation_complete=0;
-  
-  // more DADA stuff - deal with headers
-  
-  uint64_t header_size = 0;
-
-  // read the header from the input HDU
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "main: could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  // now write the output DADA header
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  // copy the in header to the out header
-  memcpy (header_out, header_in, header_size);
-
-  // mark the input header as cleared
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared [input]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  // mark the output header buffer as filled
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  // stuff for writing data
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  uint64_t specs_per_block = 2048;
-  uint64_t specs_per_out = 2048*NOUTBLOCKS;
-  uint64_t current_specnum = 0; // updates with each dada block read
-  uint64_t start_byte, bytes_to_copy, bytes_copied=0;
-  char * out_data = (char *)malloc(sizeof(char)*block_out);
-  char * in_data;
-  uint64_t written=0;
-  uint64_t block_id, bytes_read=0;
-  FILE *ofile;
-  ofile = fopen("/home/ubuntu/data/dumps.dat","w");
-  fprintf(ofile,"starting...\n");
-  fclose(ofile);
-
-
-  // thread for copying data
-  struct cdata cstruct;
-  cstruct.in = out_data;
-  cstruct.hdu_out = hdu_out;  
-  rval = 0;  
-  pthread_t copy_thread_id;
-  syslog(LOG_INFO, "starting copy_thread()");
-  rval = pthread_create (&copy_thread_id, 0, (void *) copy_thread, (void *) &cstruct);
-  if (rval != 0) {
-    syslog(LOG_ERR, "Error creating copy_thread: %s", strerror(rval));
-    return -1;
-  }
-
-
-  // main reading loop
-  float pc_full = 0.;
-  int block_count = 0;
-  syslog(LOG_INFO, "main: starting observation");
-
-  while (!observation_complete) {
-
-       // read a DADA block
-      in_data = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-    
-      // add delay
-      // only proceed if input data block is 80% full
-      while (pc_full < full_pct) {
-	pc_full = ipcio_percent_full(hdu_in->data_block);
-	usleep(100);
-      }
-      pc_full = 0.;
-      
-    
-      // check for dump_pending
-      if (dump_pending) {
-
-	// look after hand trigger
-	if (specnum==0) {
-
-	  specnum = current_specnum + 100;
-	  
-	}
-	
-	// if this is the first block to dump
-	if (specnum >= current_specnum && specnum < current_specnum+specs_per_block) {
-
-	  dumping = 1;
-	  
-	  // find start byte and bytes to copy
-	  start_byte = 4608*NSNAPS*(specnum-current_specnum);
-	  bytes_to_copy = block_size-start_byte;
-	  
-	  // do copy
-	  memcpy(out_data, in_data+start_byte, bytes_to_copy);
-	  //written = ipcio_write (hdu_out->data_block, in_data+start_byte, bytes_to_copy);
-	  bytes_copied = bytes_to_copy;
-	  
-	}
-
-	// if this is one of the middle blocks to dump from
-	if (specnum < current_specnum && specnum + specs_per_out > current_specnum + specs_per_block && dumping==1) {
-
-	  // do copy
-	  memcpy(out_data + bytes_copied, in_data, block_size);
-	  //written = ipcio_write (hdu_out->data_block, in_data, block_size);
-	  bytes_copied += block_size;
-
-	}
-
-	// if this is the last block to dump from
-	if (specnum + specs_per_out > current_specnum && specnum + specs_per_out <= current_specnum + specs_per_block && dumping==1) {	  
-
-	  // find start byte and bytes to copy
-	  bytes_to_copy = block_out-bytes_copied;
-
-	  // do copy
-	  memcpy(out_data+bytes_copied, in_data, bytes_to_copy);
-	  //written = ipcio_write (hdu_out->data_block, in_data, bytes_to_copy);
-
-	  // DO THE WRITING
-	  /*written = ipcio_write (hdu_out->data_block, out_data, block_out);
-
-	  if (written < block_out)
-	    {
-	      syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
-	      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-	      return EXIT_FAILURE;
-	    }
-	  */
-
-	  // DO writing using thread
-	  docopy = 1;
-	  
-	  syslog(LOG_INFO, "written trigger from specnum %lu TRIGNUM%d DUMPNUM%d %s", specnum, trignum-1, dumpnum, footer_buf);
-	  ofile = fopen("/home/ubuntu/data/dumps.dat","a");
-	  fprintf(ofile,"written trigger from specnum %lu TRIGNUM%d DUMPNUM%d %s\n", specnum, trignum-1, dumpnum, footer_buf);
-	  fclose(ofile);
-	  
-	  dumpnum++;
-	  
-	  // reset
-	  bytes_copied = 0;
-	  
-	}
-
-	// if trigger arrived too late
-	if (specnum < current_specnum-specs_per_block && dumping==0 && dump_pending==1) {
-	  syslog(LOG_INFO, "trigger arrived too late: specnum %lu, current_specnum %lu",specnum,current_specnum);
-
-	  bytes_copied=0;
-	  dump_pending=0;
-
-	}
-
-	
-      }
-
-      // update current spec
-      syslog(LOG_INFO,"current_specnum %lu",current_specnum);
-      if (block_count < skips) {
-	block_count++;
-      }
-      else
-	current_specnum += specs_per_block;
-      
-
-      // for exiting
-      if (bytes_read < block_size) {
-	observation_complete = 1;
-	syslog(LOG_INFO, "main: finished, with bytes_read %lu < expected %lu\n", bytes_read, block_size);
-      }
-
-      // close block for reading
-      ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-
-  }
-
-
-  // close threads
-  syslog(LOG_INFO, "joining control_thread");
-  quit_threads = 1;
-  void* result=0;
-  pthread_join (control_thread_id, &result);
-  result=0;
-  pthread_join (copy_thread_id, &result);
-
-  free(out_data);
-  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-
-}
diff --git a/src/dsaX_wrangle b/src/dsaX_wrangle
deleted file mode 100755
index f839b14c334758201c3b8885fb58a899eb6e804d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 99600
zcmeEvdtg-6@&C<>gaFxypixm*1q})q)L2l^L|NTvz!VAMgNBf7NHipAvVmZw21Af_
z-Ab)m>J#guzG_h`B1H)ZBwDMnT8&CID(bGG8WlBKbbp`EoO2%=vS{1yAHP3b*gf~m
z%$YMYXU?40y*F2S3eQSRNHEM#l5v{BT-%`vlBW@wSJ*Up#w25~k!OrHjxq)TpNy|b
z*FWvk#mqDfwEQID%W{(vncF)ZrND-nnzb`c(YS$<`~Ie&W@;F20k9lpP1_+^q;9z|
z(<};uPnN~pEqRK^a#^}umM&+ehL+z<P5sD@-&q>pKb-~~KW0i=ev01rm-Bk{ddcpU
zj?jc=YL@#c%2CcA`pMLUGj)CO?9#`E!pBU_b~wJeYW~S59ba8OzPhTWv1xqMq?5;=
zeA0x5+6gC!-sDf(vu6~uxU{!vD#|=U!-ya9MfW=oWcI!=dG3h5509Gh@C(akf5B>}
z;A^&<yOo&{iG-bo@9FqvAA0SmZSOT7GWoOMsK<W))o_5~emtmrAcE84z;}YvK=g%i
z;QImY#<%~c3JMIw=W`Ga1ivm0{hgp^9sNIt#-V>6gah&aTO9hiap=?H(7zG~zA6rU
zS{(S<aq2xRPP@DthffgnZt&{=IUo-G$~f?aU@(x~M#rJQDGvPcIDDRt1OF%vpReQ4
zFN*`eI}V@1IPhEJ@cCyP_$_hZ_r-xf7^mKo<FxOEap<?j;s2*N@XO=CZ;Qh}Hx4`}
zj-0=V!~a)t;BUs^-w+2r9(>&R_Wy9G9>_mO#*uSg9QscIKe9h_<fq1=&j9~{;?E;-
z=s6w^B<C{F?`LEg?ryi#ZjP6`kQ;7f7`-<N*f;}p{A}0sdHv`&X?WH<Zju=$9|Pl3
zoxif)S6a@&D90!%m^Hnm+*j{gP}Sh~)z6xKW_4|iZ&vC2YM;Ulh+%o7q{LTWUsGFB
zU0YV_ud1yvsvCU1%UDAhd7lh|27g^y-I9{B%FB#~%7qJje!%MLt7`le0`oOh`Hcnz
zFe+;6d^JXGoxiH4+^7TV+Iki$Wtpmm+Oo-$OBx!=N^2^N2H(Omf3>hLExU}IN-9dL
zs!2_FU8BFOvJ{1V)xI+DuUuH}Hz08}1T`A`_2rFqC|lal;H&rRg8q7cEz0}rt096_
z-6CBd1*@)IKs^34DQl=BUBi+Fz(p142GNOw0t<b{qJ|o+S4l~QL*)j4d2OTLSO^vB
zmKYUf)wK;iW1%G*ENQU(f`!KX%YF5=sHDulq|R3|pJt|YO6w^ITCCncosCuHMuoCU
z#p3!Zzt7N>84E8HIZ(c-w4|b{rnI{1ax`g)Hn#{3cH{%0mAOkw*m!8QQdnkTX;lqP
zU0Y{B&$3!&c(!3hz0YTyU086&nI$JoIAKDrb$z0B^%Lvr#0e)^*Awl_lOW21g|#)h
zvJ&`cqWt5`MqTkY8Q&xgH?N3K!K^bCDZl(T2;XE)X_xMQ#TlRzmL}yA0Mp$GDgPM;
z=Vxww8xO7;%-PvF=&$@FphOu`bAGsgRn;)|uGzp7cA(@_jd?onKZxZA8JFt(4Zpq#
zLZldrbbf@;CmYLjUe%Xq)o0GX&HmeV7^Og6=KNe`jb;r`>vjvoq&M?f{oqaORKd=-
z6+JCuP3>{r<$&+6NaeTN0UzmrhaK<}9q?WU{3i~0p96lH18&AK%Ho8Z@sM!Nam`Q8
zTW+~b#V^~<{7iDdVOr~x=YZ>}yWqSIxVdH^qR0Vvt{>(&;H=yH%yYoaHXyFj0Z%uH
zk=8ljLmcoX2Yjdl-t2(G@vP5E2Ry@q8pbLI+~t6;cEC+vCVGtn?i|nVbHJVB)>;Rg
zYZmjf-T^<r1R`y7zz=l5I~?#V2fWh(Kga>^a=;IEz`GsrLmcq113t<D?{&Zrb-?=^
z@WUK%L;Ee;{BQ?6%>j2i;F%8i5e|5k1Ae3f?smYBa=^14@X-!<jsrf%0iWc6AMJqW
zIpD`Q;9dtj+W{|fz{fh^a~$wv9q@S$_;C(+r2{_B0k3nwIj1u}O%C`36Nt3g0YBaW
zU+IA7IN+-s@QDuiY6tuT2YihKev$)zp97xjfUkAHPj<l9JK&QX@HPkh6bHP+0YB9N
z_qOax^9ECj|K>Klt!@6qXs5SjYuYmwMKt&Fzq*a+m@Dw>Iy?_o%vTa0>5k$vrk;6*
z#z?2+FJ+z~G14aa`OGsEM%GIHBIX$aBWol-lX-^S$STR7!#qP_q*?N3FwgD~sgwLE
z%ro>w=1Kkp<}n5uks`?-%X~WXd6GYpd4|47j^qz!o*^&dmi%z$8R{aLl22!zAueJ_
zK9PBbwn*=H0F3$OIpi7ABHfbzjCqE#NT=jKVxA!^(kA(LndcA}Su6QBm}e-9tdacR
zndeX!Sta@Bm}f|fG)sO9^9*H?I>~Qfo*^tUPx6m4&(IYqlKcbAGh{{bB>#Kn8LA>V
zlE0aGhNy^J^4BxZ&=kp({58xU#=If<tC(jfiu8U<{V!+U&3w1y>zQZhiF8W-Qsx<Q
zB5jhN&pbm-WUb^cVxA!;vPSYVnP+H;tdjgW%rm4!nk9b*^9&`CI?12HJVQuip5#wp
zo}nXBB>7{RKbHAC$sfu5am?pP{$S=AA|h_d4`-gCA(AQibmkcnB8KD>nP(`7^zM=N
ze-?R$fJnFGKVzO<Khi1rkC<ndkF-htUFJDuiLABCSFA!_KKU2q&)(3C%lzW}S^0rI
z#pioNQ#UR*3~%6zr-uOU4IMgX1(IdE`jGQXjOIT@w=DMtlgDHeA6Py*jC5jKxEt*i
z?m;Eq(8v{F5!n1R`{l&8P3&HUA^+%0!h4{TH#BYZy!L6MFL~nu!$3OkjSb{8ZFE}2
zDir-#YX5jz;KT4~;Mmd^buFI^MlIp8mcGoz|MZq6w|En}qTP%KpuZdRQqd;%+KFxX
zTpJqU>rqqBT5oXbL>SQ93MQ{6qnhBX(P`dL?ja}ykq-q?OJ9P2r8hWiFzG@glLey~
zGHiqkK(n4KdnQA^;M8|uP~o=}{0ak~^{jOr4!*-)>R|b)wC0AbAQu+lAy8prn>To^
zSgG85;!rW0tMyb!>s_{$R=s#`{w4Wy^M96K@(cvXUuNpVs57yxJ>?E?7M-4kaU-vT
zV}<K*h=~v6`Kc+-!w`2vI?H?+-udrnG(WH{@(_qUt^aZbyg-8a_7;=5GFJF9u!JE3
z6^0Q+19=0_cmwU>Z;|x|-VXa07)EOw%01s~7_OCJ#8ralR%`G-1^g1g;Nej`<`o3C
zlE;(C7NQYzOdefAd^d;*-ms87Tq{H58T>`27yeP{g*#M210y^eT>>M5H?R%0kA>8(
z<zuM=D8v6k_YkRG5RUr+UJ$%ubY^%d;sy%2mY1Sw*wmHYz{iy3{coaCA@M@V%J7j#
zXa?^KgHDPN8kuGpeh?&pLg&GVs4y)&216N(rxgUY(@`_S`vZa^DJW79Xe$hSDy>ai
zmUcrAS|I}rHW|h<2KA}~QzIONB%6%O6Hsg$$@9WX00FCcSAk<-oA5!Gp$Op@1T}4R
z)3m@9qfY{lH1hBSzQQF#fH6Jr0hwg8OV#np&8u==dA*_Kqnkm)%%q;57DE24g5WRH
z3IiV(2EGZOQ2@~=x4W+HNI>73dfIk3Y}|T3TPIi!X3$&pq!v`0P#E|iJaIa3r<RY-
zbY109O@X0%7Al8L-QkLJAz5n_jqh51kFf07CM+kDCF}e!{7Y!nGWBHzPX@dNY9+W@
zIg$pUQfu3^(7XiK^7TNW<q89zgpWf@P78ET3zUz}X=!VAwVni8FLarNUVID8kB}68
z^=mY5VC%N8FQBhp4XBtARZ7R*dgtXAO?%2HXlYLn`xh*WHvbH3-xn!UdS5hZnlthm
zx(hhW{w3RAcx-~EM%aT2gjM7O^p|On>sd5sYg-fSJ+ic!B1RNBLmgaa1PVm1Rk83G
zC!Xwp-;G*(*xQ?~G&`imya5cNNy}C@9BgX!6*1A|DNBwA=$oOD4Jh8T2^Y(Gu?`o{
zLH6l^-L~+kZ3TFu1=)3j5Kw7SsW^O#f~KzGbQe;9Q0@zWLx4|^gclLi+Ex%~cP+0#
z+p^i=U%62r3j>TFqrYMe(eTDEqfu!F)k4EYL7ENa&{^b>ino1p5nMDppB8#ehJ%qE
z;G+aqAus$7u#3E{khk?mt|#&VAcKXB<s)zDN8p*bGkgPRv^#_>G}TiMu?!Azj}fl1
zsB$j_l?)tCf`XQ<3E{CI+QhILzT*qHS<9aE#(N@Ns%ab1Jhr7HzmZ_yKCKnhuu@A~
zf_JKT+Ai(Qb74F>8xkXODS5z7dEukMxG?Y~+ouO^FtIK2S6BzCT?6yk{9$L>H?kDf
zgzHhJV%5Zd6$D=TnK$sRw`FH<(X7Hy${S1AZo|qXnK}wB7ajTWX1B51b@()C(Xm_H
zMzJ@Pas*QE<UViUeg8q2f^Pv0r{KfpM!PE}nB&XbXE@5*d3Z#GgrVGbkraE!euwhj
z&EF+?1HInO;nTed+r2M+=g$JefMz(94CC=@p1K)iM&t3`mZ>GYDyBR`F|1vRTzEQc
zANE3p?J2)QK7ryi8~A+w3|K-RlFyA)fI{U9Lmb~*cD7+25UuYcno}IkZVCe5=Fi0R
z<_?afo{4|=2Ij734s%&=ps{2Ar{8*ONA6mR2J&oW6#^K42wDJ?7~`7>3~cd+ya^NA
zOnoMA@n=grPwr^^+}qN3x+gSictT^>w!Xy|=U-$F>vIc2^ZTC351o`)Sk}1J8(i8E
z8nq}tu%lq}2Z{NC*Yh{;99)?2RsQC8Qm2>gI4`vEHVQQ(lzbbjESwxYH{?$-8b%hP
zAh18dejA|QxvqK}Xb*)k=09?UH#2#K$oz)<zB~WXrQXGttVj^lv=xbxFJF-)*e&xv
z^LZC<C2r2pQ~_<%pilkYIc-Gjmz-MA(w0}Stgj70qw!V`1{7X-Lcgn%ATYOMdRg09
zp`o`STpRhD-%Bjm+?_bR?9FpR8&4#mh+UZESwFq(+390nrzdUuCNtIB(w@BSn>mBF
zeY4-$dk~xFB>&3WvOQC>g_E~8zPZe^wHXye?t-6helHCi)=mo*9s<^i4cZ-i25CNL
z?}^9g4U^GDU}XBo)j7t{q=LYw-ev7M$-|PsXxL)(;)0O3voP?2H?+(!cJJ{9K2!8S
z8{QNnEfKwW3q+jLZ0z|iy6u|_H>1)sdVy|E?)w&iInBuq#BT58&5h4^TQ+Awv$O&@
z$GtG6yI|R7wrt_#_QvZzm!5_BcHmoY0IstuE&NECj42m;F}|#sOTT+NT;>zlfm{|N
zfAQSN3Sc1Ia{2~3;>aTwA#x?*S^@=P#OluwJ1p`GNN)9K(7)8!FQ!z2z9*>k<vjRl
zHv|7?AhYMOkuyJ!M%kMnXpT3qG#qH$=?xTj1*Ug<gT-CmK&KaN?@j3Rrew<C?dkKj
zEDalt;}Hq2dkP$bP-pXo!%aC+L)tG8Q+rVPdUW4t<S$ew=n12PB?pVawy`hh*@?;9
zuQAkm(ea226uBi0=D8898jl6{i8l8`9o(P09Na@D_nvtWr^{%}v`Po_vb_OMmz58#
z62Vm^lN7j!X_q&_tUHh3c2uGzZO`-adOd%heTL>aANH99`*Z{vw+4#a+C4jn^R|1!
zOxr!(D%r^d5_Kp8)ogS_burLkKuANvK*vyp=RP$J^fNRv1nBTm`2dUoVS+}mc<a;b
ze=yW1A4j8Nr;b1ore>awU|w2l+mds_Ujh*%#UH`Ui#aSy!W1cA{6#7bPTz_ikss)I
zlAI9S|HjcH&@PIFSAK#Sl4l1SNeVXivAQ;CaPWK*Xw^I&XdW7v5@xb_4#z<B033D0
z6^iZcYEW8(B>XO9YWIjCA~zFFnTmG?r>#ncM}fu6%;5B$-p%hMdPAeqsU;!`br?)7
zlgw&R#Jl-jdR`Fou_fn)(@kAB_u9IG?=137N$#&OnL^<mO%8>BE24?QEuNhTw$hV5
zJ6%_uuk8ROvta{_eC^@KQNp%CwiYhiw86bCu)(#erkhz)XzE%d;kW*!OzLEHX$-3z
z%__%abun4}O0hamv1%l%-^8*?q*RB;u)>%uEt6%k$|S2*idC{=HHECaV5L1Z@-or7
z96KdV9U7{8QJpDWo{3^B=+?^)E(Q6ANoc)O(0UN(S*#kpR?B6pGQbNv`b>@qVuwrc
z1`<s>TJTQx5;)eW;iGnGmwrQ(+>Ex(e$tJ~(IL9gpe>78nfBoAAl4q-lhxwsP9P2Z
zP3%5-i|eX`(56Ap4n)mK7tfWBjDCzxwzMzMxEpZ}foD3_AfDdf(q3;+VqQ21OwiMp
z|0Fv`egG4xcCL?gAVB#dl<x{OzJl`T8PnTPzH?e&`bI>9UeGZr^f7Js>{^F;Z*cl9
znl6X}3_9R;EtY@qX@=0k^`aX?=n*JlMu=^Q{o(u2UMhrMmJF^e%m~3w6eyxEn)x>5
z*<_Lr7Iy|#v3?WK-i%RB_8d(Cd#*HbVAhKP(Cuwc?xe{5;MsBDIav30%9^!bfl2)k
z64iTU_~6LD-AM4ZczW@(v^UkYVi8*>rSu6*06oe$&@Wj0N+8oTVb2l4l>AscMD7bt
ze<e_ee&6EhBd@;HMO~I|laxpj;#u={v_Q(Y>zsAuiP{ofwgf$|z#m1Zpr_Y+s;8s=
zX-ld0yu28zwC7FgmoIYW^{`<Hf1bq%GQ-U2{j4L6NyGv15To0ttx|TjReeCXw&!JA
zRh#wF1KRU!k=pa-m^AHqT#HBH9O90iE=-BwiAP{LqCIiT<CZ77QOfj0+=yU=59={K
z@g1}x>&FQ6BzbQ@-guf<;P!GA=QQHiew!QjI8AO-$*uDvliP1JH~Mu@h9B1^gaVtr
zlmq7_Hs^zF&fHBkIqy%-i!9FESTx%ql4)0yN%ye^7Nr6WY^so{DjJ8b`N*G@L0BM^
z`(6VWnsH}F*wo-9l+~>qmo-fU9UiX~>x>2K*)gm;oUA|pP_yn?vJbvw`80w8`OdZZ
ze)pJD<h4$|w^)3SkZu$7bb2Ry`dnANhdDHBYL~6Z_26koIz~#)2-`h-)}hK^@tz>2
zhX}BNmo|`)F0={jRWv$hY**^)5zLMR)@+(O$Ps?iG+9px8Ng4)e}mKaFuQ@ysa*0`
zSP5%>$J_jlaPaezU(n*`B7?Znn@)f*p=YhHKdOubDMvfF9!##eCf7(Q^fk-&%n*?|
zFHqi-#E2#%mVZJV6fD#9zGyNOvx4{Q!8{91ph6Z6AfX%_(FpZxEGAf}!;ka`1|B<i
z>Y0Sy-&vH?28$5$_V(~!Y{DF9MW-YoCj}N^774Gn39}r8oYz@|nIx>T2{RppoS*yM
znmrt(J-iBpXff(JhI;0FV9D$@3)+)w9f<~TG}GMep50Q+YO!M;v0BXO2VFl$Wi6;o
z4U+j7<tUa+j+-#J%BA?J?%h&n_$9kp)|10B<VoC~mqsttwf8L6DrN?YcYDkJZp932
z4o;1O+Pf~FT~{IN8ma3-%f#|+mjSxPvs-4RK)Zu7hzC8pF^{>r4uYBH>G_4Op3_kO
z!B)oMnzpsY1)L!d;-9RGWW~fnr|{wFc9T9>2f63c7i0)hZiV(&uIsw0mZB-B2o&SK
zW8T%sSohuAEBCx(5$@Y7U$zM?uMFlTLoIYpws>-kBFo>ieDm8{C~^bxB5iw~?VEYY
zZ1`A@L>E}v2g<&n!Gk2u)z&;bpc`%`-#y@~-0(X1u9V{l2@xf<nZwH?^JNTRu$*ag
z%Rvz{Sl&Wz51ZV2rjeDnLmpbY5gTZpzINO(hE3sgb&bWN)JGzGi(0x<)rtiZ*P^00
zO0nC11uo0gQhWO!EzQO_WI3C%9BIjNQH)xMkJhO9MyaLNFKkvPI9QD)tJmNSVx%6%
zZ0JEAP6-_*PjcPRX3f}R8Y9V1^X+ff<J^1LU4wdR``1HzIqPZJ6~i%EIBtH(GD5CH
z@-5_eXn&4~H8C79PC_L{T!ci-BRQ80${duiIo`2bxRnxMTN0x>PXn7!@ICCtW(kE?
zC9u_OSs{<>4s2<H57*Ip`p??*HiQ`=^zxe|^xj1OWb-mv#;Y#bBY5dSrEe(rK&S|h
z5DSIt-!?t@6%|k^&Lj*|or_LVLV72`{=d<sF{8xn=vy56SsZdK4%aIV5V}4556&~d
zrQuW&9gt@T32=pXRt&SSli7Q3nc{eCadN<@!xAS49Fi>#9T4X&Cx<&N4oTpkZG(Ly
zWk7`CozC#Cri6?KW}p4`XX4z)p_bbp5R<V;DEEG6B~z?QZiiFr-GyE=hVPubW$|^&
zc#aMVvlJC6F(mEvhR#73hCu@(bo*NHY{0SX6F3`0qQMBQ1DF)|ZeO0rB^F~r7hI2w
z)zU7QOvelNn5_p^m~es8@P73vT!}8p5@=PpXF{iIU1rOcgq4|y_hF7ks=CRZuq=#_
zE6P(Pyt<Cf5IVWTa`7;+?YQ^UlqJuN1q!pK?JH$6=AR3~u%7TkCD|4#dscRa5;}s0
zw1l|@)`Z<oJu5uuem9hayKUaUDWn-V&%rK4xY2Beo`VN!gR7lEKzCW}pLzDQEZvjn
zABx{({~-J(HKxjOQBR5#uzFw)dmxtesFst%UP+Z7xnW~vNqF|Ha36$L4jPx;XGuHW
zA?@>*LE83z!nesI@>8@N&BpGU-1t*vU&eE99<Nnvsrm6IINpOL$lGP}KHkB54S8Q?
z@%{oRFoAIB#+(WS&&hT)e{W<J2((f%Iq#CjDBcyPTd*ae;|CX8EpqiA2Gr3@Io`u|
zo9Nh6*W%elf9f-P#V+)URZ3NBwNB^zh|VWgb$~Hcxmst#u9hmHBH@Nk1hpg>vXkJn
z>RmlOS39?QV;f320w%Aa!)_cl>UMhO-T`$OeJOM}oH{ThL^df~q7#TMk;d4vZ@Z_*
z=<(q<!89l*Tb4o14b2Th!RKjZNHI=$%A0J}#$+<kcIs6t;>n&}^?zwz+G8{h``ne8
zaykdn!=~YmLV)}?T3Q9I9Uxd?I+nQazzQAGPhO>}55A$>p`TSaoS2?$?^{7U@kf??
zagSxB+wZZAbSyQHcoUj>9Tv^uZ&-mC=|mm%W$bA@%pqbP_ByctSCpr`KHVH1>`_zo
z0RM0lqLCTsJ6$cKP?2fS`A&m6=H-sSO|c;Ov<t;U*;C%X+Y;QlX)uU_=Ual~?hV`c
zdiqT#4XG-1yi@THv2`=T!K{ung|s3yoRJaSLgW=WqO%!Ki@eHvd-V`NUG15hH88Z}
z4u9PcFp;^<)3UVFkeQ(Pa4C=b%_vR(@8^<QJob^8=x!FV1<5l_T%yWwi84SCpmoi#
z{LTed5V-0t%fw?HzVY$`nE0(O?Hdfs5(KQFQG$TDgLceiyVg+Wz*CiQrgDrd`gSx=
zA8wiQGy@0EY$cNZrDo=X?7H2+n|^sU>z;4c9r;w;O%=_Xhx^ZDR{z}ZEx9K;$~Lg<
z(N@{Von@U9d+lpovn^j+VlzC(!SEt7{Kso%Cn*QRcF%GUm`UVWTn9bNu@|^)EjyCd
zHu7u0waRc4Y>+lHGIKjIIIA;_C4wm7oxG)?VtebAC<u-#$++jBpl4-pR_m&ur+M<0
z##Dt(Xz{E}0OC~7^7^-%pOONN*zgj=#Hk=KYuj4Xi@E|}*Wzi0Bir=4tDp|o)jUOM
z@hp$Q%g)|p&vMsQhXQM=b+xG5>|Gt4*19U-xz^?gzq{62Z$HVCRkeuXG#>rBx>#&I
zIsDbj$_a+}PeC6~xYKG#XCF_mfZgZ5YTBJ+H`y!k14Z*3$^^wL8b{fZSgUZ+Z3d**
zI8DlxNMMas?-VdMoo|Z|^}hcP)A6yLU>VFCyixpj<~-Gqqj5NvhUkw~k<hdoJiY1}
zW@T|-fYSr|mp=dw(3xf~{8TmO+K+>Qb$6maCA>durh8hy2~dhmfCk9~C^gOmDAQaO
zb6|J^=Fp>O>m8O2MmucqT{&#<&MT(v`Oq%I5R8ZQ`XTx<z*E>haXjTHTk09IPq0}!
zr+F`v<*zK3&xmDMTuvy-+PhjGS#t`>1vtB?xmlWXz7?=uy4{l5IeYp&D;RH9(Bo55
z>Mk#=_yhIdpJ}mLVY70^zN^XVT^L-(MOowNMK75SxDl3TuJHoJ&ttj_3v=H`O*HlJ
zjIt8=KhI$O{&Y2#?!jzAFlsD?K@`>S@7PGL<-bt2$Hx7<fM;jW^L+e?H#T=!uNmXN
z{i*25Ho5&a%Vftnn)$^t==sgxp(lVw<mXxx#^)4Q>-&IFU%k-cX@}EYDW(jbSTiU;
zm-!OHoUF@Qw#XTXjS*TNnz+5e5uL&q$H`bXv}{gy47(N?rbDs->is^QrERGI+IFz5
z^8>e9Iy)!uL#Xp0OXsCBJ~)&_a7Lf>V&N6J6P)`k$#HL_K9u+&W=T^P?9$6Rr}8wO
zSGD`z8q+xTaxKrRZfM%#T5*GJ5_Gye%tSbUP>i&z^(;~>W$x1FTBnwl%iH{1nb;C*
z9pQxBV2FWZU{Wk((gcYySqe_WWC`OCH?h#zgCRuE(VWxPvY(tYS-DA!z!*_h{G5G&
z0tQ7>XoTGpxpjk=c^vnSWc9kB!jF~x7B#nV*a$X2izsfmj>V&-7v2)HI1Xc}+KGFV
zo0Y}7P-77nQ4X8TWaXB(szF)RFo5o36vk?+Ny<%*^I#^(IEa~JW)C`u1jHO~3|tk3
z^1<S;waUo^l*gRbv|feiKw~c=Kfazm=i>1uOcwV=iE!yv%gkz<lH)|6u^YT#IoaQW
zrFd;=fv%X$<D~AzsS*Nu&KQtjtcOfc81s79pO6ui!N@WcbW7Gd3D%-JX3Y5xUITv{
z>}_vjjFL@_J`P4>aaRZiBMTv;i-@b{>mZVK(6K1h&jJ()9PaAJdLCtE3>(DRk%$dF
zarkTw+g4MXyk9`9?ZNq;DwfuW+b>y6yREAK0n(^#1xpM!f<8{fJbiF717*Szq<7m#
zR!6V4Vvrjo<`m-z>`aBP`kOhtZc`yXEHca7_T%4L<&JQa`vc3JW0ec(a<Pru?|?$@
z$Ed*+0iIFvy>n$`NSEO)a+b}eUl(yL|2GPVUDR2MId`S5j@HjHLxijKIaL7Xda3$~
z(&?iZJthDV%Tx_4ypc1et^>S*LJ3FcrE-+B*AN8%7GH)i`%)VUQte&mh}yF-Ack(T
z4D6gk)L^$N9Qms`@E;!|j*goBrNrO`DW&^Zj*%m?0lmAcZ`}c}F1qXaa90E~cJVH=
z<@${>sg1HjsHT%@Y|htXGdX;TyJyg4c!H^Rgg4!&v<c<@^Aa-GJHqpxH9agzm1Yl?
z4aftj7&4Q1HlZgXZ*wHnB1ayD#kAZ#^R=Xy-FNSMYbjV@%{azsk+9k4TujY{eBqVP
znQ~o*I(hpK#W8o6y6uI#Y!HR@)@HxmB?Xq<B`#geP0ujy1nI4RP0Jp6K>0QeJ+SzF
zDEl-W5_WywYl*+`24&HHdzIU<cNTuuGV4P>h`aW_8k@UoVD78Q{W6RDjPK*l>4hnK
zrp^74-&nGbAqDHl5$~nsKFs3Isp&o&R=8`~A3fW$%=rUzKZ)G`)L}O4^@=+>uDDFU
zTWD;seFMz^Zs58)A3;}KwLLEn93^Vt4unaL>mpv88`S$tocHY1d+Jy*?@VY}x-)gr
zhXeJ`m<}j=!}<)_oVc@-d~eNCut(Rt%dUABYTkvKHM7W8EkCz<^fo}deKQtJqQSOF
z?B)@axe1A2h?zb{d~-KPbPGSaUG|svM^nFu4Ycrl<kk9R!S^##1A}5zWiwhqMh97p
z4%4?=S$(yaI|s+03&omufN_xry;v8^Ohj&Y)Ps+9l(2RFI|O>6FD~C^X)}>R+<?5&
zrpz30&B+Sxm2e)E%d|-zl?0oqHk;vOGu>h{YQSUh`iK7SG~r$`MA)#^(&ULb5aBuG
z{}W9#n+BWB8nS7z*!<sVlB(o7-?H!ty2ad>3}KL$*27e9fNknzh%>N7H^y32%?fev
zpCLDkCtqg;tHbEgI#|7r{mJk=OGQ~HaC(&L9|AFCb~iY7t3!{yj+5v?Hf_9fA$y7%
zlsg_dCKfrC2#(1p&tGO1Vfi{qX8t`6l$@)XbIGfATk}*c3wY9gm|Mx1Tg0VQhRy(y
zm)E1`n<VR3jh;-$aLUUY2r6SG<rkKyW07O>i-BIP7BvBuOMy=NEUB~SD5pGH3NUgW
zl63ApvBz{5yRnUkeV#rUh&Ver1kKnZ6AaFVxi;!kGiZ$U<{fI#hxhR+gUIYqW|jzt
zdsjl9Egaz$FQs9B1!7xYTOoTOJLTo<UB%O25B~jb7nVsJZWnG@`ijxSL)b3x#Vq0#
z=Ojp-2vIe2)^+S=@LC=DBS0*pZ8X{{Z|q&<E3ApUt9zE}eSCC3j?UI?Z07d<Sr!j$
z@0+8+-rVjNZkS@CvGK)xXi=J6VbZC0@u`Wvag)bXe<0eLyjq>&e>RJARM}so`d+g*
zi*#@LU(Mp=cD-=>|G_M5Thm+iz{o$qTbxCnl_LL(w>Y`|Lb(0^U=}skl+hym{!>55
zEY2d2N|FD?EKY9a!tK8`iz8mE!{Gla=5X|lA9Io&UbJo_&bS{0fF*Rq;h3pP>xknl
z(24>vrjl_t_8hrqAMwV?YchCY@7@t_8veU@gBw6qG;=n@n|~R2Jk2Y@+SN}JA<S7L
zC&nxe(jO(Xyo<*gw{T+uI|ypK5F>QaoE~msa#oKyl$eX9=hy><%z*5yCT=o`w>LMN
zm*7!pUT<$+W?x=qUV7V`SMnMk#ix0d%1O4p`R<t98b=N#SL^I5%XQSOnv*b$RKxH9
zfN63yw1)$$$PhJQH$;llKpdN`JjxY9K@mnjD0MBKmNB>U0;^c*s=Wt-e*9oE#UK3k
z;4n!mXG+X$JmqZW)aHMAn1`ct|MGa4XU8-=HnjIcL7u>7{X|eNc6am*Nh_DTVJ_^K
z#NPH^Is<dhov@3V4euxFLDlfL{nt4%Qeq;*?I|d&ra4PGtqP8@D;N#Snv>gXR?xC{
z1y>JTfm#1sHscpsEXkesb9^-&jE~)$@q~Rdb^&cxG8!i%u?`UEUFq4~yma?W|7rNG
z_fNv_V*d&Fy}&=t^)yvaIZewG%Do#Rz(3d1d&6(9(?*P(h%U`4V*5`MsibDzmxOgX
zS-V>AG;uJ!8^?P3-l><bvQ&54bQV=V9xWz~7I_AR$*hsLe52#Hhd1IxSF>1Uvv6*Z
zOeTw6@Mp!MS=aAcSG;@WDl<n`)wr=n6znNM9Wxtoh^^|_{g&1A<Lo+)0wc5Ozb!-^
zE6qB34rj@|D;Q!{V6{X{-#-RVZlLdT2T@|diBQdq1>B$S9jF+?B<|W<*Ua#Q-9x?W
z7tmT#uU~&oKAf>LVlZ@J8DX9blqv8khA?%xQYb`qc_t}CS@>A^!-Qp4BjteJ)czQ1
ze~sxBJ(=XQT?C4`Cs#e`2i5f1g;KSbB96oAN~=m;n!$X^V*iO`q_eK4J0ZXyo0R~e
z+#c+8qoi`k8GqKT+f$4p-FgMo2nSJG2UcmF94J~NV5zOd<%T6BXZ08GlA;B^GkXr7
zInL$|FuTqxz;~IV!mVeip3uc2ZRl(~;~fK-EWrfom1f|P3pUR<hU>V3jR0nY&1+CC
z_HAEO$oBAcV5438@hdIE9}Ozg@EI^dc)!O@*FH2+vBIjf$x*|tuUmW;+I*bbw;y8%
zJ^ZX`#K=D!x?F=GqJ_w@3y*UM@j44%YZYFt3#+k{7VYu|=V*D_!)JVkmY)2KzF*27
zYrB-$7dK;7W;*e7&B`?MVpr=Es93R-1+h#PX2k$naFeWkrs#&X!`7t1It4{xedn7h
zE4u`clT2skVL~xU;AqR4DmwGDJ?EEJBV;@DA1wMmW;ViVEujvH;-HZh4Ky{G*;r^+
zaDp9NUR-Kb;B1{PY`=$(Gb`v}Skl%~-K!@X!u`fBd#FRWtt|V=qgpsTcbjcLcWW&O
zO{p%B`|b;DP6s(SttO{?Oin#Ad|I0W|3qZ7qLal}xhagu_i1UgzR5*bxzZDDUK*Y0
zACBL8|4{rc_7B4E1+kkzYOZ8H^N*f`n<RP9y*>_rK|0Jfw{&qWKU#MO6SUa1eE(RG
z3MzB1P-e%wc)ai~GpKz1CwQtl){OAA@zP)pPj2G#_>-S;wd$Kt84Z@fV7xI_hhKP9
zOivt++n&&waQ>9?-Y=}ScJ^pLBnY4Wh+VUK0|=u<d-%Jrt>Smv#gB6|(p(ld9K|P^
zf;)N|X1Gl#BMQaz(!5HZ8(nI8=5G;yR0KusMU_f_MMDjauthlBAwoJuSi08M;Nh66
zlo=foP_gmW1N7{JP1ddJ-Kevh&j8C#2iv$Chp^p-pAP`ixTa7|c)C4N2UfTSULN*h
z_`-)FgCc7ZK5~x&Rnb`-?vwpkI1k?t;c7h?p&lUIW3bob>2SGTxEuyh4!95zb8yCl
zQI}iI=<Kp%aIg|?f&;1U&({&@k~FcDe0PaeGTULk{aNxftK?C-r0K);{wZW+W@iek
z-`cF4fw}K2u=*TUP(pO6cIczQ&&GqfT8@kC!lydq=wab?R^eOB!l9x%(L@biS#O&z
z0=Ci`w2rtD2X9kO)z5mHl*Z_1U~cuZUq7TJ?2)@xC|)O?Z??v6PA9-s>O~u<tnENj
zVZYcTcU#yEMQ?4;mG?XOTTV4!{3EK<J^d%KO@$XG1yg=vmmTXcQ4`Dl#!>d)+DLZm
zu=Ll92W;p|S-RL!`YF@I!Mr-$yV1*FOdY&1c*>tO6CNjpa)0Ku-%lSDuce8aPy#of
zaMHu}VAcd9CEG2H94*BMTxdH4BWx$9%(g{zI?(A9@liC7d3VR5Qq~S7TyQr*?0ru2
z9KGe^MFW~5mGzusx<Q1m2SY{ZYvM`%Q<<@*N9=X?+I*Z1@`(q05-mQ9H6OD%E|AW0
zsa^J1NN!H0-ecMAFs14&r|Po7Rck<`1h^Ylleg5%;Ym<0%2N)}>_WLOfrI7L8}Bzc
zeX7gTuRXZO!^;g7IVU-pY_)KzhrH0J>=Vk}=ww!FF{22|G_t0*4I{6K!4n3$=@APZ
zCd=Drkxp-reF~FZ>A;x(E<-J<8oQh622fir*ppael0dDq_#kzFRd?(D*{?ss<LpM@
zLgIbDQWU+>*$6+IhDOML0H%xLqxW=B!B*H!vE&>e3v+Ntt5*@)z4Vb5eLm%C%{`R+
zI2c-Ol6{}$$&o##_&Qp#qZZjhImgudDbzz~0y9Lk>9V#;EIr&V?NsR&macJ><~mKa
zhaQJQqolRyA7g09$<HkEsP<?Nj}XhsdxTW?PI<6FSq?8pDZs1XLb;bZRrLJ960--l
zO|uW_TEsKh#!cej7~-<*nV{RYsH{yuCpTwZ!vRrGSWeItqsb;i1+(9rN&SB1uq^))
z4QqyV=trCl>DNk_!kFBX(W;vHKkAiMV19yw`O#!PQ89-o(Z~$VoW|<ywTj<l7ypT)
zIA2^B{!Ez!%a~}yrHiXzV(qW3K^cl{#TZVpA+6dCYX?^09b1FCm&k(A41%&#i&ry{
zyytWW3%sX6-5QV}w+UMB^%4FWDNjkE3-NV4EuKBXSOuS;XAhR^YZbpZ-5$>ju~zL^
zO78FN)#3<NPtCD>0IwUfj;c6+QaiwAq?%IiTK=Nl#?7d8%1qbtjj<pV&%Ucu%^J#m
z{tO6b&aCga+w#)LU1p2wj>y?(7(2Td^ca%F4Ijm{*_sZhAZ5-n*z;EFTHrj+J(*>w
zGQ9g9yQ)cMRc2J<o620vmsxhxl`^7?)z1g_LJYozrb}j^v;`hC;tUiOOYf>wDT=5W
zC<`D8%FfAaccF3_kpWWV9Fp}2EuB%aV_{?)4aZX>HE(sdtUj}m6F-z`56^zh8n|z(
zRR)IYW1xoFDId;<>R;Pd?#Yv~_Q1{eeaOJwgY9%!!OvS<h57hyGIuSnM1Si^SOst&
z?t1OA+(d;6oG36tvyd4DXZl^vY<=T!ljY&ke4f4@c{hhie?ct$V*ScE=y!v;@5NDa
z$xj?yl~>(Yqm<_T3ZdMO@*JM!`Mo)p6k&Aq<8xM)0+yc#ig%b2x~U7?yk+T*gvNtS
zXu3evu!i@GaPh&m#S20jE|RO`IFTYgY%PTUgq1ELrYrD}@~UnaPZ6bjx=@K7%B^*Z
zeEpp$p(meun?UMVv{35codc#bI-LkKTN-483^POyde>|A++7Ogp5(0Jc)JSOjxqx=
z);=P=o+?WFk3_r2*ov+Rzi$t(glA|YNPfP`gPFqrQsvnJ!s<q8XioRxt;H5g<v-fl
z4tEwo6{o7aDuz3YQa|^)j&181Nh`nfQ44M;9K3~~EuXvQw1IaoOs$AI_9l7`M9+Ti
z`q}1z>wfZW)#f5@FmDdUv=7UR5DnM&d)f`_%k|!nr%gwti=>5PpXTx0;jrEh;PLJt
zIB&lvf)boO=&i7JBOeOwLxs2{F07JWOtyPI1VrvCFyGRG)@aWgK_oQ8Wu|>O)V@r!
zFO#iHUa%B=y*>QK8f72Zy9y7b3-O(AVQ<-UFa)~PbFOv6y{@PK?oGh|PS+EL>xs58
z`~zWFSdQSEQ<|54Xf!7C-ZF@p(1T}bKQ!>7u|bU^+5D>!<P%XDJLh%`m_KP*x;w*0
zaR4+?qgwFfJIj=HYN%_SOe}otp`?Sn8*alJi#)q}l3NzaEqX@N#PDqDK&kQA<PVT~
z8}95g@ZKEV8_!JP^MAWUQinJIVUH18jC>ysD)|)c^}|g=gUDn2T~1rRL=O+;-a7^M
z54<P?bNF>A1f`{}ShcP+Yy3~t(gy=IXzm}N)F`uxAJpEqQfSm<X{ip?QXSB$6Z;Bi
zA*EGPT&?cE{l91|t<~zJAEeb;W)=UrR%!jU`sNgeR^xAt*H$k>-~ZSJwN_iOTloXp
zs?Mz9Ki6t#f2~G4wYuPzcv^i6eg9jnc3~R!1GKu?tl~e{Dzm><Gq7T@JaE~~l%4le
zWa>w)@HSPH7@(saV9M}cbhMPXY9e$nN`-MFF#^aG*y1?U-j{Uc#PB_^_<s;fHc$_A
zxe-(t0aZO{!S@Vck7$xZ=g{v&X9<)?tXBHB=Ve)uKW~(M>5k%~3^ZbDb*f?fa)i!>
zR^5PUA2Wd>6V}pq=#@MRfH_BmiZEhYNCxFx)Z1_P4amB^o7hxJ($bc~>o<8VCxWs9
z<;r#q)^b*>4PmQ=Qhp;aM0`e1P5?OxP_yj<)vn0m{@~w=oHC<%2b{GG%XaaY^E20e
ztxeFe-|*h<xwaYa{rJq4DcH|k<JtnARiC+z<xggaB|dXq(ox~c-V6?&HN;#C%ye8`
zD4_e0^WcXCR}zh%%;^BwrPh*!Kf~TVnUY+Bf|<kK!gYmfLdPNu4ty4TRZ7AgqwyY;
z8-ku2@Ve*F%0xT?y5XromrV|IeD%@hZ(M<n&3%WD-4f_bXiLC1RFs4{r+3TdzC^sB
zD6nPoM~5eLzSI}M+ca-T=xhm(0h54db<ndqv~qvwv7uq@!UGhmREt$r+v6DU#%>OD
zIM@YH2MV(iG*rNIYjFDN0ZWg6<sjVs=6NMJ{nibyH6QmXxrdU|H}|E1g(+>wo4Db@
z2Oc<<$d|scz*GX2pz7xWd?98W4uMz4Zm-&Y0{T_pg|XW=EU2ic2=vk%iGf~i4r=dt
zJ~(~lhHa1i{@0caLmV;$JRLz#M^#(zr_pEv9>)i5VCBk{7Hvul?N+{0w{!Em$z#O~
zp!ci{c-nCA1T(@RIU8&b&$vmPcRjoj-=%!Ody2d!0S|beaHHjZc%}E}@SGj}91o9N
zdUjx1I}W_m8bhYN08iiXIeZ5WC&fN@UF6w8cX{F@x(<9!m%)13gWaIEC)tTN@JH4o
zJeM-U^#|hK9|%UF)VmuX)NMC7gv#8+!~F1t5EKIF%}Kse`f9l41{ZMXj7hmnrlukZ
zUx(7<h()A1kL0yE)8w@oT6paS2Tm*do_^yE>Cs`#dIg}nrG48lpMfaK#!L3}tzChl
zarTW;nV1VZr2878Y&e>nLb*Oz1kzO@34i(<EgkMV<6(=sGYqXX4pScy8`q;iWZty2
zY*!9^${Tp93u~YDR{nV<O5EPM6IcASw}$nVx4pGjqjA}1Um74|AwbjY%S`(+%f4iQ
z)wJ36WsZG0Nnf_N=HZGTZ|KG%Br=71awO{sy!N^G!S>c!f~JUb%q!1YUU^$Y9Yfv?
z{xR&b0biQxloOduJq<~C`}H(cEA_^vgIJ5(>t~m^VJOrQ`$o<z7P$*;zn&FD-o)%=
zQze*&52wWgSV}5R@^W&MDc@r$GF(a-Fu$b?>#)fO8FnEFpAjQNCf=pM2Ji+$v^lqv
z1Ku<MX?6f<ax7N~%>tBW2b3lU)D37hpfo$6G&!I-fKCFGW(Sle2Q&{*FQ7C#pfow4
zMS#u$ly*mU-lqGg-{}<5sUHL~^`p^5ziVKf4S7nxS0{>oM^V4|zh(tCc~DvsRkFcs
z{pKk-Dpkd%esv1kq@Y^A;%1C|D{ak+(Y>dywsk6AgZzEAHpOd^Ur%+QT_r}&Hsm{y
zuS32G`A+1!a5GFZrAIJiA~&f_#{jBskqN9^t)RVttdUDmDOiN_BU8CdwOw}}cDu2!
zd#(e)AbJ0Fwjr!wVHQmz#nmws5?a<n^Jq_lW)<XXwyh$nHgDH}wVfKnONS<_C~G??
zhQd+XKu-hfZCz}Z;jOidjekKCZ&*M>gmOQ`v!-z89wgxg;fm>)>QC}8%vTq?VNjSh
zc7xK5up2%f?xB?YIbxG42H{>-dwi*=mln_{xJ7&T;rm44flz1tN|RdlwK*PZ<3v|%
zD_b)&@O4JkC<NQELg_X{>E@ztzf~;yoVpz{ux`^xzj&3YTb{&S%($t*J-AN}{-K{r
z;uHA7@#&eY5qQ17sqy_<qo1gLih7q4b>0MNhH@m~T_LfAB`U*O-9=V2j7AA_vvf1*
zpGUJvGeng4E0HM8G5>T*%S|~ll{coET4j;$xodTe&cRP>d?==Gcv}O4n-mr+tlDfz
zDEVVSV_RiFtvQ5VppAs!Yd&?O-<~NqbizUL1Qs3ia<ssPH<e}18ZVYP2TAzvrkoB3
zoqeHYnQVt;UL^gaXmsVE`A+eDIHj}2JzCu`s;s8^<tpsr+5`jYW6An4nqb2hO4ggl
ziLAFH2_HRxtPjkwWOeTLzfAf;maONQUd5(qd6S~}%}2W=))WcMbV9dNwUQXs_AU7=
zN<Na)hfr=eP7M(Tb|MK^2d&nX5rT)Fy%$)LAL+32?<jd8{8d@m?ho}e2*f|@l^<@&
zooKQRW0gEbN}gGiXU8>r%QMfG$7#ullxHI>ujDaJBUU^3jeUrc!@bIWb6e{m$-Grd
z?_bKPgZL?(a)Z39Ae4JMPEKLL-y;bht0eByoed{U;vd5fCjHq;!x}|@7d8AQAR3ZB
z(;LDfQn}%xjR31tn_k{n$Y>gQl*PH1kFgsAB9C8GX!5VKI;sWx@F@plG{=y3@us7l
z<))}|JaV<lb!@`Or7O(0D|9WNtpd75c>J>>VmG8l{O5Z1CH(oIw?((y9Zby4v(Z?f
z@f~YC3W9<R?#?0)sUr6~i=2r-fo`bf;0i&PkpqNuFv1w`>E@Gt!iwNUnmQ=d)JX1+
zloB}kkOexWN8FS1xe}T_z@r4rB8svkRUuMZn17XZLyKy$jfgL3VHZRUdsou1nw$K=
zAqQu^EJdyBe+@F3_+pmAMU48xo>;_mUF}%J(D|TABg}&1OKT)$M8X%%_86K?2?-Jv
ziDRG;-HgxK*1$Lr*G$;U;yQOgt~PXUu7Ra{M4Nukf5^6o=npN#ssXVoj?FOpL50|!
z!mjs}bU8M&vX{tR>d**(A83WCAotBlJJ^2)iVE`9y^wCdiZp7a1tQ!u_gZ*_u2iZ#
z)e~Oa!Ygb0#JwNA@u|L#S=HmclH3@UB*dv}VW5>$*1~{nmte?qyT*E&U1O)Oa*Zus
z;~I;<?>5%g<{G=O(>1oS+ckD+uOnWxdQlXA!?nFN^Al@@l%+4*TZfuJECyI(uvLj6
z##$@uf>t+Zu<~JI_&GBbKq&WXx*cqkVwyoAc&aCr^}wwaJ8V~L+QD>Bg$7Fzi6gks
z15VC7P191ehbOm^o~s~{OL&v0y)_%ZS?Ejct>bWo84J3A2X54wgRD2yTEu`5Xq_av
z5UH^Q5vzt;^N@R*uT7))_b$g7hZv&eDKW>A1JIeWA#nIyXr_YO2jp)GjB{NraC<ln
z{RP#~qTW!5j8Tm?B?ZM07ClBP5(f>h21|sFabPB`K+jX9A<CP0b{BQ;M1k;&&|D1`
z3oTj<csEK-x($?1kEH!=CO3fzUm2uYqa8CMpaWY}Zx6io*~8vIhxe;a>~OM_$e7Ss
z(Qc%x8_vAJ$LTVN2`8DmEnC6S@>Y4JSKz5M;rh6&6$3Y_73BY=w{l?#o1=M0=_}|i
zwFPA}K=<cin90NZ*_5W`DJIr<By;WH$WP95BRVSzWv^mdS$s;0DW5{g_SPKyDxW$D
zR}P<=gskmTd6F|%xuQD^NuUa@vrs`m<POR&)}>xxoniLpS6L=if0GU|4d;=vY-GTf
z{+IJ_vcy=z1(Kh$WNb?1eu$eA=(oQD3p64R(fEEB5ccA-m#7A~v;<U626=-X9>72$
zd0^h+{ua%k<M{k#dN}AAs(da*5-Vi8Wgu|epkSWOE2sV(q69p<aoA`yj`jvqE}lV~
zt3ff8J5>p+Nc&JzPZl<#Am#{mD#U%2dl(qXV^x|LSo*vplJVWBnE7<UlF?0vp=kJd
zTEM)6fPb$PJj3K-+N=@pGe@WvBc*Jgu7nEZ?!-z3v`U}_C=huG?C|P5dz%lOK5nKw
z96hFofkWE@I99P&mY(eC_EX(z&6&iZ+^3z~2ZO$6wl}1QPnm>tksy@&4=3Fn@MBIw
z_yYDrH9M?qamZ&asU8M>cnMY3M(8i_R1ug{4?Lc9C57iKBX)TI8gR?NIf144+pdk>
zyahXe%?j%@d^f6<V`<zF$5SOy{dTCL8+l1VcpezZU*$Vc8lfF>`wGRt!A8uWP_B4a
zLE!6rxt;Drr>b9~q?kU!9*PMPnqS-rVjru_D;tv0#^Ufdztpun0z^Ro<3LLET-A*D
z3vI#1U3{7le|-{7^ix$^Ax)h74d&if3%Al1En73C(w-4AOUBF9U>)v^X5zy3IX6ra
z=m>mT5ttsX2o&$EfP3}xLDv%=pgf-;C3-i~Zlqs&L(@NlvE=1p%3`;oLHe)O`tM{{
zLc?dxzXyxIr}^E6<**9=V*P3+_>+}^PaO3xAH5P_{F%&mP|W+}@$-ER{u2Mf4RwHS
z>W~s?75Mv!3Ll8T2Kb}s8;FsWIZFP4UJ}Y3Cl-1;Jcq`?I_@}n&0m%ZE>q*CXq=0<
zJAmscP=ALvICb+mXob+okFe$v#^GMj@@>Nf-r&@`6>=3O*66Z<*A>MJn&KD<E#BY_
zq6>oJ2pMM@)A0x8sWk4O49`SmEiXpHm!pjR_dkoD!hF{_|1VsodnbQh5P03y@^4;;
zk|z}EKRD}s2{Cf$Rp*3~6S3Dn7+(YX{0V?PiZl)R1fB^ELB>#L0E1_QE=k<;dA=)i
z2r(JB+Aj&{!2%)!*M!ZBhO)FPyF;+JNtAyNwxDHm!slnWGDlqFn(*A>PtOTWyJ}C*
zFk9Y&KnH!cC7g^u&>97&ykxLi94?sr?Be}VaKGehKthhgdmh8z!Szo5T2&56di{RL
z7KG})b6uSojYg%u(1rM0=6}<wUzGx|pBg1^fX!fpo1x_c$ZIV(FM3bZfTkA<)!b)n
z*~pK7T(}_c8vnj<VQ9>{Y4WF)$1^=YKk#iq;LF0X{FA9^-p$`8g<Zd(qx{S7p7^Hu
z_gnFo*zt#-@dw58qa*O=2cJpsPX1@(N9by47tb~Sdhj!=>^eVx25m)tpui>mG74ML
zFv@BhtIORrwSISFO?7SAW$t=kX}P<$#$8$7SW;E9H-2$_mEUI))HeEsi(!;El+G!s
zD80<*zPPr|UsYSvFgM*jzRX=#Tkmtb-Sewz%H4JKwPn7B2DiV~eP&UyLS0-_>n`_I
zlr~oTS)knA-~(m3Z+_zf_d*btF7Qc#h9wQvwF^K|<*uqJxy-l3eX%a)p5-}zR>>KK
z=bd?O$+?~j$)(l}!D5MKdd{0U%`?-cJAs6ikg2K$y0e~is&YY9b+y~qR95eUmZi9k
z6~$qXvf7#&UzwlUl$Vy5y5~1mRQT#4omj-St5|hlB2e0N4t|inuF(&x3ClSAG?AvU
z`7A80zsy~UR)jF~rSh_BUunIsydR$hKEJ!h*W{0-y;!lGE5tGK?aj9Ww(^yu!9`qE
zq}*3p?RPJ(@>eRWH29%|dptaBF-+qwEI5CbXNJ4MU+VWs6$?wNYEE%imM-!EO{GiG
z{HAImeM8mdaDg$^)s6Batz;O-SJp1{9p5;=vBuwce0izA^!SE4Us*%Rn2HI=8HVCp
zQCdX_VSVMc(x4z#jpi>^_OTQYCth4%TC<?qXB5@fE~qbENLlOsU^0J+ySM>19aC<=
zuPdrc7oe~Fys@et(v{XPXk6&4@f%~B94%T`THoMv(^o6%YZto5G^87F-#8`Vv~e)v
zlG-Fx(qCWdhUt9`M%gJ<r_`R(P+n=+ib_v|!qV{|>-^eASkwW4YnS?~Y8J3Lw6WRQ
z!OaQ<5(6B*VTcV_rglC|un6UAYYg3_GmFjkjA;}9!n)%ZURo}#VRNpn^VPU305`@|
z7;wJ&(lW|2VZsD6=PN(LC<Qj%V3YYlt{Mrdj9GyHZsnPI2#J~eUT%LoaJ|-%cjD&t
zgXbY{v~ij7-+vfElkpvdZyLUI&7t^a;EM&WF&tmcvk$;G3*UqB<$WofKyx%r!WZMT
zyn%Eu(joW`!`Fo`_9cw{@#Wbl4!jNAj9?ss@1gh}jxRo^^QWq=tYJw_SqXacDQM9}
zrPWpCZU7M$O8m#B_ruqfm7ojMmbrcP^|kesSChso9)oZ2Ed*V5PScpCap(r)+`Jll
zia567Goj3=Dj$Cu2&>9V*el&fOvR}XVLCio`G(4cC8g!%^)>|{kX;D+$z_SHm|Q%j
zajsNq%P&stg9U7J*g9%E(IgH!W*Gx5Q;^Km*Q669?20`5H@iSV(V1rypH*_f%z{}S
zv9FVFeEON5{An<_w)gk*KmWXG`DdQ%uBfe-A(?@vy751#&+|W)r@R<1&pdC&jJ<?#
z*hvk7@eOb4>!2IZ@S3i?szHx~dwa7){sHLVH|73paK-S;r|0AomyW#!R3;vi`1;x<
z169_a0#04#uimR5>UBecfl#cfAC4nhiHt?3$fQdv>nj)KAi1v9NvC|O+^2o8p999~
zt0r+~4KmEKM~Sn}boBRFx_(|ePz_oxUk%1gUx}}_T**&W&?zjxMEeOrgGs=gj6RhZ
zRedrHvlEGA4tX?RtC*OmUXe{51M9@rFLW{}mB9W}(KFnqxfk=ggu@mA+MwTCZi*V<
zOBPnuIEw9~z9oL2t`OBVT&@cZP*4AMQ$70PG39e1TU>f8*0|3qE-chqSo8>3qM_Ns
z=9IdsLEK51&0*L6^m|onhChwyPv2jMGYdWWPCw$Lutd!RPqF6rdkJIvbqsw!=JX2{
zwQTzgKtHg7wEWU<IDLuJy-d41=rDo9il?duj*P+ca?JD^;X|Bf`O2%8m<_Nu|9;ZD
zRe=9tbULuwdOGA$eC;Vy1(!~xi!hUwzNDos_ce)Vx<1{XmF8BrS25g4=cq^N&6S@+
zcT-){PR@9&1hdWW7$L0w86(F)1<VSx+?Ky7$@|;X?gIOww?$H2jCE*w@gKeZp-Z?G
z{0LKwcyxXlbS1K>06QSRehNArSkq&1;GCFA%Q)ki#`n{kjC4<n<ud@Ct+ELEz4TTO
zi_t4a4#~(|$7VhdUz^Tms_80A>&r_tQ#Ek9v-QM@Z92|<zGv3#{KB!O{ppvd=&{nB
z&4IHD^UL$S(@VInx5w*#beu89kbOVD;u)G_jQ(-?n&et98Tba6W}EfGpRo>j?Wie#
zd;a;Us)qQ#*h7*}e2V`bL&pDoX}cu!uBz=nZC0~abLbFaP1kRlXYOf;-%6iTbk4A%
zEo*}Y;LByBJx8lAUtI5ZPjw$NrtBEABP+g&PIpVS2$F~=<D#Sf0q7hv7ZUE9AETA3
zS<#hM`Y^A9kmU&0B{DnVirlSrvu8J^xSGFAVQj?|Grv1~OxajXsrgseHuz4tKv||A
zSGXi-_d(|vLnThLJ^S+`%%Q^q>L3#AgRgTA*-tOUe`c(_j<F$D4(GZRg7{5rgTJ;e
zW}1nxt>lsR`~h~;bTJMQV~$vU7F}F(#L`0;bIPeEWTt%duM*j}KoI0)1e90;W9rx-
z3*3Fvl{MCbttQl6QRzJpU-$Ua+^R=lqJMGyOwKM%e;A0K62oWQ{`yAWI1F?C#`+o#
zI5myf?X2_5u0Y)K_Ot|%PHi8m?HQ3^>X<Uq5tO{*2m3ZejDB(GwLzw0FX0FAx6E;Y
zNv_*%Z~hnL&zw;(<Lo1_y~oD67%PXlw7fK!s<+{C1$Nlg!RM4%Cy({60qkhHZhwb}
zv4f-Cob&@Y#$JA5)5kc5X^D9BWu-No3bJDNLhjl7v0&qt12>_uaT&LqwQ0C-hfw7k
z>#!;0(+*)vYIY6F`c^aRdgGSc%QL>Gp1nLHKK<hQS{wn0EtGGFMf{aM7-w;5gS(-w
zw9Ln?kt4)C+{e*cbFmJU#i!!y@Dp>z7Oz6L+QKy*Hg0(<MzJ6F_cqht2B_C6H*joI
z_32<`ZzD1syR*&CjX}vtXR)LxTz5;bQyRpj>nF0J^ZTl+Ptg`&*CRv1VE%HfwsDNq
zSYMC*H!QZRoctSnK1>^Z3+wz#WWo;HSLvQC!kO|aI(qYI?uDgI?)jx<mo3H=mFEg-
zr(nzp*k3*g-v)Q_tTT-<7mgpZaQv9^@ngKFjG2DQnDgCZF48@eN$=_2KW)7Ev)3*^
zjk)IK6Y`n%|ESsB|IIvXAm069|4p7pS0&i~$IdN&Wy}#aCcMPjjw`LEcPzodf}DGC
z%L*mNpQetQRG(Jm1<c)FCr}G#@wMlB11{AqI>)SGAVQ1Ip68YDkYpfQieS-8h+S0W
zTRbqOMW?#bfP@b9-5i??IFsR>HnZgXf{Q#(<Je}^R&dCn=F+xY<!VRLBGebQJj(oW
z@wQ!KNUZW=C$?k^+OFSWZzn};c?sQOXOqUlJ7<L^5~+o^?CwB1@wUB%Af_B3?!oEC
zQR&9C{8{-D2ud5us;Ww~56W6wMWu!sIoZWA(wOlR@v5gW<4?$Gz(KSy#ZTOFW-%Rz
zW9YbB!Q)6PRN+`nModHvbE97G8GLU!IfubuhzP{9MNB^S)@Pvl*mQOob+X9QKqy*W
zZhPPN)-QZaXH{&J6|MNHj*4Ve&apik2boiO@<R(C5M$xZ151F$nujs4D}uM9Tq%ml
zu@3e;gvx;~t?=Ve1K96jj)Oq(Jqs!-TkE4voL(v~tZS@pz|0Nyr{oETB%L|}hjPns
zopw<)IuF-#=0>9{kyax0;=nB%ui3~!eo{p=x`z15Xf%v8jC2purmAT4B+%W6bT!gH
zUy8e?kS?u>Mt2~63h5rCe`ttCN8#|#=Z{9GAkAD9jb4KE_f6606-XB^iAHZldJxVU
zHzGYA=?<jJknTbHGSX40hLMYN%qd7$;Krg$kUoL*3Zyl-UFcS%%W(tNMx=-0*1{b~
zuf~0^dyuY2ItmAm2jL#)DM&BIGZ&X2ZAE$o(mx@+73sKNp*+&7k?ug6aUIGdJqC|z
zjl#iW`wb|M^yr&V9%(PqE0Au$L+7_5J!3V>BfSgh4x}$4-GlUaJPbbyMqh$-3exM4
zUV=2`W|T+D2iR{#dMVP4Nbf|t1L+e;_aOZn(os0{+=X-s(w(=UJkq3FQ6A~Pk=}~*
z;M-6h>G0c89_d7+dytO31LbkxnvZk}(s@WPK{{&<$|JoJ>8(h+k#0o#3DRz)SKNs)
z83(QZLh43(&hMkqJfznkorg5zu4uFwX)V$<NIQ_WA^jL>H`1niP#%Z8e?sa;y5L@v
zN1FWylt=o-{dnjC>GTJp(YujWBi)MhKBPO5j{0LXnubH(XCID6vyskN3%^8q;iJ)L
zCDMnGu0$GrEE;_XY0|oA^m(Mmu8&4{A?-mr6bH-)Ziq(5A-xo7A<_?!RwLc-FL)Lh
zX))5ZNPma46RB@YG}?<af;1Bc%$3{F50HL~v<PYQ_Gq*Y={-nSA&nwki*(dqQJ#6E
zy+|K;9_4Ykz2<Kyk8~5#BBXUMpghuJUqpGN8<4I=>ghyzq+9-u@<_8@MtK}=*C5S7
zdIi!Vq}i{aJknQRMR}xSyHFnKUy*hq&3hf?k>2?R%Hv@D&u^kU(n&i|9_gHaqTP_*
z_ZG?_&3Zc;?L*3sPK<=hXBr7jnF)ssO-);skV-ZN&*XdOMWe@&P=5xdH;mE1lYUJx
z>f`}_M&?-=S?9QhEKX}SPCxk6V^187`|?zM(~x$|L0c1`KZIR^@2b*h^kCjV;2D~3
zoSQi)aS?u9iHn*tGZGioWe!VRRGB$6anU@FD{0Z3LHU_UiHnL77p8-FIwa2@f@U=C
z#<#L88od@NKPTbK8!dMNb`1gf8JQ~*^E0wqlFrF+m!@ZAP0Ps4&q$jw<Pz)pJoF>-
z>cw|Tc{IxV3-~F-HyJl6WI_I?5o?SyGBU4AJR>9PS4n4QxI@WjWMp5PGA$!##i0C*
zNiC^!Gu9?1%}z+qn3SK9lb?}&Mur=-D04;zghg@MjIWgc0<s;BRLOScut7-&CqN;?
z4Uix3-H_o~+Eru+-qVtlv?+-s!h>&P{~h9V9!M;jUkF<8>w$j~_>Ynu_@wufNkJL9
zfNxo-|CXj%N8*zf|32W4sEJ1T)<=tf(!C-h%V(j@9;%H-uV>qt<*CjUu*A66fY
z{sU+WpV(>fp9B2M!2do5f18DG0=~E*8s$ABR{5U_TImn`xxmkK;uGl;%0BCXU%C%`
z7w~@qett~(xmNi;;6DZar_MG=DwGBg{Q*C6F&=Aiman$un*{ucz&{aVzeg?m%>n+o
zCg>M~-_OD~0pGSH8vPyQu=G#5!Ll!3j5PU*X!KH~Hr{8+w;u48E91-81^f-bm&Vk8
ziB*3e@c(FzM$d@xpQ)DrWC8va#^Ga~`uiOE1E0_uA3q29A;2$nmQQT5?B4`@CGZ!=
z;AdO-)xh5c{M9k|t1SF_;L}$`qX)&<Z-iyPF5r_^VqA-{-z}E?`hai2vz{d}_PNNi
zPZs7qTktI9b20UAv+AD&e03;3eh%;lU>>qErhJQ4z6tnW0$&<~zr@0?27VLp<DB>f
zf>!+>_`d=_&xudG*ec%z{1D84&WXW$EPNmExxmNke_5Ef9S8iPnDRfj%1`3_4)|$K
z{FSzRbAZ1G_#>QnrJl&&1pK|g|HO%}x67{v{wd(&$+sT($M%8m0-i4}kbB6?_;r!Z
zzYqBPfbVnGpZK|@UlzuK=2h{_PXhi*;Nz8_1N;S;cgCx~3HaH-$1A@Y_zBm?FTWo6
z@xaF`-vxZiZ`7kkR{S{Mp+E57?t_08;_M#acgNW06U#o6fbRr<nM;`Vt+4se0sc+k
z2l8(>cs2n)3iEMH^X&L`jw(+OV=(9Rsh|lFVAhei(vpF5eLwI&kHM=T&GK9WYy*C7
z41SJPo@;|dygVn*iFb^#qcHbB68Pm#e4=u4@}C0y3g8FQk@!o1ZwLOunDVo%`mX@q
zz#8OeC%#nKK=cQG5b)O|fWXv0snv>68-cIG+9h5e+5!C4z^`}mPkhX(e-H47uZ~7H
z$KW>#-VNBObc|8J|IHa=IZt^;!mPAW9%!m=iALwd@KBCM9p(XlC-Cw5K{N1cfS=+l
zzgXG84cHpsHv;ePFT<s_3|m1n6l<kF#@OjT%T7Ci{~gv$tugp!3!jXM+jm$q4RO|y
zG{_oDMgt$gn(2I}jEOTWJ52+A6xK}VJNeU21yY6BsS-5j-W`o5$J7zEWLOFO@%Kif
z$Hd@|v}Cvs_>=D&xUFdC4&Yw{ew<T=q+=}pVc_4xdTk&bSUzni#w)Dbu87f5HG~_m
zY~bsGKhBBQ{hc84C<M(C(8L=Ps)4_4ANXs5-wgcoh|RXYw-2JB$^RkX7d{fdKRyrq
zOTb?lW5dfW8}0%==g-mTjo@YRPh2JKNd7~y=Di2_mKgjmE&Mp((;kmT$2jrx1g-Q3
zei-nwgKqZM#A2&_HSlKxe<<c2R{cp?)|~5F;HNzqjlSpPpZJzl{vqH~vG#p127j-G
ze;)Yjf&VfF|EY!F1$^aS;`ha&ShFtx{*jpS4_W2M0pAC_Ji=({_qc^G1pfMsSkuOo
z4_f$Y;1i$X({}h+<yTnvYk^+~{L+~AYqHw!A>g0e6phY~DX$g{Y`^D$H`?OkcL9Gg
z@FcV9FOce`{f1&N8V&qIPJH4YEcwO(zZUq1W8}NvlCKc>)1K)U2a;x3eXtt%pKp!7
zCb|~*&w+1@slU#u{~_SL+v3mZp9emGy^Q^1VnBvf|1RLKXpcsJ8N+{x@E->l_gcDv
zx7RGPramuYZ9?J~SV^PFu&S12u0i@KpzrL!8p5g%TLcJuBd|VJlt1IH0Y+K)HNM=l
z>3udD^$}pNvpv>2+sk#fm+S258F}>st+8bd|7Y;YdM+AeGyM4Z(E>kO;71GmXn`Ls
z@S_EOw7`!R_|XDCTHr?u{Aht6E%1M7fi^zOj!&~rx#Z=ie_E!%@_almn{giiKiGPa
z&!2y*Qa<O-PudQ*q<rR`A6YCT<ummBWNJM3H~2B}Uwjv>Wq!3@1mieIKAoC?TWI`r
zYCN_;<#V&nV>?hjSvuwO_s$QG&G^ZB%Ppz7*6!xs6+Y&D12{&I&w)DSeI)#t@Mi3}
z@ng#E)&|4YvwY0@wOK8?e@B&mAeW-w>3pviJgW1iT)jGfsm6b;^IBY8+y7mXrQ4-H
z$91~Bx9jv(oqnLxFLgQy_jmAffKHFr>B%}hOQ*ASTA|ZLI$fdDn{;}wPS@#lyG~!#
z=?6OfQm2D-10JB$qjh?+PS4WmY@Jr<bdgS1==3I?-mBAfI^C|*S9SV<PQTRYAnkAm
z==5lvo~+ZebUIt76*^s{(-k_sNvHSfbe&GO>-1HfexOrj_5b`gt=p{&DF<#^r?O|C
zdFCnZ>|%L8i~A=Nawp`BKT*A{W#TU<PMVN&(pUvGrhr~KtLh4!+T-V%JxxwJ@v1Ac
z=jiIpgd1*&#*(AVuGD#h!pLTwN#sZs{hZ@W`raEAQa0-xbmnCvIIGT*me2<oh8~2T
z^d-J|rhu9<F(K_;D3X|%dM5gMVsh%UpiWFn-O40k&`#iz_(vrNjTws)iHRw>2mwjA
zg3F*|asWwAPQj~)jFc)|Cw+mqHE1jkuoBZ!CLm}f{S!hA8h09z&mm|Hy0)2@w?U9W
zA-OC=J%fJ7(u2}cA48Iswhh0j+?7WCJ*Yp0fF$e^8mV36l$Q29ep7Z3z@NdS-GgsB
zu?fbIagZ+Uj$OzM`2(a)TSN9kZU?KhJDYhqfduyo!Tlt7SP0gT;7=qN`Yxea2?Qtk
zAmQN4@5AM=Q7}Yi`pnCbWF+7xeHJf`jF<3}Ud-z>NLP$B=}LlT^K#g;;G3Pki1+AZ
z6vH0rP2_8290A7ZOK7d+jB|iYznt*2j3uB?zk=|g8K>Yk{RYA_GkB9r`pp;MdPK(0
zA)vZGAmd%~xuqE#ve<H)kWCtbmKbsf?cvHEa^^`Wnc0QR5Kj<fne&J}OQ1KRDTbWg
z44g5TcAmycSne1j>3X!_5N|plXwI?72t|cSk!mEpin@l5q8|(^0He%df4T|SjH3v6
zgoH*$HsHe^y@gDSl%FFJCJ*95su>1@5t>TKemr+fS%<5%H2$Wx5%@85P5G+;`J4I=
z0=Gktlx_j?H}xX|tI)0~UkH%DsoYZ@_6GnnGyZipk_<m3`a}u#Gy%Q;h6aflw-Np+
z4Vj#gh9=6`eFOS+nvs$YqO>&rrWWF7*mw|*%Gh%k%Pt0a#&?R@y97jq+3+@Wqzofr
zGr0}F0m8#q=8@g-!$_4Vm@FgZ0_4)t_?vnke)dbFN{<PUzo}0V$kXDK4gvBv_3s3(
zgkCBC6d-?7KP0dgjhgbA0QsA`hrq`mL`o_&N=xH!>VEjy?_*kdlmPjgdbC`!<x(fg
zCEu5qIz=uy<48SQF5l<nOu77qmviNkc1^96OIjthK`z(x@|SY?H(mzhl2%H+K`y`I
z<sEW46u+qt$mJ2dd>lWC$;m&1!zHFA*Fl4!5&XC!(jSN2h`PHnzqknKj1TaW`K9Vk
z&x2#;zZnbCGTLDE%ssr$%s3Ku$^4erSs8B=@*S_;8FSH0nR5D-J$wrJ7zsq=48N3_
z1j$Sq{si4RQ8IbMr?B%ROU65VCNU|JDH?t+84r@ooZ&QbW?I5+;4^RdLY7HSxC@!e
z;hC&+sOV8Q{9p=}DH;Fpo0%D&a6R}m4gW369FV|%)jWJYbvZD>gUnUK=d;qRg!{p*
zW%z8?Gb-UpWL6Hpoeah#(34jUe}VOkPhdP<J$x(hnG+<lX82#3IbJgN4gWhcIg(jB
z{B>qdNMI~mKYS<*o_V6k);4?wS?4Bf0>M@ziTAkem-+_k-v1h!{N`cj0+N-vhWcjw
z9mO;6<fW0(O^&~(=4lzbNP4dXGWcROVQJ(c(SRCAG$7Q6j6h#Bq5B*A<)Zjd-tC@w
zdII4?8)%3;bzROn^W}QTryv}DHR~KQ8%7wuQZ6UJhQkAL`7vy}|G^Yw@Vodq9*T}I
zQbr>g!n?2cKZKAW|0304q#E)ILJuc2Eo~ltQ;YF4a%dD3BaTd5C~!K?&;*!a#AtyG
z$wA#CM-x2cUPwLi1Ywl}p-28iF7G6hQ+b)hmt7u^RmjWRA^iad{fw7i!RQAZ+{nwf
z!Qp^IuIA;rxXd_=p=H?Z$R}qU&X|+&CS=Sw;x$}?<_*-Hv3MOauFQ;o<RJ27?gx8h
zye5~)#CGKodJ`{SFT$lU_(6~pD9zw92D3OZl20`Tvq}ar$tM}ZZbXfl880sZC}Su2
zyfFxuh$|!-axlewkH{hP!Hk`}96T16cjH3yJYdd&nZpJHn}H+T1BcUCM#dEII&lAO
zxJ=6!2@@PRf+YX1wJ(8>s=EHaZ(ax!62?FfCSl1CPzaikgs=%{K-92^h(Uu3<0P3Z
zl1$Rf1R|B7fOV;@@N=zDvD&({T5+q^YNcxH()y`g)z&4pwYC1bU|nikm;d+Nd(L}r
z-psWA{=eV*B=gQa_uO;OJ@?#m@7vBT4!r~APB@$<#c-$@t(g#}nJN-`0V<tPN}bUd
zA~!f;BI$H(=rN+7L~s-Otpq%RL2?-*3AEQj`Uz7=mI=jyaAXXYCZV}#VYq@OVio!$
zY7S2$bY5sO)p^8^kUKN<eZn6}a82lDNE)6=@Pbec;VTJl2$A0m&!U<eL-QeB_$b1!
z4Q+#T;VO#Z(Hwe-;Obhy*M~kNKC=n`Q0S+qJzPVr{2)y4!Gz}!URmK{G$%Zd5)D=<
zzJpfbg{K2`p%rdJqr!_R=Ta*?lK37+iKngbB+6Mvf&>GqfEq>_(xD_22_R{rTk59m
z4ag5KuY|V7cZN^=2sxo+sb!5ss6tl|yprI&5X~RqRRjk^2@-TQkr#(3(rWl*g2SQt
zB+Mx`^brXyC;GKCa%P6kCiv6>;A=wDsLFL@2Ma=|SA|bI4ETmnJ<+Tufg3{$Nuy^N
zjV>asZy>libU(pokqqaer?o6WYECd{7M0X|`wSvFhr$z$zrA$Kx526m-)ySVlEVS2
zP&a-{52yM&L!Sj{>3HJZ7up4_mWBx)46Q^vO3S_vnhQhE!HP;J5qweT2`W2<;2oix
ziKd+BFAd#Fd?Ezz4&6xf71ZwQL$u*~=``x;n?m#^N9lBezpCsX6Qd&|_#Ht4PCS;<
zGoQro?I2LNQq~8-|JMgG{=eWE3{FA!1%o7iFi5%`{d@e>l#M+E4U{1wswz8t8$cC0
zju7FU_zi}315q~dGQh>52~c_26zYm_s0AifHkDKu2_>Pbvgy>BGefk5D4Rj>(aLVY
zsjTu!6fTUzTqagg*}{LJt0z_y7%0x$1P-Hb0<++x`X%5K8Vkgvd1Nap6b53FcFlQ^
znNlN(M$Zw=0Q4|vP$L=v7d?p*lc!jOFQP7=T(03@klHtTIoJlLOrm=yp)D|lDU%OE
zA{BZH@=lpTaGtV11m%?S&w?fxB+5w>Kw?#rFa`GN;Rwz^?gT4qRv*9u`^kM8rzX?D
zx3+7seOH;*zOTSrO}Pd(#{?)c<!+rMv!f}aypnicGJra1Ia9s~fORP)zlUVxH$?n-
zJzDnMITTq7`K>EV^1x8}0;>8-s`?>7QxQN^S-UB@7;P(`My&Qw@=HiYsNbx=QNP)H
z9vUjY3npN_M}4siDyX0(u4U^L({q&GN!jBqO%53<eg>p9L8os*dfM$IcB#qVb8x6)
zITUD>>GHE6WW__2o~ZM`g!Hr!@i%FE&(lK{Z&Ue6);A!drvDWLu_jxjFxvjRf=tJX
zSTsGH3et)R&<3YdNKg9(HEybP8|dvl&kt41qI88W|2*=iQ*=;knnm4f@40{I2n0M;
z)^xpsIu7$p#a+~%nL7P>N}ombRqFKlsI%hlRG&%b!)lLgqIm}g-(gX2;B}uPx|Fq2
z7kL0A>d5EHG>NJ;Cv|$Up*dC4{24Se%BSKjIwl*Y4q0c-JO;R8sC(vY0E+=szQ+`=
z0x`>4AGiR&O;+tJ>(l}}%65l3jyh=JJ@YZ9lDMuUisDCqF{0FMd=HdnDy!Xk6r#-h
z8t^-)q8k8wlPU^utNv&zvgWz<q?~$=4j?z5Df@_DhxG~Z8w>nTiOmeoe3#ik(xw7?
zi^HLcI0zA`iVmyO(i}b}4z<i-M1^j#bqolqsU7QiOdX&SyLO?mx*12JyL2<w0jNwd
zku+nj+l=4bfP(ta><%*JFi_T0KNohJh271>?o}T<(n?@E`T*+RwAXOMQkBFDlApj)
zZcfVj)(qT()FS>sc>axe&R{|jagHlunalD)sG*T5StH7N_DZm<A<N0gN@?R658JOL
z<Ee~u*>OgV1$OcEMlh)@5PT8{t%}`DOZ$OXPrNE(F9S2J*#v;fdwpWh7e)p4i_99D
z_+13|!kH4|LLb%*6HCD9J!JhCaeA5ar)kletQx43x>>97&zu>S!oNyTVKUeUM4s1)
z%Lt|rHF^y8VONdRGe<BboPj>9*RBT3T4GsC_HQa)?5Fze!&#`h{8VoZXQ8^qPxV1Q
zyoJBU@A#=cK02MM?p{duYo@|5(}y+68U)$Q7m+st>+_k<0hmR=;{Z+uQ2F0nq}nJL
z-vTOCN1FK1U*l+NzpqLnD`1bo)XFNc-k~Z<V|`Q;X>A|XG?gi{bVAq}ZrTeiX37S#
zv9NU-?h|W%2ZbNKHCow{n_naIn`N!b+cUf^f0ospx981mm?1R(OSt52;(wHNi|{Xo
z6>52R1E{<?H~+slMsxH@7(l;fN?Mj!i>(idUkmVuh~F6iCZR(rhbAd~EHR@jZ_lMe
z5X?B}K(Gz67T6a$iqHmLGSL)ql>3)ua57pKOUadzZhMILQA{YhHGYDvbtz|pSRdBU
z_X?Li!i7A2<=H+i9*20xUQ-rQ2DSmz;>DQW@i{+xR~Wx8j7P&+R$ky^>~UjPW@9|W
z^v)>UUxA^n+g(e}_lAg4=KwG+>0+iXrp3ZH+l2BWp}dPkxR!}BG(^r*GM$Fp{UdPE
z&NCy+IM1Jer*WS5G2{Q>Jhj4pz_eH@=)-#SpCau?B5fsHsu_VlW@<huYok>SOK@Go
zoVPfO8fD6S!WhS5>a87swdzhLXMIBi9|QoXoWSNN3i0?v+nIv3C)ml9j&_z?zZ9Nd
z7M{-#&%HjL9>2dXo##z{p3AIR7)4snKMT)NxXa3ia`RlC&ht^GbY|5yi$=QUxdg0q
z+q;P8%ei^BGEW%#h&?ZEyE=~*yO>)K1J{R@Z+%Oce?gdk8-VGDeN67?Y=JeZjAeO6
z$P3^oEAM9#=?Bl)sB+p@zn1G=4d+o0Mp)k!*uQYxT$8m36JsEFp_(I0pEHQ^W-iu>
zM7JqsmWA;1dSP<$UT#ml4NIgW0n*QFQY^+MNcNl}wy3Sdb|NP(M&baoO#(lS@w<V4
z+Jira@q2+EcJXu8dYMK&Oh>?s%@S7CcX*gi_TYyYe=C^I_29qE_<Mj~qw!62>7wR*
zn=>9k#+;vW;weh}lM_Fs1lb`SuTVm!B1+T`aL$`Z++k|W3qH{>UDeBsLOD-J&Z7&F
z^D&os7dfvRY2ZMqv$lg{4U|F0C&>K==UxE(8pe;BhD%=<E9Qy3J%?x_J2h|52bjp}
z`<bE)*1k|v@PeP*bN$yi_c`RA!MRjx6V31S&vNeJ;7+p=G%d4Hfql+CBa*7mhd^{h
zKz6NDg{oh|_@jXTro&v-zsLB;V9`Hfd{S3>0V<sZ3>{SI2PU_`9=FdRAz6X_r@bbj
z>Q^)EJkTBiZe$9yk_ORIrvx-{{SY+K#Ka`#5{FpTCjrwzIyj5*t`08e++`>=U~=>J
z>?VHKa1J%&Dw7j1%@~3~e3x_Apd7CH=}J-G=cZ{y-(F}{b5~;NZuML9Ucg0*6}ym}
zGmgv6Nm=`-&*o#Z7Ar<q>{ajr^RdSUCm%pkPZkHc9NVU{x>rwO57J62j2j{ddjg>H
zMJCMRLEgNMD>W<+F(vmuF7a*w%LTMn8)ZMuwEGtlXS*^8o1`NPTOmca?qQZ^Kp)=$
zGwLd`u`e^e1^AzN@Lywm5AgrZc({&Z9`w-XK|kPQzhgD2MZfoC2vW`42EAJ=4@I<u
z>I!p$YN@m5_h1(mYbD(zxDWc7ve_nDx<~LABp07VH%C^}4UWb2mViqlT+Nxxs{E(a
zc_$Ou3^-9If4$^?9ji%e4dt)$=AXd%Q`?obmU4TMOUJxFQ-Rs?jzL+#psjthiZ(LN
zCK;~+{S<v)LMQ9smW?d6#O)hMPVH(NRF+O22jeKO!7|At=_=t^;yX1~t1SJvVkP{~
zu?-qkr>r@|;}M{iJPk$BQ2`auaUA5;chQc$5UHtsz1_;vchV+`u{6+K{lH}V-iT65
zcR>8WWa74TDSiTzU&ik;b^N=~+KBS|kg)0UZt0~M3M0xNKzixD8axZaE~PIQN0k2r
zShf6R2$0vMZ$sF0!GA&lSOY-})$y<4GT&~edThFs)s@-pe?S_Qy-Jx;ok`2^rBuR>
zmKF$Il<7Lq<94~G>jmA3(P$FsgP=D2$_?E~G=_`W>H+G*+DErj^inV*E46@b!i;h*
zm-p)x;i4*L@gG)%jZ4r_`nT6NPQZ!&1L4NIEkwz{zkSAvyc+|f)XF?)?4Pimaz)U}
zAHnh;$A7sZsPV7R0sZG4SV_8n3;*Scpp{pF`ZD~ND}ovifj#|49$0k^1_b?&yEV{w
z7Ettm_*tvgfsFpgUSu_{g9P+nR@HbC^g#b*HI2<EME_;;8t;NJ(0|$D#vM4(f7!Ce
zyCEn2hYvLV9n#T%_>smtz@PrZS2kV+4E=}qHa?7Aq5tqptG<W^(0|^<K;!GEmHtat
zH~tX0^j{ilyZ}t;zcku-8wlxt?3;}Wve5t7ry8e&KmAXgu<F-1(f`=<tyS|O7yakW
z39MEqPyda_qX_+%9oM)E?CHO3P9ueYq5rZI)UTn1Jf)wwTVKPqX3=NxGZFv(H+)8e
z-KTi@M&7tFui5ZxHF@+dP2Lq7Aygi{OOy8n0!IY%gO}nwavOg5-5HHYf0$lC57UEx
zw9cT?xYkl5UR;LXQP-f|FXq3u5T`<VuAcusQK&+CuAcud;u$QY=j!<%5?owJ&(-rk
zA~;<58h-OXCOA?^&(-sFq>!0Ki;0i2D5Iw6JW5zPv7m^atLF!FqM?XxMCa$}L}L*>
zSI-}z6KjiZCC2$W(Og8&)$@ba1(5LEB6_Z#KguF+(pf~$)$@yVllqFvNU&m^NEO{c
ziLq83d<Ki?xqAM1s~(99i|Dy}{se0U5<80Mxq5!sq8BT67SVI{{7DwQjCpAhJy*{^
z!s-Lr?jm}wo?m5s7K!VN=(&1+wNBhrM9<apXY0i6Mf6-fzeXnx6w!0_{5tEKDDzNJ
z5p<tFN6U7wh@PwG&$Z~qq$gD#Jy#z&^4lOAP0!WyZy2SYtLNWD?JfK%@cCaMP!$dn
z$D66;!9sejo_{O#R55fzCGvyBBSe-!EjVl$0KzUuLSyNJSB)*fQ+*UKpy%rO$616g
zpy%rO4F+zba!WOwAG{KY;0qubNk3!gIeWn+B*Peb&R%e-2I)C_!DSl!3v@lIj06k4
z0xEj-WvmL8;53GwvyYlc7=Eva-&!i5$GC+Rfkx!y2gzv0&~x;n=^CWx=tXtJa11?1
zFPg7GdX8SSh`^E|jIz;%1Qys24M%W>pIQT61$OxL2&L4xHl5TkB{!rcOTJ7T#*=yA
zM&|1goWb@yLWZvfSYSWGY_%TWM`5kUQ&dR-5taR3vpbiGN=_r>M1wrY$iowDJ!<r1
zBjvc)BVda<{y~)G26?{}r<-h}ryAq|MvmVFq&1q5eurZ1dk}UpVWTD>jlG8HH(Wx=
zVoK7Dp3-(aq!>eVDWuBsD0A%NNYW#XiT?ob3u-8iJ~YJ|mk$e_G!80$6y%c%0K5-i
z@+U$B#1y@1n_@L#&9#0CT}keO!=n)(R0oLFczQ%Wi54<PW6?c{9=Yh%_ryl<D8I&3
zI!u(M+li9i3*17K%>ZryFnJ*peSs@YqH5d%(^Wn_1vi&AgGqS=xr0pUEPtmyLo62|
z>v?dB(5h$hZYH8og~F)7?t92c!kn&QqREW4s!Aca=JcV%>E5iI=Ik|PNHVNa)l_NZ
zZK$voTu5=jr`6O(E>5p%oz8gOwgS6bq^VfXMD#xC)HY9vOLLX@jHg7!tDJZS#V7xs
z6F)_Qgedrgh8y|)kz^U#wyFQLZSPdmu7s5YhRB3$vXJStFbfRrM0z?c%mPEdK~XjB
zS|V%KWYh_Utl1@NHe_byMFwnB*G~t4mDi)hc&P@7><Bf@th_WGy#jWmS$S!8ny%c?
z%_zd1(g6NLilCWAH;$4$Zu`^>S;&O+LZ;GPHqDZC!(kHxm9i`-c(YQL1qDwXC3F6)
zI2uO(hv8Mf-$KMZwGh$<&GcT1N@;rczmWT9$Q-CZrj<{w+n;UiBdom%*)IXWRWz>Y
zG&mEsX2<&R@GyUva!$powN{XbV#vwwLWWLRO(V`@@axDJfkn5*m*UE31U4AqWGf|O
z$||7x9!6qhB?+aIuOg{}=1n`iANV9hE(j7Ge1vsGsHD&iO$@D{FvcpLfW}N$g@wlg
z1@Xo!J9JFpqQVs=&<5l2OmoDMiZJ<yff5zX*Qp?-2wh-VD@rT?qoyKbHYf_~9X`jE
z((RHWMvPcdVi2QCEIU+KNOe)k&}gT?0+c^&g5~C63E&DfrUVJ%IF_;CNF_={a`^tK
zXy@Ta;%9t`hs^{eMo4{l4M2D<fCni>GHBXO$udt8yeGP?p-?-M+~jek0l1>c)z)#P
zhTV+P6(wV=u_GJ8*DWihhD{maR1|=&%iXe+jChh@G!;n}gK{f47fqz(G*1%5)0voZ
zkvzha1o4rD=;^T|NrV}bt=f`x*7^y}g&WYgnIlxK=2v;y4huxJAenXf4s5_1-m?sh
zjw%Exl{?y7ju2IZAg+c{wI^S9;%ra8M$~u_@QqY6?uB}=BZ%)D2%?N2^&%7D=T<Ax
zE4?){&#UG%CC`VFJ$geIqJ?P6LINhC=rM-vHOoqCwV^KZr0J61=_6cx8O14eEa^e1
z<5;iS(i^-YL<;M57cX_Y7+0XERvqspTTvo}%N_IxvXh9`(Ft4{-H5ADw;{N`s%CXN
z#?(j(SzMY38Dm34+j;0UT-K%I6Bx;>(opF3)h;644Ui`NZonP+c`~j@yb)&nYd1h?
zaRkOY6tEi65;Mj}rH}DK&zM7>6mb&dG=N7-MQ{e<PYx4*QY@o>jN4VF4tjAwm3XnF
zka6|uBy9nQyS-<|>UgIQ3Nyy)1h*q;yoFt^lq_{wQL@aFiy9~T2g4*8W|PIF;S4;(
ztlTpSv>`;y=$I<xGy*D+;E^#cU2w6f>8{NDp!!I!4b5<So!lE)#!O=~mFYR~0ci+W
z%uy~PhsC6elr|)7R|@KoJWmqJ8<DQdd`}X@BR#4GanPNg3S`Pr1(>J<`ZCC}{)O4Y
z!a-8vZ}vd4e>V1D>mF!}&mLSbFFLQjs=F&OFj&=*7?|DCl}aiviAi*Kwao5lZOx>g
zFRaoEWzu0o@N|kq99x#fQ|ZX8;>=E+-xuf72!|i*cB(YnnDY+M*b)JU*?7FVlbV`L
zwRQE@IWXD)7>g|IA&Ri?QG_5*b+vcJ+YE)S#UaLS<$!HmFLd?9O)(+B<yfkd;;~mH
zv2R+utx7UbWuM9MV3OD>jW*6@&Pc^m2JTAW3`4@L*G^LdB<$)nnJ2DTeF7kKqi%)C
zN8_T1m*3r$WEKMnfFwU!g}Ch4${ia(^pm{Y{6wm10G}yQO~CYE+x9NJg@_cEJZ9q#
z2i7$MEg}3(2C7StNT`4{f#xT>IufyNgB|SX8!)7cTL!wiQ&nAwW79eU^qOU?9a|%I
zrOr3RSQZ83zOG$uU2AWx4nA_soi7GI{wltK2wX@%XV_zZX@4frVVxIv+Bz&S^YGwb
z?Qy~O;KRtY?NPyMItO3SzuTz559sG@JJj6s$kn?JHXeI)UckO*`!Dk>`zI=xlJxJ8
z{<#YN4m=W8U^2!AWYN1mLvH(HWvywty05_=+iahgvd5;5vHRMCbsNBH@AhUCy~GNx
zMB*Pql(^Pvwoh-~X;<#BH>QF&0e|^+WLx%aR<IhWmxkJrxY|m+i^N+)!4y*0Tftx2
zJ-gpU{omQX1Bka&@L~Hdi|B9L7O1p<y2TWKW#~pE{-MAt&;FV9HzW=W{m!1-4ALJD
z?MCuZE0`Ksv1BI@-_yj8S*i1ocvL5zuu_{i_gfnAEOP((g{w9!0?F5h7K6bHR;n9`
zTZZmL;uR~^vp;WyeWT|6M^qTFuO14HMb@9Jlul54{*FrboI!P7s<rTe&bw6e`@q`a
z;4cqQnfJF*Js(@mO;?|D+9g1Iw2f4^Q)hmpGcOMWZ`72ej*A0}0sm_o!FvP22=w%a
zZKQ{X1HTI{xy~+p`Ye0vjT`Mpw(qz*SiA5z`>^(B?)u^H9}U*-(WcOuXVclfatIxv
zp|`EzVh4$KYUClS`M;<MKiu{-ct4LO*{vkRb3+fK@Z&15o;u>k=o=vas<n485U8^5
zLh>;!&-H;|D)>@x!%on@v5k5i;@+qe?RkNHT+dU1zy>QXpzHiLQEBL@KoFhCNLYo2
zo(?p#m=`&+e_}!Xra>1`A8EpmtlvS3hqp=k8zjlGciA(a-Vgj8+n_Ydekl-qI4}a{
zO^*u#>P-Cpe@_PgYyVI4ir10z1cYZjwD%hAh>>Imp4B;fUAfn}rrM07hI`6%p77r6
zTghk9WB}hoU`{uR%nUcGH>^NG96k@1YBN&`N@;h-H;3%e=x+N*ij4ezxD0}6+~2<)
zRslFzZ68T_*Kemzd{fh1zdZ%JDIoNgc7iZuJ`OJ56<kinHJgezz<JpAK~26o`0y=u
zU!cMY1ktCTNA&?bv&hoFDjDy_*0@HsB&1xAho?9799(tLlHigRtM+{JjTiSk2E(!j
zf^X0%e=+W;ZfI#}I`^0j8wfF*eiqTsDtqvk_NK+(skJ9HQ`(L#4z9AN?6BJ+zytyi
z2RAHIm>zo+HVeTfB$zF>#-k~<{DfuePf`hdcaT)cShPLX)76bko*tuJiFqm--GmJp
zacb{LIrC5y+r&gQA0fq~%IJzyqIJ<))YOI<DvCN_M22HD%9C)E=1ElS-SJ2h(sZmj
zBx9SgIW}rU<%EoO_xASDW>YFkHiBu6SU?`yEej{wBuN)ETbZ=Pl5w{IgLQMM#kc_h
z3Hsxk-JGsOtiMmS(~HusaF~_j{g_3ylnT$Xqx6W9+NGt^yra?HEs1!)s|x?_LlCO9
zGls|FGHFNC_VaO-+XvRzlnbka?tz|!qE3j?$cr0kAxt|sq1{nhE-Zl7oU(1*Dxr4@
zi}xp0EY;h^n~G^27)3$Gehjc!KlWe-i+EylSATB;Py!nis$`V*%R$Pl&2&9l-WJ+@
z(=3>xDeP#8l?$la&6Yk!9H@hKAcGEisF`MPN{^CZ!N8$%?wM#}AW2eE2XwhD=t}l#
z#nPGzd9;q%+M9^;0w@Zmo=v*-G6!wx=_58QUC%%&K8Pov?a9_yqFoInXay8+i$MAP
z+Jn)j9cW-*R{}(OPb7tX^QhToTS-`jF+(!IdC_Dml#dQ4X|^U}J!m&tO7cbfx(AXO
z-ASuW7s*?Cxebg%*|4BKyb#e(yQ(LZ-l0e{)f-5qNOJ?aKI-4vdEO3&UU`Vn8x=H?
z`mi!qT|F_h31U&@&_q9YLoe9*$h1KdffPx&R)>dauNn7==+<~YbZ@#a=?t@20s~+7
zn>2+s?4mD{wwNl&K-<yjkVTJ68hq3c8ekfuu#<X6f2;>?V>2!ZQeC}?sMce$tEZ24
zydO;U$6C=Ey*07)v}ZFr^ct=|+Ei}~EM~J|(ArC7wW;c*Ecp~;<)%@j9y37Z)kA?E
z9|X#@I$FEqbY&oy10aThz%FSArevL~RXxhLP?yo@#8Byl?O<zKQU!Qwk1#++-Bu_3
zNd#l~W}LWI0Mj%3D<-$Pwj%qJ;fs}4747YAi}tp+!=s{Dn=w%2Y*u=AajAj^1S9|(
zTq4l^gS7oD?*geU5u)i1j+1|vu}6kAr;hYOeTYWDpTH+~b(7BK>0Vo)&`oXe&0Ve9
z_`7=(9TEM5qPnZJW{h(sx2!iZ)V`6b(}Pg8clW~oLma4xdVqAN)d<PDSzx`UFny-w
zO&`YGc9h2(8@1j>JDVax<h~emu1|B!sotFC!sY3@bziF306dpiTNKjZ5(w(SP}II5
zN;@K|)YfPZuCOt5wqUzB@WH;UXfZaVgZ-*r>ShcF(xr@t<FzRco`zjt$%>8JL2r;f
zYZVhQSrs(c);rMB4M#|hh37yFn{Lv(h(oRk*SzfHVfy$pK0=Rf*M<fXhNouixH;WD
zGG*)P^}I+<umkgat5&r-Y0dh`@wK%P9X=#dg~8U^xnxtK8(>e*Abu0QRdd=By;}e#
zlX3G~XHIkUDH*3T$dFwh1K2uPJCAZ=o$WaF^lomUVzGD<zeLtaM18H@xUeOlx34vk
zB7&A4L(sjY4?uelk%1mSy@3XMDWj#Q%OsN>Bp;F_4hU14L2NT&@ykNiS35v0GP<{#
z+Cbl*;tD&l?O9b%e_MZ|4d?dSd98Fxb|lF}bq@Bnq^!RF_SR0)M68V}X&+4X<1|Qf
zznsxWl)rg!ZfzYkAN80-oJbAWO1zwmJ|@HRqdg7-+>#<WTf5sar*s;2x&{(T1Efd8
zsO>Wj*rLVK)ov!WI_$d~MVd039sQYfJ@WgwYZ$XRk?10Jt=(H%46B|kv91(bLu)t9
z55V_r8Zf#2ZGBy6QdM6J&OOpyTh$uFM%Pt(54$<gzCQyut~hrddRB7lyv1tkYZlai
zdukvNU($gkPk$HM-w~-g1;*B{T~!rklN6T8z5P{Kx?)VxnuYueIZN1k3w_hpmW(w=
z+gfnD0}e4BOAOEgK?Yb)yr;GA{A{xl6yJ~a1<moAgL}6Mi(tK=M90wjX-jmFro0tv
zkB3)Vv1Ub+Gb)XrgG(LgGp;VGnUaLY;cEh34%14K7ae+<=+QeDLey9{hO23kp1#`2
zOUld-HzdxK94}1{$i=kY!7eq$_q-G@&zu<svwwdIZ@$g$?CpuqHZg-{L!al(MssUx
zYcNQ$Gpr;A<B1M*+3aLLW}|2~1TZUjOzcoogdB~#6R8?V#yaB8T`TGcnjJBZZcfGE
z@?<K*iWB|_O8~PPg)({BPLrQ%=T(iFf@ozyiy2xZw#CkePCE6B1$k)RNkJlK%|n{d
zOP4mfK<&nSo-~UOOpuTgs|+(`>-&4r7+nCc-v)D@@@jEaLsz$nUYZ5kk||n!@?uTz
z$eE;-2z^i5-XDh(KR-!JNu!N)|C@rJho~?PGeOddjAjtRx;oPnvvW0|7ZYX)<4828
z4yHz0&=)o7q<XeA6G?aPmaarQ7H>2Y)~a?`FfB=F)u(3wnr#NW7ZO<I4PuE%MvMh)
zS3k<2-Ny1EBPBkikzihd0kvu)NhFDFR{|5N)}u<cVjtj5Dn8g{OfuP-ehZ-stGYoC
zitG~$M18^t?ZQ|?*JVsr>E6N2UAP-vMQ(+B1$>VlFg=(zaygAea-gLp2BUHP0<DGm
z;2HGQjB_n+0F}Z{$ao4%cMtIVgwK{*FJK4m?$cqZvjyDEs62(2PWDUc3_KNJLILUR
zemV@Hq~!A$a1>Bp&NhUmkmr}Bn(9c`B%p|7snZ1%d5Bk&Bb|J$2SNemBP~NHDG>>T
z0(zxu+V7D|gYq@1)E`haPOT-Z6T%5x=ICjyfT9^oohG2vO(g~XHr_O#17rqpfB&5j
z(i_A|;Ek>j0=|U@_H>|I1QeZEYNLQ<+@+xUccQ-JkZlxDzHmZG0p;5q@E35M)5fy}
zbep}=Na+4v#uo8KKHI9`ll_vA4zbiZViT`9#(d827~JMG`y6Sb=o<wy#X85W-9UaB
zNX_#3&Nm&4T`Tp;x3893D+*lfC~NJfVJ_{HkDDzO6>ymA3#i)t0=sI6iW(kt+8h;7
z+HR{oyM`p;Dk3VN^rois_oGH;sx0d7NloW(stn7AT+<YAk0bBtncPqJGm<Za-f|gb
zmdeode6C}WQ`cGn&v8&21&leUbpndg(754kS=2SWEsMH_w`I}I@HT&wPUGs%bgEq|
zpj2<EW&u5tyqQgsH?v9dW;RLQ%#fsszbZJB&oE;FuXa$*01x6dJvu-opnU8HH?WV#
zx;wznHstQFe1$d6;l0+cTJRR0KX<EhSr}*sYwmNd1_cy#VYp}VKF8l_pz?PaQ2y#9
zHiXMNqiHoP>^xU!0p-I@jBL@--K;CLRzO#6QGfL^V6JytE1)ZpGr!50#*~-Y+S4;z
zn=#^~I+-`f1<FTgQXk>u%dQK#G32IliC)S*PUQq%#JH*I0yXVIIV}28+>@*dep0qy
zNWInjdzV=I8%#K9u`6cHeu1SwK7|NED*Z&PbG6p1Ie}+#YvE8{+$J9!N_#ZY-yn_j
zw@4$UNg|6z{!fGveNvYRECU~ADxio*WPa%gC&I}`%%l^67kjM2FK;$Wkr*v!yF)|3
za~Wl;J}X1#GByu6J|$!P2#G{y8HJZ*_ZfBNWGA$6Y<|VcXUi3iq&Z3ODu6yvql9G1
zs8QGnZ9vV-ppiT`42u(>C4*LIBq~e3`r9W{{cS*6zEGCMCS;U&D;Yffv;j3YgT}vV
z0l!UqO}+n-Hd~t|I?D`R+JT=0Tgl+(rwyn%88i}qMZ&Crp9EORpuLOhe9*01BBuOD
z+JKs$LGwKG`8A($Mdt4jSxHKe6o@B5Q8L*1X|t(Mf{@6%2|fZ}U;7RrlJFy#Gkg?I
zFcaZ}Da%hH!AGX@`~=Y@lt_l?e%frJ%lfFr<tL#;GWhvvv+<J&P23selh7d<Zq1t~
z?oC{oH_z|Vym@~8`0FPbGu`Pt$DK-jMCY$3Tl?hGYguSO7Kvbzp)Wrz%;M4vvk*ne
z<slgMeaW;cgQ1@`+t8EHClaXy{3Mb|20uS-HdiBjBtQxHNdS`!etz0)@(CXaVFG?1
z-H@Zbe%fsO=nEe@B+v->&Clfu{j}NmN%WKZm@oL{bX`)X7#L_z%~iLbHe1~?2xMLd
zKM72dp=&>FHe(edmCz&LCqYRv`1xtGxmMvLR}J7N0ZB6W`DwH96FxF6gP#N=$>8Uw
z&Bjj#vkYYLdnngf^3!JPkA1wPkhLH9Nq~|J`TVrm_+7|rgWY__)Fj}&4(fCPB|d<;
zwI`|9T|xmRUVtH#)TjzGFtW@unQ}CV9QQcvPZRK44r;xCB8<5KD5+tG@H7G6a8T<7
z6!}b9Nx9Ny+@TZp4ym#w$U{ai_-V8Cf`m}8T-gM?fKmBs;Q7D*up@)L6fJOyHVG(2
zO|y*Z-CuvfY*#eGshF8(qmjI_e?ueh<yMx90jw9W#X+4TU<acL)aC=*5y3G$X}_R0
zIayn7=PW^Wb5=<087w$h7Cd*7DJuB4B2xD`t`ZECDPEI^Clv?O=0<^~#(>(=Ah6UD
z<k}l22}bHA4*#iQ(r+ENV*#Hnb*6yJ8I`ZLsy<b*U($_?4+6VYRZO$?pU#+#E)%~z
z!o)9AU&LSr`8>l^rfaO7UNL`q-So}Nug;rxC{Q<j(&omi^Ck_T%7cab@8bdo`8+jy
zK~JgurY85--w|GYi1wk!<pPWLZMEf*?UIm&N>{9Qy5jq+4Px;XmQti9oGXZd{+&3|
zCs+^*C`z}~sRD{D`r1_zViBP8*XZwjQh>0R-bRfAO2_A5pRrUI1-#=ZqGrFq<*X38
zIu}@MA)xB^3w)77SF>N>-43pHzrbH{aCMGpJnP_U_6z)ygR9*yuxN<tRXGm6mQt9p
z1gvIKOEn2N-ZD5d5&2m)GosBYxK_ZCToBJ^3K$e#SsvL8GJ{<%BYt;R-Q=QfVel~*
z#gKEF#gNGFY07NQ%HEXBg6oBc6wEU1O4dr|jk^UO2q#A-0DihGtVEX-;f0QWLn1nc
zB@5`5%9w0LYX>!3eDsRTk%tVNVI1FQn%}sz^7VKIEt4pGc&!hpTEJx<tI2#wco`SC
z!YO!~fTJC&*&twvgUYzL&ScaijBa&!Hw$>5gIXuxcO6u-fNpi`I5XoUt)!7I@FZ8t
z$qX)aQGOO-M$F)IP^HvnRZ)Aye!+;rpo;~3KZDrM)~}C%`mCQs&0$Q8&z5RwXE5cW
zwlOFiEfsr?QLi{ut##bO2B)Pl0mT9Wm{R@6OH&Ny*L-d$H|ewZ{5YS*i+$SiCx6QF
z5;aOZ?L!e$!%KYj*C)5{&KqxXBp8<5`t#Gvt-PNx{Y!k7hmd}Hc@lXG`(e3#c2NAX
zuK;~bPRH?lK9SFB_$)Vd<f)iEr;{5(rnLJjasR%^<>iLUZJc%~4J7zGIsbk>%j46>
zIQ<ho%eA0fpZ#z74f8{y_xb!0pXD~MpTETKp1{|B6?~Rky?%O$ge%V;nwfu+&*$@5
zuCM*&CCK?3d{15Ot;<buKfT;Hk^2<>dm4UvSyoAiF2(jGuV(n^@kOIPM)7$JpT)dn
z!V*Wr+0IGu$1>)4KFg%%5HfHIpXDa4e`fR3|HrhA$&ljx`wx7CrJAzhZI#N3&vW**
z^zyRt(dYA7>GRd|h$i7bY<#RK`$2m1Y!y_83cVNu<4p`idCG<#rD(?m|6!{_XMay`
zo{hQorm<d(fw2jWI&^q8e8_p5$(w7d!_@uPc`?GLSRKrYAEU14C4;w=jW|jN9`Ry?
zPl>bts5jSE<D7j*)A5I^@XHY|O2alRTlNX{X4(p-JDiO^tRh+Q?1{Wg0Nc)PsOcHD
z;yZ=uXwdV>&%m=s$-wgrmx1S*Dgz(!VQ3jhuZ+F@i9e$_2c9GkAOy`JpGcG>e2q_}
z8pg~1vq<XWMEqLV#;=j}NpjFZgWAf{j~4(>{AE8|qNU?f{97v19&Q6ZTe}Wu`eIdl
z2LK#0oSp$}q2G&Ho;;na)C)Lg<MW!PFIElA=XR!lD+m2MIq<?K!fQt1^Re)Mz%Zu%
zpko3QWg!CETE>gBtzmqh55I!(yBR;uuu(0HKfw4B#($Ra&olmT#$V0&LyQ-Tzl-t3
z-!g<eSEwH_zJ~F_|24+1WxU8kdqz^bIvEeoqK`4~o7wa~H3z;1c<RUSa733E_qr$t
z{Ruhn>vG_8jSC77X~AE8Ro1aaeEpkE{w~lXP)CNp+N$|v`m3ExFZ(VFd)mtoapE%k
zCxw!<viLXJEo>~t(E;|ux;QxenDL1#4d>I)A9M^e{?@ZhI%ZDlFB;DY?+M@F68~$4
z400#uQQY|~Q;V9O1HX{@oN}6BAa=Eq@#|>d<M_Nesg1yofj^NAg=GA;G5x*k4dDk&
ze*xn!Xfg=#m|tXkL#x5dJbwe@A6sGYHz7^OZ5nT2@9+1S{-litE7Ke8VoCaWZjQm9
z%6$HcbHvMl;X*Wo=+~_^2yV8T06eu<<|DE5zapSRB+LAD6!Td<&G31K>z&8=gM4L6
zp%LkzjWDU+O)O`NK`Yt{lkly33}S%sNygvZZxGyVdIuz;A4(ehQB1!V^d$ee!x4?)
z=BckSpNr2lgy%5*eHzaR&*|A5_*a<EMSO|6o!j*X#;?BGFkv%QZv#ID{^!Yk29KmZ
zC^{?2|GhIzI?Q<5c9!Hn!14?I7~qScf9V${S7G3Z&(6;n%+Z!PPt$mza?kTMe>W}A
zD(2I1s==Jg8SRYUn=ptO+}_VHe!=GqUhMoC<d4m=PP<ssGgHs$OU!37P24zSQGYMv
zmtAb|@0*i)DhHq6=D`1*`8<A!VX&L|jLJ89%U@^^a+7Z=<4>7y@DH*+=QI8l9w^&b
z4<|7GcE0QOUyN_kc&^NI>d1lL!F+B#$q<(^pKBREh2^PbUbiv6oawoF>LJE|wAB#q
zVtNWuNp^A!Gx!DLf6n+b+2Nhe__s8k6Q0w@Oz)4KNs%py|G}ssevaiw+$7*Lk20qJ
zfbIWXCYjIp-**~BA@f<o_&@V_U(WPrYJ4XDUZ!8f_IxyxY|TM`MGide>Ph4Ae&2Y!
zpZPy|qG8O_nEC<ZFXgMrXSv>=Gam`7D&zWBOn)7B)Hj%(q94+DS6nZn)jyg36ZZe#
z;|$t6l=Q!g9i+@hlNrB!lOdG(cb>)zSNFV%>DM+GT#W61BjX1*82n*eq!0KA!`{>O
z9Q>~WehlJzx%<bPQa3UG@9_Y=nVH?g_~^+75oLLv$ie@Y#2=cI>p{`;8_egPI>Sfi
z{l7E*jTVC!KV}ygeNH;Y;3>8+9pg3Lz~0{)rvC!-Y39sDjK4o_5VWqP<5b3f?rf93
zobhKf{v@`mZ6;fFGG1<?NC^7@jb|dyX%EvkTxXD6z4{8{5BSFWgE{#8km<+qJbowF
z@e1P~?=y(+G5*aQeC$z1pZ6>^gv+>NCNuse_5-5-dd6SZZRio=Mjy2GDA~E<3HBO;
zR?Uo;n@i=a&o+%`BF||6{EBIv^LoUCRq7ocUkCX*^eE09%)#dhuJ=#e;X;2W<M;A9
zP{t$eNlWrSu-Gs@kJn4TVEi-efQ6p6ASC+Z*nxeQ<@u2Dudg$Rsf-U78u|eHNxHtJ
z<0#<Ck4gMx85eb$-q3h|PhvjB>}Te4Chb<J^)?*wlB-Qj|7DhEIp<!)c!Yq^$Hm+)
z*E9YqUO!8}f1UBp%|Y}B9d~QIfxW-aFnxK0!OHyjE5`fx4H(YB=Ut{h<8y}3JzUA?
zBBRgJIR?SaQxV25*>3Q>tW>oc&xJjw6-=Lhg+a=7D8*2x{vF2*KEq`ajDP0@gP6s5
z+US+&U)*C5;-Bx)cqa0k_T|7I$bo-?`5(r9@@D4s0^?_}0yZ=LFN}ZtJcHorN6{wP
zV?A;RNc+NO!ylCc-;e{}r12s36wAqCsrDT7Tbcek?yt!_06w3C{>B{mdznvOt7#Dr
zV|6eG{m+^HLLQ)(vm*YGgP!)Xrv9D4^K6XiDGGTuy&b9XAvJ~-AoJQ1#(#H>Vel;T
zS;6?HPBn;?jHmF-RB!MSgLs(npJn{7_(DqL`JBcxk>|9R>BFq&BRKC);K~0S9FAxV
zvr`Y|;PVXA7r$=k+5M}Z<e-0p=&`QwUmv}lgPuZNQ+p+hl|1+>0p8T8#dLnEnEs=q
z4eA?~IUmRP02{ve$@PpM$@4pRr)t-Dhphu_W%`9z8B_`L-^KU}USRIx64&P7bEoiO
zdBh$bW&9mU!$AD=4}hnB8TWcbXYjC4&oX^&%FzD^>wY?Z!uWH!T}K+UdQIaw;W@nr
ze1#`1;P_(Ff9CUoF&ZCIm$1UVzy)X*N9wP`XBb2S<7YAcP{bgvVEjVHf1T%3nO{~h
zem(PfiyOR=@xNg|+`xP`X*?%9r|nFC|4xJaE$3au_&Dod>isJ4r04MK5lx9@r#`-+
z@tp9S9@Bh02nJtdJ`FsMgwLOH(EDRWwDzZxDa1rVAU6a&i+3O_S-d}*>Y>;?2}EQ<
zFoWJ`M|W=v0*bXEjs*f03=Ap>Xw+6cf8N3c*@^H~eWELhxa+a=5x6VWe?9^cAXr7T
zZJ?*;d=&Ad5Ui#vl}5@TL@AcT+I1&1HEoEVxPHy?O)E}W6OC5a*UYO|6sHEU{`%uR
zU~XbbpbefN7b%4G>FTC-#oA&~1T+Acm0SJ%np_#wv@G@4FQAw`Su+;q$XH0hezKKV
zI6r&F+!_P~%2r}-O^!O|*5;^VZf&kQ>gtVloDiz5Dwb^R>WZ4UUC`K;P6Rec_QhIt
zR5RA74ueE-7bvu%6MP^VJ#pRAlTV1Aux5EQ3fa>-6|oDJZ&<VR<Q2!K=O9c9LV`u3
zi18CWp^-T>E?=jjC#^nZ+0xa~Q%*ebv=f@5O-q*{h8YME8L>6mnjFyPl_o94TjQW2
z677gWUo<qFw0gy|<D+%eb=7knV~BFZJA_9<v2+@av6PN97O7*pA;twlSczG26e|ie
z#-Rv3q=<Jy5vwwyU?EB}0(r5tCXNP|>W@<pEkqun__!3COotH5kcFaWb;iM(qp3tu
z6S1TC8}Wn=VnUecil&nh7)u6r)5*uvux_G!q3)wlC<vB<P#PJv5I54Q*C;4#qLf7*
zRZKysK$=db^C*xWq=K~Ch7g#gUHZtB<*;NF(9VtOVM??i?iSjwHYXD%##Fj^h}hQV
zlNf;z>kuC+g@7V`t&p>)w+|r(DPk3ChU!dEED{POVj_S<<2qIj3mK1Z;xK#Zl_8W0
z!hfXWJUmU{6pD>!V(l1iI_QaGCZ3Qfh#Z2zEjhV4(ZX~$ISe=gqvq-lh34|A=ZJhR
zCe7^i(ABoe;!~g`Z<In$tF#=(I2|E$Y#*kP5OCcHJ+&#lK12mVs4YZvp*U^nSzMUn
z-{^Qp6wZ!uT}cyyMh^_Ky%Zw+a3HZ%zuUEx<x!_2Tw99{qZ3Ef5>hd}Zr5>%RU%-S
z8Nea~nTW9v9W%`xy0BqgT!#uWa#HLUEfs~{p>QzCzU;k9Ly?%KOT*4gyFdiF>GG9z
zqQ0<pv@+6sg|P@_BGWip(3)W@(eZnXj|CSRhT7jUM?D<c2cToeL4jnTPsbqgwv%Hs
zrHju@yk_a)YohVNk#+PSHlJxD!l~+*PU(EG#7Oo=I}zUoPASdPw8fo~i+GrA=oj)m
zCM1^2qZX4ff*0vvkx>e1C|$|s<i?uP@dhPV8z6iRg<tYU({UNksjl~!34<OR&^k71
z^$rW_Cx7@pL<((PxG=ie1l)70po!U;(P+YSr<G2xF`HmpZaEs$X{y3vOSc>Sr|ZY`
zYpTqw(bGwer-^Rq?cXGt@eKiPxK<tqYcy7<BPK?Z-SK!I(dZC@D9cd=(}nHXMDeN8
zaUMZPmE^EveF)bT=P-tzd^2CVX6kqV=>tNL-0>_EuB-+;{B(UfSfa}ls_TpQw_-UN
zr7@x7k|78bNNF0%5FO-HA&*}4UC%%ddb1^o_`d!@=2n+Qnf@lgSx}^49hVkCvYhaW
zt^{nD)MnT?l}CIx6Vr^e+|lEbra6DtXX*%H(%)@eNoT>}coA{#7#ccs7zLVg+48zE
zTcBNN8LBZ;w@2fSsioVkRw2c4#y)D8(OXl#*+R$0a_gn$Fw?Yv#GZ=GT#VrmM|dwS
zi#x?>pQg~u{k{F^Ok0R39ffwoa@y08kQZx#bR1XC%Pv?T1>MYOiKAIDM>IF);{+5c
z$lwlrqMx;op;^XXDoX%CqD8#73{GS+@S#|=@xlwMHoZ>DN=cn#)M?_1azn|Cq!)Z7
zX8GeZHCrEP{bXCvIi*NIip%N=&PGw$d?IqZH4{iTJue!?Lf5$t@x*<0$!G|1v^2d&
z{&y>6j1TEEkDci;?KLwG8G5O0pwF$1&DI?i=uvY4VjM5%v)H7kGLCMJ<%SOPM_V4E
zOr2Tjf|-m~AhA;QJ6gnr9YmB>gbVM+P|FaD%!{TTu1~Iv(gPlQgfTugo!pr~oR*P*
z_)D+N>0^(wJe(8S@p9ZkoS>BS@@NS~>5sP1`p?apgUa9oGl)IGX+6Z~dR3h~zX!1%
zamI>7pF0IYtjIXRs#hlvy0d!evK3V+JghJ&Y-CeSkvZF{5c0a3Qk^kO5!G$yC&15~
z5j|1oY{oSPg^u(lqR8uycgLszLw(&TRjt)u4Y0bS7fHkf$1mMK0TE_{R7_RJJNb@D
zCt}PyDO1$kIx!^#MhysC-wK&@aZF398n<oq-ID6Ic*{TsC}W8Ztdu#aqcTcXOAAGs
z=M)7)6&P9)5M3Z3bIQCw^*_Z-L;<?hDff5feQ7yQ@-gJwq7HT%AbLb3<>kF<IY%5y
z1IYbNLDvHCmY4Uk<UCyn`Rp&h9QgM?`J`9srwwE2kTXnAA0zlI_v<*R)&RhxCrAG0
zxxAe99*W4P+ppAiDIW$*2i@MZ@h|nud*5=tK{ALKhhKm6c<&RB{PKRdoG+w8I7EJ7
zC+AJb``A-n-XoXuZe7~QPh~`YM&nyidJof;U*0#DbA$z!`lY<Ie;1cu#th}XbU6=m
zhf#UD7w#|rMc}9kzP;_dzkVS<Y?**O@4@UG)MbFY<!|D0a$ZZF<2i7fRv&V{8Yypi
zc^_WRA}<$n<QM#xedXo3shsb<NJ^Ts)GOFqeC6f+dO729e%j%0{~cUj>_OhUkn@X?
zwBo8EJ*n<Nn%Xbr<$ZlQkMmXNORC3w<>kG8Ie*}5u;@+1cm@C-5$Q2RIC&2?_%fwj
zdod|DEqv%bDm>zF$``*J(WJ$nvi$3yCfgG;k?#q@TwW9-_9^vBv#v*3(w|fQ)d*79
zRPR-NQg2eG=aBMpehYct^3MB4!=_NgoSa`NCx}A;2q*Q+^S8(_Q~2tSFj`LU0U(kZ
zLGW41SNib6PR<`AUtllkieZzw$%_Q+r=`y|{WvP}$6@-#wv_z5&!jSGOLLT;$=`uK
zEgi=U{B_O9QT`A=0KCAbaDVwZT;8w0ZvL{fnvBzPNO?IQ%jG>J8oc2sLlKh)SWF=0
zgx-7yASw6!ua$<f*f(JP<tZ&=-z|-IOinSBH<cM|1x~B*FX|BgPyR`4&rq7*16`*4
a$T41QX8%!kw(<{dHRaDRCx?kIss10uN5`!I

diff --git a/src/dsaX_wrangle.c b/src/dsaX_wrangle.c
deleted file mode 100644
index 19507d4..0000000
--- a/src/dsaX_wrangle.c
+++ /dev/null
@@ -1,378 +0,0 @@
-/* will reorder raw data for input to xgpu */
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-#include "xgpu.h"
-
-#define N_INTS 128
-
-// global variables
-int DEBUG = 0;
-const int n_all = 3194880;
-
-// to extract autocorrelation data
-void auto_extract(float *output, float *specs);
-
-void auto_extract(float *output, float *specs) {
-
-  int bctr = 0, idx, oidx = 0;
-  for (int a1=0;a1<63;a1++) {
-    for (int a2=0;a2<=a1;a2++) {
-
-      if (a1==a2) {
-	for (int f=0;f<384;f++) {
-	  for (int pol=0;pol<2;pol++) {
-	    idx = 2*((bctr*384+f)*2+pol);
-	    specs[oidx] += output[idx];
-	  }
-	  oidx++;
-	}
-      }
-      bctr++;
-
-    }
-  }
-
-
-}
-
-// for extracting data
-// assumes TRIANGULAR_ORDER for mat (f, baseline, pol, ri)
-void simple_extract(Complex *mat, float *output);
-
-void simple_extract(Complex *mat, float *output) {
-
-  int in_idx, out_idx;
-  for (int bctr=0;bctr<2080;bctr++) {
-    for (int pol1=0;pol1<2;pol1++) {
-
-      for (int f=0;f<384;f++) {
-
-	out_idx = 2*((bctr*384+f)*2+pol1);
-	in_idx = (2*f*2080+bctr)*4+pol1*3;
-	output[out_idx] = 0.5*(mat[in_idx].real + mat[in_idx+8320].real);
-	output[out_idx+1] = 0.5*(mat[in_idx].imag + mat[in_idx+8320].imag);
-
-      }
-    }
-  }
-
-}
-
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
-{
-
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
-  dada_hdu_destroy (in);
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock write on hdu_out");
-    }
-  dada_hdu_destroy (out);
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_fake [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -i in_key [default TEST_BLOCK_KEY]\n"
-	   " -o out_key [default REORDER_BLOCK_KEY2]\n"
-	   " -h print usage\n");
-}
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_wrangle", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-
-  // data block HDU keys
-  key_t in_key = TEST_BLOCK_KEY;
-  key_t out_key = REORDER_BLOCK_KEY2;
-  
-  // command line arguments
-  int core = -1;
-  int arg = 0;
-  int output_specs = 0;
-  
-  while ((arg=getopt(argc,argv,"c:i:o:sdh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 's':
-	  output_specs=1;
-	  syslog (LOG_INFO, "Will output spectra files");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
-  uint64_t  bytes_read = 0;
-  char * block;
-  uint64_t written, block_id;
-  Complex * cblock;
-  float *data = (float *)malloc(sizeof(float)*n_all);
-
-  // spectra outputs
-  FILE *fout, *fmjd;
-  char fnam[100];
-  float *specs = (float *)malloc(sizeof(float)*63*384);
-  float mjd;
-  int ctr = 0;
-  
-  // set up
-
-  int observation_complete=0;
-  int blocks = 0, started = 0;
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-    cblock = (Complex *)(block);
-    
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-
-      if (!(fmjd = fopen("/home/ubuntu/tmp/mjd.dat","r"))) {
-	syslog(LOG_ERR,"could not open fmjd");
-      }
-      fscanf(fmjd,"%f",&mjd);
-      fclose(fmjd);
-      sprintf(fnam,"/home/ubuntu/data/specs_%f.dat",mjd);
-      
-    }
-
-    // DO STUFF - from block to summed_vis
-
-    if (DEBUG) syslog(LOG_DEBUG,"extracting...");
-    simple_extract((Complex *)(block), data);
-    if (DEBUG) syslog(LOG_DEBUG,"extracted!");
-
-    // write to file if needed
-    if (output_specs==1) {
-
-      if (ctr==0) 
-	for (int i=0;i<63*384;i++) specs[i] = 0.;
-
-      auto_extract(data, specs);
-      ctr += 1;
-
-      if (ctr==N_INTS) {
-	fout = fopen(fnam,"a");
-	for (int i=0;i<63*384;i++) 
-	  fprintf(fout, "%f\n", specs[i]);
-	fclose(fout);
-	ctr=0;
-      }
-	
-    }
-    
-    
-    // write to output
-    written = ipcio_write (hdu_out->data_block, (char *)data, block_out);
-    if (written < block_out)
-      {
-	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
-	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-	return EXIT_FAILURE;
-      }
-
-    if (DEBUG) {
-      syslog(LOG_DEBUG, "written block %d",blocks);
-      for (int i=0;i<10;i++) {
-	syslog(LOG_INFO, "%g", data[i]);
-	printf("%g ", data[i]);
-	printf("\n");
-      }
-    }
-    blocks++;
-    
-
-    if (bytes_read < block_size)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  free(data);
-  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-  
-}
-
-
diff --git a/src/dsaX_wrangleAndWrite.c b/src/dsaX_wrangleAndWrite.c
deleted file mode 100644
index 6cd4a33..0000000
--- a/src/dsaX_wrangleAndWrite.c
+++ /dev/null
@@ -1,365 +0,0 @@
-/* will reorder raw data for input to xgpu */
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-#include "xgpu.h"
-
-// global variables
-int DEBUG = 0;
-const int n_all = 3194880;
-const int nbl = 2080;
-
-// for lookup table generation
-// index is position to extract from xgpu array to output (Greg-style) array
-void gen_lookup(int * idx_xgpu_in_greg);
-void gen_lookup(int * idx_xgpu_in_greg) {
-
-  // get antenna order in xgpu
-  int xgpu_ant_1[nbl], xgpu_ant_2[nbl], ct=0;
-  for (int i=0;i<64;i++) {
-    for (int j=0;j<=i;j++) {
-      xgpu_ant_1[ct] = j;
-      xgpu_ant_2[ct] = i;
-      ct++;
-    }
-  }
-
-  // get antenna order in Greg
-  int gh_ant_1[nbl], gh_ant_2[nbl];
-  ct=0;
-  for (int i=0;i<64;i++) {
-    for (int j=i;j<64;j++) {
-      gh_ant_1[ct] = i;
-      gh_ant_2[ct] = j;
-      ct++;
-    }
-  }
-
-  // match antenna orders
-  for (int i=0;i<nbl;i++) {
-
-    for (int j=0;j<nbl;j++) {
-      if (gh_ant_1[i]==xgpu_ant_1[j] && gh_ant_2[i]==xgpu_ant_2[j])
-	idx_xgpu_in_greg[i] = j;
-    }
-
-  }
-
-}
-
-
-// for reordering correlations
-void reorder_gh(float *input, float *output);
-void reorder_gh(float *input, float *output, int * idx_xgpu_in_greg) {
-
-  for (int i=0;i<nbl;i++) {
-    for (int j=0;j<384*2*2;j++) {
-
-      output[i*1536+j] = input[idx_xgpu_in_greg[i]*1536+j];
-
-    }
-  }
-    
-}
-
-// for extracting data
-// assumes TRIANGULAR_ORDER for mat (f, baseline, pol, ri)
-void simple_extract(Complex *mat, float *output);
-
-void simple_extract(Complex *mat, float *output) {
-
-  int in_idx, out_idx;
-  for (int bctr=0;bctr<2080;bctr++) {
-    for (int pol1=0;pol1<2;pol1++) {
-
-      for (int f=0;f<384;f++) {
-
-	out_idx = 2*((bctr*384+f)*2+pol1);
-	in_idx = (2*f*2080+bctr)*4+pol1*3;
-	output[out_idx] = 0.5*(mat[in_idx].real + mat[in_idx+8320].real);
-	output[out_idx+1] = 0.5*(mat[in_idx].imag + mat[in_idx+8320].imag);
-
-      }
-    }
-  }
-
-}
-
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
-{
-
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
-  dada_hdu_destroy (in);
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock write on hdu_out");
-    }
-  dada_hdu_destroy (out);
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_fake [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -i in_key [default TEST_BLOCK_KEY]\n"
-	   " -o out_key [default REORDER_BLOCK_KEY2]\n"
-	   " -h print usage\n");
-}
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_wrangle", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-
-  // data block HDU keys
-  key_t in_key = TEST_BLOCK_KEY;
-  key_t out_key = REORDER_BLOCK_KEY2;
-  
-  // command line arguments
-  int core = -1;
-  int arg = 0;
-  
-  while ((arg=getopt(argc,argv,"c:i:o:dh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create ();
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create ();
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %llu %llu\n",block_size,block_out);
-  uint64_t  bytes_read = 0;
-  char * block;
-  uint64_t written, block_id;
-  Complex * cblock;
-  float *data = (float *)malloc(sizeof(float)*n_all);
-  
-  
-  // set up
-
-  int observation_complete=0;
-  int blocks = 0, started = 0;
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-    cblock = (Complex *)(block);
-    
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-    }
-
-    // DO STUFF - from block to summed_vis
-
-    if (DEBUG) syslog(LOG_DEBUG,"extracting...");
-    simple_extract((Complex *)(block), data);
-    if (DEBUG) syslog(LOG_DEBUG,"extracted!");    
-
-    // write to output
-    written = ipcio_write (hdu_out->data_block, (char *)data, block_out);
-    if (written < block_out)
-      {
-	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
-	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-	return EXIT_FAILURE;
-      }
-
-    if (DEBUG) {
-      syslog(LOG_DEBUG, "written block %d",blocks);
-      for (int i=0;i<10;i++) {
-	syslog(LOG_INFO, "%g", data[i]);
-	printf("%g ", data[i]);
-	printf("\n");
-      }
-    }
-    blocks++;
-    
-
-    if (bytes_read < block_size)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  free(data);
-  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-  
-}
-
-
diff --git a/src/dsaX_writeFil.c b/src/dsaX_writeFil.c
deleted file mode 100644
index 751db9d..0000000
--- a/src/dsaX_writeFil.c
+++ /dev/null
@@ -1,486 +0,0 @@
-/* This works pretty much like the trigger code. receives a control UDP message 
-to store some data for a fixed amount of time.
-Message format: length(s)-NAME
-Will ignore messages until data recording is over
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <netinet/in.h>
-#include <time.h>
-#include <arpa/inet.h>
-#include <sys/syscall.h>
-#include <syslog.h>
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "multilog.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-#include <src/sigproc.h>
-#include <src/header.h>
-
-
-FILE *output;
-
-void send_string(char *string) /* includefile */
-{
-  int len;
-  len=strlen(string);
-  fwrite(&len, sizeof(int), 1, output);
-  fwrite(string, sizeof(char), len, output);
-}
-
-void send_float(char *name,float floating_point) /* includefile */
-{
-  send_string(name);
-  fwrite(&floating_point,sizeof(float),1,output);
-}
-
-void send_double (char *name, double double_precision) /* includefile */
-{
-  send_string(name);
-  fwrite(&double_precision,sizeof(double),1,output);
-}
-
-void send_int(char *name, int integer) /* includefile */
-{
-  send_string(name);
-  fwrite(&integer,sizeof(int),1,output);
-}
-
-void send_char(char *name, char integer) /* includefile */
-{
-  send_string(name);
-  fwrite(&integer,sizeof(char),1,output);
-}
-
-
-void send_long(char *name, long integer) /* includefile */
-{
-  send_string(name);
-  fwrite(&integer,sizeof(long),1,output);
-}
-
-void send_coords(double raj, double dej, double az, double za) /*includefile*/
-{
-  if ((raj != 0.0) || (raj != -1.0)) send_double("src_raj",raj);
-  if ((dej != 0.0) || (dej != -1.0)) send_double("src_dej",dej);
-  if ((az != 0.0)  || (az != -1.0))  send_double("az_start",az);
-  if ((za != 0.0)  || (za != -1.0))  send_double("za_start",za);
-}
-
-
-/* global variables */
-int quit_threads = 0;
-int dump_pending = 0;
-int trignum = 0;
-int dumpnum = 0;
-char iP[100];
-char srcnam[1024];
-float reclen;
-int DEBUG = 0;
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in);
-void convert_block(char * b1, char * b2);
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_image [options]\n"
-	   " -c core   bind process to CPU core\n"
-	   " -b write one beam\n"
-	   " -f filename base [default test.fil]\n"
-	   " -k in_key [BF_BLOCK_KEY]\n"
-	   " -i IP to listen to [no default]\n"
-	   " -s integrate N ints MUST BE FACTOR OF 16384 [default 1]\n"
-	   " -m get mjd from file\n"
-	   " -d DEBUG\n"
-	   " -h        print usage\n");
-}
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in) {
-
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
-  dada_hdu_destroy (in);
-
-}
-
-// Thread to control the dumping of data
-
-void control_thread (void * arg) {
-
-  udpdb_t * ctx = (udpdb_t *) arg;
-  syslog(LOG_INFO, "control_thread: starting");
-
-  // port on which to listen for control commands
-  int port = WRITEVIS_CONTROL_PORT;
-  char sport[10];
-  sprintf(sport,"%d",port);
-  
-  // buffer for incoming command strings, and setup of socket
-  int bufsize = 1024;
-  char* buffer = (char *) malloc (sizeof(char) * bufsize);
-  memset(buffer, '\0', bufsize);
-  const char* whitespace = " ";
-  char * command = 0;
-  char * args = 0;
-
-  struct addrinfo hints;
-  struct addrinfo* res=0;
-  memset(&hints,0,sizeof(hints));
-  struct sockaddr_storage src_addr;
-  socklen_t src_addr_len=sizeof(src_addr);
-  hints.ai_family=AF_INET;
-  hints.ai_socktype=SOCK_DGRAM;
-  getaddrinfo(iP,sport,&hints,&res);
-  int fd;
-  ssize_t ct;
-  char tmpstr;
-  char cmpstr = 'p';
-  char *endptr;
-  float tmp_reclen;
-  
-  syslog(LOG_INFO, "control_thread: created socket on port %d", port);
-  
-  while (!quit_threads) {
-    
-    fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol);
-    bind(fd,res->ai_addr,res->ai_addrlen);
-    memset(buffer,'\0',sizeof(buffer));
-    syslog(LOG_INFO, "control_thread: waiting for packet");
-    ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len);
-    
-    syslog(LOG_INFO, "control_thread: received buffer string %s",buffer);
-    trignum++;
-
-    // interpret buffer string
-    char * rest = buffer;
-    tmp_reclen = (float)(strtof(strtok(rest, "-"),&endptr));
-    char * tmp_srcnam = strtok(NULL, "-");
-    
-    if (!dump_pending) {
-      reclen = tmp_reclen;
-      strcpy(srcnam,tmp_srcnam);
-      syslog(LOG_INFO, "control_thread: received command to dump %f s for SRC %s",reclen,srcnam);
-    }
-	
-    if (dump_pending)
-      syslog(LOG_ERR, "control_thread: BACKED UP - CANNOT dump %f s for SRC %s",tmp_reclen,tmp_srcnam);
-  
-    if (!dump_pending) dump_pending = 1;
-    
-    close(fd);
-    
-  }
-
-  free (buffer);
-
-  if (ctx->verbose)
-    syslog(LOG_INFO, "control_thread: exiting");
-
-  /* return 0 */
-  int thread_result = 0;
-  pthread_exit((void *) &thread_result);
-
-}
-
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_writeFil", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA defs */
-  dada_hdu_t* hdu_in = 0;
-  multilog_t* log = 0;
-  key_t in_key = BF_BLOCK_KEY;
-
-  /* actual struct with info */
-  udpdb_t udpdb;
-  
-  // command line
-  int arg = 0;
-  int core = -1;
-  float fch1 = 1530.0;
-  char fnam[300], foutnam[400];
-  sprintf(fnam,"/home/dsa/alltest");
-
-  // for getting MJD
-  FILE *fmjd;
-  int get_mjd = 0;
-  int sumi=1;
-  int onebeam=0;
-  
-  while ((arg=getopt(argc,argv,"c:f:o:i:k:s:bmdh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      printf ("ERROR: -c flag requires argument\n");
-	      return EXIT_FAILURE;
-	    }
-	case 'k':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-k flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'f':
-	  strcpy(fnam,optarg);
-	  break;
-	case 'i':
-	  strcpy(iP,optarg);
-	  break;
-	case 'd':
-	  DEBUG=1;
-	  break;
-	case 'b':
-	  onebeam=1;
-	  break;
-	case 'm':
-	  get_mjd=1;
-	  break;
-	case 's':
-	  sumi = atoi(optarg);
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // DADA stuff
-  
-  udpdb.verbose = 1;
-
-  syslog (LOG_INFO, "dsaX_writefil: creating hdu");
-
-  hdu_in  = dada_hdu_create ();
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"dsaX_writefil: could not connect to dada buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"dsaX_writespec: could not lock to dada buffer");
-    return EXIT_FAILURE;
-  }
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      syslog(LOG_INFO,"binding to core %d", core);
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"dsaX_writefil: failed to bind to core %d", core);
-    }
-
-  int observation_complete=0;
-
-  // more DADA stuff - deal with headers
-  
-  uint64_t header_size = 0;
-
-  // read the headers from the input HDUs and mark as cleared
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "main: could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in);
-      return EXIT_FAILURE;
-    }
-
-
-  // start control thread
-  int rval = 0;
-  pthread_t control_thread_id;
-  syslog(LOG_INFO, "starting control_thread()");
-  rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb);
-  if (rval != 0) {
-    syslog(LOG_INFO, "Error creating control_thread: %s", strerror(rval));
-    return -1;
-  }
-
-  // set up
-  int fctr = 0, integration = 0;
-  char tstamp[100];
-  double mjd=55000.;
-  int rownum = 1;
-  int dfwrite = 0;
-  float mytsamp = 4.*8.*8.192e-6;
-  int NINTS, midx;
-  
-  // data stuff
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t bytes_read = 0, block_id;
-  char *block;
-  float *hoblock = (float *)malloc(sizeof(float)*64*1024*16384/sumi);  
-  
-  // start things
-
-  syslog(LOG_INFO, "dsaX_writespec: starting observation");
-  int nblocks = 0;
-  
-  while (!observation_complete) {
-
-    // read block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-    if (DEBUG) for (int i=0;i<48;i++) syslog(LOG_INFO,"%hu",((unsigned char *)(block))[i]);
-
-    for (int i=0;i<64*1024*16384/sumi;i++) hoblock[i] = 0.;
-    
-    // for writing sum
-    /*    for (int i=0;i<256*48;i++) oblock[i] = 0.;
-    for (int i=0;i<128;i++) {
-      for (int j=0;j<256*48;j++) oblock[j] += (float)(block[i*256*48+j]);
-      }*/
-    
-    syslog(LOG_INFO,"read block %d",nblocks);
-        
-    // check for dump_pending
-    if (dump_pending) {
-
-      // if file writing hasn't started
-      if (dfwrite==0) {
-
-	syslog(LOG_INFO, "beginning file write for SRC %s for %f s",srcnam,reclen);
-	
-	NINTS = (int)(floor(reclen/(mytsamp*16384.)));
-	//NINTS = (int)(floor(reclen/(0.134217728)));
-	sprintf(foutnam,"%s_%s_%d_%d.fil",fnam,srcnam,fctr,nblocks);
-	syslog(LOG_INFO, "main: opening new file %s",foutnam);
-
-	if (!(output = fopen(foutnam,"wb"))) {
-	  printf("Couldn't open output file\n");
-	  return 0;	  
-	}
-
-	if (get_mjd==1) {
-	  if (!(fmjd = fopen("/home/ubuntu/tmp/mjd.dat","r"))) {
-	    syslog(LOG_ERR,"could not open fmjd");
-	  }
-	  fscanf(fmjd,"%lf",&mjd);
-	  mjd += nblocks*4.294967296/86400.;
-	  fclose(fmjd);
-	}
-	  
-
-	send_string("HEADER_START");
-	send_string("source_name");
-	send_string(srcnam);
-	send_int("machine_id",1);
-	send_int("telescope_id",82);
-	send_int("data_type",1); // filterbank data
-	send_double("fch1",1530.0); // THIS IS CHANNEL 0 :)
-	send_double("foff",-0.244140625);
-	send_int("nchans",1024);
-	if (sumi==1) send_int("nbits",8);
-	else send_int("nbits",32);	
-	send_double("tstart",mjd);
-	send_double("tsamp",8.192e-6*8.*4.*sumi);
-	send_int("nifs",1);
-	send_string("HEADER_END");
-	
-	syslog(LOG_INFO, "main: opened new file %s",foutnam);
-		
-	dfwrite=1;
-
-	
-      }      
-      
-      // write data to file
-      syslog(LOG_INFO,"writing");
-
-      
-      for (int i=0;i<64;i++) {
-	for (int j=0;j<16384/sumi;j++) {
-	  for (int k=0;k<sumi;k++) {
-	    for (int l=0;l<1024;l++) {
-	      hoblock[i*16384*1024/sumi + j*1024 + l] += 1.*((unsigned char *)(block))[i*16384*1024 + (j*sumi+k)*1024 + l];
-	    }
-	  }
-	}
-      }
-	      
-      
-      if (sumi==1) fwrite((unsigned char *)(block),sizeof(unsigned char),block_size,output);
-      else {
-	if (onebeam==1) fwrite(hoblock + block_size/sumi/2,sizeof(float),block_size/sumi/64,output);
-	else fwrite(hoblock,sizeof(float),block_size/sumi,output);
-      }
-      //fwrite(oblock,sizeof(float),256*48,output);
-
-      integration++;
-      // check if file writing is done
-      if (integration==NINTS) {
-	fclose(output);
-	integration=0;
-	syslog(LOG_INFO, "dsaX_writespec: completed file %d",fctr);
-	fctr++;
-	dfwrite=0;
-	dump_pending=0;
-      }
-
-      syslog(LOG_INFO,"written");
-      
-    }
-            
-    // close off loop
-    if (bytes_read < block_size)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-    nblocks += 1;
-    
-  }
-
-  // close control thread
-  syslog(LOG_INFO, "joining control_thread");
-  quit_threads = 1;
-  void* result=0;
-  pthread_join (control_thread_id, &result);
-
-  free(hoblock);
-  dsaX_dbgpu_cleanup(hdu_in);
- 
-}
diff --git a/src/dsaX_writevis.c b/src/dsaX_writevis.c
deleted file mode 100644
index 02cebb7..0000000
--- a/src/dsaX_writevis.c
+++ /dev/null
@@ -1,428 +0,0 @@
-/* This works pretty much like the trigger code. receives a control UDP message 
-to store some data for a fixed amount of time.
-Message format: length(s)-NAME
-Will ignore messages until data recording is over
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <netinet/in.h>
-#include <time.h>
-#include <arpa/inet.h>
-#include <sys/syscall.h>
-#include <syslog.h>
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "multilog.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-#include "fitsio.h"
-#include "xgpu.h"
-
-/* global variables */
-int quit_threads = 0;
-int dump_pending = 0;
-int trignum = 0;
-int dumpnum = 0;
-char iP[100];
-char srcnam[1024];
-float reclen;
-int DEBUG = 0;
-
-// assumes that only first 78 baselines are written and 384 channels and 2 pols
-const int n = 9216;
-float summed_vis[9216];
-const int n_all = 3194880;
-
-// for extracting data
-// assumes TRIANGULAR_ORDER for mat (f, baseline, pol, ri)
-void simple_extract(Complex *mat, float *output);
-
-void simple_extract(Complex *mat, float *output) {
-
-  int in_idx, out_idx;
-  for (int bctr=0;bctr<2080;bctr++) {
-    for (int pol1=0;pol1<2;pol1++) {
-
-      for (int f=0;f<384;f++) {
-
-	out_idx = 2*((bctr*384+f)*2+pol1);
-	in_idx = (2*f*2080+bctr)*4+pol1*3;
-	output[out_idx] = 0.5*(mat[in_idx].real + mat[in_idx+8320].real);
-	output[out_idx+1] = 0.5*(mat[in_idx].imag + mat[in_idx+8320].imag);
-
-      }
-    }
-  }
-
-}
-
-
-
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in);
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_image [options]\n"
-	   " -c core   bind process to CPU core\n"
-	   " -d debug [default no]\n"
-	   " -k in_key [default XGPU_BLOCK_KEY]\n"
-	   " -f filename base [default test.fits]\n"
-	   " -o freq of chan 1 [default 1494.84375]\n"
-	   " -i IP to listen to [no default]\n"
-	   " -h        print usage\n");
-}
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in) {
-
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
-  dada_hdu_destroy (in);
-
-}
-
-// Thread to control the dumping of data
-
-void control_thread (void * arg) {
-
-  udpdb_t * ctx = (udpdb_t *) arg;
-  syslog(LOG_INFO, "control_thread: starting");
-
-  // port on which to listen for control commands
-  int port = WRITEVIS_CONTROL_PORT;
-  char sport[10];
-  sprintf(sport,"%d",port);
-  
-  // buffer for incoming command strings, and setup of socket
-  int bufsize = 1024;
-  char* buffer = (char *) malloc (sizeof(char) * bufsize);
-  memset(buffer, '\0', bufsize);
-  const char* whitespace = " ";
-  char * command = 0;
-  char * args = 0;
-
-  struct addrinfo hints;
-  struct addrinfo* res=0;
-  memset(&hints,0,sizeof(hints));
-  struct sockaddr_storage src_addr;
-  socklen_t src_addr_len=sizeof(src_addr);
-  hints.ai_family=AF_INET;
-  hints.ai_socktype=SOCK_DGRAM;
-  getaddrinfo(iP,sport,&hints,&res);
-  int fd;
-  ssize_t ct;
-  char tmpstr;
-  char cmpstr = 'p';
-  char *endptr;
-  float tmp_reclen;
-  
-  syslog(LOG_INFO, "control_thread: created socket on port %d", port);
-  
-  while (!quit_threads) {
-    
-    fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol);
-    bind(fd,res->ai_addr,res->ai_addrlen);
-    memset(buffer,'\0',sizeof(buffer));
-    syslog(LOG_INFO, "control_thread: waiting for packet");
-    ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len);
-    
-    syslog(LOG_INFO, "control_thread: received buffer string %s",buffer);
-    trignum++;
-
-    // interpret buffer string
-    char * rest = buffer;
-    tmp_reclen = (float)(strtof(strtok(rest, "-"),&endptr));
-    char * tmp_srcnam = strtok(NULL, "-");
-    
-    if (!dump_pending) {
-      reclen = tmp_reclen;
-      strcpy(srcnam,tmp_srcnam);
-      syslog(LOG_INFO, "control_thread: received command to dump %f s for SRC %s",reclen,srcnam);
-    }
-	
-    if (dump_pending)
-      syslog(LOG_ERR, "control_thread: BACKED UP - CANNOT dump %f s for SRC %s",tmp_reclen,tmp_srcnam);
-  
-    if (!dump_pending) dump_pending = 1;
-    
-    close(fd);
-    
-  }
-
-  free (buffer);
-
-  if (ctx->verbose)
-    syslog(LOG_INFO, "control_thread: exiting");
-
-  /* return 0 */
-  int thread_result = 0;
-  pthread_exit((void *) &thread_result);
-
-}
-
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_writevis", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA defs */
-  dada_hdu_t* hdu_in = 0;
-  multilog_t* log = 0;
-  key_t in_key = XGPU_BLOCK_KEY;
-
-  /* actual struct with info */
-  udpdb_t udpdb;
-  
-  // command line
-  int arg = 0;
-  int core = -1;
-  float fch1 = 1500.0;
-  int nchans = 384;
-  char fnam[300], foutnam[400];
-  sprintf(fnam,"/home/ubuntu/alltest");
-  
-  while ((arg=getopt(argc,argv,"c:f:o:i:k:dh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      printf ("ERROR: -c flag requires argument\n");
-	      return EXIT_FAILURE;
-	    }
-	case 'k':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-k flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'f':
-	  strcpy(fnam,optarg);
-	  break;
-	case 'd':
-	  DEBUG=1;
-	  break;
-	case 'o':
-	  fch1 = atof(optarg);
-	  break;
-	case 'i':
-	  strcpy(iP,optarg);
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // DADA stuff
-  
-  udpdb.verbose = 1;
-
-  syslog (LOG_INFO, "dsaX_writevis: creating hdu");
-
-  hdu_in  = dada_hdu_create ();
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"dsaX_writevis: could not connect to dada buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"dsaX_writevis: could not lock to dada buffer");
-    return EXIT_FAILURE;
-  }
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      syslog(LOG_INFO,"binding to core %d", core);
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"dsaX_writevis: failed to bind to core %d", core);
-    }
-
-  int observation_complete=0;
-
-  // more DADA stuff - deal with headers
-  
-  uint64_t header_size = 0;
-
-  // read the headers from the input HDUs and mark as cleared
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "main: could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in);
-      return EXIT_FAILURE;
-    }
-
-
-  // start control thread
-  int rval = 0;
-  pthread_t control_thread_id;
-  syslog(LOG_INFO, "starting control_thread()");
-  rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb);
-  if (rval != 0) {
-    syslog(LOG_INFO, "Error creating control_thread: %s", strerror(rval));
-    return -1;
-  }
-
-  // set up
-  int fctr = 0, integration = 0;
-  fitsfile *fptr;
-  int rownum = 1;
-  int fwrite = 0;
-  int status=0;
-  float mytsamp = 4096*4*8.192e-6;
-  int NINTS;
-  
-  // data stuff
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t bytes_read = 0, block_id;
-  char *block;
-  float *data = (float *)malloc(sizeof(float)*n_all);
-  int si1, si2;
-  int nblocks = 0;
-  Complex * cblock; 
-  
-  // start things
-
-  syslog(LOG_INFO, "dsaX_writevis: starting observation");
-
-  while (!observation_complete) {
-
-    // read block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-    cblock = (Complex *)(block);
-
-    if (DEBUG) {
-      if (nblocks==20) {
-	for (int i=100;i<200;i++) {
-	  syslog(LOG_DEBUG,"MAT %d %f %f",i,(float)(cblock[i].real),(float)(cblock[i].imag));
-	}
-      }
-    }
-    
-    // DO STUFF - from block to summed_vis
-
-    if (DEBUG) syslog(LOG_DEBUG,"extracting...");
-    simple_extract((Complex *)(block), data);
-    for (int i=0;i<n;i++) summed_vis[i] = data[i];
-    if (DEBUG) syslog(LOG_DEBUG,"extracted!");
-    
-    // check for dump_pending
-    if (dump_pending) {
-
-      // if file writing hasn't started
-      if (fwrite==0) {
-
-	syslog(LOG_INFO, "dsaX_writevis: beginning file write for SRC %s for %f s",srcnam,reclen);
-	status=0;
-	
-	NINTS = (int)(floor(reclen/mytsamp));
-	sprintf(foutnam,"%s_%s_%d.fits",fnam,srcnam,fctr);
-	syslog(LOG_INFO, "main: opening new file %s",foutnam);
-	rownum=1;
-	
-	char *ttype[] = {"VIS"};
-	char *tform[] = {"9216E"}; // assumes classic npts
-	char *tunit[] = {"\0"};
-	char *wsrcnam = srcnam;
-	
-	char extname[] = "DATA";
-	fits_create_file(&fptr, foutnam, &status);
-	if (status) syslog(LOG_ERR, "create_file FITS error %d",status);
-	fits_create_tbl(fptr, BINARY_TBL, 0, 1, ttype, tform, tunit, extname, &status);
-	fits_write_key(fptr, TFLOAT, "TSAMP", &mytsamp, "Sample time (s)", &status);
-	fits_write_key(fptr, TFLOAT, "FCH1", &fch1, "Frequency (MHz)", &status);
-	fits_write_key(fptr, TINT, "NCHAN", &nchans, "Channels", &status);
-	fits_write_key(fptr, TSTRING, "Source", &wsrcnam[0], "Source", &status);	  
-	fits_write_key(fptr, TINT, "NBLOCKS", &nblocks, "Ints", &status);
-	if (status) syslog(LOG_ERR, "fits_write FITS error %d",status);
-	fits_close_file(fptr, &status);
-
-	fwrite=1;
-	
-      }
-
-      // write data to file
-      fits_open_table(&fptr, foutnam, READWRITE, &status);
-      fits_write_col(fptr, TFLOAT, 1, rownum, 1, n, summed_vis, &status);
-      rownum += 1;
-      fits_update_key(fptr, TINT, "NAXIS2", &rownum, "", &status);
-      fits_close_file(fptr, &status);
-      integration++;
-      if (status) syslog(LOG_ERR, "fits_write FITS error %d",status);	
-      // check if file writing is done
-      if (integration==NINTS) {
-	integration=0;
-	syslog(LOG_INFO, "dsaX_writevis: completed file %d",fctr);
-	fctr++;
-	fwrite=0;
-	dump_pending=0;
-      }
-
-      syslog(LOG_INFO,"written");
-      
-    }
-            
-    // close off loop
-    if (bytes_read < block_size)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-    nblocks++;
-
-    if (DEBUG) syslog(LOG_DEBUG,"Finished block %d",nblocks);
-    
-  }
-
-  // close control thread
-  syslog(LOG_INFO, "joining control_thread");
-  quit_threads = 1;
-  void* result=0;
-  pthread_join (control_thread_id, &result);
-
-  free(data);
-  dsaX_dbgpu_cleanup(hdu_in);
- 
-}
diff --git a/src/dsaX_xgpu.cu b/src/dsaX_xgpu.cu
deleted file mode 100644
index d065848..0000000
--- a/src/dsaX_xgpu.cu
+++ /dev/null
@@ -1,375 +0,0 @@
-// -*- c++ -*-
-/* will run xgpu */
-/* assumes input block size is appropriate */
-#define THRUST_IGNORE_CUB_VERSION_CHECK
-
-#include <iostream>
-#include <algorithm>
-using std::cout;
-using std::cerr;
-using std::endl;
-#include <stdio.h>
-#include <stdlib.h>
-#include <cmath>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <netinet/in.h>
-#include <time.h>
-#include <syslog.h>
-#include <pthread.h>
-
-#include <thrust/fill.h>
-#include <thrust/device_vector.h>
-#include <thrust/sequence.h>
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-#include <thrust/scatter.h>
-
-//#include "dada_cuda.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "multilog.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_def.h"
-#include "cube/cube.h"
-#include "xgpu.h"
-
-/* global variables */
-int DEBUG = 0;
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
-{
-
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
-  dada_hdu_destroy (in);
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock write on hdu_out");
-    }
-  dada_hdu_destroy (out);
-
-} 
-
-// kernel for fluffing
-// run with 6291456 blocks of 32 threads
-__global__ void promoter(char *input, char *output) {
-
-  int idx = blockIdx.x*32 + threadIdx.x;
-  char v = input[idx];
-  
-  //output[2*idx] = ((v<<4) & 240) >> 4;
-  //output[2*idx+1] = v >> 4;
-  output[2*idx] = (char)(((unsigned char)(v) & (unsigned char)(15)) << 4) >> 4;
-  output[2*idx+1] = (char)(((unsigned char)(v) & (unsigned char)(240))) >> 4;
-  
-}
-
-void usage()
-{
-fprintf (stdout,
-	 "dsaX_xgpu [options]\n"
-	 " -c core   bind process to CPU core [no default]\n"
-	 " -d send debug messages to syslog\n"
-	 " -i in_key [default REORDER_BLOCK_KEY]\n"
-	 " -o out_key [default XGPU_BLOCK_KEY]\n"
-	 " -h print usage\n");
-}
-
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_xgpu", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-
-  // data block HDU keys
-  key_t in_key = REORDER_BLOCK_KEY;
-  key_t out_key = XGPU_BLOCK_KEY;
-  
-  // command line arguments
-  int core = -1;
-  int arg = 0;
-  
-  while ((arg=getopt(argc,argv,"c:i:o:dh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }  
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  
-  hdu_out  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");  
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %d %d\n",block_size,block_out);  
-  uint64_t  bytes_read = 0;
-  char * block;
-  char * output_buffer;
-  output_buffer = (char *)malloc(sizeof(char)*block_out);
-  uint64_t written, block_id;  
-
-  
-  // set up xgpu
-
-  // register input hdu with gpu
-  //dada_cuda_dbregister(hdu_in);
-
-  // structures and definitions
-  XGPUInfo xgpu_info;
-  int syncOp = SYNCOP_DUMP;
-  int xgpu_error = 0;
-  xgpuInfo(&xgpu_info);
-  XGPUContext context;
-  context.array_h = NULL;
-  context.matrix_h = NULL;
-  xgpu_error = xgpuInit(&context, 0);
-  if(xgpu_error) {
-    syslog(LOG_ERR, "xGPU error %d", xgpu_error);
-    dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-    return EXIT_FAILURE;
-  }
-  ComplexInput *array_h = context.array_h; // this is pinned memory
-  Complex *cuda_matrix_h = context.matrix_h;
-  memset((char *)array_h,0,2*context.array_len);
-
-  syslog(LOG_INFO,"Set up xgpu with input size %d output size %d",context.array_len,context.matrix_len);
-
-  // set up data input for fluffing
-  char * h_din = (char *)malloc(sizeof(char)*context.array_len);
-  char *d_din, *d_dout;
-  cudaMalloc((void **)&d_din, context.array_len*sizeof(char));
-  cudaMalloc((void **)&d_dout, 2*context.array_len*sizeof(char)); 
-
-  // do prestart
-  syslog(LOG_INFO, "pre-starting...");
-  char * tmp_data = (char *)malloc(sizeof(char)*context.array_len);
-  memset(tmp_data, 1, context.array_len);
-  for (int i=0;i<10;i++) {
-
-    cudaMemcpy(d_din, tmp_data, context.array_len*sizeof(char),cudaMemcpyHostToDevice);
-    promoter<<<6291456,32>>>(d_din,d_dout);
-    //xgpu_error = xgpuCudaXengine(&context, (ComplexInput *)d_dout, syncOp);
-    xgpu_error = xgpuCudaXengine(&context, syncOp);
-    xgpuClearDeviceIntegrationBuffer(&context);
-
-  }
-
-  free(tmp_data);
-  syslog(LOG_INFO, "finished with pre-start");
-  
-  // get things started
-  bool observation_complete=0;
-  bool started = 0;
-  syslog(LOG_INFO, "starting observation");
-  int blocks = 0;
-  
-  while (!observation_complete) {
-
-    if (DEBUG) syslog(LOG_DEBUG,"reading block");    
-    
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-      
-    // DO STUFF
-
-    for (int myint=0;myint<NPACKETS/NPACKETS_INTS;myint++) {
-    
-      // do fluff
-      cudaMemcpy(d_din,block+myint*block_size*NPACKETS_INTS/NPACKETS,context.array_len*sizeof(char),cudaMemcpyHostToDevice);
-      promoter<<<6291456,32>>>(d_din,d_dout);
-      //cudaMemcpy((char *)(array_h),d_dout,2*context.array_len*sizeof(char),cudaMemcpyDeviceToHost);        
-      cudaDeviceSynchronize();
-    
-      // run xgpu
-      //xgpu_error = xgpuCudaXengine(&context, (ComplexInput *)d_dout, syncOp);
-      xgpu_error = xgpuCudaXengine(&context, syncOp);
-      if(xgpu_error) {
-	syslog(LOG_ERR, "xGPU error %d\n", xgpu_error);
-	return EXIT_FAILURE;
-      }
-      
-      if (started==0 && blocks==20) {
-	syslog(LOG_INFO,"now in RUN state");
-	if (DEBUG) {
-	  for (int i=100;i<200;i++) {
-	    syslog(LOG_DEBUG,"INPUT %hhi %hhi",array_h[i].real,array_h[i].imag);
-	    syslog(LOG_DEBUG,"OUTPUT %g %g",(float)(cuda_matrix_h[i].real),(float)(cuda_matrix_h[i].imag));
-	  }
-	}
-	started=1;
-      }    
-      
-      // clear device
-      xgpuClearDeviceIntegrationBuffer(&context);
-      
-      // write to output
-      
-      written = ipcio_write (hdu_out->data_block, (char *)(cuda_matrix_h), block_out);
-      if (written < block_out)
-	{
-	  syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
-	  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-	  return EXIT_FAILURE;
-	}
-
-      if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks);	    
-      blocks++;
-
-    }
-      
-    // finish up
-    if (bytes_read < block_size)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-    
-  }
-
-  // finish up
-  free(output_buffer);
-  free(h_din);
-  cudaFree(d_din);
-  cudaFree(d_dout);
-  //dada_cuda_dbunregister(hdu_in);
-  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-  
-}
-
-
diff --git a/src/dumpfil.c b/src/dumpfil.c
deleted file mode 100644
index 0be913c..0000000
--- a/src/dumpfil.c
+++ /dev/null
@@ -1,294 +0,0 @@
-//E_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-
-// global variables
-int DEBUG = 0;
-
-void usage()
-{
-  fprintf (stdout,
-	   "dumpfil [options]\n"
-	   " -d send debug messages to syslog\n"
-	   " -p no header\n"
-	   " -f file to dump to [default none]\n"
-	   " -n blocks to dump [default 30]\n"
-	   " -i in_key [default TEST_BLOCK_KEY]\n"
-	   " -g ignore first block\n"
-	   " -h print usage\n");
-}
-
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in);
-
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in)
-{
-
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
-  dada_hdu_destroy (in);
-  
-}
-
-FILE *output;
-
-void send_string(char *string) /* includefile */
-{
-  int len;
-  len=strlen(string);
-  fwrite(&len, sizeof(int), 1, output);
-  fwrite(string, sizeof(char), len, output);
-}
-
-void send_float(char *name,float floating_point) /* includefile */
-{
-  send_string(name);
-  fwrite(&floating_point,sizeof(float),1,output);
-}
-
-void send_double (char *name, double double_precision) /* includefile */
-{
-  send_string(name);
-  fwrite(&double_precision,sizeof(double),1,output);
-}
-
-void send_int(char *name, int integer) /* includefile */
-{
-  send_string(name);
-  fwrite(&integer,sizeof(int),1,output);
-}
-
-void send_char(char *name, char integer) /* includefile */
-{
-  send_string(name);
-  fwrite(&integer,sizeof(char),1,output);
-}
-
-
-void send_long(char *name, long integer) /* includefile */
-{
-  send_string(name);
-  fwrite(&integer,sizeof(long),1,output);
-}
-
-void send_coords(double raj, double dej, double az, double za) /*includefile*/
-{
-  if ((raj != 0.0) || (raj != -1.0)) send_double("src_raj",raj);
-  if ((dej != 0.0) || (dej != -1.0)) send_double("src_dej",dej);
-  if ((az != 0.0)  || (az != -1.0))  send_double("az_start",az);
-  if ((za != 0.0)  || (za != -1.0))  send_double("za_start",za);
-}
-
-
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dumpfil", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-
-  // data block HDU keys
-  key_t in_key = 0x0000aaae;
-  
-  // command line arguments
-  char fnam[100];
-  sprintf(fnam,"/home/ubuntu/dumpfil.fil");
-  int nbl = 30;
-  int arg = 0;
-  int nhd = 0;
-  int igblock = 0;
-  
-  while ((arg=getopt(argc,argv,"f:i:n:pdgh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'f':
-	  if (optarg)
-	    {
-	      strcpy(fnam,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-f flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'n':
-	  if (optarg)
-	    {
-	      nbl = atoi(optarg);	      
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-n flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'p':
-	  nhd=1;
-	  syslog (LOG_INFO, "Will not write a header");
-	  break;
-	case 'g':
-	  igblock=1;
-	  syslog (LOG_INFO, "Will ignore first block");
-	  break;
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  syslog(LOG_INFO,"will use %d blocks",nbl);
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in);
-      return EXIT_FAILURE;
-    }
-
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  syslog(LOG_INFO, "main: have input block size %lu\n",block_size);
-  uint64_t  bytes_read = 0;
-  uint64_t npackets = 1;
-  char * block, * output_buffer;
-  uint64_t written, block_id;
-
-  // fill output buffer if file exists
-  output=fopen(fnam,"wb");
-  if(output == NULL)
-    {
-      syslog(LOG_ERR,"Error opening file");
-      exit(1);
-    }
-
-  if (!nhd) {
-    send_string("HEADER_START");
-    send_string("source_name");
-    send_string("TESTSRC");
-    send_int("machine_id",1);
-    send_int("telescope_id",82);
-    send_int("data_type",1); // filterbank data
-    send_double("fch1",1530.0); // THIS IS CHANNEL 0 :)
-    send_double("foff",-0.244140625);
-    send_int("nchans",1024);
-    send_int("nbits",8);
-    send_double("tstart",55000.0);
-    send_double("tsamp",8.192e-6*8.*16.);
-    send_int("nifs",1);
-    send_string("HEADER_END");
-  }
-  
-  int observation_complete=0;
-  int blocks = 0, started = 0;
-  
-  syslog(LOG_INFO, "starting observation");
-
-
-  while (blocks < nbl) {
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    if (!igblock || started!=0) {
-      fwrite(block, sizeof(char), bytes_read, output);
-      blocks++;
-    }
-
-    if (started==0) started=1;
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-    
-  }
-
-  fclose(output);
-  dsaX_dbgpu_cleanup (hdu_in);
-  
-}
diff --git a/src/fil2dada.c b/src/fil2dada.c
deleted file mode 100644
index c49f2b5..0000000
--- a/src/fil2dada.c
+++ /dev/null
@@ -1,521 +0,0 @@
-//E_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-//#include "ascii_header.h"
-//#include "dsaX_capture.h"
-//#include "dsaX_def.h"
-
-// global variables
-int DEBUG = 0;
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
-int dada_bind_thread_to_core (int core);
-
-/* read fil file header variables */
-char rawdatafile[80], source_name[80];
-int machine_id, telescope_id, data_type, nchans, nbits, nifs, scan_number,
-  barycentric,pulsarcentric; /* these two added Aug 20, 2004 DRL */
-double tstart,mjdobs,tsamp,fch1,foff,refdm,az_start,za_start,src_raj,src_dej;
-double gal_l,gal_b,header_tobs,raw_fch1,raw_foff;
-int nbeams, ibeam;
-/* added 20 December 2000    JMC */
-double srcl,srcb;
-double ast0, lst0;
-long wapp_scan_number;
-char project[8];
-char culprits[24];
-double analog_power[2];
-/* added frequency table for use with non-contiguous data */
-double frequency_table[4096]; /* note limited number of channels */
-long int npuls; /* added for binary pulse profile format */
-
-
-int nbins;
-double period;
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
-{
-
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
-  dada_hdu_destroy (in);
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock write on hdu_out");
-    }
-  dada_hdu_destroy (out);
-  
-}
-
-/*
-void get_string(FILE *inputfile, int *nbytes, char string[])
-{
-  int nchar;
-  size_t nRead;
-  strcpy(string,"ERROR");
-  nRead = fread(&nchar, sizeof(int), 1, inputfile);
-  if (feof(inputfile)) exit(0);
-  if (nchar>80 || nchar<1) return;
-  *nbytes=sizeof(int);
-  nRead = fread(string, nchar, 1, inputfile);
-  string[nchar]='\0';
-  *nbytes+=nchar;
-}
-*/
-
-int read_header(FILE *inputfile);
-/*
-int read_header(FILE *inputfile)
-{
-  size_t nRead;
-  char string[80], message[80];
-  int itmp,nbytes,totalbytes,expecting_rawdatafile=0,expecting_source_name=0; 
-  int expecting_frequency_table=0,channel_index;
-
-
-
-  get_string(inputfile,&nbytes,string);
-  if (!strcmp(string,"HEADER_START")) 
-	rewind(inputfile);
-	return 0;
-  }
-  totalbytes=nbytes;
-
-  while (1) {
-    get_string(inputfile,&nbytes,string);
-    if (strcmp(string,"HEADER_END")) break;
-    totalbytes+=nbytes;
-    if (strcmp(string,"rawdatafile")) {
-      expecting_rawdatafile=1;
-    } else if (strcmp(string,"source_name")) {
-      expecting_source_name=1;
-    } else if (strcmp(string,"FREQUENCY_START")) {
-      expecting_frequency_table=1;
-      channel_index=0;
-    } else if (strcmp(string,"FREQUENCY_END")) {
-      expecting_frequency_table=0;
-    } else if (strcmp(string,"az_start")) {
-      nRead = fread(&az_start,sizeof(az_start),1,inputfile);
-      totalbytes+=sizeof(az_start);
-    } else if (strcmp(string,"za_start")) {
-      nRead = fread(&za_start,sizeof(za_start),1,inputfile);
-      totalbytes+=sizeof(za_start);
-    } else if (strcmp(string,"src_raj")) {
-      nRead = fread(&src_raj,sizeof(src_raj),1,inputfile);
-      totalbytes+=sizeof(src_raj);
-    } else if (strcmp(string,"src_dej")) {
-      nRead = fread(&src_dej,sizeof(src_dej),1,inputfile);
-      totalbytes+=sizeof(src_dej);
-    } else if (strcmp(string,"tstart")) {
-      nRead = fread(&tstart,sizeof(tstart),1,inputfile);
-      totalbytes+=sizeof(tstart);
-    } else if (strcmp(string,"tsamp")) {
-      nRead = fread(&tsamp,sizeof(tsamp),1,inputfile);
-      totalbytes+=sizeof(tsamp);
-    } else if (strcmp(string,"period")) {
-      nRead = fread(&period,sizeof(period),1,inputfile);
-      totalbytes+=sizeof(period);
-    } else if (strcmp(string,"fch1")) {
-      nRead = fread(&fch1,sizeof(fch1),1,inputfile);
-      totalbytes+=sizeof(fch1);
-    } else if (strcmp(string,"fchannel")) {
-      nRead = fread(&frequency_table[channel_index++],sizeof(double),1,inputfile);
-      totalbytes+=sizeof(double);
-      fch1=foff=0.0;
-    } else if (strcmp(string,"foff")) {
-      nRead = fread(&foff,sizeof(foff),1,inputfile);
-      totalbytes+=sizeof(foff);
-    } else if (strcmp(string,"nchans")) {
-      nRead = fread(&nchans,sizeof(nchans),1,inputfile);
-      totalbytes+=sizeof(nchans);
-    } else if (strcmp(string,"telescope_id")) {
-      nRead = fread(&telescope_id,sizeof(telescope_id),1,inputfile);
-      totalbytes+=sizeof(telescope_id);
-    } else if (strcmp(string,"machine_id")) {
-      nRead = fread(&machine_id,sizeof(machine_id),1,inputfile);
-      totalbytes+=sizeof(machine_id);
-    } else if (strcmp(string,"data_type")) {
-      nRead = fread(&data_type,sizeof(data_type),1,inputfile);
-      totalbytes+=sizeof(data_type);
-    } else if (strcmp(string,"ibeam")) {
-      nRead = fread(&ibeam,sizeof(ibeam),1,inputfile);
-      totalbytes+=sizeof(ibeam);
-    } else if (strcmp(string,"nbeams")) {
-      nRead = fread(&nbeams,sizeof(nbeams),1,inputfile);
-      totalbytes+=sizeof(nbeams);
-    } else if (strcmp(string,"nbits")) {
-      nRead = fread(&nbits,sizeof(nbits),1,inputfile);
-      totalbytes+=sizeof(nbits);
-    } else if (strcmp(string,"barycentric")) {
-      nRead = fread(&barycentric,sizeof(barycentric),1,inputfile);
-      totalbytes+=sizeof(barycentric);
-    } else if (strcmp(string,"pulsarcentric")) {
-      nRead = fread(&pulsarcentric,sizeof(pulsarcentric),1,inputfile);
-      totalbytes+=sizeof(pulsarcentric);
-    } else if (strcmp(string,"nbins")) {
-      nRead = fread(&nbins,sizeof(nbins),1,inputfile);
-      totalbytes+=sizeof(nbins);
-    } else if (strcmp(string,"nsamples")) {
-      nRead = fread(&itmp,sizeof(itmp),1,inputfile);
-      totalbytes+=sizeof(itmp);
-    } else if (strcmp(string,"nifs")) {
-      nRead = fread(&nifs,sizeof(nifs),1,inputfile);
-      totalbytes+=sizeof(nifs);
-    } else if (strcmp(string,"npuls")) {
-      nRead = fread(&npuls,sizeof(npuls),1,inputfile);
-      totalbytes+=sizeof(npuls);
-    } else if (strcmp(string,"refdm")) {
-      nRead = fread(&refdm,sizeof(refdm),1,inputfile);
-      totalbytes+=sizeof(refdm);
-    } else if (expecting_rawdatafile) {
-      strcpy(rawdatafile,string);
-      expecting_rawdatafile=0;
-    } else if (expecting_source_name) {
-      strcpy(source_name,string);
-      expecting_source_name=0;
-    } else {
-      sprintf(message,"read_header - unknown parameter: %s\n",string);
-      fprintf(stderr,"ERROR: %s\n",message);
-      exit(1);
-    } 
-  } 
-
-
-  totalbytes+=nbytes;
-
-  return totalbytes;
-}
-*/
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_fake [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -f file to read packet from [default none]\n"
-	   " -i in_key [default TEST_BLOCK_KEY]\n"
-	   " -o out_key [default REORDER_BLOCK_KEY2]\n"
-	   " -n will not read header\n"
-	   " -b number of blocks to stop after\n"
-	   " -h print usage\n");
-}
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_fake", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-
-  // data block HDU keys
-  key_t in_key = 0x0000dada;
-  key_t out_key = 0x0000caca;
-  
-  // command line arguments
-  int core = -1;
-  int useZ = 1;
-  char fnam[100];
-  int arg = 0;
-  int rhead = 1;
-  int nblocks = -1;
-  
-  while ((arg=getopt(argc,argv,"c:f:i:o:nb:dh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'f':
-	  if (optarg)
-	    {
-	      useZ = 0;
-	      strcpy(fnam,optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-f flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'b':
-	  if (optarg)
-	    {
-	      nblocks = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-b flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'n':
-	  rhead=0;
-	  syslog (LOG_INFO, "Will not read header");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
-  uint64_t  bytes_read = 0;
-  uint64_t npackets = 1;
-  char * block, * output_buffer;
-  char * packet;
-  packet = (char *)malloc(sizeof(char)*block_size);
-  output_buffer = (char *)malloc(sizeof(char)*block_out);
-  memset(output_buffer,0,block_out);
-  uint64_t written, block_id;
-
-  // fill output buffer if file exists
-  FILE *fin;
-  if (!useZ) {
-
-    if (!(fin=fopen(fnam,"rb"))) {
-      syslog(LOG_ERR, "cannot open file - will write zeros");
-    }
-    else {
-
-      // DMH: FIXME
-      //if (rhead) read_header(fin);
-      
-      //		fread(packet,block_out,1,fin);
-      //		fclose(fin);
-      
-      //		syslog(LOG_INFO,"Read packet, npackets %llu",npackets);
-      
-      //      for (int i=0;i<npackets;i++)
-      //		memcpy(output_buffer,packet,block_out);
-      
-      //		syslog(LOG_INFO, "Using input packet");
-      
-    }
-
-    
-  }
-
-  // set up
-
-  int observation_complete=0;
-  int blocks = 0, started = 0;
-  
-  syslog(LOG_INFO, "starting observation");
-  
-  /*if (!(feof(fin)) {
-    fread()
-	}
-	else {
-		close and reopen file
-	}
-*/
-
-  while (!observation_complete) {
-    if (!(feof(fin))) {
-      fread(packet,block_out,1,fin);
-    }
-    else{
-      fclose(fin);
-      fin=fopen(fnam,"rb");
-      // DMH: FIXME
-      //if (rhead) read_header(fin);
-      fread(packet,block_out,1,fin);
-    }
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-    }
-
-    // DO STUFF
-    // no need to do anything here - output_buffer is ready to go
-
-	// fread goes here
-	// count blocks, increment, stop loop and reopen file (or rewind)
-
-    // write to output
-    written = ipcio_write (hdu_out->data_block, packet, block_out);
-    if (written < block_out)
-      {
-		syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
-		dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-		return EXIT_FAILURE;
-      }
-
-    if (DEBUG) {
-      syslog(LOG_DEBUG, "written block %d",blocks);      
-    }
-    blocks++;
-
-    if (blocks==nblocks)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  fclose(fin);
-  free(packet);
-  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-  
-}
diff --git a/src/flagger.c b/src/flagger.c
deleted file mode 100644
index 5262015..0000000
--- a/src/flagger.c
+++ /dev/null
@@ -1,484 +0,0 @@
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-
-#define NTIMES_P 4096	// # of time samples (assuming 1ms sampling period)
-#define NCHAN_P 1024	// # of channels on BF node side
-#define NBEAMS_P 64	// # of beams on BF side
-#define M_P NTIMES_P
-#define N_P 32
-#define HDR_SIZE 4096
-#define BUF_SIZE NTIMES_P*NCHAN_P*NBEAMS_P // size of TCP packet
-
-// global variables
-int DEBUG = 0;
-double skarray[NBEAMS_P*NCHAN_P+1];	// array with SK values -- size NCHANS * NBEAMS
-double avgspec[NBEAMS_P*NCHAN_P+1];	// spectrum over all beams to estimate median filter
-double baselinecorrec[NBEAMS_P*NCHAN_P+1];	// spectrum over all beams to estimate median filter
-int cores[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25};
-
-void swap(char *p,char *q) {
-   char t;
-   
-   t=*p; 
-   *p=*q; 
-   *q=t;
-}
-
-double medval(double a[],int n) { 
-	int i,j;
-	char tmp[n];
-	for (i = 0;i < n;i++)
-		tmp[i] = a[i];
-	
-	for(i = 0;i < n-1;i++) {
-		for(j = 0;j < n-i-1;j++) {
-			if(tmp[j] > tmp[j+1])
-				swap(&tmp[j],&tmp[j+1]);
-		}
-	}
-	return tmp[(n+1)/2-1];
-}
-
-/* THREAD FUNCTION */
-
-struct data {
-	unsigned char * indata;
-	double * inSK;
-  unsigned char * output;
-  int cnt;
-	double nThreshUp;
-	int n_threads;
-	int thread_id;
-	int debug;
-};
-
-void noise_inject(void *args) {
-	
-	struct data *d = args;
-	int thread_id = d->thread_id;
-	int dbg = d->debug;
-	// set affinity
-	const pthread_t pid = pthread_self();
-	const int core_id = cores[thread_id];
-	cpu_set_t cpuset;
-	CPU_ZERO(&cpuset);
-	CPU_SET(core_id, &cpuset);
-	const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-	if (set_result != 0)
-		syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
-	const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-	if (get_affinity != 0) 
-		syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
-	if (CPU_ISSET(core_id, &cpuset))
-	  if (dbg) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id);
-	
-	
-	// noise injection
-	
-	unsigned char *indata = (unsigned char *)d->indata;
-	double *inSK = (double *)d->inSK;
-	unsigned char *output = (unsigned char *)d->output;
-	int * cnt = (int *)d->cnt;
-	double nThreshUp = (double)d->nThreshUp;
-	int nthreads = d->n_threads;
-	int i, j, k;
-	
-	// copy from input to output
-	//memcpy(output,indata,(NBEAMS_P/nthreads)*NTIMES_P*NCHAN_P);
-	
-	//cnt[thread_id] = 0;
-	
-	for (i = 0; i < (int)(NBEAMS_P/nthreads); i++){
-	  for (k = 0; k < NCHAN_P; k++){
-	    if (inSK[i*(int)(NCHAN_P) + k] > nThreshUp){
-	      cnt[thread_id]++;
-	      //if (dbg) syslog(LOG_DEBUG,"thread %d: flagging %d %d: sk %g",thread_id,i,k,inSK[i*(int)(NCHAN_P) + k]);
-	      //for (j = 0; j < NTIMES_P; j++){
-		//output[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = (unsigned char)(20. * rand() / ( (double)RAND_MAX ) + 10.);
-		//indata[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = (unsigned char)(20. * 1. / ( (double)RAND_MAX ) + 10.);
-	      //}
-
-	      // copy from lookup table
-	      for (j = 0; j < NTIMES_P; j++)
-		indata[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = output[k*NTIMES_P+j];
-	      
-	    }
-	    /*else{
-	      for (j = 0; j < NTIMES_P; j++){
-	      output[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = indata[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k];
-	      }
-	      }*/
-	  }
-	}
-	
-	
-	
-	if (dbg) syslog(LOG_DEBUG,"thread %d: done - freeing",thread_id);
-	int thread_result = 0;
-	pthread_exit((void *) &thread_result);
-}
-
-/* END THREAD FUNCTION */
-
-void usage()
-{
-  fprintf (stdout,
-	   "flagger [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -i in_key [default dada]\n"
-	   " -o out_key [default caca]\n"
-	   " -n use noise generation rather than zeros\n"
-	   " -t SK threshold [default 5.0]\n"
-	   " -b compute and apply baseline correction\n"
-	   " -h print usage\n");
-}
-
-
-int main(int argc, char**argv)
-{
-
-  // syslog start
-  openlog ("flagger", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  // threads initialization
-  int nthreads = 16;
-  pthread_t threads[nthreads];
-  pthread_attr_t attr;
-  pthread_attr_init(&attr);
-  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-  void* result=0;
-  
-  // read command line args
-
-  // data block HDU keys
-  key_t in_key = 0x0000dada;
-  key_t out_key = 0x0000caca;
-  
-  // command line arguments
-  int core = -1;
-  int arg = 0;
-  int noise = 0;
-  double skthresh = 5.0;
-  int bcorr = 0;
-  
-  while ((arg=getopt(argc,argv,"c:t:i:o:bndh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 't':
-	  if (optarg)
-	    {
-	      skthresh = atof(optarg);
-	      syslog(LOG_INFO,"modified SKTHRESH to %g",skthresh);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-t flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'n':
-	  noise=1;
-	  syslog (LOG_INFO, "Will generate noise samples");
-	  break;	  
-	case 'b':
-	  bcorr=1;
-	  syslog (LOG_INFO, "Will calculate and apply baseline correction");
-	  break;	  
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-  
-  
-  // CONNECT AND READ FROM BUFFER
-
-  dada_hdu_t* hdu_in = 0;	// header and data unit
-  uint64_t blocksize = NTIMES_P*NCHAN_P*NBEAMS_P;	// size of buffer
-  hdu_in  = dada_hdu_create ();
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to input buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to input buffer");
-    return EXIT_FAILURE;
-  }
-  
-  uint64_t header_size = 0;
-  // read the header from the input HDU
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  
-  // mark the input header as cleared
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0){
-    syslog (LOG_ERR,"could not mark header as cleared");
-    return EXIT_FAILURE;
-  }
-  
-  uint64_t block_id, bytes_read = 0;
-  unsigned char *in_data;
-  char *cin_data;
-	     	
-  // OUTPUT BUFFER
-  dada_hdu_t* hdu_out = 0;
-  hdu_out  = dada_hdu_create ();
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"flagged_data: could not connect to dada buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write (hdu_out) < 0) {
-    syslog (LOG_ERR,"flagged_data: could not lock to dada buffer");
-    return EXIT_FAILURE;
-  }
-	
-  /* //read fake header for now
-	char head_dada[4096];
-	FILE *f = fopen("/home/dsa/dsa110-xengine/src/correlator_header_dsaX.txt", "rb");
-	fread(head_dada, sizeof(char), 4096, f);
-	fclose(f); */
-  
-  //// OUTPUT BUFFER
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  header_size = HDR_SIZE;
-  if (!header_out)
-    {
-      syslog(LOG_ERR,"couldn't read header_out");
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      return EXIT_FAILURE;
-    }
-  uint64_t written=0;
-  
-  ////////////////		
-	
-  double S1 = 0;
-  double S2 = 0;
-  double sampval;
-  double nThreshUp = skthresh;	// Threshold to apply to SK (empirical estimation)
-  struct data args[16];
-  int * flag_counts = (int *)malloc(sizeof(int)*nthreads);
-  //unsigned char * output = (unsigned char *)malloc(sizeof(char)*NBEAMS_P*NCHAN_P*NTIMES_P);
-  int nFiltSize = 21;
-  int cnt = 0;
-
-  // make array of random numbers
-  unsigned char * lookup_rand = (unsigned char *)malloc(sizeof(unsigned char)*NTIMES_P*NCHAN_P);
-  for (int i=0;i<NTIMES_P*NCHAN_P;i++) 
-    lookup_rand[i] = (unsigned char)(20. * rand() / ( (double)RAND_MAX ) + 10.);
-  
-  // put rest of the code inside while loop
-  while (1) {	
-    
-    // read a DADA block
-    cin_data = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-    in_data = (unsigned char *)(cin_data);
-    
-    // compute SK and averaged spectrum
-    S1 = 0;
-    S2 = 0;
-    sampval = 0;
-		
-    for (int i = 0; i < NBEAMS_P; i++){
-      for (int k = 0; k < NCHAN_P; k++){
-	for (int j = 0; j < NTIMES_P; j++){
-	  sampval = (double)in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k];
-	  avgspec[i*(int)(NCHAN_P) + k] += sampval / NTIMES_P;
-	  S1 += sampval;
-	  S2 += sampval * sampval;
-	  skarray[i*(int)(NCHAN_P) + k] = (double)((M_P*N_P+1) / (M_P-1) * ( (M_P*S2)/(S1*S1) - 1 ));
-	}
-	S1 = 0;
-	S2 = 0;
-      }
-    }
-    if (DEBUG) syslog (LOG_DEBUG,"has computed SK.");
-    if (DEBUG) syslog(LOG_DEBUG,"example SK value : %g", (double)skarray[10]);
-		
-    // compute baseline correction
-    if (bcorr) {
-      for (int i = 0; i < NBEAMS_P*NCHAN_P-nFiltSize; i++)
-	baselinecorrec[i] = medval(&avgspec[i],nFiltSize);
-    }
-    		
-    
-    // compare SK values to threshold and
-    // replace thresholded channels with noise or 0
-    
-    if (noise){
-
-      for (int i=0;i<nthreads;i++) flag_counts[i] = 0;
-      for (int i=0; i<nthreads; i++) {
-	args[i].indata = in_data + i*(int)((NBEAMS_P/nthreads)*NCHAN_P*NTIMES_P);
-	args[i].inSK = skarray + i*(int)(NBEAMS_P/nthreads*NCHAN_P);
-	args[i].output = lookup_rand;
-	args[i].cnt = flag_counts;
-	args[i].nThreshUp = nThreshUp;
-	args[i].n_threads = nthreads;
-	args[i].thread_id = i;
-	args[i].debug = DEBUG;
-      }
-      if (DEBUG) syslog(LOG_DEBUG,"creating %d threads",nthreads);
-      for(int i=0; i<nthreads; i++){
-	if (pthread_create(&threads[i], &attr, &noise_inject, (void *)(&args[i]))) {
-	  syslog(LOG_ERR,"Failed to create noise_inject thread %d\n", i);
-	}
-      }
-      /*for(int i=0; i<nthreads; i++){
-	for(int j=0; j<(int)(NBEAMS_P/nthreads*NCHAN_P*NTIMES_P); i++){
-	  in_data[i*(int)(NBEAMS_P/nthreads*NCHAN_P*NTIMES_P)+j] = args[i].output[j];
-	}
-	}*/
-      pthread_attr_destroy(&attr);
-
-      for(int i=0; i<nthreads; i++){
-	pthread_join(threads[i], &result);
-	if (DEBUG) syslog(LOG_DEBUG,"joined thread %d",i);
-      }
-
-      cnt = 0;
-      for(int i=0; i<nthreads; i++) cnt += flag_counts[i];
-      //memcpy(in_data,output,sizeof(in_data));
-    }
-    else{
-      for (int i = 0; i < NBEAMS_P; i++){
-	for (int k = 0; k < NCHAN_P; k++){
-	  if (skarray[i*(int)(NCHAN_P) + k] > nThreshUp){
-	    cnt++;
-	    for (int j = 0; j < NTIMES_P; j++){
-	      in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = 0;
-	    }
-	  }
-	}
-      }
-    }
-    syslog (LOG_INFO,"%d channels*baselines flagged",cnt);
-		
-    // apply baseline correction
-    if (bcorr) {
-      for (int i = 0; i < NBEAMS_P; i++){
-	for (int k = 0; k < NCHAN_P; k++){
-	  for (int j = 0; j < NTIMES_P; j++){
-	    //in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = (unsigned char)(in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] / (unsigned char)baselinecorrec[i*(int)NCHAN_P+k]);
-	    in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k] = (unsigned char)((double)(in_data[i*(int)(NCHAN_P*NTIMES_P)+j*(int)NCHAN_P+k]) / baselinecorrec[i*(int)NCHAN_P+k]);
-	  }
-	}
-      }
-      
-      syslog (LOG_DEBUG,"baseline correction applied");
-    }
-		
-    // close block after reading
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-    if (DEBUG) syslog(LOG_DEBUG,"closed read block");		
-    
-    written = ipcio_write (hdu_out->data_block, (char *)(in_data), BUF_SIZE);
-    if (written < BUF_SIZE)
-      {
-	syslog(LOG_ERR,"write error");
-	return EXIT_FAILURE;
-      }
-
-    if (DEBUG) syslog (LOG_DEBUG,"write flagged data done.");
-		
-    
-  }
-
-  free(lookup_rand);
-  return 0;    
-} 
diff --git a/src/gpu_flagger.cu b/src/gpu_flagger.cu
deleted file mode 100644
index 07e6f5c..0000000
--- a/src/gpu_flagger.cu
+++ /dev/null
@@ -1,1547 +0,0 @@
-// -*- c++ -*-
-/*#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-1;95;0c#include <thread>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-*/
-#include <iostream>
-#include <algorithm>
-using std::cout;
-using std::cerr;
-using std::endl;
-#include <thread>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <netinet/in.h>
-#include <time.h>
-#include <arpa/inet.h>
-#include <sys/syscall.h>
-#include <syslog.h>
-#include <curand.h>
-#include <curand_kernel.h>
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "multilog.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-
-#include <src/sigproc.h>
-#include <src/header.h>
-
-
-#define NTIMES_P 16384  // # of time samples (assuming 1ms sampling period)
-#define NCHAN_P 1024	// # of channels on BF node side
-#define NBEAMS_P 64	// # of beams on BF side
-#define M_P NTIMES_P
-#define N_P 32
-#define HDR_SIZE 4096
-#define BUF_SIZE NTIMES_P*NCHAN_P*NBEAMS_P // size of TCP packet
-#define NTHREADS_GPU 32
-#define MN 48.0
-#define SIG 6.0
-#define RMAX 16384
-//#define NPERMFLAGS 58
-#define NPERMFLAGS 1
-#define TBIN 128
-#define FBIN 8
-
-// global variables
-int DEBUG = 0;
-//int flagchannels[58] = {737,738,753,754,721,722,723,724,725,726,727,728,729,627,628,629,630,631,632,633,634,603,604,605,606,607,608,609,610,578,579,580,581,582,583,584,585,590,591,592,593,594,595,596,597,598,680,681,682,683,684,685,686,687,688,327,328,329};
-int flagchannels[1] = {10};
-/* global variables */
-int quit_threads = 0;
-int dump_pending = 0;
-int trignum = 0;
-char iP[100];
-char footer_buf[1024];
-char flnam[1024];
-int dumpbm;
-
-// structure for pulse injection
-typedef struct {
-
-  int verbose;
-  float * block;
-
-} dsaX_pulse_t;
-
-
-
-
-// kernel to calculate median spectrum
-// only works on <NTHREADS_GPU/2 in median
-__global__
-void fix_zspec(float * s0, float * v0, int naver) {
-
-  int block_id = blockIdx.x;
-  int thread_id = threadIdx.x;
-  int tid;
-  int ct_lt = 0;
-  
-  // sorted place
-  int place = (int)(naver/2);
-
-  // copy into shared memory
-  extern __shared__ float vec[];
-
-  // for mean spec
-  if (thread_id<naver) {
-
-    tid=thread_id;
-    vec[thread_id] = s0[tid*NBEAMS_P*NCHAN_P + block_id];
-
-  }
-
-  // for var spec
-  if (thread_id>=naver && thread_id<2*naver) {
-
-    tid=thread_id-naver;
-    vec[thread_id] = v0[tid*NBEAMS_P*NCHAN_P + block_id];
-
-  }
-
-  __syncthreads();
-
-  if (thread_id<naver) {   
-    for (int i=0;i<naver;i++) {
-      if (i!=thread_id) {
-	if (vec[i]<=vec[thread_id]) ct_lt++;
-      }
-    }
-  }
-
-  if (thread_id>=naver && thread_id<2*naver) {   
-    for (int i=naver;i<2*naver;i++) {
-      if (i!=thread_id) {
-	if (vec[i]<=vec[thread_id]) ct_lt++;
-      }
-    }
-  }
-
-  __syncthreads();
-
-
-  if (thread_id<naver) 
-    if (ct_lt==place) s0[block_id] = vec[thread_id];
-  
-  if (thread_id>=naver && thread_id<2*naver)
-    if (ct_lt==place) v0[block_id] = vec[thread_id];
-  
-}
-
-// kernel to calculate mean spectrum
-// launch with NBEAMS_P*NCHAN_P blocks of NTHREADS_GPU threads 
-__global__
-void calc_spectrum(unsigned char *data, float * spectrum) {
-
-  int block_id = blockIdx.x;
-  int thread_id = threadIdx.x;
-  __shared__ float csum[NTHREADS_GPU];
-  csum[thread_id] = 0.;
-
-  int bm =(int)( block_id/NCHAN_P);
-  int ch = (int)(block_id % (NCHAN_P));
-  int tm0 = (int)(thread_id*(NTIMES_P/NTHREADS_GPU));
-  
-  // find sum of local times
-  int idx0 = bm*NTIMES_P*NCHAN_P + tm0*NCHAN_P + ch;
-  for (int tm=0; tm<NTIMES_P/NTHREADS_GPU; tm++) {    
-    csum[thread_id] += (float)(data[idx0]);
-    idx0 += NCHAN_P;
-  }
-
-  __syncthreads();
-  
-  // sum into shared memory
-  if (thread_id<16) {
-    csum[thread_id] += csum[thread_id+16];
-    __syncthreads();
-    csum[thread_id] += csum[thread_id+8];
-      __syncthreads();
-    csum[thread_id] += csum[thread_id+4];
-      __syncthreads();
-    csum[thread_id] += csum[thread_id+2];
-      __syncthreads();
-    csum[thread_id] += csum[thread_id+1];
-      __syncthreads();
-  }
-  /*  
-  int maxn = NTHREADS_GPU/2;
-  int act_maxn = maxn;
-  if (thread_id<maxn) {
-    while (act_maxn>0) {
-      csum[thread_id] += csum[thread_id+act_maxn];
-      act_maxn = (int)(act_maxn/2);
-    }
-  }
-  */
-  
-  if (thread_id==0) {    
-    spectrum[bm*NCHAN_P+ch] = csum[thread_id] / (1.*NTIMES_P);
-  }
-
-}
-
-
-// kernel to calculate variance spectrum
-// launch with NBEAMS_P*NCHAN_P blocks of NTHREADS_GPU threads 
-__global__
-void calc_varspec(unsigned char *data, float * spectrum, float * varspec) {
-
-  int block_id = blockIdx.x;
-  int thread_id = threadIdx.x;
-  __shared__ float csum[NTHREADS_GPU];
-  csum[thread_id] = 0.;
-
-  int bm =(int)( block_id/NCHAN_P);
-  int ch = (int)(block_id % (NCHAN_P));
-  int tm0 = (int)(thread_id*(NTIMES_P/NTHREADS_GPU));
-  float val;
-  
-  // find sum of local times
-  int idx0 = bm*NTIMES_P*NCHAN_P + tm0*NCHAN_P + ch;
-  for (int tm=0; tm<NTIMES_P/NTHREADS_GPU; tm++) {    
-    val = (float)(data[idx0]) - spectrum[bm*NCHAN_P + ch];
-    csum[thread_id] += val*val;
-    idx0 += NCHAN_P;
-  }
-  
-  __syncthreads();
-  
-  // sum into shared memory
-  if (thread_id<16) {
-    csum[thread_id] += csum[thread_id+16];
-    __syncthreads();
-    csum[thread_id] += csum[thread_id+8];
-        __syncthreads();
-    csum[thread_id] += csum[thread_id+4];
-        __syncthreads();
-    csum[thread_id] += csum[thread_id+2];
-        __syncthreads();
-    csum[thread_id] += csum[thread_id+1];
-        __syncthreads();
-  }
-  /*
-  int maxn = NTHREADS_GPU/2;
-  int act_maxn = maxn;
-  if (thread_id<maxn) {
-    while (act_maxn>0) {
-      csum[thread_id] += csum[thread_id+act_maxn];
-      act_maxn = (int)(act_maxn/2);
-    }
-    }*/
-
-  if (thread_id==0) {    
-    varspec[bm*NCHAN_P+ch] = csum[thread_id] / (1.*NTIMES_P);
-  }
-
-}
-
-// kernel to calculate maximum value
-// launch with NBEAMS_P*NCHAN_P blocks of NTHREADS_GPU threads 
-__global__
-void calc_maxspec(unsigned char *data, float * maxspec) {
-
-  int block_id = blockIdx.x;
-  int thread_id = threadIdx.x;
-  __shared__ float csum[NTHREADS_GPU];
-  csum[thread_id] = 0.;
-
-  int bm =(int)( block_id/NCHAN_P);
-  int ch = (int)(block_id % (NCHAN_P));
-  int tm0 = (int)(thread_id*(NTIMES_P/NTHREADS_GPU));
-  float val=0.;
-  
-  // find max of local times
-  int idx0 = bm*NTIMES_P*NCHAN_P + tm0*NCHAN_P + ch;
-  for (int i=idx0;i<idx0+NCHAN_P*(NTIMES_P/NTHREADS_GPU);i+=NCHAN_P) {
-    if ((float)(data[i])>val) val = (float)(data[i]);
-  }
-  csum[thread_id] = val;
-  
-  __syncthreads();
-  
-  // sum into shared memory
-  int maxn = NTHREADS_GPU/2;
-  int act_maxn = maxn;
-  if (thread_id<maxn) {
-    while (act_maxn>0) {
-      if (csum[thread_id]<csum[thread_id+act_maxn])
-	csum[thread_id]=csum[thread_id+act_maxn];
-      act_maxn = (int)(act_maxn/2);
-    }
-  }
-
-  if (thread_id==0) {    
-    maxspec[bm*NCHAN_P+ch] = csum[thread_id];
-  }
-
-}
-
-// kernel to calculate p-p spec with binning (default 128)
-// launch with NBEAMS_P*NCHAN_P blocks of NTHREADS_GPU threads 
-__global__
-void calc_ppspec(unsigned char *data, float * ppspec) {
-
-  int block_id = blockIdx.x;
-  int thread_id = threadIdx.x;
-  __shared__ float csum[NTHREADS_GPU];
-  csum[thread_id] = 0.;
-
-  int bm =(int)( block_id/NCHAN_P);
-  int ch = (int)(block_id % (NCHAN_P));
-  int tm0 = (int)(thread_id*(NTIMES_P/NTHREADS_GPU));
-  float val=0.;
-
-  // local times start at tm0
-  float vv;
-  int idx0;
-  
-  // find max of local times
-  for (int j=0;j<(NTIMES_P/NTHREADS_GPU)/TBIN;j++) {
-    idx0=bm*NTIMES_P*NCHAN_P + (tm0+j*TBIN)*NCHAN_P + ch;
-    vv = 0.;    
-    for (int i=idx0;i<idx0+NCHAN_P*TBIN;i+=NCHAN_P) 
-      vv += (float)(data[i]);
-    vv /= (1.*TBIN);      
-    if (vv>val) val = vv;
-  }
-  csum[thread_id] = val;
-  
-  __syncthreads();
-  
-  // sum into shared memory
-  int maxn = NTHREADS_GPU/2;
-  int act_maxn = maxn;
-  float v1;
-  if (thread_id<maxn) {
-    while (act_maxn>0) {
-      if (csum[thread_id]<csum[thread_id+act_maxn])
-	csum[thread_id]=csum[thread_id+act_maxn];
-      act_maxn = (int)(act_maxn/2);
-    }
-  }
-  if (thread_id==0) v1=csum[thread_id];
-  act_maxn = maxn;
-  if (thread_id<maxn) {
-    while (act_maxn>0) {
-      if (csum[thread_id]>csum[thread_id+act_maxn])
-	csum[thread_id]=csum[thread_id+act_maxn];
-      act_maxn = (int)(act_maxn/2);
-    }
-  }
-  if (thread_id==0)
-    ppspec[bm*NCHAN_P+ch] = v1-csum[thread_id];
-
-}
-
-
-// kernel to scale data
-// launch with NBEAMS_P*NTIMES_P*NCHAN_P/NTHREADS_GPU blocks of NTHREADS_GPU threads
-__global__
-void scaley(unsigned char *data, float *spectrum, float *varspec) {
-
-  int idx = blockIdx.x*NTHREADS_GPU + threadIdx.x;
-  int bm = (int)(idx / (NTIMES_P*NCHAN_P));
-  int ch = (int)(idx % NCHAN_P);
-  int spidx = bm*NCHAN_P+ch;
-
-  float val = (float)(data[idx]);
-  val = (val-spectrum[spidx])*(SIG/sqrtf(varspec[spidx])) + MN;
-  data[idx] = (unsigned char)((__float2uint_rn(2.*val))/2);
-  
-
-}
-
-// kernel to add pulse to data
-// launch with NBEAMS_P*NTIMES_P*NCHAN_P/NTHREADS_GPU blocks of NTHREADS_GPU threads
-__global__
-void sumpulse(unsigned char *data, float *summand) {
-
-  int idx = blockIdx.x*NTHREADS_GPU + threadIdx.x;
-  float val = (float)(data[idx]);
-  val += summand[idx];
-  data[idx] = (unsigned char)((__float2uint_rn(2.*val))/2);
-  
-}
-
-
-
-
-// kernel to make time series from data
-// run with NBEAMS_P*NTIMES_P blocks of 32 threads
-__global__
-void make_ts(unsigned char *data, float *ts) {
-
-  int block_id = blockIdx.x;
-  int thread_id = threadIdx.x;
-  int idx = blockIdx.x*NTHREADS_GPU + threadIdx.x;
-  int bm = (int)(blockIdx.x/NTIMES_P);
-  int tm = (int)(blockIdx.x % NTIMES_P);
-  int ch0 = (int)(thread_id*(NCHAN_P/NTHREADS_GPU));
-
-  __shared__ float csum[NTHREADS_GPU];
-  csum[thread_id] = 0.;
-  
-  // find sum of local chans
-  int idx0 = bm*NTIMES_P*NCHAN_P + tm*NCHAN_P + ch0;
-  for (int ch=0; ch<NCHAN_P/NTHREADS_GPU; ch++) {    
-    csum[thread_id] += (float)(data[idx0]);
-    idx0++;
-  }
-
-  __syncthreads();
-  
-  // sum into shared memory
-  if (thread_id<16) {
-    csum[thread_id] += csum[thread_id+16];
-    __syncthreads();
-    csum[thread_id] += csum[thread_id+8];
-      __syncthreads();
-    csum[thread_id] += csum[thread_id+4];
-      __syncthreads();
-    csum[thread_id] += csum[thread_id+2];
-      __syncthreads();
-    csum[thread_id] += csum[thread_id+1];
-      __syncthreads();
-  }
-  
-  if (thread_id==0) {    
-    ts[bm*NTIMES_P+tm] = csum[thread_id] / (1.*NCHAN_P);
-  }
-  
-}
-
-
-// kernel to do flagging
-// launch with n_mask*NTIMES_P/NTHREADS_GPU blocks of NTHREADS_GPU threads 
-__global__
-void flag(unsigned char *data, int * midx, unsigned char *repval, float *bpwr) {
-
-  int block_id = blockIdx.x;
-  int thread_id = threadIdx.x;
-  int midx_idx = (int)(block_id/(NTIMES_P/NTHREADS_GPU));
-  
-  int bm = (int)(midx[midx_idx] / NCHAN_P);
-  int ch = (int)(midx[midx_idx] % NCHAN_P);
-  int tm = ((int)(block_id % (NTIMES_P/NTHREADS_GPU)))*NTHREADS_GPU + thread_id;
-  int idx = bm*NTIMES_P*NCHAN_P + tm*NCHAN_P + ch;  
-
-  // do replacement
-  //data[idx] = repval[ch*NTIMES_P+tm]*bpwr[bm];
-  data[idx] = MN*bpwr[bm];
-    
-}
-
-// kernel to do time-series flagging
-// launch with n_mask*(NCHAN_P-256)/NTHREADS_GPU blocks of NTHREADS_GPU threads 
-__global__
-void flagts(unsigned char *data, int * midx, unsigned char *repval, float *bpwr) {
-
-  int block_id = blockIdx.x;
-  int thread_id = threadIdx.x;
-  int midx_idx = (int)(block_id/((NCHAN_P-256)/NTHREADS_GPU));
-  
-  int bm = (int)(midx[midx_idx] / NTIMES_P);
-  int tm = (int)(midx[midx_idx] % NTIMES_P);
-  int ch = ((int)(block_id % ((NCHAN_P-256)/NTHREADS_GPU)))*NTHREADS_GPU + thread_id + 128;
-  int idx = bm*NTIMES_P*NCHAN_P + tm*NCHAN_P + ch;  
-
-  // do replacement
-  //data[idx] = repval[ch*NTIMES_P+tm]*bpwr[bm];
-  data[idx] = MN*bpwr[bm];
-    
-}
-
-
-// kernel to make random numbers
-// launch with NTIMES_P*NCHAN_P/NTHREADS_GPU blocks of NTHREADS_GPU threads 
-__global__
-void genrand(unsigned char *repval, unsigned int seed) {
-
-  int block_id = blockIdx.x;
-  int thread_id = threadIdx.x;
-  
-  // for random number
-  curandState_t state;
-  float u1, u2, va;
-  curand_init(seed, block_id*NTHREADS_GPU+thread_id, 1, &state);
-  u1 = ((float)(curand(&state) % RMAX))/(1.*RMAX);
-  u2 = ((float)(curand(&state) % RMAX))/(1.*RMAX);
-  va = sqrtf(-2.*logf(u1))*cosf(2.*M_PI*u2);
-
-  // do replacement
-  repval[block_id*NTHREADS_GPU+thread_id] = (unsigned char)(__float2uint_rn(2.*(va*SIG+MN))/2);
-    
-}
-
-
-
-// assumed spec has size NBEAMS_P*NCHAN_P
-// ref is reference value
-void genmask(float *spec, float thresh, float ref, int *mask) {
-
-  for (int i=0;i<NBEAMS_P*NCHAN_P;i++) {
-    if (fabs(spec[i]-ref)>thresh) mask[i] = 1;
-  }
-
-}
-
-
-
-float medval(float *a,int n);
-
-float medval(float *a,int n) { 
-  int i,j;
-  float tmp[n], tt;
-  for (i = 0;i < n;i++)
-    tmp[i] = a[i];
-  
-  for(i = 0;i < n-1;i++) {
-    for(j = 0;j < n-i-1;j++) {
-      if(tmp[j] > tmp[j+1]) {
-
-	tt = tmp[j+1];
-	tmp[j+1] = tmp[j];
-	tmp[j] = tt;
-
-      }
-    }
-  }
-
-  return tmp[(int)((n+1)/2-1)];
-}
-
-void channflag(float* spec, float Thr, int * mask);
-void simple_channflag(float* spec, float Thr, int * mask);
-void simple_tsflag(float* ts, float Thr, int * mask);
-
-void simple_channflag(float* spec, float Thr, int * mask) {
-	
-  int i, j;
-  float* medspec;			// median values for each beam spectrum
-  float* madspec;			// mad for each beam spectrum
-  float* normspec;			// corrected spec - median value (for MAD calculation)
-
-  medspec = (float *)malloc(sizeof(float)*NBEAMS_P);
-  madspec = (float *)malloc(sizeof(float)*NBEAMS_P);
-  normspec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
-    
-  int ZeroChannels = 128; 
-  int nFilt, idx;
-  	
-  // calculate median value for each beam
-  for (i = 0; i < NBEAMS_P; i++)
-    medspec[i] = medval(spec + i*NCHAN_P + ZeroChannels,NCHAN_P-2*ZeroChannels);
-  
-  // compute MAD for each beam
-  for (i = 0; i < NBEAMS_P; i++){
-    for (j = ZeroChannels; j < NCHAN_P-ZeroChannels; j++){
-      normspec[j-ZeroChannels] = fabs(spec[i*NCHAN_P+j]-medspec[i]);
-    }
-    madspec[i] = medval(normspec,NCHAN_P-2*ZeroChannels);
-  }
-	
-  // mask
-  float vv;
-  float mythr = Thr/sqrt(1.*FBIN);
-  for (i = 0; i < NBEAMS_P; i++){
-
-    // implement FBIN    
-    for (j = ZeroChannels; j < NCHAN_P-ZeroChannels-FBIN; j++) {
-      vv = 0.;
-      for (int k=0;k<FBIN;k++)
-	vv += spec[i*NCHAN_P+j];
-      vv = (vv/(1.*FBIN)-medspec[i]);
-
-      if (vv > mythr*madspec[i]) mask[i*NCHAN_P+j] = 1;
-      
-    }
-    
-  }
-  
-  free(medspec);
-  free(madspec);
-  free(normspec);
-  
-}
-
-void simple_tsflag(float* spec, float Thr, int * mask) {
-	
-  int i, j;
-  float* medspec;			// median values for each beam spectrum
-  float* madspec;			// mad for each beam spectrum
-  float* normspec;			// corrected spec - median value (for MAD calculation)
-
-  medspec = (float *)malloc(sizeof(float)*NBEAMS_P);
-  madspec = (float *)malloc(sizeof(float)*NBEAMS_P);
-  normspec = (float *)malloc(sizeof(float)*NBEAMS_P*NTIMES_P);
-    
-  int nFilt, idx;
-  	
-  // calculate median value for each beam
-  for (i = 0; i < NBEAMS_P; i++)
-    medspec[i] = medval(spec + i*NTIMES_P,NTIMES_P/16);
-  
-  // compute MAD for each beam
-  for (i = 0; i < NBEAMS_P; i++){
-    for (j = 0; j < NTIMES_P/16; j++){
-      normspec[j] = fabs(spec[i*NTIMES_P+j]-medspec[i]);
-    }
-    madspec[i] = medval(normspec,NTIMES_P/16);
-  }
-	
-  // mask
-  float vv;
-  float mythr = Thr;
-  for (i = 0; i < NBEAMS_P; i++){
-
-    for (j = 0; j < NTIMES_P; j++) {
-
-      vv = spec[i*NTIMES_P+j]-medspec[i];
-      if (vv > mythr*madspec[i]) mask[i*NTIMES_P+j] = 1;
-      
-    }
-    
-  }
-  
-  free(medspec);
-  free(madspec);
-  free(normspec);
-  
-}
-
-
-void channflag(float* spec, float Thr, int * mask) {
-	
-  int i, j;
-  float* baselinecorrec;	// baseline correction
-  float* CorrecSpec;			// corrected spectrum
-  float* medspec;			// median values for each beam spectrum
-  float* madspec;			// mad for each beam spectrum
-  float* normspec;			// corrected spec - median value (for MAD calculation)
-
-  baselinecorrec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
-  CorrecSpec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
-  medspec = (float *)malloc(sizeof(float)*NBEAMS_P);
-  madspec = (float *)malloc(sizeof(float)*NBEAMS_P);
-  normspec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
-  
-  
-  int ZeroChannels = 128; 
-  int nFiltSize = 21;
-  int nFilt, idx;
-  
-  // calculate median filtered spectrum
-  for (i=0;i<NBEAMS_P;i++) {
-    for (j=ZeroChannels;j<NCHAN_P-ZeroChannels;j++) {
-      
-      if (NCHAN_P-ZeroChannels-j>=nFiltSize)
-	CorrecSpec[i*NCHAN_P+j] = spec[i*NCHAN_P+j] - medval(spec + i*NCHAN_P+j,nFiltSize);
-      else
-	CorrecSpec[i*NCHAN_P+j] = spec[i*NCHAN_P+j] - medval(spec + i*NCHAN_P+NCHAN_P-ZeroChannels-nFiltSize,nFiltSize);
-
-    }
-  }
-	
-  // calculate median value for each beam
-  for (i = 0; i < NBEAMS_P; i++)
-    medspec[i] = medval(CorrecSpec + i*NCHAN_P + ZeroChannels,NCHAN_P-2*ZeroChannels);
-  
-  // compute MAD for each beam
-  for (i = 0; i < NBEAMS_P; i++){
-    for (j = ZeroChannels; j < NCHAN_P-ZeroChannels; j++){
-      normspec[j-ZeroChannels] = fabs(CorrecSpec[i*NCHAN_P+j]-medspec[i]);
-    }
-    madspec[i] = medval(normspec,NCHAN_P-2*ZeroChannels);
-  }
-	
-  // mask  
-  for (i = 0; i < NBEAMS_P; i++){    
-    for (j = ZeroChannels; j < NCHAN_P-ZeroChannels; j++){
-      if (CorrecSpec[i*NCHAN_P+j] > Thr * madspec[i] || CorrecSpec[i*NCHAN_P+j] < - Thr * madspec[i])
-	mask[i*NCHAN_P+j] = 1;
-
-      // for permanent flagging
-      for (int kk=0;kk<NPERMFLAGS;kk++) {
-	if (j==flagchannels[kk]) mask[i*NCHAN_P+j] = 1;
-      }
-      
-    }
-    
-  }
-  
-  free(baselinecorrec);
-  free(CorrecSpec);
-  free(medspec);
-  free(madspec);
-  free(normspec);
-  
-}
-
-
-// to gather mask indices
-void gather_mask(int *h_idx, int *h_mask, int *n_mask) {
-
-  (*n_mask) = 0;
-  for (int i=0;i<NBEAMS_P*NCHAN_P;i++) {
-    if (h_mask[i]==1) {      
-      h_idx[(*n_mask)] = i;
-      //if (DEBUG) syslog(LOG_INFO,"%d %d %d",i,h_mask[i],(*n_mask));
-      (*n_mask) += 1;
-    }
-  }
-
-}
-
-// to gather ts mask indices
-void gather_tsmask(int *h_idx, int *h_mask, int *n_mask) {
-
-  (*n_mask) = 0;
-  for (int i=0;i<NBEAMS_P*NTIMES_P;i++) {
-    if (h_mask[i]==1) {      
-      h_idx[(*n_mask)] = i;
-      //if (DEBUG) syslog(LOG_INFO,"%d %d %d",i,h_mask[i],(*n_mask));
-      (*n_mask) += 1;
-    }
-  }
-
-}
-
-
-// to calculate bpwr from spectrum
-void calc_bpwr(float *h_spec, float *h_bpwr);
-void calc_bpwr(float *h_spec, float *h_bpwr) {
-
-  for (int i=0;i<NBEAMS_P;i++) {
-    h_bpwr[i] = 0.;
-    for (int j=0;j<NCHAN_P;j++) 
-      h_bpwr[i] += h_spec[i*NCHAN_P+j];
-    h_bpwr[i] = (h_bpwr[i]/(1.*(NCHAN_P-256)))/MN;
-    
-  }
-
-}
-
-// to medianise zero specs
-void median_calc(float * arr);
-void median_calc(float * arr) {
-
-  int stride = NCHAN_P;
-  float tt;
-  
-  for (int chan=0;chan<NCHAN_P;chan++) {
-
-    for(int i = 0;i < NBEAMS_P-1;i++) {
-      for(int j = 0;j < (NBEAMS_P-i-1);j++) {
-
-	if(arr[j*stride+chan] > arr[(j+1)*stride+chan]) {
-
-	  tt = arr[(j+1)*stride+chan];
-	  arr[(j+1)*stride+chan] = arr[(j)*stride+chan];
-	  arr[(j)*stride+chan] = tt;
-
-	}
-      }
-    }
-
-  }
-
-  for (int i=0;i<NCHAN_P;i++)
-    arr[i] = arr[i+31*NCHAN_P];
-
-  for (int j=1;j<NBEAMS_P;j++) {
-    for (int i=0;i<NCHAN_P;i++)
-      arr[j*NCHAN_P + i] = arr[i];
-  }
-
-}
-
-// Thread to control the adding of filterbanks
-void control_thread (dsaX_pulse_t * ctx) {
-
-  syslog(LOG_INFO, "control_thread: starting");
-
-  // buffer for incoming command strings, and setup of socket
-  int bufsize = 1024;
-  char* buffer = (char *) malloc (sizeof(char) * bufsize);
-  char* tbuf = (char *) malloc (sizeof(char) * bufsize);
-  memset(buffer, '\0', bufsize);
-  const char* whitespace = " ";
-  char * command = 0;
-  char * args = 0;
-  double * tmpblock = (double *)malloc(sizeof(double)*NTIMES_P*NCHAN_P);
-
-  struct addrinfo hints;
-  struct addrinfo* res=0;
-  memset(&hints,0,sizeof(hints));
-  struct sockaddr_storage src_addr;
-  socklen_t src_addr_len=sizeof(src_addr);
-  hints.ai_family=AF_INET;
-  hints.ai_socktype=SOCK_DGRAM;
-  getaddrinfo(iP,"11228",&hints,&res);
-  int fd;
-  ssize_t ct;
-  char tmpstr;
-  char cmpstr = 'p';
-  char *endptr;
-  uint64_t tmps;
-  char * token;
-  double maxval;
-
-  FILE *fin;
-  
-  while (!quit_threads) {
-    
-    fd = socket(res->ai_family,res->ai_socktype,res->ai_protocol);
-    bind(fd,res->ai_addr,res->ai_addrlen);
-    memset(buffer,'\0',sizeof(buffer));
-    syslog(LOG_INFO, "control_thread: waiting for packet");
-    ct = recvfrom(fd,buffer,1024,0,(struct sockaddr*)&src_addr,&src_addr_len);
-    
-    syslog(LOG_INFO, "control_thread: received buffer string %s",buffer);
-    strcpy(tbuf,buffer);
-    trignum++;
-
-    // interpret buffer string    
-    char * rest = buffer;
-    int tmp_dumpbm = (float)(strtof(strtok(rest, "-"),&endptr));
-    if (tmp_dumpbm<0 || tmp_dumpbm>63) tmp_dumpbm=32;
-    char * tmp_flnam = strtok(NULL, "-");
-    
-    if (!dump_pending) {
-      strcpy(flnam,tmp_flnam);
-      dumpbm = tmp_dumpbm;
-      syslog(LOG_INFO, "control_thread: received command to add pulse %s to beam %d",flnam,dumpbm);
-      if (!(fin=fopen(flnam,"rb"))) {
-	syslog(LOG_INFO,"cannot open %s",flnam);
-      }
-      else {
-	fread(tmpblock,sizeof(double),1024*16384,fin);
-
-	// do manipulation of data
-	maxval = 0.;
-	for (int i=0;i<16384*1024;i++) {
-	  if (tmpblock[i]>maxval) maxval = tmpblock[i];
-	}
-	for (int i=0;i<16384;i++) {
-	  for (int j=0;j<1024;j++) {
-	    //ctx->block[i*1024+j] = (float)(tmpblock[j*16384+i]*2.*SIG/maxval);
-	    ctx->block[i*1024+j] = (float)(tmpblock[j*16384+i]);
-	  }
-	}
-	
-	fclose(fin);
-	syslog(LOG_INFO, "control_thread: finished processing pulse - setting dump_pending");
-      }
-    }
-	
-    if (dump_pending) {
-      syslog(LOG_ERR, "control_thread: BACKED UP - ignoring %s",tbuf);
-    }
-  
-    if (!dump_pending) dump_pending = 1;
-    
-    close(fd);
-    
-  }
-
-  free (buffer);
-  free (tbuf);
-  free(tmpblock);
-
-  if (ctx->verbose)
-    syslog(LOG_INFO, "control_thread: exiting");
-
-}
-
-
-void usage()
-{
-  fprintf (stdout,
-	   "flagger [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -i in_key [default dada]\n"
-	   " -o out_key [default caca]\n"
-	   " -t flagging threshold [default 5.0]\n"
-	   " -f output spectra file\n"
-	   " -g output beam power file\n"
-	   " -n number of blocks in baseline spec aver (must be <=16 and >=1, default 5)\n"
-	   " -p adjust noise level according to power\n"
-	   " -m generate random data\n"
-	   " -s time-series flagging and threshold [no default]\n"
-	   " -q modulation index threshold for tot pwr flagging [default 0.0005]\n"
-	   " -h print usage\n");
-}
-
-
-int main(int argc, char**argv)
-{
-
-  // syslog start
-  openlog ("gpu_flagger", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  // set cuda device
-  cudaSetDevice(1);
-  
-  // read command line args
-
-  // data block HDU keys
-  key_t in_key = 0x0000dada;
-  key_t out_key = 0x0000caca;
-  
-  // command line arguments
-  int core = -1;
-  int arg = 0;
-  double thresh = 5.0;
-  float mod_thresh = 0.0005;
-  int naver = 5;
-  char * fnam;
-  char * fnam2;
-  FILE *fout;
-  FILE *fout2;
-  FILE *f0;
-  
-  fnam = (char *)malloc(sizeof(char)*200);
-  fnam2 = (char *)malloc(sizeof(char)*200);
-  int fwrite = 0;
-  int fwrite2 = 0;
-  int pwr = 0;
-  int mkrand = 0;
-  int tsflag = 0;
-  float tsthresh = 10.;
-  
-  while ((arg=getopt(argc,argv,"c:t:i:o:f:g:a:k:s:mdph")) != -1)
-    {
-      switch (arg)
-	{
-	case 'k':
-	  strcpy(iP,optarg);
-	  break;	
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'f':
-	  if (optarg)
-	    {
-	      strcpy(fnam,optarg);
-	      fwrite = 1;
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-f flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'q':
-	  if (optarg)
-	    {
-	      mod_thresh = atof(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-q flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'a':
-	  if (optarg)
-	    {
-	      naver = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-a flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'g':
-	  if (optarg)
-	    {
-	      //strcpy(fnam2,optarg);
-	      sprintf(fnam2,"%s_%f.dat",optarg,40587.0+time(NULL)/86400.0);
-	      fwrite2 = 1;
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-g flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 't':
-	  if (optarg)
-	    {
-	      thresh = atof(optarg);
-	      syslog(LOG_INFO,"modified THRESH to %g",thresh);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-t flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 's':
-	  if (optarg)
-	    {
-	      tsthresh = atof(optarg);
-	      tsflag=1;
-	      syslog(LOG_INFO,"TSTHRESH is %g",tsthresh);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-s flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'p':
-	  pwr=1;
-	  break;
-	case 'm':
-	  mkrand=1;
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  dsaX_pulse_t udpdb;
-  udpdb.verbose = DEBUG;
-  float * pulsedata = (float *)malloc(sizeof(float)*256*16384*1024);
-  udpdb.block = pulsedata;
-  
-  // CONNECT AND READ FROM BUFFER
-
-  dada_hdu_t* hdu_in = 0;	// header and data unit
-  hdu_in  = dada_hdu_create ();
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to input buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to input buffer");
-    return EXIT_FAILURE;
-  }
-
-  if (DEBUG) syslog(LOG_INFO,"connected to input buffer");
-  
-  uint64_t header_size = 0;
-  // read the header from the input HDU
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  
-  // mark the input header as cleared
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0){
-    syslog (LOG_ERR,"could not mark header as cleared");
-    return EXIT_FAILURE;
-  }
-  
-  uint64_t block_id, bytes_read = 0;
-  unsigned char *in_data;
-  char *cin_data;
-	     	
-  // OUTPUT BUFFER
-  dada_hdu_t* hdu_out = 0;
-  hdu_out  = dada_hdu_create ();
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"flagged_data: could not connect to dada buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write (hdu_out) < 0) {
-    syslog (LOG_ERR,"flagged_data: could not lock to dada buffer");
-    return EXIT_FAILURE;
-  }
-
-  if (DEBUG) syslog(LOG_INFO,"connected to output");
-  
-  
-  //// OUTPUT BUFFER
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  header_size = HDR_SIZE;
-  if (!header_out)
-    {
-      syslog(LOG_ERR,"couldn't read header_out");
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      return EXIT_FAILURE;
-    }
-  uint64_t written=0;
-
-  if (DEBUG) syslog(LOG_INFO,"copied header");
-  
-  ////////////////		
-
-  // declare stuff for host and GPU
-  unsigned char * d_data;
-  float * d_pulse;
-  unsigned char * h_bm0 = (unsigned char *)malloc(sizeof(unsigned char)*NTIMES_P*NCHAN_P);
-  cudaMalloc((void **)&d_data, NBEAMS_P*NTIMES_P*NCHAN_P*sizeof(unsigned char));
-  cudaMalloc((void **)&d_pulse, NBEAMS_P*NTIMES_P*NCHAN_P*sizeof(float));
-  unsigned char * h_data = (unsigned char *)malloc(sizeof(unsigned char)*NBEAMS_P*NTIMES_P*NCHAN_P);
-  int * h_mask = (int *)malloc(sizeof(int)*NBEAMS_P*NCHAN_P);
-  int * d_mask;
-  cudaMalloc((void **)&d_mask, NBEAMS_P*NCHAN_P*sizeof(int));
-  int * h_tsmask = (int *)malloc(sizeof(int)*NBEAMS_P*NTIMES_P);
-  int * d_tsmask;
-  cudaMalloc((void **)&d_tsmask, NBEAMS_P*NTIMES_P*sizeof(int));
-  float * d_spec, * d_oldspec;
-  cudaMalloc((void **)&d_spec, NBEAMS_P*NCHAN_P*sizeof(float));
-  cudaMalloc((void **)&d_oldspec, NBEAMS_P*NCHAN_P*sizeof(float));
-  float * d_ts;
-  cudaMalloc((void **)&d_ts, NBEAMS_P*NTIMES_P*sizeof(float));
-  float * h_bpwr = (float *)malloc(sizeof(float)*NBEAMS_P);
-  float * d_bpwr;
-  cudaMalloc((void **)&d_bpwr, NBEAMS_P*sizeof(float));
-  float * h_spec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
-  float * h_ts = (float *)malloc(sizeof(float)*NBEAMS_P*NTIMES_P);
-  float * h_beam = (float *)malloc(sizeof(float)*NBEAMS_P);
-  float * h_bmask = (float *)malloc(sizeof(float)*NBEAMS_P);
-  float * h_subspec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
-  float * h_var = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
-  float * h_max = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
-  float * h_pp = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
-  float * h_oldspec = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
-  float *h_spec0 = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
-  float *h_var0 = (float *)malloc(sizeof(float)*NBEAMS_P*NCHAN_P);
-  float *d_spec0, *d_var0;
-  cudaMalloc((void **)&d_spec0, NBEAMS_P*NCHAN_P*naver*sizeof(float));
-  cudaMalloc((void **)&d_var0, NBEAMS_P*NCHAN_P*naver*sizeof(float));
-  for (int i=0;i<NBEAMS_P*NCHAN_P;i++) h_oldspec[i] = 0.;
-  cudaMemcpy(d_oldspec, h_oldspec, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyHostToDevice);
-  float * d_var, * d_max, * d_pp;
-  cudaMalloc((void **)&d_var, NBEAMS_P*NCHAN_P*sizeof(float));
-  cudaMalloc((void **)&d_max, NBEAMS_P*NCHAN_P*sizeof(float));
-  cudaMalloc((void **)&d_pp, NBEAMS_P*NCHAN_P*sizeof(float));
-  int * h_idx = (int *)malloc(sizeof(int)*NBEAMS_P*NCHAN_P);
-  int * d_idx;
-  cudaMalloc((void **)&d_idx, NBEAMS_P*NCHAN_P*sizeof(int));
-  int * h_tsidx = (int *)malloc(sizeof(int)*NBEAMS_P*NTIMES_P);
-  int * d_tsidx;
-  cudaMalloc((void **)&d_tsidx, NBEAMS_P*NTIMES_P*sizeof(int));
-  int n_mask = 0;
-  int n_tsmask = 0;
-  float prev_tpwr = 0., tpwr = 0.;
-
-  // random numbers
-  unsigned char *d_repval;
-  cudaMalloc((void **)&d_repval, NTIMES_P*NCHAN_P*sizeof(unsigned char));
-  genrand<<<NTIMES_P*NCHAN_P/NTHREADS_GPU,NTHREADS_GPU>>>(d_repval,time(NULL));
-  for (int i=0;i<NBEAMS_P;i++) h_bpwr[i] = 1.;
-  syslog(LOG_INFO,"done with repvals");
-
-  // start control thread
-  /*int rval = 0;
-  pthread_t control_thread_id;
-  syslog(LOG_INFO, "starting control_thread()");
-  rval = pthread_create (&control_thread_id, 0, (void *) control_thread, (void *) &udpdb);
-  if (rval != 0) {
-    syslog(LOG_ERR, "Error creating control_thread: %s", strerror(rval));
-    return -1;
-    }*/
-  std::thread threadObj(control_thread, &udpdb);
-
-  
-  // for pre-start
-  unsigned char * tmp_indata = (unsigned char *)malloc(sizeof(unsigned char)*NBEAMS_P*NTIMES_P*NCHAN_P);
-  for (int i=0;i<NBEAMS_P;i++)
-    cudaMemcpy(tmp_indata, d_repval, NTIMES_P*NCHAN_P*sizeof(unsigned char), cudaMemcpyDeviceToHost);
-  int prestart = 2;
-  int gotDada = 0;
-  
-  int started = 0;
-  int blockn = 0;
-  
-  // put rest of the code inside while loop
-  while (1) {	
-    
-    // read a DADA block
-    if (prestart==0)  {
-      cin_data = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-      in_data = (unsigned char *)(cin_data);
-      gotDada=1;
-      blockn++;
-    }
-    else
-      in_data = (unsigned char *)(tmp_indata);
-
-    // deal with bm0
-    /*memcpy(h_data+NTIMES_P*NCHAN_P,in_data+NTIMES_P*NCHAN_P,(NBEAMS_P-1)*NTIMES_P*NCHAN_P);
-    memcpy(h_bm0,in_data,NTIMES_P*NCHAN_P);
-    memcpy(h_data,h_data+NTIMES_P*NCHAN_P,NTIMES_P*NCHAN_P);*/
-    
-
-    if (DEBUG) syslog(LOG_INFO,"read block");
-
-    /* 
-       if not first block, correct data
-       1 - measure spectrum
-       2 - measure varspec
-       if first block, proceed.
-       else
-       3 - measure maximum value
-       4 - use three spectra to derive channel flags
-       5 - flag
-     */
-
-    // copy data to device
-    cudaMemcpy(d_data, in_data, NBEAMS_P*NTIMES_P*NCHAN_P*sizeof(unsigned char), cudaMemcpyHostToDevice);
-    //cudaMemset(d_data, 8, NBEAMS_P*NTIMES_P*NCHAN_P);
-
-    // if not first block, correct data
-    if (started==1 || prestart==1) 
-      scaley<<<NBEAMS_P*NTIMES_P*NCHAN_P/NTHREADS_GPU,NTHREADS_GPU>>>(d_data, d_spec0, d_var0);
-
-    if (DEBUG) syslog(LOG_INFO,"copied data and scaled");
-    
-    // measure spectrum and varspec
-    calc_spectrum<<<NBEAMS_P*NCHAN_P, NTHREADS_GPU>>>(d_data, d_spec);
-    calc_varspec<<<NBEAMS_P*NCHAN_P, NTHREADS_GPU>>>(d_data, d_spec, d_var);
-    cudaMemcpy(h_spec, d_spec, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToHost);
-    cudaMemcpy(h_var, d_var, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToHost);
-    if (started==0) {
-      for (int i=0;i<NBEAMS_P;i++) {
-	for (int j=0;j<NCHAN_P;j++) prev_tpwr += h_spec[i*NCHAN_P+j];
-      }
-    }
-
-    if (DEBUG) syslog(LOG_INFO,"done spec and var");
-    
-    // if not first block
-    if (started==1 || prestart==1) {
-
-      // do total power check
-      tpwr = 0.;
-      for (int i=0;i<NBEAMS_P;i++) {
-	for (int j=0;j<NCHAN_P;j++) tpwr += h_spec[i*NCHAN_P+j];
-      }
-
-      if (fabs(tpwr-prev_tpwr)/prev_tpwr >= mod_thresh) {
-
-	syslog(LOG_INFO,"mod_idx %f (threshold %f), noise replacement",fabs(tpwr-prev_tpwr)/prev_tpwr,mod_thresh);
-
-	for (int i=0;i<NBEAMS_P;i++)
-	  cudaMemcpy(d_data + i*NTIMES_P*NCHAN_P,d_repval,NTIMES_P*NCHAN_P*sizeof(unsigned char), cudaMemcpyDeviceToDevice);	
-
-      }
-      else {
-             
-	// calc maxspec
-	calc_spectrum<<<NBEAMS_P*NCHAN_P, NTHREADS_GPU>>>(d_data, d_max);
-	calc_ppspec<<<NBEAMS_P*NCHAN_P, NTHREADS_GPU>>>(d_data, d_pp);
-
-	// derive channel flags
-	cudaMemcpy(h_max, d_max, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToHost);
-	cudaMemcpy(h_pp, d_pp, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToHost);
-	for (int i=0;i<NBEAMS_P*NCHAN_P;i++) {
-	  h_mask[i] = 0;
-	  h_subspec[i] = h_spec[i]-h_oldspec[i];
-	}
-	channflag(h_subspec,thresh,h_mask);
-	channflag(h_var,thresh+0.5,h_mask);
-	channflag(h_max,thresh,h_mask);
-	simple_channflag(h_pp,thresh,h_mask);
-	
-	// calc bpwr if needed
-	if (pwr) calc_bpwr(h_spec,h_bpwr);
-	cudaMemcpy(d_bpwr, h_bpwr, NBEAMS_P*sizeof(float), cudaMemcpyHostToDevice);
-	
-	// apply mask
-	gather_mask(h_idx, h_mask, &n_mask);
-	if (DEBUG) syslog(LOG_INFO,"FLAG_COUNT %d",n_mask);   		
-	cudaMemcpy(d_idx, h_idx, n_mask*sizeof(int), cudaMemcpyHostToDevice);
-	
-	// replace with random data
-	if (mkrand==1) {
-	  for (int i=0;i<NBEAMS_P;i++)
-	    cudaMemcpy(d_data + i*NTIMES_P*NCHAN_P,d_repval,NTIMES_P*NCHAN_P*sizeof(unsigned char), cudaMemcpyDeviceToDevice);
-	}
-	
-	
-	// check whether we want to add pulse
-	if (dump_pending) {
-	  
-	  syslog(LOG_INFO, "adding pulse %s to beam %d", flnam, dumpbm);
-	  cudaMemset(d_pulse, 0, NBEAMS_P*NTIMES_P*NCHAN_P*sizeof(float));
-	  cudaMemcpy(d_pulse + dumpbm*NTIMES_P*NCHAN_P,pulsedata,NTIMES_P*NCHAN_P*sizeof(float), cudaMemcpyHostToDevice);
-	  sumpulse<<<NBEAMS_P*NTIMES_P*NCHAN_P/NTHREADS_GPU, NTHREADS_GPU>>>(d_data, d_pulse);
-	  syslog(LOG_INFO, "added %s to beam %d", flnam, dumpbm);
-	  
-	  dump_pending=0;
-	  
-	}
-	
-	if (mkrand==0) 
-	  flag<<<n_mask*NTIMES_P/NTHREADS_GPU, NTHREADS_GPU>>>(d_data, d_idx, d_repval, d_bpwr);
-	
-	// ts flagging if needed
-	if (tsflag) {
-	  
-	  make_ts<<<NBEAMS_P*NTIMES_P,NTHREADS_GPU>>>(d_data,d_ts);
-	  syslog(LOG_INFO,"made ts");
-	  cudaMemcpy(h_ts, d_ts, NBEAMS_P*NTIMES_P*sizeof(float), cudaMemcpyDeviceToHost);
-	  syslog(LOG_INFO,"copied ts");
-	  for (int i=0;i<NBEAMS_P*NTIMES_P;i++) 
-	    h_tsmask[i] = 0;
-	  simple_tsflag(h_ts,tsthresh,h_tsmask);
-	  syslog(LOG_INFO,"tsflagged");
-	  gather_tsmask(h_tsidx, h_tsmask, &n_tsmask);	
-	  syslog(LOG_INFO,"TS_COUNT %d",n_tsmask);   		
-	  cudaMemcpy(d_tsidx, h_tsidx, n_tsmask*sizeof(int), cudaMemcpyHostToDevice);
-	  flagts<<<n_tsmask*(NCHAN_P-256)/NTHREADS_GPU, NTHREADS_GPU>>>(d_data, d_tsidx, d_repval, d_bpwr);
-	  syslog(LOG_INFO,"flagged ts");
-	  
-	}
-      
-      }
-
-    }
-
-    // deal with tpwr
-    prev_tpwr = tpwr;
-    
-    // copy data to host and write to buffer
-    cudaMemcpy(h_data, d_data, NBEAMS_P*NTIMES_P*NCHAN_P*sizeof(unsigned char), cudaMemcpyDeviceToHost);
-    
-    // deal with bm0
-    //memcpy(h_data,h_bm0,NTIMES_P*NCHAN_P);
-    
-    // close block after reading
-    if (prestart==0) {
-      ipcio_close_block_read (hdu_in->data_block, bytes_read);
-      if (DEBUG) syslog(LOG_DEBUG,"closed read block");		    
-      written = ipcio_write (hdu_out->data_block, (char *)(h_data), BUF_SIZE);
-      if (written < BUF_SIZE)
-	{
-	  syslog(LOG_ERR,"write error");
-	  return EXIT_FAILURE;
-	}
-    }
-
-    if (prestart==1) {
-      syslog(LOG_INFO,"Finishing with pre-start run-through");
-      prestart=0;
-
-      // search for spec0 and var0 file
-      if (f0=fopen("/home/ubuntu/data/specvar0.dat","r")) {
-
-	//f0=fopen("/home/ubuntu/data/specvar0.dat","r");
-	for (int i=0;i<NBEAMS_P*NCHAN_P;i++)
-	  fscanf(f0,"%f %f\n",&h_spec0[i],&h_var0[i]);
-	fclose(f0);
-	cudaMemcpy(d_spec0, h_spec0, NBEAMS_P*NCHAN_P*sizeof(float),cudaMemcpyHostToDevice);
-	cudaMemcpy(d_var0, h_var0, NBEAMS_P*NCHAN_P*sizeof(float),cudaMemcpyHostToDevice);
-	started=1;
-	syslog(LOG_INFO,"Read init weight from file");
-
-      }
-      
-    }
-
-    
-    // deal with started and oldspec
-    if (started==0 || prestart==2) {
-      if (gotDada==1 || prestart==2) {
-	if (blockn>0) {
-	  cudaMemcpy(d_spec0 + (blockn-1)*NBEAMS_P*NCHAN_P, d_spec, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToDevice);
-	  cudaMemcpy(d_var0 + (blockn-1)*NBEAMS_P*NCHAN_P, d_var, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToDevice);
-	}
-	if (blockn==0) {
-	  cudaMemcpy(d_spec0, d_spec, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToDevice);
-	  cudaMemcpy(d_var0, d_var, NBEAMS_P*NCHAN_P*sizeof(float), cudaMemcpyDeviceToDevice);
-	}
-      }
-      if (prestart==0 && gotDada==1 && blockn >= naver) {
-	started=1;
-	if (naver>1) fix_zspec<<<NBEAMS_P*NCHAN_P, NTHREADS_GPU, 2*naver*sizeof(float)>>>(d_spec0, d_var0, naver);
-	cudaMemcpy(h_spec0, d_spec0, NBEAMS_P*NCHAN_P*sizeof(float),cudaMemcpyDeviceToHost);
-	cudaMemcpy(h_var0, d_var0, NBEAMS_P*NCHAN_P*sizeof(float),cudaMemcpyDeviceToHost);
-	median_calc(h_spec0);
-	median_calc(h_var0);
-	cudaMemcpy(d_spec0, h_spec0, NBEAMS_P*NCHAN_P*sizeof(float),cudaMemcpyHostToDevice);
-	cudaMemcpy(d_var0, h_var0, NBEAMS_P*NCHAN_P*sizeof(float),cudaMemcpyHostToDevice);
-	syslog(LOG_INFO,"writing out weights...");
-	
-	// write out weights
-	f0=fopen("/home/ubuntu/data/specvar.dat","w");
-	for (int i=0;i<NBEAMS_P*NCHAN_P;i++)
-	  fprintf(f0,"%f %f\n",h_spec0[i],h_var0[i]);
-	fclose(f0);
-
-	
-      }
-      else if (prestart==2 && gotDada==0) {
-	prestart=1;
-	syslog(LOG_INFO,"Pre-starting");
-      }
-    }
-      
-    for (int i=0;i<NBEAMS_P*NCHAN_P;i++) {
-      h_oldspec[i] = h_spec[i];
-    }
-    
-    if (fwrite && prestart==0 && started==1) {
-      fout=fopen(fnam,"a");      
-      for (int i=0;i<NCHAN_P*NBEAMS_P;i++) fprintf(fout,"%d %g %g %g\n",h_mask[i],h_subspec[i],h_var[i],h_max[i]);
-      fclose(fout);
-    }
-    if (fwrite2) {
-      fout2=fopen(fnam2,"a");
-      for (int i=0;i<NBEAMS_P;i++) {
-	h_beam[i] = 0.;
-	h_bmask[i] = 0.;
-	for (int j=0;j<NCHAN_P;j++) {
-	  h_beam[i] += h_spec[i*NCHAN_P+j];
-	  h_bmask[i] += 1.*h_mask[i*NCHAN_P+j];
-	}
-	fprintf(fout2,"%g %g\n",h_beam[i],h_bmask[i]);
-      }
-      fclose(fout2);
-    }
-
-    if (DEBUG) syslog(LOG_INFO,"done with round");
-    
-
-  }
-
-  // close control thread
-  syslog(LOG_INFO, "joining control_thread");
-  quit_threads = 1;
-  threadObj.join();
-
-  free(fnam);
-  free(fnam2);
-  free(fout2);
-  free(h_data);
-  free(h_mask);
-  free(h_beam);
-  free(h_bmask);
-  free(h_spec);
-  free(h_var);
-  free(h_pp);
-  free(h_max);
-  free(h_bm0);
-  free(h_bpwr);
-  free(h_ts);
-  free(h_tsidx);
-  free(h_tsmask);
-  free(pulsedata);
-  cudaFree(d_ts);
-  cudaFree(d_tsidx);
-  cudaFree(d_tsmask);
-  cudaFree(d_bpwr);
-  cudaFree(d_max);
-  cudaFree(d_pp);
-  cudaFree(d_data);
-  cudaFree(d_spec);
-  cudaFree(d_var);
-  cudaFree(d_mask);
-  cudaFree(d_spec0);
-  cudaFree(d_var0);
-  cudaFree(d_pulse);
-  return 0;    
-} 
diff --git a/src/spectrometer_header.txt b/src/spectrometer_header.txt
deleted file mode 100644
index 88a535c..0000000
--- a/src/spectrometer_header.txt
+++ /dev/null
@@ -1,38 +0,0 @@
-ACC_LEN      1                 
-BANDWIDTH    -250                   
-BW           -250                   
-CFREQ        1405                   
-CHAN_AV      0                      
-DEC          00:00:00.000           
-DSB          0                      
-FILE_SIZE    0                      
-FREQ         1405.000000            
-FSCRUNCH     1                      
-HDR_SIZE     4096                   
-HDR_VERSION  1.0                    
-INSTRUMENT   DSAX                   
-MODE         ACCUM                  
-NBEAM        1                      
-NBIT         8                       
-NCHAN        1024                   
-NDIM         1                      
-NPOL         1                      
-N_PROD       1                      
-OBSERVER     DSA                    
-OBS_OFFSET   0                      
-OBS_UNIT     SECONDS                
-OBS_VAL      0000.0000              
-PID          P000                   
-RA           00:00:00.000           
-RECEIVER     SANDY                  
-RESOLUTION   4096                   
-SOURCE       DSATEST                   
-STATE        Coherence              
-TELESCOPE    DSA110                 
-TSAMP        262.144                     
-TSCRUNCH     1                      
-TRANSFER_SIZE 256000000            
-NANT         5                     
-UTC_START    2015-08-07-17:07:28    
-FILE_NUMBER  0                      
-
diff --git a/src/splice_offline_beams b/src/splice_offline_beams
deleted file mode 100755
index 728af8c0f771d3c851b2b05a00201bb8672e361d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 32432
zcmeHwdw5jUx%b*L37KSanGnDrSO*+1D7kUJh$aX)@kB%}iWM9tlSwi%nZ&t3LOC^D
zwB|r6X{qISYz5m>slCYQ(NpTFr%_a_$F|s7<*5BiTiY|VDp;+eSaN>vTI-$JvuA>R
zp7TA=_s_Rs_FC&*zxTS<UVG2p@9f;Mw0W7$Fa&RQaiyTv2A85V=#E1PT|v<x=7~b=
zJ>n8E3hg{eQ~eF9K(E@)Xwh6cG~JGp_~zLZ!CFq#I4;>jD)V(;uHy6rMdeZmw<HiB
z+1hcDY@&3_3YQ199S3)+e&!=G@%3oF9?i$)X>Ayn>>v3?pVeBfwOoQe+PEYvZ7Y5s
zZ>9RY@v@m!#uZs`$$U3}kL>(gH$A*!rPeo6T=r@Exzz58+Hk04&aB#SYjrpjN%U0r
z%$rj^XI4$Dt7eAuo8%Mrn&oSxNi^PCl<XgaHqog6$;#NX+h1OL*+2cZ?^x~OS^b}T
z&2f%+`Ge};yln$=bPn`E(A+ub&7kGN*Ns4*HUhnT1p1XD(7Q&UPac8(H0W-8tlL}=
za@p@3fxdJE`Y%9t<73@cf<QJ|H$Mou^a{+208N8AqOGet7!f{SEbb3%^aVOL`r7=V
zun73W;jVy)#iL<#w*|sov7qSG#NJpu*eTjJM?>+TXp08@tq>6C@%!3B5q~(eB}nqR
zqoGK=jU=K)`bb4I?(6i2ATSyXiffvi7GLd~Q8Tk<R;r$^&8V3zd`+vb^R)(}!S)bT
zMps{Vb+{`MT<vcOlZy7vu80=uQ_!41#DM0sP5!g%lG{|~VGi5z5rU7m`1nvEtqgZ*
z+L%zNgpPzS=yu{l>bHse(WZ^pCtg=J&_u)5t=A!@yEWb6cFTUIbNl$kicV_;ABO}y
zVWKk+(UT^65fSuu+C(p9BFZx+dYOsdr|AWtQ5<-?+eD{zfVVv+I<IZix6ed3umAf^
zbo2Um&_pNQyd5&p5$M!**hHsehqoaUy_kt8pEJ=hfT``6iOyqAo!%X9I=s8`6P0e^
zy>lpT8$RaU@wDTJ7@qyd3*2IO%A44iUEF{gwL6IA<cVQyQ+`Wr3f;+Lvi&NxX$Vga
z$@b5wO(8saNVcD+Hihowe%XGC+7z;rdu01@YE!6A_R045s7)a{*)7`-Q=6v3$qlmo
zAhjt}Cs)Y!eblBYak3%B-?j&Bxi#L@c;k;y>uFrmxO&YhZ~wdhLAEy!#2nsjZ}z~>
zR`0I7DOI3(`*(WC@cN--8zg~ie|}T{)5#DT-u`Er26lS-(0$#&w>^D;U)TTN-u^s~
zci>Ke#o61x#N%k%HO@P*=n>4fACQCI{+HkVM%kB#CW@UyWq1A(J~r)m*3q=%nY!lr
z(>;k7n;~gGaLv1Rk-g2kcHB)|10@MaczinSRJQh?svl||c+_(Yl%x~ZH20rs9teB3
zCck?QrVT9d^enCaxwk*;SwUu05hYGFH`ThSR_TTH`$#$(UTTm=YBUfgOf?tPI;iF#
zfV2gTdDQ6B=sLo*di(P{+UUlWu=gb#d5`Z&+xx+i{?(q=`k}@fv`@)3($Lxe4q$Wt
z2g#{uJ`OxAc=A7JZos3XJ3YHmqe<=U&tEsrEs|x#71%53-d!bL&=B8V2$6wFp86~L
zca_wEE~QyS(Q%XfPr{^S<S7}8Q!>6KWqeD8<5cnrCGuBeEg~^D$hzd#OmU9^CvEZe
zpGs~}ya%kj@Qi{<{%R*_x+d#0McoEAc=srdUgcf66t@)vu&d;{44KLIKbDTq&7i6n
z?u2MMNt(i$JWB9%4206OCZ*{iDQ>H$4w`;9#_V}=x6<^J464%92Q4O_=#WgV&(w4;
z*x=I{465>Jk=9fNO}!a1lapmveg&*T=emr}^z@*~LI>(W(st@>hF?`m+uoG6pHcs=
zt)8_I=u8Q85bL9|EB`ydqVw$vv(wqy_7lstOl^;Y2fkgN(&pFNRzToaQUay6WwNW}
zb15p%`6ZsMbor?Vi&nwpg_&wrDK$UFWL6>k=p*Hn8*1ic2u%K3c9pm@sN@Y+2(m+~
zp|vUbuY^s{i8n_}Z$c?GU({+Uq2_1kOdnv$9@&-O3oJUnm(iJy9XUfIMXOG7X{NT%
zDs4te+Z3(M3xWQWKpDFtrL96~TaxPJ5UK-Ht+#4fOTO}9hHoyV?LHiJDumyse$0yo
zXnVJueUsYmmt7^lN>N!2mQ>SZCu*`gQ_VNQg9$M|rDlaz<AR#q83L2Dm6}Z%)O76R
z`cR%J{#qse*Er6U7w?=QFG^;R44mMwh7QY)rfo;5nS4?9FB*;h<QmkM>{{&!ul;o6
zj;D;qZ6DF0TXyFh8C;w=n*0Ad=^_TKV-A${L*8BY%hO@2cjiQSek{8aYJ_*&({$Fn
zWqsqi#`TT2Hu{cu2WCHs*m?V3NiGMcv;pfn!29zdPM%tlPk(^Z)-If6n){C?kD}r2
ze;fXK2R^T0r@j3j(e*~lJLeo-!kaJ8soS2SgX5Vq<=X`o2ieu<IyO8^6jzGk))Y4_
zAoHJ0{Lrfc;N9_Top*lR<4FAD$sI5(*$uVzZ#DJ5c&oSnx85CZoL;fIc_9CJTx;MQ
zwn`R@UI1oz?2Tn^@m|@*OQ>g2FB)sS1Npb3^v*xy?f-pzJT9Fsqb4q$O4bjbXsao^
zcqf7*#QG!TMY>HBf`9|FXGk&A&Vk?i<T<;y|FrkX<W*kdS?`PI;yB9k-zilBi%H-}
z^7<Bi2$)FJdUq^3O0_j)PqBB`qUX^{{`GzM>4gi==D&!xLFV+qEb;CUh=e}`Uw@LW
zZ&VvsL}zg==rUae;rx$FpqY?_+4T~ncn7w6^3umYt_kvzaq14N_T&L`2pB!;zfRsF
z`5*)HCQr_r#3)=^x^6j=JpGs9VXe>o%tz%*JjEGWE1~rXKp^o}t#wR>)-lO*)D5j;
zfLUPH`g_GURr3|6`I67eE?fd-8svsSK6J^bq{k~|-&xoPA5zzb5CHowQhNTQL`B7F
z->uZG?Yj?{R8*EKzPmJ^L)j-+oXccaPWyiJ-iY?~$8$vQ4&V_P_XySvQMb14<6@I_
z4=KJRXf)R5=<SwWIjx(7u00&DhP!ha^!B^bpnJ4nYq)An25mGOG)@_GQ>H<*{wB{#
zkvR?8gD$qmivVR0->u4?VQ22)dw{xifL;S8HGFp|zN4DYJbW8vS5A8bx?0{@?>+Sf
zj<B^EbIDsjB&WbLAJ#lwcHdCh(x>PvLM^^VJ#*GoT|leehc(Znd&W}Fhcz!qrZjmI
zLIQ^Tm$4$F`?++_I_mkb<|PZmGYBCu{Ig;>mS*@C^?X?KdIm$Hj=;3uqw6?>6Bl~1
zl6zCfQ`wzGFb@a$+9P`~{SFkV=z63mc}@Pz+46Os23&@C<dmQLXBB4;&@qW#bUP%-
za3?RM&Rr$9XekX!isT!9XL$HeJJh<ot7LvA0x^SY<*gnE^fVA2t%)A&R(s|pp9U)p
zZ37nMj}aR+w7W{)E5xe#5OwY<Nuo~kf1XL@Zge3Akc$08&&mJ7R}xy`0sn!9YYS8Q
zU8JABz)G*dS3ZWUX!g`4uQXW^F<B8ID<&x`TC~_YlNF=b3Ky(cCv6$yIXRKO9_=cr
zAHj~w3_HMyK#(=XWDUM1CJ&uX$K*Kd<(Qmsz?v^oC)R(|DJE4WYkJTnV^S%tA)BOR
zSFz{hP59E1EC4{Ccj)_fQ~xXU?Yntk%3b;LR9a2ts>c4mHub;XJni@LUc{4swI|z8
zLFb)s#ohI<@%gX0|D)#q_m}jaY8)Q(x_8GB!#n?1i9gc43ey$Hp7i?t?h&ExZ2*BP
zu5xT577h5K{@Y~P8oW*Tx2St*VvAo_BDmKkqAj9xV{0htuI_eEiTS3)gxuHCzO|+e
zw+c762=CIyB}-TOR;_Maxmv`!645}=7x8z}9m7CJC=&FAT17k<4#onw+enR8f86hj
z_jcp<VW6WPHzC{FL?qDRkHkczB@~Z|xV%}2!r$2~BB8bz>sY#c2^^5yX}X1h#qY}v
zvTXR7YQOf!hA+K5HR-IWzsi_kUl>Pc2|nJphlhtyIuOnz%1V^)VYXG_+GY>h^Kdal
zmwyLQdQf`lLK9^YWf&!0-_y5w2fm3sigF*y<5Xh4t;KQbz}3Tkl(daeg)p|P6h@EB
zm{45c*kcs9h&~aYJ-8gWTw|!njTd-!qwsxiDceP<YguXKwPnuDjy`eKg$pm6>6y&@
z6qZW(=pcgL+$h)KQwJZriPl)^+F`q<)NS(@mAV>B9gR-m2DuNPt-uVTWK5lHdj{s~
z=sNQWnCF0rBYyW}$mz?F^CtRE07EfP>)MuqaUrhFi2F#GO0doX<}ffHQp~R{-C%p8
zsMN98DK?<F0(}z^GZzJ8ajEMb+v3v7&)ebnK;Gigs@?gGrFD0WYAl_%qhM9(p#pn@
zQF^#wed$nv{R{c1HG*baR#Z9<u*TA=#ieerL(F0sFo>jB(;S$Gi|ZijCLK#tI=rOA
zOFFJAZHVR)Chs@Xy1M~#^wc3~|9|^`6al_(&-dvuG;-szShwY!Sk!4<p=}9GNNoCX
zVCs2*e#|NOn7mNIQqKiwEufA2|8j1)i`x7gkG{XtwjxiJboYR^!+NsNH)7h}QiX8p
zQkx^j>NXd+WRY8kzW>d0&Y>OTb%@T>v|XSIu|X?l`hIN~+dZUlbhnE(*8eHhO!<c;
zQ~C#Wk`+s*Y;Vx=u!M5I<ir_Mwy)IvINW8M{m|n7uNC~f*edW|?f5rz`ER;>R+lg9
z@(;RvUzekFfX3-^sxIf~(yHf^+Weg8nyar~;I3NJl8D3;?zuIyYwD_JCS<Grv-R_8
z>Sj$-RB;6)szFv7yY&O8oArRM(1yr`GKcnNw|YR1@iN{+fAMmR1f5Ct<73`zjyxJ8
z`*hkJUx>ZUQSb;Ah9eARHd{e80Jgk>M<sDo1VsBH)bpk+CW<Y8H&L!d<nk_UB6?mv
z9v}*PALf4Ev=xML<iCoMw@(8wZ#wC)mr`~209CgV=Kxjn9itvYQQ$a={iq*eXER16
zfRu#74`9df{J*S1T#6<^c>epJ0Ynth!JU7W;EtldKu`Y1RCg8K1;P1$rFvyiFXEMd
zj_U3rayWmO>Q&Acs4fg@)Hws#=Nqyy&sjwpY_iedY#>;kY<Qg&Br9JwRyhBel#G&%
zwa&kTt@#e)URbyRU#`$7GVVvC!}%83Rcw&K-Oi09%_SRg=W^0B+IRpxJ<eIw7-vuz
z`<zdZv<r-z(Ae&riG6;h@gSt_a0W@hMB{sC-0fURJX4H^0oda_Oy*P@6vlndA5o)5
zHugJd%IDY0#zE&#s8J^yhnz1_V}?-&p2N-<iJmEK8*;uuJ+lp3wVoFCv*;RC@C(3-
zPY~+9!tX$JW&VEhw`db#4^S1OL){OO=Z>NU<i$70aTi852wH)IWRx7mjyx!(2|?6b
z(U8>Vm|7vq-iHcTKgqkwAiC=XlGmW>tH77vDC-4|pQ7P-2ar*;D?bdzg3pl+<w+8M
zx2(PeD+~H%wGu)d7m{Hm9{@EEBQ#n#s!^0*1s#qFM8VYkr7Q|4zS6ojw581jI4bf^
z1L|^OMiicuL351*p=h%0PM``LG_T4JL6OrVtItq%2I(x{085;6Wpx%+7gE)J6?Vmy
zw^Fqk)Z+1U%-XlZm*NYzQ}uF)FP^X$)e2Y1FH6udW(|NPugGd4+9k&?BI;PG{#P~j
zqU8H9hKSPaGEwpr+SH@nDESH6i@0qUg_V%#D*45YAQg>*!je~i2Lxh5pz=S#iIU$@
zXZZ%~O5UJq3Hi4T6)DpY<r@)`(g$uQ6rD3l_e)owhn&&_bg-9q67?GrV591HCEx`D
zJVF2&>|e0N>%~|fP-UZzfKq7RaKH_S5M%pb9uvg@H_^mc`j%c+K=KOh&$-=16JswS
zc1_&P#386-;@wQV+k|}mfSb6**oRCqyBRs#guI`L#fTouJbu7U++ys%kev=?(uo6Z
zqKUDS2&st;%>H9iR5%U&h4vvP9y1a9nE1~mQzK9Bcauml_C6EQ%g8U1of>(FiJvyf
z+{47z$Rv$CcEC*{#n^o&cE^L7_<v07N!I=#;LLu8iI-x+loctD*0Q1p$WBc>!N^sl
zNE44S@?H~iKNIgUAvZ8_ADQG-LEgv2(WG4yS2K~a&dQ39!rDUn3MMWk-%FL9hYn~r
zYfZ$%OuQ6~Wn~I^nAz*dBuzZX!~$Y>(R_v?W{(iD{6@^ZLi=%zl+*nwbjSnf0zB0u
z6Gwj!ltTMy%~?V!|C>RE7`ux+u6RX)hRmQ67&C4m(8Fb7Y%hgi+{Y4if-QTMh!-f!
zdYE_$@{G~VU@o+GXkvkbPPwCqctIRH<7$eq^$Nte|FN++8+cOyU_|A6&}m#vi{X4U
z$8Uj8#<JsRV(YoEB^*mN_vcS|oqDe!G2cb^geS2xnyC2!Y7>#8B#djxirWk<XMxUE
zVYK`^dhLU!dq4Fofm>dX&HY<1MoSdnDtlr`7}t@x4}!;im5b=0szN4SOrZH;tRc*^
zz)WI{hKdOd6p|aM|BvXOd`T$KA&i@-jm`5eWIa`;8vHH6Sovb15L2oy!eAnDQzoLa
zj4CcvI;oO}${;FJU)QY9!s1JfHMULIuQuv0HEzgrJbJa~Qe$P_;E5i@rHYlj4CHBu
z2d>cm8NOhjdQuDejS{4}+;f4Xtq~$<`%ipRpqR9g{g1+eX|rL<0*z__(dg(y+@`Dm
z`5KJVl%=S|P?`Fg2ED0G{cOrqd(j>PmKNFv*c>tKtBA`VH3yx;jE!%T*oQ&B7t$^x
z+U46IyQ&;?+Rj4m3#d~V{$9$Sg)kWPVrsn!f!;#<eyvBw3alQ|#;7Gy8Z?H(_*x}3
z()n(ZwO?spW0HL>@Y&iECWO@fom|?tYE(*l=ULYNHfgu^EA2ZqSAiS%E4B7bh{@MY
zI#1-ZYRH5joAtz@VB9)E2=2a(gAqB3%j78ubd_U1Pq_`1xv0p<G5C50U-k_G(^v(7
znPlx(wx2RtuoL*~$d$oz+Gu2D<bGg@+|<i55qX1WdPdQp>XBEzFev3q*B48)FZZx7
zkFhTdPWuuiF!}Nu^2OS(e7V_V#|q%HeWCP#3}2qM`m#3@kvDichc67eT}cy0x35(D
zQi63zj>}9`<hU?6?aP$}CSQEOOtSVXUyhjUm<W8fFTXP(<hZ<O_2pb9B5yEh@nsw4
zPd<I|!WL15FmAsbi_nyP=-NfT450E|_JzUC4E7Wen0%oV(IjiX@}*v@pc$0R>B|i!
zg!JV#t1sb9MBd=OUOC6+sntbz$d{M~wX=EA57-yF0G7Vc*H<|%3~pwy=VkU~4KS0e
z{mPdIOm;j9eD=8f$b^u-oUr=xY9=CYaF4|oI`$cq@+EczCo_5MZzEsm*nfz9VQ|Wq
z%N`^!jms~{7i+)rWfYDP+Gt$*fY0`2stF-|`MuSb#!Q4fV`Uxt3`)(h_&vB2B7GTV
zU#8$1LyilB)4oIqOupOz%p_~S@}<*cM>Fu*z6_WU(w8@_z8uU%<P9FralAaDK`CDn
zw~U5)biDjM%#(9$Kl{SqlrNqI1SVfzAYZKg%9qznc2okN?aNscLi+M&t1shloblKK
zkvDj^WscGDa*+nLE37a!U0R`iIm5n8z@?fT7Y3(&8AV|7<r-inS^JeQO(r|u$mvVS
zgpj^`X!T`hCL(XJ&*BRmFAPfgviV&ykIt1bm?w{ye`H@6obu(epR+H|kT2GL<;x+H
z9fyF=9+zV#g!E<D>dT)p5qX2hEOU%5f*7=f$ED|XT=B_JzeDC(`;~cRSdM6;!})sP
zv(2kDA*6YE`j$Cgf>rUw9$lZ_t7XVb#6tU5v@4?OdXtzZ6e$0WzP|lir5G&8_T`H`
zV)`u_Mi;ZfLszJ8!g$)~TLf)JU@9*4<yGoMxXP4QsZFR%onpC4RacU9L2WOZuC7u)
z%jSrxf6!tp+;|oMfpE>f{90Ug;~HF4eOdQTM(?evS6-Eisu@~K4SF_fK70!iRejp>
z1?ad(b=bDzg6?h;{UJ@qh2HcHSUhQ4=EknTu^4;#MUcM8$ggjg8;bBNCfs9)>eX2K
zT!Y6DK-%PkG?d2uw_#9f=-IHba2g2PVPy?HB38{LHT_$3s|A;DHCz0;^((aGiEs#d
zX!EN=7!BvfA-=Rot;I!wT%@+5A`ff^FJrLh*94{m`yeorto>@XozyC5wmktn&9=cM
zrlo)CC{Sr5gq&@i)`MYECc-wjf%r2{=~WuiCq*GVS3QTvBGQwW*poBt34?E7u;-fu
zCQm%Y@Wk4$JXvS5;&Z@DPoyRM)uq>jke;+!J=vRyunn%w^5md~+|9;a`2|Rko~(dz
za!lf=$T4B?O$_$L2uz;rCr_;X%9G!kthfRAY)^(w2<gc>t0xyC0<`H-vkk7u@}y2f
z_9~<>7OlgSm7cu7o}6Y+7`&dro<#&EPbT6lHp$wrJn1%BF#-5&PX<f~>B(xVCkHbT
zwn4Ailj%R!-SVKVrdy+~FE!|caSt1Ne=BYWRDKResxkYAAbMeP<pcN%V$2>u^C2|l
zc&*ori<oilEHFwhpC>P^{mRQ0%~jyW-4X3&4C~A`O{Xy!4>{8NR<fRF<b5Ut^~Qjq
z)eg*$nYwOdX7hBM&P$H+d|?4R-$Um!+R~oWNmY4XL!JwqxRmELN714$Jks-@XvUQ1
zbv0m=o^J<clC@uXPN!Si$n$r!k>vS1nogcqKu)&jMJD$wo=-7#tzhPLGBD~}!gL;A
z>3Qw1VSsC|L03X`a)_(Q(K<Q2Ri8ms)Y8uN1>@@}$YcrGi<lXcX>}IWf5PN&eendY
zIcXbI1(tC-k*oMkB%Z?*Ke|tDOr&QI7vNMP<9ALorsDUh2pDDj763EJ+OOhg$E6f)
z6u)}lRU|~!WKE~|t<v<tjG430gs{Z#MpM@wW|r~O({B3h8lmF1fjqz0coNXMAnL}L
zAHt+d=qHQDI2phC4-u&Hg^z)3r+aL5HUvPm>z0F9E}xgweGAOx3+q4-Gk%Y{ZM=Mr
zWv6R~nb+a`4Z`=)IrAFqbmuIeFT(Yie}Guqg~m!c)z>eh#>J%Hz8(T*<JXXAKZSN3
z-G{YZ{4wsB%@K2HfcS}QH+pF^?tdFBGWy+!{=CP4u?@CX4d_uSa0_t&2#)u+HJ##p
zMAI|kUG<C!aU#X3j@IcsYh*m<(nRdj5gwH0kU_8FF}^%G{0s)kMYhJJ=gwC(coBm=
z4-=Ty)*FDCWbIdT=WCjaeCY(9eR)dL$(MUHJ;Rr&ubB{*x%0lM>jh@cSar(vp{;zS
z)y1tT_ptf*(+yO4rT#gXFR$JIz`kCg!6{!ayO+S^Yav2B$=a`cZPZ-k>&?KkuNyR-
zeEp23XZqS}LRfs=Yw9}8%o)eYBN`!noljHx`?x(~D>kUQU>0`tDM&78$AqL$!O?{e
z0<jOD(SmAMG-58(r(n13idQhB=~K~XEIf{2{S=?kf@)XbtReGW#!RqXu>f%Tj482S
zaSC+$jG1X$^geXaM;PVCl+p^Py>vq9nhVMe7tVk-k8nD#1PMpWI6To`;9Tfzg7tZt
z4srQqfZ)-=TtOWL)S`-Gs%#fl81_=9llTl!i%L>L=7GtnoB2%@)GTHdxIi|HrV8RH
zodh|x=q#%MD>>wvrDTBVs_{f9*97QQgwd1WO~nO9TJM;&UOehD<88Pu7$<cgtQXW6
zSB+O*UOK+1qTCo=&`|m1#pB5kJW6K&ZHUbHw2+Ai-i2s7RCK60A>B*hiRoTyUSw)Q
z{ijS|G%t1<Yc3#-ZYM@$bOBW-sVW{#IPrYqk~BXK^^1&pY3W+$%`(PQ@<n|GshQfp
z-GGF8@L&25+mTjUIk*_|tC&oJrxEu`C}i%-NIP!Z&w>zqcL7!?5!L3{WJIP0k<q@B
z+FCOZK0r)eI(Arr&HYUkZs#l+-x(=#KE<BNxKS#@j?u<pp*6|I?6g`e*l<kDt<j-K
z4ca_%gNAs1ji9*^DOJKEn9vI{NSI*@GYN#bB7=k(vM5b}!mB~CiJxLf4C8ER3HAo+
zg4(r-ShO}234{}^!CJi9!8d1?e$PXDBvC8VXe9(czOQW$1hOG>nOl6as9MU_jb#8c
z8nuyN9GSjxMZ~)U{9%?}i&SIW1L+QjkQR(gU%ZPV=xYi3J7YDVqbJ@P>Y7o5$6>PG
z8f-%yXGp10k>Nt9OI4ROH7`XCS1^(fRjf;~-k2{KjfU}>0EO!G$2(M4A_8Kos<#K@
zUEOhCpeq_wFcJ}oS0}KGC`yP*#5^AABn)JC;Qa={DC#)qVxji#Xject;&{bDyRWS)
z8V|LFf}{f;T+xyUh2zzs$d$@B!YDyd20v6C=L{R4`t4(k?eE>}xase1cI<E**lyqA
zSgTrhJ07rIHg>x`t_k*H$97GMIIfTw`)DFL9@%d1a9pb>m6Gz{cDtV`7fZ^4ir}^H
z+Avqu|7jJ-J2C7xIrdQL_!jMV(C%hQFLpdG%ZZM!%JQZ-6$<94rFteVW~;^FpF-r2
z6tmTv5wB#(uurtjHEc+sG7uPkCTxF77|U1h-5qz#`1Fle+AgoOO>^bh?5o>g_Yq;6
zZ#X8S`R{jJkw=2<_M^hqW#r*S52*j(j<}{iCv3R;L9|Eic!SD!WaH=J+nPw8*~uHb
z<9BkukoMn6QSC1Yhh(%re22u6b9RSBB~mCE8oT3{7T(dCwNu9Nf6;H7ES*ql<p?O7
zF)5UBu}?;58)&S}{#k7tvHz1bd@|y1w81(CztLmh_yZYlzwL4R1jpk$cG|DQ;9?T|
z(s%=t;ya&p?6G&_Q(<eOXxZMe*FG+1RV)~3^|f{-<eNj9uJ_SPOMKn&D8JJL?<t}8
zlK5lsIvnI%g1)%$#X4|gw<Ox|su5Zs<`K<Dohb0|-Ul%=eD(OT1eS*=y@BL<AJD!Q
zKVFqm+tJk-tW`OgwOBH4t8I<>>+9>PV;#Y8m>WI8NIMooxj@qIG3X{6-f+_Hmb*j*
z%S|NM>ZUwM`o#o(Cy|KbcNNet7nxARhh<j2r-j@j7>of%zh7nf8u!)p^(qzAUES%A
z209jPjD%6`?Cill(p5dPHPW>i)mSX3_A^v5Qx>rxm4KmpmLJtXPyHO~@prVL=<M3m
zLTvtE410og5U4v4?o5!D?yl}YBu;>qP6Y^W?nb4plVHH3GE3EZx~QY2Go+facCrsm
zG6%r86yV=PRP42pvuaY6dUS-hC~d$;<62=ym<)`zMkB2#+v?{8sED=4Xq046EQkoh
z)+m07LXq&dl9INbSQJH1%)g0Xvc#!_`Zx8=uAf2fLyu|%3970Al9lS{*0Anquq_ye
z{5aVe2)Bh3m|RkvluSm+3dK<gYP~&GZINb4wVNocDe<9HAQjK1RQyzX7W%vO&}ho0
zNF+q!0^!XqN>t}&e<-eJLm<2*80`YRdt*ZNMq9f>aH+c6AN6;-!}Zkx{GumVO-I3^
znTWo%gU;=$cMe9DduN#0>Sxu>tAq4-A`)EG9*hK|A^6|!uD%|Vtxdj%s2WF1JQ#>~
zMXT`!A{-E=9}M|?bZXv0zfDRXDst-6F+e1N_YmP0K$kos<$;bK95b!K7#+chIX-`D
zYqWk&x;}FTc*6enSn42_9Dzg>*9zf8XGDZZDXwAUOO1rDE8I#~GBI55U}#%AMJO_d
zghGE;cMIO6B-+AVeq0c>`?|ZxIJJb#oPjmOk9Q<>5q)zoieEd?br9I&!FJiCA3xDb
zf}?dK2=W3Y<7ZUrhf$rkwRW|{&@JE3#J}aD*Eqr9PJuO`8_NI*qU$AJHzu5H$#*u%
zg)c;!<%JVQAd1zFR=Re7*cTRZ;d5s!KO~8c1+}`3a4iS#;kk+=BHG#2DqjzEdm<PK
z^!noJwM`JDS4}KmhzqY|nXin(!Qqc$Bz^MrQa-#@${!OThG--qLis_X6zNVN{flU%
z9r5L%@JD)60T3ZtN+AxNz644Po?Om@kxd~?FcC<EF}4(Yxz^8_g<kz?tnMzvIemQz
zzvR$>NE0s5)fH{kbp#(HE>eNRIVCDu<Qc>Kqc{~_niWz1W}o~49L2pw|Jn_2%fkJu
z7Jsxi5R5=#K=?Mv*zt@bUv&sE&OU)tfSN_>Rb0|6IsM>_Je<<6_1f(3?)Jq3{)jJ<
z=xo9KqIl566%<9BAhbqIs1^!0{%beM6;Q)EK=pHE`1G+fkFYe0+5+-LVRXX7;VXQR
zU=PiGL|aCbESCqQJV&b2u@8%50K<bppl&(I)bZNZNxa?u)_PT7N>OX3DrTu-cKU~l
zzC<Ju3%1UngNBU31llAzgPnAg;^_DHhC}Tg@CP%Yolc_AMu%#mTTSYWA1H?DLO}lR
z5JPWz+ZkkHJmdkSz8Co7bR1$J=>$#d4tnET^p78LML}BBWR*VaA34f_M^w^3w&ZKx
zn{mH`Hbbo8DnH!I6Ym;p9#GxsJu|ka7*})kpK!m9Hbbo9>c46d$y)sYw{F2bE)C?W
zS?TI5rC-p>?oBCO!PRf4l&)j(EX$BTMpUZw{yr_>!L%T*eiP3bX*0x)TxEfFF}S~%
z8*DLfTxEq4r=+C!oB{b-%Fq>DWq$CT=t*;^8k_orHr0k3^;Qh0PoC1Ub78pe<MUw7
zV<V?}=CqOd7?dw4{WHr<E6gV9KM{lQTD|4R*E~GrE!SU7jozoZ+NK?b<=^h*#vQ3q
z_%v5}U>$_pn|F!v0FMigYr4m3(jl#uvrLObDw{ORLo3!0w_3#&>*PdE`h+-68qE^N
z7*6yAmQy^lu$;!pDUmsX`)jS5@?B}m6O`4c>ghcT;Cw_V<?P5Tr7V&2DbI&(!p$n$
z=wUr=(2hJrRZs6(D`yZwBIh$^X=jO?0eL>`ziO>drL`k-Fs)bBSSx24LLz4%W@%@M
zoU?d7>~uUirmaqwhFGBp={>9Dq(F${guyJ;ERItQ&xgHFtNiN7s#zsx)Il8Q_hqSO
zahwr&KJ5EiWj>x3(}oe38RM$Od)CT1aFECud0E<7BIod(54&D#4X3pu_l>o3$yzy2
z4iY(sE=xO0<gB~%VPDo-U)5!f5oeX0)&_B$@|LBV#c`6{`LIiH3z#;qE_0}6m7Eg>
zah#!+rJBWYPTTpg`?bnPbcqp1E|@YWy=SeQ@&$>UQ<kNjC2|(p`LHanR-XWX<z%od
zxtzAeSy{kx_SJ~k_w;F+R<7O$xWy6-u|Dmhs&P`n1f5r82tQXjMP!tC*Z!CCX%BF_
z3(q2G1C*<r)WJ{%Nbj??wB*z>I+vCf*7CGgY!jOW*E2MOJ*`vKvNZ>CYOctoIbhX{
zllZ|qa;gUZC#!DFsd`K<)vY;HAIhy7{8rVk>(NZ=(k2EsSTUSmc8ShS!t~-yu@U8Q
zZrS;;H)(md=n}fuT6N!``@Wvm&ADA!x>+7)gF*L**k5URze(%PI*Yxc`*=D-9%o}^
z>1KJHzjZ$BMm;-u;NZ8{>UU6McnU!t=TBwnW_g@(bv`U7*!XoS4!%cm_4yQO6<6tZ
zOth)5R@~wv1>c)gcY2>Kes#K2)i~3N)tW5eq^TT&IsNHHJcFbSTwGm{wwtR9GwkLT
z8}~Pf&$65Iq*$%V0#13#A(#_LY&!h{TwL9lBCX<TSd&z|xy4C6f0KB>icfma`AMwS
zWC5od<q*sd-*)Tl7;tg*)f8zJS6|a4WjD9j_Z;Fmi{`;p4AyYfnHu3$TrJilC7xU5
zX<*u0Rm&Ei6&6mtVYP6OtDHrX!*0%qX-Mf_!_{x5NUON|aE9I7;@p$JNqp8c<TM&q
z3%j|>88JBo^OthIQsD29@9V1z{y8Rple7kuf4azDU#;H_t$0pT&)471tX~wZc%E*2
zlIFWM{EgO%|Kv8kv3?$&l3O=B9-Z^wu`u`9@yI-$6Q7rQ-%KVxKlPrOO!}zQ`)D%h
z1%ltxl1X=n4S1k#-SB<@yb9GyMG}xGO1*E!+-nz3^#*h^5p}yL7JW!yvTk;~ou@%x
z1DgBn$Rp+GXVS}3@A1i`yTl4g{jh9yQJ(s{ALcGQ5~B`LQj}%0i;C2Hfy`ZYF(&ms
zpiKH${5=!5nI!9WF;4LNd(547dRvb>7rqkYT=X19LZDqB8#c#g7f6`PvDs5UgU;-<
z3*;K+*z5wyc{w(_xHt!u>~Uw&4N&sLJHyrv8m3(IOF*|_-1q4FoXj^x(~nu`^EBP9
zmwm=BN5v57c#<8o(RT?W$2jf;o%C+7$k`+D+4Er}`yY|`(c;Y<Ly*hgCxAzKTGqV!
zsg#pFuU^r3{yP@3iQOM%C3~L#6?7xVJgh)KM#J9*eZ25;bBUz0)O0x;^m4?r!R?kr
zWZTH?YL=rfb0}W4U8m{1nszI)SO-NUhhJ2|hyBgKyEVpK(EDQCIgXbE@Fa&{<$$K#
zK7)!rk(<Sc&ucut4}$IAtLbaqZi#1m?$h)Q7W&sTonMu}_=h#UTjN>&A<$j0-=*nn
z)b}->-^0Lip3w9@jc5APpy%@IB1z9)59xh}x$tj-PI0^2t^8#>-yZ=#T8|gMKZWrZ
zgPu$NOpV{8<+J_sCB8H(k9V=g@3Yv`r0Mrt=*zVnz6j^}wpQc!TlD%h{h)>3rsbT_
z$NQijnA;_tM=4!?)lhLeWRbH+%Rg+Pe`y3c-_-aa3;q!3G*5)4v*sr>{y9zO_|uz_
zXdI7O<h-Qm$1U{NHT?w3?^7DyWcq%$B=C6sS=0GFFHHXkbcIy>ye~wSAK7!-BBzY#
zNCb7TUd=F3;?w7A_Um%cX<gzkcKlPfxmx}ii~Oq?ufL=QwfyTPovllk0WC)ya7*F}
z-SuOXF4*sSP|-D6bZPvG1B%XZ_&3CX?BuJ$%QZtE@KTP&-`$`Sp08(ITF{`Dzd_4q
zd%hy^TBxZwr16I|p6z)|(+^wdk4rhFS-%x{R^x{>-lcW?Leq~~<h(BBXppHmO?Zs2
zb$yF#Imfje9^a2acboXRIt~DuKd)Qx^b#wgpRmxUfj*g;O=Xj`GyD2;sg`qE%h9Jj
zK`&z?`TUY2j^|CF=UU%JN{$Ug@gCenTO0CZ(|-la7w`1pF?$5rxOl4F<!cXjwIF?<
z71=T|pFhzf0$rWm;b0sIsdMMdpO+g%f2u3w^GBopULQU4??uWOUfAnvO>}l5Z31Z*
zsTOi3Lh%f!&$n!4<8@1Yco(maQaCaI$j?}E^YX^)ny$|5QF$vqAO4n{Z>d*H@Ge;?
zeAhHzzqqm4cm1+utCp_zt!`Y5gca~2S2o}a#1hg}nbFqJz<)A~{y3N=V}X(}WcH8D
zf6%EI=lcU<DkVhbOk@gHnK3FACiS3RXVj>-6Z?WP9Z#j6=u{k)`j9C}rRQYQ(yu<Y
zW@n^vvohb7tWprt-Q>MWJCeRqCW+1?QW*;}FNG2|r1kWIWtq;9kr*PQE5}6sMYc@g
zG7VCt7G`$n*D_N!1zzB+Q<2i|cs8d*a1ICpoRI*eQeDy+OBsK~O}+0K;$vMtybZdQ
z(kUn@gwnKql-!Y-bfZ%yO>>2&qR#2ewA`G4la&R7ETmX0<qono5V%x6hfD*q2%|(8
znIj`(s`E-S61!5b(^h{bFO%Qr!=JL_!-xLHUCuvTr*lOzGJ!Jw%wDF@R0>n-1>PAc
zRcSE&y**JA>+M8V5z2T}l^tB8WVC3vsEHs4ucmQvQ+3?mu3PwFK@BBrwN~RL-Zj+f
z@W(pv>+s$P<f#%#wX$bZFe<Zp%ncv<qQS7A7<8>W92Yfm*lJL%Y41WaPH97Uxp*W5
zP?tQ`YJwelQR+apQmUm`)#{`;xDE~eP9#s&D0U36sKLtEiPesN&EAq|2htyD$1<v$
zp-5X7*ZnOm(cmU-QC1_@5hbot=B?({l1%4}|4o&?<{9|lazJi;pNC8Q<Sb(&ta-Zx
z4Z0=5{Cq!%OSc(}I!;R&TRmE4e!g$Sr6&WO&A$Zwc*S`a>))p<T(bT7S`MEdSuB0e
zrbqwGe^^(z+@~F;hm}-iem?)+2pUe=DSm!`GM8_m&%9aVM|mLhE@IZt_qDj3teM$<
z#&Nk3{q$~QZu9*wE*mtz)qYJDTTr1fVSc_(#^sO(vVN=oJ2n4ejicYm(#9pf@0s}N
z8IYC#^Po{o_+c==@0v?}Sqy~Q$X4^#kA|6_@6T~LWCo+o*NI&2wea(OJTBQ@7Q<~W
zzhdF%>tinetnUglKkMc8*DU-?^u*#)CmAt3>-hPm=I8O_`-fa6Q=F>8b~E{3Ed18L
zGaP52F0JwZk%gb{M{?P~hG~AQe)`^0iVqz_wDEl*{`<pLJh!?0C3q;r%+G%ZwfiEH
zmY#2_m2NZtZ_q(=kNNpNDF1!u)B1qr`N@1dj{Cq%5a#Fmr5*YfBlVj%=I4^`dztxD
z-=E!zi}kP!=HvEhREWm<-EOz+a;ZX`-X@!=ohGN+e?g7(*QF?`!u-=Lbi&fc<vCMb
zcIe%Fx2iPgHW$pt?Gg~IH0tYhE5r(2s#Z33{0RI%ZcrREI51j(RoA5>@SnX}F<iq7
z-w6COHNQ1}i<=a~0n31~elD*ZLI3Y=Q2bYwYsDkzZyJGrazHU0%Vf`rHQ8Y~L?y==
z&7Y)`Beu!-%)p1;sU#w{LVPSCmh=-{vJg?rg28^PUi$8un}4NVcy6*VSeoL00P(di
AKmY&$

diff --git a/src/splice_offline_beams.c b/src/splice_offline_beams.c
deleted file mode 100644
index a70a258..0000000
--- a/src/splice_offline_beams.c
+++ /dev/null
@@ -1,132 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <netinet/in.h>
-#include <time.h>
-#include <arpa/inet.h>
-#include <sys/syscall.h>
-#include <syslog.h>
-
-#include <src/sigproc.h>
-#include <src/header.h>
-
-FILE *output;
-
-void send_string(char *string) /* includefile */
-{
-  int len;
-  len=strlen(string);
-  fwrite(&len, sizeof(int), 1, output);
-  fwrite(string, sizeof(char), len, output);
-}
-
-void send_float(char *name,float floating_point) /* includefile */
-{
-  send_string(name);
-  fwrite(&floating_point,sizeof(float),1,output);
-}
-
-void send_double (char *name, double double_precision) /* includefile */
-{
-  send_string(name);
-  fwrite(&double_precision,sizeof(double),1,output);
-}
-
-void send_int(char *name, int integer) /* includefile */
-{
-  send_string(name);
-  fwrite(&integer,sizeof(int),1,output);
-}
-
-void send_char(char *name, char integer) /* includefile */
-{
-  send_string(name);
-  fwrite(&integer,sizeof(char),1,output);
-}
-
-
-void send_long(char *name, long integer) /* includefile */
-{
-  send_string(name);
-  fwrite(&integer,sizeof(long),1,output);
-}
-
-void send_coords(double raj, double dej, double az, double za) /*includefile*/
-{
-  if ((raj != 0.0) || (raj != -1.0)) send_double("src_raj",raj);
-  if ((dej != 0.0) || (dej != -1.0)) send_double("src_dej",dej);
-  if ((az != 0.0)  || (az != -1.0))  send_double("az_start",az);
-  if ((za != 0.0)  || (za != -1.0))  send_double("za_start",za);
-}
-
-int main(int argc, char * argv[]) {
-
-  // memory
-  uint64_t bsize = 2013265920, bls = 94371840;
-  unsigned char * allbeams = (unsigned char *)malloc(sizeof(unsigned char)*bsize);  
-  memset(allbeams,0,bsize);
-  unsigned char * data = (unsigned char *)malloc(sizeof(unsigned char)*bls);  
-  FILE *fin;
-  
-  // load in data if present
-  for (int i=0;i<16;i++) {
-
-    if (strcmp(argv[i+1],"none")!=0) {
-    
-      fin=fopen(argv[i+1],"rb");
-      fread(data,sizeof(unsigned char),bls,fin);
-      fclose(fin);      
-      
-      for (int ibeam=0;ibeam<256;ibeam++) {
-	for (int itime=0;itime<15*512;itime++) {
-	  for (int ich=0;ich<48;ich++) {
-	    allbeams[ibeam*15*512*1024 + itime*1024 + i*48 + ich + 128] = data[itime*256*48 + ibeam*48 + ich];
-	  }
-	}
-      }
-    }
-    
-  }
-
-  // make files
-
-  char cmd[300], foutnam[400];
-  sprintf(cmd,"mkdir -p %s_%s",argv[17],argv[18]);
-  system(cmd);
-
-  for (int i=0;i<256;i++) {
-	  
-    sprintf(foutnam,"%s_%s/%s_%d.fil",argv[17],argv[18],argv[18],i);
-    output = fopen(foutnam,"wb");
-    
-    send_string("HEADER_START");
-    send_string("source_name");
-    send_string(argv[18]);
-    send_int("machine_id",1);
-    send_int("telescope_id",82);
-    send_int("data_type",1); // filterbank data
-    send_double("fch1",1530.0); // THIS IS CHANNEL 0 :)
-    send_double("foff",-0.244140625);
-    send_int("nchans",1024);
-    send_int("nbits",8);
-    send_double("tstart",55000.0);
-    send_double("tsamp",8.192e-6*8.*4.);
-    send_int("nifs",1);
-    send_string("HEADER_END");
-	  
-    fwrite(allbeams + i*15*512*1024,sizeof(unsigned char),15*512*1024,output);
-	  
-    fclose(output);
-	  
-  }
-
-  
-  free(allbeams);
-  free(data);
-
-}
diff --git a/src/test_read.c b/src/test_read.c
deleted file mode 100644
index 2b5730a..0000000
--- a/src/test_read.c
+++ /dev/null
@@ -1,279 +0,0 @@
-/* will reorder raw data for input to xgpu */
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-#include <x86intrin.h>
-#include <smmintrin.h>
-#include <immintrin.h>
-
-#define S 4096
-
-/* global variables */
-int DEBUG = 0;
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write)
-{
-
-  if (write==0) {
-  
-    if (dada_hdu_unlock_read (in) < 0)
-      {
-	syslog(LOG_ERR, "could not unlock read on hdu_in");
-      }
-    dada_hdu_destroy (in);
-
-  }
-
-  if (write==1) {
-
-    if (dada_hdu_unlock_write (in) < 0)
-      {
-	syslog(LOG_ERR, "could not unlock write on hdu_in");
-      }
-    dada_hdu_destroy (in);
-
-  }
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_reorder_raw [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -t number of threads [default 4]\n"
-	   " -b connect to bf hdu\n"
-	   " -i input key [default CAPTURED_BLOCK_KEY]\n"
-	   " -o output key [default REORDER_BLOCK_KEY]\n"
-	   " -q quitting after testing\n"
-	   " -h print usage\n");
-}
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-  
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("test_read", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-
-  // TESTING and initialization
-  // threads
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-
-  // data block HDU keys
-  key_t in_key = CAPTURED_BLOCK_KEY;
-  key_t out_key = REORDER_BLOCK_KEY;
-  key_t out_key2 = REORDER_BLOCK_KEY2;
-  
-  // command line arguments
-  int core = -1;
-  int nthreads = 1;
-  int bf = 0;
-  int arg = 0;
-  
-  while ((arg=getopt(argc,argv,"c:t:i:o:dbqh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 't':
-	  if (optarg)
-	    {
-	      nthreads = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-t flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-
-	case 'q':
-	  syslog (LOG_INFO, "Quit here");
-	  return EXIT_SUCCESS;
-	  
-	case 'b':
-	  bf=1;
-	  syslog (LOG_INFO, "Will write to bf dada hdu");
-	  break;
-
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      
-      
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      return EXIT_FAILURE;
-    }
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t  bytes_read = 0;
-  char * block, * output_buffer;
-  uint64_t written, block_id;
-
-  // set up
-
-  int observation_complete=0;
-  int blocks = 0;
-  int started = 0;
-
-
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-    }
-
-    // DO STUFF
-    for (int i=0;i<S;i++)
-      syslog(LOG_INFO,"TEST %d %hi",i,block[i]);
-    
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-
-  dsaX_dbgpu_cleanup (hdu_in,0);
-  
-}
-
-
diff --git a/src/test_write.c b/src/test_write.c
deleted file mode 100644
index 32dd25d..0000000
--- a/src/test_write.c
+++ /dev/null
@@ -1,452 +0,0 @@
-/* will reorder raw data for input to xgpu */
-#define __USE_GNU
-#define _GNU_SOURCE
-#include <sched.h>
-#include <time.h>
-#include <sys/socket.h>
-#include <math.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sched.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <syslog.h>
-
-
-#include "sock.h"
-#include "tmutil.h"
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "ipcio.h"
-// Forward declaration to keep compiler happy
-// Possible minor bug in PSRDada
-int ipcio_check_pending_sod (ipcio_t* );
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_capture.h"
-#include "dsaX_def.h"
-
-#include <x86intrin.h>
-#include <smmintrin.h>
-#include <immintrin.h>
-
-#define S 4096
-
-// data to pass to threads
-struct data {
-  char * in;
-  int n_threads;
-  int thread_id;
-  ipcio_t * out;
-};
-
-/* global variables */
-int DEBUG = 0;
-int cores[16] = {4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29};
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write);
-int dada_bind_thread_to_core (int core);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, int write)
-{
-
-  if (write==0) {
-  
-    if (dada_hdu_unlock_read (in) < 0)
-      {
-	syslog(LOG_ERR, "could not unlock read on hdu_in");
-      }
-    dada_hdu_destroy (in);
-
-  }
-
-  if (write==1) {
-
-    if (dada_hdu_unlock_write (in) < 0)
-      {
-	syslog(LOG_ERR, "could not unlock write on hdu_in");
-      }
-    dada_hdu_destroy (in);
-
-  }
-  
-}
-
-void usage()
-{
-  fprintf (stdout,
-	   "dsaX_reorder_raw [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -t number of threads [default 4]\n"
-	   " -b connect to bf hdu\n"
-	   " -i input key [default CAPTURED_BLOCK_KEY]\n"
-	   " -o output key [default REORDER_BLOCK_KEY]\n"
-	   " -q quitting after testing\n"
-	   " -h print usage\n");
-}
-
-/* thread for data massaging */
-void * massage(void *args) {
-
-  // basic stuff
-  struct data *d = args;
-  int thread_id = d->thread_id;
-
-  
-  // set affinity
-  const pthread_t pid = pthread_self();
-  const int core_id = cores[thread_id];
-  cpu_set_t cpuset;
-  CPU_ZERO(&cpuset);
-  CPU_SET(core_id, &cpuset);
-  const int set_result = pthread_setaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (set_result != 0)
-    syslog(LOG_ERR,"thread %d: setaffinity_np fail",thread_id);
-  const int get_affinity = pthread_getaffinity_np(pid, sizeof(cpu_set_t), &cpuset);
-  if (get_affinity != 0) 
-    syslog(LOG_ERR,"thread %d: getaffinity_np fail",thread_id);
-  if (CPU_ISSET(core_id, &cpuset))
-    if (DEBUG) syslog(LOG_DEBUG,"thread %d: successfully set thread",thread_id);
-
-  // extract from input data structure
-  char *in = (char *)d->in;
-  //char *out = (char *)d->out;
-  int nthreads = d->n_threads;  
-  
-  // place in out
-  int i = thread_id*(S/nthreads);
-  //syslog(LOG_INFO,"thread %d: %d",thread_id,i);
-  memcpy (d->out->curbuf + i, in + i, S/nthreads);  
-  
-  /* return 0 */
-  int thread_result = 0;
-  pthread_exit((void *) &thread_result);
-  
-}
-
-
-// MAIN
-
-int main (int argc, char *argv[]) {
-  
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("test_write", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-
-  // TESTING and initialization
-  // threads
-  struct data args[16];
-  pthread_t threads[16];
-  pthread_attr_t attr;
-  pthread_attr_init(&attr);
-  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-  void* result=0;
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-  dada_hdu_t* hdu_out2 = 0;
-
-  // data block HDU keys
-  key_t in_key = CAPTURED_BLOCK_KEY;
-  key_t out_key = REORDER_BLOCK_KEY;
-  key_t out_key2 = REORDER_BLOCK_KEY2;
-  
-  // command line arguments
-  int core = -1;
-  int nthreads = 1;
-  int bf = 0;
-  int arg = 0;
-  
-  while ((arg=getopt(argc,argv,"c:t:i:o:dbqh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }	  
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 't':
-	  if (optarg)
-	    {
-	      nthreads = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-t flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-
-	case 'd':
-	  DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-
-	case 'q':
-	  syslog (LOG_INFO, "Quit here");
-	  return EXIT_SUCCESS;
-	  
-	case 'b':
-	  bf=1;
-	  syslog (LOG_INFO, "Will write to bf dada hdu");
-	  break;
-
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-
-  hdu_out  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-  
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
-      
-      
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-      //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-      return EXIT_FAILURE;
-    }
-
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-      //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in,0);
-      dsaX_dbgpu_cleanup (hdu_out,1);
-      if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-      //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-      return EXIT_FAILURE;
-    }
-
-  if (bf) {
-    header_out = ipcbuf_get_next_write (hdu_out2->header_block);
-    if (!header_out)
-      {
-	syslog(LOG_ERR, "could not get next header2 block [output]");
-	dsaX_dbgpu_cleanup (hdu_in,0);
-	dsaX_dbgpu_cleanup (hdu_out,1);
-	if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);      
-	//dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-	return EXIT_FAILURE;
-      }
-    memcpy (header_out, header_in, header_size);
-    if (ipcbuf_mark_filled (hdu_out2->header_block, header_size) < 0)
-      {
-	syslog (LOG_ERR, "could not mark header block2 filled [output]");
-	dsaX_dbgpu_cleanup (hdu_in,0);
-	dsaX_dbgpu_cleanup (hdu_out,1);
-	if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);
-	//dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-	return EXIT_FAILURE;
-      }
-  }
-
-  
-  // record STATE info
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
-  uint64_t  bytes_read = 0;
-  char * block, * output_buffer, * blockie;
-  output_buffer = (char *)malloc(sizeof(char)*block_out);
-  memset(output_buffer,1,block_out);
-  uint64_t written, block_id;
-
-  // set up
-
-  int observation_complete=0;
-  int blocks = 0;
-  int started = 0;
-
-
-  
-  syslog(LOG_INFO, "starting observation");
-
-  while (!observation_complete) {
-
-    // open block
-    block = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    if (started==0) {
-      syslog(LOG_INFO,"now in RUN state");
-      started=1;
-    }
-
-    // DO STUFF
-
-    // sort out write
-    hdu_out->data_block->curbuf = ipcbuf_get_next_write ((ipcbuf_t*)hdu_out->data_block);
-    hdu_out->data_block->marked_filled = 0;      
-    //blockie = ipcio_open_block_write (hdu_out->data_block, &block_id);
-    
-    // set up data structure
-    for (int i=0; i<nthreads; i++) {
-      args[i].in = output_buffer;
-      args[i].n_threads = nthreads;
-      args[i].thread_id = i;
-      args[i].out = hdu_out->data_block;
-    }
-
-    if (DEBUG) syslog(LOG_DEBUG,"creating %d threads",nthreads);
-    
-    for(int i=0; i<nthreads; i++){
-      if (pthread_create(&threads[i], &attr, &massage, (void *)(&args[i]))) {
- 	syslog(LOG_ERR,"Failed to create massage thread %d\n", i);
-      }
-    }
-
-    pthread_attr_destroy(&attr);
-    if (DEBUG) syslog(LOG_DEBUG,"threads kinda running");
-    
-    for(int i=0; i<nthreads; i++){
-      pthread_join(threads[i], &result);
-      if (DEBUG) syslog(LOG_DEBUG,"joined thread %d",i);
-    }
-    
-    // write to output
-
-    //written = ipcio_write (hdu_out->data_block, output_buffer, block_out);
-    
-    // finish write
-    ipcbuf_mark_filled ((ipcbuf_t*)hdu_out->data_block, block_out);
-    ipcio_check_pending_sod (hdu_out->data_block);
-    hdu_out->data_block->marked_filled = 1;      
-    //ipcio_close_block_write(hdu_out->data_block, block_out);
-    
-    if (DEBUG) syslog(LOG_DEBUG, "written block %d",blocks);      
-    blocks++;
-    
-
-    if (bytes_read < block_size)
-      observation_complete = 1;
-
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-
-  }
-
-  free(output_buffer);
-
-  dsaX_dbgpu_cleanup (hdu_in,0);
-  dsaX_dbgpu_cleanup (hdu_out,1);
-  if (bf) dsaX_dbgpu_cleanup (hdu_out2,1);	  
-  //dsaX_dbgpu_cleanup (hdu_in, hdu_out, hdu_out2);
-  
-}
-
-

From 75ee37b6eaa059047ce032d82add58dee686cc5a Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Sat, 15 Jun 2024 22:48:15 -0700
Subject: [PATCH 12/30] Remove backups

---
 legacy/11_planar_complex_array.cu~ | 628 -----------------------------
 legacy/CMakeLists.txt~             | 120 ------
 legacy/dsaX_cutlass_interface.cu~  | 315 ---------------
 legacy/dsaX_cutlass_interface.h~   | 174 --------
 legacy/planar_complex.cu~          |  85 ----
 5 files changed, 1322 deletions(-)
 delete mode 100644 legacy/11_planar_complex_array.cu~
 delete mode 100644 legacy/CMakeLists.txt~
 delete mode 100644 legacy/dsaX_cutlass_interface.cu~
 delete mode 100644 legacy/dsaX_cutlass_interface.h~
 delete mode 100644 legacy/planar_complex.cu~

diff --git a/legacy/11_planar_complex_array.cu~ b/legacy/11_planar_complex_array.cu~
deleted file mode 100644
index 23722b0..0000000
--- a/legacy/11_planar_complex_array.cu~
+++ /dev/null
@@ -1,628 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Planar Complex Array Example
-
-  This example demonstrates the CUTLASS Library's exposure of planar complex GEMM kernels which
-  execute a batch of matrix products, loading problem sizes and matrix base pointers from arrays
-  in global memory.
-
-  These kernels represent complex matrices by storing the real and imaginary parts of the matrix in
-  disjoint regions in memory. These real-valued matrices are stored using existing cuBLAS layouts
-  as either column-major or row-major layouts with a single leading dimension indicating the stride
-  between columns or rows.
-
-  The CUTLASS Library collects multiple template instantiations in a data structure and offers
-  a BLAS-like dispatch API to invoke the appropriate kernel on the Volta or Turing architectures.
-
-  CUTLASS decouples matrix layout from complex transformation, so four possible transformations
-  are possible on the A and B operands:
-
-    n:  column-major
-    c:  column-major complex conjugate
-    t:  row-major
-    h:  row-major complex conjugate
-
-  To build strictly the planar complex kernels needed for general application, execute the following
-  CMake command in an empty build directory.
-
-    $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
-      -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex
-
-  This builds all planar complex GEMM variants for Volta and Turing architectures.
-
-  To build strictly the kernels needed for this example, an even narrower filter string may be
-  specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for
-  the 'CN' layout configuration (conjugate A operand with both A and B as column-major).
-
-    $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
-      -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_array_f16*cn
-
-    $ make 11_planar_complex_array
-
-    $ ./examples/11_planar_complex_array/11_planar_complex_array --m=2048 --n=1024 --k=512 --batch=10
-*/
-
-#include <iostream>
-#include <sstream>
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/util/command_line.h"
-#include "cutlass/util/distribution.h"
-#include "cutlass/util/device_memory.h"
-#include "cutlass/util/tensor_view_io.h"
-#include "cutlass/util/host_tensor_planar_complex.h"
-
-#include "cutlass/util/reference/device/tensor_fill.h"
-
-#include "cutlass/util/reference/device/gemm_planar_complex.h"
-#include "cutlass/util/reference/device/tensor_compare.h"
-
-#include "cutlass/library/handle.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Result structure
-struct Result {
-
-  double runtime_ms;
-  double gflops;
-  cutlass::Status status;
-  cudaError_t error;
-  bool passed;
-
-  //
-  // Methods
-  //
-
-  Result(
-    double runtime_ms = 0,
-    double gflops = 0,
-    cutlass::Status status = cutlass::Status::kSuccess,
-    cudaError_t error = cudaSuccess
-  ):
-    runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { }
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Command line options parsing
-struct Options {
-
-  bool help;
-
-  cutlass::gemm::GemmCoord problem_size;
-  int batch_count;
-  cutlass::complex<float> alpha;
-  cutlass::complex<float> beta;
-
-  bool reference_check;
-  int iterations;
-  
-  Options():
-    help(false),
-    problem_size({1024, 1024, 1024}),
-    batch_count(1),
-    reference_check(true),
-    iterations(20),
-    alpha(1),
-    beta() { }
-
-  bool valid() {
-    return true;
-  }
-
-  // Parses the command line
-  void parse(int argc, char const **args) {
-    cutlass::CommandLine cmd(argc, args);
-
-    if (cmd.check_cmd_line_flag("help")) {
-      help = true;
-    }
-
-    cmd.get_cmd_line_argument("m", problem_size.m());
-    cmd.get_cmd_line_argument("n", problem_size.n());
-    cmd.get_cmd_line_argument("k", problem_size.k());
-    cmd.get_cmd_line_argument("batch", batch_count);
-
-    cmd.get_cmd_line_argument("alpha", alpha.real());
-    cmd.get_cmd_line_argument("alpha_i", alpha.imag());
-    cmd.get_cmd_line_argument("beta", beta.real());
-    cmd.get_cmd_line_argument("beta_i", beta.imag());
-    
-    cmd.get_cmd_line_argument("iterations", iterations);
-  }
-
-  /// Prints the usage statement.
-  std::ostream & print_usage(std::ostream &out) const {
-
-    out << "11_planar_complex_array example\n\n"
-      << "  This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n"
-      << "Options:\n\n"
-      << "  --help                      If specified, displays this usage statement.\n\n"
-      << "  --m=<int>                   GEMM M dimension\n"
-      << "  --n=<int>                   GEMM N dimension\n"
-      << "  --k=<int>                   GEMM K dimension\n"
-      << "  --batch=<int>               Number of GEMM operations executed in one batch\n"
-      << "  --alpha=<f32>               Epilogue scalar alpha (real part)\n"
-      << "  --alpha_i=<f32>             Epilogue scalar alpha (imaginary part)\n"
-      << "  --beta=<f32>                Epilogue scalar beta (real part)\n\n"
-      << "  --beta_i=<f32>              Epilogue scalar beta (imaginary part)\n\n"
-      << "  --iterations=<int>          Number of profiling iterations to perform.\n";
-
-    out << "\n\nExamples:\n\n"
-      << "$ ./examples/11_planar_complex_array/11_planar_complex_array\n\n";
-
-    return out;
-  }
-
-  /// Compute performance in GFLOP/s
-  double gflops(double runtime_s) const {
-
-    // Number of real-valued multiply-adds 
-    int64_t fmas = problem_size.product() * batch_count * 4;
-    
-    // Two flops per multiply-add
-    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Performance test environment for planar complex
-class TestbedPlanarComplex {
-public:
-
-  // Half-precision input and output
-  using Element = cutlass::half_t;
-
-  // Configurations for layouts and internal computation
-  using LayoutA = cutlass::layout::ColumnMajor;
-  using LayoutB = cutlass::layout::ColumnMajor;
-  using LayoutC = cutlass::layout::ColumnMajor;
-  using ElementCompute = float;
-  using ElementAccumulator = float;
-
-  //
-  // Data members
-  //
-
-  cutlass::library::Handle handle;
-
-  cutlass::gemm::GemmCoord problem_size;
-  int batch_count;
-  cutlass::DeviceAllocation<Element> tensor_A;
-  cutlass::DeviceAllocation<Element> tensor_B;
-  cutlass::DeviceAllocation<Element> tensor_C;
-  cutlass::DeviceAllocation<Element> tensor_D;
-  cutlass::DeviceAllocation<Element> tensor_D_ref;
-
-  cutlass::DeviceAllocation<void *> ptr_A_real;
-  cutlass::DeviceAllocation<void *> ptr_A_imag;
-  cutlass::DeviceAllocation<void *> ptr_B_real;
-  cutlass::DeviceAllocation<void *> ptr_B_imag;
-  cutlass::DeviceAllocation<void *> ptr_C_real;
-  cutlass::DeviceAllocation<void *> ptr_C_imag;
-  cutlass::DeviceAllocation<void *> ptr_D_real;
-  cutlass::DeviceAllocation<void *> ptr_D_imag;
-
-  //
-  // Methods
-  //
-
-  TestbedPlanarComplex(
-    Options const &options
-  ): 
-    problem_size(options.problem_size), batch_count(options.batch_count) {
-
-    // Allocate device memory for batched planar complex GEMM
-    tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2);
-    tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2);
-    tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
-    tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
-    tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
-
-    ptr_A_real.reset(batch_count);
-    ptr_A_imag.reset(batch_count);
-    ptr_B_real.reset(batch_count);
-    ptr_B_imag.reset(batch_count);
-    ptr_C_real.reset(batch_count);
-    ptr_C_imag.reset(batch_count);
-    ptr_D_real.reset(batch_count);
-    ptr_D_imag.reset(batch_count);
-
-  }
-
-  void initialize_rand() {
-
-    uint64_t seed = 1073;
-
-    // Use small integers to simplify correctness checking
-    int scope_max = 6;
-    int scope_min = -6;
-
-    cutlass::reference::device::BlockFillRandomUniform(
-        tensor_A.get(), tensor_A.size(), seed, Element(scope_max), Element(scope_min), 0);
-
-    cutlass::reference::device::BlockFillRandomUniform(
-        tensor_B.get(), tensor_B.size(), seed * 2019, Element(scope_max), Element(scope_min), 0);
-
-    cutlass::reference::device::BlockFillRandomUniform(
-        tensor_C.get(), tensor_C.size(), seed * 2020, Element(scope_max), Element(scope_min), 0);
-  }
-
-  Result profile(Options const &options) {
-
-    Result result;
-
-    initialize();
-
-    Element *ptr_A = tensor_A.get();
-    Element *ptr_B = tensor_B.get();
-    Element *ptr_C = tensor_C.get();
-    Element *ptr_D = tensor_D.get();
-
-    int64_t batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2;
-    int64_t batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2;
-    int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2;
-    int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2;
-
-    typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
-    typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
-    typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
-    typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
-
-
-    int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
-    int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n();
-    int64_t imag_stride_C = int64_t(problem_size.m()) * problem_size.n();
-    int64_t imag_stride_D = int64_t(problem_size.m()) * problem_size.n();
-    
-    //
-    // Configure pointers in global memory
-    //
-
-    struct {
-      Element *base;
-      void **ptr_real;
-      void **ptr_imag;
-      int64_t batch_stride;
-      int64_t imag_stride;
-    } tensors[] = {
-      { tensor_A.get(), ptr_A_real.get(), ptr_A_imag.get(), batch_stride_A, imag_stride_A},
-      { tensor_B.get(), ptr_B_real.get(), ptr_B_imag.get(), batch_stride_B, imag_stride_B},
-      { tensor_C.get(), ptr_C_real.get(), ptr_C_imag.get(), batch_stride_C, imag_stride_C},
-      { tensor_D.get(), ptr_D_real.get(), ptr_D_imag.get(), batch_stride_D, imag_stride_D}
-    };
-
-    for (auto const &tensor : tensors) {
-      for (int idx = 0; idx < batch_count; ++idx) {
-
-        void *ptr_real = tensor.base + idx * tensor.batch_stride;
-        void *ptr_imag = tensor.base + idx * tensor.batch_stride + tensor.imag_stride;
-
-        cudaError_t error = cudaMemcpy(
-          tensor.ptr_real + idx,
-          &ptr_real,
-          sizeof(void *),
-          cudaMemcpyHostToDevice);
-
-        if (error != cudaSuccess) {
-          throw std::runtime_error("Failed to copy pointer to device memory");
-        }
-
-        error = cudaMemcpy(
-          tensor.ptr_imag + idx,
-          &ptr_imag,
-          sizeof(void *),
-          cudaMemcpyHostToDevice);
-
-        if (error != cudaSuccess) {
-          throw std::runtime_error("Failed to copy pointer to device memory");
-        }
-      }
-    }
-
-    //
-    // Construct events
-    //
-
-    cudaEvent_t events[2];
-
-    for (auto & event : events) {
-      result.error = cudaEventCreate(&event);
-      if (result.error != cudaSuccess) {
-        std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
-        return -1;
-      }
-    }
-
-    // Record an event at the start of a series of GEMM operations
-    result.error = cudaEventRecord(events[0]);
-    if (result.error != cudaSuccess) {
-      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
-      return result;
-    }
-
-    //
-    // Run profiling loop
-    //
-
-    for (int iter = 0; iter < options.iterations; ++iter) {
-
-      //
-      // Execute the planar complex array GEMM kernel via the CUTLASS Library's
-      // dispatch routines.
-      //
-      // Note, for planar complex array GEMM kernels, all numeric type arguments 
-      // specify the data type of the base real types. These are understood to
-      // apply to planar complex representations of matrices in memory and to complex<T>
-      // structures for scalars.
-      //
-      // See tools/library/include/cutlass/library/handle.h for more details.
-      //
-
-      result.status = handle.gemm_planar_complex_array(
-
-        problem_size.m(),                                 // expected GEMM M dimension
-        problem_size.n(),                                 // expected GEMM N dimension
-        problem_size.k(),                                 // expected GEMM K dimension
-        batch_count,                                      // Number of batched elements
-
-        nullptr,
-        nullptr,
-        nullptr,
-
-        cutlass::library::NumericTypeID::kF32,            // Base data type of complex-valued accumulation
-        cutlass::library::NumericTypeID::kF32,            // Base data type of complex-valued alpha/beta scalars
-
-        &options.alpha,                                   // Pointer to alpha scalar, of type complex<T>
-
-        cutlass::library::NumericTypeID::kF16,            // Base data type of complex-valued A matrix
-        cutlass::library::LayoutTypeID::kColumnMajor,     // Layout of A matrix
-        cutlass::library::ComplexTransform::kConjugate,   // Complex transformation on A matrix operand
-
-        ptr_A_real.get(),                                 // Pointer to array of pointers to real part of A matrix
-        ptr_A_imag.get(),                                 // Pointer to array of pointers to imaginary part of A matrix
-
-        lda,                                              // Leading dimension of real part of A matrix
-        lda,                                              // Leading dimension of imaginary part of A matrix
-
-        cutlass::library::NumericTypeID::kF16,            // Base data type of complex-valued B matrix
-        cutlass::library::LayoutTypeID::kColumnMajor,     // Layout of B matrix
-        cutlass::library::ComplexTransform::kNone,        // Complex transformation on B matrix operand
-
-        ptr_B_real.get(),                                 // Pointer to array of pointers to real part of B matrix
-        ptr_B_imag.get(),                                 // Pointer to array of pointers to imaginary part of B matrix
-
-        ldb,                                              // Leading dimension of real part of B matrix
-        ldb,                                              // Leading dimension of imaginary part of B matrix
-
-        &options.beta,                                    // Pointer to beta scalar, of type complex<T>
-
-        cutlass::library::NumericTypeID::kF16,            // Base data type of complex valued C and D matrices
-
-        ptr_C_real.get(),                                 // Pointer to array of pointers to real part of C matrix
-        ptr_C_imag.get(),                                 // Pointer to array of pointers to imaginary part of C matrix
-
-        ldc,                                              // Leading dimension of real part of C matrix
-        ldc,                                              // Leading dimension of imaginary part of C matrix
-
-        ptr_D_real.get(),                                 // Pointer to array of pointers to real part of D matrix
-        ptr_D_imag.get(),                                 // Pointer to array of pointers to imaginary part of D matrix
-
-        ldd,                                              // Leading dimension of real part of D matrix
-        ldd                                               // Leading dimension of imaginary part of D matrix
-      );
-
-      if (result.status != cutlass::Status::kSuccess) {
-        std::cerr << "CUTLASS internal error - configuration not supported" << std::endl;
-        return result;
-      }
-    }
-    
-    //
-    // Stop profiling loop
-    //
-
-    // Record an event when the GEMM operations have been launched.
-    result.error = cudaEventRecord(events[1]);
-    if (result.error != cudaSuccess) {
-      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
-      return result;
-    }
-
-    // Wait for work on the device to complete.
-    result.error = cudaEventSynchronize(events[1]);
-    if (result.error != cudaSuccess) {
-      std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
-      return result;
-    }
-
-    // Measure elapsed runtime
-    float runtime_ms = 0;
-    result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
-    if (result.error != cudaSuccess) {
-      std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
-      return result;
-    }
-
-    // Compute average runtime and GFLOPs.
-    result.runtime_ms = double(runtime_ms) / double(options.iterations);
-    result.gflops = options.gflops(result.runtime_ms / 1000.0);
-
-    // Cleanup
-    for (auto event : events) {
-      (void)cudaEventDestroy(event);
-    }
-
-    if (handle.get_last_operation()) {
-      std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl;
-    }
-
-    //
-    // Compute reference in device code
-    //
-
-    if (options.reference_check) {
-
-      result.passed = true;
-
-      for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) {
-        cutlass::reference::device::GemmPlanarComplex<
-          Element, LayoutA,
-          Element, LayoutB,
-          Element, LayoutC,
-          ElementAccumulator
-        >(
-          problem_size,
-          options.alpha,
-          {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A},
-          cutlass::ComplexTransform::kConjugate,
-          {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B},
-          cutlass::ComplexTransform::kNone,
-          options.beta,
-          {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C},
-          {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D}
-        );
-
-	Element epsilon = 0.1_hf;
-	Element nonzero_floor = 0.1_hf;
-	
-        result.passed = cutlass::reference::device::BlockCompareRelativelyEqual(
-          tensor_D.get() + idx * batch_stride_D,
-          tensor_D_ref.get() + idx * batch_stride_D,
-          batch_stride_D,
-          epsilon,
-          nonzero_floor
-        );
-      }
-
-      if (result.passed) {
-        std::cout << "Reference check passed." << std::endl;
-      }
-      else {
-        std::cerr << "Error - reference check failed." << std::endl;
-      }
-    }
-
-    std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
-    std::cout << " GFLOPs: " << result.gflops << std::endl;
-
-    return result;
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-int main(int argc, char const **args) {
-
-  //
-  // This example uses mma.sync to directly access Tensor Cores to achieve peak performance.
-  //
-  // Volta Tensor Core operations are first available in CUDA 10.1 Toolkit.
-  //
-  // Turing Tensor Core operations are first available in CUDA 10.2 Toolkit.
-  //
-
-  cudaDeviceProp props;
-
-  cudaError_t error = cudaGetDeviceProperties(&props, 0);
-  if (error != cudaSuccess) {
-    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
-    return -1;
-  }
-
-  if (props.major < 7) {
-    std::cerr << "Tensor Core operations must be run on a machine with compute capability at least 70."
-              << std::endl;
-
-    // Returning zero so this passes on older architectures. Its actions are no-op.
-    return 0;
-  }
-  else if (props.major == 7 && props.minor <= 2) {
-    //
-    // If running on the Volta architecture, at least CUDA 10.1 Toolkit is required to run this example.
-    //
-    if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) {
-      std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl;
-      
-      // Returning zero so this passes on older Toolkits. Its actions are no-op.
-      return 0;
-    }
-  }
-  else if (props.major == 7 && props.minor >= 5) {
-    //
-    // If running on the Turing architecture, at least CUDA 10.2 Toolkit is required to run this example.
-    //
-    if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
-      std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
-      
-      // Returning zero so this passes on older Toolkits. Its actions are no-op.
-      return 0;
-    }
-  }
-  else {
-    // NVIDIA Ampere Architecture GPUs (SM80 and later) are fully supported on CUDA 11 Toolkit and beyond.
-    //
-    // fall through
-  }
-
-  //
-  // Parse options
-  //
-
-  Options options;
-  
-  options.parse(argc, args);
-
-  if (options.help) {
-    options.print_usage(std::cout) << std::endl;
-    return 0;
-  }
-
-  // Execute one problem size
-  if (!options.valid()) {
-    std::cerr << "Invalid problem." << std::endl;
-    return -1;
-  }
-
-  TestbedPlanarComplex testbed(options);
-
-  Result result = testbed.profile(options);
-
-  return result.passed ? 0 : -1;
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/legacy/CMakeLists.txt~ b/legacy/CMakeLists.txt~
deleted file mode 100644
index 0783d51..0000000
--- a/legacy/CMakeLists.txt~
+++ /dev/null
@@ -1,120 +0,0 @@
-enable_language(CUDA)
-
-include_directories(${PSRDada_SOURCE_DIR}/src)
-include_directories(${xGPU_SOURCE_DIR}/src)
-
-set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so)
-set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a)
-
-# DSA Fast Time Domain functions
-#-------------------------------
-add_executable(test_write test_write.c)
-target_link_libraries(test_write ${PSRDada_LIB})
-
-add_executable(test_read test_read.c)
-target_link_libraries(test_read ${PSRDada_LIB})
-
-add_executable(dsaX_trigger dsaX_trigger.c)
-target_link_libraries(dsaX_trigger ${PSRDada_LIB})
-
-add_executable(dsaX_filTrigger dsaX_filTrigger.c)
-target_link_libraries(dsaX_filTrigger ${PSRDada_LIB})
-
-# DMH: Has a 'sigproc' dependency, low priority
-if(0)
-  add_executable(splice_offline_beams splice_offline_beams.c)
-  target_link_libraries(splice_offline_beams ${PSRDada_LIB})
-
-  add_executable(dsaX_writeFil dsaX_writeFil.c)
-  target_link_libraries(dsaX_writeFil ${PSRDada_LIB})
-  
-  add_executable(dsaX_splice dsaX_splice.c)
-  target_link_libraries(dsaX_splice ${PSRDada_LIB})
-
-  add_executable(gpu_flagger gpu_flagger.cu)
-  target_link_libraries(gpu_flagger ${PSRDada_LIB})
-endif()
-
-add_executable(dsaX_store dsaX_store.c)
-target_link_libraries(dsaX_store ${PSRDada_LIB})
-
-add_executable(dsaX_fluff dsaX_fluff.c)
-target_link_libraries(dsaX_fluff ${PSRDada_LIB})
-
-# DMH: intrinsics compilation error
-#add_executable(dsaX_reorder dsaX_reorder.c)
-#target_link_libraries(dsaX_reorder ${PSRDada_LIB})
-
-# DMH: /scratch/CPviolator/work/DSA110/dsa110-xengine/src/dsaX_nicdb.c: In function ‘process’:
-#/scratch/CPviolator/work/DSA110/dsa110-xengine/src/dsaX_nicdb.c:145:65: warning: integer overflow in expression of type ‘int’ results in ‘-1073741824’ [-Woverflow]
-#  145 |   uint64_t shifty = (bdepth-1)*NSAMPS_PER_BLOCK*NBEAMS_PER_BLOCK*NCHAN_FIL;
-add_executable(dsaX_nicdb dsaX_nicdb.c)
-target_link_libraries(dsaX_nicdb ${PSRDada_LIB})
-
-add_executable(dsaX_dbnic dsaX_dbnic.c)
-target_link_libraries(dsaX_dbnic ${PSRDada_LIB})
-
-add_executable(dsaX_capture dsaX_capture.c)
-target_link_libraries(dsaX_capture ${PSRDada_LIB})
-
-add_executable(dsaX_capture_thread dsaX_capture_thread.c)
-target_link_libraries(dsaX_capture_thread ${PSRDada_LIB})
-
-add_executable(dsaX_capture_manythread dsaX_capture_manythread.c)
-target_link_libraries(dsaX_capture_manythread ${PSRDada_LIB})
-
-add_executable(dsaX_split dsaX_split.c)
-target_link_libraries(dsaX_split ${PSRDada_LIB} -lm)
-
-add_executable(dsaX_merge dsaX_merge.c)
-target_link_libraries(dsaX_merge ${PSRDada_LIB})
-
-add_executable(dsaX_simplesplit dsaX_simplesplit.c)
-target_link_libraries(dsaX_simplesplit ${PSRDada_LIB})
-
-add_executable(dsaX_fake dsaX_fake.c)
-target_link_libraries(dsaX_fake ${PSRDada_LIB})
-
-add_executable(dsaX_splitup dsaX_splitup.c)
-target_link_libraries(dsaX_splitup ${PSRDada_LIB})
-
-add_executable(dsaX_copydb dsaX_copydb.c)
-target_link_libraries(dsaX_copydb ${PSRDada_LIB})
-
-# DMH: fitsio dependency
-if(0)
-  add_executable(dsaX_writevis dsaX_writevis.c)
-  target_link_libraries(dsaX_writevis ${PSRDada_LIB})
-endif()
-
-# DMH: XGPU dependencies
-add_executable(dsaX_wrangle dsaX_wrangle.c)
-target_link_libraries(dsaX_wrangle ${PSRDada_LIB} ${CUDA_nvml_LIBRARY} ${XGPU_LIB})
-
-add_executable(dsaX_testdada dsaX_testdada.c)
-target_link_libraries(dsaX_testdada ${PSRDada_LIB})
-
-add_executable(dsaX_xgpu dsaX_xgpu.cu)
-target_link_libraries(dsaX_xgpu ${PSRDada_LIB} ${XGPU_LIB} ${CUDA_nvml_LIBRARY})
-
-add_executable(dsaX_cuda_correlator dsaX_cuda_correlator.cu)
-target_link_libraries(dsaX_cuda_correlator ${XGPU_LIB} ${CUDA_nvml_LIBRARY} ${CUDA_cublas_LIBRARY} ${PSRDada_LIB})
-
-add_executable(dsaX_reorder_raw dsaX_reorder_raw.c)
-target_link_libraries(dsaX_reorder_raw ${PSRDada_LIB})
-
-add_executable(fil2dada fil2dada.c)
-target_link_libraries(fil2dada ${PSRDada_LIB})
-
-add_executable(dumpfil dumpfil.c)
-target_link_libraries(dumpfil ${PSRDada_LIB})
-
-add_executable(dsaX_beamformer dsaX_beamformer.cu)
-target_link_libraries(dsaX_beamformer ${PSRDada_LIB})
-
-add_executable(dsaX_beamformer_passon dsaX_beamformer_passon.cu)
-target_link_libraries(dsaX_beamformer_passon ${PSRDada_LIB})
-
-add_executable(dsaX_beamformer_offline dsaX_beamformer_offline.cu)
-target_link_libraries(dsaX_beamformer_offline ${PSRDada_LIB})
-#------------------------------------------------------
diff --git a/legacy/dsaX_cutlass_interface.cu~ b/legacy/dsaX_cutlass_interface.cu~
deleted file mode 100644
index a51d5a2..0000000
--- a/legacy/dsaX_cutlass_interface.cu~
+++ /dev/null
@@ -1,315 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#include "dsaX_cutlass_interface.h"
-
-DSA_FTD_ComplexGEMM_CUTLASS::DSA_FTD_ComplexGEMM_CUTLASS(Options const &options): 
-  problem_size(options.problem_size), batch_count(options.batch_count) {
-
-  // Allocate device memory for batched planar complex GEMM  
-  tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2);
-  tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2);
-  tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
-  tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
-  tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
-  
-  ptr_A_real.reset(batch_count);
-  ptr_A_imag.reset(batch_count);
-  ptr_B_real.reset(batch_count);
-  ptr_B_imag.reset(batch_count);
-  ptr_C_real.reset(batch_count);
-  ptr_C_imag.reset(batch_count);
-  ptr_D_real.reset(batch_count);
-  ptr_D_imag.reset(batch_count);      
-}
-
-// DMH: Replace this with data from DSA-FTD
-void DSA_FTD_ComplexGEMM_CUTLASS::initialize() {
-
-  if(testing) {
-    uint64_t seed = 1234;
-    
-    // Use small integers to simplify correctness checking
-    int scope_max = 6;
-    int scope_min = -6;
-    
-    BlockFillRandomUniform(tensor_A.get(), tensor_A.size(), seed, Element(scope_max), Element(scope_min), 0);
-    BlockFillRandomUniform(tensor_B.get(), tensor_B.size(), seed * 2019, Element(scope_max), Element(scope_min), 0);
-    BlockFillRandomUniform(tensor_C.get(), tensor_C.size(), seed * 2020, Element(scope_max), Element(scope_min), 0);
-  } else {
-    // DMH: construct DSA-FTD interface data transfer interface
-  }
-
-  ptr_A = tensor_A.get();
-  ptr_B = tensor_B.get();
-  ptr_C = tensor_C.get();
-  ptr_D = tensor_D.get();
-  
-  batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2;
-  batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2;
-  batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2;
-  batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2;
-  
-  lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
-  ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
-  ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
-  ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
-  
-  imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
-  imag_stride_B = int64_t(problem_size.k()) * problem_size.n();
-  imag_stride_C = int64_t(problem_size.m()) * problem_size.n();
-  imag_stride_D = int64_t(problem_size.m()) * problem_size.n();
-
-}
-
-Result DSA_FTD_ComplexGEMM_CUTLASS::run(Options const &options) {
-  
-  Result result;
-  
-  initialize();  
-
-  // Configure pointers in global memory
-  struct {
-    Element *base;
-    void **ptr_real;
-    void **ptr_imag;
-    int64_t batch_stride;
-    int64_t imag_stride;
-  } tensors[] = {{ tensor_A.get(), ptr_A_real.get(), ptr_A_imag.get(), batch_stride_A, imag_stride_A},
-		 { tensor_B.get(), ptr_B_real.get(), ptr_B_imag.get(), batch_stride_B, imag_stride_B},
-		 { tensor_C.get(), ptr_C_real.get(), ptr_C_imag.get(), batch_stride_C, imag_stride_C},
-		 { tensor_D.get(), ptr_D_real.get(), ptr_D_imag.get(), batch_stride_D, imag_stride_D}};
-  
-  for (auto const &tensor : tensors) {
-    for (int idx = 0; idx < batch_count; ++idx) {
-      
-      cudaError_t error;
-      void *ptr_real = tensor.base + idx * tensor.batch_stride;
-      void *ptr_imag = tensor.base + idx * tensor.batch_stride + tensor.imag_stride;      
-      
-      error = cudaMemcpy(tensor.ptr_real + idx, &ptr_real, sizeof(void *), cudaMemcpyHostToDevice);
-      if (error != cudaSuccess) throw std::runtime_error("Failed to copy pointer to device memory");
-      
-      error = cudaMemcpy(tensor.ptr_imag + idx, &ptr_imag, sizeof(void *), cudaMemcpyHostToDevice);
-      if (error != cudaSuccess) throw std::runtime_error("Failed to copy pointer to device memory");
-      
-    }
-  }
-
-  
-  cudaEvent_t events[2];  
-  for (auto & event : events) {
-    result.error = cudaEventCreate(&event);
-    if (result.error != cudaSuccess) {
-      std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
-      return -1;
-    }
-  }
-  
-  // Record an event at the start of a series of GEMM operations
-  result.error = cudaEventRecord(events[0]);
-  if (result.error != cudaSuccess) {
-    std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
-    return result;
-  }
-
-  // Run profiling loop
-  //-------------------
-  // Execute the planar complex array GEMM kernel via the CUTLASS Library's
-  // dispatch routines.
-  //
-  // Note, for planar complex array GEMM kernels, all numeric type arguments 
-  // specify the data type of the base real types. These are understood to
-  // apply to planar complex representations of matrices in memory and to complex<T>
-  // structures for scalars.
-  //
-  // See tools/library/include/cutlass/library/handle.h for more details.
-  //
-  for (int iter = 0; iter < options.iterations; ++iter) {
-    
-    result.status = handle.gemm_planar_complex_array(
-	problem_size.m(),                                 // expected GEMM M dimension
-	problem_size.n(),                                 // expected GEMM N dimension
-	problem_size.k(),                                 // expected GEMM K dimension
-	batch_count,                                      // Number of batched elements
-
-        nullptr,
-        nullptr,
-        nullptr,
-
-        cutlass::library::NumericTypeID::kF32,            // Base data type of complex-valued accumulation
-        cutlass::library::NumericTypeID::kF32,            // Base data type of complex-valued alpha/beta scalars
-
-        &options.alpha,                                   // Pointer to alpha scalar, of type complex<T>
-
-        cutlass::library::NumericTypeID::kF16,            // Base data type of complex-valued A matrix
-        cutlass::library::LayoutTypeID::kColumnMajor,     // Layout of A matrix
-        cutlass::library::ComplexTransform::kConjugate,   // Complex transformation on A matrix operand
-
-        ptr_A_real.get(),                                 // Pointer to array of pointers to real part of A matrix
-        ptr_A_imag.get(),                                 // Pointer to array of pointers to imaginary part of A matrix
-
-        lda,                                              // Leading dimension of real part of A matrix
-        lda,                                              // Leading dimension of imaginary part of A matrix
-
-        cutlass::library::NumericTypeID::kF16,            // Base data type of complex-valued B matrix
-        cutlass::library::LayoutTypeID::kColumnMajor,     // Layout of B matrix
-        cutlass::library::ComplexTransform::kNone,        // Complex transformation on B matrix operand
-
-        ptr_B_real.get(),                                 // Pointer to array of pointers to real part of B matrix
-        ptr_B_imag.get(),                                 // Pointer to array of pointers to imaginary part of B matrix
-
-        ldb,                                              // Leading dimension of real part of B matrix
-        ldb,                                              // Leading dimension of imaginary part of B matrix
-
-        &options.beta,                                    // Pointer to beta scalar, of type complex<T>
-
-        cutlass::library::NumericTypeID::kF16,            // Base data type of complex valued C and D matrices
-
-        ptr_C_real.get(),                                 // Pointer to array of pointers to real part of C matrix
-        ptr_C_imag.get(),                                 // Pointer to array of pointers to imaginary part of C matrix
-
-        ldc,                                              // Leading dimension of real part of C matrix
-        ldc,                                              // Leading dimension of imaginary part of C matrix
-
-        ptr_D_real.get(),                                 // Pointer to array of pointers to real part of D matrix
-        ptr_D_imag.get(),                                 // Pointer to array of pointers to imaginary part of D matrix
-
-        ldd,                                              // Leading dimension of real part of D matrix
-        ldd                                               // Leading dimension of imaginary part of D matrix
-						     );
-    
-    if (result.status != cutlass::Status::kSuccess) {
-      std::cerr << "CUTLASS internal error - configuration not supported" << std::endl;
-      return result;
-    }
-  }
-  
-  // Record an event when the GEMM operations have been launched.
-  result.error = cudaEventRecord(events[1]);
-  if (result.error != cudaSuccess) {
-    std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
-    return result;
-  }
-  
-  // Wait for work on the device to complete.
-  result.error = cudaEventSynchronize(events[1]);
-  if (result.error != cudaSuccess) {
-    std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
-    return result;
-  }
-  
-  // Measure elapsed runtime
-  float runtime_ms = 0;
-  result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
-  if (result.error != cudaSuccess) {
-    std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
-    return result;
-  }
-  
-  // Compute average runtime and GFLOPs.
-  result.runtime_ms = double(runtime_ms) / double(options.iterations);
-  result.gflops = options.gflops(result.runtime_ms / 1000.0);
-  
-  // Cleanup
-  for (auto event : events) {
-    (void)cudaEventDestroy(event);
-  }
-  
-  if (handle.get_last_operation()) {
-    std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl;
-  }
-
-  // Compute reference in device code
-  if (options.reference_check) {
-    
-    result.passed = true;
-    
-    for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) {
-      // Define the GEMM through templates
-      GemmPlanarComplex<Element, LayoutA, Element, LayoutB, Element, LayoutC, ElementAccumulator>
-	(problem_size, options.alpha,
-	 {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A},
-	 cutlass::ComplexTransform::kConjugate,
-	 {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B},
-	 cutlass::ComplexTransform::kNone,
-	 options.beta,
-	 {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C},
-	 {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D}
-	 );
-      
-      Element epsilon = 0.1_hf;
-      Element nonzero_floor = 0.1_hf;
-      
-      result.passed = BlockCompareRelativelyEqual
-	(
-	 tensor_D.get() + idx * batch_stride_D,
-	 tensor_D_ref.get() + idx * batch_stride_D,
-	 batch_stride_D,
-	 epsilon,
-	 nonzero_floor
-	 );
-    }
-    
-    if (result.passed) std::cout << "Reference check passed." << std::endl;
-    else std::cerr << "Error - reference check failed." << std::endl;
-  }
-  
-  std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
-  std::cout << " GFLOPs: " << result.gflops << std::endl;
-  
-  return result;
-}
-
- int main(int argc, char const **args) {
-  cudaDeviceProp props;
-  cudaError_t error = cudaGetDeviceProperties(&props, 0);
-  if (error != cudaSuccess) {
-    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
-    return -1;
-  }
-  
-  Options options;  
-  options.parse(argc, args);
-
-  if (options.help) {
-    options.print_usage(std::cout) << std::endl;
-    return 0;
-  }
-
-  // Compute GEMM
-  testing = true;
-  DSA_FTD_ComplexGEMM_CUTLASS gemm(options);
-  Result result = gemm.run(options);
-  
-  return result.passed ? 0 : -1;
-}
-
diff --git a/legacy/dsaX_cutlass_interface.h~ b/legacy/dsaX_cutlass_interface.h~
deleted file mode 100644
index 42a3e8a..0000000
--- a/legacy/dsaX_cutlass_interface.h~
+++ /dev/null
@@ -1,174 +0,0 @@
-#pragma once
-
-#include <iostream>
-#include <sstream>
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/util/command_line.h"
-#include "cutlass/util/distribution.h"
-#include "cutlass/util/device_memory.h"
-#include "cutlass/util/tensor_view_io.h"
-#include "cutlass/util/host_tensor_planar_complex.h"
-#include "cutlass/util/reference/device/tensor_fill.h"
-#include "cutlass/util/reference/device/gemm_planar_complex.h"
-#include "cutlass/util/reference/device/tensor_compare.h"
-#include "cutlass/library/handle.h"
-
-using namespace cutlass;
-using namespace gemm;
-using namespace library;
-using namespace layout;
-using namespace reference;
-using namespace device;
-
-// Result structure
-struct Result {
-
-  double runtime_ms;
-  double gflops;
-  Status status;
-  cudaError_t error;
-  bool passed;
-  
-  Result(double runtime_ms = 0, double gflops = 0, Status status = Status::kSuccess, cudaError_t error = cudaSuccess):
-    runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { }
-};
-
-// Command line options parsing (testing)
-struct Options {
-
-  bool help;
-  GemmCoord problem_size;
-  int batch_count;
-  complex<float> alpha;
-  complex<float> beta;
-  bool reference_check;
-  int iterations;
-  
-  Options():
-    help(false),
-    problem_size({1024, 1024, 1024}),
-    batch_count(1),
-    reference_check(false),
-    iterations(20),
-    alpha(1),
-    beta() { }
-
-  // Parses the command line
-  void parse(int argc, char const **args) {
-    
-    CommandLine cmd(argc, args);
-    if (cmd.check_cmd_line_flag("help")) {
-      help = true;
-    }
-    
-    cmd.get_cmd_line_argument("m", problem_size.m());
-    cmd.get_cmd_line_argument("n", problem_size.n());
-    cmd.get_cmd_line_argument("k", problem_size.k());
-    cmd.get_cmd_line_argument("batch", batch_count);
-
-    cmd.get_cmd_line_argument("alpha", alpha.real());
-    cmd.get_cmd_line_argument("alpha_i", alpha.imag());
-    cmd.get_cmd_line_argument("beta", beta.real());
-    cmd.get_cmd_line_argument("beta_i", beta.imag());
-    
-    cmd.get_cmd_line_argument("iterations", iterations);
-  }
-
-  /// Prints the usage statement.
-  std::ostream & print_usage(std::ostream &out) const {
-
-    out << "dsaX_cutlass_interface\n\n"
-	<< "  This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n"
-	<< "Options:\n\n"
-	<< "  --help                      If specified, displays this usage statement.\n\n"
-	<< "  --m=<int>                   GEMM M dimension\n"
-	<< "  --n=<int>                   GEMM N dimension\n"
-	<< "  --k=<int>                   GEMM K dimension\n"
-	<< "  --batch=<int>               Number of GEMM operations executed in one batch\n"
-	<< "  --alpha=<f32>               Epilogue scalar alpha (real part)\n"
-	<< "  --alpha_i=<f32>             Epilogue scalar alpha (imaginary part)\n"
-	<< "  --beta=<f32>                Epilogue scalar beta (real part)\n\n"
-	<< "  --beta_i=<f32>              Epilogue scalar beta (imaginary part)\n\n"
-	<< "  --iterations=<int>          Number of profiling iterations to perform.\n";
-    
-    return out;
-  }
-
-  /// Compute performance in GFLOP/s
-  double gflops(double runtime_s) const {
-    
-    // Number of real-valued multiply-adds 
-    int64_t fmas = problem_size.product() * batch_count * 4;
-    
-    // Two flops per multiply-add
-    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
-  }
-};
-
-/// Performance test environment for planar complex
-class DSA_FTD_ComplexGEMM_CUTLASS {
-
-  // Half-precision input and output
-  using Element = half_t;
-  
-  // Configurations for layouts and internal computation
-  using LayoutA = ColumnMajor;
-  using LayoutB = ColumnMajor;
-  using LayoutC = ColumnMajor;
-  using ElementCompute = float;
-  using ElementAccumulator = float;
-
-  Handle handle;
-  
-  GemmCoord problem_size;
-  int batch_count;
-  DeviceAllocation<Element> tensor_A;
-  DeviceAllocation<Element> tensor_B;
-  DeviceAllocation<Element> tensor_C;
-  DeviceAllocation<Element> tensor_D;
-  DeviceAllocation<Element> tensor_D_ref;
-
-  DeviceAllocation<void *> ptr_A_real;
-  DeviceAllocation<void *> ptr_A_imag;
-  DeviceAllocation<void *> ptr_B_real;
-  DeviceAllocation<void *> ptr_B_imag;
-  DeviceAllocation<void *> ptr_C_real;
-  DeviceAllocation<void *> ptr_C_imag;
-  DeviceAllocation<void *> ptr_D_real;
-  DeviceAllocation<void *> ptr_D_imag;
-
-  Element *ptr_A;
-  Element *ptr_B;
-  Element *ptr_C;
-  Element *ptr_D;
-  
-  int64_t batch_stride_A;
-  int64_t batch_stride_B;
-  int64_t batch_stride_C;
-  int64_t batch_stride_D;
-  
-  typename LayoutA::Stride::Index lda;
-  typename LayoutB::Stride::Index ldb;
-  typename LayoutC::Stride::Index ldc;
-  typename LayoutC::Stride::Index ldd;
-  
-  int64_t imag_stride_A;
-  int64_t imag_stride_B;
-  int64_t imag_stride_C;
-  int64_t imag_stride_D;
-
-  bool testing;
-  
-public:  
-  // Constructors
-  DSA_FTD_ComplexGEMM_CUTLASS(Options const &options);
-  DSA_FTD_ComplexGEMM_CUTLASS();
-  
-  // Methods
-  void initialize();  
-  Result run(Options const &options);
-  
-  
-};
-  
diff --git a/legacy/planar_complex.cu~ b/legacy/planar_complex.cu~
deleted file mode 100644
index db94a64..0000000
--- a/legacy/planar_complex.cu~
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
-#include <iostream>
-#include <cutlass/cutlass.h>
-#include <cutlass/numeric_types.h>
-#include <cutlass/core_io.h>
-
-int main() {
-
-  cutlass::half_t x = 2.25_hf;
-
-  std::cout << x << std::endl;
-
-  return 0;
-}
-*/
-
-#include <cutlass/numeric_types.h>
-#include <cutlass/gemm/device/gemm.h>
-
-#include <cutlass/util/host_tensor.h>
-
-int main() {
-
-  // Define the GEMM operation
-  using Gemm = cutlass::gemm::device::Gemm<
-    cutlass::half_t,                           // ElementA
-    cutlass::layout::ColumnMajor,              // LayoutA
-    cutlass::half_t,                           // ElementB
-    cutlass::layout::ColumnMajor,              // LayoutB
-    cutlass::half_t,                           // ElementOutput
-    cutlass::layout::ColumnMajor,              // LayoutOutput
-    float,                                     // ElementAccumulator
-    cutlass::arch::OpClassTensorOp,            // tag indicating Tensor Cores
-    cutlass::arch::Sm75                        // tag indicating target GPU compute architecture
-  >;
-
-  Gemm gemm_op;
-  cutlass::Status status;
-
-  //
-  // Define the problem size
-  //
-  int M = 512;
-  int N = 256;
-  int K = 128;
-
-  float alpha = 1.25f;
-  float beta = -1.25f;
-
-  //
-  // Allocate device memory
-  //
-
-  cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor> A({M, K});
-  cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor> B({K, N});
-  cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor> C({M, N});
-
-  cutlass::half_t const *ptrA = A.device_data();
-  cutlass::half_t const *ptrB = B.device_data();
-  cutlass::half_t const *ptrC = C.device_data();
-  cutlass::half_t       *ptrD = C.device_data();
-
-  int lda = A.device_ref().stride(0);
-  int ldb = B.device_ref().stride(0);
-  int ldc = C.device_ref().stride(0);
-  int ldd = C.device_ref().stride(0);
-  //
-  // Launch GEMM on the device
-  //
- 
-  status = gemm_op({
-    {M, N, K},
-    {ptrA, lda},            // TensorRef to A device tensor
-    {ptrB, ldb},            // TensorRef to B device tensor
-    {ptrC, ldc},            // TensorRef to C device tensor
-    {ptrD, ldd},            // TensorRef to D device tensor - may be the same as C
-    {alpha, beta}           // epilogue operation arguments
-  });
-
-  if (status != cutlass::Status::kSuccess) {
-    return -1;
-  }
-
-  return 0;
-}

From 0cad89c35b03f5b7ee01b672da07132a45cfc92f Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Sat, 15 Jun 2024 22:50:15 -0700
Subject: [PATCH 13/30] Remove CPM from CMakeLists

---
 CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d4328d9..451d6fe 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,7 +24,6 @@ set(CMAKE_CXX_FLAGS "-pthread")
 
 # add a directory for cmake modules                                                                                                                                                                                
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
-include(cmake/CPM.cmake)
 
 # DSA_XENGINE may be built to run using CUDA. Future version may be
 # written for HIP or SYCL, which we call the

From 03b30e9cee9a863e18e7a2a622b27d6e6934d525 Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Sat, 15 Jun 2024 23:17:10 -0700
Subject: [PATCH 14/30] clean up, add utils to install

---
 CMakeLists.txt     |   3 +-
 include/dsaX.h     |  82 ++++++++++++++++++++++
 src/dsaX_bfCorr.cu | 171 +++++++--------------------------------------
 3 files changed, 108 insertions(+), 148 deletions(-)
 create mode 100644 include/dsaX.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 451d6fe..4d276ea 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -168,8 +168,9 @@ else()
   find_package(xGPU REQUIRED)
 endif()
 
-# Add src, legacy
+# Add src, utils, legacy
 add_subdirectory(src)
+add_subdirectory(utils)
 option(DSA_XENGINE_BUILD_LEGACY "Build legacy code (will not install if built)" OFF)
 if(DSA_XENGINE_BUILD_LEGACY)
   add_subdirectory(legacy)
diff --git a/include/dsaX.h b/include/dsaX.h
new file mode 100644
index 0000000..c7afcb0
--- /dev/null
+++ b/include/dsaX.h
@@ -0,0 +1,82 @@
+// -*- c++ -*-
+/* assumes input and output block size is appropriate - will seg fault otherwise*/
+/*
+Workflow is similar for BF and corr applications
+ - copy data to GPU, convert to half-precision and calibrate while reordering
+ - do matrix operations to populate large output vector
+ */
+#include <iostream>
+#include <algorithm>
+using std::cout;
+using std::cerr;
+using std::endl;
+#include <stdio.h>
+#include <stdlib.h>
+#include <cmath>
+#include <string.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <time.h>
+#include <syslog.h>
+#include <pthread.h>
+
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "multilog.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_def.h"
+
+#include <cuda.h>
+#include "cuda_fp16.h"
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
+// define structure that carries around device memory
+typedef struct dmem {
+
+  // initial data and streams
+  char * h_input; // host input pointer
+  char * d_input, * d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
+  
+  // correlator pointers
+  // giant array for r and i: [NCHAN_PER_PACKET, 2 pol, NANTS_PROCESS, NPACKETS_PER_BLOCK * 2 times]
+  half * d_r, * d_i;
+  // arrays for matrix multiply output: input [NANTS_PROCESS, NANTS_PROCESS]
+  half * d_outr, *d_outi, *d_tx_outr, *d_tx_outi;
+  // giant output array: [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex]
+  float * d_output;
+  
+  // beamformer pointers
+  char * d_big_input;
+  half * d_br, * d_bi;
+  half * weights_r, * weights_i; //weights: [arm, tactp, b]
+  half * d_bigbeam_r, * d_bigbeam_i; //output: [tc, b]
+  unsigned char * d_bigpower; //output: [b, tc]
+  float * d_scf; // scale factor per beam
+  float * d_chscf;
+  float * h_winp;
+  int * flagants, nflags;
+  float * h_freqs, * d_freqs;
+
+  // timing
+  float cp, prep, cubl, outp;
+  
+} dmem;
+
+// Initialise device memory
+void initialize(dmem * d, int bf);
+
+// Deallocate device memory
+void deallocate(dmem * d, int bf);
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
+
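+// Bind the calling thread to the given CPU core (presumably psrdada's dada_affinity.h helper - TODO confirm)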
+int dada_bind_thread_to_core(int core);
diff --git a/src/dsaX_bfCorr.cu b/src/dsaX_bfCorr.cu
index 25b9262..e992394 100644
--- a/src/dsaX_bfCorr.cu
+++ b/src/dsaX_bfCorr.cu
@@ -32,6 +32,7 @@ using std::endl;
 #include "dada_affinity.h"
 #include "ascii_header.h"
 #include "dsaX_def.h"
+#include "dsaX.h"
 
 #include <cuda.h>
 #include "cuda_fp16.h"
@@ -47,39 +48,6 @@ using std::endl;
 /* global variables */
 int DEBUG = 1;
 
-// define structure that carries around device memory
-typedef struct dmem {
-
-  // initial data and streams
-  char * h_input; // host input pointer
-  char * d_input, * d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
-  
-  // correlator pointers
-  // giant array for r and i: [NCHAN_PER_PACKET, 2 pol, NANTS_PROCESS, NPACKETS_PER_BLOCK * 2 times]
-  half * d_r, * d_i;
-  // arrays for matrix multiply output: input [NANTS_PROCESS, NANTS_PROCESS]
-  half * d_outr, *d_outi, *d_tx_outr, *d_tx_outi;
-  // giant output array: [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex]
-  float * d_output;
-  
-  // beamformer pointers
-  char * d_big_input;
-  half * d_br, * d_bi;
-  half * weights_r, * weights_i; //weights: [arm, tactp, b]
-  half * d_bigbeam_r, * d_bigbeam_i; //output: [tc, b]
-  unsigned char * d_bigpower; //output: [b, tc]
-  float * d_scf; // scale factor per beam
-  float * d_chscf;
-  float * h_winp;
-  int * flagants, nflags;
-  float * h_freqs, * d_freqs;
-
-  // timing
-  float cp, prep, cubl, outp;
-  
-} dmem;
-
-
 // allocate device memory
 void initialize(dmem * d, int bf) {
   
@@ -161,9 +129,6 @@ void deallocate(dmem * d, int bf) {
   
 }
 
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
-int dada_bind_thread_to_core (int core);
-
 void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
 {
 
@@ -181,7 +146,6 @@ void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
 
 } 
 
-
 void usage()
 {
 fprintf (stdout,
@@ -211,64 +175,11 @@ __global__ void corr_input_copy(char *input, half *inr, half *ini) {
 
 }
 
-
-// arbitrary transpose kernel
-// assume breakdown into tiles of 32x32, and run with 32x8 threads per block
-// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32)
-// here, width is the dimension of the fastest index
-__global__ void transpose_matrix_char(char * idata, char * odata) {
-
-  __shared__ char tile[32][33];
-  
-  int x = blockIdx.x * 32 + threadIdx.x;
-  int y = blockIdx.y * 32 + threadIdx.y;
-  int width = gridDim.x * 32;
-
-  for (int j = 0; j < 32; j += 8)
-     tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
-
-  __syncthreads();
-
-  x = blockIdx.y * 32 + threadIdx.x;  // transpose block offset
-  y = blockIdx.x * 32 + threadIdx.y;
-  width = gridDim.y * 32;
-
-  for (int j = 0; j < 32; j += 8)
-     odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
-
-}
-
-// arbitrary transpose kernel
-// assume breakdown into tiles of 32x32, and run with 32x8 threads per block
-// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32)
-// here, width is the dimension of the fastest index
-__global__ void transpose_matrix_float(half * idata, half * odata) {
-
-  __shared__ half tile[32][33];
-  
-  int x = blockIdx.x * 32 + threadIdx.x;
-  int y = blockIdx.y * 32 + threadIdx.y;
-  int width = gridDim.x * 32;
-
-  for (int j = 0; j < 32; j += 8)
-     tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
-
-  __syncthreads();
-
-  x = blockIdx.y * 32 + threadIdx.x;  // transpose block offset
-  y = blockIdx.x * 32 + threadIdx.y;
-  width = gridDim.y * 32;
-
-  for (int j = 0; j < 32; j += 8)
-     odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
-
-}
-
-// arbitrary transpose kernel
+// transpose kernel
 // assume breakdown into tiles of 32x32, and run with 32x8 threads per block
 // launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32)
 // here, width is the dimension of the fastest index
-template <typename in_prec, typename out_prec> __global__ void transpose_matrix_template(in_prec * idata, out_prec * odata) {
+template <typename in_prec, typename out_prec> __global__ void transpose_matrix(in_prec * idata, out_prec * odata) {
 
   __shared__ in_prec tile[32][33];
   
@@ -300,43 +211,8 @@ void reorder_input(char *input, char * tx, half *inr, half *ini) {
 
   // transpose input data
   dim3 dimBlock(32, 8), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32);
-  transpose_matrix_char<<<dimGrid,dimBlock>>>(input,tx);
-  /*
-  // set up for geam
-  cublasHandle_t cublasH = NULL;
-  cudaStream_t stream = NULL;
-  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
-  cublasSetStream(cublasH, stream);
-
-  // transpose input matrix into tx
-  cublasOperation_t transa = CUBLAS_OP_T;
-  cublasOperation_t transb = CUBLAS_OP_N;
-  const int m = NPACKETS_PER_BLOCK * NANTS;
-  const int n = NCHAN_PER_PACKET*2*2/8; // columns in output
-  const double alpha = 1.0;
-  const double beta = 0.0;
-  const int lda = n;
-  const int ldb = m;
-  const int ldc = ldb;
-  cublasDgeam(cublasH,transa,transb,m,n,
-	      &alpha,(double *)(input),
-	      lda,&beta,(double *)(tx),
-	      ldb,(double *)(tx),ldc);
-  */
-  // now we just need to fluff to half-precision
+  transpose_matrix<<<dimGrid,dimBlock>>>(input,tx);
   corr_input_copy<<<NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128,128>>>(tx,inr,ini);
-
-  // look at output
-  /*char * odata = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4*2);
-  cudaMemcpy(odata,inr,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4*2,cudaMemcpyDeviceToHost);
-  FILE *fout;
-  fout=fopen("test.test","wb");
-  fwrite(odata,1,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4*2,fout);
-  fclose(fout);*/
-  
-  // destroy stream
-  //cudaStreamDestroy(stream);
-  
 }
 
 // kernel to help with reordering output
@@ -377,8 +253,8 @@ void reorder_output(dmem * d) {
 
   // transpose input data
   dim3 dimBlock(32, 8), dimGrid((NANTS*NANTS)/32,(NCHAN_PER_PACKET*2*2*halfFac)/32);
-  transpose_matrix_float<<<dimGrid,dimBlock>>>(d->d_outr,d->d_tx_outr);
-  transpose_matrix_float<<<dimGrid,dimBlock>>>(d->d_outi,d->d_tx_outi);
+  transpose_matrix<<<dimGrid,dimBlock>>>(d->d_outr,d->d_tx_outr);
+  transpose_matrix<<<dimGrid,dimBlock>>>(d->d_outi,d->d_tx_outi);
 
   // look at output
   /*char * odata = (char *)malloc(sizeof(char)*384*4*NANTS*NANTS*2*halfFac);
@@ -617,7 +493,7 @@ __global__ void sum_beam(unsigned char * input, float * output) {
   __shared__ float summ[512];
   int bidx = blockIdx.x;
   int tidx = threadIdx.x;
-  int idx = bidx*256+tidx;
+  //int idx = bidx*256+tidx;
   int bm = (int)(bidx/48);
   int ch = (int)(bidx % 48);
 
@@ -675,7 +551,7 @@ void dbeamformer(dmem * d) {
   const long long int strideB = (NBEAMS/2)*4*(NANTS/2)*8*2*2;
   const long long int strideC = (NPACKETS_PER_BLOCK/4)*NBEAMS/2;
   const int batchCount = NCHAN_PER_PACKET/8;
-  long long int i1, i2, o1;
+  long long int i1, i2;//, o1;
   
   // create streams
   cudaStream_t stream;
@@ -790,13 +666,13 @@ __global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, floa
   int idx = (int)(iidx % (128*(NANTS/2)*(NBEAMS/2)));
   int bm = (int)(idx / (128*(NANTS/2)));
   int tactp = (int)(idx % (128*(NANTS/2)));
-  int t = (int)(tactp / (32*(NANTS/2)));
+  //int t = (int)(tactp / (32*(NANTS/2)));
   int actp = (int)(tactp % (32*(NANTS/2)));
   int a = (int)(actp / 32);
   int ctp = (int)(actp % 32);
-  int c = (int)(ctp / 4);
+  //int c = (int)(ctp / 4);
   int tp = (int)(ctp % 4);
-  int t2 = (int)(tp / 2);
+  //int t2 = (int)(tp / 2);
   int pol = (int)(tp % 2);
   int widx = (a+48*iArm)*(NCHAN_PER_PACKET/8)*2*2 + fq*2*2 + pol*2;
   
@@ -843,18 +719,19 @@ void calc_weights(dmem * d) {
   cudaMalloc((void **)(&d_calibs), sizeof(float)*NANTS*(NCHAN_PER_PACKET/8)*2*2);
 
   // deal with antpos and calibs
-  int iant, found;
+  //int iant;
+  //int found;
   for (int i=0;i<NANTS;i++) {
     antpos_e[i] = d->h_winp[2*i];
     antpos_n[i] = d->h_winp[2*i+1];
   }
   for (int i=0;i<NANTS*(NCHAN_PER_PACKET/8)*2;i++) {
 
-    iant = (int)(i/((NCHAN_PER_PACKET/8)*2));
-
-    found = 0;
-    for (int j=0;j<d->nflags;j++)
-      if (d->flagants[j]==iant) found = 1;
+    // DEBUG CODE?
+    //iant = (int)(i/((NCHAN_PER_PACKET/8)*2));
+    //found = 0;
+    //for (int j=0;j<d->nflags;j++)
+    //if (d->flagants[j]==iant) found = 1;
 
     calibs[2*i] = d->h_winp[2*NANTS+2*i];
     calibs[2*i+1] = d->h_winp[2*NANTS+2*i+1];
@@ -1087,7 +964,7 @@ int main (int argc, char *argv[]) {
   // test mode
   FILE *fin, *fout;
   uint64_t output_size;
-  char * output_data, * o1;
+  char * output_data;//, * o1;
   if (test) {
 
     // read one block of input data    
@@ -1135,7 +1012,7 @@ int main (int argc, char *argv[]) {
     // free
     free(d.h_input);
     free(output_data);
-    free(o1);
+    //free(o1);
     deallocate(&d,bf);
 
     exit(1);
@@ -1213,18 +1090,18 @@ int main (int argc, char *argv[]) {
   else
     syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS);
   uint64_t  bytes_read = 0;
-  char * block;
+  //char * block;
   char * output_buffer;
   output_buffer = (char *)malloc(block_out);
   uint64_t written, block_id;
   
   // get things started
   bool observation_complete=0;
-  bool started = 0;
+  //bool started = 0;
   syslog(LOG_INFO, "starting observation");
   int blocks = 0;
-  clock_t begin, end;
-  double time_spent;
+  //clock_t begin, end;
+  //double time_spent;
   
   while (!observation_complete) {
 

From e260867d558da934b996c25504c79b5d9c7bc1d7 Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Fri, 21 Jun 2024 16:24:16 -0700
Subject: [PATCH 15/30] Add more dependencies, clean CMake defaults to cuBLAS
 only

---
 CMakeLists.txt     | 220 +++++++++++++++++++++++++++++++--------------
 README.md          |   4 +
 include/dsaX.h     |  44 +++++++--
 src/CMakeLists.txt |  62 +++++++------
 src/dsaX_bfCorr.cu |   9 +-
 5 files changed, 231 insertions(+), 108 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4d276ea..2da1445 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,45 +25,21 @@ set(CMAKE_CXX_FLAGS "-pthread")
 # add a directory for cmake modules                                                                                                                                                                                
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
 
-# DSA_XENGINE may be built to run using CUDA. Future version may be
+# DSA_XENGINE may be built to run using CUDA or CPU. Future version may be
 # written for HIP or SYCL, which we call the
 # Target type. By default, the target is CUDA.
+#---------------------------------------------
+
+# Set by environment variable if visible
 if(DEFINED ENV{DSA_XENGINE_TARGET})
   set(DEFTARGET $ENV{DSA_XENGINE_TARGET})
 else()
   set(DEFTARGET "CUDA")
 endif()
 
-set(VALID_TARGET_TYPES CUDA) #HIP SYCL
-set(DSA_XENGINE_TARGET_TYPE
-  "${DEFTARGET}"
-  CACHE STRING "Choose the type of target, options are: ${VALID_TARGET_TYPES}")
-set_property(CACHE DSA_XENGINE_TARGET_TYPE PROPERTY STRINGS CUDA)
-
-# CUDA specific part of CMakeLists
-#set(CMAKE_CUDA_EXTENSIONS OFF)
-find_package(CUDAToolkit REQUIRED)
-
-if(DEFINED ENV{DSA_XENGINE_GPU_ARCH})
-  set(DSA_XENGINE_DEFAULT_GPU_ARCH $ENV{DSA_XENGINE_GPU_ARCH})
-else()
-  set(DSA_XENGINE_DEFAULT_GPU_ARCH sm_70)
-endif()
-if(NOT DSA_XENGINE_GPU_ARCH)
-  message(STATUS "Building DSA_XENGINE for GPU ARCH " "${DSA_XENGINE_DEFAULT_GPU_ARCH}")
-endif()
-
-set(DSA_XENGINE_GPU_ARCH
-  ${DSA_XENGINE_DEFAULT_GPU_ARCH}
-  CACHE STRING "set the GPU architecture (sm_60, sm_70, sm_80 sm_90)")
-set_property(CACHE DSA_XENGINE_GPU_ARCH PROPERTY STRINGS sm_60 sm_70 sm_80 sm_90)
-set(DSA_XENGINE_GPU_ARCH_SUFFIX
-  ""
-  CACHE STRING "set the GPU architecture suffix (virtual, real). Leave empty for no suffix.")
-set_property(CACHE DSA_XENGINE_GPU_ARCH_SUFFIX PROPERTY STRINGS "real" "virtual" " ")
-#set(CMAKE_CUDA_ARCHITECTURES ${DSA_XENGINE_GPU_ARCH})
-#mark_as_advanced(DSA_XENGINE_GPU_ARCH_SUFFIX)
-#mark_as_advanced(CMAKE_CUDA_ARCHITECTURES)
+set(VALID_TARGET_TYPES CUDA CPU) #HIP SYCL
+set(DSA_XENGINE_TARGET_TYPE "${DEFTARGET}" CACHE STRING "Choose the type of target, options are: ${VALID_TARGET_TYPES}")
+set_property(CACHE DSA_XENGINE_TARGET_TYPE PROPERTY STRINGS "CUDA" "CPU") # HIP SYCL
 
 string(TOUPPER ${DSA_XENGINE_TARGET_TYPE} CHECK_TARGET_TYPE)
 list(FIND VALID_TARGET_TYPES ${CHECK_TARGET_TYPE} TARGET_TYPE_VALID)
@@ -72,7 +48,8 @@ if(TARGET_TYPE_VALID LESS 0)
   message(SEND_ERROR "Please specify a valid DSA_XENGINE_TARGET_TYPE type! Valid target types are:" "${VALID_TARGET_TYPES}")
 endif()
 
-# Git
+# Git helpers
+#------------
 find_package(Git)
 if(GIT_FOUND)
   execute_process(
@@ -106,66 +83,173 @@ endif(GIT_FOUND)
 # Use ExternalProject_Add for CUTLASS (long build time, version 2.11.0 for sm_8x arch)
 include(ExternalProject)
 
-# Get TCC dependency
-option(DSA_XENGINE_USE_TCC "Use TensorCoreCorrelators for correlatorss" ON)
-if(DSA_XENGINE_USE_TCC)
-  option(DSA_XENGINE_DOWNLOAD_TCC "Download, build, link (and install) TCC" OFF)
-  if(DSA_XENGINE_DOWNLOAD_TCC)
-    ExternalProject_Add(TCC
-      GIT_REPOSITORY https://git.astron.nl/RD/tensor-core-correlator
-      #GIT_TAG        11d8a4a504d7073a2a33b81e1e387b12e58a420c
-      )
+# Use FetchContent for lightweight dependencies
+include(FetchContent)
+
+# CUDA based dependencies and options
+#------------------------------------
+if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA)
+
+  # CUDA specific part of CMakeLists
+  #set(CMAKE_CUDA_EXTENSIONS OFF)
+  find_package(CUDAToolkit REQUIRED)
+
+  # Get GPU architecture from environment, or set default (sm_80)
+  if(DEFINED ENV{DSA_XENGINE_GPU_ARCH})
+    set(DSA_XENGINE_DEFAULT_GPU_ARCH $ENV{DSA_XENGINE_GPU_ARCH})
   else()
-    find_package(libtcc REQUIRED)
+    set(DSA_XENGINE_DEFAULT_GPU_ARCH sm_80)
   endif()
-endif()
   
-# Get CUTLASS dependency
-option(DSA_XENGINE_USE_CUTLASS "Use CUTLASS for GEMMs" ON)
-if(DSA_XENGINE_USE_CUTLASS)
-  option(DSA_XENGINE_DOWNLOAD_CUTLASS "Download, build (only the required kernels) link (and install) CUTLASS" OFF)
-  if(DSA_XENGINE_DOWNLOAD_CUTLASS)
-    # Custom CUTLASS build
-    ExternalProject_Add(NvidiaCutlass
-      GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
-      GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
+  if(NOT DSA_XENGINE_GPU_ARCH)
+    message(STATUS "Building DSA_XENGINE for GPU ARCH " "${DSA_XENGINE_DEFAULT_GPU_ARCH}")
+  endif()
+  
+  set(DSA_XENGINE_GPU_ARCH
+    ${DSA_XENGINE_DEFAULT_GPU_ARCH}
+    CACHE STRING "set the GPU architecture (sm_60, sm_70, sm_80 sm_90)")
+  set_property(CACHE DSA_XENGINE_GPU_ARCH PROPERTY STRINGS sm_60 sm_70 sm_80 sm_90)
+  set(DSA_XENGINE_GPU_ARCH_SUFFIX
+    ""
+    CACHE STRING "set the GPU architecture suffix (virtual, real). Leave empty for no suffix.")
+  set_property(CACHE DSA_XENGINE_GPU_ARCH_SUFFIX PROPERTY STRINGS "real" "virtual" " ")
+  #set(CMAKE_CUDA_ARCHITECTURES ${DSA_XENGINE_GPU_ARCH})
+  #mark_as_advanced(DSA_XENGINE_GPU_ARCH_SUFFIX)
+  #mark_as_advanced(CMAKE_CUDA_ARCHITECTURES)
+  
+  # Set CUDA based methods and dependencies
+  #----------------------------------------
+  
+  # This is the default GPU method
+  option(DSA_XENGINE_ENABLE_CUBLAS "Use cuBLAS for correlatorss" ON)
+
+  # All other GPU methods can be enabled at compile time and
+  # toggled for use at run time, if enabled.
+  
+  # Get TCC dependency
+  option(DSA_XENGINE_ENABLE_TCC "Use TensorCoreCorrelators for correlatorss" OFF)
+  if(DSA_XENGINE_ENABLE_TCC)
+    option(DSA_XENGINE_DOWNLOAD_TCC "Download, build, link (and install) TCC" OFF)
+    if(DSA_XENGINE_DOWNLOAD_TCC)
+      ExternalProject_Add(TCC
+	GIT_REPOSITORY https://git.astron.nl/RD/tensor-core-correlator
+	#GIT_TAG        11d8a4a504d7073a2a33b81e1e387b12e58a420c
+	CMAKE_ARGS "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
+	)
+    else()
+      find_package(libtcc REQUIRED)
+    endif()
+  endif()
+  
+  # Get CUTLASS dependency
+  option(DSA_XENGINE_ENABLE_CUTLASS "Use CUTLASS for GEMMs" OFF)
+  if(DSA_XENGINE_ENABLE_CUTLASS)
+    option(DSA_XENGINE_DOWNLOAD_CUTLASS "Download, build (only the required kernels) link (and install) CUTLASS" OFF)
+    if(DSA_XENGINE_DOWNLOAD_CUTLASS)
+      # Custom CUTLASS build
+      ExternalProject_Add(NvidiaCutlass
+	GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
+	GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
+	CMAKE_ARGS
+	"-DCUTLASS_NVCC_ARCHS_ENABLED=89"
+	"-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex"
+	"-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
+	)
+    else()
+      find_package(NvidiaCutlass REQUIRED)
+    endif()
+  endif()
+  
+  # Get MAGMA dependency
+  option(DSA_XENGINE_ENABLE_MAGMA "Use MAGMA for GEMMs" OFF)
+  if(DSA_XENGINE_ENABLE_MAGMA)
+    option(DSA_XENGINE_DOWNLOAD_MAGMA "Download, build (only the required kernels) link (and install) MAGMA" OFF)
+    if(DSA_XENGINE_DOWNLOAD_MAGMA)
+      # Custom MAGMA build
+      ExternalProject_Add(Magma
+	URL https://icl.utk.edu/projectsfiles/magma/downloads/magma-2.8.0.tar.gz
+	CMAKE_ARGS
+	"-DMAGMA_ENABLE_CUDA=ON"
+	"-DGPU_TARGET=sm_80"
+	"-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
+	)
+    else()
+      find_package(Magma REQUIRED)
+    endif()
+  endif()
+  
+  # Get XGPU dependency (fix install)
+  option(DSA_XENGINE_ENABLE_XGPU "Use xGPU for correlatorss" OFF)
+  if(DSA_XENGINE_ENABLE_XGPU)
+    option(DSA_XENGINE_DOWNLOAD_XGPU "Download and build xGPU" OFF)
+    if(DSA_XENGINE_DOWNLOAD_XGPU)
+      # Download, build and install
+      FetchContent_Declare(    
+	xGPU
+	GIT_REPOSITORY https://github.com/cpviolator/xGPU.git
+	#GIT_TAG        13b7fff1eac497236eb9c38e179aed3b532a88f2
+	)
+      FetchContent_MakeAvailable(XGPU)
+    else()
+      # Find and link to local install
+      find_package(xGPU REQUIRED)
+    endif()
+  endif()
+
+endif() # CUDA functionality
+
+# Get CPU based dependencies
+# Get OPENBLAS dependency
+option(DSA_XENGINE_ENABLE_OPENBLAS "Use OPENBLAS for GEMMs" OFF)
+if(DSA_XENGINE_ENABLE_OPENBLAS)
+  option(DSA_XENGINE_DOWNLOAD_OPENBLAS "Download, build, link, and install OPENBLAS" OFF)
+  if(DSA_XENGINE_DOWNLOAD_OPENBLAS)
+    # Custom OPENBLAS build
+    ExternalProject_Add(Openblas
+      GIT_REPOSITORY https://github.com/OpenMathLib/OpenBLAS.git
+      GIT_TAG ce3f668
       CMAKE_ARGS
-      "-DCUTLASS_NVCC_ARCHS_ENABLED=89"
-      "-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex"
+      #"-DOPENBLAS_ENABLE_CUDA=ON"
+      #"-DGPU_TARGET=sm_80"
       "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
       )
   else()
-    find_package(NvidiaCutlass REQUIRED)
+    find_package(Openblas REQUIRED)
   endif()
 endif()
 
-# Use FetchContent for lightweight dependencies
-include(FetchContent)
-
 # Get psrdada dependency
+option(DSA_XENGINE_ENABLE_PSRDADA "Use PSRDada for correlatorss" ON)
 option(DSA_XENGINE_DOWNLOAD_PSRDADA "Download and build PSRDada" ON)
-if(DSA_XENGINE_DOWNLOAD_PSRDADA) 
+if(DSA_XENGINE_DOWNLOAD_PSRDADA)
+  # Download, build and install
   FetchContent_Declare(
     PSRDada
     GIT_REPOSITORY git://git.code.sf.net/p/psrdada/code
-    #GIT_TAG        008afa70393ae2df11efba0cc8d0b95cda599c02
     )
   FetchContent_MakeAvailable(PSRDada)
 else()
+  # Find and link to local install
   find_package(psrdada REQUIRED)
 endif()
 
-# Get XGPU dependency
-option(DSA_XENGINE_DOWNLOAD_XGPU "Download and build XGPU" ON)
-if(DSA_XENGINE_DOWNLOAD_XGPU) 
+# Get command line util
+
+# Get cli11 dependency
+# FIX ME: het static .hpp version and ship with package
+option(DSA_XENGINE_ENABLE_CLI11 "Enable CLI11 (required)" ON)
+option(DSA_XENGINE_DOWNLOAD_CLI11 "Download and build Cli11" ON)
+if(DSA_XENGINE_DOWNLOAD_CLI11)
+  # Download, build and install
   FetchContent_Declare(
-    xGPU
-    GIT_REPOSITORY https://github.com/cpviolator/xGPU.git
-    #GIT_TAG        13b7fff1eac497236eb9c38e179aed3b532a88f2
+    CLI11
+    GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git
+    GIT_TAG main
     )
-  FetchContent_MakeAvailable(XGPU)
+  FetchContent_MakeAvailable(CLI11)
 else()
-  find_package(xGPU REQUIRED)
+  # Find and link to local install
+  find_package(CLI11 REQUIRED)
 endif()
 
 # Add src, utils, legacy
diff --git a/README.md b/README.md
index 03fe5e3..4a27ba5 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,9 @@
 # dsa110-xengine
 
+<p align="center">
+    <a href="https://github.com/dsa110/dsa110-xengine/commits/master"><img src="https://img.shields.io/github/last-commit/dsa110/dsa110-xengine" alt="GitHub last commit"></a>
+    <a href="https://github.com/dsa110/dsa110-xengine/commits/master"><img src="https://img.shields.io/github/commit-activity/y/dsa110/dsa110-xengine" alt="GitHub commit activity the past year"></a>
+</p>
 
 This repo contains code used for the DSA X-engine. The requirements are to:
  - capture SNAP F-engine packets on an ethernet interface, and place them in a psrdada buffer
diff --git a/include/dsaX.h b/include/dsaX.h
index c7afcb0..2fe6246 100644
--- a/include/dsaX.h
+++ b/include/dsaX.h
@@ -1,15 +1,10 @@
-// -*- c++ -*-
-/* assumes input and output block size is appropriate - will seg fault otherwise*/
-/*
-Workflow is similar for BF and corr applications
- - copy data to GPU, convert to half-precision and calibrate while reordering
- - do matrix operations to populate large output vector
- */
 #include <iostream>
 #include <algorithm>
+#include <complex>
 using std::cout;
 using std::cerr;
 using std::endl;
+#include <vector>
 #include <stdio.h>
 #include <stdlib.h>
 #include <cmath>
@@ -32,6 +27,7 @@ using std::endl;
 #include "dada_affinity.h"
 #include "ascii_header.h"
 #include "dsaX_def.h"
+#include "dsaX_enums.h"
 
 #include <cuda.h>
 #include "cuda_fp16.h"
@@ -70,8 +66,40 @@ typedef struct dmem {
   
 } dmem;
 
+// Structure that carries BLAS parameters
+typedef struct dsaBLASParam_s {  
+  size_t struct_size; /**< Size of this struct in bytes.  Used to ensure that the host application and DSA see the same struct*/
+  
+  dsaBLASType blas_type;    /**< Type of BLAS computation to perform */
+  
+  // GEMM params
+  dsaBLASOperation trans_a; /**< operation op(A) that is non- or (conj.) transpose. */
+  dsaBLASOperation trans_b; /**< operation op(B) that is non- or (conj.) transpose. */
+  int m;                     /**< number of rows of matrix op(A) and C. */
+  int n;                     /**< number of columns of matrix op(B) and C. */
+  int k;                     /**< number of columns of op(A) and rows of op(B). */
+  int lda;                   /**< leading dimension of two-dimensional array used to store the matrix A. */
+  int ldb;                   /**< leading dimension of two-dimensional array used to store matrix B. */
+  int ldc;                   /**< leading dimension of two-dimensional array used to store matrix C. */
+  int a_offset;              /**< position of the A array from which begin read/write. */
+  int b_offset;              /**< position of the B array from which begin read/write. */
+  int c_offset;              /**< position of the C array from which begin read/write. */
+  int a_stride;              /**< stride of the A array in strided(batched) mode */
+  int b_stride;              /**< stride of the B array in strided(batched) mode */
+  int c_stride;              /**< stride of the C array in strided(batched) mode */
+  std::complex<double> alpha;             /**< scalar used for multiplication. */
+  std::complex<double>  beta;             /**< scalar used for multiplication. If beta==0, C does not have to be a valid input. */
+  
+  // Common params
+  int batch_count;             /**< number of pointers contained in arrayA, arrayB and arrayC. */
+  dsaBLASDataType data_type;   /**< Specifies if using S(C) or D(Z) BLAS type */
+  dsaBLASDataOrder data_order; /**< Specifies if using Row or Column major */
+  
+} dsaBLASParam;
+  
+
 // Initialise device memory
-void initialize(dmem * d, int bf);
+void initialize_device_memory(dmem * d, int bf);
 
 // Deallocate device memory
 void deallocate(dmem * d, int bf);
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 748f00b..aabd03f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,32 +1,40 @@
 enable_language(CUDA)
 
-include_directories(..//include)
-include_directories(${PSRDada_SOURCE_DIR}/src)
-include_directories(${xGPU_SOURCE_DIR}/src)
-include_directories(${NvidiaCutlass_DIR}/../../../include)
-include_directories(${NvidiaCutlass_DIR}/../../../include/cutlass/util)
-
-set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so)
-set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a)
-set(NvidiaCutlass_LIB ${NvidiaCutlass_DIR}/../../../lib64/libcutlass.so)
-
-# Some simple CUTLASS examples to test linking/benching
-#------------------------------------------------------
-add_executable(planar_complex planar_complex.cu)
-target_link_libraries(planar_complex ${NvidiaCutlass_LIB})
-
-add_executable(10_planar_complex 10_planar_complex.cu)
-target_link_libraries(10_planar_complex ${NvidiaCutlass_LIB})
-
-add_executable(11_planar_complex_array 11_planar_complex_array.cu)
-target_link_libraries(11_planar_complex_array ${NvidiaCutlass_LIB})
-#------------------------------------------------------
-
-# DSA Fast Time Domain CUTLASS interface
-#---------------------------------------
-add_executable(dsaX_cutlass_interface dsaX_cutlass_interface.cu)
-target_link_libraries(dsaX_cutlass_interface ${NvidiaCutlass_LIB})
-#---------------------------------------
+include_directories(../include)
+
+if(DSA_XENGINE_ENABLE_PSRDADA)
+  include_directories(${PSRDada_SOURCE_DIR}/src)
+  set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so)
+endif()
+
+if(DSA_XENGINE_ENABLE_XGPU) 
+  include_directories(${xGPU_SOURCE_DIR}/src)
+  set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a)
+endif()
+
+if(DSA_XENGINE_ENABLE_CUTLASS) 
+  include_directories(${NvidiaCutlass_DIR}/../../../include)
+  include_directories(${NvidiaCutlass_DIR}/../../../include/cutlass/util)
+  set(NvidiaCutlass_LIB ${NvidiaCutlass_DIR}/../../../lib64/libcutlass.so)
+
+  # Some simple CUTLASS examples to test linking/benching
+  #------------------------------------------------------
+  add_executable(planar_complex planar_complex.cu)
+  target_link_libraries(planar_complex ${NvidiaCutlass_LIB})
+  
+  add_executable(10_planar_complex 10_planar_complex.cu)
+  target_link_libraries(10_planar_complex ${NvidiaCutlass_LIB})
+  
+  add_executable(11_planar_complex_array 11_planar_complex_array.cu)
+  target_link_libraries(11_planar_complex_array ${NvidiaCutlass_LIB})
+  #------------------------------------------------------
+
+  # DSA Fast Time Domain CUTLASS interface
+  #---------------------------------------
+  add_executable(dsaX_cutlass_interface dsaX_cutlass_interface.cu)
+  target_link_libraries(dsaX_cutlass_interface ${NvidiaCutlass_LIB})
+  #---------------------------------------  
+endif()
 
 # DSA Fast Time Domain
 #---------------------
diff --git a/src/dsaX_bfCorr.cu b/src/dsaX_bfCorr.cu
index e992394..d1b7a70 100644
--- a/src/dsaX_bfCorr.cu
+++ b/src/dsaX_bfCorr.cu
@@ -49,7 +49,7 @@ using std::endl;
 int DEBUG = 1;
 
 // allocate device memory
-void initialize(dmem * d, int bf) {
+void initialize_device_memory(dmem * d, int bf) {
   
   // for correlator
   if (bf==0) {
@@ -337,7 +337,7 @@ void dcorrelator(dmem * d) {
   cudaMemcpy(d->d_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,cudaMemcpyHostToDevice);
 
   // reorder input
-  reorder_input(d->d_input,d->d_tx,d->d_r,d->d_i);
+  reorder_input(d->d_input, d->d_tx, d->d_r, d->d_i);
 
   // not sure if essential
   cudaDeviceSynchronize();
@@ -771,7 +771,7 @@ void calc_weights(dmem * d) {
 
 int main (int argc, char *argv[]) {
 
-  cudaSetDevice(1);
+  cudaSetDevice(0);
   
   // startup syslog message
   // using LOG_LOCAL0
@@ -793,7 +793,6 @@ int main (int argc, char *argv[]) {
   int test = 0;
   char ftest[200], fflagants[200], fcalib[200];
   float sfreq = 1498.75;
-
   
   while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:bdh")) != -1)
     {
@@ -927,7 +926,7 @@ int main (int argc, char *argv[]) {
 
   // allocate device memory
   dmem d;
-  initialize(&d,bf);
+  initialize_device_memory(&d,bf);
 
   // set up for beamformer
   FILE *ff;

From 0656729e526569115a94f4c85cb1b3a940cd1733 Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Fri, 21 Jun 2024 17:40:51 -0700
Subject: [PATCH 16/30] Add test directory, googletest, rename file

---
 CMakeLists.txt                                | 30 +++++++++++++++----
 src/CMakeLists.txt                            | 26 ++++++++--------
 ...fCorr.cu => dsaX_beamformer_correlator.cu} |  0
 3 files changed, 37 insertions(+), 19 deletions(-)
 rename src/{dsaX_bfCorr.cu => dsaX_beamformer_correlator.cu} (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2da1445..f29ca32 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -233,12 +233,10 @@ else()
   find_package(psrdada REQUIRED)
 endif()
 
-# Get command line util
-
-# Get cli11 dependency
-# FIX ME: het static .hpp version and ship with package
+# Get CLI11 dependency
+# FIX ME: get static .hpp version and ship with package
 option(DSA_XENGINE_ENABLE_CLI11 "Enable CLI11 (required)" ON)
-option(DSA_XENGINE_DOWNLOAD_CLI11 "Download and build Cli11" ON)
+option(DSA_XENGINE_DOWNLOAD_CLI11 "Download and build CLI11" ON)
 if(DSA_XENGINE_DOWNLOAD_CLI11)
   # Download, build and install
   FetchContent_Declare(
@@ -252,9 +250,29 @@ else()
   find_package(CLI11 REQUIRED)
 endif()
 
-# Add src, utils, legacy
+# Get GoogleTest dependency (fetched from upstream, or located via a local install)
+# FIX ME: GIT_TAG tracks main; pin a release tag or commit hash for reproducible builds
+option(DSA_XENGINE_ENABLE_GOOGLETEST "Enable GOOGLETEST (required)" ON)
+option(DSA_XENGINE_DOWNLOAD_GOOGLETEST "Download and build GOOGLETEST" ON)
+if(DSA_XENGINE_DOWNLOAD_GOOGLETEST)
+  # Download, build and install
+  FetchContent_Declare(
+    GOOGLETEST
+    GIT_REPOSITORY https://github.com/google/googletest.git
+    GIT_TAG main
+    )
+  FetchContent_MakeAvailable(GOOGLETEST)
+else()
+  # Find a local install; the package name is GTest (GTestConfig.cmake / FindGTest)
+  find_package(GTest REQUIRED)
+endif()
+
+
+# Add src, utils, include, tests, and legacy
 add_subdirectory(src)
 add_subdirectory(utils)
+add_subdirectory(include)
+add_subdirectory(tests)
 option(DSA_XENGINE_BUILD_LEGACY "Build legacy code (will not install if built)" OFF)
 if(DSA_XENGINE_BUILD_LEGACY)
   add_subdirectory(legacy)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index aabd03f..290b414 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -38,28 +38,28 @@ endif()
 
 # DSA Fast Time Domain
 #---------------------
-add_executable(dsaX_bfCorr dsaX_bfCorr.cu)
-target_link_libraries(dsaX_bfCorr ${CUDA_cublas_LIBRARY} ${PSRDada_LIB})
+add_library(dsa OBJECT dsaX_beamformer_correlator.cu)
+
+add_executable(dsaX_beamformer_correlator dsaX_beamformer_correlator.cu)
+target_link_libraries(dsaX_beamformer_correlator ${CUDA_cublas_LIBRARY} ${PSRDada_LIB})
 #---------------------
 
-# install step for header files
-#------------------------------
-set(DSA_XENGINE_HEADERS
+# install step for libraray
+#-----------------------------
+install(TARGETS
   # cmake-format: sortable
-  dsaX_capture.h
-  dsaX_capture_manythread.h
-  dsaX_capture_pcap.h
-  dsaX_def.h
-  dsaX_cutlass_interface.h
+  dsa
+  LIBRARY DESTINATION
+  lib
   )
-install(FILES ${DSA_XENGINE_HEADERS} DESTINATION include)
-#------------------------------
+#-----------------------------
+
 
 # install step for executables
 #-----------------------------
 install(TARGETS
   # cmake-format: sortable
-  dsaX_bfCorr
+  dsaX_beamformer_correlator
   RUNTIME DESTINATION
   bin
   )
diff --git a/src/dsaX_bfCorr.cu b/src/dsaX_beamformer_correlator.cu
similarity index 100%
rename from src/dsaX_bfCorr.cu
rename to src/dsaX_beamformer_correlator.cu

From a50da91b41124c0ddfc7fb13e2ff6822e45561b3 Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Fri, 21 Jun 2024 20:55:21 -0700
Subject: [PATCH 17/30] Restructure headers, create dsalib, move executable out
 to tests

---
 include/dsaX.h                    |  29 +-
 include/dsaX_def.h                |   6 +-
 src/CMakeLists.txt                |  25 +-
 src/dsaX_beamformer_correlator.cu | 586 ++----------------------------
 4 files changed, 68 insertions(+), 578 deletions(-)

diff --git a/include/dsaX.h b/include/dsaX.h
index 2fe6246..ddd351a 100644
--- a/include/dsaX.h
+++ b/include/dsaX.h
@@ -1,9 +1,8 @@
+#pragma once 
+
 #include <iostream>
 #include <algorithm>
 #include <complex>
-using std::cout;
-using std::cerr;
-using std::endl;
 #include <vector>
 #include <stdio.h>
 #include <stdlib.h>
@@ -18,21 +17,17 @@ using std::endl;
 #include <syslog.h>
 #include <pthread.h>
 
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "multilog.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
-#include "dsaX_def.h"
-#include "dsaX_enums.h"
+#include "dsaX_cuda_headers.h"
+#include "dsaX_psrdada_headers.h"
+
+// required to prevent overflow in corr matrix multiply
+#define halfFac 4
+
+// beam sep
+#define sep 1.0 // arcmin
 
-#include <cuda.h>
-#include "cuda_fp16.h"
-#include <cublas_v2.h>
-#include <cuda_runtime.h>
+/* global variables */
+//#define DEBUG;
 
 // define structure that carries around device memory
 typedef struct dmem {
diff --git a/include/dsaX_def.h b/include/dsaX_def.h
index 415e83b..c23ed15 100644
--- a/include/dsaX_def.h
+++ b/include/dsaX_def.h
@@ -1,5 +1,4 @@
-#ifndef __DSAX_DEF_H
-#define __DSAX_DEF_H
+#pragma once
 
 #include "dada_def.h"
 
@@ -93,6 +92,3 @@
 #define NBMS 256
 #define P_SIZE 4108
 #define NWAIT 100000
-
-#endif 
-
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 290b414..a55c8fe 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -2,21 +2,32 @@ enable_language(CUDA)
 
 include_directories(../include)
 
+# DSA Fast Time Domain library
+#-----------------------------
+add_library(dsa SHARED dsaX_cuda_interface.cu dsaX_blas_interface.cu dsaX_beamformer_correlator.cu)
+
+if(CUDAToolkit_FOUND)
+  target_link_libraries(dsa PUBLIC CUDA::cudart) # keyword form: CMake forbids mixing it with the plain form
+endif()
+
 if(DSA_XENGINE_ENABLE_PSRDADA)
   include_directories(${PSRDada_SOURCE_DIR}/src)
   set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so)
+  target_link_libraries(dsa PUBLIC ${PSRDada_LIB})
 endif()
 
 if(DSA_XENGINE_ENABLE_XGPU) 
   include_directories(${xGPU_SOURCE_DIR}/src)
   set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a)
+  target_link_libraries(dsa PUBLIC ${XGPU_LIB})
 endif()
 
 if(DSA_XENGINE_ENABLE_CUTLASS) 
   include_directories(${NvidiaCutlass_DIR}/../../../include)
   include_directories(${NvidiaCutlass_DIR}/../../../include/cutlass/util)
   set(NvidiaCutlass_LIB ${NvidiaCutlass_DIR}/../../../lib64/libcutlass.so)
-
+  target_link_libraries(dsa PUBLIC ${NvidiaCutlass_LIB})
+  
   # Some simple CUTLASS examples to test linking/benching
   #------------------------------------------------------
   add_executable(planar_complex planar_complex.cu)
@@ -36,12 +47,10 @@ if(DSA_XENGINE_ENABLE_CUTLASS)
   #---------------------------------------  
 endif()
 
-# DSA Fast Time Domain
-#---------------------
-add_library(dsa OBJECT dsaX_beamformer_correlator.cu)
-
-add_executable(dsaX_beamformer_correlator dsaX_beamformer_correlator.cu)
-target_link_libraries(dsaX_beamformer_correlator ${CUDA_cublas_LIBRARY} ${PSRDada_LIB})
+if(CUDAToolkit_FOUND)
+  #add_executable(dsaX_beamformer_correlator dsaX_beamformer_correlator.cu)
+  #target_link_libraries(dsaX_beamformer_correlator ${dsa} ${CUDA_cublas_LIBRARY} ${PSRDada_LIB})
+endif()
 #---------------------
 
 # install step for libraray
@@ -59,7 +68,7 @@ install(TARGETS
 #-----------------------------
 install(TARGETS
   # cmake-format: sortable
-  dsaX_beamformer_correlator
+  #dsaX_beamformer_correlator
   RUNTIME DESTINATION
   bin
   )
diff --git a/src/dsaX_beamformer_correlator.cu b/src/dsaX_beamformer_correlator.cu
index d1b7a70..c91c1b7 100644
--- a/src/dsaX_beamformer_correlator.cu
+++ b/src/dsaX_beamformer_correlator.cu
@@ -5,340 +5,61 @@ Workflow is similar for BF and corr applications
  - copy data to GPU, convert to half-precision and calibrate while reordering
  - do matrix operations to populate large output vector
  */
-#include <iostream>
-#include <algorithm>
-using std::cout;
-using std::cerr;
-using std::endl;
-#include <stdio.h>
-#include <stdlib.h>
-#include <cmath>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <netinet/in.h>
-#include <time.h>
-#include <syslog.h>
-#include <pthread.h>
 
-#include "dada_client.h"
-#include "dada_def.h"
-#include "dada_hdu.h"
-#include "multilog.h"
-#include "ipcio.h"
-#include "ipcbuf.h"
-#include "dada_affinity.h"
-#include "ascii_header.h"
 #include "dsaX_def.h"
 #include "dsaX.h"
+#include "dsaX_blas_interface.h"
 
-#include <cuda.h>
-#include "cuda_fp16.h"
-#include <cublas_v2.h>
-#include <cuda_runtime.h>
+//#include <cuda.h>
+//#include "cuda_fp16.h"
+//#include <cublas_v2.h>
+//#include <cuda_runtime.h>
 
-// required to prevent overflow in corr matrix multiply
-#define halfFac 4
+#include "dsaX_cuda_interface.h"
 
-// beam sep
-#define sep 1.0 // arcmin
-
-/* global variables */
 int DEBUG = 1;
 
-// allocate device memory
-void initialize_device_memory(dmem * d, int bf) {
-  
-  // for correlator
-  if (bf==0) {
-    cudaMalloc((void **)(&d->d_input), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
-    cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2);
-    cudaMalloc((void **)(&d->d_i), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2);
-    cudaMalloc((void **)(&d->d_tx), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
-    cudaMalloc((void **)(&d->d_output), sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2);
-    cudaMalloc((void **)(&d->d_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
-    cudaMalloc((void **)(&d->d_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
-    cudaMalloc((void **)(&d->d_tx_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
-    cudaMalloc((void **)(&d->d_tx_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
-  }
-
-  // for beamformer
-  if (bf==1) {
-    cudaMalloc((void **)(&d->d_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2);
-    cudaMalloc((void **)(&d->d_big_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS)*NCHAN_PER_PACKET*2*2);
-    cudaMalloc((void **)(&d->d_tx), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2);
-    cudaMalloc((void **)(&d->d_br), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2);
-    cudaMalloc((void **)(&d->d_bi), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2);
-    cudaMalloc((void **)(&d->weights_r), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8));
-    cudaMalloc((void **)(&d->weights_i), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8));
-    cudaMalloc((void **)(&d->d_bigbeam_r), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2));
-    cudaMalloc((void **)(&d->d_bigbeam_i), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2));
-    cudaMalloc((void **)(&d->d_bigpower), sizeof(unsigned char)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS));
-    cudaMalloc((void **)(&d->d_scf), sizeof(float)*(NBEAMS/2)); // beam scale factor
-    cudaMalloc((void **)(&d->d_chscf), sizeof(float)*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); // beam scale factor
-
-    // input weights: first is [NANTS, E/N], then [NANTS, 48, 2pol, R/I]
-    d->h_winp = (float *)malloc(sizeof(float)*(NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2));
-    d->flagants = (int *)malloc(sizeof(int)*NANTS);
-    d->h_freqs = (float *)malloc(sizeof(float)*(NCHAN_PER_PACKET/8));
-    cudaMalloc((void **)(&d->d_freqs), sizeof(float)*(NCHAN_PER_PACKET/8));
-
-    // timers
-    d->cp = 0.;
-    d->prep = 0.;
-    d->outp = 0.;
-    d->cubl = 0.;
-    
-  }
-  
-}
-
-// deallocate device memory
-void deallocate(dmem * d, int bf) {
-
-  cudaFree(d->d_input);
-
-  if (bf==0) {
-    cudaFree(d->d_r);
-    cudaFree(d->d_i);
-    cudaFree(d->d_tx);
-    cudaFree(d->d_output);
-    cudaFree(d->d_outr);
-    cudaFree(d->d_outi);
-    cudaFree(d->d_tx_outr);
-    cudaFree(d->d_tx_outi);
-  }
-  if (bf==1) {
-    cudaFree(d->d_tx);
-    cudaFree(d->d_br);
-    cudaFree(d->d_bi);
-    cudaFree(d->weights_r);
-    cudaFree(d->weights_i);
-    cudaFree(d->d_bigbeam_r);
-    cudaFree(d->d_bigbeam_i);
-    cudaFree(d->d_bigpower);
-    cudaFree(d->d_scf);
-    cudaFree(d->d_chscf);
-    free(d->h_winp);
-    free(d->flagants);
-    cudaFree(d->d_freqs);
-    free(d->h_freqs);
-  }
-  
-}
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out)
+void dsaX_dbgpu_cleanup(dada_hdu_t * in, dada_hdu_t * out)
 {
-
-  if (dada_hdu_unlock_read (in) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock read on hdu_in");
-    }
+  if (dada_hdu_unlock_read (in) < 0) syslog(LOG_ERR, "could not unlock read on hdu_in");
   dada_hdu_destroy (in);
-
-  if (dada_hdu_unlock_write (out) < 0)
-    {
-      syslog(LOG_ERR, "could not unlock write on hdu_out");
-    }
-  dada_hdu_destroy (out);
-
-} 
-
-void usage()
-{
-fprintf (stdout,
-	 "dsaX_bfCorr [options]\n"
-	 " -c core   bind process to CPU core [no default]\n"
-	 " -d send debug messages to syslog\n"
-	 " -i in_key [default REORDER_BLOCK_KEY]\n"
-	 " -o out_key [default XGPU_BLOCK_KEY]\n"
-	 " -b run beamformer [default is to run correlator]\n"
-	 " -h print usage\n"
-	 " -t binary file for test mode\n"
-	 " -f flagants file\n"
-	 " -a calib file\n"
-	 " -s start frequency (assumes -0.244140625MHz BW)\n");
-}
-
-// kernel to fluff input
-// run with 128 threads and NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 blocks
-__global__ void corr_input_copy(char *input, half *inr, half *ini) {
-
-  int bidx = blockIdx.x; // assume NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128
-  int tidx = threadIdx.x; // assume 128
-  int iidx = bidx*128+tidx;
-  
-  inr[iidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(15)) << 4) >> 4));
-  ini[iidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4));
-
-}
-
-// transpose kernel
-// assume breakdown into tiles of 32x32, and run with 32x8 threads per block
-// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32)
-// here, width is the dimension of the fastest index
-template <typename in_prec, typename out_prec> __global__ void transpose_matrix(in_prec * idata, out_prec * odata) {
-
-  __shared__ in_prec tile[32][33];
   
-  int x = blockIdx.x * 32 + threadIdx.x;
-  int y = blockIdx.y * 32 + threadIdx.y;
-  int width = gridDim.x * 32;
-
-  for (int j = 0; j < 32; j += 8)
-     tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
-
-  __syncthreads();
-
-  x = blockIdx.y * 32 + threadIdx.x;  // transpose block offset
-  y = blockIdx.x * 32 + threadIdx.y;
-  width = gridDim.y * 32;
-
-  for (int j = 0; j < 32; j += 8)
-     odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
-
-}
-
-
-// function to copy and reorder d_input to d_r and d_i
-// input is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
-// output is [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS]
-// starts by running transpose on [NPACKETS_PER_BLOCK * NANTS, NCHAN_PER_PACKET * 2 * 2] matrix in doubleComplex form.
-// then fluffs using simple kernel
-void reorder_input(char *input, char * tx, half *inr, half *ini) {
-
-  // transpose input data
-  dim3 dimBlock(32, 8), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32);
-  transpose_matrix<<<dimGrid,dimBlock>>>(input,tx);
-  corr_input_copy<<<NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128,128>>>(tx,inr,ini);
-}
-
-// kernel to help with reordering output
-// outr and outi are [NANTS, NANTS, NCHAN_PER_PACKET, 2time, 2pol, halfFac]
-// run with NCHAN_PER_PACKET*2*NBASE/128 blocks of 128 threads
-__global__ void corr_output_copy(half *outr, half *outi, float *output, int *indices_lookup) {
-
-  int bidx = blockIdx.x; // assume NCHAN_PER_PACKET*2*NBASE/128
-  int tidx = threadIdx.x; // assume 128
-  int idx = bidx*128+tidx;
-  
-  int baseline = (int)(idx / (NCHAN_PER_PACKET * 2));
-  int chpol = (int)(idx % (NCHAN_PER_PACKET * 2));
-  int ch = (int)(chpol / 2);
-  int base_idx = indices_lookup[baseline];
-  int iidx = base_idx * NCHAN_PER_PACKET + ch;
-  int pol = (int)(chpol % 2);
-
-  float v1=0., v2=0.;
-  
-  for (int i=0;i<halfFac;i++) {
-    v1 += __half2float(outr[(4*iidx+pol)*halfFac+i])+__half2float(outr[(4*iidx+2+pol)*halfFac+i]);
-    v2 += __half2float(outi[(4*iidx+pol)*halfFac+i])+__half2float(outi[(4*iidx+2+pol)*halfFac+i]);
-  }
-
-  output[2*idx] = v1;
-  output[2*idx+1] = v2;
-  
-}
-
-
-// function to copy d_outr and d_outi to d_output
-// inputs are [NCHAN_PER_PACKET, 2 time, 2 pol, NANTS, NANTS]
-// the corr matrices are column major order
-// output needs to be [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex]
-// start with transpose to get [NANTS*NANTS, NCHAN_PER_PACKET*2*2], then sum into output using kernel
-void reorder_output(dmem * d) {
-
-  // transpose input data
-  dim3 dimBlock(32, 8), dimGrid((NANTS*NANTS)/32,(NCHAN_PER_PACKET*2*2*halfFac)/32);
-  transpose_matrix<<<dimGrid,dimBlock>>>(d->d_outr,d->d_tx_outr);
-  transpose_matrix<<<dimGrid,dimBlock>>>(d->d_outi,d->d_tx_outi);
-
-  // look at output
-  /*char * odata = (char *)malloc(sizeof(char)*384*4*NANTS*NANTS*2*halfFac);
-  cudaMemcpy(odata,d->d_tx_outr,384*4*NANTS*NANTS*2*halfFac,cudaMemcpyDeviceToHost);
-  FILE *fout;
-  fout=fopen("test2.test","wb");
-  fwrite(odata,sizeof(char),384*4*NANTS*NANTS*2*halfFac,fout);
-  fclose(fout);*/
-
-  
-  /*
-  // set up for geam
-  cublasHandle_t cublasH = NULL;
-  cudaStream_t stream = NULL;
-  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
-  cublasSetStream(cublasH, stream);
-
-  // transpose output matrices into tx_outr and tx_outi
-  cublasOperation_t transa = CUBLAS_OP_T;
-  cublasOperation_t transb = CUBLAS_OP_N;
-  const int m = NCHAN_PER_PACKET*2*2;
-  const int n = NANTS*NANTS/16; // columns in output
-  const double alpha = 1.0;
-  const double beta = 0.0;
-  const int lda = n;
-  const int ldb = m;
-  const int ldc = ldb;
-  cublasDgeam(cublasH,transa,transb,m,n,
-	      &alpha,(double *)(d->d_outr),
-	      lda,&beta,(double *)(d->d_tx_outr),
-	      ldb,(double *)(d->d_tx_outr),ldc);
-  cublasDgeam(cublasH,transa,transb,m,n,
-	      &alpha,(double *)(d->d_outi),
-	      lda,&beta,(double *)(d->d_tx_outi),
-	      ldb,(double *)(d->d_tx_outi),ldc);
-  */
-  // now run kernel to sum into output
-  int * h_idxs = (int *)malloc(sizeof(int)*NBASE);
-  int * d_idxs;
-  cudaMalloc((void **)(&d_idxs), sizeof(int)*NBASE);
-  int ii = 0;
-  // upper triangular order (column major) to match xGPU (not the same as CASA!)
-  for (int i=0;i<NANTS;i++) {
-    for (int j=0;j<=i;j++) {
-      h_idxs[ii] = i*NANTS + j;
-      ii++;
-    }
-  }
-  cudaMemcpy(d_idxs,h_idxs,sizeof(int)*NBASE,cudaMemcpyHostToDevice);
-
-  // run kernel to finish things
-  corr_output_copy<<<NCHAN_PER_PACKET*2*NBASE/128,128>>>(d->d_tx_outr,d->d_tx_outi,d->d_output,d_idxs);
-
-  /*char * odata = (char *)malloc(sizeof(char)*384*4*NBASE*4);
-  cudaMemcpy(odata,d->d_output,384*4*NBASE*4,cudaMemcpyDeviceToHost);
-  FILE *fout;
-  fout=fopen("test3.test","wb");
-  fwrite(odata,sizeof(char),384*4*NBASE*4,fout);
-  fclose(fout);*/
-
+  if (dada_hdu_unlock_write (out) < 0) syslog(LOG_ERR, "could not unlock write on hdu_out");
+  dada_hdu_destroy (out);
   
-  cudaFree(d_idxs);
-  free(h_idxs);
-  //cudaStreamDestroy(stream);  
+} 
 
+void usage() {
+  fprintf (stdout,
+	   "dsaX_beamformer_correlator [options]\n"
+	   " -c core   bind process to CPU core [no default]\n"
+	   " -d send debug messages to syslog\n"
+	   " -i in_key [default REORDER_BLOCK_KEY]\n"
+	   " -o out_key [default XGPU_BLOCK_KEY]\n"
+	   " -b run beamformer [default is to run correlator]\n"
+	   " -h print usage\n"
+	   " -t binary file for test mode\n"
+	   " -f flagants file\n"
+	   " -a calib file\n"
+	   " -s start frequency (assumes -0.244140625MHz BW)\n");
 }
 
-
-
 // correlator function
 // workflow: copy to device, reorder, stridedBatchedGemm, reorder
-void dcorrelator(dmem * d) {
+void dcorrelator(dmem *d) {
 
   // zero out output arrays
-  cudaMemset(d->d_outr,0,NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half));
-  cudaMemset(d->d_outi,0,NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half));
-  cudaMemset(d->d_output,0,NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float));
+  cudaMemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half));
+  cudaMemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half));
+  cudaMemset(d->d_output, 0, NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float));
   
   // copy to device
-  cudaMemcpy(d->d_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,cudaMemcpyHostToDevice);
+  cudaMemcpy(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2, cudaMemcpyHostToDevice);
 
   // reorder input
-  reorder_input(d->d_input, d->d_tx, d->d_r, d->d_i);
+  reorder_input_device(d->d_input, d->d_tx, d->d_r, d->d_i);
 
+  // ABSTRACT HERE START
   // not sure if essential
   cudaDeviceSynchronize();
   
@@ -401,120 +122,10 @@ void dcorrelator(dmem * d) {
   // destroy stream
   cudaStreamDestroy(stream);
   cublasDestroy(cublasH);
+  // ABSTRACT HERE END
   
   // reorder output data
-  reorder_output(d);
-  
-}
-
-// kernels to reorder and fluff input data for beamformer
-// initial data is [NPACKETS_PER_BLOCK, (NANTS/2), NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]            
-// want [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, (NANTS/2), 8chan, 2 times, 2 pol, 4-bit complex]      // run as 16x16 tiled transpose with 32-byte words 
-// launch with dim3 dimBlock(16, 8) and dim3 dimGrid(Width/16, Height/16)
-// here, width=NCHAN_PER_PACKET/8 is the dimension of the fastest input index
-// dim3 dimBlock1(16, 8), dimGrid1(NCHAN_PER_PACKET/8/16, (NPACKETS_PER_BLOCK)*(NANTS/2)/16);
-__global__ void transpose_input_bf(double * idata, double * odata) {
-
-  __shared__ double tile[16][17][4];
-  
-  int x = blockIdx.x * 16 + threadIdx.x;
-  int y = blockIdx.y * 16 + threadIdx.y;
-  int width = gridDim.x * 16;
-
-  for (int j = 0; j < 16; j += 8) {
-    tile[threadIdx.y+j][threadIdx.x][0] = idata[4*((y+j)*width + x)];
-    tile[threadIdx.y+j][threadIdx.x][1] = idata[4*((y+j)*width + x)+1];
-    tile[threadIdx.y+j][threadIdx.x][2] = idata[4*((y+j)*width + x)+2];
-    tile[threadIdx.y+j][threadIdx.x][3] = idata[4*((y+j)*width + x)+3];
-  }
-  
-  __syncthreads();
-
-  x = blockIdx.y * 16 + threadIdx.x;  // transpose block offset
-  y = blockIdx.x * 16 + threadIdx.y;
-  width = gridDim.y * 16;
-
-  for (int j = 0; j < 16; j += 8) {
-    odata[4*((y+j)*width + x)] = tile[threadIdx.x][threadIdx.y + j][0];
-    odata[4*((y+j)*width + x)+1] = tile[threadIdx.x][threadIdx.y + j][1];
-    odata[4*((y+j)*width + x)+2] = tile[threadIdx.x][threadIdx.y + j][2];
-    odata[4*((y+j)*width + x)+3] = tile[threadIdx.x][threadIdx.y + j][3];
-  }
-
-}
-
-// kernel to fluff input bf data
-// run with NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 blocks of 128 threads
-__global__ void fluff_input_bf(char * input, half * dr, half * di) {
-
-  int bidx = blockIdx.x; // assume NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128
-  int tidx = threadIdx.x; // assume 128
-  int idx = bidx*128+tidx;
-
-  dr[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(15)) << 4) >> 4)));
-  di[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(240))) >> 4)));
-  
-}
-
-// transpose, add and scale kernel for bf
-// assume breakdown into tiles of 16x16, and run with 16x8 threads per block
-// launch with dim3 dimBlock(16, 8) and dim3 dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16)
-// scf is a per-beam scale factor to enable recasting as unsigned char
-__global__ void transpose_scale_bf(half * ir, half * ii, unsigned char * odata) {
-
-  __shared__ float tile[16][17];
-  
-  int x = blockIdx.x * 16 + threadIdx.x;
-  int y = blockIdx.y * 16 + threadIdx.y;
-  int width = gridDim.x * 16;
-  float dr, di;
-
-  for (int j = 0; j < 16; j += 8) {
-    dr = (float)(ir[(y+j)*width + x]);
-    di = (float)(ii[(y+j)*width + x]);
-    tile[threadIdx.y+j][threadIdx.x] = (dr*dr+di*di);
-  }
-
-  __syncthreads();
-
-  x = blockIdx.y * 16 + threadIdx.x;  // transpose block offset
-  y = blockIdx.x * 16 + threadIdx.y;
-  width = gridDim.y * 16;
-
-  for (int j = 0; j < 16; j += 8)
-    odata[(y+j)*width + x] = (unsigned char)(tile[threadIdx.x][threadIdx.y + j]/128.);
-
-}
-
-// sum over all times in output beam array
-// run with (NCHAN_PER_PACKET/8)*(NBEAMS/2) blocks of (NPACKETS_PER_BLOCK/4) threads
-__global__ void sum_beam(unsigned char * input, float * output) {
-
-  __shared__ float summ[512];
-  int bidx = blockIdx.x;
-  int tidx = threadIdx.x;
-  //int idx = bidx*256+tidx;
-  int bm = (int)(bidx/48);
-  int ch = (int)(bidx % 48);
-
-  summ[tidx] = (float)(input[bm*256*48 + tidx*48 + ch]);
-
-  __syncthreads();
-
-  if (tidx<256) {
-    summ[tidx] += summ[tidx+256];
-    summ[tidx] += summ[tidx+128];
-    summ[tidx] += summ[tidx+64];
-    summ[tidx] += summ[tidx+32];
-    summ[tidx] += summ[tidx+16];
-    summ[tidx] += summ[tidx+8];
-    summ[tidx] += summ[tidx+4];
-    summ[tidx] += summ[tidx+2];
-    summ[tidx] += summ[tidx+1];
-  }
-
-  if (tidx==0) output[bidx] = summ[tidx];
-  
+  reorder_output_device(d);
 }
 
 /*
@@ -635,13 +246,9 @@ void dbeamformer(dmem * d) {
     transpose_scale_bf<<<dimGrid,dimBlock>>>(d->d_bigbeam_r,d->d_bigbeam_i,d->d_bigpower+iArm*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2));
     end = clock();
     d->outp += (float)(end - begin) / CLOCKS_PER_SEC;
-      
-
   }
 
   cudaStreamDestroy(stream);
-
-
   cublasDestroy(cublasH);
 
   // form sum over times
@@ -649,126 +256,9 @@ void dbeamformer(dmem * d) {
   
 }
 
-// kernel to populate an instance of weights matrix [2, (NCHAN_PER_PACKET/8), NBEAMS/2, 4times*(NANTS/2)*8chan*2tim*2pol]
-// run with 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128 blocks of 128 threads
-__global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs) {
-
-  int bidx = blockIdx.x;
-  int tidx = threadIdx.x;
-  int inidx = bidx*128+tidx;  
-  
-  // 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)
-  
-  // get indices
-  int iArm = (int)(inidx / ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)));
-  int iidx = (int)(inidx % ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)));
-  int fq = (int)(iidx / (128*(NANTS/2)*(NBEAMS/2)));
-  int idx = (int)(iidx % (128*(NANTS/2)*(NBEAMS/2)));
-  int bm = (int)(idx / (128*(NANTS/2)));
-  int tactp = (int)(idx % (128*(NANTS/2)));
-  //int t = (int)(tactp / (32*(NANTS/2)));
-  int actp = (int)(tactp % (32*(NANTS/2)));
-  int a = (int)(actp / 32);
-  int ctp = (int)(actp % 32);
-  //int c = (int)(ctp / 4);
-  int tp = (int)(ctp % 4);
-  //int t2 = (int)(tp / 2);
-  int pol = (int)(tp % 2);
-  int widx = (a+48*iArm)*(NCHAN_PER_PACKET/8)*2*2 + fq*2*2 + pol*2;
-  
-  // calculate weights
-  float theta, afac, twr, twi;
-  if (iArm==0) {
-    theta = sep*(127.-bm*1.)*PI/10800.; // radians
-    afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate
-    twr = cos(afac*antpos_e[a+48*iArm]);
-    twi = sin(afac*antpos_e[a+48*iArm]);
-    wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1]));
-    wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1]));
-    //wr[inidx] = __float2half(calibs[widx]);
-    //wi[inidx] = __float2half(calibs[widx+1]);
-  }
-  if (iArm==1) {
-    theta = sep*(127.-bm*1.)*PI/10800.; // radians
-    afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate
-    twr = cos(afac*antpos_n[a+48*iArm]);
-    twi = sin(afac*antpos_n[a+48*iArm]);
-    wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1]));
-    wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1]));
-    //wr[inidx] = __float2half(calibs[widx]);
-    //wi[inidx] = __float2half(calibs[widx+1]);
-  }
-    
-}
-
-// GPU-powered function to populate weights matrix for beamformer
-// file format:
-// sequential pairs of eastings and northings
-// then [NANTS, 48, R/I] calibs
-
-void calc_weights(dmem * d) {
-
-  // allocate
-  float *antpos_e = (float *)malloc(sizeof(float)*NANTS);
-  float *antpos_n = (float *)malloc(sizeof(float)*NANTS);
-  float *calibs = (float *)malloc(sizeof(float)*NANTS*(NCHAN_PER_PACKET/8)*2*2);
-  float *d_antpos_e, *d_antpos_n, *d_calibs;
-  float wnorm;
-  cudaMalloc((void **)(&d_antpos_e), sizeof(float)*NANTS);
-  cudaMalloc((void **)(&d_antpos_n), sizeof(float)*NANTS);
-  cudaMalloc((void **)(&d_calibs), sizeof(float)*NANTS*(NCHAN_PER_PACKET/8)*2*2);
-
-  // deal with antpos and calibs
-  //int iant;
-  //int found;
-  for (int i=0;i<NANTS;i++) {
-    antpos_e[i] = d->h_winp[2*i];
-    antpos_n[i] = d->h_winp[2*i+1];
-  }
-  for (int i=0;i<NANTS*(NCHAN_PER_PACKET/8)*2;i++) {
-
-    // DEBUG CODE?
-    //iant = (int)(i/((NCHAN_PER_PACKET/8)*2));
-    //found = 0;
-    //for (int j=0;j<d->nflags;j++)
-    //if (d->flagants[j]==iant) found = 1;
-
-    calibs[2*i] = d->h_winp[2*NANTS+2*i];
-    calibs[2*i+1] = d->h_winp[2*NANTS+2*i+1];
-
-    wnorm = sqrt(calibs[2*i]*calibs[2*i] + calibs[2*i+1]*calibs[2*i+1]);
-    if (wnorm!=0.0) {
-      calibs[2*i] /= wnorm;
-      calibs[2*i+1] /= wnorm;
-    }
-
-    //if (found==1) {
-    //calibs[2*i] = 0.;
-    //calibs[2*i+1] = 0.;
-    //}
-  }
-
-  //for (int i=0;i<NANTS*(NCHAN_PER_PACKET/8)*2;i++) printf("%f %f\n",calibs[2*i],calibs[2*i+1]);
-  
-  cudaMemcpy(d_antpos_e,antpos_e,NANTS*sizeof(float),cudaMemcpyHostToDevice);
-  cudaMemcpy(d_antpos_n,antpos_n,NANTS*sizeof(float),cudaMemcpyHostToDevice);
-  cudaMemcpy(d_calibs,calibs,NANTS*(NCHAN_PER_PACKET/8)*2*2*sizeof(float),cudaMemcpyHostToDevice);
-
-  // run kernel to populate weights matrix
-  populate_weights_matrix<<<2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128,128>>>(d_antpos_e,d_antpos_n,d_calibs,d->weights_r,d->weights_i,d->d_freqs);  
-  
-  // free stuff
-  cudaFree(d_antpos_e);
-  cudaFree(d_antpos_n);
-  cudaFree(d_calibs);
-  free(antpos_e);
-  free(antpos_n);
-  free(calibs);
-  
-}
 
 // MAIN
-
+#if 0
 int main (int argc, char *argv[]) {
 
   cudaSetDevice(0);
@@ -903,7 +393,7 @@ int main (int argc, char *argv[]) {
 	      return EXIT_FAILURE;
 	    }
 	case 'd':
-	  DEBUG=1;
+	  //DEBUG=1;
 	  syslog (LOG_DEBUG, "Will excrete all debug messages");
 	  break;
 	case 'b':
@@ -1012,7 +502,7 @@ int main (int argc, char *argv[]) {
     free(d.h_input);
     free(output_data);
     //free(o1);
-    deallocate(&d,bf);
+    deallocate_device_memory(&d,bf);
 
     exit(1);
   }
@@ -1154,9 +644,9 @@ int main (int argc, char *argv[]) {
 
   // finish up
   free(output_buffer);
-  deallocate(&d,bf);
+  deallocate_device_memory(&d,bf);
   dsaX_dbgpu_cleanup (hdu_in, hdu_out);
   
 }
-
+#endif
 

From 85555743eb1f7c3e223da7325f3c8d87b4cb774c Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Sat, 22 Jun 2024 01:02:25 -0700
Subject: [PATCH 18/30] Begin work on fully arch-independent correlator and
 beamformer, add a skeleton test suite

---
 CMakeLists.txt                             |   6 +-
 include/CMakeLists.txt                     |  15 +
 include/dsaX.h                             |  10 +-
 include/dsaX_beamformer_correlator.h       |   9 +
 include/dsaX_blas_interface.h              |   5 +
 include/dsaX_cublas_interface.h            |   5 +
 include/dsaX_cuda_headers.h                |   6 +
 include/dsaX_cuda_interface.h              |  31 +
 include/dsaX_enums.h                       |  33 ++
 include/dsaX_psrdada_headers.h             |  12 +
 include/dsaX_utils.h                       |   9 +
 src/10_planar_complex.cu                   | 567 +++++++++++++++++++
 src/11_planar_complex_array.cu             | 628 +++++++++++++++++++++
 src/CMakeLists.txt                         |  14 +-
 src/dsaX_beamformer_correlator.cu          | 122 ++--
 src/dsaX_blas_interface.cu                 |  11 +
 src/dsaX_cublas_interface.cu               |  92 +++
 src/dsaX_cuda_interface.cu                 | 467 +++++++++++++++
 src/dsaX_utils.cpp                         |  30 +
 src/planar_complex.cu                      |  87 +++
 tests/CMakeLists.txt                       |   5 +
 tests/CMakeLists.txt~                      |   5 +
 tests/dsaX_beamformer_correlator_test.cpp  | 399 +++++++++++++
 tests/dsaX_beamformer_correlator_test.cpp~ | 398 +++++++++++++
 {utils => tests/utils}/.gitignore          |   0
 tests/utils/CMakeLists.txt                 |  11 +
 tests/utils/CMakeLists.txt~                |  22 +
 {utils => tests/utils}/gen_packet.py       |   0
 {utils => tests/utils}/get_rms.py          |   0
 {utils => tests/utils}/get_rms_packet.py   |   0
 {utils => tests/utils}/packet.out          | Bin
 {utils => tests/utils}/sockets.py          |   0
 {utils => tests/utils}/test.out            | Bin
 33 files changed, 2909 insertions(+), 90 deletions(-)
 create mode 100644 include/CMakeLists.txt
 create mode 100644 include/dsaX_beamformer_correlator.h
 create mode 100644 include/dsaX_blas_interface.h
 create mode 100644 include/dsaX_cublas_interface.h
 create mode 100644 include/dsaX_cuda_headers.h
 create mode 100644 include/dsaX_cuda_interface.h
 create mode 100644 include/dsaX_enums.h
 create mode 100644 include/dsaX_psrdada_headers.h
 create mode 100644 include/dsaX_utils.h
 create mode 100644 src/10_planar_complex.cu
 create mode 100644 src/11_planar_complex_array.cu
 create mode 100644 src/dsaX_blas_interface.cu
 create mode 100644 src/dsaX_cublas_interface.cu
 create mode 100644 src/dsaX_cuda_interface.cu
 create mode 100644 src/dsaX_utils.cpp
 create mode 100644 src/planar_complex.cu
 create mode 100644 tests/CMakeLists.txt
 create mode 100644 tests/CMakeLists.txt~
 create mode 100644 tests/dsaX_beamformer_correlator_test.cpp
 create mode 100644 tests/dsaX_beamformer_correlator_test.cpp~
 rename {utils => tests/utils}/.gitignore (100%)
 create mode 100644 tests/utils/CMakeLists.txt
 create mode 100644 tests/utils/CMakeLists.txt~
 rename {utils => tests/utils}/gen_packet.py (100%)
 rename {utils => tests/utils}/get_rms.py (100%)
 rename {utils => tests/utils}/get_rms_packet.py (100%)
 rename {utils => tests/utils}/packet.out (100%)
 rename {utils => tests/utils}/sockets.py (100%)
 rename {utils => tests/utils}/test.out (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f29ca32..441ae7f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,7 +22,7 @@ project("DSA_XENGINE" VERSION 1.0.0 LANGUAGES CXX CUDA C)
 set(CMAKE_C_FLAGS "-pthread")
 set(CMAKE_CXX_FLAGS "-pthread")
 
-# add a directory for cmake modules                                                                                                                                                                                
+# add a directory for cmake modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
 
 # DSA_XENGINE may be built to run using CUDA or CPU. Future version may be
@@ -48,6 +48,7 @@ if(TARGET_TYPE_VALID LESS 0)
   message(SEND_ERROR "Please specify a valid DSA_XENGINE_TARGET_TYPE type! Valid target types are:" "${VALID_TARGET_TYPES}")
 endif()
 
+
 # Git helpers
 #------------
 find_package(Git)
@@ -268,9 +269,8 @@ else()
 endif()
 
 
-# Add src, utils, include, tests, and legacy
+# Add src, include, tests, and legacy
 add_subdirectory(src)
-add_subdirectory(utils)
 add_subdirectory(include)
 add_subdirectory(tests)
 option(DSA_XENGINE_BUILD_LEGACY "Build legacy code (will not install if built)" OFF)
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
new file mode 100644
index 0000000..1bbdfda
--- /dev/null
+++ b/include/CMakeLists.txt
@@ -0,0 +1,30 @@
+# Install rules for the DSA_XENGINE public headers.
+#
+# No enable_language() call is needed here: CUDA is already enabled by the
+# top-level project(... LANGUAGES CXX CUDA C) command, and this directory
+# only installs headers.
+
+# install step for header files
+#------------------------------
+# Paths are relative to this directory. Every header that an installed
+# header #includes must be listed as well, otherwise the installed tree is
+# not usable on its own.
+set(DSA_XENGINE_HEADERS
+  # cmake-format: sortable
+  dsaX.h
+  dsaX_beamformer_correlator.h
+  dsaX_blas_interface.h
+  dsaX_capture.h
+  dsaX_capture_manythread.h
+  dsaX_capture_pcap.h
+  dsaX_cublas_interface.h
+  dsaX_cuda_headers.h
+  dsaX_cuda_interface.h
+  dsaX_cutlass_interface.h
+  dsaX_def.h
+  dsaX_enums.h
+  dsaX_psrdada_headers.h
+  dsaX_utils.h
+  )
+install(FILES ${DSA_XENGINE_HEADERS} DESTINATION include)
+#------------------------------
diff --git a/include/dsaX.h b/include/dsaX.h
index ddd351a..2ee856a 100644
--- a/include/dsaX.h
+++ b/include/dsaX.h
@@ -62,10 +62,10 @@ typedef struct dmem {
 } dmem;
 
 // Structure that carries BLAS parameters
-typedef struct dsaBLASParam_s {  
+typedef struct dsaXBLASParam_s {  
   size_t struct_size; /**< Size of this struct in bytes.  Used to ensure that the host application and DSA see the same struct*/
   
-  dsaBLASType blas_type;    /**< Type of BLAS computation to perfrom */
+  dsaXBLASType blas_type;    /**< Type of BLAS computation to perfrom */
   
   // GEMM params
   dsaBLASOperation trans_a; /**< operation op(A) that is non- or (conj.) transpose. */
@@ -87,10 +87,10 @@ typedef struct dsaBLASParam_s {
   
   // Common params
   int batch_count;             /**< number of pointers contained in arrayA, arrayB and arrayC. */
-  dsaBLASDataType data_type;   /**< Specifies if using S(C) or D(Z) BLAS type */
-  dsaBLASDataOrder data_order; /**< Specifies if using Row or Column major */
+  dsaXBLASDataType data_type;   /**< Specifies if using S(C) or D(Z) BLAS type */
+  dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */
   
-} dsaBLASParam;
+} dsaXBLASParam;
   
 
 // Initialise device memory
diff --git a/include/dsaX_beamformer_correlator.h b/include/dsaX_beamformer_correlator.h
new file mode 100644
index 0000000..7001f4a
--- /dev/null
+++ b/include/dsaX_beamformer_correlator.h
@@ -0,0 +1,15 @@
+#pragma once
+
+// dmem is declared in dsaX.h; include it so that this header can be
+// included on its own, as the other dsaX_*_interface.h headers do.
+#include "dsaX.h"
+
+// correlator function
+// workflow: copy to device, reorder, stridedBatchedGemm, reorder
+// d: pointer to the dmem state struct declared in dsaX.h.
+void dcorrelator(dmem *d);
+
+// beamformer function
+// d: pointer to the dmem state struct declared in dsaX.h.
+void dbeamformer(dmem * d);
+
diff --git a/include/dsaX_blas_interface.h b/include/dsaX_blas_interface.h
new file mode 100644
index 0000000..3cf5c4a
--- /dev/null
+++ b/include/dsaX_blas_interface.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "dsaX.h"
+
+void dsaXHgemmStridedBatched(void *real_in, void *imag_in, void *real_out, void *imag_out, dsaXBLASParam param);
diff --git a/include/dsaX_cublas_interface.h b/include/dsaX_cublas_interface.h
new file mode 100644
index 0000000..9265f37
--- /dev/null
+++ b/include/dsaX_cublas_interface.h
@@ -0,0 +1,5 @@
+#pragma once
+#include "dsaX.h"
+#include "dsaX_cuda_headers.h"
+
+void dsaXHgemmStridedBatchedCuda(void *real_in, void *imag_in, void *real_out, void *imag_out, dsaXBLASParam param);
diff --git a/include/dsaX_cuda_headers.h b/include/dsaX_cuda_headers.h
new file mode 100644
index 0000000..acc838d
--- /dev/null
+++ b/include/dsaX_cuda_headers.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include <cuda.h>
+#include "cuda_fp16.h"
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
diff --git a/include/dsaX_cuda_interface.h b/include/dsaX_cuda_interface.h
new file mode 100644
index 0000000..99b1db2
--- /dev/null
+++ b/include/dsaX_cuda_interface.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include "dsaX.h"
+#include "dsaX_def.h"
+// The prototypes below use `half` and `__global__`. Include the CUDA headers
+// explicitly, as dsaX_cublas_interface.h does, instead of relying on a
+// transitive include.
+#include "dsaX_cuda_headers.h"
+
+// Host-callable functions operating on the dmem state declared in dsaX.h.
+// NOTE(review): the meaning of `bf` is not visible from this header; confirm
+// against the definitions (presumably in src/dsaX_cuda_interface.cu).
+void initialize_device_memory(dmem * d, int bf);
+
+void deallocate_device_memory(dmem * d, int bf);
+
+void reorder_output_device(dmem * d);
+
+__global__ void corr_input_copy(char *input, half *inr, half *ini);
+
+// NOTE(review): only the template declaration is visible here. Translation
+// units other than the one holding the definition can link against it only
+// if the needed specializations are explicitly instantiated there; confirm.
+template <typename in_prec, typename out_prec> __global__ void transpose_matrix(in_prec * idata, out_prec * odata);
+
+void reorder_input_device(char *input, char * tx, half *inr, half *ini);
+
+__global__ void corr_output_copy(half *outr, half *outi, float *output, int *indices_lookup);
+
+__global__ void transpose_input_bf(double * idata, double * odata);
+
+__global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs);
+
+void calc_weights(dmem * d);
+
+__global__ void fluff_input_bf(char * input, half * dr, half * di);
+
+__global__ void transpose_scale_bf(half * ir, half * ii, unsigned char * odata);
+
+__global__ void sum_beam(unsigned char * input, float * output);
+
diff --git a/include/dsaX_enums.h b/include/dsaX_enums.h
new file mode 100644
index 0000000..b188019
--- /dev/null
+++ b/include/dsaX_enums.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#define DSA_INVALID_ENUM (-0x7fffffff - 1) // sentinel shared by every *_INVALID enumerator below; equals -2147483648
+
+typedef enum dsaError_t { DSA_SUCCESS = 0, DSA_ERROR = 1, DSA_ERROR_UNINITIALIZED = 2 } dsaError_t; // status codes
+
+typedef enum dsaBLASOperation_s {				 // operation applied to a GEMM operand
+  DSA_BLAS_OP_N = 0, // No transpose
+  DSA_BLAS_OP_T = 1, // Transpose only
+  DSA_BLAS_OP_C = 2, // Conjugate transpose
+  DSA_BLAS_OP_INVALID = DSA_INVALID_ENUM
+} dsaBLASOperation; // NOTE(review): keeps the dsa (not dsaX) prefix, unlike the enums below; dsaX.h uses this spelling
+
+typedef enum dsaXBLASType_s { // kind of BLAS computation requested
+  DSA_BLAS_GEMM = 0, // matrix-matrix multiply
+  DSA_BLAS_INVALID = DSA_INVALID_ENUM
+} dsaXBLASType;
+
+typedef enum dsaXBLASDataType_s { // element type of the BLAS operands
+  DSA_BLAS_DATATYPE_H = 0, // Half
+  DSA_BLAS_DATATYPE_S = 1, // Single
+  DSA_BLAS_DATATYPE_D = 2, // Double
+  DSA_BLAS_DATATYPE_HC = 3, // Complex(half)
+  DSA_BLAS_DATATYPE_C = 4, // Complex(single)
+  DSA_BLAS_DATATYPE_Z = 5, // Complex(double)
+  DSA_BLAS_DATATYPE_INVALID = DSA_INVALID_ENUM
+} dsaXBLASDataType;
+
+typedef enum dsaXBLASDataOrder_s { // storage order of the matrices
+  DSA_BLAS_DATAORDER_ROW = 0, // Row major
+  DSA_BLAS_DATAORDER_COL = 1, // Column major
+  DSA_BLAS_DATAORDER_INVALID = DSA_INVALID_ENUM
+} dsaXBLASDataOrder;
diff --git a/include/dsaX_psrdada_headers.h b/include/dsaX_psrdada_headers.h
new file mode 100644
index 0000000..325dcb8
--- /dev/null
+++ b/include/dsaX_psrdada_headers.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include "dada_client.h"
+#include "dada_def.h"
+#include "dada_hdu.h"
+#include "multilog.h"
+#include "ipcio.h"
+#include "ipcbuf.h"
+#include "dada_affinity.h"
+#include "ascii_header.h"
+#include "dsaX_def.h"
+#include "dsaX_enums.h"
diff --git a/include/dsaX_utils.h b/include/dsaX_utils.h
new file mode 100644
index 0000000..3976db7
--- /dev/null
+++ b/include/dsaX_utils.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include "dsaX.h"
+
+void dsaXmemset(void *array, int ch, size_t n);
+
+void dsaXmemcpyHostToDevice(void *array_device, void *array_host, size_t n);
+void dsaXmemcpyDeviceToHost(void *array_host, void *array_device, size_t n);
+
diff --git a/src/10_planar_complex.cu b/src/10_planar_complex.cu
new file mode 100644
index 0000000..9e0915d
--- /dev/null
+++ b/src/10_planar_complex.cu
@@ -0,0 +1,567 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Planar Complex GEMM
+
+  This example demonstrates the CUTLASS Library's exposure of planar complex GEMM kernels supporting
+  the batched strided mode.
+
+  These kernels represent complex matrices by storing the real and imaginary parts of the matrix in
+  disjoint regions in memory. These real-valued matrices are stored using existing cuBLAS layouts
+  as either column-major or row-major layouts with a single leading dimension indicating the stride
+  between columns or rows.
+
+  The CUTLASS Library collects multiple template instantiations in a data structure and offers
+  a BLAS-like dispatch API to invoke the appropriate kernel on the Volta or Turing architectures.
+
+  CUTLASS decouples matrix layout from complex transformation, so four possible transformations
+  are possible on the A and B operands:
+
+    n:  column-major
+    c:  column-major complex conjugate
+    t:  row-major
+    h:  row-major complex conjugate
+
+  The CUTLASS Library contains many kernel instances specialized for architecture, data type, tile
+  size, and alignment. This can result in long compile times.
+
+  To build strictly the planar complex kernels needed for general application, execute the following
+  CMake command in an empty build directory.
+    
+    $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
+  	  -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex
+
+  This builds all planar complex GEMM variants for Volta and Turing architectures.
+
+  To build strictly the kernels needed for this example, an even narrower filter string may be
+  specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for
+  the 'CN' layout configuration (conjugate A operand with both A and B as column-major).
+
+    $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
+  	  -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_f16*cn
+
+    $ make 10_planar_complex
+
+    $ ./examples/10_planar_complex/10_planar_complex --m=2048 --n=1024 --k=512 --batch=10
+*/
+
+#include <iostream>
+#include <sstream>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/host_tensor_planar_complex.h"
+
+#include "cutlass/util/reference/device/tensor_fill.h"
+
+#include "cutlass/util/reference/device/gemm_planar_complex.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+
+#include "cutlass/library/handle.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Result structure
+struct Result {
+
+  double runtime_ms;
+  double gflops;
+  cutlass::Status status;
+  cudaError_t error;
+  bool passed;
+
+  //
+  // Methods
+  //
+
+  Result(
+    double runtime_ms = 0,
+    double gflops = 0,
+    cutlass::Status status = cutlass::Status::kSuccess,
+    cudaError_t error = cudaSuccess
+  ):
+    runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+  // Set by --help; main() then prints the usage text and exits.
+  bool help;
+  // GEMM extents, batch count and the complex epilogue scalars.
+  cutlass::gemm::GemmCoord problem_size;
+  int batch_count;
+  cutlass::complex<float> alpha;
+  cutlass::complex<float> beta;
+  // No command-line flag sets reference_check; it keeps its default.
+  bool reference_check;
+  int iterations;
+  // Initializers follow the member declaration order (avoids -Wreorder).
+  Options():
+    help(false),
+    problem_size({1024, 1024, 1024}),
+    batch_count(1),
+    alpha(1),
+    beta(),
+    reference_check(true),
+    iterations(20) { }
+
+  bool valid() {
+    return true;
+  }
+
+  // Parses the command line
+  void parse(int argc, char const **args) {
+    cutlass::CommandLine cmd(argc, args);
+
+    if (cmd.check_cmd_line_flag("help")) {
+      help = true;
+    }
+
+    cmd.get_cmd_line_argument("m", problem_size.m());
+    cmd.get_cmd_line_argument("n", problem_size.n());
+    cmd.get_cmd_line_argument("k", problem_size.k());
+    cmd.get_cmd_line_argument("batch", batch_count);
+
+    cmd.get_cmd_line_argument("alpha", alpha.real());
+    cmd.get_cmd_line_argument("alpha_i", alpha.imag());
+    cmd.get_cmd_line_argument("beta", beta.real());
+    cmd.get_cmd_line_argument("beta_i", beta.imag());
+
+    cmd.get_cmd_line_argument("iterations", iterations);
+  }
+
+  /// Prints the usage statement.
+  std::ostream & print_usage(std::ostream &out) const {
+
+    out << "10_planar_complex example\n\n"
+      << "  This example uses the CUTLASS Library to execute Planar Complex GEMM computations.\n\n"
+      << "Options:\n\n"
+      << "  --help                      If specified, displays this usage statement.\n\n"
+      << "  --m=<int>                   GEMM M dimension\n"
+      << "  --n=<int>                   GEMM N dimension\n"
+      << "  --k=<int>                   GEMM K dimension\n"
+      << "  --batch=<int>               Number of GEMM operations executed in one batch\n"
+      << "  --alpha=<f32>               Epilogue scalar alpha (real part)\n"
+      << "  --alpha_i=<f32>             Epilogue scalar alpha (imaginary part)\n"
+      << "  --beta=<f32>                Epilogue scalar beta (real part)\n\n"
+      << "  --beta_i=<f32>              Epilogue scalar beta (imaginary part)\n\n"
+      << "  --iterations=<int>          Number of profiling iterations to perform.\n\n";
+
+    out << "\n\nExamples:\n\n"
+      << "$ ./examples/10_planar_complex/10_planar_complex  --batch=7 --m=1024 --n=512 --k=1024 \\\n"
+      << "     --alpha=2 --alpha_i=-2 --beta=0.707 --beta_i=-.707\n\n";
+
+    return out;
+  }
+
+  /// Compute performance in GFLOP/s
+  double gflops(double runtime_s) const {
+
+    // Number of real-valued multiply-adds 
+    int64_t fmas = problem_size.product() * batch_count * 4;
+
+    // Two flops per multiply-add
+    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Performance test environment for planar complex
+class TestbedPlanarComplex {
+public:
+
+  using ElementA = cutlass::half_t;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = cutlass::half_t;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = cutlass::half_t;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementCompute = float;
+  using ElementAccumulator = float;
+
+  //
+  // Data members
+  //
+
+  cutlass::library::Handle handle;
+
+  cutlass::gemm::GemmCoord problem_size;
+  int batch_count;
+  cutlass::DeviceAllocation<ElementA> tensor_A;
+  cutlass::DeviceAllocation<ElementB> tensor_B;
+  cutlass::DeviceAllocation<ElementC> tensor_C;
+  cutlass::DeviceAllocation<ElementC> tensor_D;
+  cutlass::DeviceAllocation<ElementC> tensor_D_ref;
+
+  //
+  // Methods
+  //
+
+  TestbedPlanarComplex(
+    Options const &options
+  ): 
+    problem_size(options.problem_size), batch_count(options.batch_count) {
+
+    // Allocate device memory for batched strided GEMM
+    tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2);
+    tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2);
+    tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
+    tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
+    tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
+  }
+
+  void initialize() {
+
+    uint64_t seed = 1073;
+
+    // Use small integers to simplify correctness checking
+    int scope_max = 6;
+    int scope_min = -6;
+
+    cutlass::reference::device::BlockFillRandomUniform(
+        tensor_A.get(), tensor_A.size(), seed, ElementA(scope_max), ElementA(scope_min), 0);
+
+    cutlass::reference::device::BlockFillRandomUniform(
+        tensor_B.get(), tensor_B.size(), seed * 2019, ElementB(scope_max), ElementB(scope_min), 0);
+
+    cutlass::reference::device::BlockFillRandomUniform(
+        tensor_C.get(), tensor_C.size(), seed * 2020, ElementC(scope_max), ElementC(scope_min), 0);
+  }
+
+  Result profile(Options const &options) {
+
+    Result result;
+
+    initialize();
+
+    ElementA *ptr_A = tensor_A.get();
+    ElementB *ptr_B = tensor_B.get();
+    ElementC *ptr_C = tensor_C.get();
+    ElementC *ptr_D = tensor_D.get();
+
+    int64_t batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2;
+    int64_t batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2;
+    int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2;
+    int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2;
+
+    typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
+    typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
+    typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+    typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+
+    int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
+    int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n();
+    int64_t imag_stride_C = int64_t(problem_size.m()) * problem_size.n();
+    int64_t imag_stride_D = int64_t(problem_size.m()) * problem_size.n();
+
+    //
+    // Construct events
+    //
+
+    cudaEvent_t events[2];
+
+    for (auto & event : events) {
+      result.error = cudaEventCreate(&event);
+      if (result.error != cudaSuccess) {
+        std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
+        return -1;
+      }
+    }
+
+    // Record an event at the start of a series of GEMMs
+    result.error = cudaEventRecord(events[0]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    //
+    // Run profiling loop
+    //
+
+    for (int iter = 0; iter < options.iterations; ++iter) {
+
+      //
+      // Execute the planar complex GEMM kernel via the CUTLASS Library's
+      // dispatch routines.
+      //
+      // Note, for planar complex GEMM kernels, all numeric type arguments 
+      // specify the data type of the base real types. These are understood to
+      // apply to planar complex representations of matrices in memory and to complex<T>
+      // structures for scalars.
+      //
+      // See tools/library/include/cutlass/library/handle.h for more details.
+      //
+
+      result.status = handle.gemm_planar_complex(
+        problem_size.m(),                                 // GEMM M dimension
+        problem_size.n(),                                 // GEMM N dimension
+        problem_size.k(),                                 // GEMM K dimension
+
+        cutlass::library::NumericTypeID::kF32,            // Base data type of complex-valued accumulation
+        cutlass::library::NumericTypeID::kF32,            // Base data type of complex-valued alpha/beta scalars
+
+        &options.alpha,                                   // Pointer to alpha scalar, of type complex<T>
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex-valued A matrix
+        cutlass::library::LayoutTypeID::kColumnMajor,     // Layout of A matrix
+        cutlass::library::ComplexTransform::kConjugate,   // Complex transformation on A matrix operand
+        ptr_A,                                            // Pointer to real part of A matrix
+        ptr_A + imag_stride_A,                            // Pointer to imaginary part of A matrix
+        lda,                                              // Leading dimension of real part of A matrix
+        lda,                                              // Leading dimension of imaginary part of A matrix
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex-valued B matrix
+        cutlass::library::LayoutTypeID::kColumnMajor,     // Layout of B matrix
+        cutlass::library::ComplexTransform::kNone,        // Complex transformation on B matrix operand
+        ptr_B,                                            // Pointer to real part of B matrix
+        ptr_B + imag_stride_B,                            // Pointer to imaginary part of B matrix
+        ldb,                                              // Leading dimension of real part of B matrix
+        ldb,                                              // Leading dimension of imaginary part of B matrix
+
+        &options.beta,                                    // Pointer to beta scalar, of type complex<T>
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex valued C and D matrices
+
+        ptr_C,                                            // Pointer to real part of C matrix
+        ptr_C + imag_stride_C,                            // Pointer to imaginary part of C matrix
+        ldc,                                              // Leading dimension of real part of C matrix
+        ldc,                                              // Leading dimension of imaginary part of C matrix
+
+        ptr_D,                                            // Pointer to real part of D matrix
+        ptr_D + imag_stride_D,                            // Pointer to imaginary part of D matrix
+        ldd,                                              // Leading dimension of real part of D matrix
+        ldd,                                              // Leading dimension of imaginary part of D matrix
+
+        batch_count,                                      // Number of batched elements
+
+        batch_stride_A,                                   // Stride between batches of real parts of A matrix
+        batch_stride_A,                                   // Stride between batches of imaginary parts of A matrix
+
+        batch_stride_B,                                   // Stride between batches of real parts of B matrix
+        batch_stride_B,                                   // Stride between batches of imaginary parts of B matrix
+
+        batch_stride_C,                                   // Stride between batches of real parts of C matrix
+        batch_stride_C,                                   // Stride between batches of imaginary parts of C matrix
+
+        batch_stride_D,                                   // Stride between batches of real parts of D matrix
+        batch_stride_D                                    // Stride between batches of imaginary parts of D matrix
+      );
+
+      if (result.status != cutlass::Status::kSuccess) {
+        std::cerr << "CUTLASS internal error - configuration not supported" << std::endl;
+        return result;
+      }
+    }
+    
+    //
+    // Stop profiling loop
+    //
+
+    // Record an event when the GEMMs are complete
+    result.error = cudaEventRecord(events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Wait for work on the device to complete.
+    result.error = cudaEventSynchronize(events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Measure elapsed runtime
+    float runtime_ms = 0;
+    result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Compute average runtime and GFLOPs.
+    result.runtime_ms = double(runtime_ms) / double(options.iterations);
+    result.gflops = options.gflops(result.runtime_ms / 1000.0);
+
+    // Cleanup
+    for (auto event : events) {
+      (void)cudaEventDestroy(event);
+    }
+
+    if (handle.get_last_operation()) {
+      std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl;
+    }
+
+    //
+    // Compute reference in device code
+    //
+
+    if (options.reference_check) {
+
+      result.passed = true;
+
+      for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) {
+        cutlass::reference::device::GemmPlanarComplex<
+          ElementA, LayoutA,
+          ElementB, LayoutB,
+          ElementC, LayoutC,
+          ElementAccumulator
+        >(
+          problem_size,
+          options.alpha,
+          {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A},
+          cutlass::ComplexTransform::kConjugate,
+          {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B},
+          cutlass::ComplexTransform::kNone,
+          options.beta,
+          {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C},
+          {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D}
+        );
+
+        ElementC epsilon = 0.1_hf;
+        ElementC nonzero_floor = 0.1_hf;
+
+        result.passed = cutlass::reference::device::BlockCompareRelativelyEqual(
+          tensor_D.get() + idx * batch_stride_D,
+          tensor_D_ref.get() + idx * batch_stride_D,
+          batch_stride_D,
+          epsilon,
+          nonzero_floor
+        );
+      }
+
+      if (result.passed) {
+        std::cout << "Reference check passed." << std::endl;
+      }
+      else {
+        std::cerr << "Error - reference check failed." << std::endl;
+      }
+    }
+
+    std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
+    std::cout << " GFLOPs: " << result.gflops << std::endl;
+
+    return result;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char const **args) {
+
+  //
+  // This example uses mma.sync to directly access Tensor Cores to achieve peak performance.
+  //
+  // Volta Tensor Core operations are first available in CUDA 10.1 Toolkit.
+  //
+  // Turing Tensor Core operations are first available in CUDA 10.2 Toolkit.
+  //
+
+  cudaDeviceProp props;
+
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+  if (error != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+    return -1;
+  }
+
+  if (props.major < 7) {
+    std::cerr << "Volta Tensor Core operations must be run on a machine with compute capability at least 70."
+              << std::endl;
+
+    // Returning zero so this test passes on older architectures even though its actions are no-op.
+    return 0;
+  }
+  else if (props.major == 7 && props.minor <= 2) {
+    //
+    // If running on the Volta architecture, at least CUDA 10.1 Toolkit is required to run this example.
+    //
+    if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) {
+      std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl;
+
+      // Returning zero so this test passes on older Toolkits even though its actions are no-op.
+      return 0;
+    }
+  }
+  else if (props.major == 7 && props.minor >= 5) {
+    //
+    // If running on the Turing architecture, at least CUDA 10.2 Toolkit is required to run this example.
+    //
+    if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
+      std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
+    
+      // Returning zero so this test passes on older Toolkits even though its actions are no-op.
+      return 0;
+    }
+  }
+  else {
+    // NVIDIA Ampere Architecture GPUs (SM80 and later) are fully supported on CUDA 11 Toolkit and beyond.
+    //
+    // fall through
+  }
+
+  //
+  // Parse options
+  //
+
+  Options options;
+  
+  options.parse(argc, args);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  // Execute one problem size
+  if (!options.valid()) {
+    std::cerr << "Invalid problem." << std::endl;
+    return -1;
+  }
+
+  TestbedPlanarComplex testbed(options);
+
+  Result result = testbed.profile(options);
+
+  return result.passed ? 0 : -1;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/src/11_planar_complex_array.cu b/src/11_planar_complex_array.cu
new file mode 100644
index 0000000..ba94b60
--- /dev/null
+++ b/src/11_planar_complex_array.cu
@@ -0,0 +1,628 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Planar Complex Array Example
+
+  This example demonstrates the CUTLASS Library's exposure of planar complex GEMM kernels which
+  execute a batch of matrix products, loading problem sizes and matrix base pointers from arrays
+  in global memory.
+
+  These kernels represent complex matrices by storing the real and imaginary parts of the matrix in
+  disjoint regions in memory. These real-valued matrices are stored using existing cuBLAS layouts
+  as either column-major or row-major layouts with a single leading dimension indicating the stride
+  between columns or rows.
+
+  The CUTLASS Library collects multiple template instantiations in a data structure and offers
+  a BLAS-like dispatch API to invoke the appropriate kernel on the Volta or Turing architectures.
+
+  CUTLASS decouples matrix layout from complex transformation, so four possible transformations
+  are possible on the A and B operands:
+
+    n:  column-major
+    c:  column-major complex conjugate
+    t:  row-major
+    h:  row-major complex conjugate
+
+  To build strictly the planar complex kernels needed for general application, execute the following
+  CMake command in an empty build directory.
+
+    $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
+      -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex
+
+  This builds all planar complex GEMM variants for the Volta, Turing, and Ampere architectures.
+
+  To build strictly the kernels needed for this example, an even narrower filter string may be
+  specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for
+  the 'CN' layout configuration (conjugate A operand with both A and B as column-major).
+
+    $ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
+      -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_array_f16*cn
+
+    $ make 11_planar_complex_array
+
+    $ ./examples/11_planar_complex_array/11_planar_complex_array --m=2048 --n=1024 --k=512 --batch=10
+*/
+
+#include <iostream>
+#include <sstream>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/host_tensor_planar_complex.h"
+
+#include "cutlass/util/reference/device/tensor_fill.h"
+
+#include "cutlass/util/reference/device/gemm_planar_complex.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+
+#include "cutlass/library/handle.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Outcome of one profiling run, as filled in by TestbedPlanarComplex::profile()
+struct Result {
+
+  double runtime_ms;        // average runtime of one GEMM launch, in milliseconds
+  double gflops;            // throughput derived from runtime_ms via Options::gflops()
+  cutlass::Status status;   // status returned by the most recent CUTLASS library call
+  cudaError_t error;        // error code returned by the most recent checked CUDA runtime call
+  bool passed;              // pass/fail flag examined by main(); the constructor sets it to true
+
+  //
+  // Methods (the constructor is not explicit: a bare number converts to a Result with passed == true)
+  //
+
+  Result(
+    double runtime_ms = 0,
+    double gflops = 0,
+    cutlass::Status status = cutlass::Status::kSuccess,
+    cudaError_t error = cudaSuccess
+  ):
+    runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options for the planar complex array example (defaults, parsing, validation, usage)
+struct Options {
+
+  bool help;                              // true when --help was given
+
+  cutlass::gemm::GemmCoord problem_size;  // GEMM extents (m, n, k)
+  int batch_count;                        // number of GEMMs executed in one batch
+  cutlass::complex<float> alpha;          // epilogue scalar alpha
+  cutlass::complex<float> beta;           // epilogue scalar beta
+
+  bool reference_check;                   // verify against the device reference (no command line switch)
+  int iterations;                         // number of profiling iterations
+
+  Options():
+    help(false),
+    problem_size({1024, 1024, 1024}),
+    batch_count(1),
+    alpha(1), beta(),
+    reference_check(true),
+    iterations(20) { }
+
+  /// Returns true only if every GEMM extent, the batch count and the iteration count are positive.
+  bool valid() {
+    return problem_size.m() > 0 && problem_size.n() > 0 && problem_size.k() > 0 && batch_count > 0 && iterations > 0;
+  }
+
+  // Parses the command line; options that are not given keep their default values
+  void parse(int argc, char const **args) {
+    cutlass::CommandLine cmd(argc, args);
+
+    if (cmd.check_cmd_line_flag("help")) {
+      help = true;
+    }
+
+    cmd.get_cmd_line_argument("m", problem_size.m());
+    cmd.get_cmd_line_argument("n", problem_size.n());
+    cmd.get_cmd_line_argument("k", problem_size.k());
+    cmd.get_cmd_line_argument("batch", batch_count);
+
+    cmd.get_cmd_line_argument("alpha", alpha.real());
+    cmd.get_cmd_line_argument("alpha_i", alpha.imag());
+    cmd.get_cmd_line_argument("beta", beta.real());
+    cmd.get_cmd_line_argument("beta_i", beta.imag());
+
+    cmd.get_cmd_line_argument("iterations", iterations);
+  }
+
+  /// Prints the usage statement to `out` and returns the same stream so calls can be chained.
+  std::ostream & print_usage(std::ostream &out) const {
+
+    out << "11_planar_complex_array example\n\n"
+      << "  This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n"
+      << "Options:\n\n"
+      << "  --help                      If specified, displays this usage statement.\n\n"
+      << "  --m=<int>                   GEMM M dimension\n"
+      << "  --n=<int>                   GEMM N dimension\n"
+      << "  --k=<int>                   GEMM K dimension\n"
+      << "  --batch=<int>               Number of GEMM operations executed in one batch\n"
+      << "  --alpha=<f32>               Epilogue scalar alpha (real part)\n"
+      << "  --alpha_i=<f32>             Epilogue scalar alpha (imaginary part)\n"
+      << "  --beta=<f32>                Epilogue scalar beta (real part)\n\n"
+      << "  --beta_i=<f32>              Epilogue scalar beta (imaginary part)\n\n"
+      << "  --iterations=<int>          Number of profiling iterations to perform.\n";
+
+    out << "\n\nExamples:\n\n"
+      << "$ ./examples/11_planar_complex_array/11_planar_complex_array\n\n";
+
+    return out;
+  }
+
+  /// Compute performance in GFLOP/s for one batch that took `runtime_s` seconds
+  double gflops(double runtime_s) const {
+
+    // Number of real-valued multiply-adds (the factor of 4 is applied per complex multiply-add)
+    int64_t fmas = problem_size.product() * batch_count * 4;
+
+    // Two flops per multiply-add
+    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Performance test environment for planar complex
+class TestbedPlanarComplex {
+public:
+
+  // Half-precision input and output
+  using Element = cutlass::half_t;
+
+  // Configurations for layouts and internal computation
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementCompute = float;
+  using ElementAccumulator = float;
+
+  //
+  // Data members
+  //
+
+  cutlass::library::Handle handle;
+
+  cutlass::gemm::GemmCoord problem_size;
+  int batch_count;
+  cutlass::DeviceAllocation<Element> tensor_A;
+  cutlass::DeviceAllocation<Element> tensor_B;
+  cutlass::DeviceAllocation<Element> tensor_C;
+  cutlass::DeviceAllocation<Element> tensor_D;
+  cutlass::DeviceAllocation<Element> tensor_D_ref;
+
+  cutlass::DeviceAllocation<void *> ptr_A_real;
+  cutlass::DeviceAllocation<void *> ptr_A_imag;
+  cutlass::DeviceAllocation<void *> ptr_B_real;
+  cutlass::DeviceAllocation<void *> ptr_B_imag;
+  cutlass::DeviceAllocation<void *> ptr_C_real;
+  cutlass::DeviceAllocation<void *> ptr_C_imag;
+  cutlass::DeviceAllocation<void *> ptr_D_real;
+  cutlass::DeviceAllocation<void *> ptr_D_imag;
+
+  //
+  // Methods
+  //
+
+  TestbedPlanarComplex(
+    Options const &options
+  ): 
+    problem_size(options.problem_size), batch_count(options.batch_count) {
+    // Copies the problem shape out of `options` and sizes every device buffer; no data is written here.
+    // Allocate device memory for batched planar complex GEMM
+    tensor_A.reset(int64_t(problem_size.m()) * problem_size.k() * batch_count * 2);  // x2: real + imaginary plane
+    tensor_B.reset(int64_t(problem_size.k()) * problem_size.n() * batch_count * 2);
+    tensor_C.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
+    tensor_D.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
+    tensor_D_ref.reset(int64_t(problem_size.m()) * problem_size.n() * batch_count * 2);
+
+    ptr_A_real.reset(batch_count);  // pointer arrays: one slot per batch entry, filled in by profile()
+    ptr_A_imag.reset(batch_count);
+    ptr_B_real.reset(batch_count);
+    ptr_B_imag.reset(batch_count);
+    ptr_C_real.reset(batch_count);
+    ptr_C_imag.reset(batch_count);
+    ptr_D_real.reset(batch_count);
+    ptr_D_imag.reset(batch_count);
+
+  }
+
+  void initialize() {
+    // Fills tensor_A, tensor_B and tensor_C with random data; tensor_D and tensor_D_ref are not touched.
+    uint64_t seed = 1073;
+
+    // Use small integers to simplify correctness checking
+    int scope_max = 6;
+    int scope_min = -6;
+    // NOTE(review): the trailing 0 is presumably the "bits" argument that rounds values to integers - confirm.
+    cutlass::reference::device::BlockFillRandomUniform(
+        tensor_A.get(), tensor_A.size(), seed, Element(scope_max), Element(scope_min), 0);
+
+    cutlass::reference::device::BlockFillRandomUniform(
+        tensor_B.get(), tensor_B.size(), seed * 2019, Element(scope_max), Element(scope_min), 0);  // own seed
+
+    cutlass::reference::device::BlockFillRandomUniform(
+        tensor_C.get(), tensor_C.size(), seed * 2020, Element(scope_max), Element(scope_min), 0);  // own seed
+  }
+
+  /// Launches the batched planar complex GEMM options.iterations times, records the average runtime and
+  /// GFLOP/s, and checks D against the device reference. The returned Result has passed == false on any failure.
+  Result profile(Options const &options) {
+
+    // Start from "failed" so that every early return below makes main() exit non-zero;
+    // the flag is only raised once all CUDA and CUTLASS calls have succeeded.
+    Result result;
+    result.passed = false;
+
+    initialize();
+
+    int64_t batch_stride_A = int64_t(problem_size.m()) * problem_size.k() * 2;
+    int64_t batch_stride_B = int64_t(problem_size.k()) * problem_size.n() * 2;
+    int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2;
+    int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2;
+
+    typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
+    typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
+    typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+    typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+
+
+    int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
+    int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n();
+    int64_t imag_stride_C = int64_t(problem_size.m()) * problem_size.n();
+    int64_t imag_stride_D = int64_t(problem_size.m()) * problem_size.n();
+
+    //
+    // Configure pointers in global memory
+    //
+
+    struct {
+      Element *base;
+      void **ptr_real;
+      void **ptr_imag;
+      int64_t batch_stride;
+      int64_t imag_stride;
+    } tensors[] = {
+      { tensor_A.get(), ptr_A_real.get(), ptr_A_imag.get(), batch_stride_A, imag_stride_A},
+      { tensor_B.get(), ptr_B_real.get(), ptr_B_imag.get(), batch_stride_B, imag_stride_B},
+      { tensor_C.get(), ptr_C_real.get(), ptr_C_imag.get(), batch_stride_C, imag_stride_C},
+      { tensor_D.get(), ptr_D_real.get(), ptr_D_imag.get(), batch_stride_D, imag_stride_D}
+    };
+
+    for (auto const &tensor : tensors) {
+      for (int idx = 0; idx < batch_count; ++idx) {
+
+        void *ptr_real = tensor.base + idx * tensor.batch_stride;
+        void *ptr_imag = tensor.base + idx * tensor.batch_stride + tensor.imag_stride;
+
+        cudaError_t error = cudaMemcpy(
+          tensor.ptr_real + idx,
+          &ptr_real,
+          sizeof(void *),
+          cudaMemcpyHostToDevice);
+
+        if (error != cudaSuccess) {
+          throw std::runtime_error("Failed to copy pointer to device memory");
+        }
+
+        error = cudaMemcpy(
+          tensor.ptr_imag + idx,
+          &ptr_imag,
+          sizeof(void *),
+          cudaMemcpyHostToDevice);
+
+        if (error != cudaSuccess) {
+          throw std::runtime_error("Failed to copy pointer to device memory");
+        }
+      }
+    }
+
+    //
+    // Construct events
+    //
+
+    cudaEvent_t events[2];
+
+    for (auto & event : events) {
+      result.error = cudaEventCreate(&event);
+      if (result.error != cudaSuccess) {
+        std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
+        return result;
+      }
+    }
+
+    // Record an event at the start of a series of GEMM operations
+    result.error = cudaEventRecord(events[0]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    //
+    // Run profiling loop
+    //
+
+    for (int iter = 0; iter < options.iterations; ++iter) {
+
+      //
+      // Execute the planar complex array GEMM kernel via the CUTLASS Library's
+      // dispatch routines.
+      //
+      // Note, for planar complex array GEMM kernels, all numeric type arguments
+      // specify the data type of the base real types. These are understood to
+      // apply to planar complex representations of matrices in memory and to complex<T>
+      // structures for scalars.
+      //
+      // See tools/library/include/cutlass/library/handle.h for more details.
+      //
+
+      result.status = handle.gemm_planar_complex_array(
+
+        problem_size.m(),                                 // expected GEMM M dimension
+        problem_size.n(),                                 // expected GEMM N dimension
+        problem_size.k(),                                 // expected GEMM K dimension
+        batch_count,                                      // Number of batched elements
+
+        nullptr,
+        nullptr,
+        nullptr,
+
+        cutlass::library::NumericTypeID::kF32,            // Base data type of complex-valued accumulation
+        cutlass::library::NumericTypeID::kF32,            // Base data type of complex-valued alpha/beta scalars
+
+        &options.alpha,                                   // Pointer to alpha scalar, of type complex<T>
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex-valued A matrix
+        cutlass::library::LayoutTypeID::kColumnMajor,     // Layout of A matrix
+        cutlass::library::ComplexTransform::kConjugate,   // Complex transformation on A matrix operand
+
+        ptr_A_real.get(),                                 // Pointer to array of pointers to real part of A matrix
+        ptr_A_imag.get(),                                 // Pointer to array of pointers to imaginary part of A matrix
+
+        lda,                                              // Leading dimension of real part of A matrix
+        lda,                                              // Leading dimension of imaginary part of A matrix
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex-valued B matrix
+        cutlass::library::LayoutTypeID::kColumnMajor,     // Layout of B matrix
+        cutlass::library::ComplexTransform::kNone,        // Complex transformation on B matrix operand
+
+        ptr_B_real.get(),                                 // Pointer to array of pointers to real part of B matrix
+        ptr_B_imag.get(),                                 // Pointer to array of pointers to imaginary part of B matrix
+
+        ldb,                                              // Leading dimension of real part of B matrix
+        ldb,                                              // Leading dimension of imaginary part of B matrix
+
+        &options.beta,                                    // Pointer to beta scalar, of type complex<T>
+
+        cutlass::library::NumericTypeID::kF16,            // Base data type of complex valued C and D matrices
+
+        ptr_C_real.get(),                                 // Pointer to array of pointers to real part of C matrix
+        ptr_C_imag.get(),                                 // Pointer to array of pointers to imaginary part of C matrix
+
+        ldc,                                              // Leading dimension of real part of C matrix
+        ldc,                                              // Leading dimension of imaginary part of C matrix
+
+        ptr_D_real.get(),                                 // Pointer to array of pointers to real part of D matrix
+        ptr_D_imag.get(),                                 // Pointer to array of pointers to imaginary part of D matrix
+
+        ldd,                                              // Leading dimension of real part of D matrix
+        ldd                                               // Leading dimension of imaginary part of D matrix
+      );
+
+      if (result.status != cutlass::Status::kSuccess) {
+        std::cerr << "CUTLASS internal error - configuration not supported" << std::endl;
+        return result;
+      }
+    }
+
+    //
+    // Stop profiling loop
+    //
+
+    // Record an event when the GEMM operations have been launched.
+    result.error = cudaEventRecord(events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Wait for work on the device to complete.
+    result.error = cudaEventSynchronize(events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Measure elapsed runtime
+    float runtime_ms = 0;
+    result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Compute average runtime and GFLOPs.
+    result.runtime_ms = double(runtime_ms) / double(options.iterations);
+    result.gflops = options.gflops(result.runtime_ms / 1000.0);
+
+    // Cleanup
+    for (auto event : events) {
+      (void)cudaEventDestroy(event);
+    }
+
+    if (handle.get_last_operation()) {
+      std::cout << "Recently executed '" << handle.get_last_operation()->description().name << "'" << std::endl;
+    }
+
+    //
+    // Compute reference in device code
+    //
+
+    // All CUDA and CUTLASS calls succeeded; from here on only the reference check can fail the run.
+    result.passed = true;
+
+    if (options.reference_check) {
+      for (int64_t idx = 0; result.passed && idx < int64_t(batch_count); ++idx) {
+        cutlass::reference::device::GemmPlanarComplex<
+          Element, LayoutA,
+          Element, LayoutB,
+          Element, LayoutC,
+          ElementAccumulator
+        >(
+          problem_size,
+          options.alpha,
+          {tensor_A.get() + idx * batch_stride_A, lda, imag_stride_A},
+          cutlass::ComplexTransform::kConjugate,
+          {tensor_B.get() + idx * batch_stride_B, ldb, imag_stride_B},
+          cutlass::ComplexTransform::kNone,
+          options.beta,
+          {tensor_C.get() + idx * batch_stride_C, ldc, imag_stride_C},
+          {tensor_D_ref.get() + idx * batch_stride_D, ldd, imag_stride_D}
+        );
+
+        Element epsilon = 0.1_hf;
+        Element nonzero_floor = 0.1_hf;
+
+        result.passed = cutlass::reference::device::BlockCompareRelativelyEqual(
+          tensor_D.get() + idx * batch_stride_D,
+          tensor_D_ref.get() + idx * batch_stride_D,
+          batch_stride_D,
+          epsilon,
+          nonzero_floor
+        );
+      }
+
+      if (result.passed) {
+        std::cout << "Reference check passed." << std::endl;
+      }
+      else {
+        std::cerr << "Error - reference check failed." << std::endl;
+      }
+    }
+
+    std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
+    std::cout << " GFLOPs: " << result.gflops << std::endl;
+
+    return result;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char const **args) {
+  // Exit status: 0 for success, --help, or an unsupported device/toolkit; -1 otherwise (see the returns below).
+  //
+  // This example uses mma.sync to directly access Tensor Cores to achieve peak performance.
+  //
+  // Volta Tensor Core operations are first available in CUDA 10.1 Toolkit.
+  //
+  // Turing Tensor Core operations are first available in CUDA 10.2 Toolkit.
+  //
+
+  cudaDeviceProp props;
+
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);  // only device 0 is inspected
+  if (error != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+    return -1;
+  }
+
+  if (props.major < 7) {
+    std::cerr << "Tensor Core operations must be run on a machine with compute capability at least 70."
+              << std::endl;
+
+    // Returning zero so this passes on older architectures. Its actions are no-op.
+    return 0;
+  }
+  else if (props.major == 7 && props.minor <= 2) {
+    //
+    // If running on the Volta architecture, at least CUDA 10.1 Toolkit is required to run this example.
+    //
+    if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) {
+      std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl;
+      
+      // Returning zero so this passes on older Toolkits. Its actions are no-op.
+      return 0;
+    }
+  }
+  else if (props.major == 7 && props.minor >= 5) {
+    //
+    // If running on the Turing architecture, at least CUDA 10.2 Toolkit is required to run this example.
+    //
+    if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
+      std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
+      
+      // Returning zero so this passes on older Toolkits. Its actions are no-op.
+      return 0;
+    }
+  }
+  else {
+    // NVIDIA Ampere Architecture GPUs (SM80 and later) are fully supported on CUDA 11 Toolkit and beyond.
+    //
+    // fall through
+  }
+
+  //
+  // Parse options
+  //
+
+  Options options;
+  
+  options.parse(argc, args);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  // Execute one problem size
+  if (!options.valid()) {
+    std::cerr << "Invalid problem." << std::endl;
+    return -1;
+  }
+
+  TestbedPlanarComplex testbed(options);
+
+  Result result = testbed.profile(options);
+
+  return result.passed ? 0 : -1;  // the process exit status mirrors result.passed
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a55c8fe..e23827b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -4,7 +4,19 @@ include_directories(../include)
 
 # DSA Fast Time Domain library
 #-----------------------------
-add_library(dsa SHARED dsaX_cuda_interface.cu dsaX_blas_interface.cu dsaX_beamformer_correlator.cu)
+add_library(dsa SHARED
+  dsaX_cuda_interface.cu
+  dsaX_blas_interface.cu
+  dsaX_beamformer_correlator.cu
+  dsaX_utils.cpp
+  )
+
+if("${DSA_XENGINE_TARGET_TYPE}" STREQUAL "CUDA")  # quoted so neither side is re-read as a variable name
+  add_compile_definitions(DSA_XENGINE_TARGET_CUDA)  # selects the CUDA back end in the sources
+endif()
+if("${DSA_XENGINE_TARGET_TYPE}" STREQUAL "CPU")
+  add_compile_definitions(DSA_XENGINE_TARGET_CPU)  # selects the CPU back end in the sources
+endif()
 
 if(CUDAToolkit_FOUND)
   target_link_libraries(dsa CUDA::cudart)
diff --git a/src/dsaX_beamformer_correlator.cu b/src/dsaX_beamformer_correlator.cu
index c91c1b7..ddbc73c 100644
--- a/src/dsaX_beamformer_correlator.cu
+++ b/src/dsaX_beamformer_correlator.cu
@@ -9,13 +9,11 @@ Workflow is similar for BF and corr applications
 #include "dsaX_def.h"
 #include "dsaX.h"
 #include "dsaX_blas_interface.h"
-
-//#include <cuda.h>
-//#include "cuda_fp16.h"
-//#include <cublas_v2.h>
-//#include <cuda_runtime.h>
-
+#include "dsaX_utils.h"
+#include "dsaX_blas_interface.h"
+#ifdef DSA_XENGINE_TARGET_CUDA
 #include "dsaX_cuda_interface.h"
+#endif
 
 int DEBUG = 1;
 
@@ -48,81 +46,43 @@ void usage() {
 // workflow: copy to device, reorder, stridedBatchedGemm, reorder
 void dcorrelator(dmem *d) {
 
+  // copy to device
+  dsaXmemcpyHostToDevice(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
+  
   // zero out output arrays
-  cudaMemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half));
-  cudaMemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half));
-  cudaMemset(d->d_output, 0, NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float));
+  dsaXmemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half));
+  dsaXmemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half));
+  dsaXmemset(d->d_output, 0, NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float));
   
-  // copy to device
-  cudaMemcpy(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2, cudaMemcpyHostToDevice);
-
   // reorder input
   reorder_input_device(d->d_input, d->d_tx, d->d_r, d->d_i);
 
-  // ABSTRACT HERE START
-  // not sure if essential
-  cudaDeviceSynchronize();
-  
-  // set up for gemm
-  cublasHandle_t cublasH = NULL;
-  cudaStream_t stream = NULL;
-  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
-  cublasCreate(&cublasH);
-  cublasSetStream(cublasH, stream);
-
+  dsaXBLASParam blas_param;
   // gemm settings
   // input: [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS]
   // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS] 
-  cublasOperation_t transa = CUBLAS_OP_N;
-  cublasOperation_t transb = CUBLAS_OP_T;
-  const int m = NANTS;
-  const int n = NANTS;
-  const int k = NPACKETS_PER_BLOCK/halfFac;
-  const half alpha = 1.;
-  const half malpha = -1.;
-  const int lda = m;
-  const int ldb = n;
-  const half beta0 = 0.;
-  const half beta1 = 1.;
-  const int ldc = m;
-  const long long int strideA = NPACKETS_PER_BLOCK*NANTS/halfFac;
-  const long long int strideB = NPACKETS_PER_BLOCK*NANTS/halfFac;
-  const long long int strideC = NANTS*NANTS;
-  const int batchCount = NCHAN_PER_PACKET*2*2*halfFac;
-
-  // run strided batched gemm
-  // ac
-  cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
-			    &alpha,d->d_r,lda,strideA,
-			    d->d_r,ldb,strideB,&beta0,
-			    d->d_outr,ldc,strideC,
-			    batchCount);
-  // bd
-  cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
-			    &alpha,d->d_i,lda,strideA,
-			    d->d_i,ldb,strideB,&beta1,
-			    d->d_outr,ldc,strideC,
-			    batchCount);
-  // -bc
-  cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
-			    &malpha,d->d_i,lda,strideA,
-			    d->d_r,ldb,strideB,&beta0,
-			    d->d_outi,ldc,strideC,
-			    batchCount);
-  // ad
-  cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
-			    &alpha,d->d_r,lda,strideA,
-			    d->d_i,ldb,strideB,&beta1,
-			    d->d_outi,ldc,strideC,
-			    batchCount);
-
-  // shown to be essential
-  cudaDeviceSynchronize();
-
-  // destroy stream
-  cudaStreamDestroy(stream);
-  cublasDestroy(cublasH);
+  blas_param.trans_a = DSA_BLAS_OP_N;
+  blas_param.trans_b = DSA_BLAS_OP_T;
+  blas_param.m = NANTS;
+  blas_param.n = NANTS;
+  blas_param.k = NPACKETS_PER_BLOCK/halfFac;
+  blas_param.alpha = 1.0;
+  blas_param.lda = blas_param.m;
+  blas_param.ldb = blas_param.n;
+  blas_param.beta = 0.;
+  blas_param.ldc = blas_param.m;
+  blas_param.a_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;
+  blas_param.b_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;
+  blas_param.c_stride = NANTS*NANTS;
+  blas_param.batch_count = NCHAN_PER_PACKET*2*2*halfFac;
+
+  // Perform GEMM according to back end configuration
+  dsaXHgemmStridedBatched(d->d_r, d->d_i, d->d_outr, d->d_outi, blas_param);
+  
+  /*
+  // ABSTRACT HERE START
   // ABSTRACT HERE END
+  */
   
   // reorder output data
   reorder_output_device(d);
@@ -174,7 +134,7 @@ void dbeamformer(dmem * d) {
 
   // do big memcpy
   begin = clock();
-  cudaMemcpy(d->d_big_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4,cudaMemcpyHostToDevice);
+  dsaXmemcpyHostToDevice(d->d_big_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4);
   end = clock();
   d->cp += (float)(end - begin) / CLOCKS_PER_SEC;
   
@@ -182,8 +142,8 @@ void dbeamformer(dmem * d) {
   for (int iArm=0;iArm<2;iArm++) {
   
     // zero out output arrays
-    cudaMemset(d->d_bigbeam_r,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half));
-    cudaMemset(d->d_bigbeam_i,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half));
+    dsaXmemset(d->d_bigbeam_r,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half));
+    dsaXmemset(d->d_bigbeam_i,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half));
     cudaDeviceSynchronize();
     
     // copy data to device
@@ -443,8 +403,8 @@ int main (int argc, char *argv[]) {
 
     for (iii=0;iii<(NCHAN_PER_PACKET/8);iii++)
       d.h_freqs[iii] = 1e6*(sfreq-iii*250./1024.);
-    cudaMemcpy(d.d_freqs,d.h_freqs,sizeof(float)*(NCHAN_PER_PACKET/8),cudaMemcpyHostToDevice);
-
+    dsaXmemcpyHostToDevice(d.d_freqs, d.h_freqs, sizeof(float)*(NCHAN_PER_PACKET/8));
+    
     // calculate weights
     calc_weights(&d);
     
@@ -471,7 +431,7 @@ int main (int argc, char *argv[]) {
       if (DEBUG) syslog(LOG_INFO,"copy to host");
       output_size = NBASE*NCHAN_PER_PACKET*2*2*4;
       output_data = (char *)malloc(output_size);
-      cudaMemcpy(output_data,d.d_output,output_size,cudaMemcpyDeviceToHost);
+      dsaXmemcpyDeviceToHost(output_data, d.d_output, output_size);
 
       fout = fopen("output.dat","wb");
       fwrite((float *)output_data,sizeof(float),NBASE*NCHAN_PER_PACKET*2*2,fout);
@@ -483,7 +443,7 @@ int main (int argc, char *argv[]) {
       if (DEBUG) syslog(LOG_INFO,"copy to host");
       output_size = (NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS;
       output_data = (char *)malloc(output_size);
-      cudaMemcpy(output_data,d.d_bigpower,output_size,cudaMemcpyDeviceToHost);
+      dsaXmemcpyDeviceToHost(output_data, d.d_bigpower, output_size);
 
       /*output_size = 2*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8);
       o1 = (char *)malloc(output_size);
@@ -606,13 +566,13 @@ int main (int argc, char *argv[]) {
       if (DEBUG) syslog(LOG_INFO,"run correlator");
       dcorrelator(&d);
       if (DEBUG) syslog(LOG_INFO,"copy to host");
-      cudaMemcpy(output_buffer,d.d_output,block_out,cudaMemcpyDeviceToHost);
+      dsaXmemcpyDeviceToHost(output_buffer, d.d_output, block_out);
     }
     else {
       if (DEBUG) syslog(LOG_INFO,"run beamformer");
       dbeamformer(&d);
       if (DEBUG) syslog(LOG_INFO,"copy to host");
-      cudaMemcpy(output_buffer,d.d_bigpower,block_out,cudaMemcpyDeviceToHost);
+      dsaXmemcpyDeviceToHost(output_buffer, d.d_bigpower, block_out);
     }
     //end = clock();
     //time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
diff --git a/src/dsaX_blas_interface.cu b/src/dsaX_blas_interface.cu
new file mode 100644
index 0000000..430ba9e
--- /dev/null
+++ b/src/dsaX_blas_interface.cu
@@ -0,0 +1,11 @@
+#include <dsaX.h>
+#include "dsaX_cublas_interface.h"
+
+void dsaXHgemmStridedBatched(void *real_in, void *imag_in, void *real_out, void *imag_out, dsaXBLASParam param) {
+#ifdef DSA_XENGINE_TARGET_CUDA
+  dsaXHgemmStridedBatchedCuda(real_in, imag_in, real_out, imag_out, param); // forward unchanged to the cuBLAS backend
+#else
+  std::cout << "Not implemented" << std::endl; // no backend for this build target
+  exit(0); // NOTE(review): exits with status 0 although nothing was computed - confirm this is intended
+#endif
+}
diff --git a/src/dsaX_cublas_interface.cu b/src/dsaX_cublas_interface.cu
new file mode 100644
index 0000000..4631516
--- /dev/null
+++ b/src/dsaX_cublas_interface.cu
@@ -0,0 +1,92 @@
+#include "dsaX_cublas_interface.h"
+
+void dsaXHgemmStridedBatchedCuda(void *real_in, void *imag_in, void *real_out, void *imag_out, dsaXBLASParam param) {
+#ifdef DSA_XENGINE_TARGET_CUDA
+  // Four half-precision strided-batched GEMMs on planar complex data (pointers are handed to cuBLAS as device arrays):
+  // real_out accumulates Re*Re then Im*Im; imag_out accumulates -Im*Re then Re*Im.
+  cudaDeviceSynchronize(); // not sure if essential
+
+  // Set up for gemm
+  cublasHandle_t cublasH = NULL;
+  cudaStream_t stream = NULL;
+  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+  cublasCreate(&cublasH);
+  cublasSetStream(cublasH, stream);
+
+  // Transfer params; an unknown transpose flag aborts instead of leaving the op uninitialised
+  cublasOperation_t transa, transb;
+  switch (param.trans_a) {
+  case DSA_BLAS_OP_N: transa = CUBLAS_OP_N; break;
+  case DSA_BLAS_OP_T: transa = CUBLAS_OP_T; break;
+  case DSA_BLAS_OP_C: transa = CUBLAS_OP_C; break;
+  default:
+    std::cout << "Unknown cublas transpose" << std::endl;
+    exit(1);
+  }
+
+  switch (param.trans_b) {
+  case DSA_BLAS_OP_N: transb = CUBLAS_OP_N; break;
+  case DSA_BLAS_OP_T: transb = CUBLAS_OP_T; break;
+  case DSA_BLAS_OP_C: transb = CUBLAS_OP_C; break;
+  default:
+    std::cout << "Unknown cublas transpose" << std::endl;
+    exit(1);
+  }
+
+  const int m = param.m;
+  const int n = param.n;
+  const int k = param.k;
+  const half alpha = param.alpha.real();
+  const half malpha = __float2half(-__half2float(alpha)); // explicit negate; avoids mixed double/half operator resolution
+  const int lda = param.lda;
+  const int ldb = param.ldb;
+  const half beta0 = param.beta.real();
+  const half beta1 = 1.0;
+  const int ldc = param.ldc;
+  const long long int strideA = param.a_stride;
+  const long long int strideB = param.b_stride;
+  const long long int strideC = param.c_stride;
+  const int batchCount = param.batch_count;
+  // The interface carries untyped pointers; cuBLAS needs half pointers
+  const half *in_r = (const half *)real_in;
+  const half *in_i = (const half *)imag_in;
+  half *out_r = (half *)real_out;
+  half *out_i = (half *)imag_out;
+
+  // run strided batched gemm for datatype (a + ib)(c + id)
+  // ac
+  cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
+                            &alpha,in_r,lda,strideA,
+                            in_r,ldb,strideB,&beta0,
+                            out_r,ldc,strideC,
+                            batchCount);
+  // bd
+  cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
+                            &alpha,in_i,lda,strideA,
+                            in_i,ldb,strideB,&beta1,
+                            out_r,ldc,strideC,
+                            batchCount);
+  // -bc
+  cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
+                            &malpha,in_i,lda,strideA,
+                            in_r,ldb,strideB,&beta0,
+                            out_i,ldc,strideC,
+                            batchCount);
+  // ad
+  cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
+                            &alpha,in_r,lda,strideA,
+                            in_i,ldb,strideB,&beta1,
+                            out_i,ldc,strideC,
+                            batchCount);
+
+  // shown to be essential
+  cudaDeviceSynchronize();
+
+  // destroy stream
+  cudaStreamDestroy(stream);
+  cublasDestroy(cublasH);
+#else
+  std::cout << "Not implemented" << std::endl;
+  exit(0);
+#endif
+}
diff --git a/src/dsaX_cuda_interface.cu b/src/dsaX_cuda_interface.cu
new file mode 100644
index 0000000..31e44d0
--- /dev/null
+++ b/src/dsaX_cuda_interface.cu
@@ -0,0 +1,467 @@
+#include "dsaX_cuda_interface.h"
+
+// allocate device memory
+void initialize_device_memory(dmem * d, int bf) {
+  // Allocates the buffers in *d for one mode: bf==0 correlator, bf==1 beamformer. NOTE(review): no cudaMalloc/malloc result is checked.
+  // for correlator
+  if (bf==0) {
+    cudaMalloc((void **)(&d->d_input), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
+    cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2);
+    cudaMalloc((void **)(&d->d_i), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2);
+    cudaMalloc((void **)(&d->d_tx), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
+    cudaMalloc((void **)(&d->d_output), sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2);
+    cudaMalloc((void **)(&d->d_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
+    cudaMalloc((void **)(&d->d_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
+    cudaMalloc((void **)(&d->d_tx_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
+    cudaMalloc((void **)(&d->d_tx_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
+  }
+
+  // for beamformer
+  if (bf==1) {
+    cudaMalloc((void **)(&d->d_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2);
+    cudaMalloc((void **)(&d->d_big_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS)*NCHAN_PER_PACKET*2*2);
+    cudaMalloc((void **)(&d->d_tx), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2);
+    cudaMalloc((void **)(&d->d_br), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2);
+    cudaMalloc((void **)(&d->d_bi), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2);
+    cudaMalloc((void **)(&d->weights_r), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8));
+    cudaMalloc((void **)(&d->weights_i), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8));
+    cudaMalloc((void **)(&d->d_bigbeam_r), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2));
+    cudaMalloc((void **)(&d->d_bigbeam_i), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2));
+    cudaMalloc((void **)(&d->d_bigpower), sizeof(unsigned char)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS));
+    cudaMalloc((void **)(&d->d_scf), sizeof(float)*(NBEAMS/2)); // beam scale factor
+    cudaMalloc((void **)(&d->d_chscf), sizeof(float)*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); // per-beam, per-channel-group scale factor
+
+    // input weights: first is [NANTS, E/N], then [NANTS, 48, 2pol, R/I]
+    d->h_winp = (float *)malloc(sizeof(float)*(NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2));
+    d->flagants = (int *)malloc(sizeof(int)*NANTS);
+    d->h_freqs = (float *)malloc(sizeof(float)*(NCHAN_PER_PACKET/8));
+    cudaMalloc((void **)(&d->d_freqs), sizeof(float)*(NCHAN_PER_PACKET/8));
+
+    // timers
+    d->cp = 0.;
+    d->prep = 0.;
+    d->outp = 0.;
+    d->cubl = 0.;
+    // (timers are only reset in beamformer mode)
+  }  
+}
+
+// deallocate device memory
+void deallocate_device_memory(dmem * d, int bf) {
+  // Releases what initialize_device_memory(d, bf) allocated; d_input exists in both modes.
+  cudaFree(d->d_input);
+  if (bf==0) {
+    cudaFree(d->d_r);
+    cudaFree(d->d_i);
+    cudaFree(d->d_tx);
+    cudaFree(d->d_output);
+    cudaFree(d->d_outr);
+    cudaFree(d->d_outi);
+    cudaFree(d->d_tx_outr);
+    cudaFree(d->d_tx_outi);
+  }
+  if (bf==1) {
+    cudaFree(d->d_big_input); // allocated for the beamformer but previously never freed
+    cudaFree(d->d_tx);
+    cudaFree(d->d_br);
+    cudaFree(d->d_bi);
+    cudaFree(d->weights_r);
+    cudaFree(d->weights_i);
+    cudaFree(d->d_bigbeam_r);
+    cudaFree(d->d_bigbeam_i);
+    cudaFree(d->d_bigpower);
+    cudaFree(d->d_scf);
+    cudaFree(d->d_chscf);
+    free(d->h_winp);
+    free(d->flagants);
+    cudaFree(d->d_freqs);
+    free(d->h_freqs);
+  }
+}
+
+// function to copy d_outr and d_outi to d_output
+// inputs are [NCHAN_PER_PACKET, 2 time, 2 pol, NANTS, NANTS]
+// the corr matrices are column major order
+// output needs to be [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex]
+// start with transpose to get [NANTS*NANTS, NCHAN_PER_PACKET*2*2], then sum into output using kernel
+void reorder_output_device(dmem * d) {
+  // Transposes d_outr/d_outi into d_tx_outr/d_tx_outi, then launches corr_output_copy to fill d_output.
+  // transpose input data
+  dim3 dimBlock(32, 8), dimGrid((NANTS*NANTS)/32,(NCHAN_PER_PACKET*2*2*halfFac)/32);
+  transpose_matrix<<<dimGrid,dimBlock>>>(d->d_outr,d->d_tx_outr);
+  transpose_matrix<<<dimGrid,dimBlock>>>(d->d_outi,d->d_tx_outi);
+
+  // look at output
+  /*char * odata = (char *)malloc(sizeof(char)*384*4*NANTS*NANTS*2*halfFac);
+  cudaMemcpy(odata,d->d_tx_outr,384*4*NANTS*NANTS*2*halfFac,cudaMemcpyDeviceToHost);
+  FILE *fout;
+  fout=fopen("test2.test","wb");
+  fwrite(odata,sizeof(char),384*4*NANTS*NANTS*2*halfFac,fout);
+  fclose(fout);*/
+
+  
+  /*
+  // set up for geam
+  cublasHandle_t cublasH = NULL;
+  cudaStream_t stream = NULL;
+  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+  cublasSetStream(cublasH, stream);
+
+  // transpose output matrices into tx_outr and tx_outi
+  cublasOperation_t transa = CUBLAS_OP_T;
+  cublasOperation_t transb = CUBLAS_OP_N;
+  const int m = NCHAN_PER_PACKET*2*2;
+  const int n = NANTS*NANTS/16; // columns in output
+  const double alpha = 1.0;
+  const double beta = 0.0;
+  const int lda = n;
+  const int ldb = m;
+  const int ldc = ldb;
+  cublasDgeam(cublasH,transa,transb,m,n,
+	      &alpha,(double *)(d->d_outr),
+	      lda,&beta,(double *)(d->d_tx_outr),
+	      ldb,(double *)(d->d_tx_outr),ldc);
+  cublasDgeam(cublasH,transa,transb,m,n,
+	      &alpha,(double *)(d->d_outi),
+	      lda,&beta,(double *)(d->d_tx_outi),
+	      ldb,(double *)(d->d_tx_outi),ldc);
+  */
+  // now run kernel to sum into output
+  int * h_idxs = (int *)malloc(sizeof(int)*NBASE);
+  int * d_idxs;
+  cudaMalloc((void **)(&d_idxs), sizeof(int)*NBASE);
+  int ii = 0;
+  // upper triangular order (column major) to match xGPU (not the same as CASA!)
+  for (int i=0;i<NANTS;i++) {
+    for (int j=0;j<=i;j++) {
+      h_idxs[ii] = i*NANTS + j;
+      ii++;
+    }
+  }
+  cudaMemcpy(d_idxs,h_idxs,sizeof(int)*NBASE,cudaMemcpyHostToDevice);
+
+  // run kernel to finish things
+  corr_output_copy<<<NCHAN_PER_PACKET*2*NBASE/128,128>>>(d->d_tx_outr,d->d_tx_outi,d->d_output,d_idxs);
+
+  /*char * odata = (char *)malloc(sizeof(char)*384*4*NBASE*4);
+  cudaMemcpy(odata,d->d_output,384*4*NBASE*4,cudaMemcpyDeviceToHost);
+  FILE *fout;
+  fout=fopen("test3.test","wb");
+  fwrite(odata,sizeof(char),384*4*NBASE*4,fout);
+  fclose(fout);*/
+
+  // NOTE(review): the baseline lookup table is rebuilt, uploaded and freed on every call
+  cudaFree(d_idxs);
+  free(h_idxs);
+  //cudaStreamDestroy(stream);  
+
+}
+
+// kernel to fluff input
+// run with 128 threads and NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 blocks
+__global__ void corr_input_copy(char *input, half *inr, half *ini) {
+
+  // one packed byte per thread; the index arithmetic assumes 128 threads per block
+  const int gid = blockIdx.x*128 + threadIdx.x;
+  const unsigned char packed = (unsigned char)(input[gid]);
+
+  inr[gid] = __float2half((float)((char)((packed & (unsigned char)(15)) << 4) >> 4));
+  ini[gid] = __float2half((float)((char)((packed & (unsigned char)(240))) >> 4));
+
+}
+
+// transpose kernel
+// assume breakdown into tiles of 32x32, and run with 32x8 threads per block
+// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32)
+// here, width is the dimension of the fastest index
+template <typename in_prec, typename out_prec> __global__ void transpose_matrix(in_prec * idata, out_prec * odata) {
+  // per-block staging tile; the 33rd column is presumably padding against shared-memory bank conflicts
+  __shared__ in_prec tile[32][33];
+  
+  int x = blockIdx.x * 32 + threadIdx.x;
+  int y = blockIdx.y * 32 + threadIdx.y;
+  int width = gridDim.x * 32;
+  // each thread loads 4 rows (j = 0, 8, 16, 24) of the 32x32 input tile
+  for (int j = 0; j < 32; j += 8)
+     tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
+
+  __syncthreads();
+
+  x = blockIdx.y * 32 + threadIdx.x;  // transpose block offset
+  y = blockIdx.x * 32 + threadIdx.y;
+  width = gridDim.y * 32;
+  // write the tile back with row/column swapped; values are converted from in_prec to out_prec on assignment
+  for (int j = 0; j < 32; j += 8)
+     odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
+
+}
+
+
+// function to copy and reorder d_input to d_r and d_i
+// input is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
+// output is [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS]
+// starts by running transpose on [NPACKETS_PER_BLOCK * NANTS, NCHAN_PER_PACKET * 2 * 2] matrix in doubleComplex form.
+// then fluffs using simple kernel
+void reorder_input_device(char *input, char * tx, half *inr, half *ini) {
+  // byte-wise tiled transpose of input into tx: one 32x8 thread block per 32x32 tile
+  const dim3 tileThreads(32, 8);
+  const dim3 tileGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32);
+  transpose_matrix<<<tileGrid,tileThreads>>>(input,tx);
+  corr_input_copy<<<NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128,128>>>(tx,inr,ini); // unpack tx bytes into half real/imag
+}
+
+// kernel to help with reordering output
+// outr and outi are [NANTS, NANTS, NCHAN_PER_PACKET, 2time, 2pol, halfFac]
+// run with NCHAN_PER_PACKET*2*NBASE/128 blocks of 128 threads
+__global__ void corr_output_copy(half *outr, half *outi, float *output, int *indices_lookup) {
+
+  // flat output element handled by this thread (128 threads per block assumed)
+  const int gid = blockIdx.x*128 + threadIdx.x;
+
+  // split gid into (baseline, channel, pol) with pol fastest
+  const int baseline = gid / (NCHAN_PER_PACKET * 2);
+  const int chpol = gid % (NCHAN_PER_PACKET * 2);
+  const int ch = chpol / 2;
+  const int pol = chpol % 2;
+  // row of the transposed correlator output for this baseline and channel
+  const int src = indices_lookup[baseline] * NCHAN_PER_PACKET + ch;
+
+  float re = 0., im = 0.;
+
+  // add the entries at offsets pol and 2+pol, over all halfFac sub-entries
+  for (int h=0;h<halfFac;h++) {
+    re += __half2float(outr[(4*src+pol)*halfFac+h])+__half2float(outr[(4*src+2+pol)*halfFac+h]);
+    im += __half2float(outi[(4*src+pol)*halfFac+h])+__half2float(outi[(4*src+2+pol)*halfFac+h]);
+  }
+
+  output[2*gid] = re;
+  output[2*gid+1] = im;
+}
+
+// kernels to reorder and fluff input data for beamformer
+// initial data is [NPACKETS_PER_BLOCK, (NANTS/2), NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]            
+// want [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, (NANTS/2), 8chan, 2 times, 2 pol, 4-bit complex]      // run as 16x16 tiled transpose with 32-byte words 
+// launch with dim3 dimBlock(16, 8) and dim3 dimGrid(Width/16, Height/16)
+// here, width=NCHAN_PER_PACKET/8 is the dimension of the fastest input index
+// dim3 dimBlock1(16, 8), dimGrid1(NCHAN_PER_PACKET/8/16, (NPACKETS_PER_BLOCK)*(NANTS/2)/16);
+__global__ void transpose_input_bf(double * idata, double * odata) {
+  // each matrix element is a group of 4 consecutive doubles, moved as a unit; the tile has 17 columns for 16 used
+  __shared__ double tile[16][17][4];
+  
+  int x = blockIdx.x * 16 + threadIdx.x;
+  int y = blockIdx.y * 16 + threadIdx.y;
+  int width = gridDim.x * 16;
+  // each thread loads 2 rows (j = 0, 8) of the 16x16 tile
+  for (int j = 0; j < 16; j += 8) {
+    tile[threadIdx.y+j][threadIdx.x][0] = idata[4*((y+j)*width + x)];
+    tile[threadIdx.y+j][threadIdx.x][1] = idata[4*((y+j)*width + x)+1];
+    tile[threadIdx.y+j][threadIdx.x][2] = idata[4*((y+j)*width + x)+2];
+    tile[threadIdx.y+j][threadIdx.x][3] = idata[4*((y+j)*width + x)+3];
+  }
+  
+  __syncthreads();
+
+  x = blockIdx.y * 16 + threadIdx.x;  // transpose block offset
+  y = blockIdx.x * 16 + threadIdx.y;
+  width = gridDim.y * 16;
+  // write back with tile row/column swapped; the 4 doubles of an element keep their order
+  for (int j = 0; j < 16; j += 8) {
+    odata[4*((y+j)*width + x)] = tile[threadIdx.x][threadIdx.y + j][0];
+    odata[4*((y+j)*width + x)+1] = tile[threadIdx.x][threadIdx.y + j][1];
+    odata[4*((y+j)*width + x)+2] = tile[threadIdx.x][threadIdx.y + j][2];
+    odata[4*((y+j)*width + x)+3] = tile[threadIdx.x][threadIdx.y + j][3];
+  }
+
+}
+
+// kernel to populate an instance of weights matrix [2, (NCHAN_PER_PACKET/8), NBEAMS/2, 4times*(NANTS/2)*8chan*2tim*2pol]
+// run with 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128 blocks of 128 threads
+__global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs) {
+  // One thread per weight: rotates the complex calib by a position/frequency/beam dependent phase and stores it as half in wr/wi.
+  int bidx = blockIdx.x;
+  int tidx = threadIdx.x;
+  int inidx = bidx*128+tidx;  
+  // total number of threads expected:
+  // 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)
+  
+  // get indices
+  int iArm = (int)(inidx / ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)));
+  int iidx = (int)(inidx % ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)));
+  int fq = (int)(iidx / (128*(NANTS/2)*(NBEAMS/2)));
+  int idx = (int)(iidx % (128*(NANTS/2)*(NBEAMS/2)));
+  int bm = (int)(idx / (128*(NANTS/2)));
+  int tactp = (int)(idx % (128*(NANTS/2)));
+  //int t = (int)(tactp / (32*(NANTS/2)));
+  int actp = (int)(tactp % (32*(NANTS/2)));
+  int a = (int)(actp / 32);
+  int ctp = (int)(actp % 32);
+  //int c = (int)(ctp / 4);
+  int tp = (int)(ctp % 4);
+  //int t2 = (int)(tp / 2);
+  int pol = (int)(tp % 2);
+  int widx = (a+48*iArm)*(NCHAN_PER_PACKET/8)*2*2 + fq*2*2 + pol*2; // NOTE(review): 48 is hard-coded, presumably NANTS/2 - confirm
+  
+  // calculate weights
+  float theta, afac, twr, twi;
+  if (iArm==0) {
+    theta = sep*(127.-bm*1.)*PI/10800.; // radians
+    afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate
+    twr = cos(afac*antpos_e[a+48*iArm]);
+    twi = sin(afac*antpos_e[a+48*iArm]);
+    wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1]));
+    wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1]));
+    //wr[inidx] = __float2half(calibs[widx]);
+    //wi[inidx] = __float2half(calibs[widx+1]);
+  }
+  if (iArm==1) {
+    theta = sep*(127.-bm*1.)*PI/10800.; // radians
+    afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate
+    twr = cos(afac*antpos_n[a+48*iArm]);
+    twi = sin(afac*antpos_n[a+48*iArm]);
+    wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1]));
+    wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1]));
+    //wr[inidx] = __float2half(calibs[widx]);
+    //wi[inidx] = __float2half(calibs[widx+1]);
+  }
+  // the two arms differ only in the position array used (antpos_e for iArm==0, antpos_n for iArm==1)
+}
+
+// GPU-powered function to populate weights matrix for beamformer
+// file format:
+// sequential pairs of eastings and northings
+// then [NANTS, 48, R/I] calibs
+
+void calc_weights(dmem * d) {
+  // Splits d->h_winp into antenna positions and unit-normalised calibs, uploads them, and launches populate_weights_matrix.
+  // allocate
+  float *antpos_e = (float *)malloc(sizeof(float)*NANTS);
+  float *antpos_n = (float *)malloc(sizeof(float)*NANTS);
+  float *calibs = (float *)malloc(sizeof(float)*NANTS*(NCHAN_PER_PACKET/8)*2*2);
+  float *d_antpos_e, *d_antpos_n, *d_calibs;
+  float wnorm;
+  cudaMalloc((void **)(&d_antpos_e), sizeof(float)*NANTS);
+  cudaMalloc((void **)(&d_antpos_n), sizeof(float)*NANTS);
+  cudaMalloc((void **)(&d_calibs), sizeof(float)*NANTS*(NCHAN_PER_PACKET/8)*2*2);
+
+  // deal with antpos and calibs
+  //int iant;
+  //int found;
+  for (int i=0;i<NANTS;i++) {
+    antpos_e[i] = d->h_winp[2*i];
+    antpos_n[i] = d->h_winp[2*i+1];
+  }
+  for (int i=0;i<NANTS*(NCHAN_PER_PACKET/8)*2;i++) {
+
+    // DEBUG CODE?
+    //iant = (int)(i/((NCHAN_PER_PACKET/8)*2));
+    //found = 0;
+    //for (int j=0;j<d->nflags;j++)
+    //if (d->flagants[j]==iant) found = 1;
+
+    calibs[2*i] = d->h_winp[2*NANTS+2*i];
+    calibs[2*i+1] = d->h_winp[2*NANTS+2*i+1];
+    // scale each complex calib to unit magnitude; an all-zero entry is left as zero
+    wnorm = sqrt(calibs[2*i]*calibs[2*i] + calibs[2*i+1]*calibs[2*i+1]);
+    if (wnorm!=0.0) {
+      calibs[2*i] /= wnorm;
+      calibs[2*i+1] /= wnorm;
+    }
+
+    //if (found==1) {
+    //calibs[2*i] = 0.;
+    //calibs[2*i+1] = 0.;
+    //}
+  }
+
+  //for (int i=0;i<NANTS*(NCHAN_PER_PACKET/8)*2;i++) printf("%f %f\n",calibs[2*i],calibs[2*i+1]);
+  
+  cudaMemcpy(d_antpos_e,antpos_e,NANTS*sizeof(float),cudaMemcpyHostToDevice);
+  cudaMemcpy(d_antpos_n,antpos_n,NANTS*sizeof(float),cudaMemcpyHostToDevice);
+  cudaMemcpy(d_calibs,calibs,NANTS*(NCHAN_PER_PACKET/8)*2*2*sizeof(float),cudaMemcpyHostToDevice);
+
+  // run kernel to populate weights matrix
+  populate_weights_matrix<<<2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128,128>>>(d_antpos_e,d_antpos_n,d_calibs,d->weights_r,d->weights_i,d->d_freqs);  
+  
+  // free stuff
+  cudaFree(d_antpos_e);
+  cudaFree(d_antpos_n);
+  cudaFree(d_calibs);
+  free(antpos_e);
+  free(antpos_n);
+  free(calibs);
+  
+}
+
+// kernel to fluff input bf data
+// run with NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 blocks of 128 threads
+__global__ void fluff_input_bf(char * input, half * dr, half * di) {
+
+  // one packed byte per thread; the index arithmetic assumes 128 threads per block
+  const int gid = blockIdx.x*128 + threadIdx.x;
+  const unsigned char packed = (unsigned char)(input[gid]);
+
+  dr[gid] = __float2half(0.015625*((float)((char)((packed & (unsigned char)(15)) << 4) >> 4)));
+  di[gid] = __float2half(0.015625*((float)((char)((packed & (unsigned char)(240))) >> 4)));
+
+}
+
+// transpose, add and scale kernel for bf
+// assume breakdown into tiles of 16x16, and run with 16x8 threads per block
+// launch with dim3 dimBlock(16, 8) and dim3 dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16)
+// scf is a per-beam scale factor to enable recasting as unsigned char
+__global__ void transpose_scale_bf(half * ir, half * ii, unsigned char * odata) {
+  // staging tile holds the power |ir + i*ii|^2 as float; 17 columns declared for 16 used
+  __shared__ float tile[16][17];
+  
+  int x = blockIdx.x * 16 + threadIdx.x;
+  int y = blockIdx.y * 16 + threadIdx.y;
+  int width = gridDim.x * 16;
+  float dr, di;
+  // each thread handles 2 rows (j = 0, 8) of the 16x16 tile
+  for (int j = 0; j < 16; j += 8) {
+    dr = (float)(ir[(y+j)*width + x]);
+    di = (float)(ii[(y+j)*width + x]);
+    tile[threadIdx.y+j][threadIdx.x] = (dr*dr+di*di);
+  }
+
+  __syncthreads();
+
+  x = blockIdx.y * 16 + threadIdx.x;  // transpose block offset
+  y = blockIdx.x * 16 + threadIdx.y;
+  width = gridDim.y * 16;
+  // fixed 1/128 scaling, then cast; NOTE(review): no clamp before the unsigned char cast, and no scf argument is used
+  for (int j = 0; j < 16; j += 8)
+    odata[(y+j)*width + x] = (unsigned char)(tile[threadIdx.x][threadIdx.y + j]/128.);
+
+}
+
+// sum over all times in output beam array
+// run with (NCHAN_PER_PACKET/8)*(NBEAMS/2) blocks of (NPACKETS_PER_BLOCK/4) threads
+__global__ void sum_beam(unsigned char * input, float * output) {
+  // Each block sums the input samples of one (beam, channel) pair, selected by
+  // blockIdx.x, and writes the total to output[blockIdx.x].
+  __shared__ float summ[512];
+  int bidx = blockIdx.x;
+  int tidx = threadIdx.x;
+  int bm = (int)(bidx/48);
+  int ch = (int)(bidx % 48);
+
+  // each thread stages one sample in shared memory
+  summ[tidx] = (float)(input[bm*256*48 + tidx*48 + ch]);
+  __syncthreads();
+
+  // Shared-memory tree reduction over the 512 entries. The barrier after every
+  // halving step is required: the partial sums read in one step are written by
+  // other threads (possibly in other warps) in the previous step, so the
+  // earlier unsynchronised chain of adds was a data race.
+  // NOTE(review): assumes blockDim.x == 512 (the size of summ) - confirm at the launch site.
+  for (int stride = 256; stride > 0; stride >>= 1) {
+    if (tidx < stride) summ[tidx] += summ[tidx+stride];
+    __syncthreads();
+  }
+
+  // thread 0 now holds the block total
+  if (tidx==0) output[bidx] = summ[tidx];
+
+}
+// NOTE(review): the input index uses a stride of 256 samples per beam - confirm against the thread count.
diff --git a/src/dsaX_utils.cpp b/src/dsaX_utils.cpp
new file mode 100644
index 0000000..46abfc9
--- /dev/null
+++ b/src/dsaX_utils.cpp
@@ -0,0 +1,30 @@
+#include "dsaX_utils.h"
+#ifdef DSA_XENGINE_TARGET_CUDA
+#include "dsaX_cuda_headers.h"
+#endif
+
+void dsaXmemset(void *array, int ch, size_t n){
+#ifdef DSA_XENGINE_TARGET_CUDA
+  cudaMemset(array, ch, n); // CUDA target: fill n bytes through the CUDA runtime
+#else
+  memset(array, ch, n); // host fallback (was misspelled "emset", which did not compile)
+#endif
+}
+
+void dsaXmemcpyHostToDevice(void *array_device, void *array_host, size_t n){
+#ifndef DSA_XENGINE_TARGET_CUDA
+  // No CUDA target: plain copy of n bytes between the two buffers
+  memcpy(array_device, array_host, n);
+#else
+  cudaMemcpy(array_device, array_host, n, cudaMemcpyHostToDevice);
+#endif
+}
+
+void dsaXmemcpyDeviceToHost(void *array_host, void *array_device, size_t n){
+#ifdef DSA_XENGINE_TARGET_CUDA
+  // Perform device to host memcopy of n bytes
+  cudaMemcpy(array_host, array_device, n, cudaMemcpyDeviceToHost);
+#else
+  memcpy(array_host, array_device, n);
+#endif
+}
diff --git a/src/planar_complex.cu b/src/planar_complex.cu
new file mode 100644
index 0000000..3fb8175
--- /dev/null
+++ b/src/planar_complex.cu
@@ -0,0 +1,87 @@
+/*
+#include <iostream>
+#include <cutlass/cutlass.h>
+#include <cutlass/numeric_types.h>
+#include <cutlass/core_io.h>
+
+int main() {
+
+  cutlass::half_t x = 2.25_hf;
+
+  std::cout << x << std::endl;
+
+  return 0;
+}
+*/
+#include <iostream>
+#include <cutlass/numeric_types.h>
+#include <cutlass/gemm/device/gemm.h>
+
+#include <cutlass/util/host_tensor.h>
+
+int main() {
+  // Smoke test: instantiates a half-precision CUTLASS GEMM, launches it once and reports whether the launch succeeded.
+  // Define the GEMM operation
+  using Gemm = cutlass::gemm::device::Gemm<
+    cutlass::half_t,                           // ElementA
+    cutlass::layout::ColumnMajor,              // LayoutA
+    cutlass::half_t,                           // ElementB
+    cutlass::layout::ColumnMajor,              // LayoutB
+    cutlass::half_t,                           // ElementOutput
+    cutlass::layout::ColumnMajor,              // LayoutOutput
+    float,                                     // ElementAccumulator
+    cutlass::arch::OpClassTensorOp,            // tag indicating Tensor Cores
+    cutlass::arch::Sm75                        // tag indicating target GPU compute architecture
+  >;
+
+  Gemm gemm_op;
+  cutlass::Status status;
+
+  //
+  // Define the problem size
+  //
+  int M = 512;
+  int N = 256;
+  int K = 128;
+
+  float alpha = 1.25f;
+  float beta = -1.25f;
+
+  //
+  // Allocate the tensors (NOTE(review): they are never filled or copied to the device here)
+  //
+
+  cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor> A({M, K});
+  cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor> B({K, N});
+  cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor> C({M, N});
+
+  cutlass::half_t const *ptrA = A.device_data();
+  cutlass::half_t const *ptrB = B.device_data();
+  cutlass::half_t const *ptrC = C.device_data();
+  cutlass::half_t       *ptrD = C.device_data();
+
+  int lda = A.device_ref().stride(0);
+  int ldb = B.device_ref().stride(0);
+  int ldc = C.device_ref().stride(0);
+  int ldd = C.device_ref().stride(0);
+  //
+  // Launch GEMM on the device
+  //
+ 
+  status = gemm_op({
+    {M, N, K},
+    {ptrA, lda},            // TensorRef to A device tensor
+    {ptrB, ldb},            // TensorRef to B device tensor
+    {ptrC, ldc},            // TensorRef to C device tensor
+    {ptrD, ldd},            // TensorRef to D device tensor - may be the same as C
+    {alpha, beta}           // epilogue operation arguments
+  });
+
+  if (status != cutlass::Status::kSuccess) {
+    return -1;
+  } else {
+    std::cout << "CUTLASS Success! " << std::endl;
+  }
+  // only the launch status is checked; the result in C is never read back
+  return 0;
+}
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
new file mode 100644
index 0000000..9d29854
--- /dev/null
+++ b/tests/CMakeLists.txt
@@ -0,0 +1,5 @@
+# Standalone beamformer/correlator test driver; the CLI11 include path is attached to this target only.
+#include_directories(../include)
+add_executable(dsaX_beamformer_correlator_test dsaX_beamformer_correlator_test.cpp)
+target_include_directories(dsaX_beamformer_correlator_test PRIVATE "${CLI11_SOURCE_DIR}/include/CLI")
+
diff --git a/tests/CMakeLists.txt~ b/tests/CMakeLists.txt~
new file mode 100644
index 0000000..f72156b
--- /dev/null
+++ b/tests/CMakeLists.txt~
@@ -0,0 +1,5 @@
+
+#include_directories(../include)
+include_directories(${CLI11_SOURCE_DIR}/src)
+add_executable(dsaX_beamformer_correlator_test dsaX_beamformer_correlator_test.cpp)
+
diff --git a/tests/dsaX_beamformer_correlator_test.cpp b/tests/dsaX_beamformer_correlator_test.cpp
new file mode 100644
index 0000000..3e723d0
--- /dev/null
+++ b/tests/dsaX_beamformer_correlator_test.cpp
@@ -0,0 +1,399 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <math.h>
+#include <string.h>
+
+// Include the dsaX.h header in your application
+//#include <dsaX.h>
+
+int main(int argc, char **argv) {
+
+  /*
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_bfCorr", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+  
+  // DADA Header plus Data Unit 
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+
+  // data block HDU keys
+  key_t in_key = REORDER_BLOCK_KEY;
+  key_t out_key = XGPU_BLOCK_KEY;
+  
+  // command line arguments
+  int core = -1;
+  int arg = 0;
+  int bf = 0;
+  int test = 0;
+  char ftest[200], fflagants[200], fcalib[200];
+  float sfreq = 1498.75;
+  
+  while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:bdh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'i':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &in_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-i flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 't':
+	  if (optarg)
+            {
+	      test = 1;
+	      syslog(LOG_INFO, "test mode");
+	      if (sscanf (optarg, "%s", &ftest) != 1) {
+		syslog(LOG_ERR, "could not read test file name from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-t flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'a':
+	  if (optarg)
+            {
+	      syslog(LOG_INFO, "read calib file %s",optarg);
+	      if (sscanf (optarg, "%s", &fcalib) != 1) {
+		syslog(LOG_ERR, "could not read calib file name from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-a flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'f':
+	  if (optarg)
+            {
+	      syslog(LOG_INFO, "reading flag ants file %s",optarg);
+	      if (sscanf (optarg, "%s", &fflagants) != 1) {
+		syslog(LOG_ERR, "could not read flagants file name from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-f flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 's':
+	  if (optarg)
+            {
+	      sfreq = atof(optarg);
+	      syslog(LOG_INFO, "start freq %g",sfreq);
+ 	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-s flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'd':
+	  //DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+	case 'b':
+	  bf=1;
+	  syslog (LOG_NOTICE, "Running beamformer, NOT correlator");
+	  break;
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // Bind to cpu core
+  if (core >= 0) {
+    if (dada_bind_thread_to_core(core) < 0)
+      syslog(LOG_ERR,"failed to bind to core %d", core);
+    syslog(LOG_NOTICE,"bound to core %d", core);
+  }
+
+  
+  // allocate device memory
+  dmem d;
+  initialize_device_memory(&d,bf);
+
+  // set up for beamformer
+  FILE *ff;
+  int iii;
+  if (bf) {
+
+    if (!(ff=fopen(fflagants,"r"))) {
+      syslog(LOG_ERR,"could not open flagants file\n");
+      exit(1);
+    }
+    d.nflags=0;
+    while (!feof(ff)) {
+      fscanf(ff,"%d\n",&d.flagants[iii]);
+      d.nflags++;
+    }
+    fclose(ff);
+
+    if (!(ff=fopen(fcalib,"rb"))) {
+      syslog(LOG_ERR,"could not open calibss file\n");
+      exit(1);
+    }
+    fread(d.h_winp,NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2,4,ff);
+    fclose(ff);
+
+    for (iii=0;iii<(NCHAN_PER_PACKET/8);iii++)
+      d.h_freqs[iii] = 1e6*(sfreq-iii*250./1024.);
+    cudaMemcpy(d.d_freqs,d.h_freqs,sizeof(float)*(NCHAN_PER_PACKET/8),cudaMemcpyHostToDevice);
+
+    // calculate weights
+    calc_weights(&d);
+    
+  }
+
+  // test mode
+  FILE *fin, *fout;
+  uint64_t output_size;
+  char * output_data;//, * o1;
+  if (test) {
+
+    // read one block of input data    
+    d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
+    for (int i=0;i<512;i++) {
+      fin = fopen(ftest,"rb");
+      fread(d.h_input+i*4*NANTS*NCHAN_PER_PACKET*2*2,4*NANTS*NCHAN_PER_PACKET*2*2,1,fin);
+      fclose(fin);
+    }
+
+    // run correlator or beamformer, and output data
+    if (bf==0) {
+      if (DEBUG) syslog(LOG_INFO,"run correlator");
+      dcorrelator(&d);
+      if (DEBUG) syslog(LOG_INFO,"copy to host");
+      output_size = NBASE*NCHAN_PER_PACKET*2*2*4;
+      output_data = (char *)malloc(output_size);
+      cudaMemcpy(output_data,d.d_output,output_size,cudaMemcpyDeviceToHost);
+
+      fout = fopen("output.dat","wb");
+      fwrite((float *)output_data,sizeof(float),NBASE*NCHAN_PER_PACKET*2*2,fout);
+      fclose(fout);
+    }
+    else {
+      if (DEBUG) syslog(LOG_INFO,"run beamformer");
+      dbeamformer(&d);
+      if (DEBUG) syslog(LOG_INFO,"copy to host");
+      output_size = (NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS;
+      output_data = (char *)malloc(output_size);
+      cudaMemcpy(output_data,d.d_bigpower,output_size,cudaMemcpyDeviceToHost);
+
+      // output_size = 2*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8);
+      // o1 = (char *)malloc(output_size);
+      // cudaMemcpy(o1,d.weights_r,output_size,cudaMemcpyDeviceToHost);
+	
+      
+
+      fout = fopen("output.dat","wb");
+      fwrite((unsigned char *)output_data,sizeof(unsigned char),output_size,fout);
+      //fwrite(o1,1,output_size,fout);
+      fclose(fout);
+    }
+
+	
+    // free
+    free(d.h_input);
+    free(output_data);
+    //free(o1);
+    deallocate_device_memory(&d,bf);
+
+    exit(1);
+  }
+  
+
+
+  
+  // DADA stuff
+  
+  syslog (LOG_INFO, "creating in and out hdus");
+  
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");  
+  
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
+  if (bf==0) 
+    syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,NBASE*NCHAN_PER_PACKET*2*2*4);
+  else
+    syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS);
+  uint64_t  bytes_read = 0;
+  //char * block;
+  char * output_buffer;
+  output_buffer = (char *)malloc(block_out);
+  uint64_t written, block_id;
+  
+  // get things started
+  bool observation_complete=0;
+  //bool started = 0;
+  syslog(LOG_INFO, "starting observation");
+  int blocks = 0;
+  //clock_t begin, end;
+  //double time_spent;
+  
+  while (!observation_complete) {
+
+    if (DEBUG) syslog(LOG_INFO,"reading block");    
+    
+    // open block
+    d.h_input = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+
+    // do stuff
+    //begin = clock();
+    // loop
+    if (bf==0) {
+      if (DEBUG) syslog(LOG_INFO,"run correlator");
+      dcorrelator(&d);
+      if (DEBUG) syslog(LOG_INFO,"copy to host");
+      cudaMemcpy(output_buffer,d.d_output,block_out,cudaMemcpyDeviceToHost);
+    }
+    else {
+      if (DEBUG) syslog(LOG_INFO,"run beamformer");
+      dbeamformer(&d);
+      if (DEBUG) syslog(LOG_INFO,"copy to host");
+      cudaMemcpy(output_buffer,d.d_bigpower,block_out,cudaMemcpyDeviceToHost);
+    }
+    //end = clock();
+    //time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
+    cout << "spent time " << d.cp << " " << d.prep << " " << d.cubl << " " << d.outp << " s" << endl;
+    
+    // write to output
+
+    // write to host
+    written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out);
+    if (written < block_out)
+      {
+	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+	return EXIT_FAILURE;
+      }
+    
+    if (DEBUG) syslog(LOG_INFO, "written block %d",blocks);	    
+    blocks++;
+    // loop end
+    
+      
+    // finish up
+    if (bytes_read < block_size)
+      observation_complete = 1;
+    
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+    
+  }
+
+  // finish up
+  free(output_buffer);
+  deallocate_device_memory(&d,bf);
+  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+  
+  return 0;
+  */
+}
diff --git a/tests/dsaX_beamformer_correlator_test.cpp~ b/tests/dsaX_beamformer_correlator_test.cpp~
new file mode 100644
index 0000000..30184b3
--- /dev/null
+++ b/tests/dsaX_beamformer_correlator_test.cpp~
@@ -0,0 +1,398 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <math.h>
+#include <string.h>
+
+// Include the dsaX.h header in your application
+//#include <dsaX.h>
+
+int main(int argc, char **argv) {
+
+  // startup syslog message
+  // using LOG_LOCAL0
+  openlog ("dsaX_bfCorr", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
+  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
+  
+  // DADA Header plus Data Unit 
+  dada_hdu_t* hdu_in = 0;
+  dada_hdu_t* hdu_out = 0;
+
+  // data block HDU keys
+  key_t in_key = REORDER_BLOCK_KEY;
+  key_t out_key = XGPU_BLOCK_KEY;
+  
+  // command line arguments
+  int core = -1;
+  int arg = 0;
+  int bf = 0;
+  int test = 0;
+  char ftest[200], fflagants[200], fcalib[200];
+  float sfreq = 1498.75;
+  
+  while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:bdh")) != -1)
+    {
+      switch (arg)
+	{
+	case 'c':
+	  if (optarg)
+	    {
+	      core = atoi(optarg);
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-c flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'i':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &in_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-i flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'o':
+	  if (optarg)
+	    {
+	      if (sscanf (optarg, "%x", &out_key) != 1) {
+		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-o flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 't':
+	  if (optarg)
+            {
+	      test = 1;
+	      syslog(LOG_INFO, "test mode");
+	      if (sscanf (optarg, "%s", &ftest) != 1) {
+		syslog(LOG_ERR, "could not read test file name from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-t flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'a':
+	  if (optarg)
+            {
+	      syslog(LOG_INFO, "read calib file %s",optarg);
+	      if (sscanf (optarg, "%s", &fcalib) != 1) {
+		syslog(LOG_ERR, "could not read calib file name from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-a flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'f':
+	  if (optarg)
+            {
+	      syslog(LOG_INFO, "reading flag ants file %s",optarg);
+	      if (sscanf (optarg, "%s", &fflagants) != 1) {
+		syslog(LOG_ERR, "could not read flagants file name from %s\n", optarg);
+		return EXIT_FAILURE;
+	      }
+	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-f flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 's':
+	  if (optarg)
+            {
+	      sfreq = atof(optarg);
+	      syslog(LOG_INFO, "start freq %g",sfreq);
+ 	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-s flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
+	case 'd':
+	  //DEBUG=1;
+	  syslog (LOG_DEBUG, "Will excrete all debug messages");
+	  break;
+	case 'b':
+	  bf=1;
+	  syslog (LOG_NOTICE, "Running beamformer, NOT correlator");
+	  break;
+	case 'h':
+	  usage();
+	  return EXIT_SUCCESS;
+	}
+    }
+
+  // Bind to cpu core
+  if (core >= 0) {
+    if (dada_bind_thread_to_core(core) < 0)
+      syslog(LOG_ERR,"failed to bind to core %d", core);
+    syslog(LOG_NOTICE,"bound to core %d", core);
+  }
+
+  /*
+  // allocate device memory
+  dmem d;
+  initialize_device_memory(&d,bf);
+
+  // set up for beamformer
+  FILE *ff;
+  int iii;
+  if (bf) {
+
+    if (!(ff=fopen(fflagants,"r"))) {
+      syslog(LOG_ERR,"could not open flagants file\n");
+      exit(1);
+    }
+    d.nflags=0;
+    while (!feof(ff)) {
+      fscanf(ff,"%d\n",&d.flagants[iii]);
+      d.nflags++;
+    }
+    fclose(ff);
+
+    if (!(ff=fopen(fcalib,"rb"))) {
+      syslog(LOG_ERR,"could not open calibss file\n");
+      exit(1);
+    }
+    fread(d.h_winp,NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2,4,ff);
+    fclose(ff);
+
+    for (iii=0;iii<(NCHAN_PER_PACKET/8);iii++)
+      d.h_freqs[iii] = 1e6*(sfreq-iii*250./1024.);
+    cudaMemcpy(d.d_freqs,d.h_freqs,sizeof(float)*(NCHAN_PER_PACKET/8),cudaMemcpyHostToDevice);
+
+    // calculate weights
+    calc_weights(&d);
+    
+  }
+
+  // test mode
+  FILE *fin, *fout;
+  uint64_t output_size;
+  char * output_data;//, * o1;
+  if (test) {
+
+    // read one block of input data    
+    d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
+    for (int i=0;i<512;i++) {
+      fin = fopen(ftest,"rb");
+      fread(d.h_input+i*4*NANTS*NCHAN_PER_PACKET*2*2,4*NANTS*NCHAN_PER_PACKET*2*2,1,fin);
+      fclose(fin);
+    }
+
+    // run correlator or beamformer, and output data
+    if (bf==0) {
+      if (DEBUG) syslog(LOG_INFO,"run correlator");
+      dcorrelator(&d);
+      if (DEBUG) syslog(LOG_INFO,"copy to host");
+      output_size = NBASE*NCHAN_PER_PACKET*2*2*4;
+      output_data = (char *)malloc(output_size);
+      cudaMemcpy(output_data,d.d_output,output_size,cudaMemcpyDeviceToHost);
+
+      fout = fopen("output.dat","wb");
+      fwrite((float *)output_data,sizeof(float),NBASE*NCHAN_PER_PACKET*2*2,fout);
+      fclose(fout);
+    }
+    else {
+      if (DEBUG) syslog(LOG_INFO,"run beamformer");
+      dbeamformer(&d);
+      if (DEBUG) syslog(LOG_INFO,"copy to host");
+      output_size = (NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS;
+      output_data = (char *)malloc(output_size);
+      cudaMemcpy(output_data,d.d_bigpower,output_size,cudaMemcpyDeviceToHost);
+
+      // output_size = 2*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8);
+      // o1 = (char *)malloc(output_size);
+      // cudaMemcpy(o1,d.weights_r,output_size,cudaMemcpyDeviceToHost);
+	
+      
+
+      fout = fopen("output.dat","wb");
+      fwrite((unsigned char *)output_data,sizeof(unsigned char),output_size,fout);
+      //fwrite(o1,1,output_size,fout);
+      fclose(fout);
+    }
+
+	
+    // free
+    free(d.h_input);
+    free(output_data);
+    //free(o1);
+    deallocate_device_memory(&d,bf);
+
+    exit(1);
+  }
+  
+
+
+  
+  // DADA stuff
+  
+  syslog (LOG_INFO, "creating in and out hdus");
+  
+  hdu_in  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_in, in_key);
+  if (dada_hdu_connect (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not connect to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_read (hdu_in) < 0) {
+    syslog (LOG_ERR,"could not lock to dada buffer in");
+    return EXIT_FAILURE;
+  }
+  
+  hdu_out  = dada_hdu_create (0);
+  dada_hdu_set_key (hdu_out, out_key);
+  if (dada_hdu_connect (hdu_out) < 0) {
+    syslog (LOG_ERR,"could not connect to output  buffer");
+    return EXIT_FAILURE;
+  }
+  if (dada_hdu_lock_write(hdu_out) < 0) {
+    syslog (LOG_ERR, "could not lock to output buffer");
+    return EXIT_FAILURE;
+  }
+
+  uint64_t header_size = 0;
+
+  // deal with headers
+  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
+  if (!header_in)
+    {
+      syslog(LOG_ERR, "could not read next header");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block cleared");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  
+  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
+  if (!header_out)
+    {
+      syslog(LOG_ERR, "could not get next header block [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+  memcpy (header_out, header_in, header_size);
+  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
+    {
+      syslog (LOG_ERR, "could not mark header block filled [output]");
+      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+      return EXIT_FAILURE;
+    }
+
+  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");  
+  
+  // get block sizes and allocate memory
+  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
+  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
+  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
+  if (bf==0) 
+    syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,NBASE*NCHAN_PER_PACKET*2*2*4);
+  else
+    syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS);
+  uint64_t  bytes_read = 0;
+  //char * block;
+  char * output_buffer;
+  output_buffer = (char *)malloc(block_out);
+  uint64_t written, block_id;
+  
+  // get things started
+  bool observation_complete=0;
+  //bool started = 0;
+  syslog(LOG_INFO, "starting observation");
+  int blocks = 0;
+  //clock_t begin, end;
+  //double time_spent;
+  
+  while (!observation_complete) {
+
+    if (DEBUG) syslog(LOG_INFO,"reading block");    
+    
+    // open block
+    d.h_input = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
+
+    // do stuff
+    //begin = clock();
+    // loop
+    if (bf==0) {
+      if (DEBUG) syslog(LOG_INFO,"run correlator");
+      dcorrelator(&d);
+      if (DEBUG) syslog(LOG_INFO,"copy to host");
+      cudaMemcpy(output_buffer,d.d_output,block_out,cudaMemcpyDeviceToHost);
+    }
+    else {
+      if (DEBUG) syslog(LOG_INFO,"run beamformer");
+      dbeamformer(&d);
+      if (DEBUG) syslog(LOG_INFO,"copy to host");
+      cudaMemcpy(output_buffer,d.d_bigpower,block_out,cudaMemcpyDeviceToHost);
+    }
+    //end = clock();
+    //time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
+    cout << "spent time " << d.cp << " " << d.prep << " " << d.cubl << " " << d.outp << " s" << endl;
+    
+    // write to output
+
+    // write to host
+    written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out);
+    if (written < block_out)
+      {
+	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
+	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+	return EXIT_FAILURE;
+      }
+    
+    if (DEBUG) syslog(LOG_INFO, "written block %d",blocks);	    
+    blocks++;
+    // loop end
+    
+      
+    // finish up
+    if (bytes_read < block_size)
+      observation_complete = 1;
+    
+    ipcio_close_block_read (hdu_in->data_block, bytes_read);
+    
+  }
+
+  // finish up
+  free(output_buffer);
+  deallocate_device_memory(&d,bf);
+  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
+  
+  return 0;
+  */
+}
diff --git a/utils/.gitignore b/tests/utils/.gitignore
similarity index 100%
rename from utils/.gitignore
rename to tests/utils/.gitignore
diff --git a/tests/utils/CMakeLists.txt b/tests/utils/CMakeLists.txt
new file mode 100644
index 0000000..226c9de
--- /dev/null
+++ b/tests/utils/CMakeLists.txt
@@ -0,0 +1,11 @@
+# Install step: copy the Python utility scripts listed below into the `utils` directory under the install prefix
+#------------------------------
+set(DSA_XENGINE_UTILS
+  # cmake-format: sortable
+  gen_packet.py
+  get_rms_packet.py
+  get_rms.py
+  sockets.py
+  )
+install(FILES ${DSA_XENGINE_UTILS} DESTINATION utils)
+#------------------------------
diff --git a/tests/utils/CMakeLists.txt~ b/tests/utils/CMakeLists.txt~
new file mode 100644
index 0000000..ab053c5
--- /dev/null
+++ b/tests/utils/CMakeLists.txt~
@@ -0,0 +1,22 @@
+# install step for utils
+#------------------------------
+set(DSA_XENGINE_UTILS
+  # cmake-format: sortable
+/home/dmhowart/DSA110/dsa110-xengine/src/dsaX_bfCorr.cu  dsaX_capture.h
+  dsaX_capture_manythread.h
+  dsaX_capture_pcap.h
+  dsaX_def.h
+  dsaX_cutlass_interface.h
+  )
+install(FILES ${DSA_XENGINE_HEADERS} DESTINATION include)
+#------------------------------
+
+# install step for executables
+#-----------------------------
+install(TARGETS
+  # cmake-format: sortable
+  dsaX_bfCorr
+  RUNTIME DESTINATION
+  bin
+  )
+#-----------------------------
diff --git a/utils/gen_packet.py b/tests/utils/gen_packet.py
similarity index 100%
rename from utils/gen_packet.py
rename to tests/utils/gen_packet.py
diff --git a/utils/get_rms.py b/tests/utils/get_rms.py
similarity index 100%
rename from utils/get_rms.py
rename to tests/utils/get_rms.py
diff --git a/utils/get_rms_packet.py b/tests/utils/get_rms_packet.py
similarity index 100%
rename from utils/get_rms_packet.py
rename to tests/utils/get_rms_packet.py
diff --git a/utils/packet.out b/tests/utils/packet.out
similarity index 100%
rename from utils/packet.out
rename to tests/utils/packet.out
diff --git a/utils/sockets.py b/tests/utils/sockets.py
similarity index 100%
rename from utils/sockets.py
rename to tests/utils/sockets.py
diff --git a/utils/test.out b/tests/utils/test.out
similarity index 100%
rename from utils/test.out
rename to tests/utils/test.out

From b7789e216bb6e56286fada15eabe874e69b803c5 Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Mon, 24 Jun 2024 15:17:19 -0700
Subject: [PATCH 19/30] Rename psrdada header file, split beamformer and
 correlator files (correlator is now pure cpp code, independent of platform).
 Create a test file independent of platform and psrdada, restore accidentally
 deleted utils

---
 README.md                                     |   9 +-
 include/CMakeLists.txt                        |   1 +
 include/dsaX.h                                |  32 +-
 include/dsaX_cublas_interface.h               |   2 +-
 include/dsaX_cuda_interface.h                 |  27 +-
 include/dsaX_def.h                            |   3 +-
 ...psrdada_headers.h => dsaX_psrdada_utils.h} |   4 +
 include/dsaX_utils.h                          |   2 +-
 src/CMakeLists.txt                            |  20 +-
 src/dsaX_beamformer.cu                        | 168 +++++
 src/dsaX_beamformer_correlator.cu             | 612 ------------------
 src/dsaX_blas_interface.cu                    |   2 +-
 src/dsaX_correlator.cpp                       |  59 ++
 src/dsaX_cublas_interface.cu                  |  35 +-
 src/dsaX_cuda_interface.cu                    |  66 +-
 src/dsaX_utils.cpp                            |   9 +
 tests/CMakeLists.txt                          |   6 +-
 tests/dsaX_beamformer_correlator_test.cpp     | 399 ------------
 tests/dsaX_correlator_test.cpp                | 195 ++++++
 utils/gen_packet.py                           | 228 +++++++
 utils/gen_testblock.py                        |  49 ++
 utils/get_rms.py                              | 141 ++++
 utils/get_rms_packet.py                       |  36 ++
 utils/sockets.py                              |  31 +
 24 files changed, 1024 insertions(+), 1112 deletions(-)
 rename include/{dsaX_psrdada_headers.h => dsaX_psrdada_utils.h} (70%)
 create mode 100644 src/dsaX_beamformer.cu
 delete mode 100644 src/dsaX_beamformer_correlator.cu
 create mode 100644 src/dsaX_correlator.cpp
 delete mode 100644 tests/dsaX_beamformer_correlator_test.cpp
 create mode 100644 tests/dsaX_correlator_test.cpp
 create mode 100644 utils/gen_packet.py
 create mode 100644 utils/gen_testblock.py
 create mode 100644 utils/get_rms.py
 create mode 100644 utils/get_rms_packet.py
 create mode 100644 utils/sockets.py

diff --git a/README.md b/README.md
index 4a27ba5..f771017 100644
--- a/README.md
+++ b/README.md
@@ -71,11 +71,4 @@ Finally, `dsaX_dbnic` and `dsaX_nicdb` implement the corner turn to feed `mbheim
 
 ### scripts and utils
 
-The "scripts" dir contains some useful scripts to test various aspects of the system (corr, bf, cornerturn). The "utils" dir includes functionality to generate fake data and beamforming weights. 
-
-
-
- 
- 
- 
-
+The "scripts" dir contains some useful scripts to test various aspects of the system (corr, bf, cornerturn). The "utils" dir includes functionality to generate fake data and beamforming weights.
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
index 1bbdfda..a056a0f 100644
--- a/include/CMakeLists.txt
+++ b/include/CMakeLists.txt
@@ -4,6 +4,7 @@ enable_language(CUDA)
 #------------------------------
 set(DSA_XENGINE_HEADERS
   # cmake-format: sortable
+  dsaX_cuda_interface.h
   dsaX_cuda_headers.h
   dsaX_capture.h
   dsaX_capture_manythread.h
diff --git a/include/dsaX.h b/include/dsaX.h
index 2ee856a..7cf23dc 100644
--- a/include/dsaX.h
+++ b/include/dsaX.h
@@ -1,24 +1,9 @@
 #pragma once 
 
-#include <iostream>
-#include <algorithm>
 #include <complex>
-#include <vector>
-#include <stdio.h>
-#include <stdlib.h>
-#include <cmath>
-#include <string.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <netinet/in.h>
-#include <time.h>
-#include <syslog.h>
-#include <pthread.h>
 
+#include "dsaX_enums.h"
 #include "dsaX_cuda_headers.h"
-#include "dsaX_psrdada_headers.h"
 
 // required to prevent overflow in corr matrix multiply
 #define halfFac 4
@@ -26,9 +11,6 @@
 // beam sep
 #define sep 1.0 // arcmin
 
-/* global variables */
-//#define DEBUG;
-
 // define structure that carries around device memory
 typedef struct dmem {
 
@@ -91,15 +73,3 @@ typedef struct dsaXBLASParam_s {
   dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */
   
 } dsaXBLASParam;
-  
-
-// Initialise device memory
-void initialize_device_memeory(dmem * d, int bf);
-
-// Deallocate device memory
-void deallocate(dmem * d, int bf);
-
-void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
-
-// ?
-int dada_bind_thread_to_core(int core);
diff --git a/include/dsaX_cublas_interface.h b/include/dsaX_cublas_interface.h
index 9265f37..7ad8b31 100644
--- a/include/dsaX_cublas_interface.h
+++ b/include/dsaX_cublas_interface.h
@@ -2,4 +2,4 @@
 #include "dsaX.h"
 #include "dsaX_cuda_headers.h"
 
-void dsaXHgemmStridedBatchedCuda(void *real_in, void *imag_in, void *real_out, void *imag_out, dsaXBLASParam param);
+void dsaXHgemmStridedBatchedCuda(half *real_in, half *imag_in, half *real_out, half *imag_out, dsaXBLASParam param);
diff --git a/include/dsaX_cuda_interface.h b/include/dsaX_cuda_interface.h
index 99b1db2..c8ea8aa 100644
--- a/include/dsaX_cuda_interface.h
+++ b/include/dsaX_cuda_interface.h
@@ -1,31 +1,32 @@
 #pragma once
 
-#include "dsaX.h"
 #include "dsaX_def.h"
+#include "dsaX.h"
 
-void initialize_device_memory(dmem * d, int bf);
+#ifdef DSA_XENGINE_TARGET_CUDA
+void initialize_device_memory(dmem *d, int bf);
 
-void deallocate_device_memory(dmem * d, int bf);
+void deallocate_device_memory(dmem *d, int bf);
 
-void reorder_output_device(dmem * d);
+void reorder_output_device(dmem *d);
 
 __global__ void corr_input_copy(char *input, half *inr, half *ini);
 
-template <typename in_prec, typename out_prec> __global__ void transpose_matrix(in_prec * idata, out_prec * odata);
+template <typename in_prec, typename out_prec> __global__ void transpose_matrix(in_prec *idata, out_prec *odata);
 
-void reorder_input_device(char *input, char * tx, half *inr, half *ini);
+void reorder_input_device(char *input, char *tx, half *inr, half *ini);
 
 __global__ void corr_output_copy(half *outr, half *outi, float *output, int *indices_lookup);
 
-__global__ void transpose_input_bf(double * idata, double * odata);
-
-__global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs);
+__global__ void transpose_input_bf(double *idata, double *odata);
 
-void calc_weights(dmem * d);
+__global__ void populate_weights_matrix(float *antpos_e, float *antpos_n, float *calibs, half *wr, half *wi, float *fqs);
 
-__global__ void fluff_input_bf(char * input, half * dr, half * di);
+void calc_weights(dmem *d);
 
-__global__ void transpose_scale_bf(half * ir, half * ii, unsigned char * odata);
+__global__ void fluff_input_bf(char *input, half *dr, half *di);
 
-__global__ void sum_beam(unsigned char * input, float * output);
+__global__ void transpose_scale_bf(half *ir, half *ii, unsigned char *odata);
 
+__global__ void sum_beam(unsigned char *input, float *output);
+#endif
diff --git a/include/dsaX_def.h b/include/dsaX_def.h
index c23ed15..257f493 100644
--- a/include/dsaX_def.h
+++ b/include/dsaX_def.h
@@ -1,7 +1,5 @@
 #pragma once
 
-#include "dada_def.h"
-
 // default dada block keys
 #define TEST_BLOCK_KEY 0x0000aada // for capture program.
 // 128*3*384*32*2=9437184 for 1 CHANG 1 SNAP 1 REORDER
@@ -38,6 +36,7 @@
 #define XGPU_IN_INC 1 // size of input increment
 #define NBASE 4656 // nant*(nant+1)/2
 #define NPOL 2
+#define NCOMPLEX 2 // two reals per complex
 #define NCHAN 1536 // regardless of NCHANG
 
 // default port for packet capture
diff --git a/include/dsaX_psrdada_headers.h b/include/dsaX_psrdada_utils.h
similarity index 70%
rename from include/dsaX_psrdada_headers.h
rename to include/dsaX_psrdada_utils.h
index 325dcb8..2dc3dec 100644
--- a/include/dsaX_psrdada_headers.h
+++ b/include/dsaX_psrdada_utils.h
@@ -10,3 +10,7 @@
 #include "ascii_header.h"
 #include "dsaX_def.h"
 #include "dsaX_enums.h"
+
+void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
+
+int dada_bind_thread_to_core(int core);
diff --git a/include/dsaX_utils.h b/include/dsaX_utils.h
index 3976db7..5d39861 100644
--- a/include/dsaX_utils.h
+++ b/include/dsaX_utils.h
@@ -6,4 +6,4 @@ void dsaXmemset(void *array, int ch, size_t n);
 
 void dsaXmemcpyHostToDevice(void *array_device, void *array_host, size_t n);
 void dsaXmemcpyDeviceToHost(void *array_host, void *array_device, size_t n);
-
+void dsaXmemcpyDeviceToDevice(void *array_device_to, void *array_device_from, size_t n);
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e23827b..c73743a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -6,9 +6,12 @@ include_directories(../include)
 #-----------------------------
 add_library(dsa SHARED
   dsaX_cuda_interface.cu
+  dsaX_cublas_interface.cu
   dsaX_blas_interface.cu
-  dsaX_beamformer_correlator.cu
+  dsaX_beamformer.cu
+  dsaX_correlator.cpp
   dsaX_utils.cpp
+  dsaX_psrdada_utils.cpp
   )
 
 if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA)
@@ -19,13 +22,13 @@ if(DSA_XENGINE_TARGET_TYPE STREQUAL CPU)
 endif()
 
 if(CUDAToolkit_FOUND)
-  target_link_libraries(dsa CUDA::cudart)
+  target_link_libraries(dsa PUBLIC CUDA::cudart)
 endif()
 
 if(DSA_XENGINE_ENABLE_PSRDADA)
   include_directories(${PSRDada_SOURCE_DIR}/src)
   set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so)
-  target_link_libraries(dsa ${PSRDada_LIB})
+  target_link_libraries(dsa PUBLIC ${PSRDada_LIB})
 endif()
 
 if(DSA_XENGINE_ENABLE_XGPU) 
@@ -58,11 +61,6 @@ if(DSA_XENGINE_ENABLE_CUTLASS)
   target_link_libraries(dsaX_cutlass_interface ${NvidiaCutlass_LIB})
   #---------------------------------------  
 endif()
-
-if(CUDAToolkit_FOUND)
-  #add_executable(dsaX_beamformer_correlator dsaX_beamformer_correlator.cu)
-  #target_link_libraries(dsaX_beamformer_correlator ${dsa} ${CUDA_cublas_LIBRARY} ${PSRDada_LIB})
-endif()
 #---------------------
 
 # install step for libraray
@@ -75,7 +73,6 @@ install(TARGETS
   )
 #-----------------------------
 
-
 # install step for executables
 #-----------------------------
 install(TARGETS
@@ -85,3 +82,8 @@ install(TARGETS
   bin
   )
 #-----------------------------
+
+if(CUDAToolkit_FOUND) # executable target; only defined when the CUDA toolkit was found (provides CUDA::cublas)
+  add_executable(dsaX_beamformer_correlator_exe dsaX_beamformer_correlator_exe.cu)
+  target_link_libraries(dsaX_beamformer_correlator_exe PRIVATE dsa CUDA::cublas ${PSRDada_LIB})
+endif()
diff --git a/src/dsaX_beamformer.cu b/src/dsaX_beamformer.cu
new file mode 100644
index 0000000..0d7b1df
--- /dev/null
+++ b/src/dsaX_beamformer.cu
@@ -0,0 +1,168 @@
+// -*- c++ -*-
+/* assumes input and output block size is appropriate - will seg fault otherwise*/
+/*
+Workflow is similar for BF and corr applications
+ - copy data to GPU, convert to half-precision and calibrate while reordering
+ - do matrix operations to populate large output vector
+ */
+
+#include <iostream>
+
+#include "dsaX_def.h"
+#include "dsaX.h"
+#include "dsaX_blas_interface.h"
+#include "dsaX_utils.h"
+#include "dsaX_psrdada_utils.h"
+#ifdef DSA_XENGINE_TARGET_CUDA
+#include "dsaX_cuda_interface.h"
+#endif
+
+using namespace std;
+
+int DEBUG = 1;
+
+void usage() { // print the command-line option summary to stdout
+  fprintf (stdout,
+	   "dsaX_beamformer_correlator [options]\n"
+	   " -c core   bind process to CPU core [no default]\n"
+	   " -d send debug messages to syslog\n"
+	   " -i in_key [default REORDER_BLOCK_KEY]\n"
+	   " -o out_key [default XGPU_BLOCK_KEY]\n"
+	   " -b run beamformer [default is to run correlator]\n"
+	   " -h print usage\n"
+	   " -t binary file for test mode\n"
+	   " -f flagants file\n"
+	   " -a calib file\n"
+	   " -s start frequency (assumes -0.244140625MHz BW)\n");
+}
+
+
+/*
+Beamformer:
+ - initial data is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] 
+ - split into EW and NS antennas via cudaMemcpy: [NPACKETS_PER_BLOCK, NANTS/2, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
+ - want [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, NANTS/2, 8chan, 2 times, 2 pol, 4-bit complex]
+(single transpose operation)
+ - weights are [NCHAN_PER_PACKET/8, NBEAMS, 4tim, NANTS/2, 8chan, 2 times, 2 pol] x 2
+ - then fluff and run beamformer: output is [NCHAN_PER_PACKET/8, NBEAMS, NPACKETS_PER_BLOCK/4] (w column-major)
+ - transpose and done! 
+
+*/
+// beamformer function: processes the input block at d->h_input and leaves the 8-bit powers in d->d_bigpower
+void dbeamformer(dmem *d) {
+
+  // GEMM settings - recall that cuBLAS assumes column-major storage
+  // batched over NCHAN_PER_PACKET/8 channel groups (original note: stride over 48 chans)
+  cublasHandle_t cublasH = NULL;
+  cublasCreate(&cublasH);
+  cublasOperation_t transa = CUBLAS_OP_T;
+  cublasOperation_t transb = CUBLAS_OP_N;
+  const int m = NPACKETS_PER_BLOCK/4;
+  const int n = NBEAMS/2;
+  const int k = 4*(NANTS/2)*8*2*2;
+  const half alpha = 1.;
+  const half malpha = -1.;
+  const int lda = k;
+  const int ldb = k;
+  const half beta0 = 0.;
+  const half beta1 = 1.;
+  const int ldc = m;
+  const long long int strideA = (NPACKETS_PER_BLOCK)*(NANTS/2)*8*2*2;
+  const long long int strideB = (NBEAMS/2)*4*(NANTS/2)*8*2*2;
+  const long long int strideC = (NPACKETS_PER_BLOCK/4)*NBEAMS/2;
+  const int batchCount = NCHAN_PER_PACKET/8;
+  long long int i1, i2;//, o1;
+  
+  // create the stream that the cuBLAS calls below are issued on
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+
+  // timing: clock() deltas are accumulated (in seconds) into d->cp, d->prep, d->cubl, d->outp
+  // i.e. copy, prepare, cublas, output. NOTE(review): kernel launches are async, so prep/outp may only time the launch - confirm
+  clock_t begin, end;
+
+  // copy the whole host input block to the device in one transfer
+  begin = clock();
+  dsaXmemcpyHostToDevice(d->d_big_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4);
+  end = clock();
+  d->cp += (float)(end - begin) / CLOCKS_PER_SEC;
+  
+  // loop over the two halves (arms) of the array
+  for (int iArm=0;iArm<2;iArm++) {
+  
+    // zero out the real and imaginary beam output arrays
+    dsaXmemset(d->d_bigbeam_r,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half));
+    dsaXmemset(d->d_bigbeam_i,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half));
+    cudaDeviceSynchronize();
+    
+    // gather this arm's antennas from d_big_input into d_input (device-to-device copies)
+    // initial data: [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
+    // final data: per packet, only this arm's NANTS/2 antennas, packed contiguously
+    begin = clock();
+    for (i1=0; i1<NPACKETS_PER_BLOCK; i1++) 
+      dsaXmemcpyDeviceToDevice(d->d_input+i1*(NANTS/2)*NCHAN_PER_PACKET*4,
+			       d->d_big_input+i1*(NANTS)*NCHAN_PER_PACKET*4+iArm*(NANTS/2)*NCHAN_PER_PACKET*4,
+			       (NANTS/2)*NCHAN_PER_PACKET*4);
+    end = clock();
+    d->cp += (float)(end - begin) / CLOCKS_PER_SEC;
+    
+    // reorder (transpose) the input into d_tx, then fluff it into the real and imag half arrays
+    begin = clock();
+    
+    dim3 dimBlock1(16, 8), dimGrid1(NCHAN_PER_PACKET/8/16, (NPACKETS_PER_BLOCK)*(NANTS/2)/16);    
+    transpose_input_bf<<< dimGrid1, dimBlock1 >>>((double *)(d->d_input), (double *)(d->d_tx));    
+    fluff_input_bf<<<NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128,128>>>(d->d_tx, d->d_br, d->d_bi);
+    
+    end = clock();
+    d->prep += (float)(end - begin) / CLOCKS_PER_SEC;
+
+    // large matrix multiply to get real and imag outputs
+    // set up for gemm: use the stream created above and locate this arm's weights
+    cublasSetStream(cublasH, stream);
+    i2 = iArm*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8); // offset of this arm's weights
+    
+    // run strided batched gemm: complex product (a+ib)(c+id) with a=d_br, b=d_bi, c=weights_r, d=weights_i
+    begin = clock();
+    // ac: overwrites d_bigbeam_r (beta0 = 0)
+    cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
+			      &alpha,d->d_br,lda,strideA,
+			      d->weights_r+i2,ldb,strideB,&beta0,
+			      d->d_bigbeam_r,ldc,strideC,
+			      batchCount);
+    // -bd: accumulated into d_bigbeam_r (malpha = -1, beta1 = 1)
+    cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
+			      &malpha,d->d_bi,lda,strideA,
+			      d->weights_i+i2,ldb,strideB,&beta1,
+			      d->d_bigbeam_r,ldc,strideC,
+			      batchCount);
+    // bc: overwrites d_bigbeam_i (beta0 = 0)
+    cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
+			      &alpha,d->d_bi,lda,strideA,
+			      d->weights_r+i2,ldb,strideB,&beta0,
+			      d->d_bigbeam_i,ldc,strideC,
+			      batchCount);
+    // ad: accumulated into d_bigbeam_i (beta1 = 1)
+    cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
+			      &alpha,d->d_br,lda,strideA,
+			      d->weights_i+i2,ldb,strideB,&beta1,
+			      d->d_bigbeam_i,ldc,strideC,
+			      batchCount);
+      
+    cudaDeviceSynchronize();
+    end = clock();
+    d->cubl += (float)(end - begin) / CLOCKS_PER_SEC;
+        
+    // simple formation of total power and scaling to 8-bit in transpose kernel; each arm writes its own section of d_bigpower
+    begin = clock();
+    dim3 dimBlock(16, 8), dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16);
+    transpose_scale_bf<<<dimGrid,dimBlock>>>(d->d_bigbeam_r,d->d_bigbeam_i,d->d_bigpower+iArm*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2));
+    end = clock();
+    d->outp += (float)(end - begin) / CLOCKS_PER_SEC;
+  }
+
+  cudaStreamDestroy(stream);
+  cublasDestroy(cublasH);
+
+  // form sum over times (currently disabled)
+  //sum_beam<<<24576,512>>>(d->d_bigpower,d->d_chscf);
+}
diff --git a/src/dsaX_beamformer_correlator.cu b/src/dsaX_beamformer_correlator.cu
deleted file mode 100644
index ddbc73c..0000000
--- a/src/dsaX_beamformer_correlator.cu
+++ /dev/null
@@ -1,612 +0,0 @@
-// -*- c++ -*-
-/* assumes input and output block size is appropriate - will seg fault otherwise*/
-/*
-Workflow is similar for BF and corr applications
- - copy data to GPU, convert to half-precision and calibrate while reordering
- - do matrix operations to populate large output vector
- */
-
-#include "dsaX_def.h"
-#include "dsaX.h"
-#include "dsaX_blas_interface.h"
-#include "dsaX_utils.h"
-#include "dsaX_blas_interface.h"
-#ifdef DSA_XENGINE_TARGET_CUDA
-#include "dsaX_cuda_interface.h"
-#endif
-
-int DEBUG = 1;
-
-void dsaX_dbgpu_cleanup(dada_hdu_t * in, dada_hdu_t * out)
-{
-  if (dada_hdu_unlock_read (in) < 0) syslog(LOG_ERR, "could not unlock read on hdu_in");
-  dada_hdu_destroy (in);
-  
-  if (dada_hdu_unlock_write (out) < 0) syslog(LOG_ERR, "could not unlock write on hdu_out");
-  dada_hdu_destroy (out);
-  
-} 
-
-void usage() {
-  fprintf (stdout,
-	   "dsaX_beamformer_correlator [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -i in_key [default REORDER_BLOCK_KEY]\n"
-	   " -o out_key [default XGPU_BLOCK_KEY]\n"
-	   " -b run beamformer [default is to run correlator]\n"
-	   " -h print usage\n"
-	   " -t binary file for test mode\n"
-	   " -f flagants file\n"
-	   " -a calib file\n"
-	   " -s start frequency (assumes -0.244140625MHz BW)\n");
-}
-
-// correlator function
-// workflow: copy to device, reorder, stridedBatchedGemm, reorder
-void dcorrelator(dmem *d) {
-
-  // copy to device
-  dsaXmemcpyHostToDevice(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
-  
-  // zero out output arrays
-  dsaXmemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half));
-  dsaXmemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half));
-  dsaXmemset(d->d_output, 0, NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float));
-  
-  // reorder input
-  reorder_input_device(d->d_input, d->d_tx, d->d_r, d->d_i);
-
-  dsaXBLASParam blas_param;
-  // gemm settings
-  // input: [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS]
-  // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS] 
-  blas_param.trans_a = DSA_BLAS_OP_N;
-  blas_param.trans_b = DSA_BLAS_OP_T;
-  blas_param.m = NANTS;
-  blas_param.n = NANTS;
-  blas_param.k = NPACKETS_PER_BLOCK/halfFac;
-  blas_param.alpha = 1.0;
-  blas_param.lda = blas_param.m;
-  blas_param.ldb = blas_param.n;
-  blas_param.beta = 0.;
-  blas_param.ldc = blas_param.m;
-  blas_param.a_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;
-  blas_param.b_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;
-  blas_param.c_stride = NANTS*NANTS;
-  blas_param.batch_count = NCHAN_PER_PACKET*2*2*halfFac;
-
-  // Perform GEMM accoring to back end configuration
-  dsaXHgemmStridedBatched(d->d_r, d->d_i, d->d_outr, d->d_outi, blas_param);
-  
-  /*
-  // ABSTRACT HERE START
-  // ABSTRACT HERE END
-  */
-  
-  // reorder output data
-  reorder_output_device(d);
-}
-
-/*
-Beamformer:
- - initial data is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] 
- - split into EW and NS antennas via cudaMemcpy: [NPACKETS_PER_BLOCK, NANTS/2, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
- - want [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, NANTS/2, 8chan, 2 times, 2 pol, 4-bit complex]
-(single transpose operation)
- - weights are [NCHAN_PER_PACKET/8, NBEAMS, 4tim, NANTS/2, 8chan, 2 times, 2 pol] x 2
- - then fluff and run beamformer: output is [NCHAN_PER_PACKET/8, NBEAMS, NPACKETS_PER_BLOCK/4] (w column-major)
- - transpose and done! 
-
-*/
-// beamformer function
-void dbeamformer(dmem * d) {
-
-  // gemm settings - recall column major order assumed
-  // stride over 48 chans
-  cublasHandle_t cublasH = NULL;
-  cublasCreate(&cublasH);
-  cublasOperation_t transa = CUBLAS_OP_T;
-  cublasOperation_t transb = CUBLAS_OP_N;
-  const int m = NPACKETS_PER_BLOCK/4;
-  const int n = NBEAMS/2;
-  const int k = 4*(NANTS/2)*8*2*2;
-  const half alpha = 1.;
-  const half malpha = -1.;
-  const int lda = k;
-  const int ldb = k;
-  const half beta0 = 0.;
-  const half beta1 = 1.;
-  const int ldc = m;
-  const long long int strideA = (NPACKETS_PER_BLOCK)*(NANTS/2)*8*2*2;
-  const long long int strideB = (NBEAMS/2)*4*(NANTS/2)*8*2*2;
-  const long long int strideC = (NPACKETS_PER_BLOCK/4)*NBEAMS/2;
-  const int batchCount = NCHAN_PER_PACKET/8;
-  long long int i1, i2;//, o1;
-  
-  // create streams
-  cudaStream_t stream;
-  cudaStreamCreate(&stream);
-
-  // timing
-  // copy, prepare, cublas, output
-  clock_t begin, end;
-
-  // do big memcpy
-  begin = clock();
-  dsaXmemcpyHostToDevice(d->d_big_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4);
-  end = clock();
-  d->cp += (float)(end - begin) / CLOCKS_PER_SEC;
-  
-  // loop over halves of the array
-  for (int iArm=0;iArm<2;iArm++) {
-  
-    // zero out output arrays
-    dsaXmemset(d->d_bigbeam_r,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half));
-    dsaXmemset(d->d_bigbeam_i,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half));
-    cudaDeviceSynchronize();
-    
-    // copy data to device
-    // initial data: [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
-    // final data: need to split by NANTS.
-    begin = clock();
-    for (i1=0;i1<NPACKETS_PER_BLOCK;i1++) 
-      cudaMemcpy(d->d_input+i1*(NANTS/2)*NCHAN_PER_PACKET*4,d->d_big_input+i1*(NANTS)*NCHAN_PER_PACKET*4+iArm*(NANTS/2)*NCHAN_PER_PACKET*4,(NANTS/2)*NCHAN_PER_PACKET*4,cudaMemcpyDeviceToDevice);
-    end = clock();
-    d->cp += (float)(end - begin) / CLOCKS_PER_SEC;
-    
-    // do reorder and fluff of data to real and imag
-    begin = clock();
-    dim3 dimBlock1(16, 8), dimGrid1(NCHAN_PER_PACKET/8/16, (NPACKETS_PER_BLOCK)*(NANTS/2)/16);
-    transpose_input_bf<<<dimGrid1,dimBlock1>>>((double *)(d->d_input),(double *)(d->d_tx));
-    fluff_input_bf<<<NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128,128>>>(d->d_tx,d->d_br,d->d_bi);
-    end = clock();
-    d->prep += (float)(end - begin) / CLOCKS_PER_SEC;
-
-    // large matrix multiply to get real and imag outputs
-    // set up for gemm
-    cublasSetStream(cublasH, stream);
-    i2 = iArm*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8); // weights offset
-          
-    // run strided batched gemm
-    begin = clock();
-    // ac
-    cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
-			      &alpha,d->d_br,lda,strideA,
-			      d->weights_r+i2,ldb,strideB,&beta0,
-			      d->d_bigbeam_r,ldc,strideC,
-			      batchCount);
-    // -bd
-    cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
-			      &malpha,d->d_bi,lda,strideA,
-			      d->weights_i+i2,ldb,strideB,&beta1,
-			      d->d_bigbeam_r,ldc,strideC,
-			      batchCount);
-    // bc
-    cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
-			      &alpha,d->d_bi,lda,strideA,
-			      d->weights_r+i2,ldb,strideB,&beta0,
-			      d->d_bigbeam_i,ldc,strideC,
-			      batchCount);
-    // ad
-    cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
-			      &alpha,d->d_br,lda,strideA,
-			      d->weights_i+i2,ldb,strideB,&beta1,
-			      d->d_bigbeam_i,ldc,strideC,
-			      batchCount);
-      
-    cudaDeviceSynchronize();
-    end = clock();
-    d->cubl += (float)(end - begin) / CLOCKS_PER_SEC;
-      
-        
-    // simple formation of total power and scaling to 8-bit in transpose kernel
-    begin = clock();
-    dim3 dimBlock(16, 8), dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16);
-    transpose_scale_bf<<<dimGrid,dimBlock>>>(d->d_bigbeam_r,d->d_bigbeam_i,d->d_bigpower+iArm*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2));
-    end = clock();
-    d->outp += (float)(end - begin) / CLOCKS_PER_SEC;
-  }
-
-  cudaStreamDestroy(stream);
-  cublasDestroy(cublasH);
-
-  // form sum over times
-  //sum_beam<<<24576,512>>>(d->d_bigpower,d->d_chscf);
-  
-}
-
-
-// MAIN
-#if 0
-int main (int argc, char *argv[]) {
-
-  cudaSetDevice(0);
-  
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_bfCorr", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  /* DADA Header plus Data Unit */
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-
-  // data block HDU keys
-  key_t in_key = REORDER_BLOCK_KEY;
-  key_t out_key = XGPU_BLOCK_KEY;
-  
-  // command line arguments
-  int core = -1;
-  int arg = 0;
-  int bf = 0;
-  int test = 0;
-  char ftest[200], fflagants[200], fcalib[200];
-  float sfreq = 1498.75;
-  
-  while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:bdh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 't':
-	  if (optarg)
-            {
-	      test = 1;
-	      syslog(LOG_INFO, "test mode");
-	      if (sscanf (optarg, "%s", &ftest) != 1) {
-		syslog(LOG_ERR, "could not read test file name from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-t flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'a':
-	  if (optarg)
-            {
-	      syslog(LOG_INFO, "read calib file %s",optarg);
-	      if (sscanf (optarg, "%s", &fcalib) != 1) {
-		syslog(LOG_ERR, "could not read calib file name from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-a flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'f':
-	  if (optarg)
-            {
-	      syslog(LOG_INFO, "reading flag ants file %s",optarg);
-	      if (sscanf (optarg, "%s", &fflagants) != 1) {
-		syslog(LOG_ERR, "could not read flagants file name from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-f flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 's':
-	  if (optarg)
-            {
-	      sfreq = atof(optarg);
-	      syslog(LOG_INFO, "start freq %g",sfreq);
- 	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-s flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'd':
-	  //DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'b':
-	  bf=1;
-	  syslog (LOG_NOTICE, "Running beamformer, NOT correlator");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // Bind to cpu core
-  if (core >= 0)
-    {
-      if (dada_bind_thread_to_core(core) < 0)
-	syslog(LOG_ERR,"failed to bind to core %d", core);
-      syslog(LOG_NOTICE,"bound to core %d", core);
-    }
-
-  // allocate device memory
-  dmem d;
-  initialize_device_memory(&d,bf);
-
-  // set up for beamformer
-  FILE *ff;
-  int iii;
-  if (bf) {
-
-    if (!(ff=fopen(fflagants,"r"))) {
-      syslog(LOG_ERR,"could not open flagants file\n");
-      exit(1);
-    }
-    d.nflags=0;
-    while (!feof(ff)) {
-      fscanf(ff,"%d\n",&d.flagants[iii]);
-      d.nflags++;
-    }
-    fclose(ff);
-
-    if (!(ff=fopen(fcalib,"rb"))) {
-      syslog(LOG_ERR,"could not open calibss file\n");
-      exit(1);
-    }
-    fread(d.h_winp,NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2,4,ff);
-    fclose(ff);
-
-    for (iii=0;iii<(NCHAN_PER_PACKET/8);iii++)
-      d.h_freqs[iii] = 1e6*(sfreq-iii*250./1024.);
-    dsaXmemcpyHostToDevice(d.d_freqs, d.h_freqs, sizeof(float)*(NCHAN_PER_PACKET/8));
-    
-    // calculate weights
-    calc_weights(&d);
-    
-  }
-
-  // test mode
-  FILE *fin, *fout;
-  uint64_t output_size;
-  char * output_data;//, * o1;
-  if (test) {
-
-    // read one block of input data    
-    d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
-    for (int i=0;i<512;i++) {
-      fin = fopen(ftest,"rb");
-      fread(d.h_input+i*4*NANTS*NCHAN_PER_PACKET*2*2,4*NANTS*NCHAN_PER_PACKET*2*2,1,fin);
-      fclose(fin);
-    }
-
-    // run correlator or beamformer, and output data
-    if (bf==0) {
-      if (DEBUG) syslog(LOG_INFO,"run correlator");
-      dcorrelator(&d);
-      if (DEBUG) syslog(LOG_INFO,"copy to host");
-      output_size = NBASE*NCHAN_PER_PACKET*2*2*4;
-      output_data = (char *)malloc(output_size);
-      dsaXmemcpyDeviceToHost(output_data, d.d_output, output_size);
-
-      fout = fopen("output.dat","wb");
-      fwrite((float *)output_data,sizeof(float),NBASE*NCHAN_PER_PACKET*2*2,fout);
-      fclose(fout);
-    }
-    else {
-      if (DEBUG) syslog(LOG_INFO,"run beamformer");
-      dbeamformer(&d);
-      if (DEBUG) syslog(LOG_INFO,"copy to host");
-      output_size = (NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS;
-      output_data = (char *)malloc(output_size);
-      dsaXmemcpyDeviceToHost(output_data, d.d_bigpower, output_size);
-
-      /*output_size = 2*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8);
-      o1 = (char *)malloc(output_size);
-      cudaMemcpy(o1,d.weights_r,output_size,cudaMemcpyDeviceToHost);*/
-	
-      
-
-      fout = fopen("output.dat","wb");
-      fwrite((unsigned char *)output_data,sizeof(unsigned char),output_size,fout);
-      //fwrite(o1,1,output_size,fout);
-      fclose(fout);
-    }
-
-	
-    // free
-    free(d.h_input);
-    free(output_data);
-    //free(o1);
-    deallocate_device_memory(&d,bf);
-
-    exit(1);
-  }
-  
-
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  
-  hdu_out  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");  
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
-  if (bf==0) 
-    syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,NBASE*NCHAN_PER_PACKET*2*2*4);
-  else
-    syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS);
-  uint64_t  bytes_read = 0;
-  //char * block;
-  char * output_buffer;
-  output_buffer = (char *)malloc(block_out);
-  uint64_t written, block_id;
-  
-  // get things started
-  bool observation_complete=0;
-  //bool started = 0;
-  syslog(LOG_INFO, "starting observation");
-  int blocks = 0;
-  //clock_t begin, end;
-  //double time_spent;
-  
-  while (!observation_complete) {
-
-    if (DEBUG) syslog(LOG_INFO,"reading block");    
-    
-    // open block
-    d.h_input = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    // do stuff
-    //begin = clock();
-    // loop
-    if (bf==0) {
-      if (DEBUG) syslog(LOG_INFO,"run correlator");
-      dcorrelator(&d);
-      if (DEBUG) syslog(LOG_INFO,"copy to host");
-      dsaXmemcpyDeviceToHost(output_buffer, d.d_output, block_out);
-    }
-    else {
-      if (DEBUG) syslog(LOG_INFO,"run beamformer");
-      dbeamformer(&d);
-      if (DEBUG) syslog(LOG_INFO,"copy to host");
-      dsaMXmemcpyDeviceToHost(output_buffer, d.d_bigpower, block_out);
-    }
-    //end = clock();
-    //time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
-    cout << "spent time " << d.cp << " " << d.prep << " " << d.cubl << " " << d.outp << " s" << endl;
-    
-    // write to output
-
-    // write to host
-    written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out);
-    if (written < block_out)
-      {
-	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
-	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-	return EXIT_FAILURE;
-      }
-    
-    if (DEBUG) syslog(LOG_INFO, "written block %d",blocks);	    
-    blocks++;
-    // loop end
-    
-      
-    // finish up
-    if (bytes_read < block_size)
-      observation_complete = 1;
-    
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-    
-  }
-
-  // finish up
-  free(output_buffer);
-  deallocate_device_memory(&d,bf);
-  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-  
-}
-#endif
-
diff --git a/src/dsaX_blas_interface.cu b/src/dsaX_blas_interface.cu
index 430ba9e..7e49fcb 100644
--- a/src/dsaX_blas_interface.cu
+++ b/src/dsaX_blas_interface.cu
@@ -3,7 +3,7 @@
 
 void dsaXHgemmStridedBatched(void *real_in, void *imag_in, void *real_out, void *imag_out, dsaXBLASParam param) {
 #ifdef DSA_XENGINE_TARGET_CUDA
-  dsaXHgemmStridedBatchedCuda(real_in, imag_in, real_out, imag_out, param);
+  dsaXHgemmStridedBatchedCuda((half*)real_in, (half*)imag_in, (half*)real_out, (half*)imag_out, param);
 #else
   std::cout "Not implemented" << std::endl;
   exit(0);
diff --git a/src/dsaX_correlator.cpp b/src/dsaX_correlator.cpp
new file mode 100644
index 0000000..d2223f5
--- /dev/null
+++ b/src/dsaX_correlator.cpp
@@ -0,0 +1,59 @@
+// -*- c++ -*-
+/* assumes input and output block size is appropriate - will seg fault otherwise*/
+/*
+Workflow is similar for BF and corr applications
+ - copy data to GPU, convert to half-precision and calibrate while reordering
+ - do matrix operations to populate large output vector
+ */
+
+#include <iostream>
+
+#include "dsaX_def.h"
+#include "dsaX.h"
+#include "dsaX_blas_interface.h"
+#include "dsaX_utils.h"
+#include "dsaX_psrdada_utils.h"
+#include "dsaX_cuda_interface.h"
+
+// Correlator: processes the one input block referenced by d->h_input.
+// Workflow: copy to device, reorder input, stridedBatchedGemm, reorder output.
+// DMH: direct CUDA references excised; device work is done via the interface calls below.
+void dcorrelator(dmem *d) {
+
+  // copy one input block from host (h_input) to device (d_input)
+  dsaXmemcpyHostToDevice(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
+  
+  // zero the half-precision GEMM outputs (d_outr, d_outi) and the float output (d_output)
+  dsaXmemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half));
+  dsaXmemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half));
+  dsaXmemset(d->d_output, 0, NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float));
+  
+  // reorder input: d_input goes through scratch d_tx into half arrays d_r (real) and d_i (imag)
+  reorder_input_device(d->d_input, d->d_tx, d->d_r, d->d_i);
+
+  dsaXBLASParam blas_param;
+  // gemm settings: each batch is an NANTS x NANTS product with inner dimension k
+  // input: [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS]
+  // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS] 
+  blas_param.trans_a = DSA_BLAS_OP_N;
+  blas_param.trans_b = DSA_BLAS_OP_T;
+  blas_param.m = NANTS;
+  blas_param.n = NANTS;
+  blas_param.k = NPACKETS_PER_BLOCK/halfFac;
+  blas_param.alpha = 1.0;
+  blas_param.lda = blas_param.m;
+  blas_param.ldb = blas_param.n;
+  blas_param.beta = 0.;
+  blas_param.ldc = blas_param.m;
+  blas_param.a_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;
+  blas_param.b_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;
+  blas_param.c_stride = NANTS*NANTS;
+  blas_param.batch_count = NCHAN_PER_PACKET*2*2*halfFac;
+
+  // Perform GEMM according to back end configuration
+  dsaXHgemmStridedBatched(d->d_r, d->d_i, d->d_outr, d->d_outi, blas_param);
+  
+  // reorder output data (reads and writes buffers held in d)
+  reorder_output_device(d);
+
+}
diff --git a/src/dsaX_cublas_interface.cu b/src/dsaX_cublas_interface.cu
index 4631516..df6b3de 100644
--- a/src/dsaX_cublas_interface.cu
+++ b/src/dsaX_cublas_interface.cu
@@ -1,6 +1,9 @@
+#include <iostream>
 #include "dsaX_cublas_interface.h"
 
-void dsaXHgemmStridedBatchedCuda(void *real_in, void *imag_in, void *real_out, void *imag_out, dsaXBLASParam param) {
+using namespace std;
+
+void dsaXHgemmStridedBatchedCuda(half *real_in, half *imag_in, half *real_out, half *imag_out, dsaXBLASParam blas_param) {
 #ifdef DSA_XENGINE_TARGET_CUDA
   
   // not sure if essential
@@ -24,7 +27,7 @@ void dsaXHgemmStridedBatchedCuda(void *real_in, void *imag_in, void *real_out, v
   case DSA_BLAS_OP_C:
     transa = CUBLAS_OP_C; break;
   default:
-    std::cout << "Unknown cublas transpose" << std::end;
+    std::cout << "Unknown cublas transpose" << std::endl;
   }
 
   switch (blas_param.trans_b) {
@@ -35,14 +38,14 @@ void dsaXHgemmStridedBatchedCuda(void *real_in, void *imag_in, void *real_out, v
   case DSA_BLAS_OP_C:
     transb = CUBLAS_OP_C; break;
   default:
-    std::cout << "Unknown cublas transpose" << std::end;
+    std::cout << "Unknown cublas transpose" << std::endl;
   }
   
   const int m = blas_param.m;
   const int n = blas_param.n;
   const int k = blas_param.k;
   const half alpha = blas_param.alpha.real();
-  const half malpha = -1.0 * alpha;
+  const half malpha = -1.0 * blas_param.alpha.real();
   const int lda = blas_param.lda;
   const int ldb = blas_param.ldb;
   const half beta0 = blas_param.beta.real();
@@ -56,27 +59,27 @@ void dsaXHgemmStridedBatchedCuda(void *real_in, void *imag_in, void *real_out, v
   // run strided batched gemm for datatype (a + ib)(c + id)
   // ac
   cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
-			    &alpha,d->d_r,lda,strideA,
-			    d->d_r,ldb,strideB,&beta0,
-			    d->d_outr,ldc,strideC,
+			    &alpha,real_in,lda,strideA,
+			    real_in,ldb,strideB,&beta0,
+			    real_out,ldc,strideC,
 			    batchCount);
   // bd
   cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
-			    &alpha,d->d_i,lda,strideA,
-			    d->d_i,ldb,strideB,&beta1,
-			    d->d_outr,ldc,strideC,
+			    &alpha,imag_in,lda,strideA,
+			    imag_in,ldb,strideB,&beta1,
+			    real_out,ldc,strideC,
 			    batchCount);
   // -bc
   cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
-			    &malpha,d->d_i,lda,strideA,
-			    d->d_r,ldb,strideB,&beta0,
-			    d->d_outi,ldc,strideC,
+			    &malpha,imag_in,lda,strideA,
+			    real_in,ldb,strideB,&beta0,
+			    imag_out,ldc,strideC,
 			    batchCount);
   // ad
   cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
-			    &alpha,d->d_r,lda,strideA,
-			    d->d_i,ldb,strideB,&beta1,
-			    d->d_outi,ldc,strideC,
+			    &alpha,real_in,lda,strideA,
+			    imag_in,ldb,strideB,&beta1,
+			    imag_out,ldc,strideC,
 			    batchCount);
 
   // shown to be essential
diff --git a/src/dsaX_cuda_interface.cu b/src/dsaX_cuda_interface.cu
index 31e44d0..d1f77a4 100644
--- a/src/dsaX_cuda_interface.cu
+++ b/src/dsaX_cuda_interface.cu
@@ -1,7 +1,7 @@
 #include "dsaX_cuda_interface.h"
 
 // allocate device memory
-void initialize_device_memory(dmem * d, int bf) {
+void initialize_device_memory(dmem *d, int bf) {
   
   // for correlator
   if (bf==0) {
@@ -45,9 +45,8 @@ void initialize_device_memory(dmem * d, int bf) {
     
   }  
 }
-
 // deallocate device memory
-void deallocate_device_memory(dmem * d, int bf) {
+void deallocate_device_memory(dmem *d, int bf) {
   
   cudaFree(d->d_input);
 
@@ -149,25 +148,49 @@ void reorder_output_device(dmem * d) {
   fout=fopen("test3.test","wb");
   fwrite(odata,sizeof(char),384*4*NBASE*4,fout);
   fclose(fout);*/
-
   
   cudaFree(d_idxs);
   free(h_idxs);
   //cudaStreamDestroy(stream);  
-
 }
 
 // kernel to fluff input
 // run with 128 threads and NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 blocks
 __global__ void corr_input_copy(char *input, half *inr, half *ini) {
 
-  int bidx = blockIdx.x; // assume NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128
-  int tidx = threadIdx.x; // assume 128
+  int bidx = blockIdx.x;  // assume NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128
+  int tidx = threadIdx.x; // assume 128 threads per block
   int iidx = bidx*128+tidx;
-  
-  inr[iidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(15)) << 4) >> 4));
-  ini[iidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4));
 
+  // Each input byte packs one 4-bit complex sample as iiiirrrr.
+  // Real part: mask with 15 (00001111) to isolate the low nibble
+  //   0000rrrr
+  // shift it left by 4 so its top bit lands in the byte's sign position
+  //   rrrr0000
+  // reinterpret as signed char (explicitly signed: plain char is
+  // unsigned on some targets, e.g. aarch64), then shift right by 4.
+  // The right shift sign-extends a negative value, giving
+  //   ssssrrrr  (two's complement, range -8..7)
+  // Finally convert through float to half with the CUDA intrinsic.
+  inr[iidx] = __float2half((float)((signed char)((   (unsigned char)(input[iidx]) & (unsigned char)(15)  ) << 4) >> 4));
+
+  // Imag part: mask with 240 (11110000) to isolate the high nibble
+  //   iiii0000
+  // which already sits in the sign position, so no left shift is needed.
+  // Reinterpret as signed char and shift right by 4 (sign-extending)
+  //   ssssiiii  (two's complement, range -8..7)
+  // then convert through float to half with the CUDA intrinsic.
+  // As above, the cast must be to signed char rather than plain char,
+  // otherwise negative samples decode as 8..15 where char is unsigned.
+  ini[iidx] = __float2half((float)((signed char)((   (unsigned char)(input[iidx]) & (unsigned char)(240)  )) >> 4));
+
+  // Disabled debug probe: both results should be half (FP16) integers between -8 and 7.
+  //half re = inr[iidx];
+  //half im = ini[iidx];
+  //half lim = 2.;
+  //if( (re > lim || re < -lim) || (im > lim || im < -lim)) {
+  //printf("re = %f, im = %f\n", __half2float(re), __half2float(im));
+  //}
 }
 
 // transpose kernel
@@ -206,8 +229,8 @@ void reorder_input_device(char *input, char * tx, half *inr, half *ini) {
 
   // transpose input data
   dim3 dimBlock(32, 8), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32);
-  transpose_matrix<<<dimGrid,dimBlock>>>(input,tx);
-  corr_input_copy<<<NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128,128>>>(tx,inr,ini);
+  transpose_matrix<<<dimGrid,dimBlock>>>(input, tx);
+  corr_input_copy<<<NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128, 128>>>(tx, inr, ini);
 }
 
 // kernel to help with reordering output
@@ -227,7 +250,8 @@ __global__ void corr_output_copy(half *outr, half *outi, float *output, int *ind
   int pol = (int)(chpol % 2);
 
   float v1=0., v2=0.;
-  
+
+  // Use CUDA casting intrinsic __half2float
   for (int i=0;i<halfFac;i++) {
     v1 += __half2float(outr[(4*iidx+pol)*halfFac+i])+__half2float(outr[(4*iidx+2+pol)*halfFac+i]);
     v2 += __half2float(outi[(4*iidx+pol)*halfFac+i])+__half2float(outi[(4*iidx+2+pol)*halfFac+i]);
@@ -240,11 +264,12 @@ __global__ void corr_output_copy(half *outr, half *outi, float *output, int *ind
 
 // kernels to reorder and fluff input data for beamformer
 // initial data is [NPACKETS_PER_BLOCK, (NANTS/2), NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]            
-// want [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, (NANTS/2), 8chan, 2 times, 2 pol, 4-bit complex]      // run as 16x16 tiled transpose with 32-byte words 
+// want [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, (NANTS/2), 8chan, 2 times, 2 pol, 4-bit complex]
+// run as 16x16 tiled transpose with 32-byte words 
 // launch with dim3 dimBlock(16, 8) and dim3 dimGrid(Width/16, Height/16)
 // here, width=NCHAN_PER_PACKET/8 is the dimension of the fastest input index
 // dim3 dimBlock1(16, 8), dimGrid1(NCHAN_PER_PACKET/8/16, (NPACKETS_PER_BLOCK)*(NANTS/2)/16);
-__global__ void transpose_input_bf(double * idata, double * odata) {
+__global__ void transpose_input_bf(double *idata, double *odata) {
 
   __shared__ double tile[16][17][4];
   
@@ -331,7 +356,7 @@ __global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, floa
 // sequential pairs of eastings and northings
 // then [NANTS, 48, R/I] calibs
 
-void calc_weights(dmem * d) {
+void calc_weights(dmem *d) {
 
   // allocate
   float *antpos_e = (float *)malloc(sizeof(float)*NANTS);
@@ -402,6 +427,15 @@ __global__ void fluff_input_bf(char * input, half * dr, half * di) {
 
   dr[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(15)) << 4) >> 4)));
   di[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(240))) >> 4)));
+
+  // Both results should be half (FP16) multiples of 1/64 between -0.125 and 0.109375 (4-bit value * 0.015625).
+  //half re = dr[idx];
+  //half im = di[idx];
+  //half lim = 0;
+  //if( (re > lim || re < -lim) || (im > lim || im < -lim)) {
+  //printf("re = %f, im = %f\n", __half2float(re), __half2float(im));
+  //}
+
   
 }
 
diff --git a/src/dsaX_utils.cpp b/src/dsaX_utils.cpp
index 46abfc9..fc0345a 100644
--- a/src/dsaX_utils.cpp
+++ b/src/dsaX_utils.cpp
@@ -28,3 +28,12 @@ void dsaXmemcpyDeviceToHost(void *array_host, void *array_device, size_t n){
   memcpy(array_host, array_device, n);
 #endif
 }
+
+void dsaXmemcpyDeviceToDevice(void *array_copy_to, void *array_copy_from, size_t n){
+#ifdef DSA_XENGINE_TARGET_CUDA
+  // Copy n bytes from array_copy_from to array_copy_to (both device pointers); the cudaMemcpy status is not checked
+  cudaMemcpy(array_copy_to, array_copy_from, n, cudaMemcpyDeviceToDevice);
+#else
+  memcpy(array_copy_to, array_copy_from, n);
+#endif
+}
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 9d29854..4a45a24 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,5 +1,5 @@
-
-#include_directories(../include)
+#DMH: fix include path
+include_directories(../include)
 include_directories(${CLI11_SOURCE_DIR}/include/CLI)
-add_executable(dsaX_beamformer_correlator_test dsaX_beamformer_correlator_test.cpp)
+add_executable(dsaX_correlator_test dsaX_correlator_test.cpp)
 
diff --git a/tests/dsaX_beamformer_correlator_test.cpp b/tests/dsaX_beamformer_correlator_test.cpp
deleted file mode 100644
index 3e723d0..0000000
--- a/tests/dsaX_beamformer_correlator_test.cpp
+++ /dev/null
@@ -1,399 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <time.h>
-#include <math.h>
-#include <string.h>
-
-// Include the dsaX.h header in your application
-//#include <dsaX.h>
-
-int main(int argc, char **argv) {
-
-  /*
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_bfCorr", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  // DADA Header plus Data Unit 
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-
-  // data block HDU keys
-  key_t in_key = REORDER_BLOCK_KEY;
-  key_t out_key = XGPU_BLOCK_KEY;
-  
-  // command line arguments
-  int core = -1;
-  int arg = 0;
-  int bf = 0;
-  int test = 0;
-  char ftest[200], fflagants[200], fcalib[200];
-  float sfreq = 1498.75;
-  
-  while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:bdh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 't':
-	  if (optarg)
-            {
-	      test = 1;
-	      syslog(LOG_INFO, "test mode");
-	      if (sscanf (optarg, "%s", &ftest) != 1) {
-		syslog(LOG_ERR, "could not read test file name from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-t flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'a':
-	  if (optarg)
-            {
-	      syslog(LOG_INFO, "read calib file %s",optarg);
-	      if (sscanf (optarg, "%s", &fcalib) != 1) {
-		syslog(LOG_ERR, "could not read calib file name from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-a flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'f':
-	  if (optarg)
-            {
-	      syslog(LOG_INFO, "reading flag ants file %s",optarg);
-	      if (sscanf (optarg, "%s", &fflagants) != 1) {
-		syslog(LOG_ERR, "could not read flagants file name from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-f flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 's':
-	  if (optarg)
-            {
-	      sfreq = atof(optarg);
-	      syslog(LOG_INFO, "start freq %g",sfreq);
- 	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-s flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'd':
-	  //DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'b':
-	  bf=1;
-	  syslog (LOG_NOTICE, "Running beamformer, NOT correlator");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // Bind to cpu core
-  if (core >= 0) {
-    if (dada_bind_thread_to_core(core) < 0)
-      syslog(LOG_ERR,"failed to bind to core %d", core);
-    syslog(LOG_NOTICE,"bound to core %d", core);
-  }
-
-  
-  // allocate device memory
-  dmem d;
-  initialize_device_memory(&d,bf);
-
-  // set up for beamformer
-  FILE *ff;
-  int iii;
-  if (bf) {
-
-    if (!(ff=fopen(fflagants,"r"))) {
-      syslog(LOG_ERR,"could not open flagants file\n");
-      exit(1);
-    }
-    d.nflags=0;
-    while (!feof(ff)) {
-      fscanf(ff,"%d\n",&d.flagants[iii]);
-      d.nflags++;
-    }
-    fclose(ff);
-
-    if (!(ff=fopen(fcalib,"rb"))) {
-      syslog(LOG_ERR,"could not open calibss file\n");
-      exit(1);
-    }
-    fread(d.h_winp,NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2,4,ff);
-    fclose(ff);
-
-    for (iii=0;iii<(NCHAN_PER_PACKET/8);iii++)
-      d.h_freqs[iii] = 1e6*(sfreq-iii*250./1024.);
-    cudaMemcpy(d.d_freqs,d.h_freqs,sizeof(float)*(NCHAN_PER_PACKET/8),cudaMemcpyHostToDevice);
-
-    // calculate weights
-    calc_weights(&d);
-    
-  }
-
-  // test mode
-  FILE *fin, *fout;
-  uint64_t output_size;
-  char * output_data;//, * o1;
-  if (test) {
-
-    // read one block of input data    
-    d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
-    for (int i=0;i<512;i++) {
-      fin = fopen(ftest,"rb");
-      fread(d.h_input+i*4*NANTS*NCHAN_PER_PACKET*2*2,4*NANTS*NCHAN_PER_PACKET*2*2,1,fin);
-      fclose(fin);
-    }
-
-    // run correlator or beamformer, and output data
-    if (bf==0) {
-      if (DEBUG) syslog(LOG_INFO,"run correlator");
-      dcorrelator(&d);
-      if (DEBUG) syslog(LOG_INFO,"copy to host");
-      output_size = NBASE*NCHAN_PER_PACKET*2*2*4;
-      output_data = (char *)malloc(output_size);
-      cudaMemcpy(output_data,d.d_output,output_size,cudaMemcpyDeviceToHost);
-
-      fout = fopen("output.dat","wb");
-      fwrite((float *)output_data,sizeof(float),NBASE*NCHAN_PER_PACKET*2*2,fout);
-      fclose(fout);
-    }
-    else {
-      if (DEBUG) syslog(LOG_INFO,"run beamformer");
-      dbeamformer(&d);
-      if (DEBUG) syslog(LOG_INFO,"copy to host");
-      output_size = (NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS;
-      output_data = (char *)malloc(output_size);
-      cudaMemcpy(output_data,d.d_bigpower,output_size,cudaMemcpyDeviceToHost);
-
-      // output_size = 2*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8);
-      // o1 = (char *)malloc(output_size);
-      // cudaMemcpy(o1,d.weights_r,output_size,cudaMemcpyDeviceToHost);
-	
-      
-
-      fout = fopen("output.dat","wb");
-      fwrite((unsigned char *)output_data,sizeof(unsigned char),output_size,fout);
-      //fwrite(o1,1,output_size,fout);
-      fclose(fout);
-    }
-
-	
-    // free
-    free(d.h_input);
-    free(output_data);
-    //free(o1);
-    deallocate_device_memory(&d,bf);
-
-    exit(1);
-  }
-  
-
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  
-  hdu_out  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");  
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
-  if (bf==0) 
-    syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,NBASE*NCHAN_PER_PACKET*2*2*4);
-  else
-    syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS);
-  uint64_t  bytes_read = 0;
-  //char * block;
-  char * output_buffer;
-  output_buffer = (char *)malloc(block_out);
-  uint64_t written, block_id;
-  
-  // get things started
-  bool observation_complete=0;
-  //bool started = 0;
-  syslog(LOG_INFO, "starting observation");
-  int blocks = 0;
-  //clock_t begin, end;
-  //double time_spent;
-  
-  while (!observation_complete) {
-
-    if (DEBUG) syslog(LOG_INFO,"reading block");    
-    
-    // open block
-    d.h_input = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    // do stuff
-    //begin = clock();
-    // loop
-    if (bf==0) {
-      if (DEBUG) syslog(LOG_INFO,"run correlator");
-      dcorrelator(&d);
-      if (DEBUG) syslog(LOG_INFO,"copy to host");
-      cudaMemcpy(output_buffer,d.d_output,block_out,cudaMemcpyDeviceToHost);
-    }
-    else {
-      if (DEBUG) syslog(LOG_INFO,"run beamformer");
-      dbeamformer(&d);
-      if (DEBUG) syslog(LOG_INFO,"copy to host");
-      cudaMemcpy(output_buffer,d.d_bigpower,block_out,cudaMemcpyDeviceToHost);
-    }
-    //end = clock();
-    //time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
-    cout << "spent time " << d.cp << " " << d.prep << " " << d.cubl << " " << d.outp << " s" << endl;
-    
-    // write to output
-
-    // write to host
-    written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out);
-    if (written < block_out)
-      {
-	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
-	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-	return EXIT_FAILURE;
-      }
-    
-    if (DEBUG) syslog(LOG_INFO, "written block %d",blocks);	    
-    blocks++;
-    // loop end
-    
-      
-    // finish up
-    if (bytes_read < block_size)
-      observation_complete = 1;
-    
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-    
-  }
-
-  // finish up
-  free(output_buffer);
-  deallocate_device_memory(&d,bf);
-  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-  
-  return 0;
-  */
-}
diff --git a/tests/dsaX_correlator_test.cpp b/tests/dsaX_correlator_test.cpp
new file mode 100644
index 0000000..b0560fc
--- /dev/null
+++ b/tests/dsaX_correlator_test.cpp
@@ -0,0 +1,195 @@
+#include <unistd.h> //DMH: replace with CLI
+#include <iostream>
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <math.h>
+#include <string.h>
+#include <syslog.h>
+
+// Include the dsaX_interface.h header in your application
+#include <dsaX_interface.h>
+
+using namespace std;
+
+void usage() { // Print the supported command line options to stdout.
+  fprintf (stdout,
+	   "dsaX_beamformer_correlator [options]\n"
+	   " -c if dsaX is CUDA enabled, use this GPU\n"
+	   " -d send debug messages to syslog\n"
+	   " -i in_key [default REORDER_BLOCK_KEY]\n"
+	   " -o out_key [default XGPU_BLOCK_KEY]\n"
+	   " -h print usage\n"
+	   " -t binary file for test mode\n"
+	   " -f flagants file\n"
+	   " -a calib file\n"
+	   " -s start frequency (assumes -0.244140625MHz BW)\n");
+}
+
+int main(int argc, char **argv) {
+
+  // data block HDU keys
+  key_t in_key = REORDER_BLOCK_KEY;
+  key_t out_key = XGPU_BLOCK_KEY;
+  
+  // command line arguments
+  int device_ordinal = 0;
+  int arg = 0;
+  int bf = 0;
+  char ftest[200], fflagants[200], fcalib[200];
+  float sfreq = 1498.75;
+  
+  while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:bdh")) != -1) {
+    switch (arg) {
+    case 'c':
+      if (optarg) {
+	device_ordinal = atoi(optarg);
+	break;
+      }
+      else {
+	syslog(LOG_ERR,"-c flag requires argument");
+	usage();
+	return EXIT_FAILURE;
+      }
+    case 'i':
+      if (optarg) {
+	if (sscanf (optarg, "%x", &in_key) != 1) {
+	  syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+	  return EXIT_FAILURE;
+	}
+	break;
+      } else {
+	syslog(LOG_ERR,"-i flag requires argument");
+	usage();
+	return EXIT_FAILURE;
+      }
+    case 'o':
+      if (optarg) {
+	if (sscanf (optarg, "%x", &out_key) != 1) {
+	  syslog(LOG_ERR, "could not parse key from %s\n", optarg);
+	  return EXIT_FAILURE;
+	}
+	break;
+      } else {
+	syslog(LOG_ERR,"-o flag requires argument");
+	usage();
+	return EXIT_FAILURE;
+      }
+    case 't':
+      if (optarg) {
+	syslog(LOG_INFO, "test mode");
+	if (sscanf (optarg, "%s", &ftest) != 1) {
+	  syslog(LOG_ERR, "could not read test file name from %s\n", optarg);
+	  return EXIT_FAILURE;
+	}
+	break;
+      } else {
+	syslog(LOG_ERR,"-t flag requires argument");
+	usage();
+	return EXIT_FAILURE;
+      }
+    case 'a':
+      if (optarg) {
+	syslog(LOG_INFO, "read calib file %s",optarg);
+	if (sscanf (optarg, "%s", &fcalib) != 1) {
+	  syslog(LOG_ERR, "could not read calib file name from %s\n", optarg);
+	  return EXIT_FAILURE;
+	}
+	break;
+      }
+      else {
+	syslog(LOG_ERR,"-a flag requires argument");
+	usage();
+	return EXIT_FAILURE;
+      }
+    case 'f':
+      if (optarg) {
+	syslog(LOG_INFO, "reading flag ants file %s",optarg);
+	if (sscanf (optarg, "%s", &fflagants) != 1) {
+	  syslog(LOG_ERR, "could not read flagants file name from %s\n", optarg);
+	  return EXIT_FAILURE;
+	}
+	break;
+      } else
+	{
+	  syslog(LOG_ERR,"-f flag requires argument");
+	  usage();
+	  return EXIT_FAILURE;
+	}
+    case 's':
+      if (optarg) {
+	sfreq = atof(optarg);
+	syslog(LOG_INFO, "start freq %g",sfreq);
+	break;
+      }
+      else {
+	syslog(LOG_ERR,"-s flag requires argument");
+	usage();
+	return EXIT_FAILURE;
+      }
+    case 'd':
+      syslog (LOG_DEBUG, "Will excrete all debug messages");
+      break;
+    case 'h':
+      usage();
+      return EXIT_SUCCESS;
+    }
+  }
+  
+  std::cout << "NPACKETS_PER_BLOCK = " << NPACKETS_PER_BLOCK << std::endl;
+  std::cout << "NCHAN = " << NCHAN << std::endl;
+  std::cout << "NCHAN_PER_PACKET = " << NCHAN_PER_PACKET << std::endl;
+  std::cout << "NPOL = " << NPOL << std::endl;
+  std::cout << "NARM = " << 2 << std::endl;
+  unsigned long long size = sizeof(char);
+  size *= NPACKETS_PER_BLOCK;
+  size *= NANTS;
+  size *= NCHAN_PER_PACKET;
+  size *= NPOL;
+  size *= NCOMPLEX;
+  std::cout << "(bytes) char size * NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*NPOL*NCOMPLEX = " << size << std::endl;
+  std::cout << "Expected size of data array = " << (unsigned long long)(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*NPOL*NCOMPLEX) << std::endl;
+  std::cout << "Expected size of input array = " << (unsigned long long)(sizeof(char)*4*NANTS*NCHAN_PER_PACKET*NPOL*NCOMPLEX) << std::endl;
+  
+#if 0
+  dsaX_init();
+  
+  // allocate device memory
+  dmem d;
+  initialize_device_memory(&d, bf);
+
+  FILE *fin, *fout;
+  uint64_t output_size;
+  char * output_data;
+
+  // read one block of input data    
+  d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
+  for (int i=0;i<512;i++) {
+    fin = fopen(ftest,"rb");
+    fread(d.h_input+i*4*NANTS*NCHAN_PER_PACKET*2*2,4*NANTS*NCHAN_PER_PACKET*2*2,1,fin);
+    fclose(fin);
+  }
+  
+  // run correlator or beamformer, and output data
+  syslog(LOG_INFO,"run correlator");
+  dcorrelator(&d);
+  syslog(LOG_INFO,"copy to host");
+  output_size = NBASE*NCHAN_PER_PACKET*2*2*4;
+  output_data = (char *)malloc(output_size);
+  cudaMemcpy(output_data,d.d_output,output_size,cudaMemcpyDeviceToHost);
+  
+  fout = fopen("output.dat","wb");
+  fwrite((float *)output_data,sizeof(float),NBASE*NCHAN_PER_PACKET*2*2,fout);
+  fclose(fout);
+  
+  // free
+  free(d.h_input);
+  free(output_data);
+  //free(o1);
+  deallocate_device_memory(&d,bf);
+  dsaX_end();
+  
+  return 0;
+#endif
+}
+
diff --git a/utils/gen_packet.py b/utils/gen_packet.py
new file mode 100644
index 0000000..7ae8ab4
--- /dev/null
+++ b/utils/gen_packet.py
@@ -0,0 +1,228 @@
+import numpy as np, struct
+import matplotlib.pyplot as plt
+
+
+''' The aim here is to make two types of data packets: 
+ - one with a tone at a particular frequency and set of antennas
+ - one with pure noise 
+
+Structure is 3 ant, 384 chan, 2 time, 2 pol, r/i
+4608 bytes long
+
+'''
+
+
+def make_spectrum(packet,ant=0,pol=0):
+
+    spec = np.zeros(384*2)
+    
+    d = np.asarray(struct.unpack('>4608B',packet))
+
+    # order is 3 antennas x 384 channels x 2 times x 2 pols x real/imag, with every 8 flipped
+    d = (d.reshape((3,384,2,2)))[ant,:,:,pol].ravel()
+
+    d_r = ((d & 15) << 4)
+    d_i = d & 240
+    d_r = d_r.astype(np.int8)/16
+    d_i = d_i.astype(np.int8)/16     
+        
+    spec += d_r**2.+d_i**2.
+    spec = spec.reshape((384,2)).mean(axis=1)
+    return(spec)
+
+def plot_spectrum(data,ant=0,pol=0):
+
+    spec = make_spectrum(data,ant=ant,pol=pol)
+    plt.plot(spec)
+    plt.xlabel('Channel')
+    plt.ylabel('Power')
+    plt.show()
+
+def make_histogram(packet):
+    ''' Makes histogram of packet - tested 
+    '''
+    
+    histo = np.zeros(16)
+    rms = 0.
+                
+    d = np.asarray(struct.unpack('>4608B',packet))
+    
+    # order is 3 antennas x 384 channels x 2 times x 2 pols x real/imag, with every 8 flipped
+    d = (d.reshape((3,384,2,2))).ravel()
+    
+    d_r = ((d & 15) << 4)
+    d_i = d & 240
+    d_r = d_r.astype(np.int8)/16
+    d_i = d_i.astype(np.int8)/16        
+    
+    rms += 0.5*(np.std(d_r)**2.+np.std(d_i)**2.)
+
+    hx = np.arange(16)-8
+    
+    for i in range(384*2):
+        
+        histo[int(d_r[i])+8] += 1.
+        histo[int(d_i[i])+8] += 1.
+            
+    return(hx,histo/np.max(histo),np.sqrt(rms))
+
+def histo_test(data):
+
+    hx,histo,rms = make_histogram(data)
+    print('HISTOGRAM: ')
+    for i in range(16):
+        print(hx[i],histo[i])
+    print()
+    print('RMS = ',rms)
+    print()
+
+
+########## MAIN ############
+
+# defaults
+outfile = 'packet.out'
+n_packet = 4608 # 4608 for single packet
+
+# decide which sort of packet to make
+noise = False
+tone = True
+x16 = False
+
+# if tone
+if tone is True:
+
+    # defaults:
+    chans = np.arange(384)#np.asarray([10,100,190])
+    #ant = 1
+    amp_A = 9.0
+    amp_B = 4.
+
+    # derived quantities
+    amp_A = 16.*np.sqrt(amp_A)
+    amp_B = 16.*np.sqrt(amp_B)
+    ph = 2.*np.pi*np.random.uniform()
+    ramp_A = amp_A*np.cos(ph)
+    iamp_A = amp_A*np.sin(ph)
+    ph = 2.*np.pi*np.random.uniform()
+    ramp_B = amp_B*np.cos(ph)
+    iamp_B = amp_B*np.sin(ph)
+    
+    # make packet
+    real_part = np.zeros(n_packet,dtype='int8')
+    imag_part = np.zeros(n_packet,dtype='int8')
+    for ant in [0,1,2]: # 3 antennae
+        for i in chans: # 384 channels
+
+            # time 1 pol A
+            j = int(1536*ant + i*4)
+            real_part[j] = round(ramp_A)
+            imag_part[j] = round(iamp_A)
+            
+            # time 1 pol B
+            j = int(1536*ant + i*4 + 1)
+            real_part[j] = round(ramp_B)
+            imag_part[j] = round(iamp_B)
+            
+            # time 2 pol A
+            j = int(1536*ant + i*4 + 2)
+            real_part[j] = round(ramp_A)
+            imag_part[j] = round(iamp_A)
+
+            # time 2 pol B
+            j = int(1536*ant + i*4 + 3)
+            real_part[j] = round(ramp_B)
+            imag_part[j] = round(iamp_B)
+
+        
+    # make 4-bit versions (astype replaces np.cast, which NumPy 2.0 removed)
+    real_part = real_part.astype('uint8')
+    imag_part = imag_part.astype('uint8')
+    for i in range(n_packet):
+        real_part[i]  = real_part[i] >> 4
+        imag_part[i]  = (imag_part[i] >> 4) << 4
+
+    # finish packet
+    packet = np.zeros(n_packet,dtype='uint8')
+    for i in range(n_packet):
+        packet[i] = real_part[i] | imag_part[i]
+
+    # if x16
+    if (x16):
+
+        p2 = np.zeros(21*n_packet,dtype='uint8')
+        for i in range(21):
+            p2[i*n_packet:(i+1)*n_packet] = packet
+    
+        out_str = p2.tobytes()
+
+    else:
+
+        out_str = packet.tobytes()
+    
+# if noise
+if noise is True:
+
+    # defaults
+    rms = 1.5 # 4-bit
+    erms = rms*16.
+
+    # make real and imag parts
+    real_part = np.zeros(n_packet,dtype='int8')
+    imag_part = np.zeros(n_packet,dtype='int8')
+
+    for ant in [0, 1, 2]:
+        for i in np.arange(384):
+
+            # time 1 pol A
+            j = int(1536*ant + i*4)
+            real_part[j] = round(np.random.normal()*erms)
+            imag_part[j] = round(np.random.normal()*erms)
+            
+            # time 1 pol B
+            j = int(1536*ant + i*4 + 1)
+            real_part[j] = round(np.random.normal()*erms)
+            imag_part[j] = round(np.random.normal()*erms)
+            
+            # time 2 pol A
+            j = int(1536*ant + i*4 + 2)
+            real_part[j] = round(np.random.normal()*erms)
+            imag_part[j] = round(np.random.normal()*erms)
+
+            # time 2 pol B
+            j = int(1536*ant + i*4 + 3)
+            real_part[j] = round(np.random.normal()*erms)
+            imag_part[j] = round(np.random.normal()*erms)
+
+    # make 4-bit versions (astype replaces np.cast, which NumPy 2.0 removed)
+    real_part = real_part.astype('uint8')
+    imag_part = imag_part.astype('uint8')
+    for i in range(n_packet):
+        real_part[i]  = real_part[i] >> 4
+        imag_part[i]  = (imag_part[i] >> 4) << 4
+
+    # finish packet
+    packet = np.zeros(n_packet,dtype='uint8')
+    for i in range(n_packet):
+        packet[i] = real_part[i] | imag_part[i]
+
+    out_str = packet.tobytes()
+
+
+newFile = open(outfile, "wb")
+newFile.write(out_str)
+newFile.close()
+
+    
+#plot_spectrum(out_str,pol=1,ant=1)
+
+
+    
+
+
+
+    
+        
+    
+    
+        
+    
diff --git a/utils/gen_testblock.py b/utils/gen_testblock.py
new file mode 100644
index 0000000..b9a3c9e
--- /dev/null
+++ b/utils/gen_testblock.py
@@ -0,0 +1,49 @@
+import numpy as np, struct
+import matplotlib.pyplot as plt
+import os
+
+
+''' The aim here is to make data blocks to test the bfCorr code. 
+
+Structure of a packet is 3 ant, 384 chan, 2 time, 2 pol, r/i
+4608 bytes long
+
+Structure of a block is [2048 packets, 32 channel groups, ...]
+
+We want the real and imaginary parts to be random integers over 
+the range of [-8, 7]
+'''
+
+# defaults
+outfile = 'block.out'
+if os.path.exists(outfile):
+    os.remove(outfile)
+    
+
+num_packets = 4
+n_antennae = 3
+n_chans = 384
+n_changs = 32
+
+# make values in the range vals = [-8, 7]
+# [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, NANTS/2, 8chan, 2 times, 2 pol, 4-bit complex]
+
+
+for ipacket in np.arange(num_packets):
+
+    print(ipacket)
+    for ichang in np.arange(n_changs):
+
+        packet = np.zeros(num_packets*n_changs, dtype='uint8')
+        for i in np.arange(n_antennae):
+            for j in np.arange(n_chans):
+                for k in np.arange(num_packets):
+
+                    # we now make a random integer in uint8 format
+                    idx = ichang + n_changs*ipacket
+                    packet[idx] = np.random.randint(0, 256)
+                    
+        out_str = packet.tobytes()        
+        newFile = open(outfile, "ab")
+        newFile.write(out_str)
+        newFile.close()
diff --git a/utils/get_rms.py b/utils/get_rms.py
new file mode 100644
index 0000000..8854a36
--- /dev/null
+++ b/utils/get_rms.py
@@ -0,0 +1,141 @@
+import numpy as np
+import sockets as s
+import struct
+import sys
+import matplotlib.pyplot as plt
+
+# for file writing
+
+def write_bin(data,fl='test.dat'):
+
+        f = open(fl,'w+b')
+        for packet in data:
+                d = bytearray(np.asarray(struct.unpack('>4616B',packet))[8:].astype(np.int8))
+                print(len(d))
+                f.write(d)
+
+        f.close()
+        
+
+# for making histogram of input
+
+def make_histogram(data,ant=0,pol=0):
+
+        histo = np.zeros(16)
+        rms = 0.
+        
+        for packet in data:
+                
+                d = np.asarray(struct.unpack('>4616B',packet))[8:]
+                
+                # order is 3 antennas x 384 channels x 2 times x 2 pols x real/imag, with every 8 flipped
+                d = (d.reshape((3,384,2,2)))[ant,:,:,pol].ravel()
+                
+                d_r = ((d & 15) << 4)
+                d_i = d & 240
+                d_r = d_r.astype(np.int8)/16
+                d_i = d_i.astype(np.int8)/16        
+
+                rms += 0.5*(np.std(d_r)**2.+np.std(d_i)**2.)
+                
+                for i in range(384*2):
+
+                        histo[int(d_r[i])+8] += 1.
+                        histo[int(d_i[i])+8] += 1.
+                                                                        
+        return histo/np.max(histo),np.sqrt(rms)
+        
+# for making spectrum from data
+def decode_data(data,ant=0,pol=0):
+
+    spec = np.zeros(384*2)
+    
+    for packet in data:
+
+        d = np.asarray(struct.unpack('>4616B',packet))[8:]
+
+        # order is 3 antennas x 384 channels x 2 times x 2 pols x real/imag, with every 8 flipped
+        d = (d.reshape((3,384,2,2)))[ant,:,:,pol].ravel()
+
+        d_r = ((d & 15) << 4)
+        d_i = d & 240
+        d_r = d_r.astype(np.int8)/16
+        d_i = d_i.astype(np.int8)/16     
+        
+        spec += d_r**2.+d_i**2.
+
+    spec = spec.reshape((384,2)).mean(axis=1)
+    return(spec)
+
+# for decoding packets
+def decode_header(data):
+
+    min_s = 10000
+    max_s = 0
+        
+    for packet in data:
+
+        d = np.asarray(struct.unpack('>4616B',packet))
+
+        # packet id
+        p = 0
+        p = p | ((d[4] & 224) >> 5)
+        p = p | (d[3] << 3)
+        p = p | (d[2] << 11)
+        p = p | (d[1] << 19)
+        p = p | (d[0] << 27)
+        
+        # spectrum id
+        sp = 0
+        sp = sp | ((d[4] & 31) << 8)
+        sp = sp | d[5]
+
+        if (sp<min_s):
+                min_s = sp
+        if (sp>max_s):
+                max_s = sp
+    
+        print(p,sp)
+
+    print(min_s,max_s)
+
+# MAIN
+
+n = 10000
+ip = '10.41.0.62'
+port=4011
+data = s.capture(ip=ip,port=port,n=n)
+ant=0
+pol=0
+
+#decode_header(data)
+
+histo,rms = make_histogram(data,ant=ant,pol=pol)
+print()
+print('RMS:',rms/np.sqrt(1.*n))
+for i in np.arange(16):
+    print(histo[i],'  ',)
+
+sys.exit()
+    
+spec = decode_data(data,ant=ant,pol=pol)
+spec = np.sqrt(spec/n/2.)
+print()
+print('Have spectral points',len(spec))
+print()
+#for i in np.arange(len(spec)):
+#    print(spec[i],'  ',)
+
+plt.plot(spec)
+plt.show()
+
+
+
+
+
+    
+
+
+    
+
+    
diff --git a/utils/get_rms_packet.py b/utils/get_rms_packet.py
new file mode 100644
index 0000000..f75d278
--- /dev/null
+++ b/utils/get_rms_packet.py
@@ -0,0 +1,36 @@
+import socket, numpy as np
+from progress.bar import Bar
+import sockets as s
+import struct
+import sys
+import matplotlib.pyplot as plt
+
+# ip as string, port as int, buf as int
+def capture(n=100,ip=None,port=None,buf=4616):
+        """Receive n UDP datagrams on (ip, port); return them as a list, or () if ip/port is missing."""
+        if ip is None:
+            print('No IP')
+            return()
+
+        if port is None:
+            print('No port')
+            return()
+
+        # The context manager closes the socket once the capture finishes or fails.
+        with socket.socket(socket.AF_INET,socket.SOCK_DGRAM) as sock:
+            sock.bind((ip,port))
+            captured=0
+            packs = []
+            bar = Bar('Capturing '+str(n)+' packets...', max=n)
+            while captured<n:
+
+                data, addr = sock.recvfrom(buf)
+                packs.append(data)
+                captured += 1
+                bar.next()
+
+            bar.finish()
+
+        return(packs)
+
+
diff --git a/utils/sockets.py b/utils/sockets.py
new file mode 100644
index 0000000..aaff3f7
--- /dev/null
+++ b/utils/sockets.py
@@ -0,0 +1,31 @@
+import socket, numpy as np
+
+# ip as string, port as int, buf as int
+def capture(n=100,ip=None,port=None,buf=4616):
+    """Receive n UDP datagrams on (ip, port); return them as a list, or () if ip/port is missing."""
+    if ip is None:
+        print('No IP')
+        return()
+
+    if port is None:
+        print('No port')
+        return()
+
+    # The context manager closes the socket once the capture finishes or fails.
+    with socket.socket(socket.AF_INET,socket.SOCK_DGRAM) as sock:
+        sock.bind((ip,port))
+
+        captured=0
+        packs = []
+        while captured<n:
+
+            data, addr = sock.recvfrom(buf)
+            packs.append(data)
+            captured += 1
+
+    return(packs)
+
+
+
+        
+        

From f3a7c7c7682e5fdd102ba11a4407bcc1b2df9846 Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Tue, 25 Jun 2024 21:45:10 -0700
Subject: [PATCH 20/30] Major code refactorisation

---
 CMakeLists.txt                             |  15 +-
 include/CMakeLists.txt                     |   4 +-
 include/dsaX.h                             | 105 +++---
 include/dsaX_blas_interface.h              |   4 +-
 include/dsaX_cublas_interface.h            |   3 +-
 include/dsaX_cuda_interface.h              |  33 +-
 include/dsaX_cuda_kernels.h                | 260 ++++++++++++++
 include/dsaX_cutlass_interface.h           |   6 +-
 include/dsaX_enums.h                       |  30 +-
 include/dsaX_ftd.h                         |   5 +
 include/dsaX_interface.h                   |  12 +
 include/dsaX_magma_interface.h             |   4 +
 include/dsaX_utils.h                       |   6 +-
 src/11_planar_complex_array.cu             |   1 -
 src/CMakeLists.txt                         |  72 +++-
 src/dsaX_beamformer.cpp                    | 120 +++++++
 src/dsaX_beamformer.cu                     | 168 ---------
 src/dsaX_blas_interface.cpp                |  28 ++
 src/dsaX_blas_interface.cu                 |  11 -
 src/dsaX_correlator.cpp                    |  18 +-
 src/dsaX_cublas_interface.cu               |  60 ++--
 src/dsaX_cuda_interface.cu                 | 318 ++++------------
 src/dsaX_interface.cpp                     |  69 ++++
 src/dsaX_magma_interface.cu                |  23 ++
 src/dsaX_psrdada_utils.cpp                 |  11 +
 src/dsaX_utils.cpp                         |  32 +-
 src/version.cpp                            |   5 +
 tests/CMakeLists.txt                       |   5 +-
 tests/CMakeLists.txt~                      |   5 -
 tests/command_line_params.cpp              |  17 +
 tests/dsaX_beamformer_correlator_test.cpp~ | 398 ---------------------
 tests/dsaX_correlator_test.cpp             |  58 ++-
 32 files changed, 889 insertions(+), 1017 deletions(-)
 create mode 100644 include/dsaX_cuda_kernels.h
 create mode 100644 include/dsaX_ftd.h
 create mode 100644 include/dsaX_interface.h
 create mode 100644 include/dsaX_magma_interface.h
 create mode 100644 src/dsaX_beamformer.cpp
 delete mode 100644 src/dsaX_beamformer.cu
 create mode 100644 src/dsaX_blas_interface.cpp
 delete mode 100644 src/dsaX_blas_interface.cu
 create mode 100644 src/dsaX_interface.cpp
 create mode 100644 src/dsaX_magma_interface.cu
 create mode 100644 src/dsaX_psrdada_utils.cpp
 create mode 100644 src/version.cpp
 delete mode 100644 tests/CMakeLists.txt~
 create mode 100644 tests/command_line_params.cpp
 delete mode 100644 tests/dsaX_beamformer_correlator_test.cpp~

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 441ae7f..acfd1a3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,7 +16,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True)
 set(CMAKE_CXX_EXTENSIONS ON)
 
 # Define the project
-project("DSA_XENGINE" VERSION 1.0.0 LANGUAGES CXX CUDA C)
+project("DSA_XENGINE" VERSION 1.0.0 LANGUAGES C CXX)
 
 # For GCC 8 and lower, set -pthread flag manually
 set(CMAKE_C_FLAGS "-pthread")
@@ -80,6 +80,12 @@ if(GIT_FOUND)
   endif()
 endif(GIT_FOUND)
 
+
+option(DSA_XENGINE_BUILD_ALL_TESTS "build tests by default" ON)
+option(DSA_XENGINE_INSTALL_ALL_TESTS "install tests by default" ON)
+option(DSA_XENGINE_BUILD_SHAREDLIB "build dsaXengine as a shared lib" ON)
+
+
 # Use ExternalProject_Add for libtcc (borks with FetchContent)
 # Use ExternalProject_Add for CUTLASS (long build time, version 2.11.0 for sm_8x arch)
 include(ExternalProject)
@@ -92,7 +98,7 @@ include(FetchContent)
 if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA)
 
   # CUDA specific part of CMakeLists
-  #set(CMAKE_CUDA_EXTENSIONS OFF)
+  enable_language(CUDA)
   find_package(CUDAToolkit REQUIRED)
 
   # Get GPU architecture from environmen, or set default (sm_80)
@@ -130,6 +136,7 @@ if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA)
   # Get TCC dependency
   option(DSA_XENGINE_ENABLE_TCC "Use TensorCoreCorrelators for correlatorss" OFF)
   if(DSA_XENGINE_ENABLE_TCC)
+    add_compile_definitions(DSA_XENGINE_ENABLE_TCC)
     option(DSA_XENGINE_DOWNLOAD_TCC "Download, build, link (and install) TCC" OFF)
     if(DSA_XENGINE_DOWNLOAD_TCC)
       ExternalProject_Add(TCC
@@ -145,6 +152,7 @@ if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA)
   # Get CUTLASS dependency
   option(DSA_XENGINE_ENABLE_CUTLASS "Use CUTLASS for GEMMs" OFF)
   if(DSA_XENGINE_ENABLE_CUTLASS)
+    add_compile_definitions(DSA_XENGINE_ENABLE_CUTLASS)
     option(DSA_XENGINE_DOWNLOAD_CUTLASS "Download, build (only the required kernels) link (and install) CUTLASS" OFF)
     if(DSA_XENGINE_DOWNLOAD_CUTLASS)
       # Custom CUTLASS build
@@ -164,6 +172,7 @@ if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA)
   # Get MAGMA dependency
   option(DSA_XENGINE_ENABLE_MAGMA "Use MAGMA for GEMMs" OFF)
   if(DSA_XENGINE_ENABLE_MAGMA)
+    add_compile_definitions(DSA_XENGINE_ENABLE_MAGMA)
     option(DSA_XENGINE_DOWNLOAD_MAGMA "Download, build (only the required kernels) link (and install) MAGMA" OFF)
     if(DSA_XENGINE_DOWNLOAD_MAGMA)
       # Custom MAGMA build
@@ -182,6 +191,7 @@ if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA)
   # Get XGPU dependency (fix install)
   option(DSA_XENGINE_ENABLE_XGPU "Use xGPU for correlatorss" OFF)
   if(DSA_XENGINE_ENABLE_XGPU)
+    add_compile_definitions(DSA_XENGINE_ENABLE_XGPU)
     option(DSA_XENGINE_DOWNLOAD_XGPU "Download and build xGPU" OFF)
     if(DSA_XENGINE_DOWNLOAD_XGPU)
       # Download, build and install
@@ -203,6 +213,7 @@ endif() # CUDA functionality
 # Get OPENBLAS dependency
 option(DSA_XENGINE_ENABLE_OPENBLAS "Use OPENBLAS for GEMMs" OFF)
 if(DSA_XENGINE_ENABLE_OPENBLAS)
+  add_compile_definitions(DSA_XENGINE_ENABLE_OPENBLAS)
   option(DSA_XENGINE_DOWNLOAD_OPENBLAS "Download, build, link, and install OPENBLAS" OFF)
   if(DSA_XENGINE_DOWNLOAD_OPENBLAS)
     # Custom OPENBLAS build
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
index a056a0f..e8ec2d6 100644
--- a/include/CMakeLists.txt
+++ b/include/CMakeLists.txt
@@ -4,12 +4,14 @@ enable_language(CUDA)
 #------------------------------
 set(DSA_XENGINE_HEADERS
   # cmake-format: sortable
+  dsaX.h
+  dsaX_def.h
+  dsaX_ftd.h  
   dsaX_cuda_interface.h
   dsaX_cuda_headers.h
   dsaX_capture.h
   dsaX_capture_manythread.h
   dsaX_capture_pcap.h
-  dsaX_def.h
   dsaX_cutlass_interface.h
   )
 install(FILES ${DSA_XENGINE_HEADERS} DESTINATION include)
diff --git a/include/dsaX.h b/include/dsaX.h
index 7cf23dc..699fe37 100644
--- a/include/dsaX.h
+++ b/include/dsaX.h
@@ -2,70 +2,34 @@
 
 #include <complex>
 
+#include "dsaX_def.h"
 #include "dsaX_enums.h"
-#include "dsaX_cuda_headers.h"
-
-// required to prevent overflow in corr matrix multiply
-#define halfFac 4
-
-// beam sep
-#define sep 1.0 // arcmin
-
-// define structure that carries around device memory
-typedef struct dmem {
-
-  // initial data and streams
-  char * h_input; // host input pointer
-  char * d_input, * d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
-  
-  // correlator pointers
-  // giant array for r and i: [NCHAN_PER_PACKET, 2 pol, NANTS_PROCESS, NPACKETS_PER_BLOCK * 2 times]
-  half * d_r, * d_i;
-  // arrays for matrix multiply output: input [NANTS_PROCESS, NANTS_PROCESS]
-  half * d_outr, *d_outi, *d_tx_outr, *d_tx_outi;
-  // giant output array: [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex]
-  float * d_output;
-  
-  // beamformer pointers
-  char * d_big_input;
-  half * d_br, * d_bi;
-  half * weights_r, * weights_i; //weights: [arm, tactp, b]
-  half * d_bigbeam_r, * d_bigbeam_i; //output: [tc, b]
-  unsigned char * d_bigpower; //output: [b, tc]
-  float * d_scf; // scale factor per beam
-  float * d_chscf;
-  float * h_winp;
-  int * flagants, nflags;
-  float * h_freqs, * d_freqs;
-
-  // timing
-  float cp, prep, cubl, outp;
-  
-} dmem;
 
 // Structure that carries BLAS parameters
 typedef struct dsaXBLASParam_s {  
   size_t struct_size; /**< Size of this struct in bytes.  Used to ensure that the host application and DSA see the same struct*/
   
   dsaXBLASType blas_type;    /**< Type of BLAS computation to perfrom */
+
+  dsaXBLASLib blas_lib;      /**< Which BLAS library to use for BLAS ops */
   
   // GEMM params
-  dsaBLASOperation trans_a; /**< operation op(A) that is non- or (conj.) transpose. */
-  dsaBLASOperation trans_b; /**< operation op(B) that is non- or (conj.) transpose. */
+  dsaXBLASOperation trans_a; /**< operation op(A) that is non- or (conj.) transpose. */
+  dsaXBLASOperation trans_b; /**< operation op(B) that is non- or (conj.) transpose. */
   int m;                     /**< number of rows of matrix op(A) and C. */
   int n;                     /**< number of columns of matrix op(B) and C. */
   int k;                     /**< number of columns of op(A) and rows of op(B). */
   int lda;                   /**< leading dimension of two-dimensional array used to store the matrix A. */
   int ldb;                   /**< leading dimension of two-dimensional array used to store matrix B. */
   int ldc;                   /**< leading dimension of two-dimensional array used to store matrix C. */
-  int a_offset;              /**< position of the A array from which begin read/write. */
-  int b_offset;              /**< position of the B array from which begin read/write. */
-  int c_offset;              /**< position of the C array from which begin read/write. */
-  int a_stride;              /**< stride of the A array in strided(batched) mode */
-  int b_stride;              /**< stride of the B array in strided(batched) mode */
-  int c_stride;              /**< stride of the C array in strided(batched) mode */
-  std::complex<double> alpha;             /**< scalar used for multiplication. */
-  std::complex<double>  beta;             /**< scalar used for multiplication. If beta==0, C does not have to be a valid input. */
+  long long int a_offset;    /**< position of the A array from which begin read/write. */
+  long long int b_offset;    /**< position of the B array from which begin read/write. */
+  long long int c_offset;    /**< position of the C array from which begin read/write. */
+  long long int a_stride;    /**< stride of the A array in strided(batched) mode */
+  long long int b_stride;    /**< stride of the B array in strided(batched) mode */
+  long long int c_stride;    /**< stride of the C array in strided(batched) mode */
+  std::complex<double> alpha;     /**< scalar used for multiplication. */
+  std::complex<double>  beta;     /**< scalar used for multiplication. If beta==0, C does not have to be a valid input. */
   
   // Common params
   int batch_count;             /**< number of pointers contained in arrayA, arrayB and arrayC. */
@@ -73,3 +37,46 @@ typedef struct dsaXBLASParam_s {
   dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */
   
 } dsaXBLASParam;
+
+// required to prevent overflow in corr matrix multiply
+#define halfFac 4
+
+// beam sep
+#define sep 1.0 // arcmin
+
+// define structure that carries around device memory pointers
+typedef struct dmem {
+  
+  // initial data and streams
+  char *h_input;        // host input pointer
+  char *d_input, *d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
+  
+  // correlator pointers
+  // giant array for r and i: [NCHAN_PER_PACKET, 2 pol, NANTS_PROCESS, NPACKETS_PER_BLOCK *2 times]
+  void *d_r, *d_i; //half
+  // arrays for matrix multiply output: input [NANTS_PROCESS, NANTS_PROCESS]
+  void *d_outr, *d_outi, *d_tx_outr, *d_tx_outi; //half
+  // giant output array: [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex]
+  float *d_output;
+  
+  // beamformer pointers
+  char *d_big_input;
+  void *d_br, *d_bi; //half
+  void *weights_r, *weights_i; //weights: [arm, tactp, b] //half
+  void *d_bigbeam_r, *d_bigbeam_i; //output: [tc, b] //half
+  unsigned char *d_bigpower; //output: [b, tc]
+  float *d_scf; // scale factor per beam
+  float *d_chscf;
+  float *h_winp;
+  int *flagants, nflags;
+  float *h_freqs, *d_freqs;
+
+  // timing
+  float cp, prep, cubl, outp;
+  
+} dmem;
+
+void dsaXCorrelator(void *output_data, void *input_data);
+
+void reorderOutput(dmem *d);
+void reorderInput(dmem *d);
diff --git a/include/dsaX_blas_interface.h b/include/dsaX_blas_interface.h
index 3cf5c4a..49564b5 100644
--- a/include/dsaX_blas_interface.h
+++ b/include/dsaX_blas_interface.h
@@ -1,5 +1,5 @@
 #pragma once
 
-#include "dsaX.h"
+#include "dsaX_interface.h"
 
-void dsaXHgemmStridedBatched(void *real_in, void *imag_in, void *real_out, void *imag_out, dsaXBLASParam param);
+void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param);
diff --git a/include/dsaX_cublas_interface.h b/include/dsaX_cublas_interface.h
index 7ad8b31..5aea5ef 100644
--- a/include/dsaX_cublas_interface.h
+++ b/include/dsaX_cublas_interface.h
@@ -1,5 +1,4 @@
 #pragma once
 #include "dsaX.h"
-#include "dsaX_cuda_headers.h"
 
-void dsaXHgemmStridedBatchedCuda(half *real_in, half *imag_in, half *real_out, half *imag_out, dsaXBLASParam param);
+void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param);
diff --git a/include/dsaX_cuda_interface.h b/include/dsaX_cuda_interface.h
index c8ea8aa..cee1581 100644
--- a/include/dsaX_cuda_interface.h
+++ b/include/dsaX_cuda_interface.h
@@ -1,32 +1,35 @@
 #pragma once
 
+#include <vector>
+
 #include "dsaX_def.h"
+#include "dsaX_enums.h"
 #include "dsaX.h"
 
-#ifdef DSA_XENGINE_TARGET_CUDA
-void initialize_device_memory(dmem *d, int bf);
 
-void deallocate_device_memory(dmem *d, int bf);
 
-void reorder_output_device(dmem *d);
+void initializeCudaMemory(dmem *d, int bf);
+
+void deallocateCudaMemory(dmem *d, int bf);
+
+void dsaXmemsetCuda(void *array, int ch, size_t n);
 
-__global__ void corr_input_copy(char *input, half *inr, half *ini);
+void dsaXmemcpyCuda(void *array_device, void *array_host, size_t n, dsaXMemcpyKind kind);
 
-template <typename in_prec, typename out_prec> __global__ void transpose_matrix(in_prec *idata, out_prec *odata);
+void dsaXDeviceSynchronizeCuda();
 
-void reorder_input_device(char *input, char *tx, half *inr, half *ini);
+void reorderOutputCuda(dmem *d);
 
-__global__ void corr_output_copy(half *outr, half *outi, float *output, int *indices_lookup);
+void calcWeightsCuda(dmem *d);
 
-__global__ void transpose_input_bf(double *idata, double *odata);
+void reorderInputCuda(dmem *d);
 
-__global__ void populate_weights_matrix(float *antpos_e, float *antpos_n, float *calibs, half *wr, half *wi, float *fqs);
+template <typename in_prec, typename out_prec> void transposeMatrixCuda(in_prec *idata, out_prec *odata);
 
-void calc_weights(dmem *d);
+void transposeInputBeamformerCuda(double *idata, double *odata, std::vector<int> &dim_block_in, std::vector<int> &dim_grid_in);
 
-__global__ void fluff_input_bf(char *input, half *dr, half *di);
+void transposeScaleBeamformerCuda(void *real, void *imag, unsigned char *output, std::vector<int> &dim_block_in, std::vector<int> &dim_grid_in);
 
-__global__ void transpose_scale_bf(half *ir, half *ii, unsigned char *odata);
+void fluffInputBeamformerCuda(char *input, void *b_real, void *b_imag, int blocks, int tpb);
 
-__global__ void sum_beam(unsigned char *input, float *output);
-#endif
+void sumBeamCuda(unsigned char *input, float *output, int blocks, int tpb);
diff --git a/include/dsaX_cuda_kernels.h b/include/dsaX_cuda_kernels.h
new file mode 100644
index 0000000..db09baa
--- /dev/null
+++ b/include/dsaX_cuda_kernels.h
@@ -0,0 +1,260 @@
+#pragma once
+
+#include "dsaX_cuda_headers.h"
+
+// KERNELS
+// DMH: Abstract hardcoded launch parameters
+__global__ void transpose_input_beamformer(double *idata, double *odata) {
+  
+  __shared__ double tile[16][17][4];
+  
+  int x = blockIdx.x * 16 + threadIdx.x;
+  int y = blockIdx.y * 16 + threadIdx.y;
+  int width = gridDim.x * 16;
+
+  for (int j = 0; j < 16; j += 8) {
+    tile[threadIdx.y+j][threadIdx.x][0] = idata[4*((y+j)*width + x)];
+    tile[threadIdx.y+j][threadIdx.x][1] = idata[4*((y+j)*width + x)+1];
+    tile[threadIdx.y+j][threadIdx.x][2] = idata[4*((y+j)*width + x)+2];
+    tile[threadIdx.y+j][threadIdx.x][3] = idata[4*((y+j)*width + x)+3];
+  }
+  
+  __syncthreads();
+
+  x = blockIdx.y * 16 + threadIdx.x;  // transpose block offset
+  y = blockIdx.x * 16 + threadIdx.y;
+  width = gridDim.y * 16;
+
+  for (int j = 0; j < 16; j += 8) {
+    odata[4*((y+j)*width + x)] = tile[threadIdx.x][threadIdx.y + j][0];
+    odata[4*((y+j)*width + x)+1] = tile[threadIdx.x][threadIdx.y + j][1];
+    odata[4*((y+j)*width + x)+2] = tile[threadIdx.x][threadIdx.y + j][2];
+    odata[4*((y+j)*width + x)+3] = tile[threadIdx.x][threadIdx.y + j][3];
+  }
+}
+
+// kernel to help with reordering output
+// outr and outi are [NANTS, NANTS, NCHAN_PER_PACKET, 2time, 2pol, halfFac]
+// run with NCHAN_PER_PACKET*2*NBASE/128 blocks of 128 threads
+__global__ void corr_output_copy(half *outr, half *outi, float *output, int *indices_lookup) {
+  
+  int bidx = blockIdx.x; // assume NCHAN_PER_PACKET*2*NBASE/128
+  int tidx = threadIdx.x; // assume 128
+  int idx = bidx*128+tidx;
+  
+  int baseline = (int)(idx / (NCHAN_PER_PACKET * 2));
+  int chpol = (int)(idx % (NCHAN_PER_PACKET * 2));
+  int ch = (int)(chpol / 2);
+  int base_idx = indices_lookup[baseline];
+  int iidx = base_idx * NCHAN_PER_PACKET + ch;
+  int pol = (int)(chpol % 2);
+
+  float v1=0., v2=0.;
+
+  // Use CUDA casting intrinsic __half2float
+  for (int i=0;i<halfFac;i++) {
+    v1 += __half2float(outr[(4*iidx+pol)*halfFac+i])+__half2float(outr[(4*iidx+2+pol)*halfFac+i]);
+    v2 += __half2float(outi[(4*iidx+pol)*halfFac+i])+__half2float(outi[(4*iidx+2+pol)*halfFac+i]);
+  }
+
+  output[2*idx] = v1;
+  output[2*idx+1] = v2;
+  
+}
+
+// transpose kernel
+// assume breakdown into tiles of 32x32, and run with 32x8 threads per block
+// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32)
+// here, width is the dimension of the fastest index
+template <typename in_prec, typename out_prec> __global__ void transpose_matrix(in_prec * idata, out_prec * odata) {
+  
+  __shared__ in_prec tile[32][33];
+  
+  int x = blockIdx.x * 32 + threadIdx.x;
+  int y = blockIdx.y * 32 + threadIdx.y;
+  int width = gridDim.x * 32;
+
+  for (int j = 0; j < 32; j += 8)
+     tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
+
+  __syncthreads();
+
+  x = blockIdx.y * 32 + threadIdx.x;  // transpose block offset
+  y = blockIdx.x * 32 + threadIdx.y;
+  width = gridDim.y * 32;
+
+  for (int j = 0; j < 32; j += 8)
+     odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
+
+}
+
+// kernel to fluff input
+// run with 128 threads and NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 blocks
+__global__ void corr_input_copy(char *input, half *inr, half *ini) {
+
+  int bidx = blockIdx.x;  // assume NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128
+  int tidx = threadIdx.x; // assume 128 threads per block
+  int iidx = bidx*128+tidx;
+
+  // 15 in unsigned char binary is 00001111. Perform bitwise & on 15 and input char data iiiirrrr
+  // to get real part 4 bit data.
+  // 0000rrrr
+  // Bit shift this result by 4 to the left.
+  // rrrr0000
+  // Cast to signed char (explicitly signed, since plain char is unsigned on some targets).
+  // srrr0000
+  // Shift right by 4 bits; this relies on an arithmetic shift to extend the sign bit.
+  // sssssrrr
+  // Cast to float and use CUDA intrinsic to cast to signed half
+  inr[iidx] = __float2half((float)((signed char)((   (unsigned char)(input[iidx]) & (unsigned char)(15)  ) << 4) >> 4));
+
+  // 240 in unsigned char binary is 11110000. Perform bitwise & on 240 and input char data iiiirrrr
+  // to get imag part 4 bit data
+  // iiii0000.
+  // Cast to signed char (explicitly signed, for the same reason as the real part)
+  // siii0000
+  // Shift right by 4 bits, again relying on an arithmetic shift to extend the sign bit.
+  // sssssiii
+  // Cast to float and use CUDA intrinsic to cast to signed half
+  ini[iidx] = __float2half((float)((signed char)((   (unsigned char)(input[iidx]) & (unsigned char)(240)  )) >> 4));
+
+  // Both results should be half (FP16) integers between -8 and 7. The check below is a disabled debug aid.
+  half re = inr[iidx];
+  half im = ini[iidx];
+  half lim = 2.;
+  if( (re > lim || re < -lim) || (im > lim || im < -lim)) {
+    //printf("re = %f, im = %f\n", __half2float(re), __half2float(im));
+  }
+}
+
+// kernel to populate an instance of weights matrix
+// [2, (NCHAN_PER_PACKET/8), NBEAMS/2, 4times*(NANTS/2)*8chan*2tim*2pol]
+// run with 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128 blocks of 128 threads
+__global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs) {
+  
+  int bidx = blockIdx.x;
+  int tidx = threadIdx.x;
+  int inidx = bidx*128+tidx;  
+  
+  // 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)
+  
+  // get indices
+  int iArm = (int)(inidx / ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)));
+  int iidx = (int)(inidx % ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)));
+  int fq = (int)(iidx / (128*(NANTS/2)*(NBEAMS/2)));
+  int idx = (int)(iidx % (128*(NANTS/2)*(NBEAMS/2)));
+  int bm = (int)(idx / (128*(NANTS/2)));
+  int tactp = (int)(idx % (128*(NANTS/2)));
+  //int t = (int)(tactp / (32*(NANTS/2)));
+  int actp = (int)(tactp % (32*(NANTS/2)));
+  int a = (int)(actp / 32);
+  int ctp = (int)(actp % 32);
+  //int c = (int)(ctp / 4);
+  int tp = (int)(ctp % 4);
+  //int t2 = (int)(tp / 2);
+  int pol = (int)(tp % 2);
+  int widx = (a+48*iArm)*(NCHAN_PER_PACKET/8)*2*2 + fq*2*2 + pol*2;
+  
+  // calculate weights
+  float theta, afac, twr, twi;
+  if (iArm==0) {
+    theta = sep*(127.-bm*1.)*PI/10800.; // radians
+    afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate
+    twr = cos(afac*antpos_e[a+48*iArm]);
+    twi = sin(afac*antpos_e[a+48*iArm]);
+    wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1]));
+    wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1]));
+    //wr[inidx] = __float2half(calibs[widx]);
+    //wi[inidx] = __float2half(calibs[widx+1]);
+  }
+  if (iArm==1) {
+    theta = sep*(127.-bm*1.)*PI/10800.; // radians
+    afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate
+    twr = cos(afac*antpos_n[a+48*iArm]);
+    twi = sin(afac*antpos_n[a+48*iArm]);
+    wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1]));
+    wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1]));
+    //wr[inidx] = __float2half(calibs[widx]);
+    //wi[inidx] = __float2half(calibs[widx+1]);
+  }
+}
+
+// kernel to fluff input bf data
+// run with NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 blocks of 128 threads
+__global__ void fluff_input_beamformer(char * input, half * dr, half * di) {
+
+  int bidx = blockIdx.x; // assume NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128
+  int tidx = threadIdx.x; // assume 128
+  int idx = bidx*128+tidx;
+
+  // Each input byte packs two 4-bit values: dr is taken from the low nibble (mask 15)
+  // and di from the high nibble (mask 240). The nibble is placed in the top four bits
+  // of a byte (the low nibble is shifted left, the high one is already there), cast to
+  // signed char and shifted right by 4, relying on an arithmetic shift to extend the
+  // sign bit. The cast is explicitly signed because plain char is unsigned on some
+  // targets, which would discard the sign. The result is scaled by 0.015625 (1/64).
+  dr[idx] = __float2half(0.015625*((float)((signed char)(((unsigned char)(input[idx]) & (unsigned char)(15)) << 4) >> 4)));
+  di[idx] = __float2half(0.015625*((float)((signed char)(((unsigned char)(input[idx]) & (unsigned char)(240))) >> 4)));
+
+  // dr and di now hold half (FP16) values in the range [-8, 7]/64.
+}
+
+// transpose, add and scale kernel for bf
+// assume breakdown into tiles of 16x16, and run with 16x8 threads per block
+// launch with dim3 dimBlock(16, 8) and dim3 dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16)
+// scf is a per-beam scale factor to enable recasting as unsigned char
+__global__ void transpose_scale_beamformer(half * ir, half * ii, unsigned char * odata) {
+
+  __shared__ float tile[16][17];
+  
+  int x = blockIdx.x * 16 + threadIdx.x;
+  int y = blockIdx.y * 16 + threadIdx.y;
+  int width = gridDim.x * 16;
+  float dr, di;
+
+  for (int j = 0; j < 16; j += 8) {
+    dr = (float)(ir[(y+j)*width + x]);
+    di = (float)(ii[(y+j)*width + x]);
+    tile[threadIdx.y+j][threadIdx.x] = (dr*dr+di*di);
+  }
+
+  __syncthreads();
+
+  x = blockIdx.y * 16 + threadIdx.x;  // transpose block offset
+  y = blockIdx.x * 16 + threadIdx.y;
+  width = gridDim.y * 16;
+
+  for (int j = 0; j < 16; j += 8)
+    odata[(y+j)*width + x] = (unsigned char)(tile[threadIdx.x][threadIdx.y + j]/128.);
+
+}
+
+// sum over all times in output beam array
+// run with (NCHAN_PER_PACKET/8)*(NBEAMS/2) blocks of (NPACKETS_PER_BLOCK/4) threads
+// (power-of-two block size, at most 512: the shared scratch array has 512 slots)
+__global__ void sum_beam(unsigned char *input, float *output) {
+
+  __shared__ float summ[512];
+  int bidx = blockIdx.x;
+  int tidx = threadIdx.x;
+  int bm = (int)(bidx/48);
+  int ch = (int)(bidx % 48);
+
+  // NOTE(review): the beam stride is hard-coded as 256*48 while tidx*48 can exceed
+  // it for blocks of more than 256 threads - confirm against the input layout.
+  summ[tidx] = (float)(input[bm*256*48 + tidx*48 + ch]);
+
+  __syncthreads();
+
+  // Tree reduction in shared memory. Only threads below the current stride add,
+  // and every thread reaches the barrier before the stride is halved, so no
+  // thread reads a slot that another thread is still updating. Starting from
+  // blockDim.x/2 also avoids reading slots that were never written.
+  for (int stride = blockDim.x/2; stride > 0; stride >>= 1) {
+    if (tidx < stride) summ[tidx] += summ[tidx+stride];
+    __syncthreads();
+  }
+
+  // thread 0 now holds the block total
+  if (tidx==0) output[bidx] = summ[tidx];
+}
diff --git a/include/dsaX_cutlass_interface.h b/include/dsaX_cutlass_interface.h
index 5aa753e..f95eeaa 100644
--- a/include/dsaX_cutlass_interface.h
+++ b/include/dsaX_cutlass_interface.h
@@ -48,11 +48,11 @@ struct Options {
   Options():
     help(false),
     problem_size({1024, 1024, 1024}),
-    batch_count(1),
+    batch_count(256),
     reference_check(false),
-    iterations(20),
+    iterations(2),
     alpha(1),
-    beta() { }
+    beta(0) { }
 
   // Parses the command line
   void parse(int argc, char const **args) {
diff --git a/include/dsaX_enums.h b/include/dsaX_enums.h
index b188019..30fe3c6 100644
--- a/include/dsaX_enums.h
+++ b/include/dsaX_enums.h
@@ -2,21 +2,35 @@
 
 #define DSA_INVALID_ENUM (-0x7fffffff - 1)
 
-typedef enum dsaError_t { DSA_SUCCESS = 0, DSA_ERROR = 1, DSA_ERROR_UNINITIALIZED = 2 } dsaError_t;
+typedef enum dsaXError_t {
+  DSA_SUCCESS = 0,
+  DSA_ERROR = 1,
+  DSA_ERROR_UNINITIALIZED = 2,
+  DSA_ERROR_INVALID = DSA_INVALID_ENUM
+} dsaXError;
 
-typedef enum dsaBLASOperation_s {				 
+typedef enum dsaXBLASOperation_s {				 
   DSA_BLAS_OP_N = 0, // No transpose
   DSA_BLAS_OP_T = 1, // Transpose only
   DSA_BLAS_OP_C = 2, // Conjugate transpose
   DSA_BLAS_OP_INVALID = DSA_INVALID_ENUM
-} dsaBLASOperation;
+} dsaXBLASOperation;
 
 typedef enum dsaXBLASType_s {
   DSA_BLAS_GEMM = 0,
   DSA_BLAS_INVALID = DSA_INVALID_ENUM
 } dsaXBLASType;
 
-typedef enum dsaXBLASDataType_s {
+typedef enum dsaXBLASLib_s {
+  DSA_BLAS_LIB_CUBLAS = 0,
+  DSA_BLAS_LIB_MAGMA  = 1,
+  DSA_BLAS_LIB_CUTLASS = 2,
+  DSA_BLAS_LIB_TCC = 3, 
+  DSA_BLAS_LIB_OPENBLAS = 4, 
+  DSA_BLAS_LIB_INVALID = DSA_INVALID_ENUM  
+} dsaXBLASLib;
+
+typedef enum dsaXBLASDataLib_s {
   DSA_BLAS_DATATYPE_H = 0, // Half
   DSA_BLAS_DATATYPE_S = 1, // Single
   DSA_BLAS_DATATYPE_D = 2, // Double
@@ -31,3 +45,11 @@ typedef enum dsaXBLASDataOrder_s {
   DSA_BLAS_DATAORDER_COL = 1,
   DSA_BLAS_DATAORDER_INVALID = DSA_INVALID_ENUM
 } dsaXBLASDataOrder;
+
+typedef enum dsaXMemcpyKind_s {
+  dsaXMemcpyHostToHost = 0,
+  dsaXMemcpyHostToDevice = 1,
+  dsaXMemcpyDeviceToHost = 2,
+  dsaXMemcpyDeviceToDevice = 3,
+  dsaXMemcpyInvalid = DSA_INVALID_ENUM
+} dsaXMemcpyKind;
diff --git a/include/dsaX_ftd.h b/include/dsaX_ftd.h
new file mode 100644
index 0000000..f7363f1
--- /dev/null
+++ b/include/dsaX_ftd.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "dsaX.h"
+
+void dcorrelator(dmem *d);
diff --git a/include/dsaX_interface.h b/include/dsaX_interface.h
new file mode 100644
index 0000000..06a2364
--- /dev/null
+++ b/include/dsaX_interface.h
@@ -0,0 +1,12 @@
+#pragma once
+#include <vector>
+#include "dsaX.h" // defines dmem, which the declarations below use
+
+// DMH: decorate these with Doxygen. NOTE(review): dsaX.h names dsaXCorrelator's arguments (output_data, input_data) - confirm the order.
+void dsaXCorrelator(void *input_data, void *output_data);
+void reorderInput(dmem *d);
+void reorderOutput(dmem *d);
+void transposeInputBeamformer(double *input, double *output, std::vector<int> &dimBlock, std::vector<int> &dimGrid);
+void transposeScaleBeamformer(void *array_real, void *array_imag, unsigned char *output, std::vector<int> &dimBlock, std::vector<int> &dimGrid);
+void fluffInputBeamformer(char *input, void *array_real, void *array_imag, int blocks, int tpb);
+void sumBeam(unsigned char *input, float *output, int blocks, int tpb);
diff --git a/include/dsaX_magma_interface.h b/include/dsaX_magma_interface.h
new file mode 100644
index 0000000..12f0cc7
--- /dev/null
+++ b/include/dsaX_magma_interface.h
@@ -0,0 +1,4 @@
+#pragma once
+#include "dsaX.h"
+
+void dsaXHgemmStridedBatchedMagma(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param);
diff --git a/include/dsaX_utils.h b/include/dsaX_utils.h
index 5d39861..f2dbc0c 100644
--- a/include/dsaX_utils.h
+++ b/include/dsaX_utils.h
@@ -3,7 +3,5 @@
 #include "dsaX.h"
 
 void dsaXmemset(void *array, int ch, size_t n);
-
-void dsaXmemcpyHostToDevice(void *array_device, void *array_host, size_t n);
-void dsaXmemcpyDeviceToHost(void *array_host, void *array_device, size_t n);
-void dsaXmemcpyDeviceToDevice(void *array_device_to, void *array_device_from, size_t n);
+void dsaXmemcpy(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind);
+void dsaXDeviceSynchronize();
diff --git a/src/11_planar_complex_array.cu b/src/11_planar_complex_array.cu
index ba94b60..94dcc55 100644
--- a/src/11_planar_complex_array.cu
+++ b/src/11_planar_complex_array.cu
@@ -302,7 +302,6 @@ public:
     typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
     typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
 
-
     int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
     int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n();
     int64_t imag_stride_C = int64_t(problem_size.m()) * problem_size.n();
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index c73743a..aaacfa5 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,47 +1,87 @@
-enable_language(CUDA)
+include_directories(${CMAKE_SOURCE_DIR}/include)
 
-include_directories(../include)
+if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA)
+  add_compile_definitions(DSA_XENGINE_TARGET_CUDA)
+endif()
+
+if(DSA_XENGINE_TARGET_TYPE STREQUAL CPU)
+  add_compile_definitions(DSA_XENGINE_TARGET_CPU)
+endif()
 
 # DSA Fast Time Domain library
 #-----------------------------
-add_library(dsa SHARED
+set(DSAX_OBJS
   dsaX_cuda_interface.cu
   dsaX_cublas_interface.cu
-  dsaX_blas_interface.cu
-  dsaX_beamformer.cu
+  dsaX_magma_interface.cu
+  dsaX_blas_interface.cpp
+  dsaX_beamformer.cpp
   dsaX_correlator.cpp
+  dsaX_interface.cpp
   dsaX_utils.cpp
   dsaX_psrdada_utils.cpp
   )
 
-if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA)
-  add_compile_definitions(DSA_XENGINE_TARGET_CUDA)
+# split source into cu and cpp files
+foreach(item ${DSAX_OBJS})
+  string(REGEX MATCH ".+\\.cu$" item_match ${item})
+  if(item_match)
+    list(APPEND DSAX_CU_OBJS ${item})
+  endif(item_match)
+endforeach(item ${DSAX_OBJS})
+
+list(REMOVE_ITEM DSAX_OBJS ${DSAX_CU_OBJS})
+
+# DSAX_CU_OBJS should contain all cuda files now and DSAX_OBJS all cpp.
+# If we have a git version, make version.cpp depend on git head so that it is
+# rebuilt if the git sha changed
+if(GITVERSION)
+  find_path(
+    DSAX_GITDIR NAMES HEAD
+    PATHS ${CMAKE_SOURCE_DIR}/.git/logs
+    NO_DEFAULT_PATH)
+  include(AddFileDependencies)
+  if(DSAX_GITDIR)
+    add_file_dependencies(version.cpp ${DSAX_GITDIR}/HEAD)
+  endif()
 endif()
-if(DSA_XENGINE_TARGET_TYPE STREQUAL CPU)
-  add_compile_definitions(DSA_XENGINE_TARGET_CPU)
+mark_as_advanced(DSAX_GITDIR)
+
+# generate a cmake object library for all cpp files first                                                                                                                                                           
+add_library(dsax_cpp OBJECT ${DSAX_OBJS})
+
+if(DSA_XENGINE_BUILD_SHAREDLIB)
+  set_target_properties(dsax_cpp PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
+  add_library(dsax SHARED)
+else()
+  add_library(dsax STATIC)
 endif()
+add_library(DSA_XENGINE::dsax ALIAS dsax)
+
+# make one library                                                                                                                                                                                                  
+target_sources(dsax PRIVATE $<TARGET_OBJECTS:dsax_cpp> ${DSAX_CU_OBJS})
 
 if(CUDAToolkit_FOUND)
-  target_link_libraries(dsa PUBLIC CUDA::cudart)
+  target_link_libraries(dsax INTERFACE CUDA::cudart_static ${CUDA_cublas_LIBRARY})
 endif()
 
 if(DSA_XENGINE_ENABLE_PSRDADA)
   include_directories(${PSRDada_SOURCE_DIR}/src)
   set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so)
-  target_link_libraries(dsa PUBLIC ${PSRDada_LIB})
+  target_link_libraries(dsax PUBLIC ${PSRDada_LIB})
 endif()
 
 if(DSA_XENGINE_ENABLE_XGPU) 
   include_directories(${xGPU_SOURCE_DIR}/src)
   set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a)
-  target_link_libraries(dsa PUBLIC ${XGPU_LIB})
+  target_link_libraries(dsax PUBLIC ${XGPU_LIB})
 endif()
 
 if(DSA_XENGINE_ENABLE_CUTLASS) 
   include_directories(${NvidiaCutlass_DIR}/../../../include)
   include_directories(${NvidiaCutlass_DIR}/../../../include/cutlass/util)
   set(NvidiaCutlass_LIB ${NvidiaCutlass_DIR}/../../../lib64/libcutlass.so)
-  target_link_libraries(dsa PUBLIC ${NvidiaCutlass_LIB})
+  target_link_libraries(dsax PUBLIC ${NvidiaCutlass_LIB})
   
   # Some simple CUTLASS examples to test linking/benching
   #------------------------------------------------------
@@ -67,7 +107,7 @@ endif()
 #-----------------------------
 install(TARGETS
   # cmake-format: sortable
-  dsa
+  dsax
   LIBRARY DESTINATION
   lib
   )
@@ -84,6 +124,6 @@ install(TARGETS
 #-----------------------------
 
 if(CUDAToolkit_FOUND)
-  add_executable(dsaX_beamformer_correlator_exe dsaX_beamformer_correlator_exe.cu)
-  target_link_libraries(dsaX_beamformer_correlator_exe PUBLIC dsa ${CUDA_cublas_LIBRARY} ${PSRDada_LIB})
+  #add_executable(dsaX_beamformer_correlator_exe dsaX_beamformer_correlator_exe.cu)
+  #target_link_libraries(dsaX_beamformer_correlator_exe PUBLIC dsax ${CUDA_cublas_LIBRARY} ${PSRDada_LIB})
 endif()
diff --git a/src/dsaX_beamformer.cpp b/src/dsaX_beamformer.cpp
new file mode 100644
index 0000000..f395b0e
--- /dev/null
+++ b/src/dsaX_beamformer.cpp
@@ -0,0 +1,120 @@
+// -*- c++ -*-
+/* assumes input and output block size is appropriate - will seg fault otherwise*/
+/*
+Workflow is similar for BF and corr applications
+ - copy data to GPU, convert to half-precision and calibrate while reordering
+ - do matrix operations to populate large output vector
+ */
+
+#include <iostream>
+#include <vector>
+
+#include "dsaX_def.h"
+#include "dsaX.h"
+#include "dsaX_blas_interface.h"
+#include "dsaX_utils.h"
+#include "dsaX_psrdada_utils.h"
+
+using namespace std;
+
+/*
+Beamformer:
+ - initial data is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] 
+ - split into EW and NS antennas via cudaMemcpy: [NPACKETS_PER_BLOCK, NANTS/2, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
+ - want [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, NANTS/2, 8chan, 2 times, 2 pol, 4-bit complex]
+(single transpose operation)
+ - weights are [NCHAN_PER_PACKET/8, NBEAMS, 4tim, NANTS/2, 8chan, 2 times, 2 pol] x 2
+ - then fluff and run beamformer: output is [NCHAN_PER_PACKET/8, NBEAMS, NPACKETS_PER_BLOCK/4] (w column-major)
+ - transpose and done! 
+
+*/
+// Beamformer: per array arm, split input by antenna half, transpose + fluff, batched GEMM against weights, form power.
+void dbeamformer(dmem *d) {
+  // GEMM settings: one batch per group of 8 channels (batch_count = NCHAN_PER_PACKET/8)
+  dsaXBLASParam blas_param;
+  blas_param.trans_a = DSA_BLAS_OP_T;
+  blas_param.trans_b = DSA_BLAS_OP_N;
+  blas_param.m = NPACKETS_PER_BLOCK/4;
+  blas_param.n = NBEAMS/2;
+  blas_param.k = 4*(NANTS/2)*8*2*2;
+  blas_param.alpha = 1.0;
+  blas_param.lda = blas_param.k;
+  blas_param.ldb = blas_param.k;
+  blas_param.beta = 0.0;
+  blas_param.ldc = blas_param.m;
+  blas_param.a_stride = (NPACKETS_PER_BLOCK)*(NANTS/2)*8*2*2;
+  blas_param.b_stride = (NBEAMS/2)*4*(NANTS/2)*8*2*2;
+  blas_param.c_stride = (NPACKETS_PER_BLOCK/4)*NBEAMS/2;
+  blas_param.batch_count = NCHAN_PER_PACKET/8;
+  
+  long long int i1, i2;
+  
+  // timing: elapsed seconds are accumulated on d as
+  // cp (copy), prep (prepare), cubl (gemm), outp (output)
+  clock_t begin, end;
+
+  // do big memcpy
+  begin = clock();
+  dsaXmemcpy(d->d_big_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4, dsaXMemcpyHostToDevice);
+  end = clock();
+  d->cp += (float)(end - begin) / CLOCKS_PER_SEC;
+  
+  // loop over halves of the array
+  for (int iArm=0;iArm<2;iArm++) {
+  
+    // zero out output arrays
+    dsaXmemset(d->d_bigbeam_r,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(short));
+    dsaXmemset(d->d_bigbeam_i,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(short));
+    dsaXDeviceSynchronize();
+    
+    // copy data to device
+    // initial data: [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
+    // final data: need to split by NANTS.
+    begin = clock();
+    for (i1=0; i1<NPACKETS_PER_BLOCK; i1++) 
+      dsaXmemcpy(d->d_input + i1*(NANTS/2)*NCHAN_PER_PACKET*4,
+                 d->d_big_input + i1*(NANTS)*NCHAN_PER_PACKET*4+iArm*(NANTS/2)*NCHAN_PER_PACKET*4,
+                 (NANTS/2)*NCHAN_PER_PACKET*4, dsaXMemcpyDeviceToDevice);
+    end = clock();
+    d->cp += (float)(end - begin) / CLOCKS_PER_SEC;
+    
+    // do reorder and fluff of data to real and imag
+    begin = clock();
+
+    // DMH: Abstract the launch parameters
+    std::vector<int> dimBlock = {16, 8};
+    std::vector<int> dimGrid = {NCHAN_PER_PACKET/8/16, (NPACKETS_PER_BLOCK)*(NANTS/2)/16};
+    transposeInputBeamformer((double *)(d->d_input), (double *)(d->d_tx), dimBlock, dimGrid);
+
+    int blocks = NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128;
+    int tpb = 128;
+    fluffInputBeamformer(d->d_tx, d->d_br, d->d_bi, blocks, tpb);    
+    end = clock();
+    d->prep += (float)(end - begin) / CLOCKS_PER_SEC;
+    
+    // set up for gemm    
+    i2 = iArm*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8); // weights offset
+    blas_param.b_offset = i2;
+    // large matrix multiply to get real and imag outputs
+    begin = clock();
+    dsaXHgemmStridedBatched(d->d_br, d->d_bi, d->weights_r, d->weights_i, d->d_bigbeam_r, d->d_bigbeam_i, blas_param);
+    end = clock();
+    d->cubl += (float)(end - begin) / CLOCKS_PER_SEC;
+        
+    // simple formation of total power and scaling to 8-bit in transpose kernel
+    // Reuse dimBlock (16, 8); BOTH grid dimensions must be reset, else y keeps the input-transpose value
+    //DMH: Abstract kernel launch parameters
+    dimGrid[0] = (NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16;
+    dimGrid[1] = (NCHAN_PER_PACKET/8)/16;
+    begin = clock();
+    transposeScaleBeamformer(d->d_bigbeam_r, d->d_bigbeam_i, d->d_bigpower + iArm*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2), dimBlock, dimGrid);
+    end = clock();
+    d->outp += (float)(end - begin) / CLOCKS_PER_SEC;
+  }
+
+  // form sum over times
+  int blocks = 24576;
+  int tpb = 512;
+  // COMMENT OUT WHEN DONE!!!
+  //sumBeam(d->d_bigpower, d->d_chscf, blocks, tpb);
+}
diff --git a/src/dsaX_beamformer.cu b/src/dsaX_beamformer.cu
deleted file mode 100644
index 0d7b1df..0000000
--- a/src/dsaX_beamformer.cu
+++ /dev/null
@@ -1,168 +0,0 @@
-// -*- c++ -*-
-/* assumes input and output block size is appropriate - will seg fault otherwise*/
-/*
-Workflow is similar for BF and corr applications
- - copy data to GPU, convert to half-precision and calibrate while reordering
- - do matrix operations to populate large output vector
- */
-
-#include <iostream>
-
-#include "dsaX_def.h"
-#include "dsaX.h"
-#include "dsaX_blas_interface.h"
-#include "dsaX_utils.h"
-#include "dsaX_psrdada_utils.h"
-#ifdef DSA_XENGINE_TARGET_CUDA
-#include "dsaX_cuda_interface.h"
-#endif
-
-using namespace std;
-
-int DEBUG = 1;
-
-void usage() {
-  fprintf (stdout,
-	   "dsaX_beamformer_correlator [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -i in_key [default REORDER_BLOCK_KEY]\n"
-	   " -o out_key [default XGPU_BLOCK_KEY]\n"
-	   " -b run beamformer [default is to run correlator]\n"
-	   " -h print usage\n"
-	   " -t binary file for test mode\n"
-	   " -f flagants file\n"
-	   " -a calib file\n"
-	   " -s start frequency (assumes -0.244140625MHz BW)\n");
-}
-
-
-/*
-Beamformer:
- - initial data is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex] 
- - split into EW and NS antennas via cudaMemcpy: [NPACKETS_PER_BLOCK, NANTS/2, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
- - want [NCHAN_PER_PACKET/8, NPACKETS_PER_BLOCK/4, 4tim, NANTS/2, 8chan, 2 times, 2 pol, 4-bit complex]
-(single transpose operation)
- - weights are [NCHAN_PER_PACKET/8, NBEAMS, 4tim, NANTS/2, 8chan, 2 times, 2 pol] x 2
- - then fluff and run beamformer: output is [NCHAN_PER_PACKET/8, NBEAMS, NPACKETS_PER_BLOCK/4] (w column-major)
- - transpose and done! 
-
-*/
-// beamformer function
-void dbeamformer(dmem *d) {
-
-  // gemm settings - recall column major order assumed
-  // stride over 48 chans
-  cublasHandle_t cublasH = NULL;
-  cublasCreate(&cublasH);
-  cublasOperation_t transa = CUBLAS_OP_T;
-  cublasOperation_t transb = CUBLAS_OP_N;
-  const int m = NPACKETS_PER_BLOCK/4;
-  const int n = NBEAMS/2;
-  const int k = 4*(NANTS/2)*8*2*2;
-  const half alpha = 1.;
-  const half malpha = -1.;
-  const int lda = k;
-  const int ldb = k;
-  const half beta0 = 0.;
-  const half beta1 = 1.;
-  const int ldc = m;
-  const long long int strideA = (NPACKETS_PER_BLOCK)*(NANTS/2)*8*2*2;
-  const long long int strideB = (NBEAMS/2)*4*(NANTS/2)*8*2*2;
-  const long long int strideC = (NPACKETS_PER_BLOCK/4)*NBEAMS/2;
-  const int batchCount = NCHAN_PER_PACKET/8;
-  long long int i1, i2;//, o1;
-  
-  // create streams
-  cudaStream_t stream;
-  cudaStreamCreate(&stream);
-
-  // timing
-  // copy, prepare, cublas, output
-  clock_t begin, end;
-
-  // do big memcpy
-  begin = clock();
-  dsaXmemcpyHostToDevice(d->d_big_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4);
-  end = clock();
-  d->cp += (float)(end - begin) / CLOCKS_PER_SEC;
-  
-  // loop over halves of the array
-  for (int iArm=0;iArm<2;iArm++) {
-  
-    // zero out output arrays
-    dsaXmemset(d->d_bigbeam_r,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half));
-    dsaXmemset(d->d_bigbeam_i,0,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*sizeof(half));
-    cudaDeviceSynchronize();
-    
-    // copy data to device
-    // initial data: [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
-    // final data: need to split by NANTS.
-    begin = clock();
-    for (i1=0; i1<NPACKETS_PER_BLOCK; i1++) 
-      dsaXmemcpyDeviceToDevice(d->d_input+i1*(NANTS/2)*NCHAN_PER_PACKET*4,
-			       d->d_big_input+i1*(NANTS)*NCHAN_PER_PACKET*4+iArm*(NANTS/2)*NCHAN_PER_PACKET*4,
-			       (NANTS/2)*NCHAN_PER_PACKET*4);
-    end = clock();
-    d->cp += (float)(end - begin) / CLOCKS_PER_SEC;
-    
-    // do reorder and fluff of data to real and imag
-    begin = clock();
-    
-    dim3 dimBlock1(16, 8), dimGrid1(NCHAN_PER_PACKET/8/16, (NPACKETS_PER_BLOCK)*(NANTS/2)/16);    
-    transpose_input_bf<<< dimGrid1, dimBlock1 >>>((double *)(d->d_input), (double *)(d->d_tx));    
-    fluff_input_bf<<<NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128,128>>>(d->d_tx, d->d_br, d->d_bi);
-    
-    end = clock();
-    d->prep += (float)(end - begin) / CLOCKS_PER_SEC;
-
-    // large matrix multiply to get real and imag outputs
-    // set up for gemm
-    cublasSetStream(cublasH, stream);
-    i2 = iArm*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8); // weights offset
-    
-    // run strided batched gemm
-    begin = clock();
-    // ac
-    cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
-			      &alpha,d->d_br,lda,strideA,
-			      d->weights_r+i2,ldb,strideB,&beta0,
-			      d->d_bigbeam_r,ldc,strideC,
-			      batchCount);
-    // -bd
-    cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
-			      &malpha,d->d_bi,lda,strideA,
-			      d->weights_i+i2,ldb,strideB,&beta1,
-			      d->d_bigbeam_r,ldc,strideC,
-			      batchCount);
-    // bc
-    cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
-			      &alpha,d->d_bi,lda,strideA,
-			      d->weights_r+i2,ldb,strideB,&beta0,
-			      d->d_bigbeam_i,ldc,strideC,
-			      batchCount);
-    // ad
-    cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
-			      &alpha,d->d_br,lda,strideA,
-			      d->weights_i+i2,ldb,strideB,&beta1,
-			      d->d_bigbeam_i,ldc,strideC,
-			      batchCount);
-      
-    cudaDeviceSynchronize();
-    end = clock();
-    d->cubl += (float)(end - begin) / CLOCKS_PER_SEC;
-        
-    // simple formation of total power and scaling to 8-bit in transpose kernel
-    begin = clock();
-    dim3 dimBlock(16, 8), dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16);
-    transpose_scale_bf<<<dimGrid,dimBlock>>>(d->d_bigbeam_r,d->d_bigbeam_i,d->d_bigpower+iArm*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2));
-    end = clock();
-    d->outp += (float)(end - begin) / CLOCKS_PER_SEC;
-  }
-
-  cudaStreamDestroy(stream);
-  cublasDestroy(cublasH);
-
-  // form sum over times
-  //sum_beam<<<24576,512>>>(d->d_bigpower,d->d_chscf);
-}
diff --git a/src/dsaX_blas_interface.cpp b/src/dsaX_blas_interface.cpp
new file mode 100644
index 0000000..e370e87
--- /dev/null
+++ b/src/dsaX_blas_interface.cpp
@@ -0,0 +1,28 @@
+#include <iostream>
+
+#include "dsaX.h"
+#include "dsaX_cublas_interface.h"
+#include "dsaX_magma_interface.h"
+
+void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param) {
+  // Dispatch the strided batched Hgemm on separate real/imag arrays
+  // to the backend selected by param.blas_lib.
+  switch (param.blas_lib) {
+  case DSA_BLAS_LIB_CUBLAS:
+    dsaXHgemmStridedBatchedCuda(real_a, imag_a, real_b, imag_b, real_c, imag_c, param);
+    break;
+  case DSA_BLAS_LIB_MAGMA:
+    dsaXHgemmStridedBatchedMagma(real_a, imag_a, real_b, imag_b, real_c, imag_c, param);
+    break;
+  case DSA_BLAS_LIB_CUTLASS:
+  case DSA_BLAS_LIB_OPENBLAS:
+  case DSA_BLAS_LIB_TCC:
+    // Backends declared but not wired up yet: fail loudly rather than return with C unwritten.
+    std::cerr << "dsaX Error: blas_lib " << param.blas_lib << " is not implemented." << std::endl;
+    exit(1);
+  default:
+    // Exit with a non-zero status so the failure is visible to the calling process.
+    std::cerr << "dsaX Error: Unknown blas_lib " << param.blas_lib << " given." << std::endl;
+    exit(1);
+  }
+}
diff --git a/src/dsaX_blas_interface.cu b/src/dsaX_blas_interface.cu
deleted file mode 100644
index 7e49fcb..0000000
--- a/src/dsaX_blas_interface.cu
+++ /dev/null
@@ -1,11 +0,0 @@
-#include <dsaX.h>
-#include "dsaX_cublas_interface.h"
-
-void dsaXHgemmStridedBatched(void *real_in, void *imag_in, void *real_out, void *imag_out, dsaXBLASParam param) {
-#ifdef DSA_XENGINE_TARGET_CUDA
-  dsaXHgemmStridedBatchedCuda((half*)real_in, (half*)imag_in, (half*)real_out, (half*)imag_out, param);
-#else
-  std::cout "Not implemented" << std::endl;
-  exit(0);
-#endif
-}
diff --git a/src/dsaX_correlator.cpp b/src/dsaX_correlator.cpp
index d2223f5..4611939 100644
--- a/src/dsaX_correlator.cpp
+++ b/src/dsaX_correlator.cpp
@@ -13,23 +13,22 @@ Workflow is similar for BF and corr applications
 #include "dsaX_blas_interface.h"
 #include "dsaX_utils.h"
 #include "dsaX_psrdada_utils.h"
-#include "dsaX_cuda_interface.h"
 
 // correlator function
 // workflow: copy to device, reorder, stridedBatchedGemm, reorder
-// DMH CUDA references excised
+// DMH CUDA references excised.
 void dcorrelator(dmem *d) {
 
   // copy to device
-  dsaXmemcpyHostToDevice(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
+  dsaXmemcpy(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2, dsaXMemcpyHostToDevice);
   
   // zero out output arrays
-  dsaXmemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half));
-  dsaXmemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half));
+  dsaXmemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short
+  dsaXmemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short
   dsaXmemset(d->d_output, 0, NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float));
   
-  // reorder input
-  reorder_input_device(d->d_input, d->d_tx, d->d_r, d->d_i);
+  // reorder input into real and imaginary arrays of 2 byte data
+  reorderInput(d);
 
   dsaXBLASParam blas_param;
   // gemm settings
@@ -51,9 +50,8 @@ void dcorrelator(dmem *d) {
   blas_param.batch_count = NCHAN_PER_PACKET*2*2*halfFac;
 
   // Perform GEMM accoring to back end configuration
-  dsaXHgemmStridedBatched(d->d_r, d->d_i, d->d_outr, d->d_outi, blas_param);
+  dsaXHgemmStridedBatched(d->d_r, d->d_i, d->d_r, d->d_i, d->d_outr, d->d_outi, blas_param);
   
   // reorder output data
-  reorder_output_device(d);
-
+  reorderOutput(d);
 }
diff --git a/src/dsaX_cublas_interface.cu b/src/dsaX_cublas_interface.cu
index df6b3de..17a2c9b 100644
--- a/src/dsaX_cublas_interface.cu
+++ b/src/dsaX_cublas_interface.cu
@@ -1,9 +1,10 @@
 #include <iostream>
-#include "dsaX_cublas_interface.h"
+#include "dsaX.h"
+#include "dsaX_cuda_headers.h"
 
 using namespace std;
 
-void dsaXHgemmStridedBatchedCuda(half *real_in, half *imag_in, half *real_out, half *imag_out, dsaXBLASParam blas_param) {
+void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam blas_param) {
 #ifdef DSA_XENGINE_TARGET_CUDA
   
   // not sure if essential
@@ -45,51 +46,60 @@ void dsaXHgemmStridedBatchedCuda(half *real_in, half *imag_in, half *real_out, h
   const int n = blas_param.n;
   const int k = blas_param.k;
   const half alpha = blas_param.alpha.real();
-  const half malpha = -1.0 * blas_param.alpha.real();
+  const half malpha = (-1.0 * blas_param.alpha.real());
   const int lda = blas_param.lda;
   const int ldb = blas_param.ldb;
   const half beta0 = blas_param.beta.real();
   const half beta1 = 1.0;
   const int ldc = blas_param.ldc;
+  const long long int a_offset = blas_param.a_offset;
+  const long long int b_offset = blas_param.b_offset;
+  const long long int c_offset = blas_param.c_offset;
   const long long int strideA = blas_param.a_stride;
   const long long int strideB = blas_param.b_stride;
   const long long int strideC = blas_param.c_stride;
   const int batchCount = blas_param.batch_count;
   
-  // run strided batched gemm for datatype (a + ib)(c + id)
+  // Run strided batched gemm for datatype 
+  // (a + ib)(c + id) = (ac - bd) + i(bc + ad)
+  // on matrices alpha * op(A) * op(B) + beta * C
+  // where op(M) is defined by the transposition variable
+  // cublasOperation_t transM
+  
+  // Accumulate results into C matrix
   // ac
-  cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
-			    &alpha,real_in,lda,strideA,
-			    real_in,ldb,strideB,&beta0,
-			    real_out,ldc,strideC,
+  cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &alpha,
+			    (half *)real_a + a_offset, lda, strideA,
+			    (half *)real_b + b_offset, ldb, strideB, &beta0,
+			    (half *)real_c + c_offset, ldc, strideC,
 			    batchCount);
-  // bd
-  cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
-			    &alpha,imag_in,lda,strideA,
-			    imag_in,ldb,strideB,&beta1,
-			    real_out,ldc,strideC,
+  // -bd
+  cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &malpha,
+			    (half*)imag_a + a_offset, lda, strideA,
+			    (half*)imag_b + b_offset, ldb, strideB, &beta1,
+			    (half*)real_c + c_offset, ldc, strideC,
 			    batchCount);
-  // -bc
-  cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
-			    &malpha,imag_in,lda,strideA,
-			    real_in,ldb,strideB,&beta0,
-			    imag_out,ldc,strideC,
+  // bc
+  cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &alpha,
+			    (half*)imag_a + a_offset, lda, strideA,
+			    (half*)real_b + b_offset, ldb, strideB, &beta0,
+			    (half*)imag_c + c_offset, ldc, strideC,
 			    batchCount);
   // ad
-  cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
-			    &alpha,real_in,lda,strideA,
-			    imag_in,ldb,strideB,&beta1,
-			    imag_out,ldc,strideC,
+  cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &alpha,
+			    (half*)real_a + a_offset, lda, strideA,
+			    (half*)imag_b + b_offset, ldb, strideB, &beta1,
+			    (half*)imag_c + c_offset, ldc, strideC,
 			    batchCount);
-
+  
   // shown to be essential
   cudaDeviceSynchronize();
-
+  
   // destroy stream
   cudaStreamDestroy(stream);
   cublasDestroy(cublasH);  
 #else
-  std::cout "Not implemented" << std::endl;
+  std::cout << "dsaX not built with CUDA target." << std::endl;
   exit(0);
 #endif
 }
diff --git a/src/dsaX_cuda_interface.cu b/src/dsaX_cuda_interface.cu
index d1f77a4..8eda8ae 100644
--- a/src/dsaX_cuda_interface.cu
+++ b/src/dsaX_cuda_interface.cu
@@ -1,7 +1,14 @@
+#include <iostream>
+#include <vector>
+
+#include "dsaX_cuda_headers.h"
 #include "dsaX_cuda_interface.h"
+#include "dsaX_cuda_kernels.h"
+
+using namespace std;
 
 // allocate device memory
-void initialize_device_memory(dmem *d, int bf) {
+void initializeCudaMemory(dmem *d, int bf) {
   
   // for correlator
   if (bf==0) {
@@ -46,7 +53,7 @@ void initialize_device_memory(dmem *d, int bf) {
   }  
 }
 // deallocate device memory
-void deallocate_device_memory(dmem *d, int bf) {
+void deallocateCudaMemory(dmem *d, int bf) {
   
   cudaFree(d->d_input);
 
@@ -83,13 +90,13 @@ void deallocate_device_memory(dmem *d, int bf) {
 // the corr matrices are column major order
 // output needs to be [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex]
 // start with transpose to get [NANTS*NANTS, NCHAN_PER_PACKET*2*2], then sum into output using kernel
-void reorder_output_device(dmem * d) {
+void reorderOutputCuda(dmem * d) {
   
   // transpose input data
-  dim3 dimBlock(32, 8), dimGrid((NANTS*NANTS)/32,(NCHAN_PER_PACKET*2*2*halfFac)/32);
-  transpose_matrix<<<dimGrid,dimBlock>>>(d->d_outr,d->d_tx_outr);
-  transpose_matrix<<<dimGrid,dimBlock>>>(d->d_outi,d->d_tx_outi);
-
+  dim3 dimBlock(32, 8), dimGrid((NANTS*NANTS)/32, (NCHAN_PER_PACKET*2*2*halfFac)/32);
+  transpose_matrix<<<dimGrid, dimBlock>>>((half*)d->d_outr, (half*)d->d_tx_outr);
+  transpose_matrix<<<dimGrid, dimBlock>>>((half*)d->d_outi, (half*)d->d_tx_outi);
+  
   // look at output
   /*char * odata = (char *)malloc(sizeof(char)*384*4*NANTS*NANTS*2*halfFac);
   cudaMemcpy(odata,d->d_tx_outr,384*4*NANTS*NANTS*2*halfFac,cudaMemcpyDeviceToHost);
@@ -140,8 +147,8 @@ void reorder_output_device(dmem * d) {
   cudaMemcpy(d_idxs,h_idxs,sizeof(int)*NBASE,cudaMemcpyHostToDevice);
 
   // run kernel to finish things
-  corr_output_copy<<<NCHAN_PER_PACKET*2*NBASE/128,128>>>(d->d_tx_outr,d->d_tx_outi,d->d_output,d_idxs);
-
+  corr_output_copy<<<NCHAN_PER_PACKET*2*NBASE/128,128>>>((half*)d->d_tx_outr, (half*)d->d_tx_outi, d->d_output, d_idxs);
+  
   /*char * odata = (char *)malloc(sizeof(char)*384*4*NBASE*4);
   cudaMemcpy(odata,d->d_output,384*4*NBASE*4,cudaMemcpyDeviceToHost);
   FILE *fout;
@@ -154,70 +161,7 @@ void reorder_output_device(dmem * d) {
   //cudaStreamDestroy(stream);  
 }
 
-// kernel to fluff input
-// run with 128 threads and NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 blocks
-__global__ void corr_input_copy(char *input, half *inr, half *ini) {
-
-  int bidx = blockIdx.x;  // assume NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128
-  int tidx = threadIdx.x; // assume 128 threads per block
-  int iidx = bidx*128+tidx;
-
-  // 15 in unsigned char binary is 00001111. Perform bitwise & on 15 and input char data iiiirrrr
-  // to get real part 4 bit data.
-  // 0000rrrr
-  // Bit shift this result by 4 to the left.
-  // rrrr0000
-  // Cast to signed char.
-  // +-rrr0000
-  // Bitshift mantisa only to the right by 4 bits
-  // +-0000rrr
-  // Cast to float and use CUDA intrinsic to cast to signed half
-  inr[iidx] = __float2half((float)((char)((   (unsigned char)(input[iidx]) & (unsigned char)(15)  ) << 4) >> 4));
-
-  // 240 in unsigned char binary is 11110000. Perform bitwise & on 240 and input char data iiiirrrr
-  // to get imag part 4 bit data
-  // iiii0000.
-  // Cast to signed char
-  // +-iii0000
-  // Bitshift mantisa only to the right by 4 bits
-  // +-0000iii
-  // Cast to float and use CUDA intrinsic to cast to signed half
-  ini[iidx] = __float2half((float)((char)((   (unsigned char)(input[iidx]) & (unsigned char)(240)  )) >> 4));
-
-  // Both results should be half (FP16) integers between -8 and 7.
-  half re = inr[iidx];
-  half im = ini[iidx];
-  half lim = 2.;
-  if( (re > lim || re < -lim) || (im > lim || im < -lim)) {
-    //printf("re = %f, im = %f\n", __half2float(re), __half2float(im));
-  }
-}
-
-// transpose kernel
-// assume breakdown into tiles of 32x32, and run with 32x8 threads per block
-// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32)
-// here, width is the dimension of the fastest index
-template <typename in_prec, typename out_prec> __global__ void transpose_matrix(in_prec * idata, out_prec * odata) {
-
-  __shared__ in_prec tile[32][33];
-  
-  int x = blockIdx.x * 32 + threadIdx.x;
-  int y = blockIdx.y * 32 + threadIdx.y;
-  int width = gridDim.x * 32;
-
-  for (int j = 0; j < 32; j += 8)
-     tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
-
-  __syncthreads();
-
-  x = blockIdx.y * 32 + threadIdx.x;  // transpose block offset
-  y = blockIdx.x * 32 + threadIdx.y;
-  width = gridDim.y * 32;
 
-  for (int j = 0; j < 32; j += 8)
-     odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
-
-}
 
 
 // function to copy and reorder d_input to d_r and d_i
@@ -225,42 +169,14 @@ template <typename in_prec, typename out_prec> __global__ void transpose_matrix(
 // output is [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS]
 // starts by running transpose on [NPACKETS_PER_BLOCK * NANTS, NCHAN_PER_PACKET * 2 * 2] matrix in doubleComplex form.
 // then fluffs using simple kernel
-void reorder_input_device(char *input, char * tx, half *inr, half *ini) {
-
+void reorderInputCuda(dmem *d) {
+  
   // transpose input data
   dim3 dimBlock(32, 8), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32);
-  transpose_matrix<<<dimGrid,dimBlock>>>(input, tx);
-  corr_input_copy<<<NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128, 128>>>(tx, inr, ini);
+  transpose_matrix<<<dimGrid, dimBlock>>>(d->d_input, d->d_tx);
+  corr_input_copy<<<NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128, 128>>>(d->d_tx, (half*)d->d_r, (half*)d->d_i);
 }
 
-// kernel to help with reordering output
-// outr and outi are [NANTS, NANTS, NCHAN_PER_PACKET, 2time, 2pol, halfFac]
-// run with NCHAN_PER_PACKET*2*NBASE/128 blocks of 128 threads
-__global__ void corr_output_copy(half *outr, half *outi, float *output, int *indices_lookup) {
-
-  int bidx = blockIdx.x; // assume NCHAN_PER_PACKET*2*NBASE/128
-  int tidx = threadIdx.x; // assume 128
-  int idx = bidx*128+tidx;
-  
-  int baseline = (int)(idx / (NCHAN_PER_PACKET * 2));
-  int chpol = (int)(idx % (NCHAN_PER_PACKET * 2));
-  int ch = (int)(chpol / 2);
-  int base_idx = indices_lookup[baseline];
-  int iidx = base_idx * NCHAN_PER_PACKET + ch;
-  int pol = (int)(chpol % 2);
-
-  float v1=0., v2=0.;
-
-  // Use CUDA casting intrinsic __half2float
-  for (int i=0;i<halfFac;i++) {
-    v1 += __half2float(outr[(4*iidx+pol)*halfFac+i])+__half2float(outr[(4*iidx+2+pol)*halfFac+i]);
-    v2 += __half2float(outi[(4*iidx+pol)*halfFac+i])+__half2float(outi[(4*iidx+2+pol)*halfFac+i]);
-  }
-
-  output[2*idx] = v1;
-  output[2*idx+1] = v2;
-  
-}
 
 // kernels to reorder and fluff input data for beamformer
 // initial data is [NPACKETS_PER_BLOCK, (NANTS/2), NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]            
@@ -269,94 +185,24 @@ __global__ void corr_output_copy(half *outr, half *outi, float *output, int *ind
 // launch with dim3 dimBlock(16, 8) and dim3 dimGrid(Width/16, Height/16)
 // here, width=NCHAN_PER_PACKET/8 is the dimension of the fastest input index
 // dim3 dimBlock1(16, 8), dimGrid1(NCHAN_PER_PACKET/8/16, (NPACKETS_PER_BLOCK)*(NANTS/2)/16);
-__global__ void transpose_input_bf(double *idata, double *odata) {
-
-  __shared__ double tile[16][17][4];
-  
-  int x = blockIdx.x * 16 + threadIdx.x;
-  int y = blockIdx.y * 16 + threadIdx.y;
-  int width = gridDim.x * 16;
-
-  for (int j = 0; j < 16; j += 8) {
-    tile[threadIdx.y+j][threadIdx.x][0] = idata[4*((y+j)*width + x)];
-    tile[threadIdx.y+j][threadIdx.x][1] = idata[4*((y+j)*width + x)+1];
-    tile[threadIdx.y+j][threadIdx.x][2] = idata[4*((y+j)*width + x)+2];
-    tile[threadIdx.y+j][threadIdx.x][3] = idata[4*((y+j)*width + x)+3];
-  }
-  
-  __syncthreads();
+void transposeInputBeamformerCuda(double *idata, double *odata, std::vector<int> &dim_block_in,
+				  std::vector<int> &dim_grid_in) {
 
-  x = blockIdx.y * 16 + threadIdx.x;  // transpose block offset
-  y = blockIdx.x * 16 + threadIdx.y;
-  width = gridDim.y * 16;
-
-  for (int j = 0; j < 16; j += 8) {
-    odata[4*((y+j)*width + x)] = tile[threadIdx.x][threadIdx.y + j][0];
-    odata[4*((y+j)*width + x)+1] = tile[threadIdx.x][threadIdx.y + j][1];
-    odata[4*((y+j)*width + x)+2] = tile[threadIdx.x][threadIdx.y + j][2];
-    odata[4*((y+j)*width + x)+3] = tile[threadIdx.x][threadIdx.y + j][3];
-  }
+  // Create CUDA objects for launch
+  dim3 dim_block(dim_block_in[0], dim_block_in[1]);
+  dim3 dim_grid(dim_grid_in[0], dim_grid_in[1]);
 
+  // Launch kernel
+  transpose_input_beamformer<<<dim_grid, dim_block>>>(idata, odata);
 }
 
-// kernel to populate an instance of weights matrix [2, (NCHAN_PER_PACKET/8), NBEAMS/2, 4times*(NANTS/2)*8chan*2tim*2pol]
-// run with 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128 blocks of 128 threads
-__global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs) {
-
-  int bidx = blockIdx.x;
-  int tidx = threadIdx.x;
-  int inidx = bidx*128+tidx;  
-  
-  // 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)
-  
-  // get indices
-  int iArm = (int)(inidx / ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)));
-  int iidx = (int)(inidx % ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)));
-  int fq = (int)(iidx / (128*(NANTS/2)*(NBEAMS/2)));
-  int idx = (int)(iidx % (128*(NANTS/2)*(NBEAMS/2)));
-  int bm = (int)(idx / (128*(NANTS/2)));
-  int tactp = (int)(idx % (128*(NANTS/2)));
-  //int t = (int)(tactp / (32*(NANTS/2)));
-  int actp = (int)(tactp % (32*(NANTS/2)));
-  int a = (int)(actp / 32);
-  int ctp = (int)(actp % 32);
-  //int c = (int)(ctp / 4);
-  int tp = (int)(ctp % 4);
-  //int t2 = (int)(tp / 2);
-  int pol = (int)(tp % 2);
-  int widx = (a+48*iArm)*(NCHAN_PER_PACKET/8)*2*2 + fq*2*2 + pol*2;
-  
-  // calculate weights
-  float theta, afac, twr, twi;
-  if (iArm==0) {
-    theta = sep*(127.-bm*1.)*PI/10800.; // radians
-    afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate
-    twr = cos(afac*antpos_e[a+48*iArm]);
-    twi = sin(afac*antpos_e[a+48*iArm]);
-    wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1]));
-    wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1]));
-    //wr[inidx] = __float2half(calibs[widx]);
-    //wi[inidx] = __float2half(calibs[widx+1]);
-  }
-  if (iArm==1) {
-    theta = sep*(127.-bm*1.)*PI/10800.; // radians
-    afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate
-    twr = cos(afac*antpos_n[a+48*iArm]);
-    twi = sin(afac*antpos_n[a+48*iArm]);
-    wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1]));
-    wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1]));
-    //wr[inidx] = __float2half(calibs[widx]);
-    //wi[inidx] = __float2half(calibs[widx+1]);
-  }
-    
-}
 
 // GPU-powered function to populate weights matrix for beamformer
 // file format:
 // sequential pairs of eastings and northings
 // then [NANTS, 48, R/I] calibs
 
-void calc_weights(dmem *d) {
+void calcWeightsCuda(dmem *d) {
 
   // allocate
   float *antpos_e = (float *)malloc(sizeof(float)*NANTS);
@@ -405,7 +251,7 @@ void calc_weights(dmem *d) {
   cudaMemcpy(d_calibs,calibs,NANTS*(NCHAN_PER_PACKET/8)*2*2*sizeof(float),cudaMemcpyHostToDevice);
 
   // run kernel to populate weights matrix
-  populate_weights_matrix<<<2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128,128>>>(d_antpos_e,d_antpos_n,d_calibs,d->weights_r,d->weights_i,d->d_freqs);  
+  populate_weights_matrix<<<2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128, 128>>>(d_antpos_e, d_antpos_n, d_calibs, (half*)d->weights_r, (half*)d->weights_i, d->d_freqs);  
   
   // free stuff
   cudaFree(d_antpos_e);
@@ -419,83 +265,61 @@ void calc_weights(dmem *d) {
 
 // kernel to fluff input bf data
 // run with NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 blocks of 128 threads
-__global__ void fluff_input_bf(char * input, half * dr, half * di) {
-
-  int bidx = blockIdx.x; // assume NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128
-  int tidx = threadIdx.x; // assume 128
-  int idx = bidx*128+tidx;
+void fluffInputBeamformerCuda(char *input, void *b_real, void *b_imag, int blocks, int tpb) {
 
-  dr[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(15)) << 4) >> 4)));
-  di[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(240))) >> 4)));
-
-  // Both results should be half (FP16) integers between -8 and 7.
-  //half re = dr[idx];
-  //half im = di[idx];
-  //half lim = 0;
-  //if( (re > lim || re < -lim) || (im > lim || im < -lim)) {
-  //printf("re = %f, im = %f\n", __half2float(re), __half2float(im));
-  //}
-
-  
+  // Launch kernel
+  fluff_input_beamformer<<<blocks, tpb>>>(input, (half*)b_real, (half*)b_imag);  
 }
 
 // transpose, add and scale kernel for bf
 // assume breakdown into tiles of 16x16, and run with 16x8 threads per block
 // launch with dim3 dimBlock(16, 8) and dim3 dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16)
 // scf is a per-beam scale factor to enable recasting as unsigned char
-__global__ void transpose_scale_bf(half * ir, half * ii, unsigned char * odata) {
-
-  __shared__ float tile[16][17];
+void transposeScaleBeamformerCuda(void *ir, void *ii, unsigned char *odata, std::vector<int> &dim_block_in,
+				  std::vector<int> &dim_grid_in) {
   
-  int x = blockIdx.x * 16 + threadIdx.x;
-  int y = blockIdx.y * 16 + threadIdx.y;
-  int width = gridDim.x * 16;
-  float dr, di;
-
-  for (int j = 0; j < 16; j += 8) {
-    dr = (float)(ir[(y+j)*width + x]);
-    di = (float)(ii[(y+j)*width + x]);
-    tile[threadIdx.y+j][threadIdx.x] = (dr*dr+di*di);
-  }
-
-  __syncthreads();
+  // Create CUDA objects for launch
+  dim3 dim_block(dim_block_in[0], dim_block_in[1]);
+  dim3 dim_grid(dim_grid_in[0], dim_grid_in[1]);
+  
+  // Launch kernel
+  transpose_scale_beamformer<<<dim_grid, dim_block>>>((half*)ir, (half*)ii, odata);
+}
 
-  x = blockIdx.y * 16 + threadIdx.x;  // transpose block offset
-  y = blockIdx.x * 16 + threadIdx.y;
-  width = gridDim.y * 16;
+// sum over all times in output beam array
+// run with (NCHAN_PER_PACKET/8)*(NBEAMS/2) blocks of (NPACKETS_PER_BLOCK/4) threads
+void sumBeamCuda(unsigned char *input, float *output, int blocks, int tpb) {
 
-  for (int j = 0; j < 16; j += 8)
-    odata[(y+j)*width + x] = (unsigned char)(tile[threadIdx.x][threadIdx.y + j]/128.);
+  // Launch kernel
+  sum_beam<<<blocks,tpb>>>(input, output);  
+}
 
+void dsaXmemsetCuda(void *array, int ch, size_t n){
+  cudaMemset(array, ch, n);
 }
 
-// sum over all times in output beam array
-// run with (NCHAN_PER_PACKET/8)*(NBEAMS/2) blocks of (NPACKETS_PER_BLOCK/4) threads
-__global__ void sum_beam(unsigned char * input, float * output) {
-
-  __shared__ float summ[512];
-  int bidx = blockIdx.x;
-  int tidx = threadIdx.x;
-  //int idx = bidx*256+tidx;
-  int bm = (int)(bidx/48);
-  int ch = (int)(bidx % 48);
-
-  summ[tidx] = (float)(input[bm*256*48 + tidx*48 + ch]);
-
-  __syncthreads();
-
-  if (tidx<256) {
-    summ[tidx] += summ[tidx+256];
-    summ[tidx] += summ[tidx+128];
-    summ[tidx] += summ[tidx+64];
-    summ[tidx] += summ[tidx+32];
-    summ[tidx] += summ[tidx+16];
-    summ[tidx] += summ[tidx+8];
-    summ[tidx] += summ[tidx+4];
-    summ[tidx] += summ[tidx+2];
-    summ[tidx] += summ[tidx+1];
+void dsaXmemcpyCuda(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind){
+  cudaError error = cudaSuccess;
+  switch(kind) {
+  case dsaXMemcpyHostToHost:
+    error = cudaMemcpy(array_out, array_in, n, cudaMemcpyHostToHost);
+    break;
+  case dsaXMemcpyHostToDevice:
+   error = cudaMemcpy(array_out, array_in, n, cudaMemcpyHostToDevice);
+   break;
+  case dsaXMemcpyDeviceToHost:
+    error = cudaMemcpy(array_out, array_in, n, cudaMemcpyDeviceToHost);
+    break;
+  case dsaXMemcpyDeviceToDevice:
+    error = cudaMemcpy(array_out, array_in, n, cudaMemcpyDeviceToDevice);
+    break;
+  default:
+    std::cout << "dsaX error: unknown dsaXMemcpyKind" << std::endl;
   }
+  if(error != cudaSuccess) cudaGetLastError();
+}
 
-  if (tidx==0) output[bidx] = summ[tidx];
-  
+void dsaXDeviceSynchronizeCuda() {
+  cudaDeviceSynchronize();
 }
+
diff --git a/src/dsaX_interface.cpp b/src/dsaX_interface.cpp
new file mode 100644
index 0000000..c0c461c
--- /dev/null
+++ b/src/dsaX_interface.cpp
@@ -0,0 +1,69 @@
+#include <iostream>
+#include <vector>
+#include <cstring>
+
+#include "dsaX_cuda_interface.h"
+#include "dsaX_ftd.h"
+
+using namespace std;
+
+void dsaXCorrelator(void *output_data, void *input_data) {  
+  dmem d;
+  int bf = 0;
+#if DSA_XENGINE_TARGET_CUDA
+  initializeCudaMemory(&d, bf);
+  d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
+  memcpy(d.h_input, (char*)input_data, sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
+  dcorrelator(&d);
+#else
+  std::cout << "dsaX error: not implemented" << std::endl;
+#endif
+}
+
+void reorderInput(dmem *d) {
+#if DSA_XENGINE_TARGET_CUDA
+  reorderInputCuda(d);
+#else
+  std::cout << "dsaX error: not implemented" << std::endl;
+#endif
+}
+
+void reorderOutput(dmem *d) {
+#if DSA_XENGINE_TARGET_CUDA  
+  reorderOutputCuda(d);
+#else
+  std::cout << "dsaX error: not implemented" << std::endl;
+#endif
+}
+
+void transposeInputBeamformer(double *input, double *output, std::vector<int> &dimBlock, std::vector<int> &dimGrid) {
+#if DSA_XENGINE_TARGET_CUDA
+  transposeInputBeamformerCuda(input, output, dimBlock, dimGrid);
+#else
+  std::cout << "dsaX error: not implemented" << std::endl;
+#endif
+}
+
+void transposeScaleBeamformer(void *real, void *imag, unsigned char *output, std::vector<int> &dimBlock, std::vector<int> &dimGrid) {
+#if DSA_XENGINE_TARGET_CUDA
+  transposeScaleBeamformerCuda(real, imag, output, dimBlock, dimGrid);
+#else
+  std::cout << "dsaX error: not implemented" << std::endl;
+#endif
+}
+
+void fluffInputBeamformer(char *input, void *array_real, void *array_imag, int blocks, int tpb) {
+#if DSA_XENGINE_TARGET_CUDA
+  fluffInputBeamformerCuda(input, array_real, array_imag, blocks, tpb);
+#else
+  std::cout << "dsaX error: not implemented" << std::endl;
+#endif
+}
+
+void sumBeam(unsigned char *input, float *output, int blocks, int tpb) {
+#if DSA_XENGINE_TARGET_CUDA
+  sumBeamCuda(input, output, blocks, tpb);
+#else
+  std::cout << "dsaX error: not implemented" << std::endl;
+#endif
+}
diff --git a/src/dsaX_magma_interface.cu b/src/dsaX_magma_interface.cu
new file mode 100644
index 0000000..8f86525
--- /dev/null
+++ b/src/dsaX_magma_interface.cu
@@ -0,0 +1,23 @@
+#include <iostream>
+#include "dsaX.h"
+#include "dsaX_cuda_headers.h"
+
+#include "magma_v2.h"
+
+using namespace std;
+
+void dsaXHgemmStridedBatchedMagma(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam blas_param) {
+#if defined (DSA_XENGINE_TARGET_CUDA)
+#if defined (DSA_XENGINE_ENABLE_MAGMA)
+
+  // TO DO
+  
+#else
+  std::cout "dsaX not built with MAGMA. Rebuild with CMake param DSA_XENGINE_ENABLE_MAGMA=ON" << std::endl;
+  exit(0);
+#endif
+#else
+  std::cout "dsaX not built with CUDA target. Rebuild with CMake param DSA_XENGINE_TARGET_TYPE=CUDA" << std::endl;
+  exit(0);
+#endif
+}
diff --git a/src/dsaX_psrdada_utils.cpp b/src/dsaX_psrdada_utils.cpp
new file mode 100644
index 0000000..07c16e6
--- /dev/null
+++ b/src/dsaX_psrdada_utils.cpp
@@ -0,0 +1,11 @@
+#include "dsaX_psrdada_utils.h"
+
+void dsaX_dbgpu_cleanup(dada_hdu_t * in, dada_hdu_t * out)
+{
+  if (dada_hdu_unlock_read (in) < 0) syslog(LOG_ERR, "could not unlock read on hdu_in");
+  dada_hdu_destroy (in);
+  
+  if (dada_hdu_unlock_write (out) < 0) syslog(LOG_ERR, "could not unlock write on hdu_out");
+  dada_hdu_destroy (out);
+  
+} 
diff --git a/src/dsaX_utils.cpp b/src/dsaX_utils.cpp
index fc0345a..54e849a 100644
--- a/src/dsaX_utils.cpp
+++ b/src/dsaX_utils.cpp
@@ -1,39 +1,29 @@
 #include "dsaX_utils.h"
-#ifdef DSA_XENGINE_TARGET_CUDA
-#include "dsaX_cuda_headers.h"
-#endif
+#include "dsaX_enums.h"
+#include "dsaX_cuda_interface.h"
 
 void dsaXmemset(void *array, int ch, size_t n){
 #ifdef DSA_XENGINE_TARGET_CUDA
-  cudaMemset(array, ch, n);
+  dsaXmemsetCuda(array, ch, n);
 #else
-  emset(array, ch, n);
+  memset(array, ch, n);
 #endif
 }
 
-void dsaXmemcpyHostToDevice(void *array_device, void *array_host, size_t n){
+void dsaXmemcpy(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind){
 #ifdef DSA_XENGINE_TARGET_CUDA
   // Perform host to device memcopy on data
-  cudaMemcpy(array_device, array_host, n, cudaMemcpyHostToDevice);
+  dsaXmemcpyCuda(array_out, array_in, n, kind);
 #else  
-  memcpy(array_device, array_host, n);
+  memcpy(array_out, array_in, n);
 #endif
 }
 
-void dsaXmemcpyDeviceToHost(void *array_host, void *array_device, size_t n){
+void dsaXDeviceSynchronize() {
 #ifdef DSA_XENGINE_TARGET_CUDA
   // Perform host to device memcopy on data
-  cudaMemcpy(array_host, array_device, n, cudaMemcpyDeviceToHost);
-#else
-  memcpy(array_host, array_device, n);
-#endif
-}
-
-void dsaXmemcpyDeviceToDevice(void *array_copy_to, void *array_copy_from, size_t n){
-#ifdef DSA_XENGINE_TARGET_CUDA
-  // Perform device to device memcopy on data
-  cudaMemcpy(array_copy_to, array_copy_from, n, cudaMemcpyDeviceToDevice);
-#else
-  memcpy(array_copy_to, array_copy_from, n);
+  dsaXDeviceSynchronizeCuda();
+#else  
+  // NO OP
 #endif
 }
diff --git a/src/version.cpp b/src/version.cpp
new file mode 100644
index 0000000..1c8114b
--- /dev/null
+++ b/src/version.cpp
@@ -0,0 +1,5 @@
+#ifdef GITVERSION
+const char* gitversion = GITVERSION ;
+#else
+const char* gitversion;
+#endif
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 4a45a24..9320850 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,5 +1,6 @@
 #DMH: fix include path
-include_directories(../include)
+include_directories(${CMAKE_SOURCE_DIR}/include)
 include_directories(${CLI11_SOURCE_DIR}/include/CLI)
-add_executable(dsaX_correlator_test dsaX_correlator_test.cpp)
 
+add_executable(dsaX_correlator_test dsaX_correlator_test.cpp)
+target_link_libraries(dsaX_correlator_test dsax)
diff --git a/tests/CMakeLists.txt~ b/tests/CMakeLists.txt~
deleted file mode 100644
index f72156b..0000000
--- a/tests/CMakeLists.txt~
+++ /dev/null
@@ -1,5 +0,0 @@
-
-#include_directories(../include)
-include_directories(${CLI11_SOURCE_DIR}/src)
-add_executable(dsaX_beamformer_correlator_test dsaX_beamformer_correlator_test.cpp)
-
diff --git a/tests/command_line_params.cpp b/tests/command_line_params.cpp
new file mode 100644
index 0000000..c067ced
--- /dev/null
+++ b/tests/command_line_params.cpp
@@ -0,0 +1,17 @@
+#include <command_line_params.h>
+
+void usage() {
+  fprintf (stdout,
+	   "dsaX_beamformer_correlator [options]\n"
+	   " -c core   bind process to CPU core [no default]\n"
+	   " -d send debug messages to syslog\n"
+	   " -i in_key [default REORDER_BLOCK_KEY]\n"
+	   " -o out_key [default XGPU_BLOCK_KEY]\n"
+	   " -b run beamformer [default is to run correlator]\n"
+	   " -h print usage\n"
+	   " -t binary file for test mode\n"
+	   " -f flagants file\n"
+	   " -a calib file\n"
+	   " -s start frequency (assumes -0.244140625MHz BW)\n");
+}
+
diff --git a/tests/dsaX_beamformer_correlator_test.cpp~ b/tests/dsaX_beamformer_correlator_test.cpp~
deleted file mode 100644
index 30184b3..0000000
--- a/tests/dsaX_beamformer_correlator_test.cpp~
+++ /dev/null
@@ -1,398 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <time.h>
-#include <math.h>
-#include <string.h>
-
-// Include the dsaX.h header in your application
-//#include <dsaX.h>
-
-int main(int argc, char **argv) {
-
-  // startup syslog message
-  // using LOG_LOCAL0
-  openlog ("dsaX_bfCorr", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL0);
-  syslog (LOG_NOTICE, "Program started by User %d", getuid ());
-  
-  // DADA Header plus Data Unit 
-  dada_hdu_t* hdu_in = 0;
-  dada_hdu_t* hdu_out = 0;
-
-  // data block HDU keys
-  key_t in_key = REORDER_BLOCK_KEY;
-  key_t out_key = XGPU_BLOCK_KEY;
-  
-  // command line arguments
-  int core = -1;
-  int arg = 0;
-  int bf = 0;
-  int test = 0;
-  char ftest[200], fflagants[200], fcalib[200];
-  float sfreq = 1498.75;
-  
-  while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:bdh")) != -1)
-    {
-      switch (arg)
-	{
-	case 'c':
-	  if (optarg)
-	    {
-	      core = atoi(optarg);
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-c flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'i':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &in_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-i flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'o':
-	  if (optarg)
-	    {
-	      if (sscanf (optarg, "%x", &out_key) != 1) {
-		syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-o flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 't':
-	  if (optarg)
-            {
-	      test = 1;
-	      syslog(LOG_INFO, "test mode");
-	      if (sscanf (optarg, "%s", &ftest) != 1) {
-		syslog(LOG_ERR, "could not read test file name from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-t flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'a':
-	  if (optarg)
-            {
-	      syslog(LOG_INFO, "read calib file %s",optarg);
-	      if (sscanf (optarg, "%s", &fcalib) != 1) {
-		syslog(LOG_ERR, "could not read calib file name from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-a flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'f':
-	  if (optarg)
-            {
-	      syslog(LOG_INFO, "reading flag ants file %s",optarg);
-	      if (sscanf (optarg, "%s", &fflagants) != 1) {
-		syslog(LOG_ERR, "could not read flagants file name from %s\n", optarg);
-		return EXIT_FAILURE;
-	      }
-	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-f flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 's':
-	  if (optarg)
-            {
-	      sfreq = atof(optarg);
-	      syslog(LOG_INFO, "start freq %g",sfreq);
- 	      break;
-	    }
-	  else
-	    {
-	      syslog(LOG_ERR,"-s flag requires argument");
-	      usage();
-	      return EXIT_FAILURE;
-	    }
-	case 'd':
-	  //DEBUG=1;
-	  syslog (LOG_DEBUG, "Will excrete all debug messages");
-	  break;
-	case 'b':
-	  bf=1;
-	  syslog (LOG_NOTICE, "Running beamformer, NOT correlator");
-	  break;
-	case 'h':
-	  usage();
-	  return EXIT_SUCCESS;
-	}
-    }
-
-  // Bind to cpu core
-  if (core >= 0) {
-    if (dada_bind_thread_to_core(core) < 0)
-      syslog(LOG_ERR,"failed to bind to core %d", core);
-    syslog(LOG_NOTICE,"bound to core %d", core);
-  }
-
-  /*
-  // allocate device memory
-  dmem d;
-  initialize_device_memory(&d,bf);
-
-  // set up for beamformer
-  FILE *ff;
-  int iii;
-  if (bf) {
-
-    if (!(ff=fopen(fflagants,"r"))) {
-      syslog(LOG_ERR,"could not open flagants file\n");
-      exit(1);
-    }
-    d.nflags=0;
-    while (!feof(ff)) {
-      fscanf(ff,"%d\n",&d.flagants[iii]);
-      d.nflags++;
-    }
-    fclose(ff);
-
-    if (!(ff=fopen(fcalib,"rb"))) {
-      syslog(LOG_ERR,"could not open calibss file\n");
-      exit(1);
-    }
-    fread(d.h_winp,NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2,4,ff);
-    fclose(ff);
-
-    for (iii=0;iii<(NCHAN_PER_PACKET/8);iii++)
-      d.h_freqs[iii] = 1e6*(sfreq-iii*250./1024.);
-    cudaMemcpy(d.d_freqs,d.h_freqs,sizeof(float)*(NCHAN_PER_PACKET/8),cudaMemcpyHostToDevice);
-
-    // calculate weights
-    calc_weights(&d);
-    
-  }
-
-  // test mode
-  FILE *fin, *fout;
-  uint64_t output_size;
-  char * output_data;//, * o1;
-  if (test) {
-
-    // read one block of input data    
-    d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
-    for (int i=0;i<512;i++) {
-      fin = fopen(ftest,"rb");
-      fread(d.h_input+i*4*NANTS*NCHAN_PER_PACKET*2*2,4*NANTS*NCHAN_PER_PACKET*2*2,1,fin);
-      fclose(fin);
-    }
-
-    // run correlator or beamformer, and output data
-    if (bf==0) {
-      if (DEBUG) syslog(LOG_INFO,"run correlator");
-      dcorrelator(&d);
-      if (DEBUG) syslog(LOG_INFO,"copy to host");
-      output_size = NBASE*NCHAN_PER_PACKET*2*2*4;
-      output_data = (char *)malloc(output_size);
-      cudaMemcpy(output_data,d.d_output,output_size,cudaMemcpyDeviceToHost);
-
-      fout = fopen("output.dat","wb");
-      fwrite((float *)output_data,sizeof(float),NBASE*NCHAN_PER_PACKET*2*2,fout);
-      fclose(fout);
-    }
-    else {
-      if (DEBUG) syslog(LOG_INFO,"run beamformer");
-      dbeamformer(&d);
-      if (DEBUG) syslog(LOG_INFO,"copy to host");
-      output_size = (NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS;
-      output_data = (char *)malloc(output_size);
-      cudaMemcpy(output_data,d.d_bigpower,output_size,cudaMemcpyDeviceToHost);
-
-      // output_size = 2*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8);
-      // o1 = (char *)malloc(output_size);
-      // cudaMemcpy(o1,d.weights_r,output_size,cudaMemcpyDeviceToHost);
-	
-      
-
-      fout = fopen("output.dat","wb");
-      fwrite((unsigned char *)output_data,sizeof(unsigned char),output_size,fout);
-      //fwrite(o1,1,output_size,fout);
-      fclose(fout);
-    }
-
-	
-    // free
-    free(d.h_input);
-    free(output_data);
-    //free(o1);
-    deallocate_device_memory(&d,bf);
-
-    exit(1);
-  }
-  
-
-
-  
-  // DADA stuff
-  
-  syslog (LOG_INFO, "creating in and out hdus");
-  
-  hdu_in  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_in, in_key);
-  if (dada_hdu_connect (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not connect to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_read (hdu_in) < 0) {
-    syslog (LOG_ERR,"could not lock to dada buffer in");
-    return EXIT_FAILURE;
-  }
-  
-  hdu_out  = dada_hdu_create (0);
-  dada_hdu_set_key (hdu_out, out_key);
-  if (dada_hdu_connect (hdu_out) < 0) {
-    syslog (LOG_ERR,"could not connect to output  buffer");
-    return EXIT_FAILURE;
-  }
-  if (dada_hdu_lock_write(hdu_out) < 0) {
-    syslog (LOG_ERR, "could not lock to output buffer");
-    return EXIT_FAILURE;
-  }
-
-  uint64_t header_size = 0;
-
-  // deal with headers
-  char * header_in = ipcbuf_get_next_read (hdu_in->header_block, &header_size);
-  if (!header_in)
-    {
-      syslog(LOG_ERR, "could not read next header");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  if (ipcbuf_mark_cleared (hdu_in->header_block) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block cleared");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  
-  char * header_out = ipcbuf_get_next_write (hdu_out->header_block);
-  if (!header_out)
-    {
-      syslog(LOG_ERR, "could not get next header block [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-  memcpy (header_out, header_in, header_size);
-  if (ipcbuf_mark_filled (hdu_out->header_block, header_size) < 0)
-    {
-      syslog (LOG_ERR, "could not mark header block filled [output]");
-      dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-      return EXIT_FAILURE;
-    }
-
-  syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");  
-  
-  // get block sizes and allocate memory
-  uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
-  uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
-  if (bf==0) 
-    syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,NBASE*NCHAN_PER_PACKET*2*2*4);
-  else
-    syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS);
-  uint64_t  bytes_read = 0;
-  //char * block;
-  char * output_buffer;
-  output_buffer = (char *)malloc(block_out);
-  uint64_t written, block_id;
-  
-  // get things started
-  bool observation_complete=0;
-  //bool started = 0;
-  syslog(LOG_INFO, "starting observation");
-  int blocks = 0;
-  //clock_t begin, end;
-  //double time_spent;
-  
-  while (!observation_complete) {
-
-    if (DEBUG) syslog(LOG_INFO,"reading block");    
-    
-    // open block
-    d.h_input = ipcio_open_block_read (hdu_in->data_block, &bytes_read, &block_id);
-
-    // do stuff
-    //begin = clock();
-    // loop
-    if (bf==0) {
-      if (DEBUG) syslog(LOG_INFO,"run correlator");
-      dcorrelator(&d);
-      if (DEBUG) syslog(LOG_INFO,"copy to host");
-      cudaMemcpy(output_buffer,d.d_output,block_out,cudaMemcpyDeviceToHost);
-    }
-    else {
-      if (DEBUG) syslog(LOG_INFO,"run beamformer");
-      dbeamformer(&d);
-      if (DEBUG) syslog(LOG_INFO,"copy to host");
-      cudaMemcpy(output_buffer,d.d_bigpower,block_out,cudaMemcpyDeviceToHost);
-    }
-    //end = clock();
-    //time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
-    cout << "spent time " << d.cp << " " << d.prep << " " << d.cubl << " " << d.outp << " s" << endl;
-    
-    // write to output
-
-    // write to host
-    written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out);
-    if (written < block_out)
-      {
-	syslog(LOG_ERR, "main: failed to write all data to datablock [output]");
-	dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-	return EXIT_FAILURE;
-      }
-    
-    if (DEBUG) syslog(LOG_INFO, "written block %d",blocks);	    
-    blocks++;
-    // loop end
-    
-      
-    // finish up
-    if (bytes_read < block_size)
-      observation_complete = 1;
-    
-    ipcio_close_block_read (hdu_in->data_block, bytes_read);
-    
-  }
-
-  // finish up
-  free(output_buffer);
-  deallocate_device_memory(&d,bf);
-  dsaX_dbgpu_cleanup (hdu_in, hdu_out);
-  
-  return 0;
-  */
-}
diff --git a/tests/dsaX_correlator_test.cpp b/tests/dsaX_correlator_test.cpp
index b0560fc..a1d96c5 100644
--- a/tests/dsaX_correlator_test.cpp
+++ b/tests/dsaX_correlator_test.cpp
@@ -7,8 +7,8 @@
 #include <string.h>
 #include <syslog.h>
 
-// Include the dsaX_interface.h header in your application
-#include <dsaX_interface.h>
+// Include the dsaX.h header in your application
+#include <dsaX.h>
 
 using namespace std;
 
@@ -110,12 +110,11 @@ int main(int argc, char **argv) {
 	  return EXIT_FAILURE;
 	}
 	break;
-      } else
-	{
-	  syslog(LOG_ERR,"-f flag requires argument");
-	  usage();
-	  return EXIT_FAILURE;
-	}
+      } else {
+	syslog(LOG_ERR,"-f flag requires argument");
+	usage();
+	return EXIT_FAILURE;
+      }
     case 's':
       if (optarg) {
 	sfreq = atof(optarg);
@@ -151,45 +150,42 @@ int main(int argc, char **argv) {
   std::cout << "Expected size of data array = " << (unsigned long long)(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*NPOL*NCOMPLEX) << std::endl;
   std::cout << "Expected size of input array = " << (unsigned long long)(sizeof(char)*4*NANTS*NCHAN_PER_PACKET*NPOL*NCOMPLEX) << std::endl;
   
-#if 0
-  dsaX_init();
-  
-  // allocate device memory
-  dmem d;
-  initialize_device_memory(&d, bf);
-
+  //dsaX_init();  
   FILE *fin, *fout;
-  uint64_t output_size;
-  char * output_data;
+  std::cout << "Creating float output_array of size " << sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2*4 << std::endl;
+  uint64_t output_size = sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2*4;
+  std::cout << "Creating char input_array of size " << sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2 << std::endl;
+  uint64_t input_size = sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2;
 
+  float *output_data = (float *)malloc(output_size);
+  char *input_data = (char *)malloc(input_size);
+  
   // read one block of input data    
-  d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
   for (int i=0;i<512;i++) {
     fin = fopen(ftest,"rb");
-    fread(d.h_input+i*4*NANTS*NCHAN_PER_PACKET*2*2,4*NANTS*NCHAN_PER_PACKET*2*2,1,fin);
+    fread(input_data + i*4*NANTS*NCHAN_PER_PACKET*2*2, 4*NANTS*NCHAN_PER_PACKET*2*2, 1, fin);
     fclose(fin);
   }
+
+  // Peek at input data (delete after development is complete)
+  for (int i=0; i<10; i++) if(input_data[i] != 0) std::cout << "input[" << i <<"] = " << (float)input_data[i] << std::endl; 
   
-  // run correlator or beamformer, and output data
+  // run correlator and record output data
   syslog(LOG_INFO,"run correlator");
-  dcorrelator(&d);
-  syslog(LOG_INFO,"copy to host");
-  output_size = NBASE*NCHAN_PER_PACKET*2*2*4;
-  output_data = (char *)malloc(output_size);
-  cudaMemcpy(output_data,d.d_output,output_size,cudaMemcpyDeviceToHost);
+  dsaXCorrelator((void*)output_data, (void*)input_data);
+
+  // Peek at output data (delete after development is complete)
+  //for (int i=0; i<NBASE*NCHAN_PER_PACKET*2*2; i++) if(output_data[i] != 0) std::cout << "output " << i << " = " << output_data[i] << std::endl; 
   
   fout = fopen("output.dat","wb");
-  fwrite((float *)output_data,sizeof(float),NBASE*NCHAN_PER_PACKET*2*2,fout);
+  fwrite((float *)output_data, sizeof(float), NBASE*NCHAN_PER_PACKET*2*2, fout);
   fclose(fout);
   
   // free
-  free(d.h_input);
+  free(input_data);
   free(output_data);
-  //free(o1);
-  deallocate_device_memory(&d,bf);
-  dsaX_end();
+  //dsaX_end();
   
   return 0;
-#endif
 }
 

From 047722f70f3f63be39ef4b4091c54031a17a0792 Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Tue, 25 Jun 2024 22:57:18 -0700
Subject: [PATCH 21/30] build tweaks

---
 CMakeLists.txt                 | 2 +-
 include/dsaX_cuda_headers.h    | 2 ++
 include/dsaX_magma_headers.h   | 5 +++++
 src/dsaX_magma_interface.cu    | 7 +++----
 tests/dsaX_correlator_test.cpp | 2 +-
 5 files changed, 12 insertions(+), 6 deletions(-)
 create mode 100644 include/dsaX_magma_headers.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index acfd1a3..e3cf1b0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -242,7 +242,7 @@ if(DSA_XENGINE_DOWNLOAD_PSRDADA)
   FetchContent_MakeAvailable(PSRDada)
 else()
   # Find and link to local install
-  find_package(psrdada REQUIRED)
+  find_package(PSRDada REQUIRED)
 endif()
 
 # Get CLI11 dependency
diff --git a/include/dsaX_cuda_headers.h b/include/dsaX_cuda_headers.h
index acc838d..333a5bc 100644
--- a/include/dsaX_cuda_headers.h
+++ b/include/dsaX_cuda_headers.h
@@ -1,6 +1,8 @@
 #pragma once
 
+#if defined (DSA_XENGINE_TARGET_CUDA)
 #include <cuda.h>
 #include "cuda_fp16.h"
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
+#endif
diff --git a/include/dsaX_magma_headers.h b/include/dsaX_magma_headers.h
new file mode 100644
index 0000000..e9750c8
--- /dev/null
+++ b/include/dsaX_magma_headers.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#if defined (DSA_XENGINE_ENABLE_MAGMA) 
+#include "magma_v2.h"
+#endif
diff --git a/src/dsaX_magma_interface.cu b/src/dsaX_magma_interface.cu
index 8f86525..14a8f4f 100644
--- a/src/dsaX_magma_interface.cu
+++ b/src/dsaX_magma_interface.cu
@@ -1,8 +1,7 @@
 #include <iostream>
 #include "dsaX.h"
 #include "dsaX_cuda_headers.h"
-
-#include "magma_v2.h"
+#include "dsaX_magma_headers.h"
 
 using namespace std;
 
@@ -13,11 +12,11 @@ void dsaXHgemmStridedBatchedMagma(void *real_a, void *imag_a, void *real_b, void
   // TO DO
   
 #else
-  std::cout "dsaX not built with MAGMA. Rebuild with CMake param DSA_XENGINE_ENABLE_MAGMA=ON" << std::endl;
+  std::cout << "dsaX not built with MAGMA. Rebuild with CMake param DSA_XENGINE_ENABLE_MAGMA=ON" << std::endl;
   exit(0);
 #endif
 #else
-  std::cout "dsaX not built with CUDA target. Rebuild with CMake param DSA_XENGINE_TARGET_TYPE=CUDA" << std::endl;
+  std::cout << "dsaX not built with CUDA target. Rebuild with CMake param DSA_XENGINE_TARGET_TYPE=CUDA" << std::endl;
   exit(0);
 #endif
 }
diff --git a/tests/dsaX_correlator_test.cpp b/tests/dsaX_correlator_test.cpp
index a1d96c5..e9d0192 100644
--- a/tests/dsaX_correlator_test.cpp
+++ b/tests/dsaX_correlator_test.cpp
@@ -168,7 +168,7 @@ int main(int argc, char **argv) {
   }
 
   // Peek at input data (delete after development is complete)
-  for (int i=0; i<10; i++) if(input_data[i] != 0) std::cout << "input[" << i <<"] = " << (float)input_data[i] << std::endl; 
+  //for (int i=0; i<10; i++) if(input_data[i] != 0) std::cout << "input[" << i <<"] = " << (float)input_data[i] << std::endl; 
   
   // run correlator and record output data
   syslog(LOG_INFO,"run correlator");

From 95f751203a6f0bd9802da08e9927cc017482e4f1 Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Wed, 26 Jun 2024 16:54:34 -0700
Subject: [PATCH 22/30] fix bug in beamformer, add util to inspect char length
 data in test

---
 src/dsaX_beamformer.cpp        |  2 +-
 tests/dsaX_correlator_test.cpp | 12 +++++++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/dsaX_beamformer.cpp b/src/dsaX_beamformer.cpp
index f395b0e..f82f677 100644
--- a/src/dsaX_beamformer.cpp
+++ b/src/dsaX_beamformer.cpp
@@ -105,7 +105,7 @@ void dbeamformer(dmem *d) {
     // Reuse dimBlock
     //DMH: Abstract kernel launch parameters
     dimGrid[0] = (NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16;
-    dimGrid[0] = (NCHAN_PER_PACKET/8)/16;
+    dimGrid[1] = (NCHAN_PER_PACKET/8)/16;
     begin = clock();
     transposeScaleBeamformer(d->d_bigbeam_r, d->d_bigbeam_i, d->d_bigpower + iArm*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2), dimBlock, dimGrid);
     end = clock();
diff --git a/tests/dsaX_correlator_test.cpp b/tests/dsaX_correlator_test.cpp
index a1d96c5..b705975 100644
--- a/tests/dsaX_correlator_test.cpp
+++ b/tests/dsaX_correlator_test.cpp
@@ -26,6 +26,13 @@ void usage() {
 	   " -s start frequency (assumes -0.244140625MHz BW)\n");
 }
 
+void inspectPackedData(char input) {
+  
+  std::cout << "vals = (" << (float)((char)((   (unsigned char)(input) & (unsigned char)(15)  ) << 4) >> 4) << ",";
+  
+  std::cout << (float)((char)((   (unsigned char)(input) & (unsigned char)(240)  )) >> 4) << ")" << std::endl;
+}
+
 int main(int argc, char **argv) {
 
   // data block HDU keys
@@ -168,14 +175,14 @@ int main(int argc, char **argv) {
   }
 
   // Peek at input data (delete after development is complete)
-  for (int i=0; i<10; i++) if(input_data[i] != 0) std::cout << "input[" << i <<"] = " << (float)input_data[i] << std::endl; 
+  //for (int i=0; i<input_size; i++) inspectPackedData(input_data[i]);
   
   // run correlator and record output data
   syslog(LOG_INFO,"run correlator");
   dsaXCorrelator((void*)output_data, (void*)input_data);
 
   // Peek at output data (delete after development is complete)
-  //for (int i=0; i<NBASE*NCHAN_PER_PACKET*2*2; i++) if(output_data[i] != 0) std::cout << "output " << i << " = " << output_data[i] << std::endl; 
+  //for (int i=0; i<NBASE*NCHAN_PER_PACKET*2*2; i++) std::cout << "output " << i << " = " << output_data[i] << std::endl; 
   
   fout = fopen("output.dat","wb");
   fwrite((float *)output_data, sizeof(float), NBASE*NCHAN_PER_PACKET*2*2, fout);
@@ -188,4 +195,3 @@ int main(int argc, char **argv) {
   
   return 0;
 }
-

From 4a9b7b1031887e4c7f535a9da7e082ecfc3999d0 Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Thu, 27 Jun 2024 18:54:33 -0700
Subject: [PATCH 23/30] New correlator function reproduces legacy
 implementation data. Added CLI for command line parsing

---
 include/dsaX.h                 |   8 +
 include/dsaX_cuda_interface.h  |   2 +-
 include/dsaX_cuda_kernels.h    |  73 +++++---
 include/dsaX_enums.h           |   3 +-
 legacy/Makefile                |   4 +-
 legacy/dsaX_bfCorr.cu          | 298 +++++++++++++++++++++-----------
 src/dsaX_correlator.cpp        |  64 ++++++-
 src/dsaX_cublas_interface.cu   | 110 ++++++++----
 src/dsaX_cuda_interface.cu     |  30 +++-
 src/dsaX_interface.cpp         |  46 +++++
 tests/CMakeLists.txt           |   4 +-
 tests/command_line_params.cpp  |  55 ++++--
 tests/dsaX_correlator_test.cpp | 299 ++++++++++++++++++---------------
 13 files changed, 680 insertions(+), 316 deletions(-)

diff --git a/include/dsaX.h b/include/dsaX.h
index 699fe37..6083bb2 100644
--- a/include/dsaX.h
+++ b/include/dsaX.h
@@ -5,6 +5,8 @@
 #include "dsaX_def.h"
 #include "dsaX_enums.h"
 
+#define OLD_BLAS
+
 // Structure that carries BLAS parameters
 typedef struct dsaXBLASParam_s {  
   size_t struct_size; /**< Size of this struct in bytes.  Used to ensure that the host application and DSA see the same struct*/
@@ -38,6 +40,8 @@ typedef struct dsaXBLASParam_s {
   
 } dsaXBLASParam;
 
+void printDsaXBLASParam(const dsaXBLASParam param);
+
 // required to prevent overflow in corr matrix multiply
 #define halfFac 4
 
@@ -76,6 +80,10 @@ typedef struct dmem {
   
 } dmem;
 
+void dsaXInit(int device_ordinal = 0);
+
+void inspectPackedData(char input, int i, bool non_zero = false);
+
 void dsaXCorrelator(void *output_data, void *input_data);
 
 void reorderOutput(dmem *d);
diff --git a/include/dsaX_cuda_interface.h b/include/dsaX_cuda_interface.h
index cee1581..d9f2278 100644
--- a/include/dsaX_cuda_interface.h
+++ b/include/dsaX_cuda_interface.h
@@ -6,7 +6,7 @@
 #include "dsaX_enums.h"
 #include "dsaX.h"
 
-
+void dsaXInitCuda(int dev);
 
 void initializeCudaMemory(dmem *d, int bf);
 
diff --git a/include/dsaX_cuda_kernels.h b/include/dsaX_cuda_kernels.h
index db09baa..7fef077 100644
--- a/include/dsaX_cuda_kernels.h
+++ b/include/dsaX_cuda_kernels.h
@@ -2,6 +2,13 @@
 
 #include "dsaX_cuda_headers.h"
 
+__device__ void inspectPackedDataInKernel(char input, int i) {
+  float re = (float)((char)((   (unsigned char)(input) & (unsigned char)(15)  ) << 4) >> 4);
+  float im = (float)((char)((   (unsigned char)(input) & (unsigned char)(240))) >> 4);
+  
+  if(re != 0 || im != 0) printf("val[%d] = (%f,%f)\n", i, re, im);
+}
+
 // KERNELS
 // DMH: Abstract hardcoded launch parameters
 __global__ void transpose_input_beamformer(double *idata, double *odata) {
@@ -40,7 +47,7 @@ __global__ void corr_output_copy(half *outr, half *outi, float *output, int *ind
   
   int bidx = blockIdx.x; // assume NCHAN_PER_PACKET*2*NBASE/128
   int tidx = threadIdx.x; // assume 128
-  int idx = bidx*128+tidx;
+  int idx = blockDim.x * bidx + tidx;
   
   int baseline = (int)(idx / (NCHAN_PER_PACKET * 2));
   int chpol = (int)(idx % (NCHAN_PER_PACKET * 2));
@@ -74,9 +81,11 @@ template <typename in_prec, typename out_prec> __global__ void transpose_matrix(
   int y = blockIdx.y * 32 + threadIdx.y;
   int width = gridDim.x * 32;
 
-  for (int j = 0; j < 32; j += 8)
-     tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
-
+  for (int j = 0; j < 32; j += 8) {
+    tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
+    //inspectPackedDataInKernel(idata[(y+j)*width + x], (y+j)*width + x);
+  }
+  
   __syncthreads();
 
   x = blockIdx.y * 32 + threadIdx.x;  // transpose block offset
@@ -88,14 +97,43 @@ template <typename in_prec, typename out_prec> __global__ void transpose_matrix(
 
 }
 
+// transpose kernel
+// assume breakdown into tiles of 32x32, and run with 32x8 threads per block
+// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32)
+// here, width is the dimension of the fastest index
+__global__ void transpose_matrix_char(char * idata, char * odata) {
+  
+  __shared__ char tile[32][33];
+  
+  int x = blockIdx.x * 32 + threadIdx.x;
+  int y = blockIdx.y * 32 + threadIdx.y;
+  int width = gridDim.x * 32;
+
+  for (int j = 0; j < 32; j += 8) {
+    tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
+    //inspectPackedDataInKernel(idata[(y+j)*width + x], (y+j)*width + x);
+  }
+  
+  __syncthreads();
+
+  x = blockIdx.y * 32 + threadIdx.x;  // transpose block offset
+  y = blockIdx.x * 32 + threadIdx.y;
+  width = gridDim.y * 32;
+
+  for (int j = 0; j < 32; j += 8) {
+     odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
+  }
+}
+
+
 // kernel to fluff input
 // run with 128 threads and NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 blocks
 __global__ void corr_input_copy(char *input, half *inr, half *ini) {
 
-  int bidx = blockIdx.x;  // assume NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128
-  int tidx = threadIdx.x; // assume 128 threads per block
-  int iidx = bidx*128+tidx;
-
+  int bidx = blockIdx.x;  
+  int tidx = threadIdx.x; 
+  int iidx = blockDim.x * bidx + tidx;
+  
   // 15 in unsigned char binary is 00001111. Perform bitwise & on 15 and input char data iiiirrrr
   // to get real part 4 bit data.
   // 0000rrrr
@@ -118,23 +156,18 @@ __global__ void corr_input_copy(char *input, half *inr, half *ini) {
   // Cast to float and use CUDA intrinsic to cast to signed half
   ini[iidx] = __float2half((float)((char)((   (unsigned char)(input[iidx]) & (unsigned char)(240)  )) >> 4));
 
-  // Both results should be half (FP16) integers between -8 and 7.
-  half re = inr[iidx];
-  half im = ini[iidx];
-  half lim = 2.;
-  if( (re > lim || re < -lim) || (im > lim || im < -lim)) {
-    //printf("re = %f, im = %f\n", __half2float(re), __half2float(im));
-  }
+  //if(__half2float(inr[iidx]) != 0 || __half2float(ini[iidx]) != 0) printf("corr_input_copy %i = (%f,%f)\n", iidx, __half2float(inr[iidx]), __half2float(ini[iidx]));
 }
 
 // kernel to populate an instance of weights matrix
 // [2, (NCHAN_PER_PACKET/8), NBEAMS/2, 4times*(NANTS/2)*8chan*2tim*2pol]
 // run with 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128 blocks of 128 threads
+// TUNABLE
 __global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs) {
   
   int bidx = blockIdx.x;
   int tidx = threadIdx.x;
-  int inidx = bidx*128+tidx;  
+  int inidx = 128 * bidx + tidx;  
   
   // 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)
   
@@ -183,10 +216,10 @@ __global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, floa
 // run with NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 blocks of 128 threads
 __global__ void fluff_input_beamformer(char * input, half * dr, half * di) {
   
-  int bidx = blockIdx.x; // assume NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128
-  int tidx = threadIdx.x; // assume 128
-  int idx = bidx*128+tidx;
-
+  int bidx = blockIdx.x; 
+  int tidx = threadIdx.x;
+  int idx = blockDim.x * bidx + tidx;
+  
   dr[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(15)) << 4) >> 4)));
   di[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(240))) >> 4)));
 
diff --git a/include/dsaX_enums.h b/include/dsaX_enums.h
index 30fe3c6..4e8351f 100644
--- a/include/dsaX_enums.h
+++ b/include/dsaX_enums.h
@@ -12,7 +12,8 @@ typedef enum dsaXError_t {
 typedef enum dsaXBLASOperation_s {				 
   DSA_BLAS_OP_N = 0, // No transpose
   DSA_BLAS_OP_T = 1, // Transpose only
-  DSA_BLAS_OP_C = 2, // Conjugate transpose
+  DSA_BLAS_OP_A = 2, // Adjoint imaginary, no transpose
+  DSA_BLAS_OP_C = 3, // Conjugate transpose
   DSA_BLAS_OP_INVALID = DSA_INVALID_ENUM
 } dsaXBLASOperation;
 
diff --git a/legacy/Makefile b/legacy/Makefile
index 0de1991..4cc2fee 100644
--- a/legacy/Makefile
+++ b/legacy/Makefile
@@ -4,13 +4,13 @@ CC=gcc
 CFLAGS1 = -g -O3 -Wall -pthread -march=native -I/usr/local/include -I/usr/local/include/src -I/usr/local/cfitsio-3.47/include/ -I/home/ubuntu/proj/dsa110-shell/dsa110-sigproc
 CDEPS1=dsaX_def.h dsaX_capture_manythread.h
 CDEPS2=dsaX_def.h dsaX_capture.h
-LIBS = -L/usr/local/lib -lpsrdada -L/usr/lib/gcc/x86_64-linux-gnu/5 -lgfortran -L/usr/local/cuda/lib64 -lcudart -lcublas -lm -L/usr/local/cfitsio-3.47/lib -lcfitsio -lsigproc -lxgpu
+LIBS = -L/usr/local/lib -lpsrdada -L/usr/lib/gcc/x86_64-linux-gnu/5 -lgfortran -L/usr/local/cuda/lib64 -lcudart -lcublas -lm -L/usr/local/cfitsio-3.47/lib #-lcfitsio -lsigproc -lxgpu
 
 #LIBS2 = -L/home/ubuntu/PF_RING/userland/libpcap-1.9.1 -lpcap
 #CDEPS3=dsaX_def.h dsaX_capture_pcap.h
 
 CCU=/usr/local/cuda/bin/nvcc -D CUDA -ccbin=g++
-CFLAGS2 = -I/home/ubuntu/proj/dsa110-shell/dsa110-xengine/src -I/home/ubuntu/proj/dsa110-shell/dsa110-xGPU/src -I/usr/local/include -I/usr/local/include/src -I/usr/local/cfitsio-3.47/include -I/home/ubuntu/proj/dsa110-shell/dsa110-sigproc -arch=sm_75 -O3 -Xcompiler="-pthread" -DMATRIX_ORDER_TRIANGULAR -std=c++14
+CFLAGS2 = -I/home/ubuntu/proj/dsa110-shell/dsa110-xengine/src -I/home/dmhowart/install/include/ -I/home/ubuntu/proj/dsa110-shell/dsa110-xGPU/src -I/usr/local/include -I/usr/local/include/src -I/usr/local/cfitsio-3.47/include -I/home/ubuntu/proj/dsa110-shell/dsa110-sigproc -arch=sm_75 -O3 -Xcompiler="-pthread" -DMATRIX_ORDER_TRIANGULAR -std=c++14 -L/home/dmhowart/install/lib
 
 
 .DEFAULT_GOAL := all
diff --git a/legacy/dsaX_bfCorr.cu b/legacy/dsaX_bfCorr.cu
index 25b9262..265226b 100644
--- a/legacy/dsaX_bfCorr.cu
+++ b/legacy/dsaX_bfCorr.cu
@@ -45,13 +45,33 @@ using std::endl;
 #define sep 1.0 // arcmin
 
 /* global variables */
-int DEBUG = 1;
+int DEBUG = 0;
+
+__device__ void inspectPackedDataInKernel(char input, int i) {
+  float re = (float)((char)((   (unsigned char)(input) & (unsigned char)(15)  ) << 4) >> 4);
+  float im = (float)((char)((   (unsigned char)(input) & (unsigned char)(240))) >> 4);
+
+  if(re != 0 || im != 0) printf("val[%d] = (%f,%f)\n", i, re, im);
+}
+
+void inspectPackedData(char input, int i, bool non_zeros) {
+  float re = (float)((char)((   (unsigned char)(input) & (unsigned char)(15)  ) << 4) >> 4);
+  float im = (float)((char)((   (unsigned char)(input) & (unsigned char)(240))) >> 4);
+
+  if(non_zeros) {
+    if(re != 0 || im != 0)
+      std::cout << "val["<<i<<"] = ("<<re<<","<<im<<")" << std::endl;
+  } else {
+    std::cout << "val["<<i<<"] = ("<<re<<","<<im<<")" << std::endl;
+  }
+}
+
 
 // define structure that carries around device memory
 typedef struct dmem {
 
   // initial data and streams
-  char * h_input; // host input pointer
+  char * h_input, * h_pinned_input; // host input pointer
   char * d_input, * d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
   
   // correlator pointers
@@ -76,15 +96,50 @@ typedef struct dmem {
 
   // timing
   float cp, prep, cubl, outp;
+
+  // obs dec
+  float obsdec;
   
 } dmem;
 
+/*! register the data_block in the hdu via cudaHostRegister */
+int dada_cuda_dbregister (dada_hdu_t * hdu)
+{
+  ipcbuf_t * db = (ipcbuf_t *) hdu->data_block;
+
+  // ensure that the data blocks are SHM locked
+  if (ipcbuf_lock (db) < 0)
+  {
+    syslog(LOG_ERR,"dada_dbregister: ipcbuf_lock failed");
+    return -1;
+  }
+
+  size_t bufsz = db->sync->bufsz;
+  unsigned int flags = 0;
+  cudaError_t rval;
+
+  // lock each data block buffer as cuda memory
+  uint64_t ibuf;
+  for (ibuf = 0; ibuf < db->sync->nbufs; ibuf++)
+  {
+    rval = cudaHostRegister ((void *) db->buffer[ibuf], bufsz, flags);
+    if (rval != cudaSuccess)
+    {
+      syslog(LOG_ERR,"dada_dbregister:  cudaHostRegister failed");
+      return -1;
+    }
+  }
+  
+  return 0;
+}
+
 
 // allocate device memory
 void initialize(dmem * d, int bf) {
   
   // for correlator
   if (bf==0) {
+    cudaMallocHost((void**)&d->h_pinned_input, sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
     cudaMalloc((void **)(&d->d_input), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
     cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2);
     cudaMalloc((void **)(&d->d_i), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2);
@@ -94,6 +149,14 @@ void initialize(dmem * d, int bf) {
     cudaMalloc((void **)(&d->d_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
     cudaMalloc((void **)(&d->d_tx_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
     cudaMalloc((void **)(&d->d_tx_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
+
+    // timers
+    d->cp = 0.;
+    d->prep = 0.;
+    d->outp = 0.;
+    d->cubl = 0.;
+
+    
   }
 
   // for beamformer
@@ -141,6 +204,7 @@ void deallocate(dmem * d, int bf) {
     cudaFree(d->d_outi);
     cudaFree(d->d_tx_outr);
     cudaFree(d->d_tx_outi);
+    cudaFreeHost(d->h_pinned_input);
   }
   if (bf==1) {
     cudaFree(d->d_tx);
@@ -195,7 +259,8 @@ fprintf (stdout,
 	 " -t binary file for test mode\n"
 	 " -f flagants file\n"
 	 " -a calib file\n"
-	 " -s start frequency (assumes -0.244140625MHz BW)\n");
+	 " -s start frequency (assumes -0.244140625MHz BW)\n"
+	 " -g observing DEC in degrees (default 71.66)\n");
 }
 
 // kernel to fluff input
@@ -209,6 +274,7 @@ __global__ void corr_input_copy(char *input, half *inr, half *ini) {
   inr[iidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(15)) << 4) >> 4));
   ini[iidx] = __float2half((float)((char)(((unsigned char)(input[iidx]) & (unsigned char)(240))) >> 4));
 
+  //if(__half2float(inr[iidx]) != 0 || __half2float(ini[iidx]) != 0) printf("corr_input_copy %i = (%f,%f)\n", iidx, __half2float(inr[iidx]), __half2float(ini[iidx]));
 }
 
 
@@ -224,18 +290,21 @@ __global__ void transpose_matrix_char(char * idata, char * odata) {
   int y = blockIdx.y * 32 + threadIdx.y;
   int width = gridDim.x * 32;
 
-  for (int j = 0; j < 32; j += 8)
+  for (int j = 0; j < 32; j += 8) {
      tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
-
+     //inspectPackedDataInKernel(idata[(y+j)*width + x], (y+j)*width + x);
+  }
+  
   __syncthreads();
 
   x = blockIdx.y * 32 + threadIdx.x;  // transpose block offset
   y = blockIdx.x * 32 + threadIdx.y;
   width = gridDim.y * 32;
 
-  for (int j = 0; j < 32; j += 8)
+  for (int j = 0; j < 32; j += 8) {
      odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
-
+     //inspectPackedDataInKernel(odata[(y+j)*width + x], (y+j)*width + x);
+  }
 }
 
 // arbitrary transpose kernel
@@ -264,34 +333,8 @@ __global__ void transpose_matrix_float(half * idata, half * odata) {
 
 }
 
-// arbitrary transpose kernel
-// assume breakdown into tiles of 32x32, and run with 32x8 threads per block
-// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32)
-// here, width is the dimension of the fastest index
-template <typename in_prec, typename out_prec> __global__ void transpose_matrix_template(in_prec * idata, out_prec * odata) {
-
-  __shared__ in_prec tile[32][33];
-  
-  int x = blockIdx.x * 32 + threadIdx.x;
-  int y = blockIdx.y * 32 + threadIdx.y;
-  int width = gridDim.x * 32;
-
-  for (int j = 0; j < 32; j += 8)
-     tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
-
-  __syncthreads();
-
-  x = blockIdx.y * 32 + threadIdx.x;  // transpose block offset
-  y = blockIdx.x * 32 + threadIdx.y;
-  width = gridDim.y * 32;
-
-  for (int j = 0; j < 32; j += 8)
-     odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
-
-}
-
 
-// function to copy and reorder d_input to d_r and d_i
+// function to copy and reorder d_input to d_r and d_i
 // input is [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
 // output is [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS]
 // starts by running transpose on [NPACKETS_PER_BLOCK * NANTS, NCHAN_PER_PACKET * 2 * 2] matrix in doubleComplex form.
@@ -300,7 +343,8 @@ void reorder_input(char *input, char * tx, half *inr, half *ini) {
 
   // transpose input data
   dim3 dimBlock(32, 8), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32);
-  transpose_matrix_char<<<dimGrid,dimBlock>>>(input,tx);
+  transpose_matrix_char<<<dimGrid,dimBlock>>>(input, tx);
+  // DMH good
   /*
   // set up for geam
   cublasHandle_t cublasH = NULL;
@@ -452,21 +496,33 @@ void reorder_output(dmem * d) {
 // workflow: copy to device, reorder, stridedBatchedGemm, reorder
 void dcorrelator(dmem * d) {
 
+  // timing
+  // copy, prepare, cublas, output
+  clock_t begin, end;
+
   // zero out output arrays
   cudaMemset(d->d_outr,0,NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half));
   cudaMemset(d->d_outi,0,NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(half));
   cudaMemset(d->d_output,0,NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float));
   
-  // copy to device
+  // copy to device  
+  //memcpy(d->h_pinned_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
+  begin = clock();
   cudaMemcpy(d->d_input,d->h_input,NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,cudaMemcpyHostToDevice);
-
+  end = clock();
+  d->cp += (float)(end - begin) / CLOCKS_PER_SEC;
+  
   // reorder input
+  begin = clock();
   reorder_input(d->d_input,d->d_tx,d->d_r,d->d_i);
-
+  
   // not sure if essential
   cudaDeviceSynchronize();
+  end = clock();
+  d->prep += (float)(end - begin) / CLOCKS_PER_SEC;
   
   // set up for gemm
+  begin = clock();
   cublasHandle_t cublasH = NULL;
   cudaStream_t stream = NULL;
   cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
@@ -494,6 +550,10 @@ void dcorrelator(dmem * d) {
   const int batchCount = NCHAN_PER_PACKET*2*2*halfFac;
 
   // run strided batched gemm
+  // M^* M^T
+  // (a - ib)(a + ib)^T
+  // (aaT + bbT) + i(abT - bTa)
+  
   // ac
   cublasHgemmStridedBatched(cublasH,transa,transb,m,n,k,
 			    &alpha,d->d_r,lda,strideA,
@@ -521,13 +581,18 @@ void dcorrelator(dmem * d) {
 
   // shown to be essential
   cudaDeviceSynchronize();
+  end = clock();
+  d->cubl += (float)(end - begin) / CLOCKS_PER_SEC;
 
   // destroy stream
   cudaStreamDestroy(stream);
   cublasDestroy(cublasH);
   
   // reorder output data
+  begin = clock();
   reorder_output(d);
+  end = clock();
+  d->outp += (float)(end - begin) / CLOCKS_PER_SEC;
   
 }
 
@@ -575,8 +640,8 @@ __global__ void fluff_input_bf(char * input, half * dr, half * di) {
   int tidx = threadIdx.x; // assume 128
   int idx = bidx*128+tidx;
 
-  dr[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(15)) << 4) >> 4)));
-  di[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(240))) >> 4)));
+  dr[idx] = __float2half(0.035*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(15)) << 4) >> 4)));
+  di[idx] = __float2half(0.035*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(240))) >> 4)));
   
 }
 
@@ -606,7 +671,7 @@ __global__ void transpose_scale_bf(half * ir, half * ii, unsigned char * odata)
   width = gridDim.y * 16;
 
   for (int j = 0; j < 16; j += 8)
-    odata[(y+j)*width + x] = (unsigned char)(tile[threadIdx.x][threadIdx.y + j]/128.);
+    odata[(y+j)*width + x] = (unsigned char)(tile[threadIdx.x][threadIdx.y + j]);
 
 }
 
@@ -775,7 +840,7 @@ void dbeamformer(dmem * d) {
 
 // kernel to populate an instance of weights matrix [2, (NCHAN_PER_PACKET/8), NBEAMS/2, 4times*(NANTS/2)*8chan*2tim*2pol]
 // run with 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128 blocks of 128 threads
-__global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs) {
+__global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs, float dec) {
 
   int bidx = blockIdx.x;
   int tidx = threadIdx.x;
@@ -813,7 +878,7 @@ __global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, floa
     //wi[inidx] = __float2half(calibs[widx+1]);
   }
   if (iArm==1) {
-    theta = sep*(127.-bm*1.)*PI/10800.; // radians
+    theta = sep*(127.-bm*1.)*PI/10800.-(PI/180.)*dec; // radians
     afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate
     twr = cos(afac*antpos_n[a+48*iArm]);
     twi = sin(afac*antpos_n[a+48*iArm]);
@@ -845,8 +910,8 @@ void calc_weights(dmem * d) {
   // deal with antpos and calibs
   int iant, found;
   for (int i=0;i<NANTS;i++) {
-    antpos_e[i] = d->h_winp[2*i];
-    antpos_n[i] = d->h_winp[2*i+1];
+    antpos_e[i] = d->h_winp[i];
+    antpos_n[i] = d->h_winp[i+NANTS];
   }
   for (int i=0;i<NANTS*(NCHAN_PER_PACKET/8)*2;i++) {
 
@@ -865,10 +930,10 @@ void calc_weights(dmem * d) {
       calibs[2*i+1] /= wnorm;
     }
 
-    //if (found==1) {
-    //calibs[2*i] = 0.;
-    //calibs[2*i+1] = 0.;
-    //}
+    if (found==1) {
+      calibs[2*i] = 0.;
+      calibs[2*i+1] = 0.;
+    }
   }
 
   //for (int i=0;i<NANTS*(NCHAN_PER_PACKET/8)*2;i++) printf("%f %f\n",calibs[2*i],calibs[2*i+1]);
@@ -878,7 +943,7 @@ void calc_weights(dmem * d) {
   cudaMemcpy(d_calibs,calibs,NANTS*(NCHAN_PER_PACKET/8)*2*2*sizeof(float),cudaMemcpyHostToDevice);
 
   // run kernel to populate weights matrix
-  populate_weights_matrix<<<2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128,128>>>(d_antpos_e,d_antpos_n,d_calibs,d->weights_r,d->weights_i,d->d_freqs);  
+  populate_weights_matrix<<<2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128,128>>>(d_antpos_e,d_antpos_n,d_calibs,d->weights_r,d->weights_i,d->d_freqs,37.23-(d->obsdec));  
   
   // free stuff
   cudaFree(d_antpos_e);
@@ -892,7 +957,7 @@ void calc_weights(dmem * d) {
 
 // MAIN
 
-int main (int argc, char *argv[]) {
+int main (int argc, char *argv[]) {  
 
   cudaSetDevice(1);
   
@@ -914,11 +979,12 @@ int main (int argc, char *argv[]) {
   int arg = 0;
   int bf = 0;
   int test = 0;
+  float mydec = 71.66;
   char ftest[200], fflagants[200], fcalib[200];
   float sfreq = 1498.75;
 
   
-  while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:bdh")) != -1)
+  while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:g:bdh")) != -1)
     {
       switch (arg)
 	{
@@ -1026,12 +1092,26 @@ int main (int argc, char *argv[]) {
 	      usage();
 	      return EXIT_FAILURE;
 	    }
+	case 'g':
+	  if (optarg)
+            {
+	      mydec = atof(optarg);
+	      syslog(LOG_INFO, "obs dec %g",mydec);
+ 	      break;
+	    }
+	  else
+	    {
+	      syslog(LOG_ERR,"-g flag requires argument");
+	      usage();
+	      return EXIT_FAILURE;
+	    }
 	case 'd':
 	  DEBUG=1;
 	  syslog (LOG_DEBUG, "Will excrete all debug messages");
 	  break;
 	case 'b':
 	  bf=1;
+	  cudaSetDevice(0);
 	  syslog (LOG_NOTICE, "Running beamformer, NOT correlator");
 	  break;
 	case 'h':
@@ -1080,55 +1160,84 @@ int main (int argc, char *argv[]) {
     cudaMemcpy(d.d_freqs,d.h_freqs,sizeof(float)*(NCHAN_PER_PACKET/8),cudaMemcpyHostToDevice);
 
     // calculate weights
+    d.obsdec = mydec;
     calc_weights(&d);
     
   }
 
   // test mode
   FILE *fin, *fout;
-  uint64_t output_size;
+  uint64_t sz, output_size, in_block_size, rd_size;
+  in_block_size = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2;
   char * output_data, * o1;
+  int nreps = 1, nchunks = 1;
   if (test) {
 
-    // read one block of input data    
-    d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
-    for (int i=0;i<512;i++) {
-      fin = fopen(ftest,"rb");
-      fread(d.h_input+i*4*NANTS*NCHAN_PER_PACKET*2*2,4*NANTS*NCHAN_PER_PACKET*2*2,1,fin);
-      fclose(fin);
-    }
+    // read one block of input data
 
-    // run correlator or beamformer, and output data
-    if (bf==0) {
-      if (DEBUG) syslog(LOG_INFO,"run correlator");
-      dcorrelator(&d);
-      if (DEBUG) syslog(LOG_INFO,"copy to host");
-      output_size = NBASE*NCHAN_PER_PACKET*2*2*4;
-      output_data = (char *)malloc(output_size);
-      cudaMemcpy(output_data,d.d_output,output_size,cudaMemcpyDeviceToHost);
+    // get size of file
+    fin=fopen(ftest,"rb");
+    fseek(fin,0L,SEEK_END);
+    sz = ftell(fin);
+    rewind(fin);
 
-      fout = fopen("output.dat","wb");
-      fwrite((float *)output_data,sizeof(float),NBASE*NCHAN_PER_PACKET*2*2,fout);
-      fclose(fout);
+    // figure out how many reps and chunks to read with
+    if (sz>in_block_size) {
+      nreps = (int)(sz/in_block_size);
+      rd_size = in_block_size;
     }
     else {
-      if (DEBUG) syslog(LOG_INFO,"run beamformer");
-      dbeamformer(&d);
-      if (DEBUG) syslog(LOG_INFO,"copy to host");
-      output_size = (NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS;
-      output_data = (char *)malloc(output_size);
-      cudaMemcpy(output_data,d.d_bigpower,output_size,cudaMemcpyDeviceToHost);
+      nchunks = (int)(in_block_size/sz);
+      rd_size =	sz;
+    }
 
-      /*output_size = 2*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8);
-      o1 = (char *)malloc(output_size);
-      cudaMemcpy(o1,d.weights_r,output_size,cudaMemcpyDeviceToHost);*/
-	
-      
+    // allocate input
+    d.h_input = (char *)malloc(sizeof(char)*in_block_size);
+
+    std::cout << "Size of input = " << in_block_size << std::endl;
+    
+    // loop over reps and chunks
+    for (int reps=0; reps<nreps; reps++) {
+
+      for (int chunks=0;chunks<nchunks;chunks++) {
 
-      fout = fopen("output.dat","wb");
-      fwrite((unsigned char *)output_data,sizeof(unsigned char),output_size,fout);
-      //fwrite(o1,1,output_size,fout);
-      fclose(fout);
+	// read input file
+	if (chunks>0) rewind(fin);
+	fread(d.h_input+chunks*rd_size,rd_size,1,fin);
+
+	std::cout << "Input peek " << std::endl;
+	//for (int i=0; i<8; i++) inspectPackedData(d.h_input[i], i);
+	
+	// run correlator or beamformer, and output data
+	if (bf==0) {
+	  if (DEBUG) syslog(LOG_INFO,"run correlator");
+	  dcorrelator(&d);
+	  if (DEBUG) syslog(LOG_INFO,"copy to host");
+	  output_size = NBASE*NCHAN_PER_PACKET*2*2*4;
+	  output_data = (char *)malloc(output_size);
+	  cudaMemcpy(output_data, d.d_output, output_size, cudaMemcpyDeviceToHost);
+
+	  std::cout << "Output peek " << std::endl;
+	  for(int i=0; i<NBASE*NCHAN_PER_PACKET*2*2; i++) inspectPackedData(output_data[i], i, true);
+	  
+	  fout = fopen("output.dat","ab");
+	  fwrite((float *)output_data,sizeof(float),NBASE*NCHAN_PER_PACKET*2*2,fout);	  
+	  fclose(fout);
+	}
+	else {
+	  if (DEBUG) syslog(LOG_INFO,"run beamformer");
+	  dbeamformer(&d);
+	  if (DEBUG) syslog(LOG_INFO,"copy to host");
+	  output_size = (NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*NBEAMS;
+	  output_data = (char *)malloc(output_size);
+	  cudaMemcpy(output_data,d.d_bigpower,output_size,cudaMemcpyDeviceToHost);	
+
+	  fout = fopen("output.dat","ab");
+	  fwrite((unsigned char *)output_data,sizeof(unsigned char),output_size,fout);
+	  fclose(fout);
+	}
+	exit(0);
+      }
     }
 
 	
@@ -1137,7 +1246,7 @@ int main (int argc, char *argv[]) {
     free(output_data);
     free(o1);
     deallocate(&d,bf);
-
+    fclose(fin);
     exit(1);
   }
   
@@ -1203,11 +1312,14 @@ int main (int argc, char *argv[]) {
     }
 
   syslog(LOG_INFO,"dealt with dada stuff - now in LISTEN state");  
+
+  // register input with gpu
+  dada_cuda_dbregister(hdu_in);
   
   // get block sizes and allocate memory
   uint64_t block_size = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_in->data_block);
   uint64_t block_out = ipcbuf_get_bufsz ((ipcbuf_t *) hdu_out->data_block);
-  syslog(LOG_INFO, "main: have input and output block sizes %lu %lu\n",block_size,block_out);
+  syslog(LOG_INFO, "main: have input and output block sizes %d %d\n",block_size,block_out);
   if (bf==0) 
     syslog(LOG_INFO, "main: EXPECT input and output block sizes %d %d\n",NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2,NBASE*NCHAN_PER_PACKET*2*2*4);
   else
@@ -1235,7 +1347,6 @@ int main (int argc, char *argv[]) {
 
     // do stuff
     //begin = clock();
-    // loop
     if (bf==0) {
       if (DEBUG) syslog(LOG_INFO,"run correlator");
       dcorrelator(&d);
@@ -1250,11 +1361,10 @@ int main (int argc, char *argv[]) {
     }
     //end = clock();
     //time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
-    cout << "spent time " << d.cp << " " << d.prep << " " << d.cubl << " " << d.outp << " s" << endl;
+    //cout << "spent time " << d.cp << " " << d.prep << " " << d.cubl << " " << d.outp << " s" << endl;
     
     // write to output
-
-    // write to host
+    
     written = ipcio_write (hdu_out->data_block, (char *)(output_buffer), block_out);
     if (written < block_out)
       {
@@ -1265,13 +1375,13 @@ int main (int argc, char *argv[]) {
     
     if (DEBUG) syslog(LOG_INFO, "written block %d",blocks);	    
     blocks++;
-    // loop end
+
     
       
     // finish up
     if (bytes_read < block_size)
       observation_complete = 1;
-    
+
     ipcio_close_block_read (hdu_in->data_block, bytes_read);
     
   }
diff --git a/src/dsaX_correlator.cpp b/src/dsaX_correlator.cpp
index 4611939..7a6882c 100644
--- a/src/dsaX_correlator.cpp
+++ b/src/dsaX_correlator.cpp
@@ -18,23 +18,31 @@ Workflow is similar for BF and corr applications
 // workflow: copy to device, reorder, stridedBatchedGemm, reorder
 // DMH CUDA references excised.
 void dcorrelator(dmem *d) {
-
-  // copy to device
-  dsaXmemcpy(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2, dsaXMemcpyHostToDevice);
   
   // zero out output arrays
   dsaXmemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short
   dsaXmemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short
   dsaXmemset(d->d_output, 0, NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float));
+
+  // copy to device
+  dsaXmemcpy(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2, dsaXMemcpyHostToDevice);
   
   // reorder input into real and imaginary arrays of 2 byte data
   reorderInput(d);
-
+  
   dsaXBLASParam blas_param;
+  blas_param.struct_size = sizeof(blas_param);
+  blas_param.blas_type = DSA_BLAS_GEMM;
+
   // gemm settings
   // input: [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS]
-  // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS] 
-  blas_param.trans_a = DSA_BLAS_OP_N;
+  // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS]
+
+#if defined OLD_BLAS
+  std::cout << "Old params" << std::endl;
+  
+  blas_param.data_order = DSA_BLAS_DATAORDER_COL;
+  blas_param.trans_a = DSA_BLAS_OP_A;
   blas_param.trans_b = DSA_BLAS_OP_T;
   blas_param.m = NANTS;
   blas_param.n = NANTS;
@@ -48,9 +56,53 @@ void dcorrelator(dmem *d) {
   blas_param.b_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;
   blas_param.c_stride = NANTS*NANTS;
   blas_param.batch_count = NCHAN_PER_PACKET*2*2*halfFac;
+  blas_param.a_offset = 0;
+  blas_param.b_offset = 0;
+  blas_param.c_offset = 0;
+#else
+  std::cout << "My params" << std::endl;
+  
+  blas_param.data_order = DSA_BLAS_DATAORDER_ROW;
+  blas_param.trans_a = DSA_BLAS_OP_C;
+  blas_param.trans_b = DSA_BLAS_OP_N;
+  blas_param.m = NANTS;
+  blas_param.n = NANTS;
+  blas_param.k = NPACKETS_PER_BLOCK/halfFac;
+  blas_param.alpha = 1.0;
+  blas_param.lda = blas_param.m;
+  blas_param.ldb = blas_param.n;
+  blas_param.beta = 0.;
+  blas_param.ldc = blas_param.m;
+  blas_param.a_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;;
+  blas_param.b_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;;
+  blas_param.c_stride = NANTS*NANTS;
+  blas_param.batch_count = NCHAN_PER_PACKET*2*2*halfFac;
+  blas_param.a_offset = 0;
+  blas_param.b_offset = 0;
+  blas_param.c_offset = 0;
+#endif
+
+  // Swap A and B if in row order
+  if (blas_param.data_order == DSA_BLAS_DATAORDER_ROW) {
+    std::swap(blas_param.m, blas_param.n);
+    std::swap(blas_param.lda, blas_param.ldb);
+    std::swap(blas_param.trans_a, blas_param.trans_b);
+    std::swap(blas_param.a_offset, blas_param.b_offset);
+    std::swap(blas_param.a_stride, blas_param.b_stride);
+    //std::swap(A_data, B_data);
+    //std::swap(A_data, B_data);
+  }  
 
+  
+  printDsaXBLASParam(blas_param);
+  
+  // DMH: fix me
+  blas_param.blas_lib = DSA_BLAS_LIB_CUBLAS;
+  
   // Perform GEMM accoring to back end configuration
   dsaXHgemmStridedBatched(d->d_r, d->d_i, d->d_r, d->d_i, d->d_outr, d->d_outi, blas_param);
+
+  //for(int i=0; i<8; i++) inspectPackedData(d.h_input[i], i);
   
   // reorder output data
   reorderOutput(d);
diff --git a/src/dsaX_cublas_interface.cu b/src/dsaX_cublas_interface.cu
index 17a2c9b..597cfbd 100644
--- a/src/dsaX_cublas_interface.cu
+++ b/src/dsaX_cublas_interface.cu
@@ -15,38 +15,13 @@ void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void
   cudaStream_t stream = NULL;
   cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
   cublasCreate(&cublasH);
-  cublasSetStream(cublasH, stream);
+  cublasSetStream(cublasH, stream);  
 
-  // Transfer params
-  cublasOperation_t transa;
-  cublasOperation_t transb;
-  switch (blas_param.trans_a) {
-  case DSA_BLAS_OP_N:
-    transa = CUBLAS_OP_N; break;
-  case DSA_BLAS_OP_T:
-    transa = CUBLAS_OP_T; break;
-  case DSA_BLAS_OP_C:
-    transa = CUBLAS_OP_C; break;
-  default:
-    std::cout << "Unknown cublas transpose" << std::endl;
-  }
-
-  switch (blas_param.trans_b) {
-  case DSA_BLAS_OP_N:
-    transb = CUBLAS_OP_N; break;
-  case DSA_BLAS_OP_T:
-    transb = CUBLAS_OP_T; break;
-  case DSA_BLAS_OP_C:
-    transb = CUBLAS_OP_C; break;
-  default:
-    std::cout << "Unknown cublas transpose" << std::endl;
-  }
-  
+  // Transfer params  
   const int m = blas_param.m;
   const int n = blas_param.n;
   const int k = blas_param.k;
-  const half alpha = blas_param.alpha.real();
-  const half malpha = (-1.0 * blas_param.alpha.real());
+  const double alpha = blas_param.alpha.real();
   const int lda = blas_param.lda;
   const int ldb = blas_param.ldb;
   const half beta0 = blas_param.beta.real();
@@ -59,7 +34,70 @@ void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void
   const long long int strideB = blas_param.b_stride;
   const long long int strideC = blas_param.c_stride;
   const int batchCount = blas_param.batch_count;
-  
+
+  // NOTE: cublasHgemm is a real valued kernel. As a result,
+  // matrix conjugates must be handled by passing negative
+  // alpha values on the appropriate imaginary planar
+  // arrays. We discern these negative values while parsing
+  // transpose, adjoint and conjugation values.
+  cublasOperation_t transa = CUBLAS_OP_N; // defaults keep both ops defined if a switch falls through
+  cublasOperation_t transb = CUBLAS_OP_N;
+  int A_imag_alpha_sign = 1; // sign folded into alpha for products involving imag(A)
+  switch (blas_param.trans_a) {
+  case DSA_BLAS_OP_N:
+    transa = CUBLAS_OP_N;
+    break;
+  case DSA_BLAS_OP_T:
+    transa = CUBLAS_OP_T;
+    break;
+  case DSA_BLAS_OP_A:
+    transa = CUBLAS_OP_N;
+    // A array requests adjoint, hence we must supply a factor
+    // of -1 to alpha when dealing with the imaginary component
+    // of A. NOTE(review): no transpose is applied here while
+    // DSA_BLAS_OP_C below transposes -- confirm the mapping.
+    A_imag_alpha_sign *= -1;
+    break;
+  case DSA_BLAS_OP_C:
+    transa = CUBLAS_OP_T;
+    // A array requests conjugation, hence we
+    // must supply a factor of -1 to alpha
+    // when dealing with the imaginary component
+    // of A.
+    A_imag_alpha_sign *= -1;
+    break;
+  default:
+    std::cout << "Unknown cublas transpose" << std::endl;
+  }
+
+  int B_imag_alpha_sign = 1; // pure sign (+1/-1); alpha itself is applied once, at the GEMM calls
+  switch (blas_param.trans_b) {
+  case DSA_BLAS_OP_N:
+    transb = CUBLAS_OP_N;
+    break;
+  case DSA_BLAS_OP_T:
+    transb = CUBLAS_OP_T;
+    break;
+  case DSA_BLAS_OP_A:
+    transb = CUBLAS_OP_N;
+    // B array requests adjoint, hence we
+    // must supply a factor of -1 to alpha
+    // when dealing with the imaginary component
+    // of B.
+    B_imag_alpha_sign *= -1;
+    break;
+  case DSA_BLAS_OP_C:
+    transb = CUBLAS_OP_T;
+    // B array requests conjugation, hence we
+    // must supply a factor of -1 to alpha
+    // when dealing with the imaginary component
+    // of B.
+    B_imag_alpha_sign *= -1;
+    break;
+  default:
+    std::cout << "Unknown dsaBLAS transpose" << std::endl;
+  }
+
   // Run strided batched gemm for datatype 
   // (a + ib)(c + id) = (ac - bd) + i(bc + ad)
   // on matrices alpha * op(A) * op(B) + beta * C
@@ -68,25 +106,29 @@ void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void
   
   // Accumulate results into C matrix
   // ac
-  cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &alpha,
+  half alpha_ac = alpha;
+  cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &(alpha_ac),
 			    (half *)real_a + a_offset, lda, strideA,
 			    (half *)real_b + b_offset, ldb, strideB, &beta0,
 			    (half *)real_c + c_offset, ldc, strideC,
 			    batchCount);
-  // -bd
-  cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &malpha,
+  // -bd (minus sign from i*i)
+  half alpha_bd = alpha * (-1.0 * A_imag_alpha_sign * B_imag_alpha_sign);
+  cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &(alpha_bd),
 			    (half*)imag_a + a_offset, lda, strideA,
 			    (half*)imag_b + b_offset, ldb, strideB, &beta1,
 			    (half*)real_c + c_offset, ldc, strideC,
 			    batchCount);
   // bc
-  cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &alpha,
+  half alpha_bc = alpha * A_imag_alpha_sign;
+  cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &(alpha_bc),
 			    (half*)imag_a + a_offset, lda, strideA,
 			    (half*)real_b + b_offset, ldb, strideB, &beta0,
 			    (half*)imag_c + c_offset, ldc, strideC,
 			    batchCount);
   // ad
-  cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &alpha,
+  half alpha_ad = alpha * B_imag_alpha_sign;
+  cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &(alpha_ad),
 			    (half*)real_a + a_offset, lda, strideA,
 			    (half*)imag_b + b_offset, ldb, strideB, &beta1,
 			    (half*)imag_c + c_offset, ldc, strideC,
diff --git a/src/dsaX_cuda_interface.cu b/src/dsaX_cuda_interface.cu
index 8eda8ae..1782752 100644
--- a/src/dsaX_cuda_interface.cu
+++ b/src/dsaX_cuda_interface.cu
@@ -7,6 +7,10 @@
 
 using namespace std;
 
+void dsaXInitCuda(int dev){
+  cudaSetDevice(dev);
+}
+
 // allocate device memory
 void initializeCudaMemory(dmem *d, int bf) {
   
@@ -93,10 +97,11 @@ void deallocateCudaMemory(dmem *d, int bf) {
 void reorderOutputCuda(dmem * d) {
   
   // transpose input data
+#if defined (OLD_BLAS)
   dim3 dimBlock(32, 8), dimGrid((NANTS*NANTS)/32, (NCHAN_PER_PACKET*2*2*halfFac)/32);
   transpose_matrix<<<dimGrid, dimBlock>>>((half*)d->d_outr, (half*)d->d_tx_outr);
   transpose_matrix<<<dimGrid, dimBlock>>>((half*)d->d_outi, (half*)d->d_tx_outi);
-  
+#endif  
   // look at output
   /*char * odata = (char *)malloc(sizeof(char)*384*4*NANTS*NANTS*2*halfFac);
   cudaMemcpy(odata,d->d_tx_outr,384*4*NANTS*NANTS*2*halfFac,cudaMemcpyDeviceToHost);
@@ -144,10 +149,17 @@ void reorderOutputCuda(dmem * d) {
       ii++;
     }
   }
-  cudaMemcpy(d_idxs,h_idxs,sizeof(int)*NBASE,cudaMemcpyHostToDevice);
+  cudaMemcpy(d_idxs, h_idxs, sizeof(int)*NBASE,cudaMemcpyHostToDevice);
 
   // run kernel to finish things
-  corr_output_copy<<<NCHAN_PER_PACKET*2*NBASE/128,128>>>((half*)d->d_tx_outr, (half*)d->d_tx_outi, d->d_output, d_idxs);
+  // TUNABLE
+  int blockDim = 128;
+  int blocks = NCHAN_PER_PACKET*2*NBASE/blockDim;
+#if defined (OLD_BLAS)
+  corr_output_copy<<<blocks, blockDim>>>((half*)d->d_tx_outr, (half*)d->d_tx_outi, d->d_output, d_idxs);
+#else
+  corr_output_copy<<<blocks, blockDim>>>((half*)d->d_outr, (half*)d->d_outi, d->d_output, d_idxs);
+#endif
   
   /*char * odata = (char *)malloc(sizeof(char)*384*4*NBASE*4);
   cudaMemcpy(odata,d->d_output,384*4*NBASE*4,cudaMemcpyDeviceToHost);
@@ -172,9 +184,17 @@ void reorderOutputCuda(dmem * d) {
 void reorderInputCuda(dmem *d) {
   
   // transpose input data
+#if defined (OLD_BLAS)  
   dim3 dimBlock(32, 8), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32);
-  transpose_matrix<<<dimGrid, dimBlock>>>(d->d_input, d->d_tx);
-  corr_input_copy<<<NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128, 128>>>(d->d_tx, (half*)d->d_r, (half*)d->d_i);
+
+  // TUNABLE
+  int blockDim = 128;
+  int blocks = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/blockDim;
+  transpose_matrix_char<<<dimGrid, dimBlock>>>(d->d_input, d->d_tx);
+  corr_input_copy<<<blocks, blockDim>>>(d->d_tx, (half*)d->d_r, (half*)d->d_i);
+#else
+  corr_input_copy<<<blocks, blockDim>>>(d->d_input, (half*)d->d_r, (half*)d->d_i);
+#endif
 }
 
 
diff --git a/src/dsaX_interface.cpp b/src/dsaX_interface.cpp
index c0c461c..f17a6d8 100644
--- a/src/dsaX_interface.cpp
+++ b/src/dsaX_interface.cpp
@@ -3,11 +3,56 @@
 #include <cstring>
 
 #include "dsaX_cuda_interface.h"
+#include "dsaX_utils.h"
 #include "dsaX_ftd.h"
 
 using namespace std;
 
+void printDsaXBLASParam(const dsaXBLASParam param) {
+  // Debug aid: write the BLAS parameter fields to stdout, one "name = value" per line
+  cout << "struct_size = " << param.struct_size << endl;
+  cout << "blas_type = " << param.blas_type << endl;
+  cout << "blas_lib = " << param.blas_lib << endl;
+  cout << "data_order = " << param.data_order << endl;
+  cout << "trans_a = " << param.trans_a << endl;
+  cout << "trans_b = " << param.trans_b << endl;
+  cout << "m = " << param.m << endl;
+  cout << "n = " << param.n << endl;
+  cout << "k = " << param.k << endl;
+  cout << "lda = " << param.lda << endl;
+  cout << "ldb = " << param.ldb << endl;
+  cout << "ldc = " << param.ldc << endl;
+  cout << "a_offset = " << param.a_offset << endl;
+  cout << "b_offset = " << param.b_offset << endl;
+  cout << "c_offset = " << param.c_offset << endl;
+  cout << "a_stride = " << param.a_stride << endl;
+  cout << "b_stride = " << param.b_stride << endl;
+  cout << "c_stride = " << param.c_stride << endl;
+  cout << "alpha = " << param.alpha << endl;
+  cout << "beta = " << param.beta << endl;
+  cout << "batch_count = " << param.batch_count << endl;
+}
+
+void dsaXInit(int dev){
+#if DSA_XENGINE_TARGET_CUDA
+  dsaXInitCuda(dev);
+#endif
+}
+
+void inspectPackedData(char input, int i, bool non_zeros) {
+  float re = (float)((char)((   (unsigned char)(input) & (unsigned char)(15)  ) << 4) >> 4);
+  float im = (float)((char)((   (unsigned char)(input) & (unsigned char)(240))) >> 4);
+
+  if(non_zeros) {
+    if(re != 0 || im != 0) 
+      std::cout << "val["<<i<<"] = ("<<re<<","<<im<<")" << std::endl;
+  } else {
+    std::cout << "val["<<i<<"] = ("<<re<<","<<im<<")" << std::endl;
+  }
+}
+
 void dsaXCorrelator(void *output_data, void *input_data) {  
+
   dmem d;
   int bf = 0;
 #if DSA_XENGINE_TARGET_CUDA
@@ -15,6 +60,7 @@ void dsaXCorrelator(void *output_data, void *input_data) {
   d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
   memcpy(d.h_input, (char*)input_data, sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
   dcorrelator(&d);
+  dsaXmemcpy(output_data, d.d_output, NBASE*NCHAN_PER_PACKET*2*2*4, dsaXMemcpyDeviceToHost);  
 #else
   std::cout << "dsaX error: not implemented" << std::endl;
 #endif
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 9320850..3722671 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -2,5 +2,7 @@
 include_directories(${CMAKE_SOURCE_DIR}/include)
 include_directories(${CLI11_SOURCE_DIR}/include/CLI)
 
+add_library(dsaX_tests command_line_params.cpp)
+
 add_executable(dsaX_correlator_test dsaX_correlator_test.cpp)
-target_link_libraries(dsaX_correlator_test dsax)
+target_link_libraries(dsaX_correlator_test dsax dsaX_tests)
diff --git a/tests/command_line_params.cpp b/tests/command_line_params.cpp
index c067ced..fa48729 100644
--- a/tests/command_line_params.cpp
+++ b/tests/command_line_params.cpp
@@ -1,17 +1,44 @@
 #include <command_line_params.h>
 
-void usage() {
-  fprintf (stdout,
-	   "dsaX_beamformer_correlator [options]\n"
-	   " -c core   bind process to CPU core [no default]\n"
-	   " -d send debug messages to syslog\n"
-	   " -i in_key [default REORDER_BLOCK_KEY]\n"
-	   " -o out_key [default XGPU_BLOCK_KEY]\n"
-	   " -b run beamformer [default is to run correlator]\n"
-	   " -h print usage\n"
-	   " -t binary file for test mode\n"
-	   " -f flagants file\n"
-	   " -a calib file\n"
-	   " -s start frequency (assumes -0.244140625MHz BW)\n");
-}
+// General 
+int core = 0;
+bool debug = false;
+
+// Data block HDU keys 
+key_t in_key = REORDER_BLOCK_KEY;
+key_t out_key = XGPU_BLOCK_KEY;
+
+// Test mode
+bool run_beamformer = false;
+bool run_correlator = false;
+double start_frequency = 1498.75;
+
+// Test file
+std::string test_filename;
+int n_channels = 384;
+int n_antennae = 63;
+int n_pol = 2;
+int n_times = 30720;
+
+std::shared_ptr<dsaXApp> make_app(std::string app_description, std::string app_name) {
 
+  auto dsaX_app = std::make_shared<dsaXApp>(app_description, app_name);
+  dsaX_app->option_defaults()->always_capture_default();
+
+  dsaX_app->add_option("--core", core, "Bind process to this CPU core [default 0]");
+  dsaX_app->add_option("--debug", debug, "Send debug messages to syslog");
+  dsaX_app->add_option("--in-key", in_key, "[default REORDER_BLOCK_KEY]");
+  dsaX_app->add_option("--out-key", out_key, "[default XGPU_BLOCK_KEY]");
+  dsaX_app->add_option("--run-beamformer", run_beamformer, "Run the beamformer [default false]");
+  dsaX_app->add_option("--run-correlator", run_correlator, "Run the correlator [default false]");
+  dsaX_app->add_option("--start-frequency", start_frequency, "start frequency (assumes 1498.75)");
+
+  // Input file options
+  dsaX_app->add_option("--test-filename", test_filename, "Name of file on which to run tests");
+  dsaX_app->add_option("--n-channels", n_channels, "Number of frequency channels [default 384]");
+  dsaX_app->add_option("--n-antennae", n_antennae, "Number of antennae [default 63]");
+  dsaX_app->add_option("--n-pol", n_pol, "Number of polarizations [default 2]");
+  dsaX_app->add_option("--n-times", n_times, "Number of times [default 30720]");
+
+  return dsaX_app;
+}
diff --git a/tests/dsaX_correlator_test.cpp b/tests/dsaX_correlator_test.cpp
index b705975..966c269 100644
--- a/tests/dsaX_correlator_test.cpp
+++ b/tests/dsaX_correlator_test.cpp
@@ -7,140 +7,94 @@
 #include <string.h>
 #include <syslog.h>
 
+// Include this file to access input parameters
+#include "command_line_params.h"
+
 // Include the dsaX.h header in your application
 #include <dsaX.h>
 
 using namespace std;
 
-void usage() {
-  fprintf (stdout,
-	   "dsaX_beamformer_correlator [options]\n"
-	   " -c if dsaX is CUDA enabled, use this GPU"
-	   " -d send debug messages to syslog\n"
-	   " -i in_key [default REORDER_BLOCK_KEY]\n"
-	   " -o out_key [default XGPU_BLOCK_KEY]\n"
-	   " -h print usage\n"
-	   " -t binary file for test mode\n"
-	   " -f flagants file\n"
-	   " -a calib file\n"
-	   " -s start frequency (assumes -0.244140625MHz BW)\n");
-}
+// The class offers entire file content read/write in single operation
+class BinaryFileVector : public std::vector<char>
+{
+public:
 
-void inspectPackedData(char input) {
-  
-  std::cout << "vals = (" << (float)((char)((   (unsigned char)(input) & (unsigned char)(15)  ) << 4) >> 4) << ",";
-  
-  std::cout << (float)((char)((   (unsigned char)(input) & (unsigned char)(240)  )) >> 4) << ")" << std::endl;
-}
+  using std::vector<char>::vector;
+
+  bool loadFromFile(const char *fileName) noexcept
+  {
+    // Try to open a file specified by its name    
+    std::ifstream file(fileName, std::ios::in | std::ios::binary);
+    if (!file.is_open() || file.bad())
+      return false;
+
+    // Clear whitespace removal flag
+    file.unsetf(std::ios::skipws);
+
+    // Determine size of the file
+    file.seekg(0, std::ios_base::end);
+    size_t fileSize = file.tellg();
+    file.seekg(0, std::ios_base::beg);
+
+    // Discard previous vector content
+    resize(0);
+    reserve(0);
+    shrink_to_fit();
+
+    // Order to prealocate memory to avoid unnecessary reallocations due to vector growth
+    reserve(fileSize);
+
+    // Read entire file content into prealocated vector memory
+    insert(begin(),
+	   std::istream_iterator<char>(file),
+	   std::istream_iterator<char>());
+
+    // Make sure entire content is loaded
+    if(size() == fileSize) {
+      std::cout << "Successfully read file of size " << fileSize << std::endl;
+      return true;
+    } else {
+      std::cout << "Unexpected file size." << std::endl;
+      return false;
+    }
+  }
+
+  bool saveToFile(const char *fileName) const noexcept
+  {
+    // Write entire vector content into a file specified by its name
+    std::ofstream file(fileName, std::ios::out | std::ios::binary);
+    try {
+      file.write((const char *) data(), size());
+    }
+    catch (...) {
+      return false;
+    }
+
+    // Determine number of bytes successfully stored in file
+    size_t fileSize = file.tellp();
+    if(size() == fileSize) {
+      std::cout << "Successfully wrote file of size " << fileSize  << std::endl;
+      return true;
+    } else {
+      std::cout << "Unexpected file size." << std::endl;
+      return false;
+    }
+  }
+};
 
 int main(int argc, char **argv) {
 
-  // data block HDU keys
-  key_t in_key = REORDER_BLOCK_KEY;
-  key_t out_key = XGPU_BLOCK_KEY;
+  // Parse command line
+  auto app = make_app();  
+  try {
+    app->parse(argc, argv);
+  } catch (const CLI::ParseError &e) {
+    return app->exit(e);
+  }
   
   // command line arguments
   int device_ordinal = 0;
-  int arg = 0;
-  int bf = 0;
-  char ftest[200], fflagants[200], fcalib[200];
-  float sfreq = 1498.75;
-  
-  while ((arg=getopt(argc,argv,"c:i:o:t:f:a:s:bdh")) != -1) {
-    switch (arg) {
-    case 'c':
-      if (optarg) {
-	device_ordinal = atoi(optarg);
-	break;
-      }
-      else {
-	syslog(LOG_ERR,"-c flag requires argument");
-	usage();
-	return EXIT_FAILURE;
-      }
-    case 'i':
-      if (optarg) {
-	if (sscanf (optarg, "%x", &in_key) != 1) {
-	  syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-	  return EXIT_FAILURE;
-	}
-	break;
-      } else {
-	syslog(LOG_ERR,"-i flag requires argument");
-	usage();
-	return EXIT_FAILURE;
-      }
-    case 'o':
-      if (optarg) {
-	if (sscanf (optarg, "%x", &out_key) != 1) {
-	  syslog(LOG_ERR, "could not parse key from %s\n", optarg);
-	  return EXIT_FAILURE;
-	}
-	break;
-      } else {
-	syslog(LOG_ERR,"-o flag requires argument");
-	usage();
-	return EXIT_FAILURE;
-      }
-    case 't':
-      if (optarg) {
-	syslog(LOG_INFO, "test mode");
-	if (sscanf (optarg, "%s", &ftest) != 1) {
-	  syslog(LOG_ERR, "could not read test file name from %s\n", optarg);
-	  return EXIT_FAILURE;
-	}
-	break;
-      } else {
-	syslog(LOG_ERR,"-t flag requires argument");
-	usage();
-	return EXIT_FAILURE;
-      }
-    case 'a':
-      if (optarg) {
-	syslog(LOG_INFO, "read calib file %s",optarg);
-	if (sscanf (optarg, "%s", &fcalib) != 1) {
-	  syslog(LOG_ERR, "could not read calib file name from %s\n", optarg);
-	  return EXIT_FAILURE;
-	}
-	break;
-      }
-      else {
-	syslog(LOG_ERR,"-a flag requires argument");
-	usage();
-	return EXIT_FAILURE;
-      }
-    case 'f':
-      if (optarg) {
-	syslog(LOG_INFO, "reading flag ants file %s",optarg);
-	if (sscanf (optarg, "%s", &fflagants) != 1) {
-	  syslog(LOG_ERR, "could not read flagants file name from %s\n", optarg);
-	  return EXIT_FAILURE;
-	}
-	break;
-      } else {
-	syslog(LOG_ERR,"-f flag requires argument");
-	usage();
-	return EXIT_FAILURE;
-      }
-    case 's':
-      if (optarg) {
-	sfreq = atof(optarg);
-	syslog(LOG_INFO, "start freq %g",sfreq);
-	break;
-      }
-      else {
-	syslog(LOG_ERR,"-s flag requires argument");
-	usage();
-	return EXIT_FAILURE;
-      }
-    case 'd':
-      syslog (LOG_DEBUG, "Will excrete all debug messages");
-      break;
-    case 'h':
-      usage();
-      return EXIT_SUCCESS;
-    }
-  }
   
   std::cout << "NPACKETS_PER_BLOCK = " << NPACKETS_PER_BLOCK << std::endl;
   std::cout << "NCHAN = " << NCHAN << std::endl;
@@ -159,35 +113,104 @@ int main(int argc, char **argv) {
   
   //dsaX_init();  
   FILE *fin, *fout;
-  std::cout << "Creating float output_array of size " << sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2*4 << std::endl;
-  uint64_t output_size = sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2*4;
-  std::cout << "Creating char input_array of size " << sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2 << std::endl;
-  uint64_t input_size = sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2;
+  uint64_t sz, output_size, in_block_size, rd_size;
+  in_block_size = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2;
+  char * output_data, * o1;
+  int nreps = 1, nchunks = 1;
 
-  float *output_data = (float *)malloc(output_size);
-  char *input_data = (char *)malloc(input_size);
+  // Open the test file and measure its size; a missing or empty file is a hard error
+  std::cout << "attempting to read file " << test_filename.c_str() << std::endl;
+  fin=fopen(test_filename.c_str(), "rb");
+  if (fin == NULL) { std::cout << "dsaX error: could not open " << test_filename << std::endl; return EXIT_FAILURE; }
+  fseek(fin, 0L, SEEK_END);
+  sz = ftell(fin);
+  rewind(fin);
+  if (sz == 0) { std::cout << "dsaX error: empty test file" << std::endl; fclose(fin); return EXIT_FAILURE; }
+
+  // figure out how many reps and chunks to read with
+  if (sz > in_block_size) {
+    nreps = (int)(sz/in_block_size);
+    rd_size = in_block_size;
+  } else {
+    nchunks = (int)(in_block_size/sz);
+    rd_size = sz;
+  }
+
+  std::cout << "Creating char input_array of size " << sizeof(char)*in_block_size << std::endl;
+  char *input_data = (char *)malloc(in_block_size);
+
+  // Loop over reps and chunks
+  for (int reps = 0; reps<nreps; reps++) {
+    for (int chunks = 0; chunks<nchunks; chunks++) {
+      // Read input file
+      if (chunks>0) rewind(fin);
+      fread(input_data + chunks*rd_size, rd_size, 1, fin);
+
+      std::cout << "Input peek " << std::endl;
+      //for (int i=0; i<8; i++) inspectPackedData(input_data[i], i);
+
+      std::cout << "Creating char output_array of size " << sizeof(char)*NBASE*NCHAN_PER_PACKET*2*2*4 << std::endl;
+      output_size = NBASE*NCHAN_PER_PACKET*2*2*4;
+      output_data = (char *)malloc(output_size);
+      // run correlator and record output data
+      syslog(LOG_INFO,"run correlator");
+      dsaXCorrelator((void*)output_data, (void*)input_data);
+
+      std::cout << "Output peek " << std::endl;
+      for(int i=0; i<output_size; i++) inspectPackedData(output_data[i], i, true);
+
+      // Append the result: output_size counts bytes, so the element size passed to fwrite must be 1
+      fout = fopen("output.dat","ab");
+      if (fout == NULL) { std::cout << "dsaX error: could not open output.dat" << std::endl; return EXIT_FAILURE; }
+      fwrite(output_data, sizeof(char), output_size, fout);
+      fclose(fout);
+      exit(0);
+    }
+  }
+
+  /*
   
-  // read one block of input data    
+  // Read data
+  BinaryFileVector binaryFileVector;
+
+  
+  if (!binaryFileVector.loadFromFile(test_filename.c_str())) {
+    std::cout << "Failed to read the file." << std::endl;
+    return 0;
+  }
+  
+  // read one block of input data
   for (int i=0;i<512;i++) {
-    fin = fopen(ftest,"rb");
-    fread(input_data + i*4*NANTS*NCHAN_PER_PACKET*2*2, 4*NANTS*NCHAN_PER_PACKET*2*2, 1, fin);
-    fclose(fin);
+    //fin = fopen(test_filename,"rb");
+    //fread(input_data + i*4*NANTS*NCHAN_PER_PACKET*2*2, 4*NANTS*NCHAN_PER_PACKET*2*2, 1, fin);
+    //fclose(fin);
   }
 
+  for (int i=0;i<512;i++) {
+    memcpy(input_data + i*binaryFileVector.size(), binaryFileVector.data(), binaryFileVector.size());
+  }
+  
   // Peek at input data (delete after development is complete)
-  //for (int i=0; i<input_size; i++) inspectPackedData(input_data[i]);
+  for (int i=0; i<8; i++) inspectPackedData(input_data[i], i);
   
-  // run correlator and record output data
-  syslog(LOG_INFO,"run correlator");
-  dsaXCorrelator((void*)output_data, (void*)input_data);
 
   // Peek at output data (delete after development is complete)
-  //for (int i=0; i<NBASE*NCHAN_PER_PACKET*2*2; i++) std::cout << "output " << i << " = " << output_data[i] << std::endl; 
+  for (int i=0; i<NBASE*NCHAN_PER_PACKET*2*2; i++) if(output_data[i] != 0) std::cout << "output " << i << " = " << output_data[i] << std::endl;
+  //for (int i=0; i<8; i++) std::cout << "output " << i << " = " << output_data[i] << std::endl; 
+
+  if (!binaryFileVector.saveToFile("output.dat")) {
+    std::cout << "Failed to write a file." << std::endl;
+    return 0;
+  } else {
+    std::cout << "Successfully wrote file." << std::endl;
+  }
+  
   
   fout = fopen("output.dat","wb");
   fwrite((float *)output_data, sizeof(float), NBASE*NCHAN_PER_PACKET*2*2, fout);
   fclose(fout);
-  
+  */
+    
   // free
   free(input_data);
   free(output_data);

From bd40360b3e23887ed9aff9c556373fa396c3459e Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Thu, 27 Jun 2024 21:56:35 -0700
Subject: [PATCH 24/30] split dmem into dmem_corr and dmem_bf, add metrics
 structure, abstract some kernel indexing in preparation for tuning
 functionality

---
 include/dsaX.h                |  52 ++++++++++--
 include/dsaX_cuda_interface.h |  14 ++--
 include/dsaX_cuda_kernels.h   |  35 +++++---
 include/dsaX_ftd.h            |   2 +-
 include/dsaX_interface.h      |  10 ++-
 src/dsaX_beamformer.cpp       |   2 +-
 src/dsaX_correlator.cpp       |   6 +-
 src/dsaX_cuda_interface.cu    | 147 +++++++++++++++++-----------------
 src/dsaX_interface.cpp        |  16 ++--
 9 files changed, 173 insertions(+), 111 deletions(-)

diff --git a/include/dsaX.h b/include/dsaX.h
index 6083bb2..96a645f 100644
--- a/include/dsaX.h
+++ b/include/dsaX.h
@@ -40,6 +40,16 @@ typedef struct dsaXBLASParam_s {
   
 } dsaXBLASParam;
 
+// Structure that carries correlator parameters (BLAS library, data type and data order)
+typedef struct dsaXCorrParam_s {  
+  size_t struct_size;        /**< Size of this struct in bytes.  Used to ensure that the host application and DSA see the same struct*/
+  
+  dsaXBLASLib blas_lib;         /**< Which BLAS library to use for BLAS ops */
+  dsaXBLASDataType data_type;   /**< Specifies if using S(C) or D(Z) BLAS type */
+  dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */
+  
+} dsaXCorrParam;
+
 void printDsaXBLASParam(const dsaXBLASParam param);
 
 // required to prevent overflow in corr matrix multiply
@@ -48,8 +58,32 @@ void printDsaXBLASParam(const dsaXBLASParam param);
 // beam sep
 #define sep 1.0 // arcmin
 
-// define structure that carries around device memory pointers
-typedef struct dmem {
+// Global timing and metrics structure for dsaX 
+typedef struct metrics_s {
+
+  // Mem copy times
+  double mem_copy_time_H2H;
+  double mem_copy_time_H2D;
+  double mem_copy_time_D2H;
+  double mem_copy_time_D2D;
+
+  // Mem copy size
+  double mem_copy_size_H2H;
+  double mem_copy_size_H2D;
+  double mem_copy_size_D2H;
+  double mem_copy_size_D2D;
+
+  // Compute
+  double compute_time;
+  double compute_flops;
+
+  // Initialisation
+  double initialisation_time;
+} metrics;
+  
+// define structure that carries around memory pointers
+// and timer for the correlator
+typedef struct dmem_corr_s {
   
   // initial data and streams
   char *h_input;        // host input pointer
@@ -63,7 +97,13 @@ typedef struct dmem {
   // giant output array: [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex]
   float *d_output;
   
+} dmem_corr;
+
+typedef struct dmem_bf_s {
+
   // beamformer pointers
+  char *h_input;        // host input pointer
+  char *d_input, *d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
   char *d_big_input;
   void *d_br, *d_bi; //half
   void *weights_r, *weights_i; //weights: [arm, tactp, b] //half
@@ -78,7 +118,9 @@ typedef struct dmem {
   // timing
   float cp, prep, cubl, outp;
   
-} dmem;
+} dmem_bf;
+
+
 
 void dsaXInit(int device_ordinal = 0);
 
@@ -86,5 +128,5 @@ void inspectPackedData(char input, int i, bool non_zero = false);
 
 void dsaXCorrelator(void *output_data, void *input_data);
 
-void reorderOutput(dmem *d);
-void reorderInput(dmem *d);
+void reorderCorrOutput(dmem_corr *d); // name matches dsaX_interface.h and the call in dcorrelator()
+void reorderCorrInput(dmem_corr *d);  // name matches dsaX_interface.h and the call in dcorrelator()
diff --git a/include/dsaX_cuda_interface.h b/include/dsaX_cuda_interface.h
index d9f2278..54e2609 100644
--- a/include/dsaX_cuda_interface.h
+++ b/include/dsaX_cuda_interface.h
@@ -8,9 +8,13 @@
 
 void dsaXInitCuda(int dev);
 
-void initializeCudaMemory(dmem *d, int bf);
+void initializeCorrCudaMemory(dmem_corr *d);
 
-void deallocateCudaMemory(dmem *d, int bf);
+void initializeBFCudaMemory(dmem_bf *d);
+
+void deallocateCorrCudaMemory(dmem_corr *d);
+
+void deallocateBFCudaMemory(dmem_bf *d);
 
 void dsaXmemsetCuda(void *array, int ch, size_t n);
 
@@ -18,11 +22,11 @@ void dsaXmemcpyCuda(void *array_device, void *array_host, size_t n, dsaXMemcpyKi
 
 void dsaXDeviceSynchronizeCuda();
 
-void reorderOutputCuda(dmem *d);
+void reorderCorrOutputCuda(dmem_corr *d);
 
-void calcWeightsCuda(dmem *d);
+void reorderCorrInputCuda(dmem_corr *d);
 
-void reorderInputCuda(dmem *d);
+void calcWeightsCuda(dmem_bf *d);
 
 template <typename in_prec, typename out_prec> void transposeMatrixCuda(in_prec *idata, out_prec *odata);
 
diff --git a/include/dsaX_cuda_kernels.h b/include/dsaX_cuda_kernels.h
index 7fef077..0c2cb7c 100644
--- a/include/dsaX_cuda_kernels.h
+++ b/include/dsaX_cuda_kernels.h
@@ -97,6 +97,7 @@ template <typename in_prec, typename out_prec> __global__ void transpose_matrix(
 
 }
 
+// DMH: TUNABLE
 // transpose kernel
 // assume breakdown into tiles of 32x32, and run with 32x8 threads per block
 // launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32)
@@ -104,32 +105,40 @@ template <typename in_prec, typename out_prec> __global__ void transpose_matrix(
 __global__ void transpose_matrix_char(char * idata, char * odata) {
   
   __shared__ char tile[32][33];
+  //extern __shared__ char tile[];
   
-  int x = blockIdx.x * 32 + threadIdx.x;
-  int y = blockIdx.y * 32 + threadIdx.y;
-  int width = gridDim.x * 32;
+  int x = blockIdx.x * blockDim.x + threadIdx.x;
+  int y = blockIdx.y * blockDim.x + threadIdx.y;
+  int width = gridDim.x * blockDim.x;
 
-  for (int j = 0; j < 32; j += 8) {
+  for (int j = 0; j < blockDim.x; j += blockDim.y) {
     tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
+    //tile[(threadIdx.y+j)*blockDim.x + threadIdx.x] = idata[(y+j)*width + x];
     //inspectPackedDataInKernel(idata[(y+j)*width + x], (y+j)*width + x);
   }
   
   __syncthreads();
 
-  x = blockIdx.y * 32 + threadIdx.x;  // transpose block offset
-  y = blockIdx.x * 32 + threadIdx.y;
-  width = gridDim.y * 32;
+  x = blockIdx.y * blockDim.x + threadIdx.x;  // transpose block offset
+  y = blockIdx.x * blockDim.x + threadIdx.y;
+  width = gridDim.y * blockDim.x;
 
-  for (int j = 0; j < 32; j += 8) {
-     odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
+  for (int j = 0; j < blockDim.x; j += blockDim.y) {
+    odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
+    //odata[(y+j)*width + x] = tile[threadIdx.x + blockDim.x*(threadIdx.y + j)];
   }
 }
 
 
-// kernel to fluff input
-// run with 128 threads and NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/128 blocks
-__global__ void corr_input_copy(char *input, half *inr, half *ini) {
-
+/**
+ * Promote complex char riri... data to planar half rr.. ii.. 
+ *
+ * @param[out] inr Half precision real array
+ * @param[out] ini Half precision imag array
+ * @param[in]  input Char precision complex array
+ */
+__global__ void promoteComplexCharToPlanarHalf(char *input, half *inr, half *ini) {
+  
   int bidx = blockIdx.x;  
   int tidx = threadIdx.x; 
   int iidx = blockDim.x * bidx + tidx;
diff --git a/include/dsaX_ftd.h b/include/dsaX_ftd.h
index f7363f1..47b562e 100644
--- a/include/dsaX_ftd.h
+++ b/include/dsaX_ftd.h
@@ -2,4 +2,4 @@
 
 #include "dsaX.h"
 
-void dcorrelator(dmem *d);
+void dcorrelator(dmem_corr *d);
diff --git a/include/dsaX_interface.h b/include/dsaX_interface.h
index 06a2364..18ed9f0 100644
--- a/include/dsaX_interface.h
+++ b/include/dsaX_interface.h
@@ -4,9 +4,15 @@
 
 // DMH: decorate these with Doxygen
 void dsaXCorrelator(void *input_data, void *output_data);
-void reorderInput(dmem *d);
-void reorderOutput(dmem *d);
+
+void reorderCorrInput(dmem_corr *d);
+
+void reorderCorrOutput(dmem_corr *d);
+
 void transposeInputBeamformer(double *input, double *output, std::vector<int> &dimBlock, std::vector<int> &dimGrid);
+
 void transposeScaleBeamformer(void *array_real, void *array_imag, unsigned char *output, std::vector<int> &dimBlock, std::vector<int> &dimGrid);
+
 void fluffInputBeamformer(char *input, void *array_real, void *array_imag, int blocks, int tpb);
+
 void sumBeam(unsigned char *input, float *output, int blocks, int tpb);
diff --git a/src/dsaX_beamformer.cpp b/src/dsaX_beamformer.cpp
index f82f677..61fbc5d 100644
--- a/src/dsaX_beamformer.cpp
+++ b/src/dsaX_beamformer.cpp
@@ -29,7 +29,7 @@ using namespace std;
 
 */
 // beamformer function
-void dbeamformer(dmem *d) {
+void dbeamformer(dmem_bf *d) {
 
   dsaXBLASParam blas_param;
   blas_param.trans_a = DSA_BLAS_OP_T;
diff --git a/src/dsaX_correlator.cpp b/src/dsaX_correlator.cpp
index 7a6882c..2d179d1 100644
--- a/src/dsaX_correlator.cpp
+++ b/src/dsaX_correlator.cpp
@@ -17,7 +17,7 @@ Workflow is similar for BF and corr applications
 // correlator function
 // workflow: copy to device, reorder, stridedBatchedGemm, reorder
 // DMH CUDA references excised.
-void dcorrelator(dmem *d) {
+void dcorrelator(dmem_corr *d) {
   
   // zero out output arrays
   dsaXmemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short
@@ -28,7 +28,7 @@ void dcorrelator(dmem *d) {
   dsaXmemcpy(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2, dsaXMemcpyHostToDevice);
   
   // reorder input into real and imaginary arrays of 2 byte data
-  reorderInput(d);
+  reorderCorrInput(d);
   
   dsaXBLASParam blas_param;
   blas_param.struct_size = sizeof(blas_param);
@@ -105,5 +105,5 @@ void dcorrelator(dmem *d) {
   //for(int i=0; i<8; i++) inspectPackedData(d.h_input[i], i);
   
   // reorder output data
-  reorderOutput(d);
+  reorderCorrOutput(d);
 }
diff --git a/src/dsaX_cuda_interface.cu b/src/dsaX_cuda_interface.cu
index 1782752..0046b1e 100644
--- a/src/dsaX_cuda_interface.cu
+++ b/src/dsaX_cuda_interface.cu
@@ -12,89 +12,90 @@ void dsaXInitCuda(int dev){
 }
 
 // allocate device memory
-void initializeCudaMemory(dmem *d, int bf) {
+void initializeCorrCudaMemory(dmem_corr *d) {
   
   // for correlator
-  if (bf==0) {
-    cudaMalloc((void **)(&d->d_input), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
-    cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2);
-    cudaMalloc((void **)(&d->d_i), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2);
-    cudaMalloc((void **)(&d->d_tx), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
-    cudaMalloc((void **)(&d->d_output), sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2);
-    cudaMalloc((void **)(&d->d_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
-    cudaMalloc((void **)(&d->d_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
-    cudaMalloc((void **)(&d->d_tx_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
-    cudaMalloc((void **)(&d->d_tx_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
-  }
+  cudaMalloc((void **)(&d->d_input), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
+  cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2);
+  cudaMalloc((void **)(&d->d_i), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2);
+  cudaMalloc((void **)(&d->d_tx), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
+  cudaMalloc((void **)(&d->d_output), sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2);
+  cudaMalloc((void **)(&d->d_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
+  cudaMalloc((void **)(&d->d_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
+  cudaMalloc((void **)(&d->d_tx_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
+  cudaMalloc((void **)(&d->d_tx_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
+}
 
+void initializeBFCudaMemory(dmem_bf *d) {
+  
   // for beamformer
-  if (bf==1) {
-    cudaMalloc((void **)(&d->d_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2);
-    cudaMalloc((void **)(&d->d_big_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS)*NCHAN_PER_PACKET*2*2);
-    cudaMalloc((void **)(&d->d_tx), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2);
-    cudaMalloc((void **)(&d->d_br), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2);
-    cudaMalloc((void **)(&d->d_bi), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2);
-    cudaMalloc((void **)(&d->weights_r), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8));
-    cudaMalloc((void **)(&d->weights_i), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8));
-    cudaMalloc((void **)(&d->d_bigbeam_r), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2));
-    cudaMalloc((void **)(&d->d_bigbeam_i), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2));
-    cudaMalloc((void **)(&d->d_bigpower), sizeof(unsigned char)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS));
-    cudaMalloc((void **)(&d->d_scf), sizeof(float)*(NBEAMS/2)); // beam scale factor
-    cudaMalloc((void **)(&d->d_chscf), sizeof(float)*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); // beam scale factor
-
-    // input weights: first is [NANTS, E/N], then [NANTS, 48, 2pol, R/I]
-    d->h_winp = (float *)malloc(sizeof(float)*(NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2));
-    d->flagants = (int *)malloc(sizeof(int)*NANTS);
-    d->h_freqs = (float *)malloc(sizeof(float)*(NCHAN_PER_PACKET/8));
-    cudaMalloc((void **)(&d->d_freqs), sizeof(float)*(NCHAN_PER_PACKET/8));
-
-    // timers
-    d->cp = 0.;
-    d->prep = 0.;
-    d->outp = 0.;
-    d->cubl = 0.;
-    
-  }  
+  cudaMalloc((void **)(&d->d_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2);
+  cudaMalloc((void **)(&d->d_big_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS)*NCHAN_PER_PACKET*2*2);
+  cudaMalloc((void **)(&d->d_tx), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2);
+  cudaMalloc((void **)(&d->d_br), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2);
+  cudaMalloc((void **)(&d->d_bi), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2);
+  cudaMalloc((void **)(&d->weights_r), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8));
+  cudaMalloc((void **)(&d->weights_i), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8));
+  cudaMalloc((void **)(&d->d_bigbeam_r), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2));
+  cudaMalloc((void **)(&d->d_bigbeam_i), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2));
+  cudaMalloc((void **)(&d->d_bigpower), sizeof(unsigned char)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS));
+  cudaMalloc((void **)(&d->d_scf), sizeof(float)*(NBEAMS/2)); // beam scale factor
+  cudaMalloc((void **)(&d->d_chscf), sizeof(float)*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); // beam scale factor
+  
+  // input weights: first is [NANTS, E/N], then [NANTS, 48, 2pol, R/I]
+  d->h_winp = (float *)malloc(sizeof(float)*(NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2));
+  d->flagants = (int *)malloc(sizeof(int)*NANTS);
+  d->h_freqs = (float *)malloc(sizeof(float)*(NCHAN_PER_PACKET/8));
+  cudaMalloc((void **)(&d->d_freqs), sizeof(float)*(NCHAN_PER_PACKET/8));
+  
+  // timers
+  d->cp = 0.;
+  d->prep = 0.;
+  d->outp = 0.;
+  d->cubl = 0.;
 }
+
 // deallocate device memory
-void deallocateCudaMemory(dmem *d, int bf) {
+void deallocateCorrCudaMemory(dmem_corr *d) {
   
   cudaFree(d->d_input);
-
-  if (bf==0) {
-    cudaFree(d->d_r);
-    cudaFree(d->d_i);
-    cudaFree(d->d_tx);
-    cudaFree(d->d_output);
-    cudaFree(d->d_outr);
-    cudaFree(d->d_outi);
-    cudaFree(d->d_tx_outr);
-    cudaFree(d->d_tx_outi);
-  }
-  if (bf==1) {
-    cudaFree(d->d_tx);
-    cudaFree(d->d_br);
-    cudaFree(d->d_bi);
-    cudaFree(d->weights_r);
-    cudaFree(d->weights_i);
-    cudaFree(d->d_bigbeam_r);
-    cudaFree(d->d_bigbeam_i);
-    cudaFree(d->d_bigpower);
-    cudaFree(d->d_scf);
-    cudaFree(d->d_chscf);
-    free(d->h_winp);
-    free(d->flagants);
-    cudaFree(d->d_freqs);
-    free(d->h_freqs);
-  }  
+  cudaFree(d->d_r);
+  cudaFree(d->d_i);
+  cudaFree(d->d_tx);
+  cudaFree(d->d_output);
+  cudaFree(d->d_outr);
+  cudaFree(d->d_outi);
+  cudaFree(d->d_tx_outr);
+  cudaFree(d->d_tx_outi);
 }
 
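+// deallocate beamformer device and host memory (NOTE(review): d_big_input, allocated in initializeBFCudaMemory, is not freed here)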
+void deallocateBFCudaMemory(dmem_bf *d) {
+
+  cudaFree(d->d_input);
+  cudaFree(d->d_tx);
+  cudaFree(d->d_br);
+  cudaFree(d->d_bi);
+  cudaFree(d->weights_r);
+  cudaFree(d->weights_i);
+  cudaFree(d->d_bigbeam_r);
+  cudaFree(d->d_bigbeam_i);
+  cudaFree(d->d_bigpower);
+  cudaFree(d->d_scf);
+  cudaFree(d->d_chscf);
+  free(d->h_winp);
+  free(d->flagants);
+  cudaFree(d->d_freqs);
+  free(d->h_freqs);
+}  
+
+
 // function to copy d_outr and d_outi to d_output
 // inputs are [NCHAN_PER_PACKET, 2 time, 2 pol, NANTS, NANTS]
 // the corr matrices are column major order
 // output needs to be [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex]
 // start with transpose to get [NANTS*NANTS, NCHAN_PER_PACKET*2*2], then sum into output using kernel
-void reorderOutputCuda(dmem * d) {
+void reorderCorrOutputCuda(dmem_corr * d) {
   
   // transpose input data
 #if defined (OLD_BLAS)
@@ -181,19 +182,19 @@ void reorderOutputCuda(dmem * d) {
 // output is [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS]
 // starts by running transpose on [NPACKETS_PER_BLOCK * NANTS, NCHAN_PER_PACKET * 2 * 2] matrix in doubleComplex form.
 // then fluffs using simple kernel
-void reorderInputCuda(dmem *d) {
+void reorderCorrInputCuda(dmem_corr *d) {
   
   // transpose input data
 #if defined (OLD_BLAS)  
-  dim3 dimBlock(32, 8), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32);
+  dim3 dimBlock(32, 32), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32);
 
   // TUNABLE
   int blockDim = 128;
   int blocks = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/blockDim;
   transpose_matrix_char<<<dimGrid, dimBlock>>>(d->d_input, d->d_tx);
-  corr_input_copy<<<blocks, blockDim>>>(d->d_tx, (half*)d->d_r, (half*)d->d_i);
+  promoteComplexCharToPlanarHalf<<<blocks, blockDim>>>(d->d_tx, (half*)d->d_r, (half*)d->d_i);
 #else
-  corr_input_copy<<<blocks, blockDim>>>(d->d_input, (half*)d->d_r, (half*)d->d_i);
+  promoteComplexCharToPlanarHalf<<<blocks, blockDim>>>(d->d_input, (half*)d->d_r, (half*)d->d_i);
 #endif
 }
 
@@ -222,7 +223,7 @@ void transposeInputBeamformerCuda(double *idata, double *odata, std::vector<int>
 // sequential pairs of eastings and northings
 // then [NANTS, 48, R/I] calibs
 
-void calcWeightsCuda(dmem *d) {
+void calcWeightsCuda(dmem_bf *d) {
 
   // allocate
   float *antpos_e = (float *)malloc(sizeof(float)*NANTS);
diff --git a/src/dsaX_interface.cpp b/src/dsaX_interface.cpp
index f17a6d8..0c88ee0 100644
--- a/src/dsaX_interface.cpp
+++ b/src/dsaX_interface.cpp
@@ -53,30 +53,30 @@ void inspectPackedData(char input, int i, bool non_zeros) {
 
 void dsaXCorrelator(void *output_data, void *input_data) {  
 
-  dmem d;
-  int bf = 0;
+  dmem_corr d;
 #if DSA_XENGINE_TARGET_CUDA
-  initializeCudaMemory(&d, bf);
+  initializeCorrCudaMemory(&d);
   d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
   memcpy(d.h_input, (char*)input_data, sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
   dcorrelator(&d);
-  dsaXmemcpy(output_data, d.d_output, NBASE*NCHAN_PER_PACKET*2*2*4, dsaXMemcpyDeviceToHost);  
+  dsaXmemcpy(output_data, d.d_output, NBASE*NCHAN_PER_PACKET*2*2*4, dsaXMemcpyDeviceToHost);
+  deallocateCorrCudaMemory(&d);
 #else
   std::cout << "dsaX error: not implemented" << std::endl;
 #endif
 }
 
-void reorderInput(dmem *d) {
+void reorderCorrInput(dmem_corr *d) {
 #if DSA_XENGINE_TARGET_CUDA
-  reorderInputCuda(d);
+  reorderCorrInputCuda(d);
 #else
   std::cout << "dsaX error: not implemented" << std::endl;
 #endif
 }
 
-void reorderOutput(dmem *d) {
+void reorderCorrOutput(dmem_corr *d) {
 #if DSA_XENGINE_TARGET_CUDA  
-  reorderOutputCuda(d);
+  reorderCorrOutputCuda(d);
 #else
   std::cout << "dsaX error: not implemented" << std::endl;
 #endif

From a7ce185dfae584eeb30d9c1d33d58b3ad4692dd8 Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Fri, 28 Jun 2024 22:19:05 -0700
Subject: [PATCH 25/30] Created a Correlator class to allow for persistent
 memory, added missing CLI files, added timer dependency, cleaned up header
 tree, added enhanced parameter handling, cleaner test script

---
 CMakeLists.txt                 |  97 ++++++++++++++++++--------
 include/dsaX.h                 | 123 +++------------------------------
 include/dsaX_enums.h           |   9 ++-
 include/dsaX_ftd.h             |  76 +++++++++++++++++++-
 include/dsaX_interface.h       |   1 +
 include/dsaX_params.h          |  83 ++++++++++++++++++++++
 include/dsaX_utils.h           |   2 +-
 src/CMakeLists.txt             |  23 +++---
 src/dsaX_correlator.cpp        | 112 ++++++++++++++++++++++++++++--
 src/dsaX_cublas_interface.cu   |   2 +
 src/dsaX_cuda_interface.cu     |   8 +--
 src/dsaX_interface.cpp         |  43 +++++-------
 src/dsaX_magma_interface.cu    |   2 +
 src/dsaX_params.cpp            | 102 +++++++++++++++++++++++++++
 src/dsaX_utils.cpp             |  25 ++++++-
 tests/CMakeLists.txt           |   2 +-
 tests/command_line_params.cpp  |   6 +-
 tests/command_line_params.h    |  35 ++++++++++
 tests/dsaX_correlator_test.cpp |  85 ++++++++++++-----------
 19 files changed, 595 insertions(+), 241 deletions(-)
 create mode 100644 include/dsaX_params.h
 create mode 100644 src/dsaX_params.cpp
 create mode 100644 tests/command_line_params.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e3cf1b0..a5a2333 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,7 +48,6 @@ if(TARGET_TYPE_VALID LESS 0)
   message(SEND_ERROR "Please specify a valid DSA_XENGINE_TARGET_TYPE type! Valid target types are:" "${VALID_TARGET_TYPES}")
 endif()
 
-
 # Git helpers
 #------------
 find_package(Git)
@@ -231,7 +230,7 @@ if(DSA_XENGINE_ENABLE_OPENBLAS)
 endif()
 
 # Get psrdada dependency
-option(DSA_XENGINE_ENABLE_PSRDADA "Use PSRDada for correlatorss" ON)
+option(DSA_XENGINE_ENABLE_PSRDADA "Use PSRDada for IO" ON)
 option(DSA_XENGINE_DOWNLOAD_PSRDADA "Download and build PSRDada" ON)
 if(DSA_XENGINE_DOWNLOAD_PSRDADA)
   # Download, build and install
@@ -245,40 +244,80 @@ else()
   find_package(PSRDada REQUIRED)
 endif()
 
+# Get HDF5 dependency
+option(DSA_XENGINE_ENABLE_HDF5 "Use HDF5 for data IO" OFF)
+if(DSA_XENGINE_ENABLE_HDF5)
+  option(DSA_XENGINE_DOWNLOAD_HDF5 "Download and build HDf5" OFF)
+  if(DSA_XENGINE_DOWNLOAD_HDF5)
+    # Download, build and install
+    FetchContent_Declare(
+      HDF5
+      GIT_REPOSITORY https://github.com/HDFGroup/hdf5.git
+      GIT_TAG 5794814
+      )
+    FetchContent_MakeAvailable(HDF5)
+  else()
+    # Find and link to local install
+    find_package(HDF5 REQUIRED)
+  endif()
+endif()
+
 # Get CLI11 dependency
 # FIX ME: get static .hpp version and ship with package
 option(DSA_XENGINE_ENABLE_CLI11 "Enable CLI11 (required)" ON)
-option(DSA_XENGINE_DOWNLOAD_CLI11 "Download and build CLI11" ON)
-if(DSA_XENGINE_DOWNLOAD_CLI11)
-  # Download, build and install
-  FetchContent_Declare(
-    CLI11
-    GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git
-    GIT_TAG main
-    )
-  FetchContent_MakeAvailable(CLI11)
-else()
-  # Find and link to local install
-  find_package(CLI11 REQUIRED)
+if(DSA_XENGINE_ENABLE_CLI11)
+  option(DSA_XENGINE_DOWNLOAD_CLI11 "Download and build CLI11" ON)
+  if(DSA_XENGINE_DOWNLOAD_CLI11)
+    # Download, build and install
+    FetchContent_Declare(
+      CLI11
+      GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git
+      GIT_TAG main
+      )
+    FetchContent_MakeAvailable(CLI11)
+  else()
+    # Find and link to local install
+    find_package(CLI11 REQUIRED)
+  endif()
 endif()
 
-# Get CLI11 dependency
-# FIX ME: get static .hpp version and ship with package
-option(DSA_XENGINE_ENABLE_GOOGLETEST "Enable GOOGLETEST (required)" ON)
-option(DSA_XENGINE_DOWNLOAD_GOOGLETEST "Download and build GOOGLETEST" ON)
-if(DSA_XENGINE_DOWNLOAD_GOOGLETEST)
-  # Download, build and install
-  FetchContent_Declare(
-    GOOGLETEST
-    GIT_REPOSITORY https://github.com/google/googletest.git
-    GIT_TAG main
-    )
-  FetchContent_MakeAvailable(GOOGLETEST)
-else()
-  # Find and link to local install
-  find_package(GOOGLETEST REQUIRED)
+
+# Get ZFP dependency
+option(DSA_XENGINE_ENABLE_ZFP "Enable ZFP" OFF)
+if(DSA_XENGINE_ENABLE_ZFP)
+  option(DSA_XENGINE_DOWNLOAD_ZFP "Download and build ZFP" OFF)
+  if(DSA_XENGINE_DOWNLOAD_ZFP)
+    # Download, build and install
+    FetchContent_Declare(
+      ZFP
+      GIT_REPOSITORY https://github.com/LLNL/zfp.git
+      GIT_TAG f40868a
+      )
+    FetchContent_MakeAvailable(ZFP)
+  else()
+    # Find and link to local install
+    find_package(ZFP REQUIRED)
+  endif()
 endif()
 
+# Get timer dependency
+# (upstream repository: https://github.com/cpp-core/timer.git)
+option(DSA_XENGINE_ENABLE_TIMER "Enable timer" ON)
+if(DSA_XENGINE_ENABLE_TIMER)
+  option(DSA_XENGINE_DOWNLOAD_TIMER "Download and build timer" ON)
+  if(DSA_XENGINE_DOWNLOAD_TIMER)
+    # Download, build and install
+    FetchContent_Declare(
+      TIMER
+      GIT_REPOSITORY https://github.com/cpp-core/timer.git
+      GIT_TAG main
+      )
+    FetchContent_MakeAvailable(TIMER)
+  else()
+    # Find and link to local install
+    find_package(TIMER REQUIRED)
+  endif()
+endif()
 
 # Add src, include, tests, and legacy
 add_subdirectory(src)
diff --git a/include/dsaX.h b/include/dsaX.h
index 96a645f..cc3ff5c 100644
--- a/include/dsaX.h
+++ b/include/dsaX.h
@@ -1,132 +1,25 @@
 #pragma once 
 
-#include <complex>
-
+// Expose the user to compile-time definitions,
+// enums, parameters, and classes
 #include "dsaX_def.h"
 #include "dsaX_enums.h"
+#include "dsaX_params.h"
+#include "dsaX_ftd.h"
 
+// Use manual transpose route
+// Comment out the define below to try the new pure cuBLAS route
 #define OLD_BLAS
 
-// Structure that carries BLAS parameters
-typedef struct dsaXBLASParam_s {  
-  size_t struct_size; /**< Size of this struct in bytes.  Used to ensure that the host application and DSA see the same struct*/
-  
-  dsaXBLASType blas_type;    /**< Type of BLAS computation to perfrom */
-
-  dsaXBLASLib blas_lib;      /**< Which BLAS library to use for BLAS ops */
-  
-  // GEMM params
-  dsaXBLASOperation trans_a; /**< operation op(A) that is non- or (conj.) transpose. */
-  dsaXBLASOperation trans_b; /**< operation op(B) that is non- or (conj.) transpose. */
-  int m;                     /**< number of rows of matrix op(A) and C. */
-  int n;                     /**< number of columns of matrix op(B) and C. */
-  int k;                     /**< number of columns of op(A) and rows of op(B). */
-  int lda;                   /**< leading dimension of two-dimensional array used to store the matrix A. */
-  int ldb;                   /**< leading dimension of two-dimensional array used to store matrix B. */
-  int ldc;                   /**< leading dimension of two-dimensional array used to store matrix C. */
-  long long int a_offset;    /**< position of the A array from which begin read/write. */
-  long long int b_offset;    /**< position of the B array from which begin read/write. */
-  long long int c_offset;    /**< position of the C array from which begin read/write. */
-  long long int a_stride;    /**< stride of the A array in strided(batched) mode */
-  long long int b_stride;    /**< stride of the B array in strided(batched) mode */
-  long long int c_stride;    /**< stride of the C array in strided(batched) mode */
-  std::complex<double> alpha;     /**< scalar used for multiplication. */
-  std::complex<double>  beta;     /**< scalar used for multiplication. If beta==0, C does not have to be a valid input. */
-  
-  // Common params
-  int batch_count;             /**< number of pointers contained in arrayA, arrayB and arrayC. */
-  dsaXBLASDataType data_type;   /**< Specifies if using S(C) or D(Z) BLAS type */
-  dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */
-  
-} dsaXBLASParam;
-
-// Structure that carries BLAS parameters
-typedef struct dsaXCorrParam_s {  
-  size_t struct_size;        /**< Size of this struct in bytes.  Used to ensure that the host application and DSA see the same struct*/
-  
-  dsaXBLASLib blas_lib;         /**< Which BLAS library to use for BLAS ops */
-  dsaXBLASDataType data_type;   /**< Specifies if using S(C) or D(Z) BLAS type */
-  dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */
-  
-} dsaXCorrParam;
-
-void printDsaXBLASParam(const dsaXBLASParam param);
-
 // required to prevent overflow in corr matrix multiply
 #define halfFac 4
 
 // beam sep
 #define sep 1.0 // arcmin
 
-// Global timing and metrics structure for dsaX 
-typedef struct metrics_s {
-
-  // Mem copy times
-  double mem_copy_time_H2H;
-  double mem_copy_time_H2D;
-  double mem_copy_time_D2H;
-  double mem_copy_time_D2D;
-
-  // Mem copy size
-  double mem_copy_size_H2H;
-  double mem_copy_size_H2D;
-  double mem_copy_size_D2H;
-  double mem_copy_size_D2D;
-
-  // Compute
-  double compute_time;
-  double compute_flops;
-
-  // Initialisation
-  double initialisation_time;
-} metrics;
-  
-// define structure that carries around memory pointers
-// and timer for the correlator
-typedef struct dmem_corr_s {
-  
-  // initial data and streams
-  char *h_input;        // host input pointer
-  char *d_input, *d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
-  
-  // correlator pointers
-  // giant array for r and i: [NCHAN_PER_PACKET, 2 pol, NANTS_PROCESS, NPACKETS_PER_BLOCK *2 times]
-  void *d_r, *d_i; //half
-  // arrays for matrix multiply output: input [NANTS_PROCESS, NANTS_PROCESS]
-  void *d_outr, *d_outi, *d_tx_outr, *d_tx_outi; //half
-  // giant output array: [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex]
-  float *d_output;
-  
-} dmem_corr;
-
-typedef struct dmem_bf_s {
-
-  // beamformer pointers
-  char *h_input;        // host input pointer
-  char *d_input, *d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
-  char *d_big_input;
-  void *d_br, *d_bi; //half
-  void *weights_r, *weights_i; //weights: [arm, tactp, b] //half
-  void *d_bigbeam_r, *d_bigbeam_i; //output: [tc, b] //half
-  unsigned char *d_bigpower; //output: [b, tc]
-  float *d_scf; // scale factor per beam
-  float *d_chscf;
-  float *h_winp;
-  int *flagants, nflags;
-  float *h_freqs, *d_freqs;
-
-  // timing
-  float cp, prep, cubl, outp;
-  
-} dmem_bf;
-
-
-
 void dsaXInit(int device_ordinal = 0);
+void dsaXEnd();
 
 void inspectPackedData(char input, int i, bool non_zero = false);
 
-void dsaXCorrelator(void *output_data, void *input_data);
-
-void reorderCorrelatorOutput(dmem_corr *d);
-void reorderCorrelatorInput(dmem_corr *d);
+void dsaXCorrelator(void *output_data, void *input_data, dsaXCorrParam *param);
diff --git a/include/dsaX_enums.h b/include/dsaX_enums.h
index 4e8351f..9bffca0 100644
--- a/include/dsaX_enums.h
+++ b/include/dsaX_enums.h
@@ -27,17 +27,22 @@ typedef enum dsaXBLASLib_s {
   DSA_BLAS_LIB_MAGMA  = 1,
   DSA_BLAS_LIB_CUTLASS = 2,
   DSA_BLAS_LIB_TCC = 3, 
-  DSA_BLAS_LIB_OPENBLAS = 4, 
+  DSA_BLAS_LIB_OPENBLAS = 4,
+  DSA_BLAS_LIB_NATIVE = 5, 
   DSA_BLAS_LIB_INVALID = DSA_INVALID_ENUM  
 } dsaXBLASLib;
 
-typedef enum dsaXBLASDataLib_s {
+typedef enum dsaXBLASDataType_s {				
   DSA_BLAS_DATATYPE_H = 0, // Half
   DSA_BLAS_DATATYPE_S = 1, // Single
   DSA_BLAS_DATATYPE_D = 2, // Double
   DSA_BLAS_DATATYPE_HC = 3, // Complex(half)
   DSA_BLAS_DATATYPE_C = 4, // Complex(single)
   DSA_BLAS_DATATYPE_Z = 5, // Complex(double)
+  DSA_BLAS_DATATYPE_4b_REAL = 6, // 4b sized real
+  DSA_BLAS_DATATYPE_2b_REAL = 7, // 2b sized real
+  DSA_BLAS_DATATYPE_4b_COMPLEX = 8, // Char sized complex (4b,4b)
+  DSA_BLAS_DATATYPE_2b_COMPLEX = 9, // 4b sized (2b,2b)  
   DSA_BLAS_DATATYPE_INVALID = DSA_INVALID_ENUM
 } dsaXBLASDataType;
 
diff --git a/include/dsaX_ftd.h b/include/dsaX_ftd.h
index 47b562e..9c35043 100644
--- a/include/dsaX_ftd.h
+++ b/include/dsaX_ftd.h
@@ -1,5 +1,79 @@
 #pragma once
 
-#include "dsaX.h"
+//#include "dsaX_def.h"
+#include "dsaX_enums.h"
+#include "dsaX_params.h"
+
+// define structures that carry around memory pointers
+// and metrics.
+// DMH: make a base and inherit into corr and bf
+typedef struct dmem_corr_s {
+  
+  // initial data and streams
+  char *h_input;        // host input pointer
+  char *d_input, *d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
+  
+  // correlator pointers
+  // giant array for r and i: [NCHAN_PER_PACKET, 2 pol, NANTS_PROCESS, NPACKETS_PER_BLOCK *2 times]
+  void *d_r, *d_i; //half
+  // arrays for matrix multiply output: input [NANTS_PROCESS, NANTS_PROCESS]
+  void *d_outr, *d_outi, *d_tx_outr, *d_tx_outi; //half
+  // giant output array: [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex]
+  float *d_output;
+
+  metrics metric_data;
+  
+} dmem_corr;
+
+typedef struct dmem_bf_s {
+
+  // beamformer pointers
+  char *h_input;        // host input pointer
+  char *d_input, *d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
+  char *d_big_input;
+  void *d_br, *d_bi; //half
+  void *weights_r, *weights_i; //weights: [arm, tactp, b] //half
+  void *d_bigbeam_r, *d_bigbeam_i; //output: [tc, b] //half
+  unsigned char *d_bigpower; //output: [b, tc]
+  float *d_scf; // scale factor per beam
+  float *d_chscf;
+  float *h_winp;
+  int *flagants, nflags;
+  float *h_freqs, *d_freqs;
+
+  // timing (old)
+  float cp, prep, cubl, outp;
+  metrics metric_data;
+  
+} dmem_bf;
 
 void dcorrelator(dmem_corr *d);
+
+class Correlator {
+  
+private:
+protected:
+  
+  dmem_corr d;  
+  dsaXCorrParam corr_param;
+  dsaXBLASParam blas_param;
+  
+public:
+  
+  // Constructor
+  // Initialise device memory if CUDA enabled
+  // make host memory if CPU
+  Correlator(const dsaXCorrParam *corr_param);
+
+  // Compute the FX correlator on input,
+  // place result in output.
+  void compute(void *output, void *input);
+  
+  ~Correlator();  
+};
+
+void destroyDsaXCorrDeviceMemory(dmem_corr *d);
+void initDsaXCorrDeviceMemory(dmem_corr *d);
+
+void reorderCorrelatorOutput(dmem_corr *d);
+void reorderCorrelatorInput(dmem_corr *d);
diff --git a/include/dsaX_interface.h b/include/dsaX_interface.h
index 18ed9f0..a98215e 100644
--- a/include/dsaX_interface.h
+++ b/include/dsaX_interface.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <vector>
+#include "dsaX.h"
 
 // DMH: decorate these with Doxygen
 void dsaXCorrelator(void *input_data, void *output_data);
diff --git a/include/dsaX_params.h b/include/dsaX_params.h
new file mode 100644
index 0000000..bf5f455
--- /dev/null
+++ b/include/dsaX_params.h
@@ -0,0 +1,83 @@
+#pragma once
+
+#include <complex>
+
+#include "dsaX_enums.h"
+
+// Structure that carries BLAS parameters
+// This should be able to communicate to all
+// backend choices of BLAS library
+typedef struct dsaXBLASParam_s {  
+  size_t struct_size; /**< Size of this struct in bytes.  Used to ensure that the host application and DSA see the same struct*/
+  
+  dsaXBLASType blas_type;    /**< Type of BLAS computation to perform */
+
+  dsaXBLASLib blas_lib;      /**< Which BLAS library to use for BLAS ops */
+  
+  // GEMM params
+  dsaXBLASOperation trans_a; /**< operation op(A) that is non- or (conj.) transpose. */
+  dsaXBLASOperation trans_b; /**< operation op(B) that is non- or (conj.) transpose. */
+  int m;                     /**< number of rows of matrix op(A) and C. */
+  int n;                     /**< number of columns of matrix op(B) and C. */
+  int k;                     /**< number of columns of op(A) and rows of op(B). */
+  int lda;                   /**< leading dimension of two-dimensional array used to store the matrix A. */
+  int ldb;                   /**< leading dimension of two-dimensional array used to store matrix B. */
+  int ldc;                   /**< leading dimension of two-dimensional array used to store matrix C. */
+  long long int a_offset;    /**< position of the A array from which begin read/write. */
+  long long int b_offset;    /**< position of the B array from which begin read/write. */
+  long long int c_offset;    /**< position of the C array from which begin read/write. */
+  long long int a_stride;    /**< stride of the A array in strided(batched) mode */
+  long long int b_stride;    /**< stride of the B array in strided(batched) mode */
+  long long int c_stride;    /**< stride of the C array in strided(batched) mode */
+  std::complex<double> alpha;     /**< scalar used for multiplication. */
+  std::complex<double>  beta;     /**< scalar used for multiplication. If beta==0, C does not have to be a valid input. */
+  
+  // Common params
+  int batch_count;             /**< number of pointers contained in arrayA, arrayB and arrayC. */
+  dsaXBLASDataType data_type;   /**< Specifies if using S(C) or D(Z) BLAS type */
+  dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */
+  
+} dsaXBLASParam;
+
+// Structure that carries Correlator class parameters
+typedef struct dsaXCorrParam_s {  
+  size_t struct_size;        /**< Size of this struct in bytes.  Used to ensure that the host application and DSA see the same struct*/
+  
+  dsaXBLASLib blas_lib;         /**< Which BLAS library to use for BLAS ops */
+  dsaXBLASDataType data_type;   /**< Specifies if using S(C) or D(Z) BLAS type */
+  dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */
+  
+} dsaXCorrParam;
+
+// Global timing and metrics structure for dsaX 
+typedef struct metrics_s {
+
+  // Mem copy times
+  double mem_copy_time_H2H;
+  double mem_copy_time_H2D;
+  double mem_copy_time_D2H;
+  double mem_copy_time_D2D;
+
+  // Mem copy size
+  double mem_copy_size_H2H;
+  double mem_copy_size_H2D;
+  double mem_copy_size_D2H;
+  double mem_copy_size_D2D;
+
+  // Compute
+  double compute_time;
+  double compute_flops;
+
+  // Initialisation
+  double initialisation_time;
+} metrics;
+
+// Parameter struct helper functions for user
+const char *getBLASLibString(dsaXBLASLib lib);
+const char *getBLASDataTypeString(dsaXBLASDataType type);
+const char *getBLASDataOrderString(dsaXBLASDataOrder order);
+void printDsaXBLASParam(const dsaXBLASParam param);
+void printDsaXCorrParam(const dsaXCorrParam param);
+
+// Create params
+dsaXCorrParam newDsaXCorrParam(void);
diff --git a/include/dsaX_utils.h b/include/dsaX_utils.h
index f2dbc0c..fa22abe 100644
--- a/include/dsaX_utils.h
+++ b/include/dsaX_utils.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "dsaX.h"
+#include "dsaX_params.h"
 
 void dsaXmemset(void *array, int ch, size_t n);
 void dsaXmemcpy(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind);
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index aaacfa5..f885512 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -19,6 +19,7 @@ set(DSAX_OBJS
   dsaX_correlator.cpp
   dsaX_interface.cpp
   dsaX_utils.cpp
+  dsaX_params.cpp
   dsaX_psrdada_utils.cpp
   )
 
@@ -47,41 +48,41 @@ if(GITVERSION)
 endif()
 mark_as_advanced(DSAX_GITDIR)
 
-# generate a cmake object library for all cpp files first                                                                                                                                                           
+# generate a cmake object library for all cpp files first
 add_library(dsax_cpp OBJECT ${DSAX_OBJS})
 
 if(DSA_XENGINE_BUILD_SHAREDLIB)
   set_target_properties(dsax_cpp PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
-  add_library(dsax SHARED)
+  add_library(dsaX SHARED)
 else()
-  add_library(dsax STATIC)
+  add_library(dsaX STATIC)
 endif()
-add_library(DSA_XENGINE::dsax ALIAS dsax)
+add_library(DSA_XENGINE::dsaX ALIAS dsaX)
 
-# make one library                                                                                                                                                                                                  
-target_sources(dsax PRIVATE $<TARGET_OBJECTS:dsax_cpp> ${DSAX_CU_OBJS})
+# make one library
+target_sources(dsaX PRIVATE $<TARGET_OBJECTS:dsax_cpp> ${DSAX_CU_OBJS})
 
 if(CUDAToolkit_FOUND)
-  target_link_libraries(dsax INTERFACE CUDA::cudart_static ${CUDA_cublas_LIBRARY})
+  target_link_libraries(dsaX INTERFACE CUDA::cudart_static ${CUDA_cublas_LIBRARY})
 endif()
 
 if(DSA_XENGINE_ENABLE_PSRDADA)
   include_directories(${PSRDada_SOURCE_DIR}/src)
   set(PSRDada_LIB ${PSRDada_BINARY_DIR}/src/libpsrdada.so)
-  target_link_libraries(dsax PUBLIC ${PSRDada_LIB})
+  target_link_libraries(dsaX PUBLIC ${PSRDada_LIB})
 endif()
 
 if(DSA_XENGINE_ENABLE_XGPU) 
   include_directories(${xGPU_SOURCE_DIR}/src)
   set(XGPU_LIB ${xGPU_BINARY_DIR}/src/libxgpu.a)
-  target_link_libraries(dsax PUBLIC ${XGPU_LIB})
+  target_link_libraries(dsaX PUBLIC ${XGPU_LIB})
 endif()
 
 if(DSA_XENGINE_ENABLE_CUTLASS) 
   include_directories(${NvidiaCutlass_DIR}/../../../include)
   include_directories(${NvidiaCutlass_DIR}/../../../include/cutlass/util)
   set(NvidiaCutlass_LIB ${NvidiaCutlass_DIR}/../../../lib64/libcutlass.so)
-  target_link_libraries(dsax PUBLIC ${NvidiaCutlass_LIB})
+  target_link_libraries(dsaX PUBLIC ${NvidiaCutlass_LIB})
   
   # Some simple CUTLASS examples to test linking/benching
   #------------------------------------------------------
@@ -107,7 +108,7 @@ endif()
 #-----------------------------
 install(TARGETS
   # cmake-format: sortable
-  dsax
+  dsaX
   LIBRARY DESTINATION
   lib
   )
diff --git a/src/dsaX_correlator.cpp b/src/dsaX_correlator.cpp
index 2d179d1..fecc184 100644
--- a/src/dsaX_correlator.cpp
+++ b/src/dsaX_correlator.cpp
@@ -10,15 +10,115 @@ Workflow is similar for BF and corr applications
 
 #include "dsaX_def.h"
 #include "dsaX.h"
+#include "dsaX_ftd.h"
 #include "dsaX_blas_interface.h"
 #include "dsaX_utils.h"
 #include "dsaX_psrdada_utils.h"
 
+Correlator::Correlator(const dsaXCorrParam *param) {
+  // Set up a correlator: copy the caller's parameters, initialise device memory, fix the GEMM settings.
+  // Transfer passed param to internal objects (param is dereferenced unchecked; it must not be null)
+  corr_param = *param;
+  //printDsaXCorrParam(corr_param);
+
+  // Select back end BLAS engine 
+  blas_param.struct_size = sizeof(blas_param);
+  blas_param.blas_type = DSA_BLAS_GEMM;
+  blas_param.blas_lib = corr_param.blas_lib;
+
+  // Initialise device memory
+  initDsaXCorrDeviceMemory(&d);
+  
+  // gemm settings
+  // input: [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS]
+  // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS]
+#if defined OLD_BLAS
+  //std::cout << "Old params" << std::endl;  
+  blas_param.data_order = DSA_BLAS_DATAORDER_COL;
+  blas_param.trans_a = DSA_BLAS_OP_A;
+  blas_param.trans_b = DSA_BLAS_OP_T;
+  blas_param.m = NANTS;
+  blas_param.n = NANTS;
+  blas_param.k = NPACKETS_PER_BLOCK/halfFac;
+  blas_param.alpha = 1.0;
+  blas_param.lda = blas_param.m;
+  blas_param.ldb = blas_param.n;
+  blas_param.beta = 0.;
+  blas_param.ldc = blas_param.m;
+  blas_param.a_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;
+  blas_param.b_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;
+  blas_param.c_stride = NANTS*NANTS;
+  blas_param.batch_count = NCHAN_PER_PACKET*2*2*halfFac;
+  blas_param.a_offset = 0;
+  blas_param.b_offset = 0;
+  blas_param.c_offset = 0;
+#else
+  //std::cout << "My params" << std::endl;
+  blas_param.data_order = DSA_BLAS_DATAORDER_ROW;
+  blas_param.trans_a = DSA_BLAS_OP_C;
+  blas_param.trans_b = DSA_BLAS_OP_N;
+  blas_param.m = NANTS;
+  blas_param.n = NANTS;
+  blas_param.k = NPACKETS_PER_BLOCK/halfFac;
+  blas_param.alpha = 1.0;
+  blas_param.lda = blas_param.m;
+  blas_param.ldb = blas_param.n;
+  blas_param.beta = 0.;
+  blas_param.ldc = blas_param.m;
+  blas_param.a_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;
+  blas_param.b_stride = NPACKETS_PER_BLOCK*NANTS/halfFac;
+  blas_param.c_stride = NANTS*NANTS;
+  blas_param.batch_count = NCHAN_PER_PACKET*2*2*halfFac;
+  blas_param.a_offset = 0;
+  blas_param.b_offset = 0;
+  blas_param.c_offset = 0;
+#endif
+
+  // Swap A and B if in row order
+  if (blas_param.data_order == DSA_BLAS_DATAORDER_ROW) {
+    std::swap(blas_param.m, blas_param.n);
+    std::swap(blas_param.lda, blas_param.ldb);
+    std::swap(blas_param.trans_a, blas_param.trans_b);
+    std::swap(blas_param.a_offset, blas_param.b_offset);
+    std::swap(blas_param.a_stride, blas_param.b_stride);
+    // The operand pointers themselves need no swap: compute() passes the same
+    // buffers (d.d_r, d.d_i) for both A and B.
+  }  
+}
+
+Correlator::~Correlator() {
+  destroyDsaXCorrDeviceMemory(&d);
+}
+
+void Correlator::compute(void *output, void *input) {
+  // Run one correlation: copy input to the device, reorder, batched GEMM, reorder, copy the result to output.
+  // zero out output arrays
+  dsaXmemset(d.d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short
+  dsaXmemset(d.d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short
+  dsaXmemset(d.d_output, 0, NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float));
+  
+  // copy to device
+  dsaXmemcpy(d.d_input, input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2, dsaXMemcpyHostToDevice);
+  
+  // reorder input into real and imaginary arrays of half
+  reorderCorrInput(&d);
+    
+  // Perform GEMM according to back end configuration (A and B are the same real/imag buffers)
+  dsaXHgemmStridedBatched(d.d_r, d.d_i, d.d_r, d.d_i, d.d_outr, d.d_outi, blas_param);
+
+  // reorder output data
+  reorderCorrOutput(&d);
+  // NOTE(review): the copy below is sized with NBASE while the d_output memset above uses NANTS*NANTS - confirm both fit the allocation
+  // Pass result back to host
+  dsaXmemcpy(output, d.d_output, NBASE*NCHAN_PER_PACKET*2*2*4, dsaXMemcpyDeviceToHost);  
+}
+
+ 
 // correlator function
-// workflow: copy to device, reorder, stridedBatchedGemm, reorder
-// DMH CUDA references excised.
+// workflow: copy to device, reorder, stridedBatchedGemm, reorder, copy back to host
+// DMH: CUDA references excised. Make me a class
 void dcorrelator(dmem_corr *d) {
-  
+    
   // zero out output arrays
   dsaXmemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short
   dsaXmemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short
@@ -39,7 +139,7 @@ void dcorrelator(dmem_corr *d) {
   // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS]
 
 #if defined OLD_BLAS
-  std::cout << "Old params" << std::endl;
+  //std::cout << "Old params" << std::endl;
   
   blas_param.data_order = DSA_BLAS_DATAORDER_COL;
   blas_param.trans_a = DSA_BLAS_OP_A;
@@ -60,7 +160,7 @@ void dcorrelator(dmem_corr *d) {
   blas_param.b_offset = 0;
   blas_param.c_offset = 0;
 #else
-  std::cout << "My params" << std::endl;
+  //std::cout << "My params" << std::endl;
   
   blas_param.data_order = DSA_BLAS_DATAORDER_ROW;
   blas_param.trans_a = DSA_BLAS_OP_C;
@@ -94,7 +194,7 @@ void dcorrelator(dmem_corr *d) {
   }  
 
   
-  printDsaXBLASParam(blas_param);
+  //printDsaXBLASParam(blas_param);
   
   // DMH: fix me
   blas_param.blas_lib = DSA_BLAS_LIB_CUBLAS;
diff --git a/src/dsaX_cublas_interface.cu b/src/dsaX_cublas_interface.cu
index 597cfbd..0bffaea 100644
--- a/src/dsaX_cublas_interface.cu
+++ b/src/dsaX_cublas_interface.cu
@@ -1,5 +1,7 @@
 #include <iostream>
+
 #include "dsaX.h"
+#include "dsaX_params.h"
 #include "dsaX_cuda_headers.h"
 
 using namespace std;
diff --git a/src/dsaX_cuda_interface.cu b/src/dsaX_cuda_interface.cu
index 0046b1e..ec54675 100644
--- a/src/dsaX_cuda_interface.cu
+++ b/src/dsaX_cuda_interface.cu
@@ -319,6 +319,10 @@ void dsaXmemsetCuda(void *array, int ch, size_t n){
   cudaMemset(array, ch, n);
 }
 
+void dsaXDeviceSynchronizeCuda() {
+  cudaDeviceSynchronize();
+}
+
 void dsaXmemcpyCuda(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind){
   cudaError error = cudaSuccess;
   switch(kind) {
@@ -340,7 +344,3 @@ void dsaXmemcpyCuda(void *array_out, void *array_in, size_t n, dsaXMemcpyKind ki
   if(error != cudaSuccess) cudaGetLastError();
 }
 
-void dsaXDeviceSynchronizeCuda() {
-  cudaDeviceSynchronize();
-}
-
diff --git a/src/dsaX_interface.cpp b/src/dsaX_interface.cpp
index 0c88ee0..6358df1 100644
--- a/src/dsaX_interface.cpp
+++ b/src/dsaX_interface.cpp
@@ -1,42 +1,33 @@
 #include <iostream>
 #include <vector>
 #include <cstring>
+#include <string>
 
+#include "dsaX_params.h"
 #include "dsaX_cuda_interface.h"
 #include "dsaX_utils.h"
 #include "dsaX_ftd.h"
 
 using namespace std;
 
-void printDsaXBLASParam(const dsaXBLASParam param) {
-
-  cout << "struct_size = " << param.struct_size << endl;
-  cout << "blas_type = " << param.blas_type << endl;
-  cout << "blas_lib = " << param.blas_lib << endl;
-  cout << "data_order = " << param.data_order << endl;
-  cout << "trans_a = " << param.trans_a << endl;
-  cout << "trans_b = " << param.trans_b << endl;
-  cout << "m = " << param.m << endl;
-  cout << "n = " << param.n << endl;
-  cout << "k = " << param.k << endl;
-  cout << "lda = " << param.lda << endl;
-  cout << "ldb = " << param.ldb << endl;
-  cout << "ldc = " << param.ldc << endl;
-  cout << "a_offset = " << param.a_offset << endl;
-  cout << "b_offset = " << param.b_offset << endl;
-  cout << "c_offset = " << param.c_offset << endl;
-  cout << "a_stride = " << param.a_stride << endl;
-  cout << "b_stride = " << param.b_stride << endl;
-  cout << "c_stride = " << param.c_stride << endl;
-  cout << "alpha = " << param.alpha << endl;
-  cout << "bets = " << param.alpha << endl;
-  cout << "batch_count = " << param.batch_count << endl;  
-}
 
 void dsaXInit(int dev){
 #if DSA_XENGINE_TARGET_CUDA
   dsaXInitCuda(dev);
 #endif
+
+  std::cout << " --- Starting dsaX with configuration (defined in dsaX_def.h) --- " << endl;
+  std::cout << "NPACKETS_PER_BLOCK = " << NPACKETS_PER_BLOCK << std::endl;
+  std::cout << "NCHAN = " << NCHAN << std::endl;
+  std::cout << "NCHAN_PER_PACKET = " << NCHAN_PER_PACKET << std::endl;
+  std::cout << "NPOL = " << NPOL << std::endl;
+  std::cout << "NARM = " << 3 << std::endl;
+  std::cout << " --- End dsaX configuration --- " << endl;
+  //DMH: Add more (ask Vikram)
+}
+
+void dsaXEnd() {
+  // output metrics
 }
 
 void inspectPackedData(char input, int i, bool non_zeros) {
@@ -51,10 +42,10 @@ void inspectPackedData(char input, int i, bool non_zeros) {
   }
 }
 
-void dsaXCorrelator(void *output_data, void *input_data) {  
+void dsaXCorrelator(void *output_data, void *input_data, dsaXCorrParam *param) {  
 
   dmem_corr d;
-#if DSA_XENGINE_TARGET_CUDA
+#if DSA_XENGINE_TARGET_CUDA  
   initializeCorrCudaMemory(&d);
   d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
   memcpy(d.h_input, (char*)input_data, sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
diff --git a/src/dsaX_magma_interface.cu b/src/dsaX_magma_interface.cu
index 14a8f4f..eabfdbf 100644
--- a/src/dsaX_magma_interface.cu
+++ b/src/dsaX_magma_interface.cu
@@ -1,5 +1,7 @@
 #include <iostream>
+
 #include "dsaX.h"
+#include "dsaX_params.h"
 #include "dsaX_cuda_headers.h"
 #include "dsaX_magma_headers.h"
 
diff --git a/src/dsaX_params.cpp b/src/dsaX_params.cpp
new file mode 100644
index 0000000..7ed0d5b
--- /dev/null
+++ b/src/dsaX_params.cpp
@@ -0,0 +1,102 @@
+#include <iostream>
+
+#include "dsaX_params.h"
+
+using namespace std;
+
+const char *getBLASLibString(dsaXBLASLib lib)
+{
+  const char *ret;
+  // Map the back-end enumerator to its display name; anything not listed prints as "unknown".
+  switch (lib) {
+  case DSA_BLAS_LIB_CUBLAS: ret = "CUBLAS"; break;
+  case DSA_BLAS_LIB_MAGMA: ret = "MAGMA"; break;
+  case DSA_BLAS_LIB_CUTLASS: ret = "CUTLASS"; break;
+  case DSA_BLAS_LIB_OPENBLAS: ret = "OPENBLAS"; break;
+  case DSA_BLAS_LIB_NATIVE: ret = "NATIVE"; break;
+  default: ret = "unknown"; break;
+  }
+  
+  return ret;
+}
+
+const char *getBLASDataTypeString(dsaXBLASDataType type)
+{
+  const char *ret;
+
+  switch (type) {
+  case DSA_BLAS_DATATYPE_H: ret = "Half"; break;
+  case DSA_BLAS_DATATYPE_S: ret = "Single"; break;
+  case DSA_BLAS_DATATYPE_D: ret = "Double"; break;
+  case DSA_BLAS_DATATYPE_HC: ret = "Complex(half)"; break;
+  case DSA_BLAS_DATATYPE_C: ret = "Complex(single)"; break;
+  case DSA_BLAS_DATATYPE_Z: ret = "Complex(double)"; break;
+  case DSA_BLAS_DATATYPE_4b_REAL: ret = "4b sized real"; break;
+  case DSA_BLAS_DATATYPE_2b_REAL: ret = "2b sized real"; break;
+  case DSA_BLAS_DATATYPE_4b_COMPLEX: ret = "Char sized complex (4b,4b)"; break;
+  case DSA_BLAS_DATATYPE_2b_COMPLEX: ret = "4b sized (2b,2b)"; break;  
+  default: ret = "unknown"; break;
+  }
+
+  return ret;
+}
+
+const char *getBLASDataOrderString(dsaXBLASDataOrder order)
+{
+  const char *ret;
+
+  switch (order) {
+  case DSA_BLAS_DATAORDER_ROW: ret = "Row order"; break;
+  case DSA_BLAS_DATAORDER_COL: ret = "Column order"; break;
+  default: ret = "unknown"; break;
+  }
+  
+  return ret;
+}
+
+void printDsaXCorrParam(const dsaXCorrParam param) {
+
+  cout << "--- dsaXCorrParam begin ---" << endl;
+  cout << "struct_size = " << param.struct_size << endl;
+  cout << "blas_lib = " << getBLASLibString(param.blas_lib) << endl;
+  cout << "data_type = " << getBLASDataTypeString(param.data_type) << endl;
+  cout << "data_order = " << getBLASDataOrderString(param.data_order) << endl;
+  cout << " --- dsaXCorrParam end ---" << endl;
+}
+
+void printDsaXBLASParam(const dsaXBLASParam param) {
+
+  cout << " --- dsaXBLASParam begin ---" << endl;
+  cout << "struct_size = " << param.struct_size << endl;
+  cout << "blas_type = " << param.blas_type << endl;
+  cout << "blas_lib = " << param.blas_lib << endl;
+  cout << "data_type = " << param.data_type << endl;
+  cout << "data_order = " << param.data_order << endl;
+  cout << "trans_a = " << param.trans_a << endl;
+  cout << "trans_b = " << param.trans_b << endl;
+  cout << "m = " << param.m << endl;
+  cout << "n = " << param.n << endl;
+  cout << "k = " << param.k << endl;
+  cout << "lda = " << param.lda << endl;
+  cout << "ldb = " << param.ldb << endl;
+  cout << "ldc = " << param.ldc << endl;
+  cout << "a_offset = " << param.a_offset << endl;
+  cout << "b_offset = " << param.b_offset << endl;
+  cout << "c_offset = " << param.c_offset << endl;
+  cout << "a_stride = " << param.a_stride << endl;
+  cout << "b_stride = " << param.b_stride << endl;
+  cout << "c_stride = " << param.c_stride << endl;
+  cout << "alpha = " << param.alpha << endl;
+  cout << "beta = " << param.beta << endl;
+  cout << "batch_count = " << param.batch_count << endl;
+  cout << " --- dsaXBLASParam end ---" << endl;
+}
+
+dsaXCorrParam newDsaXCorrParam(void) {
+  dsaXCorrParam new_param;
+  new_param.struct_size = sizeof(new_param);
+  new_param.blas_lib = DSA_BLAS_LIB_INVALID;
+  new_param.data_type = DSA_BLAS_DATATYPE_INVALID;
+  new_param.data_order = DSA_BLAS_DATAORDER_INVALID;
+  return new_param;
+}
diff --git a/src/dsaX_utils.cpp b/src/dsaX_utils.cpp
index 54e849a..3819e98 100644
--- a/src/dsaX_utils.cpp
+++ b/src/dsaX_utils.cpp
@@ -1,7 +1,12 @@
+#include <iostream>
+
 #include "dsaX_utils.h"
 #include "dsaX_enums.h"
+#include "dsaX_params.h"
 #include "dsaX_cuda_interface.h"
 
+using namespace std;
+
 void dsaXmemset(void *array, int ch, size_t n){
 #ifdef DSA_XENGINE_TARGET_CUDA
   dsaXmemsetCuda(array, ch, n);
@@ -21,9 +26,27 @@ void dsaXmemcpy(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind){
 
 void dsaXDeviceSynchronize() {
 #ifdef DSA_XENGINE_TARGET_CUDA
-  // Perform host to device memcopy on data
+  // Synchronise the device
   dsaXDeviceSynchronizeCuda();
 #else  
   // NO OP
 #endif
 }
+
+void initDsaXCorrDeviceMemory(dmem_corr *d) { // initialise the correlator's device memory through the compiled-in back end
+#ifdef DSA_XENGINE_TARGET_CUDA
+  initializeCorrCudaMemory(d);
+#else  
+  cout << "dsaX Error: Not implemented." << endl;
+  exit(1); // non-zero status: this is an error path, so it must not report success
+#endif  
+}
+
+void destroyDsaXCorrDeviceMemory(dmem_corr *d) { // release the correlator's device memory through the compiled-in back end
+#ifdef DSA_XENGINE_TARGET_CUDA
+  deallocateCorrCudaMemory(d);
+#else  
+  cout << "dsaX Error: Not implemented." << endl;
+  exit(1); // non-zero status: this is an error path, so it must not report success
+#endif  
+}
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 3722671..4a93c15 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -5,4 +5,4 @@ include_directories(${CLI11_SOURCE_DIR}/include/CLI)
 add_library(dsaX_tests command_line_params.cpp)
 
 add_executable(dsaX_correlator_test dsaX_correlator_test.cpp)
-target_link_libraries(dsaX_correlator_test dsax dsaX_tests)
+target_link_libraries(dsaX_correlator_test PRIVATE dsaX dsaX_tests)
diff --git a/tests/command_line_params.cpp b/tests/command_line_params.cpp
index fa48729..69331b3 100644
--- a/tests/command_line_params.cpp
+++ b/tests/command_line_params.cpp
@@ -14,7 +14,8 @@ bool run_correlator = false;
 double start_frequency = 1498.75;
 
 // Test file
-std::string test_filename;
+std::string input_filename = "input.dat";
+std::string output_filename = "output.dat";
 int n_channels = 384;
 int n_antennae = 63;
 int n_pol = 2;
@@ -34,7 +35,8 @@ std::shared_ptr<dsaXApp> make_app(std::string app_description, std::string app_n
   dsaX_app->add_option("--start-frequency", start_frequency, "start frequency (assumes 1498.75)");
 
   // Input file options
-  dsaX_app->add_option("--test-filename", test_filename, "Name of file on which to run tests");
+  dsaX_app->add_option("--input-filename", input_filename, "Name of file on which to run tests");
+  dsaX_app->add_option("--output-filename", output_filename, "Name of file on which to write results");
   dsaX_app->add_option("--n-channels", n_channels, "Number of frequency channels [default 384]");
   dsaX_app->add_option("--n-antennae", n_antennae, "Number of antennae [default 63]");
   dsaX_app->add_option("--n-pol", n_pol, "Number of polarizations [default 2]");
diff --git a/tests/command_line_params.h b/tests/command_line_params.h
new file mode 100644
index 0000000..06e67ac
--- /dev/null
+++ b/tests/command_line_params.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <CLI.hpp>
+#include <dsaX.h>
+
+class dsaXApp : public CLI::App {
+  
+public:
+  dsaXApp(std::string app_description = "", std::string app_name = "") : CLI::App(app_description, app_name) {};
+  
+  virtual ~dsaXApp() {};
+};
+
+std::shared_ptr<dsaXApp> make_app(std::string app_description = "dsaX internal test", std::string app_name = "");
+
+// General 
+extern int core;
+extern bool debug;
+
+// Data block HDU keys 
+extern key_t in_key;
+extern key_t out_key;
+
+// Test mode
+extern bool run_beamformer;
+extern bool run_correlator;
+extern double start_frequency;
+
+// Test file
+extern std::string input_filename;
+extern std::string output_filename;
+extern int n_channels;
+extern int n_antennae;
+extern int n_pol;
+extern int n_times;
diff --git a/tests/dsaX_correlator_test.cpp b/tests/dsaX_correlator_test.cpp
index bea7afa..2ce7390 100644
--- a/tests/dsaX_correlator_test.cpp
+++ b/tests/dsaX_correlator_test.cpp
@@ -93,35 +93,16 @@ int main(int argc, char **argv) {
     return app->exit(e);
   }
   
-  // command line arguments
-  int device_ordinal = 0;
-  
-  std::cout << "NPACKETS_PER_BLOCK = " << NPACKETS_PER_BLOCK << std::endl;
-  std::cout << "NCHAN = " << NCHAN << std::endl;
-  std::cout << "NCHAN_PER_PACKET = " << NCHAN_PER_PACKET << std::endl;
-  std::cout << "NPOL = " << NPOL << std::endl;
-  std::cout << "NARM = " << 2 << std::endl;
-  unsigned long long size = sizeof(char);
-  size *= NPACKETS_PER_BLOCK;
-  size *= NANTS;
-  size *= NCHAN_PER_PACKET;
-  size *= NPOL;
-  size *= NCOMPLEX;
-  std::cout << "(bytes) char size * NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*NPOL*NCOMPLEX = " << size << std::endl;
-  std::cout << "Expected size of data array = " << (unsigned long long)(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*NPOL*NCOMPLEX) << std::endl;
-  std::cout << "Expected size of input array = " << (unsigned long long)(sizeof(char)*4*NANTS*NCHAN_PER_PACKET*NPOL*NCOMPLEX) << std::endl;
-  
-  //dsaX_init();  
+  int device_ordinal = 0;    
   FILE *fin, *fout;
   uint64_t sz, output_size, in_block_size, rd_size;
   in_block_size = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2;
-  char * output_data, * o1;
   int nreps = 1, nchunks = 1;
 
   // read one block of input data  
   // get size of file
-  std::cout << "attempting to read file " << test_filename.c_str() << std::endl; 
-  fin=fopen(test_filename.c_str(), "rb");
+  std::cout << "attempting to read file " << input_filename.c_str() << std::endl; 
+  fin=fopen(input_filename.c_str(), "rb");
   fseek(fin, 0L, SEEK_END);
   sz = ftell(fin);
   rewind(fin);
@@ -136,9 +117,29 @@ int main(int argc, char **argv) {
     rd_size = sz;
   }
 
-  std::cout << "Creating char input_array of size " << sizeof(char)*in_block_size << std::endl;
+  // Start dsaX program
+  //---------------------------------------
+  dsaXInit(device_ordinal);
+  
+  // Create Correlator class instance.
+  dsaXCorrParam param = newDsaXCorrParam();
+  param.blas_lib = DSA_BLAS_LIB_CUBLAS;
+  param.data_type = DSA_BLAS_DATATYPE_4b_COMPLEX;
+  param.data_order = DSA_BLAS_DATAORDER_ROW;
+  printDsaXCorrParam(param);
+  auto correlator = new Correlator(&param);
+
+  output_size = NBASE*NCHAN_PER_PACKET*2*2*4;
+  std::cout << "Creating char output_array of size " << (1.0*sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2)/pow(1024,2) << " MB." << std::endl;
+  char *output_data = (char *)malloc(output_size);
+
+  std::cout << "Creating char input_array of size " << (1.0*sizeof(char)*in_block_size)/pow(1024,2) << " MB." << std::endl;
   char *input_data = (char *)malloc(in_block_size);
-
+  
+  std::cout << "Computing " << nreps << " repetitions of " << nchunks << " chunks of input data of size " << rd_size << " bytes." << endl;
+  std::cout << "Total input size = " << (1.0 * nreps * nchunks * rd_size)/pow(1024,3) << " GB." << endl;
+  std::cout << "Expected output size = " << (1.0 * nreps * nchunks * output_size)/pow(1024,3) << " GB." << endl;
+  
   // Loop over reps and chunks
   for (int reps = 0; reps<nreps; reps++) {
     for (int chunks = 0; chunks<nchunks; chunks++) {
@@ -147,27 +148,32 @@ int main(int argc, char **argv) {
       if (chunks>0) rewind(fin);
       fread(input_data + chunks*rd_size, rd_size, 1, fin);
 
-      std::cout << "Input peek " << std::endl;
+      //std::cout << "Input peek " << std::endl;
       //for (int i=0; i<8; i++) inspectPackedData(input_data[i], i);
-
-      std::cout << "Creating char output_array of size " << sizeof(char)*NBASE*NCHAN_PER_PACKET*2*2*4 << std::endl;
-      output_size = NBASE*NCHAN_PER_PACKET*2*2*4;
-      output_data = (char *)malloc(output_size);
       
       // run correlator and record output data
-      syslog(LOG_INFO,"run correlator");
-      dsaXCorrelator((void*)output_data, (void*)input_data);
+      //dsaXCorrelator((void*)output_data, (void*)input_data, &param);
+      correlator->compute((void*)output_data, (void*)input_data);
       
-      std::cout << "Output peek " << std::endl;
-      for(int i=0; i<output_size; i++) inspectPackedData(output_data[i], i, true);
+      //std::cout << "Output peek " << std::endl;
+      //for(int i=0; i<output_size; i++) inspectPackedData(output_data[i], i, true);
 
-      fout = fopen("output.dat","ab");
-      fwrite((unsigned char *)output_data,sizeof(unsigned char *),output_size,fout);
+      fout = fopen(output_filename.c_str(),"ab");
+      fwrite(output_data, sizeof(char), output_size, fout); // output_data was malloc'd with exactly output_size bytes
       fclose(fout);
       exit(0);
     }
   }
 
+  dsaXEnd();
+  // End dsaX program
+  //---------------------------------------
+
+  // free local data
+  free(input_data);
+  free(output_data);
+  return 0;
+  
   /*
   
   // Read data
@@ -209,11 +215,6 @@ int main(int argc, char **argv) {
   fwrite((float *)output_data, sizeof(float), NBASE*NCHAN_PER_PACKET*2*2, fout);
   fclose(fout);
   */
-    
-  // free
-  free(input_data);
-  free(output_data);
-  //dsaX_end();
-  
-  return 0;
+      
+
 }

From 41a8ee94857101432a6fcdb07cefb1630d80c9d3 Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Fri, 28 Jun 2024 22:24:14 -0700
Subject: [PATCH 26/30] remove timer download, include header only

---
 CMakeLists.txt | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a5a2333..440c6f9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -300,25 +300,6 @@ if(DSA_XENGINE_ENABLE_ZFP)
   endif()
 endif()
 
-# Get Timer dependency https://github.com/cpp-core/timer.git
-# Get timer dependency
-option(DSA_XENGINE_ENABLE_TIMER "Enable timer" ON)
-if(DSA_XENGINE_ENABLE_TIMER)
-  option(DSA_XENGINE_DOWNLOAD_TIMER "Download and build timer" ON)
-  if(DSA_XENGINE_DOWNLOAD_TIMER)
-    # Download, build and install
-    FetchContent_Declare(
-      TIMER
-      GIT_REPOSITORY https://github.com/cpp-core/timer.git
-      GIT_TAG main
-      )
-    FetchContent_MakeAvailable(TIMER)
-  else()
-    # Find and link to local install
-    find_package(TIMER REQUIRED)
-  endif()
-endif()
-
 # Add src, include, tests, and legacy
 add_subdirectory(src)
 add_subdirectory(include)

From fd273c365f707310ca519c2c338b76bb2030351f Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Thu, 4 Jul 2024 17:36:07 -0700
Subject: [PATCH 27/30] Implemented overlapping comms and compute for the
 Correlator class

---
 include/CMakeLists.txt          |   1 +
 include/dsaX.h                  |   8 +-
 include/dsaX_blas_interface.h   |   2 +-
 include/dsaX_cublas_interface.h |   2 +-
 include/dsaX_cuda_interface.h   |  29 +++-
 include/dsaX_cuda_kernels.h     |  50 ++++++-
 include/dsaX_enums.h            |   4 +
 include/dsaX_ftd.h              |  96 ++++++++++--
 include/dsaX_interface.h        |   8 +-
 include/dsaX_params.h           |  29 +---
 include/dsaX_utils.h            |   5 +-
 src/CMakeLists.txt              |   2 +
 src/dsaX_beamformer.cpp         |   2 +-
 src/dsaX_blas_interface.cpp     |   6 +-
 src/dsaX_correlator.cpp         | 155 +++++++++++++++-----
 src/dsaX_cublas_interface.cu    |  79 +++++++---
 src/dsaX_cuda_interface.cu      | 251 +++++++++++++++++++-------------
 src/dsaX_interface.cpp          | 115 ++++++++++-----
 src/dsaX_params.cpp             |  16 +-
 src/dsaX_utils.cpp              |  19 ++-
 tests/command_line_params.cpp   |  26 +++-
 tests/command_line_params.h     |   9 +-
 tests/dsaX_correlator_test.cpp  | 141 +++++++++++-------
 23 files changed, 718 insertions(+), 337 deletions(-)

diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
index e8ec2d6..58b1566 100644
--- a/include/CMakeLists.txt
+++ b/include/CMakeLists.txt
@@ -8,6 +8,7 @@ set(DSA_XENGINE_HEADERS
   dsaX_def.h
   dsaX_ftd.h  
   dsaX_cuda_interface.h
+  dsaX_cuda_handles.h
   dsaX_cuda_headers.h
   dsaX_capture.h
   dsaX_capture_manythread.h
diff --git a/include/dsaX.h b/include/dsaX.h
index cc3ff5c..eab6f75 100644
--- a/include/dsaX.h
+++ b/include/dsaX.h
@@ -9,7 +9,7 @@
 
 // Use manual transpose route
 // Uncomment to try new pure cuBLAS
-#define OLD_BLAS
+//#define OLD_BLAS
 
 // required to prevent overflow in corr matrix multiply
 #define halfFac 4
@@ -17,9 +17,13 @@
 // beam sep
 #define sep 1.0 // arcmin
 
-void dsaXInit(int device_ordinal = 0);
+void dsaXInit(int device_ordinal = -1);
 void dsaXEnd();
 
+//void dsaX
+
+void *dsaXHostRegister(size_t size);
+
 void inspectPackedData(char input, int i, bool non_zero = false);
 
 void dsaXCorrelator(void *output_data, void *input_data, dsaXCorrParam *param);
diff --git a/include/dsaX_blas_interface.h b/include/dsaX_blas_interface.h
index 49564b5..4c6edaf 100644
--- a/include/dsaX_blas_interface.h
+++ b/include/dsaX_blas_interface.h
@@ -2,4 +2,4 @@
 
 #include "dsaX_interface.h"
 
-void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param);
+void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param, int stream = 0);
diff --git a/include/dsaX_cublas_interface.h b/include/dsaX_cublas_interface.h
index 5aea5ef..f68eea3 100644
--- a/include/dsaX_cublas_interface.h
+++ b/include/dsaX_cublas_interface.h
@@ -1,4 +1,4 @@
 #pragma once
 #include "dsaX.h"
 
-void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param);
+void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param, int stream);
diff --git a/include/dsaX_cuda_interface.h b/include/dsaX_cuda_interface.h
index 54e2609..4ad2aed 100644
--- a/include/dsaX_cuda_interface.h
+++ b/include/dsaX_cuda_interface.h
@@ -7,26 +7,39 @@
 #include "dsaX.h"
 
 void dsaXInitCuda(int dev);
+void dsaXDestroyCuda(int dev);
 
-void initializeCorrCudaMemory(dmem_corr *d);
+void initBLASCuda();
+void destroyBLASCuda();
 
-void initializeBFCudaMemory(dmem_bf *d);
+void initStreamsCuda(unsigned int n);
+void destroyStreamsCuda();
 
-void deallocateCorrCudaMemory(dmem_corr *d);
+void promoteComplexCharToPlanarHalfCuda(corr_handle *d, unsigned int stream);
 
-void deallocateBFCudaMemory(dmem_bf *d);
+void initializeCorrCudaMemory(corr_handle *d, unsigned int n_streams);
+
+void initializeBFCudaMemory(bf_handle *d);
+
+void deallocateCorrCudaMemory(corr_handle *d);
+
+void deallocateBFCudaMemory(bf_handle *d);
 
 void dsaXmemsetCuda(void *array, int ch, size_t n);
 
-void dsaXmemcpyCuda(void *array_device, void *array_host, size_t n, dsaXMemcpyKind kind);
+void dsaXmemcpyCuda(void *array_device, void *array_host, size_t n, dsaXMemcpyKind kind, int stream);
+
+void *dsaXHostRegisterCuda(size_t size);
 
 void dsaXDeviceSynchronizeCuda();
 
-void reorderCorrOutputCuda(dmem_corr *d);
+void reorderCorrOutputCuda(corr_handle *d, int stream);
+
+void computeIndicesCuda(corr_handle *d);
 
-void reorderCorrInputCuda(dmem_corr *d);
+void reorderCorrInputCuda(corr_handle *d, int stream);
 
-void calcWeightsCuda(dmem_bf *d);
+void calcWeightsCuda(bf_handle *d);
 
 template <typename in_prec, typename out_prec> void transposeMatrixCuda(in_prec *idata, out_prec *odata);
 
diff --git a/include/dsaX_cuda_kernels.h b/include/dsaX_cuda_kernels.h
index 0c2cb7c..49e9ff0 100644
--- a/include/dsaX_cuda_kernels.h
+++ b/include/dsaX_cuda_kernels.h
@@ -2,11 +2,11 @@
 
 #include "dsaX_cuda_headers.h"
 
-__device__ void inspectPackedDataInKernel(char input, int i) {
+__global__ void inspectPackedDataInKernel(char input, int i) {
   float re = (float)((char)((   (unsigned char)(input) & (unsigned char)(15)  ) << 4) >> 4);
   float im = (float)((char)((   (unsigned char)(input) & (unsigned char)(240))) >> 4);
   
-  if(re != 0 || im != 0) printf("val[%d] = (%f,%f)\n", i, re, im);
+  if(re != 0 || im != 0) printf("K val[%d] = (%f,%f)\n", i, re, im);
 }
 
 // KERNELS
@@ -58,15 +58,20 @@ __global__ void corr_output_copy(half *outr, half *outi, float *output, int *ind
 
   float v1=0., v2=0.;
 
+  //if(idx<1) printf("output pre (%f, %f)\n", output[2*idx], output[2*idx+1]);
+  
   // Use CUDA casting intrinsic __half2float
   for (int i=0;i<halfFac;i++) {
     v1 += __half2float(outr[(4*iidx+pol)*halfFac+i])+__half2float(outr[(4*iidx+2+pol)*halfFac+i]);
     v2 += __half2float(outi[(4*iidx+pol)*halfFac+i])+__half2float(outi[(4*iidx+2+pol)*halfFac+i]);
+    //if(idx < 1) printf("real loop %d, (%f, %f)\n", i, __half2float(outr[(4*iidx+pol)*halfFac+i]), __half2float(outr[(4*iidx+2+pol)*halfFac+i]));
+    //if(idx < 1) printf("imag loop %d, (%f, %f)\n", i, __half2float(outi[(4*iidx+pol)*halfFac+i]), __half2float(outi[(4*iidx+2+pol)*halfFac+i]));
+    //if(idx < 1) printf("v1 = %f, v2 = %f\n", v1, v2);
   }
-
+  
   output[2*idx] = v1;
   output[2*idx+1] = v2;
-  
+  //if(idx<1) printf("output post (%f, %f)\n", output[2*idx], output[2*idx+1]);  
 }
 
 // transpose kernel
@@ -92,11 +97,42 @@ template <typename in_prec, typename out_prec> __global__ void transpose_matrix(
   y = blockIdx.x * 32 + threadIdx.y;
   width = gridDim.y * 32;
 
-  for (int j = 0; j < 32; j += 8)
-     odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
+  for (int j = 0; j < 32; j += 8) {
+    odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
+    //inspectPackedDataInKernel(odata[(y+j)*width + x], (y+j)*width + x);
+  }
+}
+
+// Tiled transpose kernel for matrices of half values. Despite the "_float" suffix, idata and
+// odata are half; only the shared staging tile is float. Tiles are 32x32, 32x8 threads per block:
+// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32)
+// here, width is the dimension of the fastest index
+__global__ void transpose_matrix_float(half * idata, half * odata) {
+  // Staging tile; the 33rd column is padding (presumably to avoid shared-memory bank conflicts).
+  __shared__ float tile[32][33];
+  // Coordinates of this thread's first element in the input matrix.
+  int x = blockIdx.x * 32 + threadIdx.x;
+  int y = blockIdx.y * 32 + threadIdx.y;
+  int width = gridDim.x * 32;
+
+  for (int j = 0; j < 32; j += 8) {
+    tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
+    //printf("K transpose_matrix_float_in[%d] =  %f\n", (y+j)*width + x, __half2float(idata[(y+j)*width + x]));
+  }
+  // Every load into the tile must finish before any thread reads it back transposed.
+  __syncthreads();
+
+  x = blockIdx.y * 32 + threadIdx.x;  // transpose block offset
+  y = blockIdx.x * 32 + threadIdx.y;
+  width = gridDim.y * 32;
 
+  for (int j = 0; j < 32; j += 8) {
+    odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];    
+    //printf("K transpose_matrix_float_out[%d] =  %f\n", (y+j)*width + x, __half2float(odata[(y+j)*width + x]));
+  }
 }
 
+
 // DMH: TUNABLE
 // transpose kernel
 // assume breakdown into tiles of 32x32, and run with 32x8 threads per block
@@ -126,6 +162,7 @@ __global__ void transpose_matrix_char(char * idata, char * odata) {
   for (int j = 0; j < blockDim.x; j += blockDim.y) {
     odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
     //odata[(y+j)*width + x] = tile[threadIdx.x + blockDim.x*(threadIdx.y + j)];
+    //inspectPackedDataInKernel(idata[(y+j)*width + x], (y+j)*width + x);
   }
 }
 
@@ -165,6 +202,7 @@ __global__ void promoteComplexCharToPlanarHalf(char *input, half *inr, half *ini
   // Cast to float and use CUDA intrinsic to cast to signed half
   ini[iidx] = __float2half((float)((char)((   (unsigned char)(input[iidx]) & (unsigned char)(240)  )) >> 4));
 
+  //good
   //if(__half2float(inr[iidx]) != 0 || __half2float(ini[iidx]) != 0) printf("corr_input_copy %i = (%f,%f)\n", iidx, __half2float(inr[iidx]), __half2float(ini[iidx]));
 }
 
diff --git a/include/dsaX_enums.h b/include/dsaX_enums.h
index 9bffca0..607d9d3 100644
--- a/include/dsaX_enums.h
+++ b/include/dsaX_enums.h
@@ -57,5 +57,9 @@ typedef enum dsaXMemcpyKind_s {
   dsaXMemcpyHostToDevice = 1,
   dsaXMemcpyDeviceToHost = 2,
   dsaXMemcpyDeviceToDevice = 3,
+  dsaXMemcpyHostToHostAsync = 4,
+  dsaXMemcpyHostToDeviceAsync = 5,
+  dsaXMemcpyDeviceToHostAsync = 6,
+  dsaXMemcpyDeviceToDeviceAsync = 7,
   dsaXMemcpyInvalid = DSA_INVALID_ENUM
 } dsaXMemcpyKind;
diff --git a/include/dsaX_ftd.h b/include/dsaX_ftd.h
index 9c35043..2f05432 100644
--- a/include/dsaX_ftd.h
+++ b/include/dsaX_ftd.h
@@ -1,18 +1,24 @@
 #pragma once
 
-//#include "dsaX_def.h"
 #include "dsaX_enums.h"
 #include "dsaX_params.h"
+#include "timer.h"
+
+using ms = std::chrono::microseconds;           // NOTE: despite the name, `ms` is microseconds, not milliseconds
+using hrc = std::chrono::high_resolution_clock; // clock type passed to the timer::Timer members below
 
 // define structures that carry around memory pointers
 // and metric.
 // DMH: make a base and inherit into corr and bf
-typedef struct dmem_corr_s {
+typedef struct corr_handle_s {
   
   // initial data and streams
   char *h_input;        // host input pointer
   char *d_input, *d_tx; // [NPACKETS_PER_BLOCK, NANTS, NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]
   
+  // DMH: fix me
+  void *d_idxs;
+  
   // correlator pointers
   // giant array for r and i: [NCHAN_PER_PACKET, 2 pol, NANTS_PROCESS, NPACKETS_PER_BLOCK *2 times]
   void *d_r, *d_i; //half
@@ -21,11 +27,34 @@ typedef struct dmem_corr_s {
   // giant output array: [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex]
   float *d_output;
 
-  metrics metric_data;
+  dsaXCorrParam corr_param;
+
+  double device_compute_flops = 0.0; // accumulated counters start at zero so that
+  double host_compute_flops = 0.0;   // `+=` updates never read an indeterminate value
   
+  double H2D_bytes = 0.0; // bytes copied host -> device
+  double D2H_bytes = 0.0; // bytes copied device -> host
+  double D2D_bytes = 0.0; // bytes copied device -> device
+  double H2H_bytes = 0.0; // bytes copied host -> host
 
 
-typedef struct dmem_bf_s {
+  // See 'using' at top of file for ms, hrc
+  timer::Timer<ms, hrc> dev_compute_timer;
+  timer::Timer<ms, hrc> dev_malloc_timer;
+  timer::Timer<ms, hrc> dev_memset_timer;
+  
+  timer::Timer<ms, hrc> H2D_timer;
+  timer::Timer<ms, hrc> D2H_timer;
+  timer::Timer<ms, hrc> D2D_timer;
+  timer::Timer<ms, hrc> H2H_timer;
+  
+  timer::Timer<ms, hrc> host_compute_timer;  
+  timer::Timer<ms, hrc> host_malloc_timer;
+  timer::Timer<ms, hrc> host_memset_timer;
+  timer::Timer<ms, hrc> host_copy_timer;
+  
+} corr_handle;
+
+typedef struct bf_handle_s {
 
   // beamformer pointers
   char *h_input;        // host input pointer
@@ -43,20 +72,47 @@ typedef struct dmem_bf_s {
 
   // timing (old)
   float cp, prep, cubl, outp;
-  metrics metric_data;
+
+  // See 'using' at top of file ms, hrc
+  timer::Timer<ms, hrc> dev_compute_timer;
+  timer::Timer<ms, hrc> dev_malloc_timer;
+  timer::Timer<ms, hrc> dev_memset_timer;
+    
+  timer::Timer<ms, hrc> H2D_timer;
+  timer::Timer<ms, hrc> D2H_timer;
+  
+  timer::Timer<ms, hrc> host_compute_timer;  
+  timer::Timer<ms, hrc> host_malloc_timer;
+  timer::Timer<ms, hrc> host_memset_timer;
+  timer::Timer<ms, hrc> host_copy_timer;
+  
+} bf_handle;
+
+// Deprecated function, remove after development
+void dcorrelator(corr_handle *d);
+
+// Base class for dsaX engines. The destructor is virtual so that destroying a derived object through a dsaXBase* is well-defined.
+class dsaXBase {
   
-} dmem_bf;
+ private:
+ protected:
 
-void dcorrelator(dmem_corr *d);
+ public:
+  dsaXBase();  
+  virtual ~dsaXBase();
+  
+};
 
-class Correlator {
+class Correlator : public dsaXBase {
   
 private:
 protected:
-  
-  dmem_corr d;  
+
+  corr_handle d;  
   dsaXCorrParam corr_param;
   dsaXBLASParam blas_param;
+
+  uint64_t flops;
   
 public:
   
@@ -72,8 +128,18 @@ class Correlator {
   ~Correlator();  
 };
 
-void destroyDsaXCorrDeviceMemory(dmem_corr *d);
-void initDsaXCorrDeviceMemory(dmem_corr *d);
 
-void reorderCorrelatorOutput(dmem_corr *d);
-void reorderCorrelatorInput(dmem_corr *d);
+void initDsaXCorrDeviceMemory(corr_handle *d, unsigned int n_streams);
+void destroyDsaXCorrDeviceMemory(corr_handle *d);
+void promoteComplexCharToPlanarHalf(corr_handle *d, unsigned int n_streams);
+
+void initBLAS();
+void destroyBLAS();
+
+void initStreams(unsigned int n);
+void destroyStreams();
+
+void computeIndices(corr_handle *d);
+void reorderCorrelatorOutput(corr_handle *d, int stream);
+void reorderCorrelatorInput(corr_handle *d, int stream);
+
diff --git a/include/dsaX_interface.h b/include/dsaX_interface.h
index a98215e..96442d1 100644
--- a/include/dsaX_interface.h
+++ b/include/dsaX_interface.h
@@ -6,9 +6,9 @@
 // DMH: decorate these with Doxygen
 void dsaXCorrelator(void *input_data, void *output_data);
 
-void reorderCorrInput(dmem_corr *d);
+void reorderCorrInput(corr_handle *d, int stream = 0);
 
-void reorderCorrOutput(dmem_corr *d);
+void reorderCorrOutput(corr_handle *d, int stream = 0);
 
 void transposeInputBeamformer(double *input, double *output, std::vector<int> &dimBlock, std::vector<int> &dimGrid);
 
@@ -17,3 +17,7 @@ void transposeScaleBeamformer(void *array_real, void *array_imag, unsigned char
 void fluffInputBeamformer(char *input, void *array_real, void *array_imag, int blocks, int tpb);
 
 void sumBeam(unsigned char *input, float *output, int blocks, int tpb);
+
+void dsaXInitStream(unsigned int n_streams);
+
+//void *dsaXHostRegister(size_t size);
diff --git a/include/dsaX_params.h b/include/dsaX_params.h
index bf5f455..85d2858 100644
--- a/include/dsaX_params.h
+++ b/include/dsaX_params.h
@@ -33,7 +33,7 @@ typedef struct dsaXBLASParam_s {
   std::complex<double>  beta;     /**< scalar used for multiplication. If beta==0, C does not have to be a valid input. */
   
   // Common params
-  int batch_count;             /**< number of pointers contained in arrayA, arrayB and arrayC. */
+  int batch_count;              /**< number of pointers contained in arrayA, arrayB and arrayC. */
   dsaXBLASDataType data_type;   /**< Specifies if using S(C) or D(Z) BLAS type */
   dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */
   
@@ -41,37 +41,16 @@ typedef struct dsaXBLASParam_s {
 
 // Structure that carries Correlator class parameters
 typedef struct dsaXCorrParam_s {  
-  size_t struct_size;        /**< Size of this struct in bytes.  Used to ensure that the host application and DSA see the same struct*/
+  size_t struct_size;           /**< Size of this struct in bytes.  Used to ensure that the host application and DSA see the same struct*/
   
   dsaXBLASLib blas_lib;         /**< Which BLAS library to use for BLAS ops */
   dsaXBLASDataType data_type;   /**< Specifies if using S(C) or D(Z) BLAS type */
   dsaXBLASDataOrder data_order; /**< Specifies if using Row or Column major */
+
+  int n_streams;                /**< The number of streams over which to compute input data */
   
 } dsaXCorrParam;
 
-// Global timing and metrics structure for dsaX 
-typedef struct metrics_s {
-
-  // Mem copy times
-  double mem_copy_time_H2H;
-  double mem_copy_time_H2D;
-  double mem_copy_time_D2H;
-  double mem_copy_time_D2D;
-
-  // Mem copy size
-  double mem_copy_size_H2H;
-  double mem_copy_size_H2D;
-  double mem_copy_size_D2H;
-  double mem_copy_size_D2D;
-
-  // Compute
-  double compute_time;
-  double compute_flops;
-
-  // Initialisation
-  double initialisation_time;
-} metrics;
-
 // Parameter struct helper functions for user
 const char *getBLASLibString(dsaXBLASLib lib);
 const char *getBLASDataTypeString(dsaXBLASDataType type);
diff --git a/include/dsaX_utils.h b/include/dsaX_utils.h
index fa22abe..fbc30fc 100644
--- a/include/dsaX_utils.h
+++ b/include/dsaX_utils.h
@@ -1,7 +1,10 @@
 #pragma once
 
 #include "dsaX_params.h"
+#include "timer.h"
 
 void dsaXmemset(void *array, int ch, size_t n);
-void dsaXmemcpy(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind);
+
+void dsaXmemcpy(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind, int stream = 0);
+
 void dsaXDeviceSynchronize();
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f885512..d79d89f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -13,9 +13,11 @@ endif()
 set(DSAX_OBJS
   dsaX_cuda_interface.cu
   dsaX_cublas_interface.cu
+  dsaX_cuda_handles.cu
   dsaX_magma_interface.cu
   dsaX_blas_interface.cpp
   dsaX_beamformer.cpp
+  dsaX_base.cpp
   dsaX_correlator.cpp
   dsaX_interface.cpp
   dsaX_utils.cpp
diff --git a/src/dsaX_beamformer.cpp b/src/dsaX_beamformer.cpp
index 61fbc5d..2dc5aef 100644
--- a/src/dsaX_beamformer.cpp
+++ b/src/dsaX_beamformer.cpp
@@ -29,7 +29,7 @@ using namespace std;
 
 */
 // beamformer function
-void dbeamformer(dmem_bf *d) {
+void dbeamformer(bf_handle *d) {
 
   dsaXBLASParam blas_param;
   blas_param.trans_a = DSA_BLAS_OP_T;
diff --git a/src/dsaX_blas_interface.cpp b/src/dsaX_blas_interface.cpp
index e370e87..04be79b 100644
--- a/src/dsaX_blas_interface.cpp
+++ b/src/dsaX_blas_interface.cpp
@@ -4,13 +4,13 @@
 #include "dsaX_cublas_interface.h"
 #include "dsaX_magma_interface.h"
 
-void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param) {
+void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param, int stream) {
   switch (param.blas_lib) {
   case DSA_BLAS_LIB_CUBLAS:
-    dsaXHgemmStridedBatchedCuda(real_a, imag_a, real_b, imag_b, real_c, imag_c, param);
+    dsaXHgemmStridedBatchedCuda(real_a, imag_a, real_b, imag_b, real_c, imag_c, param, stream);
     break;
   case DSA_BLAS_LIB_MAGMA:
-    dsaXHgemmStridedBatchedMagma(real_a, imag_a, real_b, imag_b, real_c, imag_c, param);
+    //dsaXHgemmStridedBatchedMagma(real_a, imag_a, real_b, imag_b, real_c, imag_c, param, stream);
     break;
   case DSA_BLAS_LIB_CUTLASS:
     //dsaXHgemmStridedBatchedCutlass(real_a, imag_a, real_b, imag_b, real_c, imag_c, param);
diff --git a/src/dsaX_correlator.cpp b/src/dsaX_correlator.cpp
index fecc184..4c3fe36 100644
--- a/src/dsaX_correlator.cpp
+++ b/src/dsaX_correlator.cpp
@@ -7,6 +7,7 @@ Workflow is similar for BF and corr applications
  */
 
 #include <iostream>
+#include <cstring>
 
 #include "dsaX_def.h"
 #include "dsaX.h"
@@ -15,25 +16,36 @@ Workflow is similar for BF and corr applications
 #include "dsaX_utils.h"
 #include "dsaX_psrdada_utils.h"
 
+using namespace std;
+
 Correlator::Correlator(const dsaXCorrParam *param) {
 
   // Transfer passed param to internal objects
   corr_param = *param;
-  //printDsaXCorrParam(corr_param);
+  d.corr_param = *param;
 
   // Select back end BLAS engine 
   blas_param.struct_size = sizeof(blas_param);
   blas_param.blas_type = DSA_BLAS_GEMM;
   blas_param.blas_lib = corr_param.blas_lib;
 
+  // Streams will be class specific
+  // so launch and destroy in the class
+  initStreams(corr_param.n_streams);
+  
   // Initialise device memeory
-  initDsaXCorrDeviceMemory(&d);
+  d.dev_malloc_timer.start();
+  initDsaXCorrDeviceMemory(&d, corr_param.n_streams);
+  d.dev_malloc_timer.stop();
+
+  // Compute indices
+  computeIndices(&d);
   
   // gemm settings
   // input: [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS]
   // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS]
 #if defined OLD_BLAS
-  //std::cout << "Old params" << std::endl;  
+  //cout << "Old params" << endl;  
   blas_param.data_order = DSA_BLAS_DATAORDER_COL;
   blas_param.trans_a = DSA_BLAS_OP_A;
   blas_param.trans_b = DSA_BLAS_OP_T;
@@ -53,7 +65,7 @@ Correlator::Correlator(const dsaXCorrParam *param) {
   blas_param.b_offset = 0;
   blas_param.c_offset = 0;
 #else
-  //std::cout << "My params" << std::endl;
+  //cout << "My params" << endl;
   blas_param.data_order = DSA_BLAS_DATAORDER_ROW;
   blas_param.trans_a = DSA_BLAS_OP_C;
   blas_param.trans_b = DSA_BLAS_OP_N;
@@ -73,52 +85,115 @@ Correlator::Correlator(const dsaXCorrParam *param) {
   blas_param.b_offset = 0;
   blas_param.c_offset = 0;
 #endif
-
+  
   // Swap A and B if in row order
   if (blas_param.data_order == DSA_BLAS_DATAORDER_ROW) {
-    std::swap(blas_param.m, blas_param.n);
-    std::swap(blas_param.lda, blas_param.ldb);
-    std::swap(blas_param.trans_a, blas_param.trans_b);
-    std::swap(blas_param.a_offset, blas_param.b_offset);
-    std::swap(blas_param.a_stride, blas_param.b_stride);
-    //std::swap(A_data, B_data);
-    //std::swap(A_data, B_data);
-  }  
+    swap(blas_param.m, blas_param.n);
+    swap(blas_param.lda, blas_param.ldb);
+    swap(blas_param.trans_a, blas_param.trans_b);
+    swap(blas_param.a_offset, blas_param.b_offset);
+    swap(blas_param.a_stride, blas_param.b_stride);
+    //swap(A_data, B_data);
+    //swap(A_data, B_data);
+  }
+
+  printDsaXBLASParam(blas_param);
+  
+  flops = 8; // 8 real flops per complex multiply-add (4 multiplies + 4 adds)
+  flops *= blas_param.m;
+  flops *= blas_param.n;
+  flops *= blas_param.k;
+  flops *= blas_param.batch_count;
+  
+  cout << "Correlator flops = 8*M*N*K * batch = (" << 8 << "*"<< blas_param.m << "*" << blas_param.n << "*" << blas_param.k << "*" << blas_param.batch_count << ") = " << flops << endl;
+  cout << "Correlator Gflop = " << (1e-9)*flops << endl;
+
+  // DMH: reset counters method
+  
 }
 
 Correlator::~Correlator() {
+
+  // Clean up memory
   destroyDsaXCorrDeviceMemory(&d);
+  destroyStreams();
+  
+  // Report accumulated metrics. Timer counts are divided by 1e6 to give seconds; rates print as 0 when a timer never ran.
+  double device_malloc_time = (1.0*d.dev_malloc_timer.elapsed().count())/(1e6);
+  double host_malloc_time = (1.0*d.host_malloc_timer.elapsed().count())/(1e6);
+  double device_compute_time = (1.0*d.dev_compute_timer.elapsed().count())/(1e6);
+  cout << "Correlator malloc time device  = " << device_malloc_time << " seconds." << endl;
+  cout << "Correlator malloc time host    = " << host_malloc_time << " seconds." << endl;  
+  cout << "Correlator compute time device = " << device_compute_time << " seconds. " << endl;
+  
+  double h2d_time = (1.0*d.H2D_timer.elapsed().count())/(1e6);
+  cout << "Correlator H2D time            = " << h2d_time << " seconds. ";
+  cout << "Bandwidth " << (h2d_time > 0.0 ? (1.0*d.H2D_bytes)/pow(1024,3) / h2d_time : 0.0) << " Gbytes/second." << endl;
+  
+  double d2h_time = (1.0*d.D2H_timer.elapsed().count())/(1e6);
+  cout << "Correlator D2H time            = " << d2h_time << " seconds. ";
+  cout << "Bandwidth " << (d2h_time > 0.0 ? (1.0*d.D2H_bytes)/pow(1024,3) / d2h_time : 0.0) << " Gbytes/second." << endl;
+
+  double h2h_time = (1.0*d.H2H_timer.elapsed().count())/(1e6);
+  cout << "Correlator H2H time            = " << h2h_time << " seconds. ";
+  cout << "Bandwidth " << (h2h_time > 0.0 ? (1.0*d.H2H_bytes)/pow(1024,3) / h2h_time : 0.0) << " Gbytes/second." << endl;  
+
+  double total = device_malloc_time + host_malloc_time + device_compute_time + h2d_time + d2h_time;
+  cout << "Correlator TOTAL time          = " << total << " seconds. " << endl;
+  
+  double Tflops = (device_compute_time > 0.0) ? (1.0*d.dev_compute_timer.iterations()*(1e-12*flops)/device_compute_time) : 0.0;
+  cout << "Correlator Tflops              = " << Tflops <<  endl;
 }
 
 void Correlator::compute(void *output, void *input) {
   
-  // zero out output arrays
-  dsaXmemset(d.d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short
-  dsaXmemset(d.d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short
-  dsaXmemset(d.d_output, 0, NCHAN_PER_PACKET*2*NANTS*NANTS*sizeof(float));
+  uint64_t in_stream_block = sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2;
+  uint64_t out_stream_block = sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2;
+
+  unsigned int n_streams = corr_param.n_streams;
   
-  // copy to device
-  dsaXmemcpy(d.d_input, input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2, dsaXMemcpyHostToDevice);
+  // Ensure output array is zero
+  dsaXmemset(d.d_output, 0, n_streams * out_stream_block);
   
-  // reorder input into real and imaginary arrays of half
-  reorderCorrInput(&d);
+  // Loop over the array in streams for concurrency.
+  for(int i=0; i<n_streams; i++) {
+    // copy to device  
+    dsaXmemcpy(d.d_input + i*in_stream_block, (char*)input + i*in_stream_block, in_stream_block, dsaXMemcpyHostToDeviceAsync, i);
+      
+    // reorder input into real and imaginary planar complex
+    // arrays and, if required, promote to required precision
+    // for consumption by BLAS engine.
+    promoteComplexCharToPlanarHalf(&d, i);
+    //reorderCorrInput(&d, i);
     
-  // Perform GEMM accoring to back end configuration
-  dsaXHgemmStridedBatched(d.d_r, d.d_i, d.d_r, d.d_i, d.d_outr, d.d_outi, blas_param);
+    // Perform GEMM according to back end configuration
+    dsaXHgemmStridedBatched((short*)d.d_r + i*in_stream_block, (short*)d.d_i + i*in_stream_block,
+			    (short*)d.d_r + i*in_stream_block, (short*)d.d_i + i*in_stream_block,
+			    (short*)d.d_outr + i*in_stream_block, (short*)d.d_outi + i*in_stream_block, blas_param, i);
+    
+    // Reorder output data back to interleaved complex
+    // and promote to float
+    reorderCorrOutput(&d, i);
+    
+    // Pass result back to host. NOTE(review): out_stream_block is a byte count but is added to float* pointers below, which looks like 4x the intended stride for i > 0 — confirm against reorderCorrOutput
+    d.D2H_timer.start();
+    dsaXmemcpy((float*)output + i*out_stream_block, d.d_output + i*out_stream_block, out_stream_block, dsaXMemcpyDeviceToHostAsync, i);
 
-  // reorder output data
-  reorderCorrOutput(&d);
-  
-  // Pass result back to host
-  dsaXmemcpy(output, d.d_output, NBASE*NCHAN_PER_PACKET*2*2*4, dsaXMemcpyDeviceToHost);  
+    d.D2H_bytes += out_stream_block;
+    d.D2H_timer.stop();
+  }
+
+  // End loop over stream. Sync to device prior to handing back
+  // scope to client program.
+  dsaXDeviceSynchronize();
 }
 
  
 // correlator function
 // workflow: copy to device, reorder, stridedBatchedGemm, reorder, copy back to host
 // DMH: CUDA references excised. Make me a class
-void dcorrelator(dmem_corr *d) {
-    
+void dcorrelator(corr_handle *d) {
+
   // zero out output arrays
   dsaXmemset(d->d_outr, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short
   dsaXmemset(d->d_outi, 0, NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*sizeof(short)); //half -> short
@@ -128,7 +203,7 @@ void dcorrelator(dmem_corr *d) {
   dsaXmemcpy(d->d_input, d->h_input, NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2, dsaXMemcpyHostToDevice);
   
   // reorder input into real and imaginary arrays of 2 byte data
-  reorderCorrInput(d);
+  reorderCorrInput(d, 0);
   
   dsaXBLASParam blas_param;
   blas_param.struct_size = sizeof(blas_param);
@@ -139,7 +214,7 @@ void dcorrelator(dmem_corr *d) {
   // output: [NCHAN_PER_PACKET, 2times, 2pol, NANTS, NANTS]
 
 #if defined OLD_BLAS
-  //std::cout << "Old params" << std::endl;
+  //cout << "Old params" << endl;
   
   blas_param.data_order = DSA_BLAS_DATAORDER_COL;
   blas_param.trans_a = DSA_BLAS_OP_A;
@@ -160,7 +235,7 @@ void dcorrelator(dmem_corr *d) {
   blas_param.b_offset = 0;
   blas_param.c_offset = 0;
 #else
-  //std::cout << "My params" << std::endl;
+  //cout << "My params" << endl;
   
   blas_param.data_order = DSA_BLAS_DATAORDER_ROW;
   blas_param.trans_a = DSA_BLAS_OP_C;
@@ -184,13 +259,13 @@ void dcorrelator(dmem_corr *d) {
 
   // Swap A and B if in row order
   if (blas_param.data_order == DSA_BLAS_DATAORDER_ROW) {
-    std::swap(blas_param.m, blas_param.n);
-    std::swap(blas_param.lda, blas_param.ldb);
-    std::swap(blas_param.trans_a, blas_param.trans_b);
-    std::swap(blas_param.a_offset, blas_param.b_offset);
-    std::swap(blas_param.a_stride, blas_param.b_stride);
-    //std::swap(A_data, B_data);
-    //std::swap(A_data, B_data);
+    swap(blas_param.m, blas_param.n);
+    swap(blas_param.lda, blas_param.ldb);
+    swap(blas_param.trans_a, blas_param.trans_b);
+    swap(blas_param.a_offset, blas_param.b_offset);
+    swap(blas_param.a_stride, blas_param.b_stride);
+    //swap(A_data, B_data);
+    //swap(A_data, B_data);
   }  
 
   
diff --git a/src/dsaX_cublas_interface.cu b/src/dsaX_cublas_interface.cu
index 0bffaea..c528546 100644
--- a/src/dsaX_cublas_interface.cu
+++ b/src/dsaX_cublas_interface.cu
@@ -3,23 +3,55 @@
 #include "dsaX.h"
 #include "dsaX_params.h"
 #include "dsaX_cuda_headers.h"
+#include "dsaX_cuda_handles.h"
+//#include "dsaX_cuda_kernels.h" // For debug
 
 using namespace std;
 
-void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam blas_param) {
+__global__ void deviceInspectHalf(half *input, int stage) {
+  int x = blockIdx.x * blockDim.x + threadIdx.x;
+  printf("CUBLAS[%d]: device inspect half [%d] =  %f\n", stage, x, __half2float(input[x])); 
+}
+
+void init_cublas_local() {
+  // Create the cuBLAS handle cublasH once; calls made while it is live do nothing.
+  if (!cublas_init) {
+    cublasStatus_t error = cublasCreate(&cublasH);
+    if (error != CUBLAS_STATUS_SUCCESS)
+      cout << "cublasCreate failed with error " << error << endl;
+    else
+      cout << "cublasCreated successfully." << endl;
+    // Mark the handle live only on success: destroy_cublas_local() must not
+    // destroy an invalid handle, and a later init call can then retry.
+    cublas_init = (error == CUBLAS_STATUS_SUCCESS);
+  }
+}
+
+void destroy_cublas_local() {
+  if(cublas_init)
+    cublasDestroy(cublasH);
+  cublas_init = false;
+}
+
+void initBLASCuda() {
+  init_cublas_local();
+}
+
+using namespace std;
+
+void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam blas_param, int stream) {
 #ifdef DSA_XENGINE_TARGET_CUDA
   
   // not sure if essential
-  cudaDeviceSynchronize();
+  //cudaDeviceSynchronize();
+
+  cublasSetStream(cublasH, get_stream(stream));
+
+  bool verbose = false;
   
   // Set up for gemm
-  cublasHandle_t cublasH = NULL;
-  cudaStream_t stream = NULL;
-  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
-  cublasCreate(&cublasH);
-  cublasSetStream(cublasH, stream);  
-
-  // Transfer params  
+  //----------------
+  // Transfer params
   const int m = blas_param.m;
   const int n = blas_param.n;
   const int k = blas_param.k;
@@ -71,9 +103,9 @@ void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void
   default:
     std::cout << "Unknown cublas transpose" << std::endl;
   }
-
+  
   int B_imag_alpha_sign = alpha;
-    switch (blas_param.trans_b) {
+  switch (blas_param.trans_b) {
   case DSA_BLAS_OP_N:
     transb = CUBLAS_OP_N;
     break;
@@ -93,7 +125,7 @@ void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void
     // A array requests conjugation, hence we
     // must apply supply a factor of -1 to alpha
     // when dealing with the imaginary component
-    // of A.
+    // of B.
     B_imag_alpha_sign *= -1;
     break;
   default:
@@ -102,9 +134,11 @@ void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void
 
   // Run strided batched gemm for datatype 
   // (a + ib)(c + id) = (ac - bd) + i(bc + ad)
-  // on matrices alpha * op(A) * op(B) + beta * C
+  // on matrices C = alpha * op(A) * op(B) + beta * C
   // where op(M) is defined by the transposition variable
   // cublasOperation_t transM
+
+  //deviceInspectHalf<<<1, 8>>>((half *)real_a);
   
   // Accumulate results into C matrix
   // ac
@@ -114,6 +148,9 @@ void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void
 			    (half *)real_b + b_offset, ldb, strideB, &beta0,
 			    (half *)real_c + c_offset, ldc, strideC,
 			    batchCount);
+
+  if(verbose) deviceInspectHalf<<<1, 8>>>((half *)real_c, 0);
+  
   // -bd (minus sign from i*i)
   half alpha_bd = alpha * (-1.0 * A_imag_alpha_sign * B_imag_alpha_sign);
   cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &(alpha_bd),
@@ -121,6 +158,9 @@ void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void
 			    (half*)imag_b + b_offset, ldb, strideB, &beta1,
 			    (half*)real_c + c_offset, ldc, strideC,
 			    batchCount);
+  
+  if(verbose) deviceInspectHalf<<<1, 8>>>((half *)real_c, 1);
+  
   // bc
   half alpha_bc = alpha * A_imag_alpha_sign;
   cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &(alpha_bc),
@@ -128,6 +168,9 @@ void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void
 			    (half*)real_b + b_offset, ldb, strideB, &beta0,
 			    (half*)imag_c + c_offset, ldc, strideC,
 			    batchCount);
+  
+  if(verbose) deviceInspectHalf<<<1, 8>>>((half *)imag_c, 2);
+  
   // ad
   half alpha_ad = alpha * B_imag_alpha_sign;
   cublasHgemmStridedBatched(cublasH, transa, transb, m,n,k, &(alpha_ad),
@@ -135,13 +178,11 @@ void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void
 			    (half*)imag_b + b_offset, ldb, strideB, &beta1,
 			    (half*)imag_c + c_offset, ldc, strideC,
 			    batchCount);
+
+  if(verbose) deviceInspectHalf<<<1, 8>>>((half *)imag_c, 3);
   
-  // shown to be essential
-  cudaDeviceSynchronize();
-  
-  // destroy stream
-  cudaStreamDestroy(stream);
-  cublasDestroy(cublasH);  
+  // shown to be essential (only with streams, fix me) 
+  //cudaDeviceSynchronize();
 #else
   std::cout "dsaX not built with CUDA target." << std::endl;
   exit(0);
diff --git a/src/dsaX_cuda_interface.cu b/src/dsaX_cuda_interface.cu
index ec54675..b8af344 100644
--- a/src/dsaX_cuda_interface.cu
+++ b/src/dsaX_cuda_interface.cu
@@ -4,43 +4,86 @@
 #include "dsaX_cuda_headers.h"
 #include "dsaX_cuda_interface.h"
 #include "dsaX_cuda_kernels.h"
+#include "dsaX_cuda_handles.h"
 
 using namespace std;
 
+// DMH: Everything in this file is CUDA aware.
+
+__global__ void deviceInspectHalfCI(half *input, int stage) {
+  int x = blockIdx.x * blockDim.x + threadIdx.x;
+  printf("CUDA_INTERFACE[%d]: device inspect half [%d] =  %f\n", stage, x, __half2float(input[x])); 
+}
+
+__global__ void deviceInspectFloatCI(float *input, int stage) {
+  int x = blockIdx.x * blockDim.x + threadIdx.x;
+  printf("CUDA_INTERFACE[%d]: device inspect float [%d] =  %f\n", stage, x, input[x]); 
+}
+
 void dsaXInitCuda(int dev){
-  cudaSetDevice(dev);
+  // Select CUDA device `dev`; exit with a failure status if it is negative or cudaSetDevice rejects it.
+  if(dev < 0 || cudaSetDevice(dev) != cudaSuccess) {
+    cout << "dsaX Error: invalid device ordinal " << dev << " passed to dsaX." << endl;
+    exit(EXIT_FAILURE);
+  }
+}
+
+void initStreamsCuda(unsigned int n_streams){
+  init_streams(n_streams);
+}
+
+void destroyStreamsCuda(){
+  destroy_streams();
+}
+
+void dsaXDestroyCuda(int dev){
+  //
+}
+
+void *dsaXHostRegisterCuda(size_t size) {
+  // Allocate `size` bytes with malloc and page-lock them via cudaHostRegister; a failed malloc is treated as an allocation error.
+  void *ptr = malloc(size);
+  cudaError_t err = (ptr != NULL) ? cudaHostRegister(ptr, size, cudaHostRegisterDefault) : cudaErrorMemoryAllocation;
+  if (err != cudaSuccess) {
+    cout << "dsaX Error: Failed to register pinned memory of size " << size << endl;
+    exit(EXIT_FAILURE);
+  }
+  return ptr;
 }
 
 // allocate device memory
-void initializeCorrCudaMemory(dmem_corr *d) {
-  
+void initializeCorrCudaMemory(corr_handle *d, unsigned int n_streams) {
+
   // for correlator
-  cudaMalloc((void **)(&d->d_input), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
-  cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2);
-  cudaMalloc((void **)(&d->d_i), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2);
-  cudaMalloc((void **)(&d->d_tx), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
-  cudaMalloc((void **)(&d->d_output), sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2);
-  cudaMalloc((void **)(&d->d_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
-  cudaMalloc((void **)(&d->d_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
-  cudaMalloc((void **)(&d->d_tx_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
-  cudaMalloc((void **)(&d->d_tx_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac);
+  cudaMalloc((void **)(&d->d_input), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams);
+  cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams);
+  cudaMalloc((void **)(&d->d_i), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams);
+  cudaMalloc((void **)(&d->d_tx), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams);
+  cudaMalloc((void **)(&d->d_output), sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2*n_streams);
+  cudaMalloc((void **)(&d->d_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams);
+  cudaMalloc((void **)(&d->d_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams);
+  cudaMalloc((void **)(&d->d_tx_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams);
+  cudaMalloc((void **)(&d->d_tx_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams);
+
+  // DMH: fix me
+  cudaMalloc((void **)(&d->d_idxs), sizeof(int)*NBASE);
 }
 
-void initializeBFCudaMemory(dmem_bf *d) {
+void initializeBFCudaMemory(bf_handle *d, int n_streams) {
   
   // for beamformer
-  cudaMalloc((void **)(&d->d_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2);
-  cudaMalloc((void **)(&d->d_big_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS)*NCHAN_PER_PACKET*2*2);
-  cudaMalloc((void **)(&d->d_tx), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2);
-  cudaMalloc((void **)(&d->d_br), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2);
-  cudaMalloc((void **)(&d->d_bi), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2);
-  cudaMalloc((void **)(&d->weights_r), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8));
-  cudaMalloc((void **)(&d->weights_i), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8));
-  cudaMalloc((void **)(&d->d_bigbeam_r), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2));
-  cudaMalloc((void **)(&d->d_bigbeam_i), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2));
-  cudaMalloc((void **)(&d->d_bigpower), sizeof(unsigned char)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS));
-  cudaMalloc((void **)(&d->d_scf), sizeof(float)*(NBEAMS/2)); // beam scale factor
-  cudaMalloc((void **)(&d->d_chscf), sizeof(float)*(NBEAMS/2)*(NCHAN_PER_PACKET/8)); // beam scale factor
+  cudaMalloc((void **)(&d->d_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2*n_streams);
+  cudaMalloc((void **)(&d->d_big_input), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS)*NCHAN_PER_PACKET*2*2*n_streams);
+  cudaMalloc((void **)(&d->d_tx), sizeof(char)*(NPACKETS_PER_BLOCK)*(NANTS/2)*NCHAN_PER_PACKET*2*2*n_streams);
+  cudaMalloc((void **)(&d->d_br), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2*n_streams);
+  cudaMalloc((void **)(&d->d_bi), sizeof(half)*NCHAN_PER_PACKET*2*(NANTS/2)*(NPACKETS_PER_BLOCK)*2*n_streams);
+  cudaMalloc((void **)(&d->weights_r), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8)*n_streams);
+  cudaMalloc((void **)(&d->weights_i), sizeof(half)*2*4*(NANTS/2)*8*2*2*(NBEAMS/2)*(NCHAN_PER_PACKET/8)*n_streams);
+  cudaMalloc((void **)(&d->d_bigbeam_r), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*n_streams);
+  cudaMalloc((void **)(&d->d_bigbeam_i), sizeof(half)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*n_streams);
+  cudaMalloc((void **)(&d->d_bigpower), sizeof(unsigned char)*(NPACKETS_PER_BLOCK/4)*(NCHAN_PER_PACKET/8)*(NBEAMS)*n_streams);
+  cudaMalloc((void **)(&d->d_scf), sizeof(float)*(NBEAMS/2)*n_streams); // beam scale factor
+  cudaMalloc((void **)(&d->d_chscf), sizeof(float)*(NBEAMS/2)*(NCHAN_PER_PACKET/8)*n_streams); // beam scale factor
   
   // input weights: first is [NANTS, E/N], then [NANTS, 48, 2pol, R/I]
   d->h_winp = (float *)malloc(sizeof(float)*(NANTS*2+NANTS*(NCHAN_PER_PACKET/8)*2*2));
@@ -56,7 +99,7 @@ void initializeBFCudaMemory(dmem_bf *d) {
 }
 
 // deallocate device memory
-void deallocateCorrCudaMemory(dmem_corr *d) {
+void deallocateCorrCudaMemory(corr_handle *d) {
   
   cudaFree(d->d_input);
   cudaFree(d->d_r);
@@ -67,10 +110,11 @@ void deallocateCorrCudaMemory(dmem_corr *d) {
   cudaFree(d->d_outi);
   cudaFree(d->d_tx_outr);
   cudaFree(d->d_tx_outi);
+  cudaFree(d->d_idxs);
 }
 
 // deallocate device memory
-void deallocateBFCudaMemory(dmem_bf *d) {
+void deallocateBFCudaMemory(bf_handle *d) {
 
   cudaFree(d->d_input);
   cudaFree(d->d_tx);
@@ -89,89 +133,52 @@ void deallocateBFCudaMemory(dmem_bf *d) {
   free(d->h_freqs);
 }  
 
+void computeIndicesCuda(corr_handle *d) {
+  
+  // build the NBASE-entry index table on the host and copy it to d->d_idxs
+  int *h_idxs = (int *)malloc(sizeof(int)*NBASE);
+  int ii = 0;
+  // upper triangular order (column major) to match xGPU (not the same as CASA!)
+  for (int i=0; i<NANTS; i++) {
+    for (int j=0; j<=i; j++) {
+      h_idxs[ii] = i*NANTS + j;
+      ii++;
+    }
+  }
+  cudaMemcpy(d->d_idxs, h_idxs, sizeof(int)*NBASE, cudaMemcpyHostToDevice);
+  free(h_idxs);
+}
+
 
 // function to copy d_outr and d_outi to d_output
 // inputs are [NCHAN_PER_PACKET, 2 time, 2 pol, NANTS, NANTS]
 // the corr matrices are column major order
 // output needs to be [NBASE, NCHAN_PER_PACKET, 2 pol, 2 complex]
 // start with transpose to get [NANTS*NANTS, NCHAN_PER_PACKET*2*2], then sum into output using kernel
-void reorderCorrOutputCuda(dmem_corr * d) {
+void reorderCorrOutputCuda(corr_handle *d, int stream) {
+
+  cudaStream_t str = get_stream(stream);
+
+  uint64_t input_offset = sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2 * stream;
+  uint64_t output_offset = sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2 * stream;
   
   // transpose input data
 #if defined (OLD_BLAS)
   dim3 dimBlock(32, 8), dimGrid((NANTS*NANTS)/32, (NCHAN_PER_PACKET*2*2*halfFac)/32);
-  transpose_matrix<<<dimGrid, dimBlock>>>((half*)d->d_outr, (half*)d->d_tx_outr);
-  transpose_matrix<<<dimGrid, dimBlock>>>((half*)d->d_outi, (half*)d->d_tx_outi);
-#endif  
-  // look at output
-  /*char * odata = (char *)malloc(sizeof(char)*384*4*NANTS*NANTS*2*halfFac);
-  cudaMemcpy(odata,d->d_tx_outr,384*4*NANTS*NANTS*2*halfFac,cudaMemcpyDeviceToHost);
-  FILE *fout;
-  fout=fopen("test2.test","wb");
-  fwrite(odata,sizeof(char),384*4*NANTS*NANTS*2*halfFac,fout);
-  fclose(fout);*/
-
+  transpose_matrix_float<<<dimGrid, dimBlock, 0, str>>>((half*)d->d_outr, (half*)d->d_tx_outr);
+  transpose_matrix_float<<<dimGrid, dimBlock, 0, str>>>((half*)d->d_outi, (half*)d->d_tx_outi);
+#endif
   
-  /*
-  // set up for geam
-  cublasHandle_t cublasH = NULL;
-  cudaStream_t stream = NULL;
-  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
-  cublasSetStream(cublasH, stream);
-
-  // transpose output matrices into tx_outr and tx_outi
-  cublasOperation_t transa = CUBLAS_OP_T;
-  cublasOperation_t transb = CUBLAS_OP_N;
-  const int m = NCHAN_PER_PACKET*2*2;
-  const int n = NANTS*NANTS/16; // columns in output
-  const double alpha = 1.0;
-  const double beta = 0.0;
-  const int lda = n;
-  const int ldb = m;
-  const int ldc = ldb;
-  cublasDgeam(cublasH,transa,transb,m,n,
-	      &alpha,(double *)(d->d_outr),
-	      lda,&beta,(double *)(d->d_tx_outr),
-	      ldb,(double *)(d->d_tx_outr),ldc);
-  cublasDgeam(cublasH,transa,transb,m,n,
-	      &alpha,(double *)(d->d_outi),
-	      lda,&beta,(double *)(d->d_tx_outi),
-	      ldb,(double *)(d->d_tx_outi),ldc);
-  */
-  // now run kernel to sum into output
-  int * h_idxs = (int *)malloc(sizeof(int)*NBASE);
-  int * d_idxs;
-  cudaMalloc((void **)(&d_idxs), sizeof(int)*NBASE);
-  int ii = 0;
-  // upper triangular order (column major) to match xGPU (not the same as CASA!)
-  for (int i=0;i<NANTS;i++) {
-    for (int j=0;j<=i;j++) {
-      h_idxs[ii] = i*NANTS + j;
-      ii++;
-    }
-  }
-  cudaMemcpy(d_idxs, h_idxs, sizeof(int)*NBASE,cudaMemcpyHostToDevice);
-
   // run kernel to finish things
   // TUNABLE
   int blockDim = 128;
   int blocks = NCHAN_PER_PACKET*2*NBASE/blockDim;
 #if defined (OLD_BLAS)
-  corr_output_copy<<<blocks, blockDim>>>((half*)d->d_tx_outr, (half*)d->d_tx_outi, d->d_output, d_idxs);
+  corr_output_copy<<<blocks, blockDim, 0, str>>>((half*)d->d_tx_outr, (half*)d->d_tx_outi, d->d_output, (int*)d->d_idxs);
 #else
-  corr_output_copy<<<blocks, blockDim>>>((half*)d->d_outr, (half*)d->d_outi, d->d_output, d_idxs);
-#endif
-  
-  /*char * odata = (char *)malloc(sizeof(char)*384*4*NBASE*4);
-  cudaMemcpy(odata,d->d_output,384*4*NBASE*4,cudaMemcpyDeviceToHost);
-  FILE *fout;
-  fout=fopen("test3.test","wb");
-  fwrite(odata,sizeof(char),384*4*NBASE*4,fout);
-  fclose(fout);*/
-  
-  cudaFree(d_idxs);
-  free(h_idxs);
-  //cudaStreamDestroy(stream);  
+  corr_output_copy<<<blocks, blockDim, 0, str>>>((half*)d->d_outr + input_offset, (half*)d->d_outi + input_offset, d->d_output + output_offset, (int*)d->d_idxs);
+#endif  
+  //deviceInspectHalfCI<<<1,8>>>((half*)d->d_outi, 0);  
 }
 
 
@@ -182,22 +189,43 @@ void reorderCorrOutputCuda(dmem_corr * d) {
 // output is [NCHAN_PER_PACKET, 2times, 2pol, NPACKETS_PER_BLOCK, NANTS]
 // starts by running transpose on [NPACKETS_PER_BLOCK * NANTS, NCHAN_PER_PACKET * 2 * 2] matrix in doubleComplex form.
 // then fluffs using simple kernel
-void reorderCorrInputCuda(dmem_corr *d) {
+void reorderCorrInputCuda(corr_handle *d, int stream) {
+
+  // DMH: globalise me
+  int offset = sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2 * stream;
+
+  cudaStream_t str = get_stream(stream);
+  
+  // TUNABLE
+  int blockDim = 128;
+  int blocks = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/blockDim;
   
   // transpose input data
 #if defined (OLD_BLAS)  
   dim3 dimBlock(32, 32), dimGrid((NCHAN_PER_PACKET*2*2)/32, ((NPACKETS_PER_BLOCK)*NANTS)/32);
 
-  // TUNABLE
-  int blockDim = 128;
-  int blocks = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/blockDim;
-  transpose_matrix_char<<<dimGrid, dimBlock>>>(d->d_input, d->d_tx);
-  promoteComplexCharToPlanarHalf<<<blocks, blockDim>>>(d->d_tx, (half*)d->d_r, (half*)d->d_i);
+  transpose_matrix_char<<<dimGrid, dimBlock, 0, str>>>((char*)d->d_input + offset, (char*)d->d_tx + offset);
+
+  // DMH: These two can run concurrently
+  promoteComplexCharToPlanarHalf<<<blocks, blockDim, 0, str>>>((char*)d->d_tx + offset, (half*)d->d_r + offset, (half*)d->d_i + offset);
 #else
-  promoteComplexCharToPlanarHalf<<<blocks, blockDim>>>(d->d_input, (half*)d->d_r, (half*)d->d_i);
+  promoteComplexCharToPlanarHalf<<<blocks, blockDim, 0, str>>>((char*)d->d_input + offset, (half*)d->d_r + offset, (half*)d->d_i + offset);
 #endif
 }
 
+void promoteComplexCharToPlanarHalfCuda(corr_handle *d, unsigned int stream) {
+
+  // DMH: globalise me
+  int offset = sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2 * stream;
+
+  cudaStream_t str = get_stream(stream);
+  
+  // TUNABLE
+  int blockDim = 128;
+  int blocks = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*4/blockDim;
+
+  promoteComplexCharToPlanarHalf<<<blocks, blockDim, 0, str>>>((char*)d->d_input + offset, (half*)d->d_r + offset, (half*)d->d_i + offset);
+}
 
 // kernels to reorder and fluff input data for beamformer
 // initial data is [NPACKETS_PER_BLOCK, (NANTS/2), NCHAN_PER_PACKET, 2 times, 2 pol, 4-bit complex]            
@@ -223,7 +251,7 @@ void transposeInputBeamformerCuda(double *idata, double *odata, std::vector<int>
 // sequential pairs of eastings and northings
 // then [NANTS, 48, R/I] calibs
 
-void calcWeightsCuda(dmem_bf *d) {
+void calcWeightsCuda(bf_handle *d) {
 
   // allocate
   float *antpos_e = (float *)malloc(sizeof(float)*NANTS);
@@ -323,21 +351,36 @@ void dsaXDeviceSynchronizeCuda() {
   cudaDeviceSynchronize();
 }
 
-void dsaXmemcpyCuda(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind){
+void dsaXmemcpyCuda(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind, int stream){
+
   cudaError error = cudaSuccess;
+  cudaStream_t str = get_stream(stream);
+  
   switch(kind) {
   case dsaXMemcpyHostToHost:
     error = cudaMemcpy(array_out, array_in, n, cudaMemcpyHostToHost);
     break;
   case dsaXMemcpyHostToDevice:
-   error = cudaMemcpy(array_out, array_in, n, cudaMemcpyHostToDevice);
-   break;
+    error = cudaMemcpy(array_out, array_in, n, cudaMemcpyHostToDevice);
+    break;
   case dsaXMemcpyDeviceToHost:
     error = cudaMemcpy(array_out, array_in, n, cudaMemcpyDeviceToHost);
     break;
   case dsaXMemcpyDeviceToDevice:
     error = cudaMemcpy(array_out, array_in, n, cudaMemcpyDeviceToDevice);
     break;
+  case dsaXMemcpyHostToHostAsync:
+    error = cudaMemcpyAsync(array_out, array_in, n, cudaMemcpyHostToHost, str);
+    break;
+  case dsaXMemcpyHostToDeviceAsync:
+    error = cudaMemcpyAsync(array_out, array_in, n, cudaMemcpyHostToDevice, str);
+    break;
+  case dsaXMemcpyDeviceToHostAsync:
+    error = cudaMemcpyAsync(array_out, array_in, n, cudaMemcpyDeviceToHost, str);
+    break;
+  case dsaXMemcpyDeviceToDeviceAsync:
+    error = cudaMemcpyAsync(array_out, array_in, n, cudaMemcpyDeviceToDevice, str);
+    break;
   default:
     std::cout << "dsaX error: unknown dsaXMemcpyKind" << std::endl;
   }
diff --git a/src/dsaX_interface.cpp b/src/dsaX_interface.cpp
index 6358df1..e0f294a 100644
--- a/src/dsaX_interface.cpp
+++ b/src/dsaX_interface.cpp
@@ -10,82 +10,131 @@
 
 using namespace std;
 
+using ms = std::chrono::microseconds;
+using hrc = std::chrono::high_resolution_clock;  
+
+timer::Timer<ms, hrc> app_timer;
+timer::Timer<ms, hrc> init_timer;
 
 void dsaXInit(int dev){
+  app_timer.start();
 #if DSA_XENGINE_TARGET_CUDA
+  init_timer.start();
   dsaXInitCuda(dev);
+  initBLAS();
+  init_timer.stop();
 #endif
-
-  std::cout << " --- Starting dsaX with configuration (defined in dsaX_def.h) --- " << endl;
-  std::cout << "NPACKETS_PER_BLOCK = " << NPACKETS_PER_BLOCK << std::endl;
-  std::cout << "NCHAN = " << NCHAN << std::endl;
-  std::cout << "NCHAN_PER_PACKET = " << NCHAN_PER_PACKET << std::endl;
-  std::cout << "NPOL = " << NPOL << std::endl;
-  std::cout << "NARM = " << 3 << std::endl;
-  std::cout << " --- End dsaX configuration --- " << endl;
+  cout << " --- Starting dsaX with configuration (defined in dsaX_def.h) --- " << endl;
+  cout << "NPACKETS_PER_BLOCK = " << NPACKETS_PER_BLOCK << endl;
+  cout << "NCHAN = " << NCHAN << endl;
+  cout << "NCHAN_PER_PACKET = " << NCHAN_PER_PACKET << endl;
+  cout << "NPOL = " << NPOL << endl;
+  cout << "NARM = " << 2 << endl;
+#if DSA_XENGINE_TARGET_CUDA
+  cout << "CUDA is ENABLED " << endl;
+#else
+  cout << "CUDA is DISABLED " << endl;
+#endif
+  cout << " --- End dsaX configuration --- " << endl;
   //DMH: Add more (ask Vikram)
 }
 
 void dsaXEnd() {
+  app_timer.stop();
   // output metrics
+  cout << "dsaX lifetime = " << (1.0*app_timer.elapsed().count())/(1e6) << endl;
+  cout << "dsaX init = " << (1.0*init_timer.elapsed().count())/(1e6) << endl;
+}
+
+void *dsaXHostRegister(size_t size) {
+#if DSA_XENGINE_TARGET_CUDA  
+  return dsaXHostRegisterCuda(size);
+#endif
 }
 
 void inspectPackedData(char input, int i, bool non_zeros) {
   float re = (float)((char)((   (unsigned char)(input) & (unsigned char)(15)  ) << 4) >> 4);
   float im = (float)((char)((   (unsigned char)(input) & (unsigned char)(240))) >> 4);
-
+  
   if(non_zeros) {
     if(re != 0 || im != 0) 
-      std::cout << "val["<<i<<"] = ("<<re<<","<<im<<")" << std::endl;
+      cout << "val["<<i<<"] = ("<<re<<","<<im<<")" << endl;
   } else {
-    std::cout << "val["<<i<<"] = ("<<re<<","<<im<<")" << std::endl;
+    cout << "val["<<i<<"] = ("<<re<<","<<im<<")" << endl;
   }
 }
 
-void dsaXCorrelator(void *output_data, void *input_data, dsaXCorrParam *param) {  
+void promoteComplexCharToPlanarHalf(corr_handle *d, unsigned int stream) {
+#if DSA_XENGINE_TARGET_CUDA
+  promoteComplexCharToPlanarHalfCuda(d, stream);
+#else
+  cout << "dsaX error: not implemented" << endl;
+#endif
+}
 
-  dmem_corr d;
-#if DSA_XENGINE_TARGET_CUDA  
-  initializeCorrCudaMemory(&d);
-  d.h_input = (char *)malloc(sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
-  memcpy(d.h_input, (char*)input_data, sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2);
-  dcorrelator(&d);
-  dsaXmemcpy(output_data, d.d_output, NBASE*NCHAN_PER_PACKET*2*2*4, dsaXMemcpyDeviceToHost);
-  deallocateCorrCudaMemory(&d);
+void reorderCorrInput(corr_handle *d, int stream) {
+#if DSA_XENGINE_TARGET_CUDA
+  reorderCorrInputCuda(d, stream);
 #else
-  std::cout << "dsaX error: not implemented" << std::endl;
+  cout << "dsaX error: not implemented" << endl;
 #endif
 }
 
-void reorderCorrInput(dmem_corr *d) {
+void initBLAS() {
 #if DSA_XENGINE_TARGET_CUDA
-  reorderCorrInputCuda(d);
+  // DMH: Fix me for other libs
+  initBLASCuda();
 #else
-  std::cout << "dsaX error: not implemented" << std::endl;
+  cout << "dsaX error: not implemented" << endl;
 #endif
 }
 
-void reorderCorrOutput(dmem_corr *d) {
+void initStreams(unsigned int n_streams) {
+#if DSA_XENGINE_TARGET_CUDA
+  initStreamsCuda(n_streams);
+#else
+  // NO OP
+#endif
+}
+
+void destroyStreams() {
+#if DSA_XENGINE_TARGET_CUDA
+  destroyStreamsCuda();
+#else
+  // NO OP
+#endif
+}
+
+void computeIndices(corr_handle *d) {
+#if DSA_XENGINE_TARGET_CUDA
+  computeIndicesCuda(d);
+#else
+  cout << "dsaX error: not implemented" << endl;
+#endif
+}
+
+
+void reorderCorrOutput(corr_handle *d, int stream) {
 #if DSA_XENGINE_TARGET_CUDA  
-  reorderCorrOutputCuda(d);
+  reorderCorrOutputCuda(d, stream);
 #else
-  std::cout << "dsaX error: not implemented" << std::endl;
+  cout << "dsaX error: not implemented" << endl;
 #endif
 }
 
-void transposeInputBeamformer(double *input, double *output, std::vector<int> &dimBlock, std::vector<int> &dimGrid) {
+void transposeInputBeamformer(double *input, double *output, vector<int> &dimBlock, vector<int> &dimGrid) {
 #if DSA_XENGINE_TARGET_CUDA
   transposeInputBeamformerCuda(input, output, dimBlock, dimGrid);
 #else
-  std::cout << "dsaX error: not implemented" << std::endl;
+  cout << "dsaX error: not implemented" << endl;
 #endif
 }
 
-void transposeScaleBeamformer(void *real, void *imag, unsigned char *output, std::vector<int> &dimBlock, std::vector<int> &dimGrid) {
+void transposeScaleBeamformer(void *real, void *imag, unsigned char *output, vector<int> &dimBlock, vector<int> &dimGrid) {
 #if DSA_XENGINE_TARGET_CUDA
   transposeScaleBeamformerCuda(real, imag, output, dimBlock, dimGrid);
 #else
-  std::cout << "dsaX error: not implemented" << std::endl;
+  cout << "dsaX error: not implemented" << endl;
 #endif
 }
 
@@ -93,7 +142,7 @@ void fluffInputBeamformer(char *input, void *array_real, void *array_imag, int b
 #if DSA_XENGINE_TARGET_CUDA
   fluffInputBeamformerCuda(input, array_real, array_imag, blocks, tpb);
 #else
-  std::cout << "dsaX error: not implemented" << std::endl;
+  cout << "dsaX error: not implemented" << endl;
 #endif
 }
 
@@ -101,6 +150,6 @@ void sumBeam(unsigned char *input, float *output, int blocks, int tpb) {
 #if DSA_XENGINE_TARGET_CUDA
   sumBeamCuda(input, output, blocks, tpb);
 #else
-  std::cout << "dsaX error: not implemented" << std::endl;
+  cout << "dsaX error: not implemented" << endl;
 #endif
 }
diff --git a/src/dsaX_params.cpp b/src/dsaX_params.cpp
index 7ed0d5b..4179848 100644
--- a/src/dsaX_params.cpp
+++ b/src/dsaX_params.cpp
@@ -56,11 +56,13 @@ const char *getBLASDataOrderString(dsaXBLASDataOrder order)
 
 void printDsaXCorrParam(const dsaXCorrParam param) {
 
-  cout << "--- dsaXCorrParam begin ---" << endl;
+  cout << " --- dsaXCorrParam begin ---" << endl;
   cout << "struct_size = " << param.struct_size << endl;
-  cout << "blas_lib = " << getBLASLibString(param.blas_lib) << endl;
-  cout << "data_type = " << getBLASDataTypeString(param.data_type) << endl;
-  cout << "data_order = " << getBLASDataOrderString(param.data_order) << endl;
+  cout << "blas_lib    = " << getBLASLibString(param.blas_lib) << endl;
+  cout << "data_type   = " << getBLASDataTypeString(param.data_type) << endl;
+  cout << "data_order  = " << getBLASDataOrderString(param.data_order) << endl;
+  cout << "n_streams   = " << param.n_streams << endl;
+  
   cout << " --- dsaXCorrParam end ---" << endl;
 }
 
@@ -69,9 +71,9 @@ void printDsaXBLASParam(const dsaXBLASParam param) {
   cout << " --- dsaXBLASParam begin ---" << endl;
   cout << "struct_size = " << param.struct_size << endl;
   cout << "blas_type = " << param.blas_type << endl;
-  cout << "blas_lib = " << param.blas_lib << endl;
-  cout << "data_type = " << param.data_type << endl;
-  cout << "data_order = " << param.data_order << endl;
+  cout << "blas_lib = " << getBLASLibString(param.blas_lib) << endl;
+  cout << "data_type = " << getBLASDataTypeString(param.data_type) << endl;
+  cout << "data_order = " << getBLASDataOrderString(param.data_order) << endl;
   cout << "trans_a = " << param.trans_a << endl;
   cout << "trans_b = " << param.trans_b << endl;
   cout << "m = " << param.m << endl;
diff --git a/src/dsaX_utils.cpp b/src/dsaX_utils.cpp
index 3819e98..d29e291 100644
--- a/src/dsaX_utils.cpp
+++ b/src/dsaX_utils.cpp
@@ -15,10 +15,11 @@ void dsaXmemset(void *array, int ch, size_t n){
 #endif
 }
 
-void dsaXmemcpy(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind){
+void dsaXmemcpy(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind, int stream){
+
 #ifdef DSA_XENGINE_TARGET_CUDA
   // Perform host to device memcopy on data
-  dsaXmemcpyCuda(array_out, array_in, n, kind);
+  dsaXmemcpyCuda(array_out, array_in, n, kind, stream);
 #else  
   memcpy(array_out, array_in, n);
 #endif
@@ -33,19 +34,25 @@ void dsaXDeviceSynchronize() {
 #endif
 }
 
-void initDsaXCorrDeviceMemory(dmem_corr *d) {
+void initDsaXCorrDeviceMemory(corr_handle *d, unsigned int n_streams) {
+
 #ifdef DSA_XENGINE_TARGET_CUDA
-  initializeCorrCudaMemory(d);
+  d->dev_malloc_timer.start();
+  initializeCorrCudaMemory(d, n_streams);
+  d->dev_malloc_timer.stop();
 #else  
   cout << "dsaX Error: Not implemented." << endl;
   exit(0);
 #endif  
 }
 
-void destroyDsaXCorrDeviceMemory(dmem_corr *d) {
+void destroyDsaXCorrDeviceMemory(corr_handle *d) {
+
 #ifdef DSA_XENGINE_TARGET_CUDA
+  d->dev_malloc_timer.start();
   deallocateCorrCudaMemory(d);
-#else  
+  d->dev_malloc_timer.stop();
+#else
   cout << "dsaX Error: Not implemented." << endl;
   exit(0);
 #endif  
diff --git a/tests/command_line_params.cpp b/tests/command_line_params.cpp
index 69331b3..746b4cc 100644
--- a/tests/command_line_params.cpp
+++ b/tests/command_line_params.cpp
@@ -5,21 +5,27 @@ int core = 0;
 bool debug = false;
 
 // Data block HDU keys 
-key_t in_key = REORDER_BLOCK_KEY;
-key_t out_key = XGPU_BLOCK_KEY;
+key_t in_key = 0x0000eada;  // REORDER_BLOCK_KEY in dsaX_def.h
+key_t out_key = 0x0000fada; // XGPU_BLOCK_KEY in dsaX_def.h
 
-// Test mode
+// Test params
 bool run_beamformer = false;
 bool run_correlator = false;
-double start_frequency = 1498.75;
+bool input_rands = false;
+bool write_output = false;
+int test_iter = 1;
+int n_streams = 10;
 
-// Test file
+// Test files
 std::string input_filename = "input.dat";
 std::string output_filename = "output.dat";
+
+// DSA hardware configuration
 int n_channels = 384;
 int n_antennae = 63;
 int n_pol = 2;
 int n_times = 30720;
+double start_frequency = 1498.75;
 
 std::shared_ptr<dsaXApp> make_app(std::string app_description, std::string app_name) {
 
@@ -32,15 +38,19 @@ std::shared_ptr<dsaXApp> make_app(std::string app_description, std::string app_n
   dsaX_app->add_option("--out-key", out_key, "[default XGPU_BLOCK_KEY]");
   dsaX_app->add_option("--run-beamformer", run_beamformer, "Run the beamformer [default false]");
   dsaX_app->add_option("--run-correlator", run_correlator, "Run the correlator [default false]");
-  dsaX_app->add_option("--start-frequency", start_frequency, "start frequency (assumes 1498.75)");
-
+  dsaX_app->add_option("--test-iter", test_iter, "Run the test 'test_iter' times [default 1]");
+  dsaX_app->add_option("--write-output", write_output, "Write output to disk [default false]");
+  dsaX_app->add_option("--n-streams", n_streams, "The number of device streams [default 10]");
+  
   // Input file options
+  dsaX_app->add_option("--input-rands", input_rands, "Generate random input (default false)");
   dsaX_app->add_option("--input-filename", input_filename, "Name of file on which to run tests");
+
   dsaX_app->add_option("--output-filename", output_filename, "Name of file on which to write results");
   dsaX_app->add_option("--n-channels", n_channels, "Number of frequency channels [default 384]");
   dsaX_app->add_option("--n-antennae", n_antennae, "Number of antennae [default 63]");
   dsaX_app->add_option("--n-pol", n_pol, "Number of polarizations [default 2]");
   dsaX_app->add_option("--n-times", n_times, "Number of times [default 30720]");
-
+  dsaX_app->add_option("--start-frequency", start_frequency, "start frequency (assumes 1498.75)");
   return dsaX_app;
 }
diff --git a/tests/command_line_params.h b/tests/command_line_params.h
index 06e67ac..fb9bd1a 100644
--- a/tests/command_line_params.h
+++ b/tests/command_line_params.h
@@ -1,7 +1,6 @@
 #pragma once
 
 #include <CLI.hpp>
-#include <dsaX.h>
 
 class dsaXApp : public CLI::App {
   
@@ -24,12 +23,16 @@ extern key_t out_key;
 // Test mode
 extern bool run_beamformer;
 extern bool run_correlator;
-extern double start_frequency;
+extern bool input_rands;
+extern bool write_output;
+extern int test_iter;
+extern int n_streams;
 
-// Test file
+// DSA hardware configuration
 extern std::string input_filename;
 extern std::string output_filename;
 extern int n_channels;
 extern int n_antennae;
 extern int n_pol;
 extern int n_times;
+extern double start_frequency;
diff --git a/tests/dsaX_correlator_test.cpp b/tests/dsaX_correlator_test.cpp
index 2ce7390..dfb58f0 100644
--- a/tests/dsaX_correlator_test.cpp
+++ b/tests/dsaX_correlator_test.cpp
@@ -6,6 +6,7 @@
 #include <math.h>
 #include <string.h>
 #include <syslog.h>
+#include <random>
 
 // Include this file to access input parameters
 #include "command_line_params.h"
@@ -93,32 +94,66 @@ int main(int argc, char **argv) {
     return app->exit(e);
   }
   
-  int device_ordinal = 0;    
+  int device_ordinal = 0;
+  int packet_size = 4608;
+
+  // Create a data array for a single call to the correlator class
   FILE *fin, *fout;
-  uint64_t sz, output_size, in_block_size, rd_size;
+  uint64_t sz, in_block_size, rd_size;
   in_block_size = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2;
-  int nreps = 1, nchunks = 1;
+  
+  std::cout << "Creating char file_array of size " << (1.0*sizeof(char)*in_block_size)/pow(1024,2) << " MB." << std::endl;
+  char *file_data = (char *)malloc(in_block_size);  
 
   // read one block of input data  
   // get size of file
-  std::cout << "attempting to read file " << input_filename.c_str() << std::endl; 
-  fin=fopen(input_filename.c_str(), "rb");
-  fseek(fin, 0L, SEEK_END);
-  sz = ftell(fin);
-  rewind(fin);
-
-  // figure out how many reps and chunks to read with
-  if (sz > in_block_size) {
-    nreps = (int)(sz/in_block_size);
-    rd_size = in_block_size;
-  }
-  else {
-    nchunks = (int)(in_block_size/sz);
-    rd_size = sz;
-  }
+  if(!input_rands) {
+    std::cout << "attempting to read file " << input_filename.c_str() << std::endl; 
+    fin = fopen(input_filename.c_str(), "rb");
+    fseek(fin, 0L, SEEK_END);
+    sz = ftell(fin);
+    if(sz != packet_size) {
+      cout << "Error: packet size " << packet_size << " and file size " << sz << " are unequal." << endl;
+      exit(0);
+    }
+    rewind(fin);
+
+    // figure out how many reps and chunks to read with; each branch below sets only one of them, so both start at 1
+    int nreps = 1, nchunks = 1;
+    if (sz > in_block_size) {
+      nreps = (int)(sz/in_block_size);
+      rd_size = in_block_size;
+    }
+    else {
+      nchunks = (int)(in_block_size/sz);
+      rd_size = sz;
+    }
+
+    cout << "Packet size = " << sz << endl;
+    cout << "rd size = " << rd_size << endl;
+    for (int reps = 0; reps<nreps; reps++)
+      for (int chunks = 0; chunks < nchunks; chunks++) {
+        if (chunks > 0) rewind(fin); // input shorter than a block: re-read it from the start to tile the block
+        fread(file_data + (chunks + reps * nchunks)*rd_size , rd_size, 1, fin);
+      }
+  } else {
+    int n_rand = in_block_size/sizeof(uint64_t);
+    uint64_t *input_rand = (uint64_t*)malloc(n_rand * sizeof(uint64_t)); // malloc takes bytes; n_rand counts 64-bit words
 
+    std::random_device rd;
+    std::mt19937_64 gen(rd());
+    gen.seed(1234);
+    std::uniform_int_distribution<uint64_t> dis;
+    for (int i = 0; i < n_rand; i++) input_rand[i] = dis(gen);
+    //for (int i = 0; i < n_rand; i++) input_rand[i] = (uint64_t)1234;
+    memcpy(file_data, (void*)input_rand, n_rand * sizeof(uint64_t)); // copy every generated word, not just the first n_rand bytes
+    free(input_rand);
+  }
+  
   // Start dsaX program
   //---------------------------------------
+  timer::Timer<std::chrono::microseconds, std::chrono::high_resolution_clock> test_timer;
+
   dsaXInit(device_ordinal);
   
   // Create Correlator class instance.
@@ -126,46 +161,49 @@ int main(int argc, char **argv) {
   param.blas_lib = DSA_BLAS_LIB_CUBLAS;
   param.data_type = DSA_BLAS_DATATYPE_4b_COMPLEX;
   param.data_order = DSA_BLAS_DATAORDER_ROW;
+  param.n_streams = n_streams;
   printDsaXCorrParam(param);
+  
   auto correlator = new Correlator(&param);
 
-  output_size = NBASE*NCHAN_PER_PACKET*2*2*4;
-  std::cout << "Creating char output_array of size " << (1.0*sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2)/pow(1024,2) << " MB." << std::endl;
-  char *output_data = (char *)malloc(output_size);
+  // Create GPU registered memory if using CUDA 
+  uint64_t input_size = n_streams*sizeof(char)*in_block_size;
+  std::cout << "Creating char input array of size " << input_size << " bytes." << std::endl;
+  void *input_data = dsaXHostRegister(input_size);
+  // Populate with random data. Each stream has the same data
+  // To ensure the concurrency does not pollute across streams.
+  for (int i = 0; i<n_streams; i++) memcpy((char*)input_data + i * in_block_size, file_data, in_block_size);
 
-  std::cout << "Creating char input_array of size " << (1.0*sizeof(char)*in_block_size)/pow(1024,2) << " MB." << std::endl;
-  char *input_data = (char *)malloc(in_block_size);
+  // Create GPU registered output array
+  uint64_t output_size = n_streams * sizeof(float) * NBASE*NCHAN_PER_PACKET*2*2;
+  std::cout << "Creating float output_array of size " << output_size << " bytes." << std::endl;
+  void *output_data = dsaXHostRegister(output_size);
+
+  // Ensure test output array is zero
+  memset(output_data, 0, output_size);
   
-  std::cout << "Computing " << nreps << " repetitions of " << nchunks << " chunks of input data of size " << rd_size << " bytes." << endl;
-  std::cout << "Total input size = " << (1.0 * nreps * nchunks * rd_size)/pow(1024,3) << " GB." << endl;
-  std::cout << "Expected output size = " << (1.0 * nreps * nchunks * output_size)/pow(1024,3) << " GB." << endl;
+  std::cout << "Total input size = " << (1.0 * input_size)/pow(1024,3) << " GB." << endl;
+  std::cout << "Expected output size = " << (1.0 * output_size)/pow(1024,3) << " GB." << endl;
   
-  // Loop over reps and chunks
-  for (int reps = 0; reps<nreps; reps++) {
-    for (int chunks = 0; chunks<nchunks; chunks++) {
-
-      // Read input file
-      if (chunks>0) rewind(fin);
-      fread(input_data + chunks*rd_size, rd_size, 1, fin);
-
-      //std::cout << "Input peek " << std::endl;
-      //for (int i=0; i<8; i++) inspectPackedData(input_data[i], i);
-      
-      // run correlator and record output data
-      //dsaXCorrelator((void*)output_data, (void*)input_data, &param);
-      correlator->compute((void*)output_data, (void*)input_data);
-      
-      //std::cout << "Output peek " << std::endl;
-      //for(int i=0; i<output_size; i++) inspectPackedData(output_data[i], i, true);
-
-      fout = fopen(output_filename.c_str(),"ab");
-      fwrite((unsigned char *)output_data, sizeof(unsigned char *), sizeof(float)*output_size, fout);
-      fclose(fout);
-      exit(0);
-    }
+  test_timer.start();  
+  correlator->compute(output_data, input_data);
+  test_timer.stop();
+  
+  //std::cout << "Output peek " << std::endl;
+  float *p = (float*)output_data;
+  for(int i=0; i<8; i++) cout << "output[" << i << "] = " << p[i] << endl;
+  
+  if(write_output) {
+    fout = fopen(output_filename.c_str(),"ab");
+    fwrite(output_data, sizeof(unsigned char), output_size, fout); // output_size is already a byte count
+    fclose(fout);
   }
-
+  
+  delete correlator;
   dsaXEnd();
+
+  std::cout << "Test time = " << (1.0*test_timer.elapsed().count())/(1e6) << " seconds. " << endl;
+  
   // End dsaX program
   //---------------------------------------
 
@@ -174,8 +212,7 @@ int main(int argc, char **argv) {
   free(output_data);
   return 0;
   
-  /*
-  
+  /*  
   // Read data
   BinaryFileVector binaryFileVector;
 

From 48d6b6d3171525261dfd878cc4aaad3ed52e005d Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Thu, 4 Jul 2024 21:40:40 -0700
Subject: [PATCH 28/30] Rename files for convenient auto complete in CL

---
 include/CMakeLists.txt                        |  10 +-
 ...dsaX_blas_interface.h => blas_interface.h} |   2 +-
 ..._cublas_interface.h => cublas_interface.h} |   0
 include/cuda_handles.h                        |  20 ++
 .../{dsaX_cuda_headers.h => cuda_headers.h}   |   0
 ...dsaX_cuda_interface.h => cuda_interface.h} |   2 +-
 .../{dsaX_cuda_kernels.h => cuda_kernels.h}   |   2 +-
 ...utlass_interface.h => cutlass_interface.h} |   0
 include/dsaX.h                                |   6 +-
 include/dsaX_beamformer_correlator.h          |   9 -
 include/{dsaX_enums.h => enums.h}             |   0
 include/{dsaX_ftd.h => fast_time_domain.h}    |   4 +-
 include/{dsaX_interface.h => interface.h}     |   0
 .../{dsaX_magma_headers.h => magma_headers.h} |   0
 ...aX_magma_interface.h => magma_interface.h} |   0
 include/{dsaX_params.h => params.h}           |   2 +-
 .../{dsaX_psrdada_utils.h => psrdada_utils.h} |   2 +-
 include/{dsaX_utils.h => utils.h}             |   2 +-
 src/CMakeLists.txt                            |  37 ++--
 src/{dsaX_beamformer.cpp => beamformer.cpp}   |   6 +-
 src/{dsaX_correlator.cpp => correlator.cpp}   |   8 +-
 ...ublas_interface.cu => cublas_interface.cu} |   6 +-
 src/cuda_handles.cu                           |  64 +++++++
 ...aX_cuda_interface.cu => cuda_interface.cu} |   8 +-
 ...lass_interface.cu => cutlass_interface.cu} |   0
 src/dsaX_blas_interface.cpp                   |  28 ---
 src/{dsaX_interface.cpp => interface.cpp}     |   8 +-
 ..._magma_interface.cu => magma_interface.cu} |   6 +-
 src/{dsaX_params.cpp => params.cpp}           |   2 +-
 ...aX_psrdada_utils.cpp => psrdada_utils.cpp} |   2 +-
 src/{dsaX_utils.cpp => utils.cpp}             |   8 +-
 tests/CMakeLists.txt                          |   4 +-
 tests/command_line_params.cpp                 |   4 +-
 ...orrelator_test.cpp => correlator_test.cpp} | 174 ++++++++++++++----
 34 files changed, 284 insertions(+), 142 deletions(-)
 rename include/{dsaX_blas_interface.h => blas_interface.h} (85%)
 rename include/{dsaX_cublas_interface.h => cublas_interface.h} (100%)
 create mode 100644 include/cuda_handles.h
 rename include/{dsaX_cuda_headers.h => cuda_headers.h} (100%)
 rename include/{dsaX_cuda_interface.h => cuda_interface.h} (98%)
 rename include/{dsaX_cuda_kernels.h => cuda_kernels.h} (99%)
 rename include/{dsaX_cutlass_interface.h => cutlass_interface.h} (100%)
 delete mode 100644 include/dsaX_beamformer_correlator.h
 rename include/{dsaX_enums.h => enums.h} (100%)
 rename include/{dsaX_ftd.h => fast_time_domain.h} (98%)
 rename include/{dsaX_interface.h => interface.h} (100%)
 rename include/{dsaX_magma_headers.h => magma_headers.h} (100%)
 rename include/{dsaX_magma_interface.h => magma_interface.h} (100%)
 rename include/{dsaX_params.h => params.h} (99%)
 rename include/{dsaX_psrdada_utils.h => psrdada_utils.h} (93%)
 rename include/{dsaX_utils.h => utils.h} (89%)
 rename src/{dsaX_beamformer.cpp => beamformer.cpp} (98%)
 rename src/{dsaX_correlator.cpp => correlator.cpp} (98%)
 rename src/{dsaX_cublas_interface.cu => cublas_interface.cu} (98%)
 create mode 100644 src/cuda_handles.cu
 rename src/{dsaX_cuda_interface.cu => cuda_interface.cu} (99%)
 rename src/{dsaX_cutlass_interface.cu => cutlass_interface.cu} (100%)
 delete mode 100644 src/dsaX_blas_interface.cpp
 rename src/{dsaX_interface.cpp => interface.cpp} (97%)
 rename src/{dsaX_magma_interface.cu => magma_interface.cu} (86%)
 rename src/{dsaX_params.cpp => params.cpp} (99%)
 rename src/{dsaX_psrdada_utils.cpp => psrdada_utils.cpp} (90%)
 rename src/{dsaX_utils.cpp => utils.cpp} (91%)
 rename tests/{dsaX_correlator_test.cpp => correlator_test.cpp} (53%)

diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
index 58b1566..65ddb04 100644
--- a/include/CMakeLists.txt
+++ b/include/CMakeLists.txt
@@ -6,14 +6,14 @@ set(DSA_XENGINE_HEADERS
   # cmake-format: sortable
   dsaX.h
   dsaX_def.h
-  dsaX_ftd.h  
-  dsaX_cuda_interface.h
-  dsaX_cuda_handles.h
-  dsaX_cuda_headers.h
+  fast_time_domain.h  
+  cuda_interface.h
+  cuda_handles.h
+  cuda_headers.h
   dsaX_capture.h
   dsaX_capture_manythread.h
   dsaX_capture_pcap.h
-  dsaX_cutlass_interface.h
+  cutlass_interface.h
   )
 install(FILES ${DSA_XENGINE_HEADERS} DESTINATION include)
 #------------------------------
diff --git a/include/dsaX_blas_interface.h b/include/blas_interface.h
similarity index 85%
rename from include/dsaX_blas_interface.h
rename to include/blas_interface.h
index 4c6edaf..d643e08 100644
--- a/include/dsaX_blas_interface.h
+++ b/include/blas_interface.h
@@ -1,5 +1,5 @@
 #pragma once
 
-#include "dsaX_interface.h"
+#include "interface.h"
 
 void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param, int stream = 0);
diff --git a/include/dsaX_cublas_interface.h b/include/cublas_interface.h
similarity index 100%
rename from include/dsaX_cublas_interface.h
rename to include/cublas_interface.h
diff --git a/include/cuda_handles.h b/include/cuda_handles.h
new file mode 100644
index 0000000..eeaf706
--- /dev/null
+++ b/include/cuda_handles.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <vector>
+
+#include "utils.h"
+
+#ifdef DSA_XENGINE_TARGET_CUDA
+#include "cuda_headers.h"
+
+static std::vector<cudaStream_t> streams; // NOTE(review): `static` in a header
+static cublasHandle_t cublasH = NULL;     // gives each including source file
+// its own private copy of these four objects - confirm that is intended.
+static bool cublas_init = false;
+static bool stream_init = false;
+
+cudaStream_t get_stream(unsigned int i);
+#endif
+
+void init_streams(unsigned int n_streams);
+void destroy_streams();
diff --git a/include/dsaX_cuda_headers.h b/include/cuda_headers.h
similarity index 100%
rename from include/dsaX_cuda_headers.h
rename to include/cuda_headers.h
diff --git a/include/dsaX_cuda_interface.h b/include/cuda_interface.h
similarity index 98%
rename from include/dsaX_cuda_interface.h
rename to include/cuda_interface.h
index 4ad2aed..6ae59e2 100644
--- a/include/dsaX_cuda_interface.h
+++ b/include/cuda_interface.h
@@ -3,7 +3,7 @@
 #include <vector>
 
 #include "dsaX_def.h"
-#include "dsaX_enums.h"
+#include "enums.h"
 #include "dsaX.h"
 
 void dsaXInitCuda(int dev);
diff --git a/include/dsaX_cuda_kernels.h b/include/cuda_kernels.h
similarity index 99%
rename from include/dsaX_cuda_kernels.h
rename to include/cuda_kernels.h
index 49e9ff0..d57a11b 100644
--- a/include/dsaX_cuda_kernels.h
+++ b/include/cuda_kernels.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "dsaX_cuda_headers.h"
+#include "cuda_headers.h"
 
 __global__ void inspectPackedDataInKernel(char input, int i) {
   float re = (float)((char)((   (unsigned char)(input) & (unsigned char)(15)  ) << 4) >> 4);
diff --git a/include/dsaX_cutlass_interface.h b/include/cutlass_interface.h
similarity index 100%
rename from include/dsaX_cutlass_interface.h
rename to include/cutlass_interface.h
diff --git a/include/dsaX.h b/include/dsaX.h
index eab6f75..8aff8c5 100644
--- a/include/dsaX.h
+++ b/include/dsaX.h
@@ -3,9 +3,9 @@
 // Expose the use to compile time definitions,
 // enums, parameters, and classes
 #include "dsaX_def.h"
-#include "dsaX_enums.h"
-#include "dsaX_params.h"
-#include "dsaX_ftd.h"
+#include "enums.h"
+#include "params.h"
+#include "fast_time_domain.h"
 
 // Use manual transpose route
 // Uncomment to try new pure cuBLAS
diff --git a/include/dsaX_beamformer_correlator.h b/include/dsaX_beamformer_correlator.h
deleted file mode 100644
index 7001f4a..0000000
--- a/include/dsaX_beamformer_correlator.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#pragma once
-
-// correlator function
-// workflow: copy to device, reorder, stridedBatchedGemm, reorder
-void dcorrelator(dmem *d);
-
-// beamformer function
-void dbeamformer(dmem * d);
-
diff --git a/include/dsaX_enums.h b/include/enums.h
similarity index 100%
rename from include/dsaX_enums.h
rename to include/enums.h
diff --git a/include/dsaX_ftd.h b/include/fast_time_domain.h
similarity index 98%
rename from include/dsaX_ftd.h
rename to include/fast_time_domain.h
index 2f05432..98ce8ff 100644
--- a/include/dsaX_ftd.h
+++ b/include/fast_time_domain.h
@@ -1,7 +1,7 @@
 #pragma once
 
-#include "dsaX_enums.h"
-#include "dsaX_params.h"
+#include "enums.h"
+#include "params.h"
 #include "timer.h"
 
 using ms = std::chrono::microseconds;
diff --git a/include/dsaX_interface.h b/include/interface.h
similarity index 100%
rename from include/dsaX_interface.h
rename to include/interface.h
diff --git a/include/dsaX_magma_headers.h b/include/magma_headers.h
similarity index 100%
rename from include/dsaX_magma_headers.h
rename to include/magma_headers.h
diff --git a/include/dsaX_magma_interface.h b/include/magma_interface.h
similarity index 100%
rename from include/dsaX_magma_interface.h
rename to include/magma_interface.h
diff --git a/include/dsaX_params.h b/include/params.h
similarity index 99%
rename from include/dsaX_params.h
rename to include/params.h
index 85d2858..08ff440 100644
--- a/include/dsaX_params.h
+++ b/include/params.h
@@ -2,7 +2,7 @@
 
 #include <complex>
 
-#include "dsaX_enums.h"
+#include "enums.h"
 
 // Structure that carries BLAS parameters
 // This should be able to communicate to all
diff --git a/include/dsaX_psrdada_utils.h b/include/psrdada_utils.h
similarity index 93%
rename from include/dsaX_psrdada_utils.h
rename to include/psrdada_utils.h
index 2dc3dec..2b60bf3 100644
--- a/include/dsaX_psrdada_utils.h
+++ b/include/psrdada_utils.h
@@ -9,7 +9,7 @@
 #include "dada_affinity.h"
 #include "ascii_header.h"
 #include "dsaX_def.h"
-#include "dsaX_enums.h"
+#include "enums.h"
 
 void dsaX_dbgpu_cleanup (dada_hdu_t * in, dada_hdu_t * out);
 
diff --git a/include/dsaX_utils.h b/include/utils.h
similarity index 89%
rename from include/dsaX_utils.h
rename to include/utils.h
index fbc30fc..96a7004 100644
--- a/include/dsaX_utils.h
+++ b/include/utils.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "dsaX_params.h"
+#include "params.h"
 #include "timer.h"
 
 void dsaXmemset(void *array, int ch, size_t n);
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d79d89f..67f8543 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -11,18 +11,18 @@ endif()
 # DSA Fast Time Domain library
 #-----------------------------
 set(DSAX_OBJS
-  dsaX_cuda_interface.cu
-  dsaX_cublas_interface.cu
-  dsaX_cuda_handles.cu
-  dsaX_magma_interface.cu
-  dsaX_blas_interface.cpp
-  dsaX_beamformer.cpp
+  cuda_interface.cu
+  cublas_interface.cu
+  cuda_handles.cu
+  magma_interface.cu
+  blas_interface.cpp
+  beamformer.cpp
   dsaX_base.cpp
-  dsaX_correlator.cpp
-  dsaX_interface.cpp
-  dsaX_utils.cpp
-  dsaX_params.cpp
-  dsaX_psrdada_utils.cpp
+  correlator.cpp
+  interface.cpp
+  utils.cpp
+  params.cpp
+  psrdada_utils.cpp
   )
 
 # split source into cu and cpp files
@@ -115,18 +115,3 @@ install(TARGETS
   lib
   )
 #-----------------------------
-
-# install step for executables
-#-----------------------------
-install(TARGETS
-  # cmake-format: sortable
-  #dsaX_beamformer_correlator
-  RUNTIME DESTINATION
-  bin
-  )
-#-----------------------------
-
-if(CUDAToolkit_FOUND)
-  #add_executable(dsaX_beamformer_correlator_exe dsaX_beamformer_correlator_exe.cu)
-  #target_link_libraries(dsaX_beamformer_correlator_exe PUBLIC dsax ${CUDA_cublas_LIBRARY} ${PSRDada_LIB})
-endif()
diff --git a/src/dsaX_beamformer.cpp b/src/beamformer.cpp
similarity index 98%
rename from src/dsaX_beamformer.cpp
rename to src/beamformer.cpp
index 2dc5aef..e99a54c 100644
--- a/src/dsaX_beamformer.cpp
+++ b/src/beamformer.cpp
@@ -11,9 +11,9 @@ Workflow is similar for BF and corr applications
 
 #include "dsaX_def.h"
 #include "dsaX.h"
-#include "dsaX_blas_interface.h"
-#include "dsaX_utils.h"
-#include "dsaX_psrdada_utils.h"
+#include "blas_interface.h"
+#include "utils.h"
+#include "psrdada_utils.h"
 
 using namespace std;
 
diff --git a/src/dsaX_correlator.cpp b/src/correlator.cpp
similarity index 98%
rename from src/dsaX_correlator.cpp
rename to src/correlator.cpp
index 4c3fe36..e45595d 100644
--- a/src/dsaX_correlator.cpp
+++ b/src/correlator.cpp
@@ -11,10 +11,10 @@ Workflow is similar for BF and corr applications
 
 #include "dsaX_def.h"
 #include "dsaX.h"
-#include "dsaX_ftd.h"
-#include "dsaX_blas_interface.h"
-#include "dsaX_utils.h"
-#include "dsaX_psrdada_utils.h"
+#include "fast_time_domain.h"
+#include "blas_interface.h"
+#include "utils.h"
+#include "psrdada_utils.h"
 
 using namespace std;
 
diff --git a/src/dsaX_cublas_interface.cu b/src/cublas_interface.cu
similarity index 98%
rename from src/dsaX_cublas_interface.cu
rename to src/cublas_interface.cu
index c528546..234e18a 100644
--- a/src/dsaX_cublas_interface.cu
+++ b/src/cublas_interface.cu
@@ -1,9 +1,9 @@
 #include <iostream>
 
 #include "dsaX.h"
-#include "dsaX_params.h"
-#include "dsaX_cuda_headers.h"
-#include "dsaX_cuda_handles.h"
+#include "params.h"
+#include "cuda_headers.h"
+#include "cuda_handles.h"
 //#include "dsaX_cuda_kernels.h" // For debug
 
 using namespace std;
diff --git a/src/cuda_handles.cu b/src/cuda_handles.cu
new file mode 100644
index 0000000..9b65281
--- /dev/null
+++ b/src/cuda_handles.cu
@@ -0,0 +1,64 @@
+#include <iostream>
+#include <utils.h>
+#include <cuda_handles.h>
+
+using namespace std;
+
+#ifdef DSA_XENGINE_TARGET_CUDA
+
+// CUDA stream handler functions
+//-------------------------
+// Create `n_streams` CUDA streams and store them in the file-scope `streams`
+// vector. Requests outside [2, 9] abort the program. Calling this again while
+// streams are already initialised is a no-op.
+void init_streams(unsigned int n_streams) {
+
+  if(n_streams < 2 || n_streams > 9) {
+    cout << "dsaX Error: Must have at least 2 and at most 9 streams, requested " << n_streams << endl;
+    exit(0);
+  }
+  
+  if(!stream_init) {
+    // resize(), not reserve(): reserve() only allocates capacity and leaves
+    // size() == 0, so the loop below would create no streams and
+    // get_stream(i) would index past the end of an empty vector.
+    streams.resize(n_streams);
+    for (auto &s : streams) cudaStreamCreate(&s);
+    /*
+      Priority-stream variant, kept for reference:
+      int greatestPriority;
+      int leastPriority;
+      // For CUDA, lower numerical values indicate higher priority
+      cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
+      for (unsigned int i=0; i<n_streams-1; i++)
+        cudaStreamCreateWithPriority(&streams[i], cudaStreamDefault, greatestPriority);
+      cudaStreamCreateWithPriority(&streams[n_streams - 1], cudaStreamDefault, leastPriority);
+    */
+    stream_init = true;
+  }
+}
+
+void destroy_streams() {
+  if (!stream_init) { // nothing was created by init_streams(): warn and leave
+    cout << "dsaX Warning: streams not initialized. Please call dsaXInitStreams(n) before destroying streams." << endl;
+    return;
+  }
+  for (auto &s : streams) cudaStreamDestroy(s);
+  stream_init = false;
+}
+
+// Return stream `i`, or report an error and exit if init_streams() has not
+// run. The index itself is not range checked.
+cudaStream_t get_stream(unsigned int i) {
+  if(stream_init) return streams[i];
+  cout << "dsaX Error: streams not initialized. Please call dsaXInitStreams(n) before getting stream." << endl;
+  exit(0);
+}
+
+#else
+
+// Error-out stubs for builds without the CUDA target. Signatures match the
+// declarations in cuda_handles.h; each prints an error and exits.
+void init_streams(unsigned int) { cout << "dsaX Error: CUDA target not build" << endl; exit(0); }
+void destroy_streams() { cout << "dsaX Error: CUDA target not build" << endl; exit(0); }
+#endif
diff --git a/src/dsaX_cuda_interface.cu b/src/cuda_interface.cu
similarity index 99%
rename from src/dsaX_cuda_interface.cu
rename to src/cuda_interface.cu
index b8af344..51ee957 100644
--- a/src/dsaX_cuda_interface.cu
+++ b/src/cuda_interface.cu
@@ -1,10 +1,10 @@
 #include <iostream>
 #include <vector>
 
-#include "dsaX_cuda_headers.h"
-#include "dsaX_cuda_interface.h"
-#include "dsaX_cuda_kernels.h"
-#include "dsaX_cuda_handles.h"
+#include "cuda_headers.h"
+#include "cuda_interface.h"
+#include "cuda_kernels.h"
+#include "cuda_handles.h"
 
 using namespace std;
 
diff --git a/src/dsaX_cutlass_interface.cu b/src/cutlass_interface.cu
similarity index 100%
rename from src/dsaX_cutlass_interface.cu
rename to src/cutlass_interface.cu
diff --git a/src/dsaX_blas_interface.cpp b/src/dsaX_blas_interface.cpp
deleted file mode 100644
index 04be79b..0000000
--- a/src/dsaX_blas_interface.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-#include <iostream>
-
-#include "dsaX.h"
-#include "dsaX_cublas_interface.h"
-#include "dsaX_magma_interface.h"
-
-void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param, int stream) {
-  switch (param.blas_lib) {
-  case DSA_BLAS_LIB_CUBLAS:
-    dsaXHgemmStridedBatchedCuda(real_a, imag_a, real_b, imag_b, real_c, imag_c, param, stream);
-    break;
-  case DSA_BLAS_LIB_MAGMA:
-    //dsaXHgemmStridedBatchedMagma(real_a, imag_a, real_b, imag_b, real_c, imag_c, param, stream);
-    break;
-  case DSA_BLAS_LIB_CUTLASS:
-    //dsaXHgemmStridedBatchedCutlass(real_a, imag_a, real_b, imag_b, real_c, imag_c, param);
-    break;
-  case DSA_BLAS_LIB_OPENBLAS:
-    //dsaXHgemmStridedBatchedOpenblas(real_a, imag_a, real_b, imag_b, real_c, imag_c, param);
-    break;
-  case DSA_BLAS_LIB_TCC:
-    //dsaXHgemmStridedBatchedTcc(real_a, imag_a, real_b, imag_b, real_c, imag_c, param);
-    break;
-  default:
-    std::cout << "dsaX Error: Unknown blas_lib " << param.blas_lib << " given." << std::endl;
-    exit(0);
-  }
-}
diff --git a/src/dsaX_interface.cpp b/src/interface.cpp
similarity index 97%
rename from src/dsaX_interface.cpp
rename to src/interface.cpp
index e0f294a..31dc832 100644
--- a/src/dsaX_interface.cpp
+++ b/src/interface.cpp
@@ -3,10 +3,10 @@
 #include <cstring>
 #include <string>
 
-#include "dsaX_params.h"
-#include "dsaX_cuda_interface.h"
-#include "dsaX_utils.h"
-#include "dsaX_ftd.h"
+#include "params.h"
+#include "cuda_interface.h"
+#include "utils.h"
+#include "fast_time_domain.h"
 
 using namespace std;
 
diff --git a/src/dsaX_magma_interface.cu b/src/magma_interface.cu
similarity index 86%
rename from src/dsaX_magma_interface.cu
rename to src/magma_interface.cu
index eabfdbf..af91a52 100644
--- a/src/dsaX_magma_interface.cu
+++ b/src/magma_interface.cu
@@ -1,9 +1,9 @@
 #include <iostream>
 
 #include "dsaX.h"
-#include "dsaX_params.h"
-#include "dsaX_cuda_headers.h"
-#include "dsaX_magma_headers.h"
+#include "params.h"
+#include "cuda_headers.h"
+#include "magma_headers.h"
 
 using namespace std;
 
diff --git a/src/dsaX_params.cpp b/src/params.cpp
similarity index 99%
rename from src/dsaX_params.cpp
rename to src/params.cpp
index 4179848..723264c 100644
--- a/src/dsaX_params.cpp
+++ b/src/params.cpp
@@ -1,6 +1,6 @@
 #include <iostream>
 
-#include "dsaX_params.h"
+#include "params.h"
 
 using namespace std;
 
diff --git a/src/dsaX_psrdada_utils.cpp b/src/psrdada_utils.cpp
similarity index 90%
rename from src/dsaX_psrdada_utils.cpp
rename to src/psrdada_utils.cpp
index 07c16e6..3978ecd 100644
--- a/src/dsaX_psrdada_utils.cpp
+++ b/src/psrdada_utils.cpp
@@ -1,4 +1,4 @@
-#include "dsaX_psrdada_utils.h"
+#include "psrdada_utils.h"
 
 void dsaX_dbgpu_cleanup(dada_hdu_t * in, dada_hdu_t * out)
 {
diff --git a/src/dsaX_utils.cpp b/src/utils.cpp
similarity index 91%
rename from src/dsaX_utils.cpp
rename to src/utils.cpp
index d29e291..cc4194d 100644
--- a/src/dsaX_utils.cpp
+++ b/src/utils.cpp
@@ -1,9 +1,9 @@
 #include <iostream>
 
-#include "dsaX_utils.h"
-#include "dsaX_enums.h"
-#include "dsaX_params.h"
-#include "dsaX_cuda_interface.h"
+#include "utils.h"
+#include "enums.h"
+#include "params.h"
+#include "cuda_interface.h"
 
 using namespace std;
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 4a93c15..64aa8db 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -4,5 +4,5 @@ include_directories(${CLI11_SOURCE_DIR}/include/CLI)
 
 add_library(dsaX_tests command_line_params.cpp)
 
-add_executable(dsaX_correlator_test dsaX_correlator_test.cpp)
-target_link_libraries(dsaX_correlator_test dsaX dsaX_tests)
+add_executable(correlator_test correlator_test.cpp)
+target_link_libraries(correlator_test PRIVATE dsaX dsaX_tests)
diff --git a/tests/command_line_params.cpp b/tests/command_line_params.cpp
index 746b4cc..82c02e8 100644
--- a/tests/command_line_params.cpp
+++ b/tests/command_line_params.cpp
@@ -11,10 +11,10 @@ key_t out_key = 0x0000fada; // XGPU_BLOCK_KEY in dsaX_def.h
 // Test params
 bool run_beamformer = false;
 bool run_correlator = false;
-bool input_rands = false;
+bool input_rands = true;
 bool write_output = false;
 int test_iter = 1;
-int n_streams = 10;
+int n_streams = 8;
 
 // Test files
 std::string input_filename = "input.dat";
diff --git a/tests/dsaX_correlator_test.cpp b/tests/correlator_test.cpp
similarity index 53%
rename from tests/dsaX_correlator_test.cpp
rename to tests/correlator_test.cpp
index dfb58f0..6f8d6df 100644
--- a/tests/dsaX_correlator_test.cpp
+++ b/tests/correlator_test.cpp
@@ -8,35 +8,132 @@
 #include <syslog.h>
 #include <random>
 
+using namespace std;
+
 // Include this file to access input parameters
 #include "command_line_params.h"
 
+// Test utilities (defined inline in this file)
+/**
+ * Unpack 4-bit complex samples into interleaved (re, im) values of type prec.
+ *
+ * Each input byte is laid out as iiiirrrr: the low nibble is the real part
+ * and the high nibble the imaginary part, both signed 4-bit values.
+ *
+ * @param[out] output interleaved array, 2*rows*cols elements: re, im, re, im...
+ * @param[in]  input packed array, rows*cols bytes
+ * @param[in]  rows number of rows
+ * @param[in]  cols number of cols
+ */
+template <typename prec> void promoteComplexCharToFloat(prec *output, const char *input, const int rows, const int cols) {
+  
+  // The pragma must be immediately followed by the loop nest it collapses,
+  // so idx is declared inside the loop body (which also makes it private
+  // to each thread).
+#pragma omp parallel for collapse(2)
+  for(int i=0; i<rows; i++) {
+    for(int j=0; j<cols; j++) {
+      const int idx = i * cols + j;
+      const unsigned char packed = (unsigned char)(input[idx]);
+      
+      // 15 in unsigned char binary is 00001111. Perform bitwise & on 15 and
+      // the packed byte iiiirrrr to get the real part 4 bit data.
+      // 0000rrrr
+      // Bit shift this result by 4 to the left.
+      // rrrr0000
+      // Cast to signed char (plain char may be unsigned on some targets),
+      // then shift right by 4 so the sign bit is carried down.
+      // ssssrrrr
+      output[2*idx] = (prec)((signed char)((packed & (unsigned char)(15)) << 4) >> 4);
+      
+      // 240 in unsigned char binary is 11110000. Perform bitwise & on 240 and
+      // the packed byte iiiirrrr to get the imag part 4 bit data.
+      // iiii0000
+      // Cast to signed char, then shift right by 4 with the sign carried down.
+      // ssssiiii
+      output[2*idx+1] = (prec)((signed char)(packed & (unsigned char)(240)) >> 4);
+    }
+  }
+}
+
+// Host reference for C = A^dagger * B on interleaved (re, im) data stored in
+// ROW order: A is indexed as k x m, B as k x n and C as m x n complex elements.
+template <typename prec> void host_MdagM_gemm(const prec *A, const prec *B, prec *C, const int m, const int n, const int k) {
+  
+#pragma omp parallel for collapse(2)
+  for(int row=0; row<m; row++) {
+    for(int col=0; col<n; col++) {
+      
+      // Real and imaginary slots of C(row, col)
+      prec *c_re = C + 2*(row * n + col);
+      prec *c_im = c_re + 1;
+      *c_re = 0.0;
+      *c_im = 0.0;
+      for(int l=0; l<k; l++) {
+
+        // A(l, row) is conjugated, B(l, col) is used as is
+        const prec *a = A + 2*(l * m + row);
+        const prec *b = B + 2*(l * n + col);
+
+        // Accumulate conj(a) * b, where a[0]/b[0] are the real parts
+        // and a[1]/b[1] the imaginary parts
+        *c_re += a[0] * b[0] + a[1] * b[1];
+        *c_im += a[0] * b[1] - a[1] * b[0];
+      }
+    }
+  }
+}
+
+// Measure how far C is from being Hermitian. Assumes ROW ordered data in
+// interleaved format; the transposed index is only meaningful when m == n.
+template <typename prec> prec test_hermiticity(const prec *C, const int m, const int n) {
+
+  prec frob_norm = 0.0;
+  
+#pragma omp parallel for collapse(2) reduction (+:frob_norm)
+  for(int row=0; row<m; row++) {
+    for(int col=0; col<n; col++) {
+
+      // Element (row, col) and its transposed partner (col, row)
+      const prec *elem = C + 2*(row * n + col);
+      const prec *elem_t = C + 2*(col * m + row);
+
+      // For a Hermitian matrix the real parts are equal and the
+      // imaginary parts are opposite, so both terms below vanish.
+      const double re_term = pow((elem[0] - elem_t[0]), 2);
+      const double im_term = pow((elem[1] + elem_t[1]), 2);
+      frob_norm += re_term + im_term;
+    }
+  }
+  // Sum of squared deviations averaged over the 2*m*n real values
+  return frob_norm/(m*n*2);
+}
+
 // Include the dsaX.h header in your application
 #include <dsaX.h>
 
-using namespace std;
-
 // The class offers entire file content read/write in single operation
-class BinaryFileVector : public std::vector<char>
+class BinaryFileVector : public vector<char>
 {
 public:
 
-  using std::vector<char>::vector;
+  using vector<char>::vector;
 
   bool loadFromFile(const char *fileName) noexcept
   {
     // Try to open a file specified by its name    
-    std::ifstream file(fileName, std::ios::in | std::ios::binary);
+    ifstream file(fileName, ios::in | ios::binary);
     if (!file.is_open() || file.bad())
       return false;
 
     // Clear whitespace removal flag
-    file.unsetf(std::ios::skipws);
+    file.unsetf(ios::skipws);
 
     // Determine size of the file
-    file.seekg(0, std::ios_base::end);
+    file.seekg(0, ios_base::end);
     size_t fileSize = file.tellg();
-    file.seekg(0, std::ios_base::beg);
+    file.seekg(0, ios_base::beg);
 
     // Discard previous vector content
     resize(0);
@@ -48,15 +145,15 @@ class BinaryFileVector : public std::vector<char>
 
     // Read entire file content into prealocated vector memory
     insert(begin(),
-	   std::istream_iterator<char>(file),
-	   std::istream_iterator<char>());
+	   istream_iterator<char>(file),
+	   istream_iterator<char>());
 
     // Make sure entire content is loaded
     if(size() == fileSize) {
-      std::cout << "Successfully read file of size " << fileSize << std::endl;
+      cout << "Successfully read file of size " << fileSize << endl;
       return true;
     } else {
-      std::cout << "Unexpected file size." << std::endl;
+      cout << "Unexpected file size." << endl;
       return false;
     }
   }
@@ -64,7 +161,7 @@ class BinaryFileVector : public std::vector<char>
   bool saveToFile(const char *fileName) const noexcept
   {
     // Write entire vector content into a file specified by its name
-    std::ofstream file(fileName, std::ios::out | std::ios::binary);
+    ofstream file(fileName, ios::out | ios::binary);
     try {
       file.write((const char *) data(), size());
     }
@@ -75,10 +172,10 @@ class BinaryFileVector : public std::vector<char>
     // Determine number of bytes successfully stored in file
     size_t fileSize = file.tellp();
     if(size() == fileSize) {
-      std::cout << "Successfully wrote file of size " << fileSize  << std::endl;
+      cout << "Successfully wrote file of size " << fileSize  << endl;
       return true;
     } else {
-      std::cout << "Unexpected file size." << std::endl;
+      cout << "Unexpected file size." << endl;
       return false;
     }
   }
@@ -102,13 +199,13 @@ int main(int argc, char **argv) {
   uint64_t sz, in_block_size, rd_size;
   in_block_size = NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2;
   
-  std::cout << "Creating char file_array of size " << (1.0*sizeof(char)*in_block_size)/pow(1024,2) << " MB." << std::endl;
+  cout << "Creating char file_array of size " << (1.0*sizeof(char)*in_block_size)/pow(1024,2) << " MB." << endl;
   char *file_data = (char *)malloc(in_block_size);  
 
   // read one block of input data  
   // get size of file
   if(!input_rands) {
-    std::cout << "attempting to read file " << input_filename.c_str() << std::endl; 
+    cout << "attempting to read file " << input_filename.c_str() << endl; 
     fin = fopen(input_filename.c_str(), "rb");
     fseek(fin, 0L, SEEK_END);
     sz = ftell(fin);
@@ -140,10 +237,10 @@ int main(int argc, char **argv) {
     int n_rand = in_block_size/sizeof(uint64_t);
     uint64_t *input_rand = (uint64_t*)malloc(n_rand);
 
-    std::random_device rd;
-    std::mt19937_64 gen(rd());
+    random_device rd;
+    mt19937_64 gen(rd());
     gen.seed(1234);
-    std::uniform_int_distribution<uint64_t> dis;
+    uniform_int_distribution<uint64_t> dis;
     for (int i = 0; i < n_rand; i++) input_rand[i] = dis(gen);
     //for (int i = 0; i < n_rand; i++) input_rand[i] = (uint64_t)1234;
     memcpy(file_data, (void*)input_rand, n_rand);
@@ -152,7 +249,7 @@ int main(int argc, char **argv) {
   
   // Start dsaX program
   //---------------------------------------
-  timer::Timer<std::chrono::microseconds, std::chrono::high_resolution_clock> test_timer;
+  timer::Timer<chrono::microseconds, chrono::high_resolution_clock> test_timer;
 
   dsaXInit(device_ordinal);
   
@@ -168,7 +265,7 @@ int main(int argc, char **argv) {
 
   // Create GPU registered memory if using CUDA 
   uint64_t input_size = n_streams*sizeof(char)*in_block_size;
-  std::cout << "Creating char input array of size " << input_size << " bytes." << std::endl;
+  cout << "Creating char input array of size " << input_size << " bytes." << endl;
   void *input_data = dsaXHostRegister(input_size);
   // Populate with random data. Each stream has the same data
   // To ensure the concurrency does not pollute accross streams. 
@@ -176,20 +273,33 @@ int main(int argc, char **argv) {
 
   // Create GPU registered output array
   uint64_t output_size = n_streams * sizeof(float) * NBASE*NCHAN_PER_PACKET*2*2;
-  std::cout << "Creating float output_array of size " << output_size << " bytes." << std::endl;
+  cout << "Creating float output_array of size " << output_size << " bytes." << endl;
   void *output_data = dsaXHostRegister(output_size);
 
+  /*
+  float *A = (float*)dsaXHostRegister(2*sizeof(float)*96*512);
+  float *B = (float*)dsaXHostRegister(2*sizeof(float)*96*512);
+  float *C = (float*)dsaXHostRegister(2*sizeof(float)*96*96);
+  promoteComplexCharToFloat(A, file_data, 512, 96);
+  promoteComplexCharToFloat(B, file_data, 512, 96);  
+  host_MdagM_gemm(A, B, C, 96, 96, 512); 
+  */
+    
   // Ensure test output array is zero
   memset(output_data, 0, output_size);
   
-  std::cout << "Total input size = " << (1.0 * input_size)/pow(1024,3) << " GB." << endl;
-  std::cout << "Expected output size = " << (1.0 * output_size)/pow(1024,3) << " GB." << endl;
+  cout << "Total input size = " << (1.0 * input_size)/pow(1024,3) << " GB." << endl;
+  cout << "Expected output size = " << (1.0 * output_size)/pow(1024,3) << " GB." << endl;
   
   test_timer.start();  
   correlator->compute(output_data, input_data);
   test_timer.stop();
+
+  float frob_norm = test_hermiticity((float*)output_data, 96, 96);
+  cout << "Frobenius norm = " << frob_norm << endl;
+
   
-  //std::cout << "Output peek " << std::endl;
+  //cout << "Output peek " << endl;
   float *p = (float*)output_data;
   for(int i=0; i<8; i++) cout << "output[" << i << "] = " << p[i] << endl;
   
@@ -202,7 +312,7 @@ int main(int argc, char **argv) {
   delete correlator;
   dsaXEnd();
 
-  std::cout << "Test time = " << (1.0*test_timer.elapsed().count())/(1e6) << " seconds. " << endl;
+  cout << "Test time = " << (1.0*test_timer.elapsed().count())/(1e6) << " seconds. " << endl;
   
   // End dsaX program
   //---------------------------------------
@@ -218,7 +328,7 @@ int main(int argc, char **argv) {
 
   
   if (!binaryFileVector.loadFromFile(test_filename.c_str())) {
-    std::cout << "Failed to read the file." << std::endl;
+    cout << "Failed to read the file." << endl;
     return 0;
   }
   
@@ -237,14 +347,14 @@ int main(int argc, char **argv) {
   for (int i=0; i<8; i++) inspectPackedData(input_data[i], i);  
 
   // Peek at output data (delete after development is complete)
-  for (int i=0; i<NBASE*NCHAN_PER_PACKET*2*2; i++) if(output_data[i] != 0) std::cout << "output " << i << " = " << output_data[i] << std::endl;
-  //for (int i=0; i<8; i++) std::cout << "output " << i << " = " << output_data[i] << std::endl; 
+  for (int i=0; i<NBASE*NCHAN_PER_PACKET*2*2; i++) if(output_data[i] != 0) cout << "output " << i << " = " << output_data[i] << endl;
+  //for (int i=0; i<8; i++) cout << "output " << i << " = " << output_data[i] << endl; 
 
   if (!binaryFileVector.saveToFile("output.dat")) {
-    std::cout << "Failed to write a file." << std::endl;
+    cout << "Failed to write a file." << endl;
     return 0;
   } else {
-    std::cout << "Successfully wrote file." << std::endl;
+    cout << "Successfully wrote file." << endl;
   }
   
   
From c422a86887e4a87aafa52d7a4a2a2d6358565e9d Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Thu, 4 Jul 2024 21:46:43 -0700
Subject: [PATCH 29/30] Add untracked files

---
 include/dsaX.h                |   3 +-
 include/timer.h               |  85 +++++++++++++
 src/blas_interface.cpp        |  28 +++++
 src/dsaX_base.cpp             |   9 ++
 tests/utils.cpp               |  89 ++++++++++++++
 tests/utils.h                 |   5 +
 tests/utils/.gitignore        |   2 -
 tests/utils/CMakeLists.txt    |  11 --
 tests/utils/CMakeLists.txt~   |  22 ----
 tests/utils/gen_packet.py     | 216 ----------------------------------
 tests/utils/get_rms.py        | 141 ----------------------
 tests/utils/get_rms_packet.py |  36 ------
 tests/utils/packet.out        | Bin 4608 -> 0 bytes
 tests/utils/sockets.py        |  31 -----
 tests/utils/test.out          | Bin 196608 -> 0 bytes
 15 files changed, 218 insertions(+), 460 deletions(-)
 create mode 100644 include/timer.h
 create mode 100644 src/blas_interface.cpp
 create mode 100644 src/dsaX_base.cpp
 create mode 100644 tests/utils.cpp
 create mode 100644 tests/utils.h
 delete mode 100644 tests/utils/.gitignore
 delete mode 100644 tests/utils/CMakeLists.txt
 delete mode 100644 tests/utils/CMakeLists.txt~
 delete mode 100644 tests/utils/gen_packet.py
 delete mode 100644 tests/utils/get_rms.py
 delete mode 100644 tests/utils/get_rms_packet.py
 delete mode 100644 tests/utils/packet.out
 delete mode 100644 tests/utils/sockets.py
 delete mode 100644 tests/utils/test.out

diff --git a/include/dsaX.h b/include/dsaX.h
index 8aff8c5..ff2772c 100644
--- a/include/dsaX.h
+++ b/include/dsaX.h
@@ -18,9 +18,10 @@
 #define sep 1.0 // arcmin
 
 void dsaXInit(int device_ordinal = -1);
+
 void dsaXEnd();
 
-//void dsaX
+
 
 void *dsaXHostRegister(size_t size);
 
diff --git a/include/timer.h b/include/timer.h
new file mode 100644
index 0000000..6607d5d
--- /dev/null
+++ b/include/timer.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2023 by Mark Melton
+//
+
+#pragma once
+#include <atomic>
+#include <chrono>
+
+namespace timer {
+
+  template <typename T>
+  inline void doNotOptimizeAway(const T& val) {
+    asm volatile("" : : "r,m"(val) : "memory");
+  }
+
+#ifdef __clang__
+  template<class T>
+  inline void doNotOptimizeAway(T& value) {
+    asm volatile("" : "+r,m"(value) : : "memory");
+  }
+#else
+  template<class T>
+  inline void doNotOptimizeAway(T& value) {
+    asm volatile("" : "+m,r"(value) : : "memory");
+  }
+#endif
+
+  inline void doNotReorderBarrier() {
+    std::atomic_signal_fence(std::memory_order_acq_rel);
+  }
+
+  /// The Timer class template implements a timer designed for minimal
+  /// overhead, ad-hoc timing of code regions including micro-timing
+  /// down to single machine instructions.
+  template<class Duration = std::chrono::nanoseconds,
+	   class Clock = std::chrono::high_resolution_clock>
+  class Timer {
+  public:
+    using TimePoint = typename Clock::time_point;
+
+    /// Run the supplied `code` in a loop `n` times.
+    template<class Code>
+    Timer& run(size_t n, Code&& code) {
+      start();
+      for (auto i = 0ul; i < n; ++i) {
+	code();
+      }
+      stop(n);
+      return *this;
+    }
+
+    /// Start the timer.
+    void start() {
+      start_ = Clock::now();
+    }
+
+    /// Stop the timer indicating `n` operations.
+    auto stop(size_t n = 1) {
+      auto end = Clock::now();
+      iterations_ += n;
+      elapsed_ += std::chrono::duration_cast<Duration>(end - start_);
+      return elapsed_;
+    }
+
+    /// Return the average elapsed time per operation, in `Duration` ticks.
+    auto elapsed_per_iteration() const {
+      return iterations_ > 0 ? (double)elapsed_.count() / iterations_ : 0.0;
+    }
+
+    /// Return the elapsed duration.
+    auto elapsed() const {
+      return elapsed_;
+    }
+
+    /// Return the iterations.
+    auto iterations() const {
+      return iterations_;
+    }
+    
+  private:
+    TimePoint start_{};
+    Duration elapsed_{};
+    size_t iterations_{};
+  };
+
+}; // timer
diff --git a/src/blas_interface.cpp b/src/blas_interface.cpp
new file mode 100644
index 0000000..ed76f05
--- /dev/null
+++ b/src/blas_interface.cpp
@@ -0,0 +1,28 @@
+#include <iostream>
+
+#include "dsaX.h"
+#include "cublas_interface.h"
+#include "magma_interface.h"
+// Dispatch the strided batched GEMM to the backend selected by param.blas_lib (only cuBLAS is wired up).
+void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param, int stream) {
+  switch (param.blas_lib) {
+  case DSA_BLAS_LIB_CUBLAS:
+    dsaXHgemmStridedBatchedCuda(real_a, imag_a, real_b, imag_b, real_c, imag_c, param, stream);
+    break;
+  case DSA_BLAS_LIB_MAGMA: // backend not wired up yet: this call is a no-op
+    //dsaXHgemmStridedBatchedMagma(real_a, imag_a, real_b, imag_b, real_c, imag_c, param, stream);
+    break;
+  case DSA_BLAS_LIB_CUTLASS: // backend not wired up yet: this call is a no-op
+    //dsaXHgemmStridedBatchedCutlass(real_a, imag_a, real_b, imag_b, real_c, imag_c, param);
+    break;
+  case DSA_BLAS_LIB_OPENBLAS: // backend not wired up yet: this call is a no-op
+    //dsaXHgemmStridedBatchedOpenblas(real_a, imag_a, real_b, imag_b, real_c, imag_c, param);
+    break;
+  case DSA_BLAS_LIB_TCC: // backend not wired up yet: this call is a no-op
+    //dsaXHgemmStridedBatchedTcc(real_a, imag_a, real_b, imag_b, real_c, imag_c, param);
+    break;
+  default: // unknown backend: report on stderr and exit with a failure status (was exit(0))
+    std::cerr << "dsaX Error: Unknown blas_lib " << param.blas_lib << " given." << std::endl;
+    exit(1);
+  }
+}
diff --git a/src/dsaX_base.cpp b/src/dsaX_base.cpp
new file mode 100644
index 0000000..80a947a
--- /dev/null
+++ b/src/dsaX_base.cpp
@@ -0,0 +1,9 @@
+#include "fast_time_domain.h"
+
+dsaXBase::dsaXBase() {
+  // Empty body: no work is performed at construction.
+}
+
+dsaXBase::~dsaXBase() {
+  // Empty body: no work is performed at destruction.
+}
diff --git a/tests/utils.cpp b/tests/utils.cpp
new file mode 100644
index 0000000..bc10104
--- /dev/null
+++ b/tests/utils.cpp
@@ -0,0 +1,89 @@
+#include "utils.h"
+
+/**
+ * Promote packed 4-bit signed nibbles to interleaved re,im,re,im... values of type prec.
+ *
+ * For idx in [0, rows*cols): output[2*idx] is the sign-extended low nibble of input[2*idx],
+ * and output[2*idx+1] is the sign-extended high nibble of input[2*idx+1].
+ * NOTE(review): the inline comments describe one iiiirrrr byte per element, but two input
+ * bytes are consumed per element -- confirm the intended input layout with the caller.
+ * @param[out] output interleaved real/imag array, 2*rows*cols elements of type prec
+ * @param[in]  input  packed (4b,4b) char array; bytes 0 .. 2*rows*cols-1 are read
+ * @param[in]  rows   number of rows
+ * @param[in]  cols   number of cols
+ */
+template <typename prec> void promoteComplexCharToFloat(prec *output, const char *input, const int rows, const int cols) {
+#pragma omp parallel for collapse(2)
+  for(int i=0; i<cols; i++) {
+    for(int j=0; j<rows; j++) {
+      int idx = i * rows + j;
+      // 15 in unsigned char binary is 00001111. Perform bitwise & on 15 and input char data iiiirrrr
+      // to get real part 4 bit data.
+      // 0000rrrr
+      // Bit shift this result by 4 to the left.
+      // rrrr0000
+      // Cast to signed char (explicitly signed: plain char is unsigned on some platforms).
+      // +-rrr0000
+      // Right shift by 4 bits; on the signed value this sign-extends the nibble
+      // +-0000rrr
+      // Convert the result to prec
+      output[2*idx] = (prec)((signed char)((   (unsigned char)(input[2*idx]) & (unsigned char)(15)  ) << 4) >> 4);
+      
+      // 240 in unsigned char binary is 11110000. Perform bitwise & on 240 and input char data iiiirrrr
+      // to get imag part 4 bit data
+      // iiii0000.
+      // Cast to signed char (explicitly signed, as above)
+      // +-iii0000
+      // Right shift by 4 bits; on the signed value this sign-extends the nibble
+      // +-0000iii
+      // Convert the result to prec
+      output[2*idx+1] = (prec)((signed char)((   (unsigned char)(input[2*idx+1]) & (unsigned char)(240)  )) >> 4);
+    }
+  }
+}
+
+// Host reference for C = A^dagger * B on ROW ordered, interleaved (re, im) data.
+// A is k x m, B is k x n and C is m x n: C[i][j] = sum_l conj(A[l][i]) * B[l][j].
+template <typename prec> void host_MdagM_gemm(const prec *A, const prec *B, prec *C, const int m, const int n, const int k) {
+
+#pragma omp parallel for collapse(2)
+  for(int i=0; i<m; i++) {
+    for(int j=0; j<n; j++) {
+      // Get C index and clear the accumulator
+      int C_idx = i * n + j;
+      C[2*C_idx]   = 0.0;
+      C[2*C_idx+1] = 0.0;
+      for(int l=0; l<k; l++) {
+	// Element (l,i) of A (row stride m) and element (l,j) of B (row stride n)
+	int A_idx = l * m + i;
+	int B_idx = l * n + j;
+
+	// Accumulate conj(A[l][i]) * B[l][j]
+	C[2*C_idx]   += A[2*A_idx] * B[2*B_idx] + A[2*A_idx+1] * B[2*B_idx+1];
+	C[2*C_idx+1] += A[2*A_idx] * B[2*B_idx+1] - A[2*A_idx+1] * B[2*B_idx];
+      }
+    }
+  }
+}
+
+// Measure the deviation of a square (m == n) ROW ordered, interleaved (re, im) matrix C from
+// Hermitian form. Returns sum_ij |C[j][i] - conj(C[i][j])|^2, i.e. the SQUARED Frobenius norm
+// of (C - C^dagger); the result is exactly zero for a Hermitian matrix.
+template <typename prec> prec test_hermiticity(const prec *C, const int m, const int n) {
+
+  prec frob_norm = 0.0;
+
+#pragma omp parallel for collapse(2) reduction (+:frob_norm)
+  for(int i=0; i<m; i++) {
+    for(int j=0; j<n; j++) {
+      // C_idx addresses element (j,i) and Cd_idx element (i,j) when m == n
+      int C_idx  = i + n * j;
+      int Cd_idx = j + m * i;
+      // Real parts must match; imaginary parts must be equal and opposite (conjugation)
+      const double diff_re = C[2*C_idx] - C[2*Cd_idx];
+      const double diff_im = C[2*C_idx+1] + C[2*Cd_idx+1];
+      frob_norm = frob_norm + (diff_re * diff_re + diff_im * diff_im);
+    }
+  }
+  return frob_norm;
+}
diff --git a/tests/utils.h b/tests/utils.h
new file mode 100644
index 0000000..354e196
--- /dev/null
+++ b/tests/utils.h
@@ -0,0 +1,5 @@
+#pragma once
+
+template <typename prec> void promoteComplexCharToFloat(prec *output, const char *input, const int rows, const int cols);
+template <typename prec> void host_MdagM_gemm(const prec *A, const prec *B, prec *C, const int m, const int n, const int k);
+template <typename prec> prec test_hermiticity(const prec *C, const int m, const int n);
diff --git a/tests/utils/.gitignore b/tests/utils/.gitignore
deleted file mode 100644
index dafcc02..0000000
--- a/tests/utils/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-antennas.out
-gen_antennas.py
diff --git a/tests/utils/CMakeLists.txt b/tests/utils/CMakeLists.txt
deleted file mode 100644
index 226c9de..0000000
--- a/tests/utils/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-# install step for utils
-#------------------------------
-set(DSA_XENGINE_UTILS
-  # cmake-format: sortable
-  gen_packet.py
-  get_rms_packet.py
-  get_rms.py
-  sockets.py
-  )
-install(FILES ${DSA_XENGINE_UTILS} DESTINATION utils)
-#------------------------------
diff --git a/tests/utils/CMakeLists.txt~ b/tests/utils/CMakeLists.txt~
deleted file mode 100644
index ab053c5..0000000
--- a/tests/utils/CMakeLists.txt~
+++ /dev/null
@@ -1,22 +0,0 @@
-# install step for utils
-#------------------------------
-set(DSA_XENGINE_UTILS
-  # cmake-format: sortable
-/home/dmhowart/DSA110/dsa110-xengine/src/dsaX_bfCorr.cu  dsaX_capture.h
-  dsaX_capture_manythread.h
-  dsaX_capture_pcap.h
-  dsaX_def.h
-  dsaX_cutlass_interface.h
-  )
-install(FILES ${DSA_XENGINE_HEADERS} DESTINATION include)
-#------------------------------
-
-# install step for executables
-#-----------------------------
-install(TARGETS
-  # cmake-format: sortable
-  dsaX_bfCorr
-  RUNTIME DESTINATION
-  bin
-  )
-#-----------------------------
diff --git a/tests/utils/gen_packet.py b/tests/utils/gen_packet.py
deleted file mode 100644
index 2ae1bee..0000000
--- a/tests/utils/gen_packet.py
+++ /dev/null
@@ -1,216 +0,0 @@
-import numpy as np, struct
-import matplotlib.pyplot as plt
-
-
-''' The aim here is to make two types of data packets: 
- - one with a tone at a particular frequency and set of antennas
- - one with pure noise 
-
-Structure is 3 ant, 384 chan, 2 time, 2 pol, r/i
-4608 bytes long
-
-'''
-
-
-def make_spectrum(packet,ant=0,pol=0):
-
-    spec = np.zeros(384*2)
-    
-    d = np.asarray(struct.unpack('>4608B',packet))
-
-    # order is 3 antennas x 384 channels x 2 times x 2 pols x real/imag, with every 8 flipped
-    d = (d.reshape((3,384,2,2)))[ant,:,:,pol].ravel()
-
-    d_r = ((d & 15) << 4)
-    d_i = d & 240
-    d_r = d_r.astype(np.int8)/16
-    d_i = d_i.astype(np.int8)/16     
-        
-    spec += d_r**2.+d_i**2.
-    spec = spec.reshape((384,2)).mean(axis=1)
-    return(spec)
-
-def plot_spectrum(data,ant=0,pol=0):
-
-    spec = make_spectrum(data,ant=ant,pol=pol)
-    plt.plot(spec)
-    plt.xlabel('Channel')
-    plt.ylabel('Power')
-    plt.show()
-
-def make_histogram(packet):
-    ''' Makes histogram of packet - tested 
-    '''
-    
-    histo = np.zeros(16)
-    rms = 0.
-                
-    d = np.asarray(struct.unpack('>4608B',packet))
-    
-    # order is 3 antennas x 384 channels x 2 times x 2 pols x real/imag, with every 8 flipped
-    d = (d.reshape((3,384,2,2))).ravel()
-    
-    d_r = ((d & 15) << 4)
-    d_i = d & 240
-    d_r = d_r.astype(np.int8)/16
-    d_i = d_i.astype(np.int8)/16        
-    
-    rms += 0.5*(np.std(d_r)**2.+np.std(d_i)**2.)
-
-    hx = np.arange(16)-8
-    
-    for i in range(384*2):
-        
-        histo[int(d_r[i])+8] += 1.
-        histo[int(d_i[i])+8] += 1.
-            
-    return(hx,histo/np.max(histo),np.sqrt(rms))
-
-def histo_test(data):
-
-    hx,histo,rms = make_histogram(data)
-    print('HISTOGRAM: ')
-    for i in range(16):
-        print(hx[i],histo[i])
-    print()
-    print('RMS = ',rms)
-    print()
-
-
-########## MAIN ############
-
-# defaults
-outfile = 'packet.out'
-n_packet = 4608 # 4608 for single packet
-
-# decide which sort of packet to make
-noise = True
-tone = False
-x16 = False
-
-# if tone
-if tone is True:
-
-    # defaults:
-    chans = np.arange(384)#np.asarray([10,100,190])
-    #ant = 1
-    amp_A = 9.0
-    amp_B = 4.
-
-    # derived quantities
-    amp_A = 16.*np.sqrt(amp_A)
-    amp_B = 16.*np.sqrt(amp_B)
-    ph = 2.*np.pi*np.random.uniform()
-    ramp_A = amp_A*np.cos(ph)
-    iamp_A = amp_A*np.sin(ph)
-    ph = 2.*np.pi*np.random.uniform()
-    ramp_B = amp_B*np.cos(ph)
-    iamp_B = amp_B*np.sin(ph)
-    
-    # make packet
-    real_part = np.zeros(n_packet,dtype='int8')
-    imag_part = np.zeros(n_packet,dtype='int8')
-    for ant in [0,1,2]:
-        for i in chans:
-
-            # time 1 pol A
-            j = int(1536*ant + i*4)
-            real_part[j] = round(ramp_A)
-            imag_part[j] = round(iamp_A)
-            
-            # time 1 pol B
-            j = int(1536*ant + i*4 + 1)
-            real_part[j] = round(ramp_B)
-            imag_part[j] = round(iamp_B)
-            
-            # time 2 pol A
-            j = int(1536*ant + i*4 + 2)
-            real_part[j] = round(ramp_A)
-            imag_part[j] = round(iamp_A)
-
-            # time 2 pol B
-            j = int(1536*ant + i*4 + 3)
-            real_part[j] = round(ramp_B)
-            imag_part[j] = round(iamp_B)
-
-        
-    # make 4-bit versions
-    real_part = np.cast['uint8'](real_part)
-    imag_part = np.cast['uint8'](imag_part)
-    for i in range(n_packet):
-        real_part[i]  = real_part[i] >> 4
-        imag_part[i]  = (imag_part[i] >> 4) << 4
-
-    # finish packet
-    packet = np.zeros(n_packet,dtype='uint8')
-    for i in range(n_packet):
-        packet[i] = real_part[i] | imag_part[i]
-
-    # if x16
-    if (x16):
-
-        p2 = np.zeros(21*n_packet,dtype='uint8')
-        for i in range(21):
-            p2[i*n_packet:(i+1)*n_packet] = packet
-    
-        out_str = p2.tobytes()
-
-    else:
-
-        out_str = packet.tobytes()
-    
-# if noise
-if noise is True:
-
-    # defaults
-    rms = 1.5 # 4-bit
-    erms = rms*16.
-
-    # make real and imag parts
-    real_part = np.zeros(n_packet,dtype='int8')
-    imag_part = np.zeros(n_packet,dtype='int8')
-
-    for ant in [0, 1, 2]:
-        for i in np.arange(384):
-
-            # time 1 pol A
-            j = int(1536*ant + i*4)
-            real_part[j] = round(np.random.normal()*erms)
-            imag_part[j] = round(np.random.normal()*erms)
-            
-            # time 1 pol B
-            j = int(1536*ant + i*4 + 1)
-            real_part[j] = round(np.random.normal()*erms)
-            imag_part[j] = round(np.random.normal()*erms)
-            
-            # time 2 pol A
-            j = int(1536*ant + i*4 + 2)
-            real_part[j] = round(np.random.normal()*erms)
-            imag_part[j] = round(np.random.normal()*erms)
-
-            # time 2 pol B
-            j = int(1536*ant + i*4 + 3)
-            real_part[j] = round(np.random.normal()*erms)
-            imag_part[j] = round(np.random.normal()*erms)
-
-    # make 4-bit versions
-    real_part = np.cast['uint8'](real_part)
-    imag_part = np.cast['uint8'](imag_part)
-    for i in range(n_packet):
-        real_part[i]  = real_part[i] >> 4
-        imag_part[i]  = (imag_part[i] >> 4) << 4
-
-    # finish packet
-    packet = np.zeros(n_packet,dtype='uint8')
-    for i in range(n_packet):
-        packet[i] = real_part[i] | imag_part[i]
-
-    out_str = packet.tobytes()
-
-
-newFile = open(outfile, "wb")
-newFile.write(out_str)
-newFile.close()
-
-    
-#plot_spectrum(out_str,pol=1,ant=1)
diff --git a/tests/utils/get_rms.py b/tests/utils/get_rms.py
deleted file mode 100644
index 8854a36..0000000
--- a/tests/utils/get_rms.py
+++ /dev/null
@@ -1,141 +0,0 @@
-import numpy as np
-import sockets as s
-import struct
-import sys
-import matplotlib.pyplot as plt
-
-# for file writing
-
-def write_bin(data,fl='test.dat'):
-
-        f = open(fl,'w+b')
-        for packet in data:
-                d = bytearray(np.asarray(struct.unpack('>4616B',packet))[8:].astype(np.int8))
-                print(len(d))
-                f.write(d)
-
-        f.close()
-        
-
-# for making histogram of input
-
-def make_histogram(data,ant=0,pol=0):
-
-        histo = np.zeros(16)
-        rms = 0.
-        
-        for packet in data:
-                
-                d = np.asarray(struct.unpack('>4616B',packet))[8:]
-                
-                # order is 3 antennas x 384 channels x 2 times x 2 pols x real/imag, with every 8 flipped
-                d = (d.reshape((3,384,2,2)))[ant,:,:,pol].ravel()
-                
-                d_r = ((d & 15) << 4)
-                d_i = d & 240
-                d_r = d_r.astype(np.int8)/16
-                d_i = d_i.astype(np.int8)/16        
-
-                rms += 0.5*(np.std(d_r)**2.+np.std(d_i)**2.)
-                
-                for i in range(384*2):
-
-                        histo[int(d_r[i])+8] += 1.
-                        histo[int(d_i[i])+8] += 1.
-                                                                        
-        return histo/np.max(histo),np.sqrt(rms)
-        
-# for making spectrum from data
-def decode_data(data,ant=0,pol=0):
-
-    spec = np.zeros(384*2)
-    
-    for packet in data:
-
-        d = np.asarray(struct.unpack('>4616B',packet))[8:]
-
-        # order is 3 antennas x 384 channels x 2 times x 2 pols x real/imag, with every 8 flipped
-        d = (d.reshape((3,384,2,2)))[ant,:,:,pol].ravel()
-
-        d_r = ((d & 15) << 4)
-        d_i = d & 240
-        d_r = d_r.astype(np.int8)/16
-        d_i = d_i.astype(np.int8)/16     
-        
-        spec += d_r**2.+d_i**2.
-
-    spec = spec.reshape((384,2)).mean(axis=1)
-    return(spec)
-
-# for decoding packets
-def decode_header(data):
-
-    min_s = 10000
-    max_s = 0
-        
-    for packet in data:
-
-        d = np.asarray(struct.unpack('>4616B',packet))
-
-        # packet id
-        p = 0
-        p = p | ((d[4] & 224) >> 5)
-        p = p | (d[3] << 3)
-        p = p | (d[2] << 11)
-        p = p | (d[1] << 19)
-        p = p | (d[0] << 27)
-        
-        # spectrum id
-        sp = 0
-        sp = sp | ((d[4] & 31) << 8)
-        sp = sp | d[5]
-
-        if (sp<min_s):
-                min_s = sp
-        if (sp>max_s):
-                max_s = sp
-    
-        print(p,sp)
-
-    print(min_s,max_s)
-
-# MAIN
-
-n = 10000
-ip = '10.41.0.62'
-port=4011
-data = s.capture(ip=ip,port=port,n=n)
-ant=0
-pol=0
-
-#decode_header(data)
-
-histo,rms = make_histogram(data,ant=ant,pol=pol)
-print()
-print('RMS:',rms/np.sqrt(1.*n))
-for i in np.arange(16):
-    print(histo[i],'  ',)
-
-sys.exit()
-    
-spec = decode_data(data,ant=ant,pol=pol)
-spec = np.sqrt(spec/n/2.)
-print()
-print('Have spectral points',len(spec))
-print()
-#for i in np.arange(len(spec)):
-#    print(spec[i],'  ',)
-
-plt.plot(spec)
-plt.show()
-
-
-
-
-
-    
-
-
-    
-
-    
diff --git a/tests/utils/get_rms_packet.py b/tests/utils/get_rms_packet.py
deleted file mode 100644
index f75d278..0000000
--- a/tests/utils/get_rms_packet.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import socket, numpy as np
-from progress.bar import Bar
-import sockets as s
-import struct
-import sys
-import matplotlib.pyplot as plt
-
-# ip as string, port as int, buf as int
-def capture(n=100,ip=None,port=None,buf=4616):
-
-        if ip is None:
-            print('No IP')
-            return()
-
-        if port is None:
-            print('No port')
-            return()
-
-        sock = socket.socket(socket.AF_INET,socket.SOCK_DGRAM)
-        sock.bind((ip,port))
-
-        captured=0
-        packs = []
-        bar = Bar('Capturing '+str(n)+' packets...', max=n)
-        while captured<n:
-            
-            data, addr = sock.recvfrom(buf)
-            packs.append(data)
-            captured += 1
-            bar.next()
-            
-        bar.finish()
-                                                                                                            
-    return(packs)
-
-
diff --git a/tests/utils/packet.out b/tests/utils/packet.out
deleted file mode 100644
index 34e6909992a277b32cd475dc5c7f9f04da910749..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4608
zcmW+)&v)E5k}go*Q$+me%x&EO*`C{4r0iT1wY}%v&3pTodiT8ZA~|PEWarH(RuiCd
z8#hp3=VlSL<vDdfu?ZAVUwuCWE*0v0@BHjc!5nlz*{lKasl@<AnX%mn=g?X=<Tw~T
z#YG9+I+@!fbx`F0rzk6<o+OnbN^+yS=`}=QzAo#KL-N7_<heT0q3Ysf5A$4fHp={o
z=TSK=C@TIuK{1PV(WCWyo!ylLqv7$o1G$SNh6)A}u!SB|?t>e4S(FWz+KX5=k0#xq
zA)|sjI#5k2{iJ=yq++YCR0dKIVIz(?m~kZogI6z&!a&zbZ_VX!pkwSO$i?_`U7D?Q
zO6%YMTb-6B32}ihoNJcxl&B%ce@x}Pn3oTga+5A|*b1zJnj55|on7qw&Rlh;Q+HJR
zojoghQ;sU#$lqxTlvKGwe5ocJ$&$6|KKn7<YdYEocjHc1r8(B#xeF0zKG5XESEYpC
zxVPh5EdnV2QaMNiV!?}PtaGvADD%IAD|lVVr?=-J8ei8cm)^{satahGebjYQ|N9vf
z2%`H<wynxl;c7IdO(#tOaYC<~Cll8rWV?8hUhVnXrp(V~-!I&%gC@1Y)CMd3w5W*O
zN$HY8@nY2_32=2SNKM@yh0aa>C2Ev|MHx<4V3Hm{h)y?Kp&FueZL=#d6AId+dFrQ8
zqqFDwx?}{*DktmHlwLSX2+;Uv#8DNQJSg+ppHWyp5~0FbpJ-B|PZ`6Sq6S4!G;e&;
z%I6@`3L@Z9Ku)5xd#t9$8d7nDfc>{`Wn;lZpMP{kOx+Rg^2#-<E$V%pI)k-t;xOs1
z5anKBKR7Y{%-_9PHt5y{K#H8StlPmiH$GkYpn*V-h{)rT`RcrmNxc3MljNS%{jza?
zcq2%2^R!Csq_kM|m!pNSb_FNl)Y|JHms8ee?RP@$wWkk#u9{>?B9ULP@JaQrUixR@
zm*7p~f58eQ!P$b$i0<3AVzae6Lwnb(@msxf8IRD2h1yf!M{7jvbKxdZc<)l`BuW*g
zbI!YGo$jxRmdi|z4E@nK6&@2}1PD#HT>u$bM7p?pWeWdBk3s-hFizHtv#;2M29B3^
zdLumMp-qjmsfeB7H=EQ;{MxzpUq|rzWS#JQxu9!whL@BY89ELG(&uVN>)UU<kT=AW
zixK6clzhf3^=FACf?rqXmI5eAYYegoZGaj{Ls;y?Jls&u@3bMFSKCPS7WK~<{{@Dn
z7?izr2wJYK!qPZwzW)v+oaiLAC{C<KrIT5*M1LtQXolWqFRTjU7L*Vh)pv=ha8ZkS
zV7km)aK?>?_sXH~C6B5@Wmki{`lB=a#w}MMYrezVvywXohr0YSlqqEe^+u@ZYMwlr
zrLa>0TXH_5Q-bmQR1Mw`(;pb+o@^BrxaAsMr+ej*0IT0I=E3kv5}`Q4RfCIm^rR=F
zm9;x7Rgz!_9%$%%hp<kveyQP&>eF?dd6sF6@?*l4<T!t$|I=rvqpFxdmc$?_!(?My
zRhO{s6b>**itB0%sN3O<7lY&tzQ}WBRqmN{YVYqKsJysaXubTB_Q4v7m^U&+8pNV$
zP^6s6Nl|Mpx5tZG4hrPa9(W9~;VFhqes6H9;<AD+eDec7SHGA+@lgzqzACEql~9cK
zpZt+&yHYdRHP_C#c^W`33XIH)(Ym?))f`Z7@{$u|NXpze8B1kxJ#N(@AD@z_7^}ir
zW=#iH>M!AK7NRNFg<DgA`c(2xiDD&Y?TBROL?xW0jt|Uxwc)WkiTgK(68SIkLXVQM
zx6KjCIzRmvKI5uRFdh9zNZBCiel0fLdRJUFzGCJ~lC@{mS}#r?$9E_frCD>GEzbBN
zJqa<(e#qauBg>pf*zqtKVRzIjn_N-21+KqiP7=5MO<?3=a2~K;^&&2gXt0>3x#{=A
zBFmyHz$CPaN;S66>70Al$)AoU)R;#JmHs<Vz+s_AGuvcmEG;;t%4VcM<3Czis8?y+
zxYH_P?vIU!l+L;rv~~<pATRmF8oJdsNm|=VAH<0lN`6ShO%R_YIUXX}xKi#I1oe^i
zKy6*WGJ__j^5dBF)xo=@A(Q6=3ud^79OU(+<iZplo0?jYb6xJt_Iwr#L{`V3PE0_v
zSeR&(4!KJ^W_q05a+=(&*8}t1MT~uvx-9>ui($&nILH$?buqVhIe=FurX=I93-)gv
zmZty*?!M^l5q8c2@uf_>mGQGa9<+GLcag^dPi~uf)xQtU5t#b-TsEj-5a;aw3F^9Y
zN*V!;-b*N?RGD%_BjM;+*LfzAR1e`2Gl@ynu0h2p^5w;<G$WXvbH~c%-V1d#sS;9c
z7C&TOrIZ$IJ&dmgc4C*JRWAm?lRGtYC(kz2(lO1SXcId|q-GQ3G&_nfMYpW0MH<TT
zLt0LHSGFs8*t79ickVN)4J%2$OZZS>m7pXD44loY(_cxOz|RU^bv=!=1iR1G`V6vm
z2~t@iZKr%yms;WZx^&)@yAa6sl40)fL({ri$KqaoQb%OTqPQ3qu;--h-!g-wwM5nK
z2dr{Y`?|4^$eFEFzT55DbdsxlFb+Fx2RQ6;d6y0uTe;22_ZZBo>dLX?PYy?8wNK}u
z@8z|Zte$RmzGIRn+A*W5)rywggL{pvumhVI5spX^x`C$@_>eRAB?clUc)bv}^Z0qd
zXtZp*CKg?g<x0G4)Xz<}$+S2>U*~&@PNRXozbi2CFjT+dhn_hDhGF=d{7$pHZPXAb
zZ0pNh>m{IaO~O1$o_a+Ue-w6eIWtL8UJF~S_AUV6IfhYi$M3bPWH7N8Exfr?pNM}~
z|HJFB%Ds3s>$tCZ4Ytx|krLBMQ2O=WxZ_kRK|A4u{*8QHa23ftgYFBv#r(8nIL++s
zdo5gT$4_ryKhhjce)8-Zfsf5H-z^Nr9~rMM=TWnfq)R;E1e?RH=u})hm?5$KTaV^h
z3+Ctz-!%bvHeB_Z|If*S#S<?!vTZJUyGVY>Ruc6k3@r5xC8peTmD2N_eYhrWVwRG{
z=06&j)ui)8%mjG>|4QP)QPs9(cvwQtpPbk~eLG5Fl=pJ+ABANjW?_DuWWa_+8(8+g
z=re@+fXkS#@@r)$c8c(ybo)4q^X#1l`SkiP2H*HJa_<DD$I~M(;$P_>w%rE4vrTPE
zP4X()N|>5|#~QdcL$u`YuhUvxkv<Pg2zl?CW9P}AU^BCrv(W|>Jx49WRmzQ`om9;e
zq#w5%v9rJmUT1#*p6PR8<ow25AxDPK&6$;l$$?p&O%{9Mi8Df<ynH7pa4?X=GROnZ
zN1ZwIh;(K)$=U9USQGNjWs><arfJ`d7ALgC0z$9D2<=!3O~ZjT9(o?EsqO->z%kc#
z!$CuJTr!>%n(KXWo;!hYD=!rsZzr;1eweo#D!w<OK19~@yBi)B7y@G$Ow$%*eP?UW
zLNC4*rpAcxU>R4%p6?^u$mx48w=sACQQxXfW^cwa>~E#w7$H=ORS0+sIwL!cznp4U
z;%2mR<SgpznclCYegnm4nK=hB%DNwdSM!pdP`A1C?^GjLBo$NAV%;){_+Cdwz02J@
zHs6*@bsHH^)DcS$ThG0A=9K)OU_ATU+(gmnYQGdtJ+f?PwxLPn$LEKx?m5FS$#bq_
z&rqd6xGT3%8ICAadt_1*m+w3$qy{ZOKMBJeCK@J#w5)7;n{8nFBz(Q$L<~`G81lJN
zRIWInCG&B|OI-UqccX6Q#=9YiR<Nh>CNT-XU<3|f@bXAQZ#fv<ebbV(utj`Bac@5|
zbz}|<nk^kJu@xM5XPe{~a~erbD!Eg6X<7EBZ;1&wsK5L^FX&@nug}!wLE@(Kj)eMN
z2nE}FX4P~RZY0ppT&9q)=A*b{)NIv7!$7KWF$164Ulb(g^C|}y-Ggvi>eu?@D6O%&
zyk4qS%+`mMg4J&I^(Zu3Lw2AKTu7%w%#Nv%DM$*1-n}3716if`Uj*G((9nFEf-ZEp
zDPo|V#tT76e-x7S1NGWn8@}WFfn8$D^swxC+^VX_V(HzhlG>YJ7=y?%Ahh@KEN8aU
zP2@D@FVrfzrmWa*<=ThDE&$SBsU6v!{nMDS*1;0x8<7&4M?kiW4(|1Q(WpL5D!?WC
zsW7Xa^LP#I27a>2n21dIARbD1PRu3=qjYtL+_9zEuy<v9<tjTKm}hz&=@D{1vfq_^
z&*4J!S$$wndJhZtDrdCp_c_w})Nlx{IWF*PwFGqj1~h)sH_4V+>$qaext-6DQS6JP
Q(8{YQee{VQIaCb)58tM{bpQYW

diff --git a/tests/utils/sockets.py b/tests/utils/sockets.py
deleted file mode 100644
index aaff3f7..0000000
--- a/tests/utils/sockets.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import socket, numpy as np
-
-# ip as string, port as int, buf as int
-def capture(n=100,ip=None,port=None,buf=4616):
-
-    if ip is None:
-        print('No IP')
-        return()
-
-    if port is None:
-        print('No port')
-        return()
-
-    sock = socket.socket(socket.AF_INET,socket.SOCK_DGRAM)
-    sock.bind((ip,port))
-
-    captured=0
-    packs = []
-    while captured<n:
-
-        data, addr = sock.recvfrom(buf)
-        packs.append(data)
-        captured += 1
-
-
-    return(packs)
-
-
-
-        
-        
diff --git a/tests/utils/test.out b/tests/utils/test.out
deleted file mode 100644
index d684e8882f6838f7d058a68b2a13df3f6c9dc6bf..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 196608
zcmeIvu?+w*2n0c7`tSMW1W|(ZH;yj!4pklj0t5&UAV7cs0RjXF5FkK+009Dj7U(*Y
zm$q+sTJ!Fm!y5z$5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+
z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly
zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF
z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB=Dq
z1-j1sE^XiNwC3GAhc^fiAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U
zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7
z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N
z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+
z009CI3v`|NUE03kY0bNL4sQ@3K!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk
z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs
z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ
zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U
zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7
z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N
z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+
z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly
lK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB=DDfe*9F0(Sra


From 6b1600e4ef3d94d6816d301f46231923356170e6 Mon Sep 17 00:00:00 2001
From: cpviolator <dmhowarth@gmail.com>
Date: Fri, 12 Jul 2024 06:35:38 -0700
Subject: [PATCH 30/30] mid merge

---
 include/CMakeLists.txt    |   2 +
 include/cuda_interface.h  |   2 +-
 include/dsaX.h            |  40 ++-
 include/dsaX_api.h        |  36 +++
 include/dsaX_def.h        |   7 +
 include/dsaX_malloc.h     | 113 +++++++
 include/dsaX_ptr.h        | 102 ++++++
 include/enums.h           |  11 +
 src/CMakeLists.txt        |   5 +-
 src/correlator.cpp        |   3 +-
 src/cuda_handles.cu       |   8 +-
 src/cuda_interface.cu     |  89 ++++--
 src/dsaX_api.cu           |  43 +++
 src/dsaX_ptr.cpp          | 155 ++++++++++
 src/interface.cpp         |   3 +
 src/malloc.cu             | 631 ++++++++++++++++++++++++++++++++++++++
 tests/correlator_test.cpp |  10 +-
 17 files changed, 1220 insertions(+), 40 deletions(-)
 create mode 100644 include/dsaX_api.h
 create mode 100644 include/dsaX_malloc.h
 create mode 100644 include/dsaX_ptr.h
 create mode 100644 src/dsaX_api.cu
 create mode 100644 src/dsaX_ptr.cpp
 create mode 100644 src/malloc.cu

diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
index 65ddb04..9a7cbbd 100644
--- a/include/CMakeLists.txt
+++ b/include/CMakeLists.txt
@@ -6,6 +6,8 @@ set(DSA_XENGINE_HEADERS
   # cmake-format: sortable
   dsaX.h
   dsaX_def.h
+  dsaX_malloc.h
+  dsaX_ptr.h
   fast_time_domain.h  
   cuda_interface.h
   cuda_handles.h
diff --git a/include/cuda_interface.h b/include/cuda_interface.h
index 6ae59e2..42043e2 100644
--- a/include/cuda_interface.h
+++ b/include/cuda_interface.h
@@ -7,7 +7,7 @@
 #include "dsaX.h"
 
 void dsaXInitCuda(int dev);
-void dsaXDestroyCuda(int dev);
+void dsaXDestroyCuda();
 
 void initBLASCuda();
 void destroyBLASCuda();
diff --git a/include/dsaX.h b/include/dsaX.h
index ff2772c..f370bc0 100644
--- a/include/dsaX.h
+++ b/include/dsaX.h
@@ -11,20 +11,40 @@
 // Uncomment to try new pure cuBLAS
 //#define OLD_BLAS
 
-// required to prevent overflow in corr matrix multiply
-#define halfFac 4
-
-// beam sep
-#define sep 1.0 // arcmin
-
+/**
+ * Initialize the library. This function will initialise
+ * a device if using CUDA and any BLAS libraries that are
+ * enabled, such as cublas.
+ * @param[in] device_ordinal The GPU device to init
+ */
 void dsaXInit(int device_ordinal = -1);
 
+/**
+ * Finalize the library. This function will finalize
+ * a device if using CUDA and any BLAS libraries that are
+ * enabled, such as cublas. It will also dump any statistics
+ * collected, such as performance metrics.
+ */
 void dsaXEnd();
 
-
-
+/**
+ * This function will allocate pinned device memory of the 
+ * given size in bytes, and return a void pointer to that
+ * memory. The user may delete the memory safely in their
+ * application code.
+ * @param[in] size The byte size of pinned memory to be allocated 
+ *                 by dsaX.
+ */
 void *dsaXHostRegister(size_t size);
 
+/**
+ * This function allows the user to inspect one (4b,4b) char sized
+ * complex element, labelled by index i, on the host. If 'non_zero' is true
+ * then the complex element will print only if either the real
+ * or imaginary element is non-zero. Useful for checking if
+ * an array is populated.
+ * @param[in] input    The packed (4b,4b) char element to inspect (passed by value)
+ * @param[in] i        The index of this element in the caller's array
+ * @param[in] non_zero If true, print only elements with non-zero values
+ */
 void inspectPackedData(char input, int i, bool non_zero = false);
-
-void dsaXCorrelator(void *output_data, void *input_data, dsaXCorrParam *param);
diff --git a/include/dsaX_api.h b/include/dsaX_api.h
new file mode 100644
index 0000000..3767600
--- /dev/null
+++ b/include/dsaX_api.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <string>
+
+#include "enums.h"
+
+#define STRINGIFY__(x) #x
+#define __STRINGIFY__(x) STRINGIFY__(x)
+
+/**
+   @brief Wrapper around cudaMemcpy or driver API equivalent
+   @param[out] dst Destination pointer
+   @param[in] src Source pointer
+   @param[in] count Size of transfer
+   @param[in] kind Type of memory copy
+*/
+void dsaXMemcpy_(void *dst, const void *src, size_t count, dsaXMemcpyKind kind, const char *func, const char *file,
+		 const char *line);
+
+/**
+   @brief Wrapper around cudaMemcpyAsync or driver API equivalent
+   @param[out] dst Destination pointer
+   @param[in] src Source pointer
+   @param[in] count Size of transfer
+   @param[in] kind Type of memory copy
+   @param[in] stream Stream to issue copy
+*/
+void dsaXMemcpyAsync_(void *dst, const void *src, size_t count, dsaXMemcpyKind kind, const cudaStream_t &stream,
+		      const char *func, const char *file, const char *line);
+
+
+#define dsaXMemcpy(dst, src, count, kind)                                                                              \
+  ::dsaXMemcpy_(dst, src, count, kind, __func__, file_name(__FILE__), __STRINGIFY__(__LINE__))
+
+#define dsaXMemcpyAsync(dst, src, count, kind, stream)                                                                 \
+  ::dsaXMemcpyAsync_(dst, src, count, kind, stream, __func__, file_name(__FILE__), __STRINGIFY__(__LINE__))
diff --git a/include/dsaX_def.h b/include/dsaX_def.h
index 257f493..5b3af78 100644
--- a/include/dsaX_def.h
+++ b/include/dsaX_def.h
@@ -91,3 +91,10 @@
 #define NBMS 256
 #define P_SIZE 4108
 #define NWAIT 100000
+
+// required to prevent overflow in corr matrix multiply
+#define halfFac 4
+
+// beam sep
+#define sep 1.0 // arcmin
+
diff --git a/include/dsaX_malloc.h b/include/dsaX_malloc.h
new file mode 100644
index 0000000..04d24b0
--- /dev/null
+++ b/include/dsaX_malloc.h
@@ -0,0 +1,113 @@
+#pragma once
+
+#include <iostream>
+#include <vector>
+#include <unistd.h>   // for getpagesize()
+#include <execinfo.h> // for backtrace
+#include <map>        // for std::map
+
+#include <dsaX.h>
+#include <enums.h>
+
+
+using namespace std;
+
+// strip path from __FILE__
+// DMH: Place somewhere more sensible when working
+constexpr const char *str_end(const char *str) { return *str ? str_end(str + 1) : str; }
+constexpr bool str_slant(const char *str) { return *str == '/' ? true : (*str ? str_slant(str + 1) : false); }
+constexpr const char *r_slant(const char *str) { return *str == '/' ? (str + 1) : r_slant(str - 1); }
+constexpr const char *file_name(const char *str) { return str_slant(str) ? r_slant(str_end(str)) : str; }
+
+// Define wrappers around function. May wish to place <function>_
+// methods in a dsaX namespace later
+void *pinned_malloc_(const char *func, const char *file, int line, size_t size);
+#define pinned_malloc(size) pinned_malloc_(__func__, file_name(__FILE__), __LINE__, size)
+
+void *device_malloc_(const char *func, const char *file, int line, size_t size);
+#define device_malloc(size) device_malloc_(__func__, file_name(__FILE__), __LINE__, size)
+
+void *device_pinned_malloc_(const char *func, const char *file, int line, size_t size);
+#define device_pinned_malloc(size) device_pinned_malloc_(__func__, file_name(__FILE__), __LINE__, size)
+
+void *safe_malloc_(const char *func, const char *file, int line, size_t size);
+#define safe_malloc(size) safe_malloc_(__func__, file_name(__FILE__), __LINE__, size)
+
+void *mapped_malloc_(const char *func, const char *file, int line, size_t size);
+#define mapped_malloc(size) mapped_malloc_(__func__, file_name(__FILE__), __LINE__, size)
+
+void *managed_malloc_(const char *func, const char *file, int line, size_t size);
+#define managed_malloc(size) managed_malloc_(__func__, file_name(__FILE__), __LINE__, size)
+
+void managed_free_(const char *func, const char *file, int line, void *ptr);
+#define managed_free(ptr) managed_free_(__func__, file_name(__FILE__), __LINE__, ptr)
+
+void device_free_(const char *func, const char *file, int line, void *ptr);
+#define device_free(ptr) device_free_(__func__, file_name(__FILE__), __LINE__, ptr)
+
+void device_pinned_free_(const char *func, const char *file, int line, void *ptr);
+#define device_pinned_free(ptr) device_pinned_free_(__func__, file_name(__FILE__), __LINE__, ptr)
+
+void host_free_(const char *func, const char *file, int line, void *ptr);
+#define host_free(ptr) host_free_(__func__, file_name(__FILE__), __LINE__, ptr)
+
+/*
+  @brief Get device view of a host-mapped pointer
+*/
+void *get_mapped_device_pointer_(const char *func, const char *file, int line, const void *ptr);
+#define get_mapped_device_pointer(ptr) get_mapped_device_pointer_(__func__, file_name(__FILE__), __LINE__, ptr)
+
+// Create a mem_pool namespace to differentiate
+// between regular memory management methods
+// and those utilising memory pooling
+namespace mem_pool {
+
+  /**
+     @brief Initialize the memory pool allocator
+  */
+  void init();
+  
+  /**
+     @brief Allocate device-memory.  If free pre-existing allocation exists
+     reuse this.
+     @param size Size of allocation
+     @return Pointer to allocated memory
+  */
+  void *device_malloc_(const char *func, const char *file, int line, size_t size);
+  
+  /**
+     @brief Virtual free of device-memory allocation.
+     @param ptr Pointer to be (virtually) freed
+  */
+  void device_free_(const char *func, const char *file, int line, void *ptr);
+  
+  /**
+     @brief Allocate pinned-memory.
+     If a free pre-existing allocation exists, reuse this.
+     @param size Size of allocation
+     @return Pointer to allocated memory
+  */
+  void *pinned_malloc_(const char *func, const char *file, int line, size_t size);
+  
+  /**
+     @brief Virtual free of pinned-memory allocation.
+     @param ptr Pointer to be (virtually) freed
+  */
+  void pinned_free_(const char *func, const char *file, int line, void *ptr);
+
+  /**
+     @brief Free all outstanding device-memory allocations.
+  */
+  void flush_device();
+  
+  /**
+     @brief Free all outstanding pinned-memory allocations.
+  */
+  void flush_pinned();  
+}
+
+#define pool_device_malloc(size) mem_pool::device_malloc_(__func__, __FILE__, __LINE__, size)
+#define pool_device_free(ptr) mem_pool::device_free_(__func__, __FILE__, __LINE__, ptr)
+#define pool_pinned_malloc(size) mem_pool::pinned_malloc_(__func__, __FILE__, __LINE__, size)
+#define pool_pinned_free(ptr) mem_pool::pinned_free_(__func__, __FILE__, __LINE__, ptr)
+
diff --git a/include/dsaX_ptr.h b/include/dsaX_ptr.h
new file mode 100644
index 0000000..de452f0
--- /dev/null
+++ b/include/dsaX_ptr.h
@@ -0,0 +1,102 @@
+#pragma once
+
+#include <ostream>
+#include "dsaX_malloc.h"
+
+/**
+   Object that stores a memory allocation with different views for
+   host or device.  Depending on the nature of the underlying memory
+   type, both views may not be defined
+
+   type                       defined views
+   DSA_MEMORY_DEVICE          device only
+   DSA_MEMORY_DEVICE_PINNED   device only
+   DSA_MEMORY_HOST            host only
+   DSA_MEMORY_HOST_PINNED     host only
+   DSA_MEMORY_MAPPED          both (pinned to host)
+   DSA_MEMORY_MANAGED         both
+*/
+class dsaX_ptr
+{
+  friend std::ostream &operator<<(std::ostream &output, const dsaX_ptr &ptr);
+  dsaXMemoryType type = DSA_MEMORY_INVALID;  /** Memory type of the allocation */
+  size_t size = 0;                           /** Size of the allocation */
+  bool pool = false;                         /** Is the allocation pooled */
+  void *device = nullptr;                    /** Device-view of the allocation */
+  void *host = nullptr;                      /** Host-view of the allocation */
+  bool reference = false;                    /** Is this a reference to another allocation */
+
+  /**
+     @brief Internal deallocation routine
+  */
+  void destroy();
+
+public:
+  dsaX_ptr() = default;
+  dsaX_ptr(dsaX_ptr &&other) noexcept : reference(other.reference) { *this = static_cast<dsaX_ptr &&>(other); } /** Empties other so the allocation is only freed once */
+  dsaX_ptr &operator=(dsaX_ptr &&);
+  dsaX_ptr(const dsaX_ptr &) = delete;
+  dsaX_ptr &operator=(const dsaX_ptr &) = delete;
+
+  /**
+     @brief Constructor for dsaX_ptr
+     @param[in] type The memory type of the allocation
+     @param[in] size The size of the allocation
+     @param[in] pool Whether the allocation should be in the memory pool (default is true)
+  */
+  dsaX_ptr(dsaXMemoryType type, size_t size, bool pool = true);
+
+  /**
+     @brief Constructor for dsaX_ptr where we are wrapping a non-owned pointer
+     @param[in] ptr Raw base pointer
+     @param[in] type The memory type of the allocation
+  */
+  dsaX_ptr(void *ptr, dsaXMemoryType type);
+
+  /**
+     @brief Destructor for the dsaX_ptr
+  */
+  virtual ~dsaX_ptr();
+
+  /**
+     @brief Specialized exchange function to use in place of
+     std::exchange when exchanging dsaX_ptr objects: moves obj to
+     *this, and moves new_value to obj
+     @param[in,out] obj
+     @param[in] new_value New value for obj to take
+  */
+  void exchange(dsaX_ptr &obj, dsaX_ptr &&new_value);
+
+  /**
+     @return Returns true if allocation is visible to the device
+  */
+  bool is_device() const;
+
+  /**
+     @return Returns true if allocation is visible to the host
+  */
+  bool is_host() const;
+
+  /**
+     Return view of the pointer.  For mapped memory we return the device view.
+  */
+  void *data() const;
+
+  /**
+     Return the device view of the pointer
+  */
+  void *data_device() const;
+
+  /**
+     Return the host view of the pointer
+  */
+  void *data_host() const;
+
+  /**
+     Return if the instance is a reference rather than an allocation
+  */
+  bool is_reference() const;
+};
+
+std::ostream &operator<<(std::ostream &output, const dsaX_ptr &ptr);
+
diff --git a/include/enums.h b/include/enums.h
index 607d9d3..aa86573 100644
--- a/include/enums.h
+++ b/include/enums.h
@@ -2,6 +2,16 @@
 
 #define DSA_INVALID_ENUM (-0x7fffffff - 1)
 
+typedef enum dsaXMemoryType_s {
+  DSA_MEMORY_DEVICE,
+  DSA_MEMORY_DEVICE_PINNED,
+  DSA_MEMORY_HOST,
+  DSA_MEMORY_HOST_PINNED,
+  DSA_MEMORY_MAPPED,
+  DSA_MEMORY_MANAGED,
+  DSA_MEMORY_INVALID = DSA_INVALID_ENUM
+} dsaXMemoryType;
+
 typedef enum dsaXError_t {
   DSA_SUCCESS = 0,
   DSA_ERROR = 1,
@@ -63,3 +73,4 @@ typedef enum dsaXMemcpyKind_s {
   dsaXMemcpyDeviceToDeviceAsync = 7,
   dsaXMemcpyInvalid = DSA_INVALID_ENUM
 } dsaXMemcpyKind;
+
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 67f8543..de05a16 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -13,11 +13,14 @@ endif()
 set(DSAX_OBJS
   cuda_interface.cu
   cublas_interface.cu
+  malloc.cu
+  dsaX_ptr.cpp
   cuda_handles.cu
   magma_interface.cu
   blas_interface.cpp
   beamformer.cpp
   dsaX_base.cpp
+
   correlator.cpp
   interface.cpp
   utils.cpp
@@ -65,7 +68,7 @@ add_library(DSA_XENGINE::dsaX ALIAS dsaX)
 target_sources(dsaX PRIVATE $<TARGET_OBJECTS:dsax_cpp> ${DSAX_CU_OBJS})
 
 if(CUDAToolkit_FOUND)
-  target_link_libraries(dsaX INTERFACE CUDA::cudart_static ${CUDA_cublas_LIBRARY})
+  target_link_libraries(dsaX INTERFACE CUDA::cuda_driver CUDA::cudart_static CUDA::cublas)
 endif()
 
 if(DSA_XENGINE_ENABLE_PSRDADA)
diff --git a/src/correlator.cpp b/src/correlator.cpp
index e45595d..2662e58 100644
--- a/src/correlator.cpp
+++ b/src/correlator.cpp
@@ -188,7 +188,7 @@ void Correlator::compute(void *output, void *input) {
   dsaXDeviceSynchronize();
 }
 
- 
+/*
 // correlator function
 // workflow: copy to device, reorder, stridedBatchedGemm, reorder, copy back to host
 // DMH: CUDA references excised. Make me a class
@@ -282,3 +282,4 @@ void dcorrelator(corr_handle *d) {
   // reorder output data
   reorderCorrOutput(d);
 }
+*/
diff --git a/src/cuda_handles.cu b/src/cuda_handles.cu
index 9b65281..1b756d0 100644
--- a/src/cuda_handles.cu
+++ b/src/cuda_handles.cu
@@ -10,10 +10,10 @@ using namespace std;
 //-------------------------
 void init_streams(unsigned int n_streams) {
 
-  if(n_streams < 2 || n_streams > 9) {
-    cout << "dsaX Error: Must have at least 2 and fewer than 9 streams, requested " << n_streams << endl;
-    exit(0);
-  }
+  //if(n_streams < 2 || n_streams > 9) {
+  //cout << "dsaX Error: Must have at least 2 and fewer than 9 streams, requested " << n_streams << endl;
+  //exit(0);
+  //}
   
   if(!stream_init) {
     streams.reserve(n_streams);
diff --git a/src/cuda_interface.cu b/src/cuda_interface.cu
index 51ee957..854b75c 100644
--- a/src/cuda_interface.cu
+++ b/src/cuda_interface.cu
@@ -5,10 +5,12 @@
 #include "cuda_interface.h"
 #include "cuda_kernels.h"
 #include "cuda_handles.h"
+// DMH: Everything in this file is CUDA aware.
 
-using namespace std;
+//#include "dsaX_malloc.h"
+#include "dsaX_ptr.h"
 
-// DMH: Everything in this file is CUDA aware.
+using namespace std;
 
 __global__ void deviceInspectHalfCI(half *input, int stage) {
   int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -36,8 +38,8 @@ void destroyStreamsCuda(){
   destroy_streams();
 }
 
-void dsaXDestroyCuda(int dev){
-  //
+void dsaXDestroyCuda(){
+  cudaDeviceReset();
 }
 
 void *dsaXHostRegisterCuda(size_t size) {
@@ -55,16 +57,34 @@ void *dsaXHostRegisterCuda(size_t size) {
 void initializeCorrCudaMemory(corr_handle *d, unsigned int n_streams) {
 
   // for correlator
-  cudaMalloc((void **)(&d->d_input), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams);
-  cudaMalloc((void **)(&d->d_r), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams);
-  cudaMalloc((void **)(&d->d_i), sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams);
-  cudaMalloc((void **)(&d->d_tx), sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams);
-  cudaMalloc((void **)(&d->d_output), sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2*n_streams);
-  cudaMalloc((void **)(&d->d_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams);
-  cudaMalloc((void **)(&d->d_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams);
-  cudaMalloc((void **)(&d->d_tx_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams);
-  cudaMalloc((void **)(&d->d_tx_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams);
+  
+  cudaMalloc((void **)(&d->d_input),   sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams);
+  //dsaX_ptr ptr = dsaX_ptr(DSA_MEMORY_DEVICE, sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams, true);
 
+  //cout << &ptr << endl;
+  
+  //d->d_input = 
+    
+  cudaMalloc((void **)(&d->d_r),       sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams);
+  cudaMalloc((void **)(&d->d_i),       sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams);
+  //cudaMalloc((void **)(&d->d_tx),      sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams);
+  cudaMalloc((void **)(&d->d_output),  sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2*n_streams);
+  cudaMalloc((void **)(&d->d_outr),    sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams);
+  cudaMalloc((void **)(&d->d_outi),    sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams);
+  //cudaMalloc((void **)(&d->d_tx_outr), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams);
+  //cudaMalloc((void **)(&d->d_tx_outi), sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams);
+
+  // Total device memory: d_input, d_r, d_i, d_output, d_outr, d_outi above plus d_idxs below
+  uint64_t mem_size = sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams;
+  mem_size += sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams;
+  mem_size += sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams;
+  mem_size += sizeof(float)*NBASE*NCHAN_PER_PACKET*2*2*n_streams;
+  mem_size += sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams;
+  mem_size += sizeof(half)*NCHAN_PER_PACKET*2*2*NANTS*NANTS*halfFac*n_streams;
+  mem_size += sizeof(int)*NBASE;
+
+  cout << "mem_size = " << mem_size/pow(1024,3) << " GB"  << endl;
+  //exit(0);
   // DMH: fix me
   cudaMalloc((void **)(&d->d_idxs), sizeof(int)*NBASE);
 }
@@ -343,18 +363,45 @@ void sumBeamCuda(unsigned char *input, float *output, int blocks, int tpb) {
   sum_beam<<<blocks,tpb>>>(input, output);  
 }
 
+// CUDA API wrappers
+// DMH: Wrap all these calls around a CHECK_ERROR to save on
+// lines of code
+void dsaXDeviceSynchronizeCuda() {
+
+  // Keep the returned status: failures of earlier asynchronous work are reported by this call
+  cudaError error = cudaDeviceSynchronize();
+  if(error != cudaSuccess) {
+    std::cout << "dsaXDeviceSynchronizeCuda failed with error " << cudaGetErrorString(error) << std::endl;
+    exit(0);
+  }
+}
+
 void dsaXmemsetCuda(void *array, int ch, size_t n){
-  cudaMemset(array, ch, n);
+  
+  // Report which CUDA error occurred before aborting (previously the process exited silently)
+  cudaError error = cudaMemset(array, ch, n);
+  if(error != cudaSuccess) {
+    std::cout << "dsaXmemsetCuda failed with error " << cudaGetErrorString(error) << std::endl;
+    exit(0);
+  }
+  
 }
 
-void dsaXDeviceSynchronizeCuda() {
-  cudaDeviceSynchronize();
+void dsaXmallocCuda(void *array, size_t array_length){
+
+  // for correlator
+  //cudaMalloc((void **)(&d->d_input),   sizeof(char)*NPACKETS_PER_BLOCK*NANTS*NCHAN_PER_PACKET*2*2*n_streams);
+  //cudaMalloc((void **)(&d->d_r),       sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams);
+  //cudaMalloc((void **)(&d->d_i),       sizeof(half)*NCHAN_PER_PACKET*2*NANTS*NPACKETS_PER_BLOCK*2*n_streams);
+  
 }
 
 void dsaXmemcpyCuda(void *array_out, void *array_in, size_t n, dsaXMemcpyKind kind, int stream){
 
   cudaError error = cudaSuccess;
   cudaStream_t str = get_stream(stream);
+
+  cout << "kind = " << kind << endl;
   
   switch(kind) {
   case dsaXMemcpyHostToHost:
@@ -384,6 +431,12 @@ void dsaXmemcpyCuda(void *array_out, void *array_in, size_t n, dsaXMemcpyKind ki
   default:
     std::cout << "dsaX error: unknown dsaXMemcpyKind" << std::endl;
   }
-  if(error != cudaSuccess) cudaGetLastError();
+  
+  if(error != cudaSuccess) {
+    const char *string = cudaGetErrorString(error);
+    //cudaGetLastError();
+    //cudaGetErrorString(&string);
+    printf("dsaXmemcpyCuda failed with error %s\n", string);
+    exit(0);
+  }
 }
-
diff --git a/src/dsaX_api.cu b/src/dsaX_api.cu
new file mode 100644
index 0000000..8f26a49
--- /dev/null
+++ b/src/dsaX_api.cu
@@ -0,0 +1,43 @@
+
+
+
+void qudaMemcpy_(void *dst, const void *src, size_t count, qudaMemcpyKind kind, const char *func, const char *file,
+                   const char *line)
+  {
+    if (count == 0) return;
+    QudaMem copy(dst, src, count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func, file, line);
+  }
+
+
+void dsaMemcpyAsync_(void *dst, const void *src, size_t count, dsaMemcpyKind kind, const qudaStream_t &stream,
+		     const char *func, const char *file, const char *line)
+  {
+    if (count == 0) return;
+
+    if (kind == qudaMemcpyDeviceToDevice) {
+      QudaMem copy(dst, src, count, qudaMemcpyKindToAPI(kind), stream, true, func, file, line);
+    } else {
+#ifdef USE_DRIVER_API
+      switch (kind) {
+      case qudaMemcpyDeviceToHost:
+        PROFILE(cuMemcpyDtoHAsync(dst, (CUdeviceptr)src, count, get_stream(stream)), QUDA_PROFILE_MEMCPY_D2H_ASYNC);
+        break;
+      case qudaMemcpyHostToDevice:
+        PROFILE(cuMemcpyHtoDAsync((CUdeviceptr)dst, src, count, get_stream(stream)), QUDA_PROFILE_MEMCPY_H2D_ASYNC);
+        break;
+      case qudaMemcpyDeviceToDevice:
+        PROFILE(cuMemcpyDtoDAsync((CUdeviceptr)dst, (CUdeviceptr)src, count, get_stream(stream)),
+                QUDA_PROFILE_MEMCPY_D2D_ASYNC);
+        break;
+      case qudaMemcpyDefault:
+        PROFILE(cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, count, get_stream(stream)),
+                QUDA_PROFILE_MEMCPY_DEFAULT_ASYNC);
+        break;
+      default: errorQuda("Unsupported cuMemcpyTypeAsync %d", kind);
+      }
+#else
+      PROFILE(cudaMemcpyAsync(dst, src, count, qudaMemcpyKindToAPI(kind), get_stream(stream)),
+              kind == qudaMemcpyDeviceToHost ? QUDA_PROFILE_MEMCPY_D2H_ASYNC : QUDA_PROFILE_MEMCPY_H2D_ASYNC);
+#endif
+    }
+  }
diff --git a/src/dsaX_ptr.cpp b/src/dsaX_ptr.cpp
new file mode 100644
index 0000000..702654d
--- /dev/null
+++ b/src/dsaX_ptr.cpp
@@ -0,0 +1,155 @@
+#include <utility>
+#include "dsaX_ptr.h"
+
+dsaX_ptr::dsaX_ptr(dsaXMemoryType type, size_t size, bool pool) : type(type), size(size), pool(pool) {
+  if (pool && (type != DSA_MEMORY_DEVICE && type != DSA_MEMORY_HOST_PINNED && type != DSA_MEMORY_HOST)) {    
+    printf("dsaX ERROR: Memory pool not available for memory type %d", type);
+    exit(0);
+  }
+  
+  if (size > 0) {
+    switch (type) {
+    case DSA_MEMORY_DEVICE: device = pool ? pool_device_malloc(size) : device_malloc(size); break;
+    case DSA_MEMORY_DEVICE_PINNED: device = device_pinned_malloc(size); break;
+    case DSA_MEMORY_HOST: host = safe_malloc(size); break;
+    case DSA_MEMORY_HOST_PINNED: host = pool ? pool_pinned_malloc(size) : pinned_malloc(size); break;
+    case DSA_MEMORY_MAPPED:
+      host = mapped_malloc(size);
+      device = get_mapped_device_pointer(host);
+      break;
+    case DSA_MEMORY_MANAGED:
+      host = managed_malloc(size);
+      device = host;
+      break;
+    default:
+      printf("dsaX ERROR: Unknown memory type %d", type);
+      exit(0);
+    }
+  }
+}
+
+dsaX_ptr::dsaX_ptr(void *ptr, dsaXMemoryType type) : type(type), reference(true) {
+  switch (type) {
+  case DSA_MEMORY_DEVICE:
+  case DSA_MEMORY_DEVICE_PINNED:
+    device = ptr;
+    host = nullptr;
+    break;
+  case DSA_MEMORY_HOST:
+  case DSA_MEMORY_HOST_PINNED:
+    device = nullptr;
+    host = ptr;
+    break;
+  case DSA_MEMORY_MANAGED:
+    device = ptr;
+    host = ptr;
+    break;
+  default:
+    printf("dsaX ERROR: Unsupported memory type %d", type);
+    exit(0);
+  }
+}
+
+dsaX_ptr &dsaX_ptr::operator=(dsaX_ptr &&other) {
+  if (&other != this) {
+    // Overwriting a live allocation would leak it, so this is fatal like the other dsaX errors
+    if (size > 0) { printf("dsaX ERROR: Cannot move to already initialized dsaX_ptr"); exit(0); }
+    type = std::exchange(other.type, DSA_MEMORY_INVALID);
+    size = std::exchange(other.size, 0);
+    pool = std::exchange(other.pool, false);
+    device = std::exchange(other.device, nullptr);
+    host = std::exchange(other.host, nullptr);
+    reference = std::exchange(other.reference, false); // keep is_reference() correct after a move
+  }
+  return *this;
+}
+
+void dsaX_ptr::destroy() {
+  if (size > 0) {
+    switch (type) {
+    case DSA_MEMORY_DEVICE: pool ? pool_device_free(device) : device_free(device); break;
+    case DSA_MEMORY_DEVICE_PINNED: device_pinned_free(device); break;
+    case DSA_MEMORY_HOST: host_free(host); break;
+    case DSA_MEMORY_HOST_PINNED: pool ? pool_pinned_free(host) : host_free(host); break;
+    case DSA_MEMORY_MAPPED: host_free(host); break;
+    case DSA_MEMORY_MANAGED: managed_free(host); break; // NOTE(review): assumes dsaX_malloc.h provides managed_free(), the counterpart of managed_malloc()
+    default:
+      printf("Unknown memory type %d", type);
+      exit(0);
+    }
+  }
+  size = 0;
+  device = nullptr;
+  host = nullptr;
+}
+
+dsaX_ptr::~dsaX_ptr() {
+  destroy();
+}
+
+void dsaX_ptr::exchange(dsaX_ptr &obj, dsaX_ptr &&new_value) {
+  destroy();
+  *this = std::move(obj);
+  obj = std::move(new_value);
+}
+
+bool dsaX_ptr::is_device() const {
+  switch (type) {
+  case DSA_MEMORY_DEVICE:
+  case DSA_MEMORY_DEVICE_PINNED:
+  case DSA_MEMORY_MAPPED:
+  case DSA_MEMORY_MANAGED: return true;
+  default: return false;
+  }
+}
+
+bool dsaX_ptr::is_host() const {
+  switch (type) {
+  case DSA_MEMORY_HOST:
+  case DSA_MEMORY_HOST_PINNED:
+  case DSA_MEMORY_MANAGED: return true;
+  default: return false;
+  }
+}
+
+void *dsaX_ptr::data() const {
+  void *ptr = nullptr;
+
+  switch (type) {
+  case DSA_MEMORY_DEVICE:
+  case DSA_MEMORY_DEVICE_PINNED:
+  case DSA_MEMORY_MAPPED:
+  case DSA_MEMORY_MANAGED: ptr = device; break;
+  case DSA_MEMORY_HOST:
+  case DSA_MEMORY_HOST_PINNED: ptr = host; break;
+  default:
+    printf("Unknown memory type %d", type);
+    exit(0);
+  }
+
+  return ptr;
+}
+
+void *dsaX_ptr::data_device() const {
+  if (!device) {
+    printf("dsaX ERROR: Device view not defined");
+    exit(0);
+  }
+  return device;
+}
+
+void *dsaX_ptr::data_host() const {
+  if (!host) {
+    printf("dsaX ERROR: Host view not defined");
+    exit(0);
+  }
+  return host;
+}
+
+bool dsaX_ptr::is_reference() const { return reference; }
+
+std::ostream &operator<<(std::ostream &output, const dsaX_ptr &ptr) {
+  output << "{type = " << ptr.type << ", size = " << ptr.size << ", pool = " << ptr.pool
+	 << ", device = " << ptr.device << ", host = " << ptr.host << ", reference = " << ptr.reference << "}";
+  return output;
+}
diff --git a/src/interface.cpp b/src/interface.cpp
index 31dc832..41e7caf 100644
--- a/src/interface.cpp
+++ b/src/interface.cpp
@@ -44,6 +44,9 @@ void dsaXEnd() {
   // output metrics
   cout << "dsaX lifetime = " << (1.0*app_timer.elapsed().count())/(1e6) << endl;
   cout << "dsaX init = " << (1.0*init_timer.elapsed().count())/(1e6) << endl;
+#if DSA_XENGINE_TARGET_CUDA
+  dsaXDestroyCuda();
+#endif
 }
 
 void *dsaXHostRegister(size_t size) {
diff --git a/src/malloc.cu b/src/malloc.cu
new file mode 100644
index 0000000..55bad3f
--- /dev/null
+++ b/src/malloc.cu
@@ -0,0 +1,631 @@
+#include "dsaX_malloc.h"
+
+#include "cuda_headers.h"
+//#include "cuda_interface.h"
+//#include "cuda_kernels.h"
+//#include "cuda_handles.h"
+// DMH: Everything in this file is CUDA aware.
+
+enum AllocType { DEVICE, DEVICE_PINNED, HOST, PINNED, MAPPED, MANAGED, SHMEM, N_ALLOC_TYPE };
+
+class MemAlloc
+{
+  
+public:
+  std::string func;
+  std::string file;
+  int line;
+  size_t size;
+  size_t base_size;
+  
+  MemAlloc() : line(-1), size(0), base_size(0) {}
+  
+  MemAlloc(std::string func, std::string file, int line) : func(func), file(file), line(line), size(0), base_size(0)
+  {
+  }
+  
+  MemAlloc(const MemAlloc &) = default;
+  MemAlloc(MemAlloc &&) = default;
+  virtual ~MemAlloc() = default;
+  MemAlloc &operator=(const MemAlloc &) = default;
+  MemAlloc &operator=(MemAlloc &&) = default;
+};
+
+static std::map<void *, MemAlloc> alloc[N_ALLOC_TYPE];
+static size_t total_bytes[N_ALLOC_TYPE] = {0};
+static size_t max_total_bytes[N_ALLOC_TYPE] = {0};
+static size_t total_host_bytes, max_total_host_bytes;
+static size_t total_pinned_bytes, max_total_pinned_bytes;
+
+size_t device_allocated() { return total_bytes[DEVICE]; }
+
+size_t pinned_allocated() { return total_bytes[PINNED]; }
+
+size_t mapped_allocated() { return total_bytes[MAPPED]; }
+
+size_t managed_allocated() { return total_bytes[MANAGED]; }
+
+size_t host_allocated() { return total_bytes[HOST]; }
+
+size_t device_allocated_peak() { return max_total_bytes[DEVICE]; }
+
+size_t pinned_allocated_peak() { return max_total_bytes[PINNED]; }
+
+size_t mapped_allocated_peak() { return max_total_bytes[MAPPED]; }
+
+size_t managed_allocated_peak() { return max_total_bytes[MANAGED]; }
+
+size_t host_allocated_peak() { return max_total_bytes[HOST]; }
+
+static void print_trace(void)
+{
+  // Dump up to 10 frames of the current call stack to stdout
+  void *array[10];
+  int size = backtrace(array, 10);
+  char **strings = backtrace_symbols(array, size);
+  printf("Obtained %d stack frames.\n", size);
+  if (!strings) return; // backtrace_symbols() returns NULL when it cannot allocate the symbol table
+  for (int i = 0; i < size; i++) printf("%s\n", strings[i]);
+  free(strings);
+}
+
+static void print_alloc_header()
+{
+  printf("Type    Pointer          Size             Location\n");
+  printf("----------------------------------------------------------\n");
+}
+
+static void print_alloc(AllocType type)
+{
+  const char *type_str[] = {"Device", "Device Pinned", "Host  ", "Pinned", "Mapped", "Managed", "Shmem "};
+
+  for (auto entry : alloc[type]) {
+    void *ptr = entry.first;
+    MemAlloc a = entry.second;
+    printf("%s  %15p  %15lu  %s(), %s:%d\n", type_str[type], ptr, (unsigned long)a.base_size, a.func.c_str(),
+	   a.file.c_str(), a.line);
+  }
+}
+
+static void track_malloc(const AllocType &type, const MemAlloc &a, void *ptr)
+{
+  total_bytes[type] += a.base_size;
+  if (total_bytes[type] > max_total_bytes[type]) { max_total_bytes[type] = total_bytes[type]; }
+  if (type != DEVICE && type != DEVICE_PINNED && type != SHMEM) {
+    total_host_bytes += a.base_size;
+    if (total_host_bytes > max_total_host_bytes) { max_total_host_bytes = total_host_bytes; }
+  }
+  if (type == PINNED || type == MAPPED) {
+    total_pinned_bytes += a.base_size;
+    if (total_pinned_bytes > max_total_pinned_bytes) { max_total_pinned_bytes = total_pinned_bytes; }
+  }
+  alloc[type][ptr] = a;
+}
+
+static void track_free(const AllocType &type, void *ptr)
+{
+  size_t size = alloc[type][ptr].base_size;
+  total_bytes[type] -= size;
+  if (type != DEVICE && type != DEVICE_PINNED && type != SHMEM) { total_host_bytes -= size; }
+  if (type == PINNED || type == MAPPED) { total_pinned_bytes -= size; }
+  alloc[type].erase(ptr);
+}
+
+void *get_mapped_device_pointer_(const char *func, const char *file, int line, const void *host)
+{
+  void *device = nullptr;
+  auto error = cudaHostGetDevicePointer(&device, const_cast<void *>(host), 0);
+  if (error != cudaSuccess) {
+    printf("dsaX ERROR: cudaHostGetDevicePointer failed with error %s (%s:%d in %s())\n", cudaGetErrorString(error), file, line, func);
+    exit(0); // abort like the other allocation helpers instead of returning an unset pointer
+  }
+  return device;
+}
+
+bool use_managed_memory() {
+  static bool managed = false;
+  static bool init = false;
+  
+  if (!init) {
+    // Prefer the dsaX-prefixed variable; keep honouring the legacy QUDA name
+    char *enable_managed_memory = getenv("DSAX_ENABLE_MANAGED_MEMORY");
+    if (!enable_managed_memory) enable_managed_memory = getenv("QUDA_ENABLE_MANAGED_MEMORY");
+    if (enable_managed_memory && strcmp(enable_managed_memory, "1") == 0) {
+      printf("dsaX WARNING: Using managed memory for CUDA allocations\n");
+      managed = true;
+    }
+    
+    init = true;
+  }
+  
+  return managed;
+}
+
+/**
+ * Free managed memory allocated with managed_malloc().  device_free_()
+ * also forwards here when use_managed_memory() is enabled, because
+ * device_malloc_() then hands out managed allocations.
+ */
+void managed_free_(const char *func, const char *file, int line, void *ptr) {
+  if (!ptr) {
+    printf("dsaX ERROR: Attempt to free NULL managed pointer (%s:%d in %s())\n", file, line, func);
+    exit(0);
+  }
+  if (!alloc[MANAGED].count(ptr)) {
+    printf("dsaX ERROR: Attempt to free invalid managed pointer (%s:%d in %s())\n", file, line, func);
+    exit(0);
+  }
+  cudaError_t err = cudaFree(ptr);
+  if (err != cudaSuccess) {
+    printf("dsaX ERROR: Failed to free device memory (%s:%d in %s())\n", file, line, func);
+    exit(0);
+  }
+  track_free(MANAGED, ptr);
+}
+
+
+/**
+ * Free host memory allocated with safe_malloc(), pinned_malloc(),
+ * or mapped_malloc().  This function should only be called via the
+ * host_free() macro, defined in dsaX_malloc.h
+ */
+void host_free_(const char *func, const char *file, int line, void *ptr) {
+  if (!ptr) {
+    printf("dsaX ERROR: Attempt to free NULL host pointer (%s:%d in %s())\n", file, line, func);
+    exit(0);
+  }
+  if (alloc[HOST].count(ptr)) {
+    track_free(HOST, ptr);
+    free(ptr);
+  } else if (alloc[PINNED].count(ptr)) {
+    cudaError_t err = cudaHostUnregister(ptr);
+    if (err != cudaSuccess) {
+      printf("dsaX ERROR: Failed to unregister pinned memory (%s:%d in %s())\n", file, line, func);
+      exit(0);
+    }
+    track_free(PINNED, ptr);
+    free(ptr);
+  } else if (alloc[MAPPED].count(ptr)) {
+#ifdef HOST_ALLOC
+    cudaError_t err = cudaFreeHost(ptr);
+    if (err != cudaSuccess) {
+      printf("dsaX ERROR: Failed to free host memory (%s:%d in %s())\n", file, line, func);
+      exit(0);
+    }
+    track_free(MAPPED, ptr);
+#else
+    cudaError_t err = cudaHostUnregister(ptr);
+    if (err != cudaSuccess) {
+      printf("dsaX ERROR: Failed to unregister host-mapped memory (%s:%d in %s())\n", file, line, func);
+      exit(0);
+    }
+    track_free(MAPPED, ptr);
+    free(ptr);
+#endif
+  } else {
+    printf("dsaX ERROR: Attempt to free invalid host pointer (%s:%d in %s())\n", file, line, func);
+    print_trace();
+    printf("dsaX ERROR: Aborting");
+    exit(0);
+  }
+}
+
+
+/**
+ * Perform a standard cudaMalloc() with error-checking.  This
+ * function should only be called via the device_malloc() macro,
+ * defined in dsaX_malloc.h
+ */
+void *device_malloc_(const char *func, const char *file, int line, size_t size) {
+  
+  if (use_managed_memory()) return managed_malloc_(func, file, line, size);
+
+  MemAlloc a(func, file, line);
+  void *ptr;
+
+  a.size = a.base_size = size;
+
+  cudaError_t err = cudaMalloc(&ptr, size);
+  if (err != cudaSuccess) {
+    printf("dsaX ERROR: Failed to allocate device memory of size %zu (%s:%d in %s())\n", size, file, line, func);
+    exit(0);
+  }
+
+  // DMH: GET ON THIS! 
+  //if (is_prefetch_enabled()) dsaXMemPrefetchAsync(ptr, size, DSA_CUDA_FIELD_LOCATION, get_default_stream());
+  track_malloc(DEVICE, a, ptr);
+#ifdef HOST_DEBUG
+  cudaMemset(ptr, 0xff, size);
+#endif
+  return ptr;
+}
+
+/**
+ * Free device memory allocated with device_malloc().  This function
+ * should only be called via the device_free() macro, defined in
+ * dsaX_malloc.h
+ */
+void device_free_(const char *func, const char *file, int line, void *ptr) {
+  
+  if (use_managed_memory()) {
+    managed_free_(func, file, line, ptr);
+    return;
+  }
+
+  if (!ptr) {
+    printf("dsaX ERROR: Attempt to free NULL device pointer (%s:%d in %s())\n", file, line, func);
+    exit(0);
+  }
+  if (!alloc[DEVICE].count(ptr)) {
+    printf("dsaX ERROR: Attempt to free invalid device pointer (%s:%d in %s())\n", file, line, func);
+    exit(0);
+  }
+
+  cudaError_t err = cudaFree(ptr);
+  if (err != cudaSuccess) {
+    printf("dsaX ERROR: Failed to free device memory (%s:%d in %s())\n", file, line, func);
+    exit(0);
+  }
+  
+  track_free(DEVICE, ptr);
+}
+
+/**
+ * Free device memory allocated with device_pinned malloc().  This
+ * function should only be called via the device_pinned_free()
+ * macro, defined in dsaX_malloc.h
+ */
+void device_pinned_free_(const char *func, const char *file, int line, void *ptr) {
+
+  //DMH: I would think that we will always be using hardware with
+  //     compute >= 2.0, but this can be implemented later if needed.
+  //if (!comm_peer2peer_present()) {
+  //device_free_(func, file, line, ptr);
+  //return;
+  //}
+
+  if (!ptr) {
+    printf("dsaX ERROR: Attempt to free NULL device pointer (%s:%d in %s())\n", file, line, func);
+    exit(0);
+  }
+  if (!alloc[DEVICE_PINNED].count(ptr)) {
+    printf("dsaX ERROR: Attempt to free invalid device pointer (%s:%d in %s())\n", file, line, func);
+    exit(0);
+  }
+  CUresult err = cuMemFree((CUdeviceptr)ptr);
+  if (err != CUDA_SUCCESS) {
+    printf("dsaX ERROR: Failed to free device memory (%s:%d in %s())\n", file, line, func);
+    exit(0);
+  }
+  track_free(DEVICE_PINNED, ptr);
+}
+
+
+/**
+ * Under CUDA 4.0, cudaHostRegister seems to require that both the
+ * beginning and end of the buffer be aligned on page boundaries.
+ * This local function takes care of the alignment and gets called
+ * by pinned_malloc_() and mapped_malloc_()
+ */
+static void *aligned_malloc(MemAlloc &a, size_t size) {
+  void *ptr = nullptr;
+  
+  a.size = size;
+  
+  // we need to manually align to page boundaries to allow us to bind a texture to mapped memory
+  static int page_size = 2 * getpagesize();
+  a.base_size = ((size + page_size - 1) / page_size) * page_size; // round up to the nearest multiple of page_size                                                                                                
+  int align = posix_memalign(&ptr, page_size, a.base_size);
+  if (!ptr || align != 0) {
+    printf("Failed to allocate aligned host memory of size %zu (%s:%d in %s())\n", size, a.file.c_str(), a.line,
+	   a.func.c_str());
+    exit(0);
+  }
+  return ptr;
+}
+
+/**
+ * Perform a standard malloc() with error-checking.  This function
+ * should only be called via the safe_malloc() macro, defined in
+ * dsaX_malloc.h
+ */
+void *safe_malloc_(const char *func, const char *file, int line, size_t size) {
+
+  MemAlloc a(func, file, line);
+  a.size = a.base_size = size;
+  
+  void *ptr = malloc(size);
+  if (!ptr) {
+    printf("dsaX ERROR: Failed to allocate host memory of size %zu (%s:%d in %s())\n", size, file, line, func);
+    exit(0);
+  }
+  track_malloc(HOST, a, ptr);
+#ifdef HOST_DEBUG
+  memset(ptr, 0xff, size);
+#endif
+  return ptr;
+}
+
+/**
+ * Allocate page-locked ("pinned") host memory, and map it into the
+ * GPU address space.  This function should only be called via the
+ * mapped_malloc() macro, defined in dsaX_malloc.h
+ */
+void *mapped_malloc_(const char *func, const char *file, int line, size_t size) {
+
+  MemAlloc a(func, file, line);
+  
+  void *ptr = aligned_malloc(a, size);
+  cudaError_t err = cudaHostRegister(ptr, a.base_size, cudaHostRegisterMapped | cudaHostRegisterPortable);
+  if (err != cudaSuccess) {
+    printf("dsaX ERROR: Failed to register host-mapped memory of size %zu (%s:%d in %s())\n", size, file, line, func);
+    exit(0);
+  }
+  track_malloc(MAPPED, a, ptr);
+#ifdef HOST_DEBUG
+  memset(ptr, 0xff, a.base_size);
+#endif
+  return ptr;
+}
+
+/**
+ * Perform a standard cudaMallocManaged() with error-checking.  This
+ * function should only be called via the managed_malloc() macro,
+ * defined in dsaX_malloc.h
+ */
+void *managed_malloc_(const char *func, const char *file, int line, size_t size) {
+
+  MemAlloc a(func, file, line);
+  void *ptr;
+
+  a.size = a.base_size = size;
+
+  cudaError_t err = cudaMallocManaged(&ptr, size);
+  if (err != cudaSuccess) {
+    printf("dsaX ERROR: Failed to allocate managed memory of size %zu (%s:%d in %s())\n", size, file, line, func);
+    exit(0);
+  }
+  track_malloc(MANAGED, a, ptr);
+#ifdef HOST_DEBUG
+  cudaMemset(ptr, 0xff, size);
+#endif
+  return ptr;
+}
+
+
+/**
+ * Perform a cuMemAlloc with error-checking.  This function is to
+ * guarantee a unique memory allocation on the device. This
+ * should only be called via the device_pinned_malloc() macro,
+ * defined in dsaX_malloc.h.
+ */
+void *device_pinned_malloc_(const char *func, const char *file, int line, size_t size) {
+
+  //DMH: I would think that we will always be using hardware with
+  //     compute >= 2.0, but this can be implemented later if needed.
+  //if (!comm_peer2peer_present()) return device_malloc_(func, file, line, size);
+  
+  MemAlloc a(func, file, line);
+  void *ptr;
+  
+  a.size = a.base_size = size;
+  
+  CUresult err = cuMemAlloc((CUdeviceptr *)&ptr, size);
+  if (err != CUDA_SUCCESS) {
+    printf("Failed to allocate device memory of size %zu (%s:%d in %s())\n", size, file, line, func);
+    exit(0);
+  }
+  track_malloc(DEVICE_PINNED, a, ptr);
+#ifdef HOST_DEBUG
+  cudaMemset(ptr, 0xff, size);
+#endif
+  return ptr;
+}
+
+
+/**
+ * Allocate page-locked ("pinned") host memory.  Call only through the
+ * pinned_malloc() macro, defined in dsaX_malloc.h.  Exits non-zero on failure.
+ *
+ * The buffer comes from aligned_malloc() and is page-locked afterwards with
+ * cudaHostRegister().  cudaHostAlloc() is deliberately avoided: buffers
+ * allocated that way have been observed to cause problems when shared
+ * with MPI via GPU Direct on some systems.
+ */
+void *pinned_malloc_(const char *func, const char *file, int line, size_t size) {
+
+  MemAlloc a(func, file, line);
+  void *ptr = aligned_malloc(a, size);
+
+  cudaError_t err = cudaHostRegister(ptr, a.base_size, cudaHostRegisterDefault);
+  if (err != cudaSuccess) {
+    printf("dsaX ERROR: Failed to register pinned memory of size %zu (%s:%d in %s())\n", size, file, line, func);
+    exit(EXIT_FAILURE); // was exit(0): a failed registration must not report success
+  }
+  track_malloc(PINNED, a, ptr);
+#ifdef HOST_DEBUG
+  memset(ptr, 0xff, a.base_size); // fill the whole registered region with 0xff bytes in debug builds
+#endif
+  return ptr;
+}
+
+namespace mem_pool {
+
+  /** Cache of inactive pinned-memory allocations, keyed by size in
+      bytes.  Freed pinned buffers are kept here so that later
+      requests can reuse them with minimal overhead.
+  */
+  static std::multimap<size_t, void *> pinnedCache;
+
+  /** Sizes of active pinned-memory allocations, keyed by pointer.
+      For convenience, we keep track of the sizes of active
+      allocations (i.e., those not in the cache).
+  */
+  static std::map<void *, size_t> pinnedSize;
+  
+  /** Cache of inactive device-memory allocations, keyed by size in
+      bytes.  Freed device buffers are kept here so that later
+      requests can reuse them with minimal overhead.
+  */
+  static std::multimap<size_t, void *> deviceCache;
+  
+  /** Sizes of active device-memory allocations, keyed by pointer.
+      For convenience, we keep track of the sizes of active
+      allocations (i.e., those not in the cache).
+  */
+  static std::map<void *, size_t> deviceSize;
+  
+  static bool pool_init = false; // set to true by init() once the environment has been read
+  
+  /** whether to use a memory pool allocator for device memory (may be overridden by init()) */
+  static bool device_memory_pool = true;
+  
+  /** whether to use a memory pool allocator for pinned memory (may be overridden by init()) */
+  static bool pinned_memory_pool = true;
+
+  /**
+   * One-time pool configuration from the environment.  Each pool is used
+   * unless its variable (DSAX_ENABLE_DEVICE_MEMORY_POOL or
+   * DSAX_ENABLE_PINNED_MEMORY_POOL) is set to exactly "0".  Calls made
+   * after the first one return immediately.
+   */
+  void init() {
+    if (pool_init) return;
+
+    // device memory pool
+    const char *enable_device_pool = getenv("DSAX_ENABLE_DEVICE_MEMORY_POOL");
+    device_memory_pool = !enable_device_pool || strcmp(enable_device_pool, "0") != 0;
+    // Messages are newline-terminated so they do not run into later output.
+    printf("%s\n", device_memory_pool ? "dsaX Warning: Using device memory pool allocator" :
+                                        "dsaX Warning: Not using device memory pool allocator");
+
+    // pinned memory pool
+    const char *enable_pinned_pool = getenv("DSAX_ENABLE_PINNED_MEMORY_POOL");
+    pinned_memory_pool = !enable_pinned_pool || strcmp(enable_pinned_pool, "0") != 0;
+    printf("%s\n", pinned_memory_pool ? "dsaX Warning: Using pinned memory pool allocator" :
+                                        "dsaX Warning: Not using pinned memory pool allocator");
+
+    pool_init = true;
+  }
+  /** Pooled pinned allocation: reuse the smallest cached buffer of at least
+      nbytes; otherwise allocate anew, first releasing the smallest cached
+      buffer if the cache is non-empty.  Returns the buffer. */
+  void *pinned_malloc_(const char *func, const char *file, int line, size_t nbytes) {
+    // The global allocator must be named with a leading "::"; an unqualified
+    // pinned_malloc_ here resolves to this function and recurses forever.
+    if (!pinned_memory_pool) return ::pinned_malloc_(func, file, line, nbytes);
+    void *ptr = nullptr;
+    auto it = pinnedCache.lower_bound(nbytes);
+    if (it != pinnedCache.end()) { // sufficiently large allocation found
+      nbytes = it->first; // record the real size of the reused buffer
+      ptr = it->second;
+      pinnedCache.erase(it);
+    } else {
+      if (!pinnedCache.empty()) { // sacrifice the smallest cached allocation
+        it = pinnedCache.begin();
+        ptr = it->second;
+        pinnedCache.erase(it);
+        host_free(ptr);
+      }
+      ptr = ::pinned_malloc_(func, file, line, nbytes);
+    }
+    pinnedSize[ptr] = nbytes;
+    return ptr;
+  }
+    
+  /** Return ptr to the pinned cache for later reuse when the pinned pool is
+      enabled; otherwise free it immediately via host_free_(). */
+  void pinned_free_(const char *func, const char *file, int line, void *ptr) {
+    if (!pinned_memory_pool) { host_free_(func, file, line, ptr); return; }
+    auto it = pinnedSize.find(ptr);
+    if (it == pinnedSize.end()) { // not an active pool allocation
+      printf("dsaX Error: Attempt to free invalid pointer (%s:%d in %s())\n", file, line, func);
+      exit(EXIT_FAILURE); // was exit(0), which reported success on an error
+    }
+    pinnedCache.insert(std::make_pair(it->second, ptr));
+    pinnedSize.erase(it);
+  }
+
+  /** Pooled device allocation: reuse the smallest cached buffer of at least
+      nbytes; otherwise allocate anew, first releasing the smallest cached
+      buffer if the cache is non-empty.  Returns the buffer. */
+  void *device_malloc_(const char *func, const char *file, int line, size_t nbytes) {
+    // Global functions are named with "::": unqualified, device_malloc_ is this
+    // function (endless recursion) and device_free_ may bind to the pooled one.
+    if (!device_memory_pool) return ::device_malloc_(func, file, line, nbytes);
+    void *ptr = nullptr;
+    auto it = deviceCache.lower_bound(nbytes);
+    if (it != deviceCache.end()) { // sufficiently large allocation found
+      nbytes = it->first; // record the real size of the reused buffer
+      ptr = it->second;
+      deviceCache.erase(it);
+    } else {
+      if (!deviceCache.empty()) { // sacrifice the smallest cached allocation
+        it = deviceCache.begin();
+        ptr = it->second;
+        deviceCache.erase(it);
+        ::device_free_(func, file, line, ptr);
+      }
+      ptr = ::device_malloc_(func, file, line, nbytes);
+    }
+    deviceSize[ptr] = nbytes;
+    return ptr;
+  }
+
+  /**
+   * Free device memory allocated with device_pinned_malloc().  Call only
+   * through the device_pinned_free() macro (presumably defined in
+   * dsaX_malloc.h alongside device_pinned_malloc() -- TODO confirm).
+   */
+  void device_pinned_free_(const char *func, const char *file, int line, void *ptr) {
+    //DMH: I would think that we will always be using hardware with
+    //     compute >= 2.0, but this can be implemented later if needed
+    //if (!comm_peer2peer_present()) {
+    //device_free_(func, file, line, ptr);
+    //return;
+    //}
+
+    if (!ptr) {
+      printf("dsaX ERROR: Attempt to free NULL device pointer (%s:%d in %s())\n", file, line, func);
+      exit(EXIT_FAILURE); // was exit(0), which reported success on an error
+    }
+    if (!alloc[DEVICE_PINNED].count(ptr)) { // pointer is not a tracked DEVICE_PINNED allocation
+      printf("dsaX ERROR: Attempt to free invalid device pointer (%s:%d in %s())\n", file, line, func);
+      exit(EXIT_FAILURE);
+    }
+    CUresult err = cuMemFree((CUdeviceptr)ptr);
+    if (err != CUDA_SUCCESS) {
+      printf("dsaX ERROR: Failed to free device memory (%s:%d in %s())\n", file, line, func);
+      exit(EXIT_FAILURE);
+    }
+    track_free(DEVICE_PINNED, ptr);
+  }
+
+  
+  /** Return ptr to the device cache for reuse (pool enabled), or free it via ::device_free_(). */
+  void device_free_(const char *func, const char *file, int line, void *ptr) {
+    // "::" selects the global deallocator; unqualified, the call below would be this function.
+    if (!device_memory_pool) { ::device_free_(func, file, line, ptr); return; }
+    auto it = deviceSize.find(ptr);
+    if (it == deviceSize.end()) { // not an active pool allocation
+      printf("dsaX Error: Attempt to free invalid pointer (%s:%d in %s())\n", file, line, func);
+      exit(EXIT_FAILURE); // was exit(0), which reported success on an error
+    }
+    deviceCache.insert(std::make_pair(it->second, ptr));
+    deviceSize.erase(it);
+  }
+  
+  /** Free every cached (inactive) pinned buffer with host_free() and empty the cache. */
+  void flush_pinned() {
+    if (!pinned_memory_pool) return;
+    for (const auto &entry : pinnedCache) { host_free(entry.second); }
+    pinnedCache.clear();
+  }
+  
+  /** Release every cached (inactive) device buffer with the global ::device_free_() and empty the cache. */
+  void flush_device() {
+    if (!device_memory_pool) return;
+    // Bypass the pooled mem_pool::device_free_, which rejects pointers that are only in the cache.
+    for (const auto &entry : deviceCache) { ::device_free_(__func__, __FILE__, __LINE__, entry.second); }
+    deviceCache.clear();
+  }  
+} // namespace mem_pool
diff --git a/tests/correlator_test.cpp b/tests/correlator_test.cpp
index 6f8d6df..3cdc699 100644
--- a/tests/correlator_test.cpp
+++ b/tests/correlator_test.cpp
@@ -13,6 +13,9 @@ using namespace std;
 // Include this file to access input parameters
 #include "command_line_params.h"
 
+// Include the dsaX.h header in your application
+#include <dsaX.h>
+
 // Include this file to access test utilities
 /**
  * Promote complex char riri... data to planar half rr.. ii.. 
@@ -110,9 +113,7 @@ template <typename prec> prec test_hermiticity(const prec *C, const int m, const
   return frob_norm/(m*n*2);
 }
 
-// Include the dsaX.h header in your application
-#include <dsaX.h>
-
+/*
 // The class offers entire file content read/write in single operation
 class BinaryFileVector : public vector<char>
 {
@@ -180,7 +181,7 @@ class BinaryFileVector : public vector<char>
     }
   }
 };
-
+*/
 int main(int argc, char **argv) {
 
   // Parse command line
@@ -297,7 +298,6 @@ int main(int argc, char **argv) {
 
   float frob_norm = test_hermiticity((float*)output_data, 96, 96);
   cout << "Frobenius norm = " << frob_norm << endl;
-
   
   //cout << "Output peek " << endl;
   float *p = (float*)output_data;