diff --git a/util/tuner/GPU_Microbenchmark/.gitignore b/util/tuner/GPU_Microbenchmark/.gitignore
deleted file mode 100644
index 8200ece07..000000000
--- a/util/tuner/GPU_Microbenchmark/.gitignore
+++ /dev/null
@@ -1,60 +0,0 @@
-bin/
-*.csv
-ubench/atomics/Atomic_add_bw/atomic_add_bw
-ubench/atomics/Atomic_add_bw_conflict/atomic_add_bw_conflict
-ubench/atomics/Atomic_add_lat/atomic_add_lat
-ubench/core/MaxFlops_double/MaxFlops_double
-ubench/core/MaxFlops_float/MaxFlops_float
-ubench/core/MaxFlops_half/MaxFlops_half
-ubench/core/MaxFlops_int32/MaxFlops_int32
-ubench/core/config_dpu/config_dpu
-ubench/core/config_fpu/config_fpu
-ubench/core/config_int/config_int
-ubench/core/config_sfu/config_sfu
-ubench/core/config_tensor/config_tensor
-ubench/core/config_udp/config_udp
-ubench/core/core_config/core_config
-ubench/core/lat_double/lat_double
-ubench/core/lat_float/lat_float
-ubench/core/lat_half/lat_half
-ubench/core/lat_int32/lat_int32
-ubench/core/regfile_bw/regfile_bw
-ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt
-ubench/core/sfu_lat_fsqrt/sfu_lat_fsqrt
-ubench/core/tensor_bw_half/tensor_bw_half
-ubench/core/tensor_lat_half/tensor_lat_half
-ubench/l1_cache/l1_access_grain/l1_access_grain
-ubench/l1_cache/l1_adaptive/l1_adaptive
-ubench/l1_cache/l1_associativity/l1_associativity
-ubench/l1_cache/l1_banks/l1_banks
-ubench/l1_cache/l1_bw_128/l1_bw_128
-ubench/l1_cache/l1_bw_32f/l1_bw_32f
-ubench/l1_cache/l1_bw_32f_unroll/l1_bw_32f_unroll
-ubench/l1_cache/l1_bw_64f/l1_bw_64f
-ubench/l1_cache/l1_bw_64v/l1_bw_64v
-ubench/l1_cache/l1_config/l1_config
-ubench/l1_cache/l1_lat/l1_lat
-ubench/l1_cache/l1_mshr/l1_mshr
-ubench/l1_cache/l1_sector/l1_sector
-ubench/l1_cache/l1_shared_bw/l1_shared_bw
-ubench/l1_cache/l1_write_policy/l1_write_policy
-ubench/l2_cache/l2_access_grain/l2_access_grain
-ubench/l2_cache/l2_bw_128/l2_bw_128
-ubench/l2_cache/l2_bw_32f/l2_bw_32f
-ubench/l2_cache/l2_bw_64f/l2_bw_64f
-ubench/l2_cache/l2_config/l2_config
-ubench/l2_cache/l2_copy_engine/l2_copy_engine
-ubench/l2_cache/l2_lat/l2_lat
-ubench/l2_cache/l2_write_policy/l2_write_policy
-ubench/mem/mem_atom_size/mem_atom_size
-ubench/mem/mem_bw/mem_bw
-ubench/mem/mem_config/mem_config
-ubench/mem/mem_lat/mem_lat
-ubench/shd/shared_bw/shared_bw
-ubench/shd/shared_bw_64/shared_bw_64
-ubench/shd/shared_lat/shared_lat
-ubench/shd/shd_config/shd_config
-ubench/system/deviceQuery/deviceQuery
-ubench/system/kernel_lat/kernel_lat
-ubench/system/system_config/system_config
-ubench/system/list_devices/list_devices
diff --git a/util/tuner/GPU_Microbenchmark/Makefile b/util/tuner/GPU_Microbenchmark/Makefile
deleted file mode 100755
index 5f901780d..000000000
--- a/util/tuner/GPU_Microbenchmark/Makefile
+++ /dev/null
@@ -1,22 +0,0 @@
-
-BASE_DIR := $(shell pwd)
-BIN_DIR := $(BASE_DIR)/bin
-SUB_DIRS        = $(wildcard ubench/*/*/)
-SUB_DIRS_ALL    = $(SUB_DIRS:%=all-%)
-SUB_DIRS_CLEAN  = $(SUB_DIRS:%=clean-%)
-
-all: create_dir $(SUB_DIRS_ALL)
-
-clean: delete_dir $(SUB_DIRS_CLEAN)
-
-$(SUB_DIRS_ALL):
-	$(MAKE) $(MAKE_FLAGS) -C $(@:all-%=%)
-
-$(SUB_DIRS_CLEAN):
-	$(MAKE) $(MAKE_FLAGS) -C $(@:clean-%=%) clean
-
-create_dir:
-	mkdir -p $(BIN_DIR)
-
-delete_dir:
-	cd $(BIN_DIR); rm -f *
diff --git a/util/tuner/GPU_Microbenchmark/README.md b/util/tuner/GPU_Microbenchmark/README.md
deleted file mode 100644
index 52c39a2a3..000000000
--- a/util/tuner/GPU_Microbenchmark/README.md
+++ /dev/null
@@ -1 +0,0 @@
-# GPU_Microbenchmark
diff --git a/util/tuner/GPU_Microbenchmark/common/common.mk b/util/tuner/GPU_Microbenchmark/common/common.mk
deleted file mode 100644
index 6c90a3f67..000000000
--- a/util/tuner/GPU_Microbenchmark/common/common.mk
+++ /dev/null
@@ -1,48 +0,0 @@
-BASE_DIR := $(shell pwd)
-BIN_DIR := $(BASE_DIR)/../../../bin/
-
-GENCODE_SM30 ?= -gencode=arch=compute_30,code=\"sm_30,compute_30\"
-GENCODE_SM35 ?= -gencode=arch=compute_35,code=\"sm_35,compute_35\"
-GENCODE_SM50 ?= -gencode=arch=compute_50,code=\"sm_50,compute_50\"
-GENCODE_SM60 ?= -gencode=arch=compute_60,code=\"sm_60,compute_60\"
-GENCODE_SM62 ?= -gencode=arch=compute_62,code=\"sm_62,compute_62\"
-GENCODE_SM70 ?= -gencode=arch=compute_70,code=\"sm_70,compute_70\"
-GENCODE_SM75 ?= -gencode=arch=compute_75,code=\"sm_75,compute_75\"
-GENCODE_SM80 ?= -gencode=arch=compute_80,code=\"sm_80,compute_80\"
-GENCODE_SM86 ?= -gencode=arch=compute_86,code=\"sm_86,compute_86\"
-
-CUOPTS =  $(GENCODE_ARCH) $(GENCODE_SM50) $(GENCODE_SM60) $(GENCODE_SM62) $(GENCODE_SM70) $(GENCODE_SM75) $(GENCODE_SM80)
-
-CC := nvcc
-
-# CUDA_PATH ?= /use/local/cuda-10.1/
-INCLUDE := $(GPUAPPS_ROOT)/src/cuda/cuda-samples/Common/
-LIB :=
-
-release:
-	$(CC) $(NVCC_FLGAS) $(CUOPTS) $(SRC) -o $(EXE) -I$(INCLUDE) -L$(LIB) -lcudart
-	cp $(EXE) $(BIN_DIR)
-
-clean:
-	rm -f *.o; rm -f $(EXE)
-
-run:
-	./$(EXE)
-
-profile:
-	nvprof ./$(EXE)
-
-events:
-	nvprof  --events elapsed_cycles_sm ./$(EXE)
-
-profileall:
-	nvprof --concurrent-kernels off --print-gpu-trace -u us --metrics all --demangling off --csv --log-file data.csv ./$(EXE)
-
-nvsight:
-	nv-nsight-cu-cli --metrics gpc__cycles_elapsed.avg,sm__cycles_elapsed.sum,smsp__inst_executed.sum,sm__warps_active.avg.pct_of_peak_sustained_active,l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum,lts__t_sectors_srcunit_tex_op_read.sum,lts__t_sectors_srcunit_tex_op_write.sum,lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum,lts__t_sectors_srcunit_tex_op_write_lookup_hit.sum,lts__t_sector_op_read_hit_rate.pct,lts__t_sector_op_write_hit_rate.pct,lts__t_sectors_srcunit_tex_op_read.sum.per_second,dram__sectors_read.sum,dram__sectors_write.sum,dram__bytes_read.sum  --csv --page raw ./$(EXE) | tee nsight.csv
-
-ptx:
-	cuobjdump -ptx ./$(EXE)  tee ptx.txt
-
-sass:
-	cuobjdump -sass ./$(EXE)  tee sass.txt
diff --git a/util/tuner/GPU_Microbenchmark/format-code.sh b/util/tuner/GPU_Microbenchmark/format-code.sh
deleted file mode 100755
index f06cc7629..000000000
--- a/util/tuner/GPU_Microbenchmark/format-code.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#! /bin/sh
-
-THIS_DIR="$( cd "$( dirname "$BASH_SOURCE" )" && pwd )"
-clang-format -i ${THIS_DIR}/ubench/*/*/*.cu
-clang-format -i ${THIS_DIR}/ubench/*/*/*.h
-clang-format -i ${THIS_DIR}/ubench/*/*/*.cpp
-clang-format -i ${THIS_DIR}/hw_def/*/*.h
-clang-format -i ${THIS_DIR}/hw_def/common/*.h
diff --git a/util/tuner/GPU_Microbenchmark/hw_def/ampere_A100_hw_def.h b/util/tuner/GPU_Microbenchmark/hw_def/ampere_A100_hw_def.h
deleted file mode 100644
index fc95b6dc2..000000000
--- a/util/tuner/GPU_Microbenchmark/hw_def/ampere_A100_hw_def.h
+++ /dev/null
@@ -1,33 +0,0 @@
-// These are the configration parameters that can be found publicly
-// Sources:
-// https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf
-// https://en.wikipedia.org/wiki/GeForce_30_series
-// https://en.wikipedia.org/wiki/CUDA
-
-#ifndef AMPERE_RTX3070_DEF_H
-#define AMPERE_RTX3070_DEF_H
-
-#include "./common/common.h"
-#include "./common/deviceQuery.h"
-
-#define L1_SIZE (192 * 1024) // Max L1 size in bytes
-
-#define CLK_FREQUENCY 1410 // frequency in MHz
-
-#define ISSUE_MODEL issue_model::single // single issue core or dual issue
-#define CORE_MODEL core_model::subcore  // subcore model or shared model
-#define DRAM_MODEL dram_model::HBM    // memory type
-#define WARP_SCHEDS_PER_SM 4            // number of warp schedulers per SM
-
-// number of SASS HMMA per 16x16 PTX WMMA for FP16 - FP32 accumlate operation
-// see slide 22 at
-// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf
-#define SASS_hmma_per_PTX_wmma 2
-
-// These vars are almost constant between HW generation
-// see slide 24 from Nvidia at
-// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf
-#define L2_BANKS_PER_MEM_CHANNEL 2
-#define L2_BANK_WIDTH_in_BYTE 32
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/hw_def/ampere_RTX3070_hw_def.h b/util/tuner/GPU_Microbenchmark/hw_def/ampere_RTX3070_hw_def.h
deleted file mode 100644
index 133711416..000000000
--- a/util/tuner/GPU_Microbenchmark/hw_def/ampere_RTX3070_hw_def.h
+++ /dev/null
@@ -1,33 +0,0 @@
-// These are the configration parameters that can be found publicly
-// Sources:
-// https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf
-// https://en.wikipedia.org/wiki/GeForce_30_series
-// https://en.wikipedia.org/wiki/CUDA
-
-#ifndef AMPERE_RTX3070_DEF_H
-#define AMPERE_RTX3070_DEF_H
-
-#include "./common/common.h"
-#include "./common/deviceQuery.h"
-
-#define L1_SIZE (128 * 1024) // Max L1 size in bytes
-
-#define CLK_FREQUENCY 1132 // frequency in MHz
-
-#define ISSUE_MODEL issue_model::single // single issue core or dual issue
-#define CORE_MODEL core_model::subcore  // subcore model or shared model
-#define DRAM_MODEL dram_model::GDDR6    // memory type
-#define WARP_SCHEDS_PER_SM 4            // number of warp schedulers per SM
-
-// number of SASS HMMA per 16x16 PTX WMMA for FP16 - FP32 accumlate operation
-// see slide 22 at
-// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf
-#define SASS_hmma_per_PTX_wmma 2
-
-// These vars are almost constant between HW generation
-// see slide 24 from Nvidia at
-// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf
-#define L2_BANKS_PER_MEM_CHANNEL 2
-#define L2_BANK_WIDTH_in_BYTE 32
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/hw_def/common/common.h b/util/tuner/GPU_Microbenchmark/hw_def/common/common.h
deleted file mode 100644
index bd07f5c6c..000000000
--- a/util/tuner/GPU_Microbenchmark/hw_def/common/common.h
+++ /dev/null
@@ -1,153 +0,0 @@
-#ifndef COMMON_H
-#define COMMON_H
-
-#include <assert.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#define ACCEL_SIM_MODE 1
-
-enum issue_model { single = 1, dual = 2 };
-
-static const char *issue_model_str[] = {"none", "single", "dual"};
-
-enum core_model { shared = 0, subcore = 1 };
-
-static const char *core_model_str[] = {"none", "shared", "subcore"};
-
-enum dram_model { GDDR5 = 1, GDDR5X = 2, GDDR6 = 3, HBM = 4 };
-
-// GPU error check
-#define gpuErrchk(ans)                                                         \
-  { gpuAssert((ans), __FILE__, __LINE__); }
-inline void gpuAssert(cudaError_t code, const char *file, int line,
-                      bool abort = true) {
-  if (code != cudaSuccess) {
-    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
-            line);
-    if (abort)
-      exit(code);
-  }
-}
-
-// source:
-// https://stackoverflow.com/questions/466204/rounding-up-to-next-power-of-2
-unsigned round_up_2n(unsigned v) {
-  v--;
-  v |= v >> 1;
-  v |= v >> 2;
-  v |= v >> 4;
-  v |= v >> 8;
-  v |= v >> 16;
-  v++;
-
-  return v;
-}
-
-unsigned round_up_2n(float n) { return round_up_2n((unsigned)ceil(n)); }
-
-bool isPowerOfTwo(int n) {
-  if (n == 0)
-    return false;
-
-  return (ceil(log2(n)) == floor(log2(n)));
-}
-
-static const char *dram_model_str[] = {"none", "GDDR5", "GDDR5X", "GDDR6",
-                                       "HBM"};
-static const unsigned dram_model_bus_width[] = {0, 32, 32, 16, 128}; // in bits
-static const unsigned dram_model_mem_per_ctrlr[] = {0, 1, 1, 1, 1};
-static const unsigned dram_model_burst_length[] = {0, 8, 8, 16, 2};
-static const unsigned dram_model_freq_ratio[] = {0, 4, 4, 4, 2};
-// atom size =
-// dram_model_channel_width*dram_model_mem_per_ctrlr*dram_model_burst_length
-unsigned get_atom_size_inByte(enum dram_model model) {
-  return (dram_model_bus_width[model] / 8) * dram_model_mem_per_ctrlr[model] *
-         dram_model_burst_length[model];
-}
-// CCD = dram_model_burst_length/dram_model_freq_ratio
-unsigned get_adjusted_CCD(enum dram_model model) {
-  assert(dram_model_burst_length[model] % dram_model_freq_ratio[model] == 0);
-  return dram_model_burst_length[model] / dram_model_freq_ratio[model];
-}
-
-unsigned get_num_channels(unsigned total_memory_width, enum dram_model model) {
-  unsigned channel_width =
-      dram_model_bus_width[model] * dram_model_mem_per_ctrlr[model];
-  assert(total_memory_width % channel_width == 0);
-  return total_memory_width / channel_width;
-}
-
-// DDR timing struct
-struct DDR_Timing {
-  unsigned freq;
-  unsigned nbk;
-  unsigned CCD;
-  unsigned RRD;
-  unsigned RCD;
-  unsigned RAS;
-  unsigned RP;
-  unsigned RC;
-  unsigned CL;
-  unsigned WL;
-  unsigned CDLR;
-  unsigned WR;
-  unsigned nbkgrp;
-  unsigned CCDL;
-  unsigned RTPL;
-
-  DDR_Timing(unsigned mfreq, unsigned n_bk, unsigned tCCD, unsigned tRRD,
-             unsigned tRCD, unsigned tRAS, unsigned tRP, unsigned tRC,
-             unsigned tCL, unsigned tWL, unsigned tCDLR, unsigned tWR,
-             unsigned n_bkgrp, unsigned tCCDL, unsigned tRTPL) {
-    freq = mfreq;
-    nbk = n_bk;
-    CCD = tCCD;
-    RRD = tRRD;
-    RCD = tRCD;
-    RAS = tRAS;
-    RP = tRP;
-    RC = tRC;
-    CL = tCL;
-    WL = tWL;
-    CDLR = tCDLR;
-    WR = tWR;
-    nbkgrp = n_bkgrp;
-    CCDL = tCCDL;
-    RTPL = tRTPL;
-  }
-
-  void scale_timing_for_new_freq(float newfreq) {
-    float freq_scale = freq / newfreq;
-    RRD = ceil(RRD / freq_scale);
-    RCD = ceil(RCD / freq_scale);
-    RAS = ceil(RAS / freq_scale);
-    RP = ceil(RP / freq_scale);
-    RC = ceil(RC / freq_scale);
-    CL = ceil(CL / freq_scale);
-    WL = ceil(WL / freq_scale);
-    CDLR = ceil(CDLR / freq_scale);
-    WR = ceil(WR / freq_scale);
-    CCDL = ceil(CCDL / freq_scale);
-    RTPL = ceil(RTPL / freq_scale);
-  }
-};
-
-// GDDR5 timing from hynix H5GQ1H24AFR
-//-gpgpu_dram_timing_opt "nbk=16:CCD=2:RRD=6:RCD=12:RAS=28:RP=12:RC=40:
-//                        CL=12:WL=4:CDLR=5:WR=12:nbkgrp=4:CCDL=3:RTPL=2"
-
-static const DDR_Timing GDDR5_Timing_1800MHZ(1800, 16, 2, 6, 12, 28, 12, 40, 12,
-                                             4, 5, 12, 4, 3, 2);
-
-// HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017
-// paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
-// Timing for 1 GHZ:
-//-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
-//                        CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"
-
-static const DDR_Timing HBM_Timing_1000MHZ(1000, 16, 1, 4, 14, 33, 14, 47, 14,
-                                           2, 3, 12, 4, 2, 4);
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/hw_def/common/deviceQuery.h b/util/tuner/GPU_Microbenchmark/hw_def/common/deviceQuery.h
deleted file mode 100644
index 56070e74f..000000000
--- a/util/tuner/GPU_Microbenchmark/hw_def/common/deviceQuery.h
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef DEVICE_QUERY_H
-#define DEVICE_QUERY_H
-
-#include <cuda_runtime.h>
-
-unsigned SM_NUMBER;           // number of SMs
-unsigned WARP_SIZE;           // max threads per warp
-unsigned MAX_THREADS_PER_SM;  // max threads / sm
-unsigned MAX_SHARED_MEM_SIZE; // Max configerable shared memory size in bytes
-unsigned MAX_WARPS_PER_SM;    // max warps / sm
-unsigned MAX_REG_PER_SM;      // max warps / sm
-
-unsigned MAX_THREAD_BLOCK_SIZE;         // max threads per threadblock
-unsigned MAX_SHARED_MEM_SIZE_PER_BLOCK; // Max configerable shared memory size
-                                        // per block in bytes
-unsigned
-    MAX_REG_PER_BLOCK; // Max configerable shared memory size per block in bytes
-
-size_t L2_SIZE; // L2 size in bytes
-
-size_t MEM_SIZE;            // Memory size in bytes
-unsigned MEM_CLK_FREQUENCY; // Memory clock freq in MHZ
-unsigned MEM_BITWIDTH;      // Memory bit width
-
-// launched threadblocks
-unsigned THREADS_PER_BLOCK;
-unsigned BLOCKS_PER_SM;
-unsigned THREADS_PER_SM;
-unsigned BLOCKS_NUM;
-unsigned TOTAL_THREADS;
-
-cudaDeviceProp deviceProp;
-
-unsigned intilizeDeviceProp(unsigned deviceID) {
-  cudaSetDevice(deviceID);
-  cudaGetDeviceProperties(&deviceProp, deviceID);
-
-  // core stats
-  SM_NUMBER = deviceProp.multiProcessorCount;
-  MAX_THREADS_PER_SM = deviceProp.maxThreadsPerMultiProcessor;
-  MAX_SHARED_MEM_SIZE = deviceProp.sharedMemPerMultiprocessor;
-  WARP_SIZE = deviceProp.warpSize;
-  MAX_WARPS_PER_SM =
-      deviceProp.maxThreadsPerMultiProcessor / deviceProp.warpSize;
-  MAX_REG_PER_SM = deviceProp.regsPerMultiprocessor;
-
-  // threadblock stats
-  MAX_THREAD_BLOCK_SIZE = deviceProp.maxThreadsPerBlock;
-  MAX_SHARED_MEM_SIZE_PER_BLOCK = deviceProp.sharedMemPerBlock;
-  MAX_REG_PER_BLOCK = deviceProp.regsPerBlock;
-
-  // launched thread blocks to ensure GPU is fully occupied as much as possible
-  THREADS_PER_BLOCK = deviceProp.maxThreadsPerBlock;
-  BLOCKS_PER_SM =
-      deviceProp.maxThreadsPerMultiProcessor / deviceProp.maxThreadsPerBlock;
-  THREADS_PER_SM = BLOCKS_PER_SM * THREADS_PER_BLOCK;
-  BLOCKS_NUM = BLOCKS_PER_SM * SM_NUMBER;
-  TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM;
-
-  // L2 cache
-  L2_SIZE = deviceProp.l2CacheSize;
-
-  // memory
-  MEM_SIZE = deviceProp.totalGlobalMem;
-  MEM_CLK_FREQUENCY = deviceProp.memoryClockRate * 1e-3f;
-  MEM_BITWIDTH = deviceProp.memoryBusWidth;
-
-  return 1;
-}
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/hw_def/hw_def.h b/util/tuner/GPU_Microbenchmark/hw_def/hw_def.h
deleted file mode 100644
index 1fd2087c0..000000000
--- a/util/tuner/GPU_Microbenchmark/hw_def/hw_def.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef HW_DEF_H
-#define HW_DEF_H
-
-//#include "kepler_TITAN_hw_def.h"
-
-//#include "pascal_TITANX_hw_def.h"
-
-//#include "volta_QV100_hw_def.h"
-
-//#include "turing_RTX2060_hw_def.h"
-
-//#include "ampere_RTX3070_hw_def.h"
-
-// #include "volta_TITANV_hw_def.h"
-
-#include "ampere_A100_hw_def.h"
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/hw_def/kepler_TITAN_hw_def.h b/util/tuner/GPU_Microbenchmark/hw_def/kepler_TITAN_hw_def.h
deleted file mode 100644
index 1f4c6212e..000000000
--- a/util/tuner/GPU_Microbenchmark/hw_def/kepler_TITAN_hw_def.h
+++ /dev/null
@@ -1,25 +0,0 @@
-// Kepler TITAN HW def file
-#ifndef KEPLER_TITAN_DEF_H
-#define KEPLER_TITAN_DEF_H
-
-#include "./common/common.h"
-#include "./common/deviceQuery.h"
-
-#define L1_SIZE (64 * 1024) // Max L1 size in bytes, when enabled
-
-#define CLK_FREQUENCY 837 // frequency in MHz
-
-#define ISSUE_MODEL issue_model::dual
-#define CORE_MODEL core_model::shared
-#define DRAM_MODEL dram_model::GDDR5
-
-#define WARP_SCHEDS_PER_SM 4
-
-// no tensor cores in kepler
-#define SASS_hmma_per_PTX_wmma 0
-
-// These vars are almost constant between HW generations
-#define L2_BANKS_PER_MEM_CHANNEL 2
-#define L2_BANK_WIDTH_in_BYTE 32
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/hw_def/pascal_TITANX_hw_def.h b/util/tuner/GPU_Microbenchmark/hw_def/pascal_TITANX_hw_def.h
deleted file mode 100644
index b68329a73..000000000
--- a/util/tuner/GPU_Microbenchmark/hw_def/pascal_TITANX_hw_def.h
+++ /dev/null
@@ -1,24 +0,0 @@
-// Pascal ITIANX HW def file
-#ifndef PASCAL_TITANX_DEF_H
-#define PASCAL_TITANX_DEF_H
-
-#include "./common/common.h"
-#include "./common/deviceQuery.h"
-
-#define L1_SIZE (24 * 1024) // Max L1 size in bytes, when enabled
-
-#define CLK_FREQUENCY 1417 // frequency in MHz
-
-#define ISSUE_MODEL issue_model::dual
-#define CORE_MODEL core_model::subcore
-#define DRAM_MODEL dram_model::GDDR5X
-#define WARP_SCHEDS_PER_SM 4
-
-// no tensor cores in pascal
-#define SASS_hmma_per_PTX_wmma 0
-
-// These vars are almost constant between HW generations
-#define L2_BANKS_PER_MEM_CHANNEL 2
-#define L2_BANK_WIDTH_in_BYTE 32
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/hw_def/turing_RTX2060_hw_def.h b/util/tuner/GPU_Microbenchmark/hw_def/turing_RTX2060_hw_def.h
deleted file mode 100644
index e7d728d15..000000000
--- a/util/tuner/GPU_Microbenchmark/hw_def/turing_RTX2060_hw_def.h
+++ /dev/null
@@ -1,27 +0,0 @@
-// These are the configration parameters that can be found publicly sources
-// Turing HW def file
-#ifndef TURING_RTX2070_DEF_H
-#define TURING_RTX2070_DEF_H
-
-#include "./common/common.h"
-#include "./common/deviceQuery.h"
-
-#define L1_SIZE (64 * 1024) // Max L1 size in bytes
-
-#define CLK_FREQUENCY 1365 // frequency in MHz
-
-#define ISSUE_MODEL issue_model::single   // single issue core or dual issue
-#define CORE_MODEL core_model::subcore    // subcore model or shared model
-#define DRAM_MODEL dram_model::GDDR6      // memory type
-#define WARP_SCHEDS_PER_SM 4              // number of warp schedulers per SM
-
-// number of SASS HMMA per 16x16 PTX WMMA for FP16 - FP32 accumlate operation
-#define SASS_hmma_per_PTX_wmma 4
-
-// These vars are almost constant between HW generation
-// see slide 24 from Nvidia at
-// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf
-#define L2_BANKS_PER_MEM_CHANNEL 2
-#define L2_BANK_WIDTH_in_BYTE 32
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/hw_def/volta_QV100_hw_def.h b/util/tuner/GPU_Microbenchmark/hw_def/volta_QV100_hw_def.h
deleted file mode 100644
index c307e3266..000000000
--- a/util/tuner/GPU_Microbenchmark/hw_def/volta_QV100_hw_def.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// These are the configration parameters that can be found publicly
-// Volta QV100 HW def file (sm_70)
-// Data source:
-// https://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf
-
-#ifndef VOLTA_QV100_HW_DEF_H
-#define VOLTA_QV100_HW_DEF_H
-
-#include "./common/common.h"
-#include "./common/deviceQuery.h"
-
-#define L1_SIZE (128 * 1024) // Max L1 size in bytes
-
-#define CLK_FREQUENCY 1132 // frequency in MHz
-
-#define ISSUE_MODEL issue_model::single
-#define CORE_MODEL core_model::subcore
-#define DRAM_MODEL dram_model::HBM
-#define WARP_SCHEDS_PER_SM 4
-
-// see slide 22 at
-// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf
-// number of SASS HMMA per 16x16 PTX WMMA for FP16 operands - FP32  accumlate operation
-#define SASS_hmma_per_PTX_wmma 16
-
-// These vars are almost constant between HW generation
-// see slide 24 from Nvidia at
-// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf
-#define L2_BANKS_PER_MEM_CHANNEL 2
-#define L2_BANK_WIDTH_in_BYTE 32
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/hw_def/volta_TITANV_hw_def.h b/util/tuner/GPU_Microbenchmark/hw_def/volta_TITANV_hw_def.h
deleted file mode 100644
index 3548d66e1..000000000
--- a/util/tuner/GPU_Microbenchmark/hw_def/volta_TITANV_hw_def.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// These are the configration parameters that can be found publicly
-// Volta TITANV HW def file (sm_70)
-// Data source:
-// https://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf
-
-#ifndef VOLTA_TITANVV100_HW_DEF_H
-#define VOLTA_TITANV_HW_DEF_H
-
-#include "./common/common.h"
-#include "./common/deviceQuery.h"
-
-#define L1_SIZE (128 * 1024) // Max L1 size in bytes
-
-#define CLK_FREQUENCY 1200 // frequency in MHz
-
-#define ISSUE_MODEL issue_model::single
-#define CORE_MODEL core_model::subcore
-#define DRAM_MODEL dram_model::HBM
-#define WARP_SCHEDS_PER_SM 4
-
-// see slide 22 at
-// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf
- // number of SASS HMMA per 16x16 PTX WMMA for FP16 - FP32 accumlate operation
-#define SASS_hmma_per_PTX_wmma  16
-
-// These vars are almost constant between HW generation
-// see slide 24 from Nvidia at
-// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf
-#define L2_BANKS_PER_MEM_CHANNEL 2
-#define L2_BANK_WIDTH_in_BYTE 32
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/hw_def/volta_V100_hw_def.h b/util/tuner/GPU_Microbenchmark/hw_def/volta_V100_hw_def.h
deleted file mode 100644
index d25b24efa..000000000
--- a/util/tuner/GPU_Microbenchmark/hw_def/volta_V100_hw_def.h
+++ /dev/null
@@ -1,33 +0,0 @@
-// These are the configration parameters that can be found publicly
-// Volta QV100 HW def file (sm_70)
-// Data source:
-// https://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf
-
-#ifndef VOLTA_V100_HW_DEF_H
-#define VOLTA_V100_HW_DEF_H
-
-#include "./common/common.h"
-#include "./common/deviceQuery.h"
-
-#define L1_SIZE (128 * 1024) // Max L1 size in bytes
-
-#define CLK_FREQUENCY 1455 // frequency in MHz
-
-#define ISSUE_MODEL issue_model::single
-#define CORE_MODEL core_model::subcore
-#define DRAM_MODEL dram_model::HBM
-#define WARP_SCHEDS_PER_SM 4
-
-// see slide 22 at
-// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf
-// number of SASS HMMA per 16x16 PTX WMMA for FP16 - FP32 accumlate operation
-#define SASS_hmma_per_PTX_wmma  16
-
-// These vars are almost constant between HW generations
-// see slide 24 at
-// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf
-// each memory channel is supported by 2 L2 banks
-#define L2_BANKS_PER_MEM_CHANNEL 2
-#define L2_BANK_WIDTH_in_BYTE 32
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/output.file b/util/tuner/GPU_Microbenchmark/output.file
deleted file mode 100644
index 0a78d971a..000000000
--- a/util/tuner/GPU_Microbenchmark/output.file
+++ /dev/null
@@ -1,509 +0,0 @@
-running ./L1asso.csv microbenchmark
-/////////////////////////////////
-running ./L1line.csv microbenchmark
-/////////////////////////////////
-running ./MSHR100_array1073741824_shmem12288_itr6.csv microbenchmark
-/////////////////////////////////
-running ./MaxFlops_double microbenchmark
-DPU FLOP per SM = 63.930252 (flop/clk/SM)
-Total Clk number = 524860
-/////////////////////////////////
-running ./MaxFlops_float microbenchmark
-FLOP per SM = 126.861778 (flop/clk/SM)
-Total Clk number = 66124
-/////////////////////////////////
-running ./MaxFlops_half microbenchmark
-half FLOP per SM = 249.334442 (flop/clk/SM)
-Total Clk number = 16822
-/////////////////////////////////
-running ./MaxFlops_int32 microbenchmark
-int32 FLOP per SM = 126.886719 (flop/clk/SM)
-Total Clk number = 66111
-/////////////////////////////////
-running ./atomic_add_bw microbenchmark
-Atomic int32 bandwidth = 0.000026 (byte/clk)
-Total Clk number = 408780625932820
-/////////////////////////////////
-running ./atomic_add_bw_conflict microbenchmark
-Atomic int32 bandwidth = 0.464460 (byte/clk)
-Total Clk number = 1444878939
-/////////////////////////////////
-running ./atomic_add_lat microbenchmark
-Atomic int32 latency = 243.626953 (clk)
-Total Clk number = 249474
-/////////////////////////////////
-running ./config_dpu microbenchmark
-DPU FLOP per SM = 63.925381 (flop/clk/SM)
-Total Clk number = 524900
-double-precision DPU latency = 8.064270 (clk)
-Total Clk number = 132125
-
-//Accel_Sim config:
--gpgpu_num_dp_units 4
--ptx_opcode_latency_dp 8,8,8,8,330
--ptx_opcode_initiation_dp 4,4,4,4,130
--trace_opcode_latency_initiation_dp 8,4
-/////////////////////////////////
-running ./config_fpu microbenchmark
-FLOP per SM = 126.865616 (flop/clk/SM)
-Total Clk number = 66122
-float-precision FPU latency = 4.119690 (clk)
-Total Clk number = 67497
-
-//Accel_Sim config:
--gpgpu_num_sp_units 4
--ptx_opcode_latency_fp 4,4,4,4,39
--ptx_opcode_initiation_fp 2,2,2,2,4
--trace_opcode_latency_initiation_sp 4,2
-/////////////////////////////////
-running ./config_int microbenchmark
-int32 FLOP per SM = 126.886719 (flop/clk/SM)
-Total Clk number = 66111
-int32 latency = 4.313965 (clk)
-Total Clk number = 17670
-
-//Accel_Sim config:
--gpgpu_num_int_units 4
--ptx_opcode_latency_int 4,4,4,4,21
--ptx_opcode_initiation_int 2,2,2,2,2
--trace_opcode_latency_initiation_int 4,2
-/////////////////////////////////
-running ./config_sfu microbenchmark
-SFU fast sqrt bw = 15.9759(flops/clk/SM)
-Total Clk number = 262539
-SFU fast sqrt latency = 21.1096(clk)
-Total Clk number = 86465
-
-//Accel_Sim config:
--gpgpu_num_sfu_units 4
--ptx_opcode_latency_sfu 21
--ptx_opcode_initiation_sfu 8
--trace_opcode_latency_initiation_sfu 21,8
-/////////////////////////////////
-running ./config_tensor microbenchmark
-wmma PTX issue bandwidth = 3.73122(thread/clk/SM)
-hmma SASS issue bandwidth = 59.6994(thread/clk/SM)
-FMA tensor bandwidth = 477.596(FMA/clk/SM)
-Total Clk number = 562056
-wmma latency = 35.3401(clk)
-hmma latency = 2.20876(clk)
-Total Clk number = 144753
-
-//Accel_Sim config:
--gpgpu_tensor_core_avail 1
--gpgpu_num_tensor_core_units 4
--ptx_opcode_latency_tesnor 35
--ptx_opcode_initiation_tensor 32
--trace_opcode_latency_initiation_tensor 2,2
--specialized_unit_3 1,4,2,4,4,TENSOR
--trace_opcode_latency_initiation_spec_op_3 2,2
-/////////////////////////////////
-running ./config_udp microbenchmark
--specialized_unit_4 1,4,4,4,4,UDP
--trace_opcode_latency_initiation_spec_op_4 4,1
-/////////////////////////////////
-running ./core_config microbenchmark
-CUDA version number = 7.0
-
-//Accel_Sim config:
--gpgpu_ptx_force_max_capability 70
--gpgpu_shader_registers 65536
--gpgpu_registers_per_block 65536
--gpgpu_occupancy_sm_number 70
--gpgpu_coalesce_arch 70
--gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4
--gpgpu_sub_core_model 1
--gpgpu_enable_specialized_operand_collector 0
--gpgpu_operand_collector_num_units_gen 8
--gpgpu_operand_collector_num_in_ports_gen 8
--gpgpu_operand_collector_num_out_ports_gen 8
--gpgpu_num_sched_per_core 4
--gpgpu_max_insn_issue_per_warp 1
--gpgpu_dual_issue_diff_exec_units 1
--gpgpu_inst_fetch_throughput 4
--gpgpu_shader_core_pipeline 2048:32
--gpgpu_shader_cta 32
-/////////////////////////////////
-running ./data.csv microbenchmark
-/////////////////////////////////
-running ./deviceQuery microbenchmark
-  Device : "TITAN V"
-
-  CUDA version number                         : 7.0
-  GPU Max Clock rate                             : 1455 MHz
-  Multiprocessors Count                       : 80
-  Maximum number of threads per multiprocessor: 2048
-  CUDA Cores per multiprocessor               : 64
-  Registers per multiprocessor                : 65536
-  Shared memory per multiprocessor            : 98304 bytes
-  Warp size                                   : 32
-  Maximum number of threads per block         : 1024
-  Shared memory per block                     : 49152 bytes
-  Registers per block                         : 65536
-  globalL1CacheSupported                      : 1
-  localL1CacheSupported                       : 1
-  L2 Cache Size                             : 4 MB
-  Global memory size                        : 12 GB
-  Memory Clock rate                           : 850 Mhz
-  Memory Bus Width                            : 3072 bit
- //////////////////////////
-  Device : "GeForce RTX 2060"
-
-  CUDA version number                         : 7.5
-  GPU Max Clock rate                             : 1710 MHz
-  Multiprocessors Count                       : 30
-  Maximum number of threads per multiprocessor: 1024
-  CUDA Cores per multiprocessor               : 64
-  Registers per multiprocessor                : 65536
-  Shared memory per multiprocessor            : 65536 bytes
-  Warp size                                   : 32
-  Maximum number of threads per block         : 1024
-  Shared memory per block                     : 49152 bytes
-  Registers per block                         : 65536
-  globalL1CacheSupported                      : 1
-  localL1CacheSupported                       : 1
-  L2 Cache Size                             : 3 MB
-  Global memory size                        : 6 GB
-  Memory Clock rate                           : 7001 Mhz
-  Memory Bus Width                            : 192 bit
- //////////////////////////
-  Device : "GeForce GTX TITAN X"
-
-  CUDA version number                         : 5.2
-  GPU Max Clock rate                             : 1076 MHz
-  Multiprocessors Count                       : 24
-  Maximum number of threads per multiprocessor: 2048
-  CUDA Cores per multiprocessor               : 128
-  Registers per multiprocessor                : 65536
-  Shared memory per multiprocessor            : 98304 bytes
-  Warp size                                   : 32
-  Maximum number of threads per block         : 1024
-  Shared memory per block                     : 49152 bytes
-  Registers per block                         : 65536
-  globalL1CacheSupported                      : 1
-  localL1CacheSupported                       : 1
-  L2 Cache Size                             : 3 MB
-  Global memory size                        : 12 GB
-  Memory Clock rate                           : 3505 Mhz
-  Memory Bus Width                            : 384 bit
- //////////////////////////
-  Device : "Quadro P2200"
-
-  CUDA version number                         : 6.1
-  GPU Max Clock rate                             : 1493 MHz
-  Multiprocessors Count                       : 10
-  Maximum number of threads per multiprocessor: 2048
-  CUDA Cores per multiprocessor               : 128
-  Registers per multiprocessor                : 65536
-  Shared memory per multiprocessor            : 98304 bytes
-  Warp size                                   : 32
-  Maximum number of threads per block         : 1024
-  Shared memory per block                     : 49152 bytes
-  Registers per block                         : 65536
-  globalL1CacheSupported                      : 1
-  localL1CacheSupported                       : 1
-  L2 Cache Size                             : 1 MB
-  Global memory size                        : 5 GB
-  Memory Clock rate                           : 5005 Mhz
-  Memory Bus Width                            : 160 bit
- //////////////////////////
-/////////////////////////////////
-running ./kernel_lat microbenchmark
-Kernel Launch Latency = 7257.6 cycles
-The reported latency above can be slightly higher than real. For accurate evaultion using nvprof event, exmaple: make events ./kernel_lat
-
-//Accel_Sim config:
--gpgpu_kernel_launch_latency  7257
-/////////////////////////////////
-running ./l1_access_grain microbenchmark
-
-This benchmark measures coalescing granularity for differnet strides.
-check the nvprof or nvsight for received l1 reads and writes.
-to run the program with nsight: make nvsight ./l1_access_grain
-stats to look at: l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum & l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum
-
-/////////////////////////////////
-running ./l1_adaptive microbenchmark
-The ubench is not imepleneted yet.
-/////////////////////////////////
-running ./l1_associativity microbenchmark
-Launching L1 cache line size ubench
-Saving L1 cache line size data at L1line.csv
-Launching L1 cache assoc ubench
-Saving L1 cache assoc data at L1asso.csv
-/////////////////////////////////
-running ./l1_banks microbenchmark
-The ubench is not imepleneted yet.
-/////////////////////////////////
-running ./l1_bw_128 microbenchmark
-L1 bandwidth = 116.437(byte/clk/SM), 130.129(GB/s/SM)
-Total Clk number = 36022
-/////////////////////////////////
-running ./l1_bw_32f microbenchmark
-L1 bandwidth = 78.3484(byte/clk/SM), 87.5612(GB/s/SM)
-Total Clk number = 53534
-/////////////////////////////////
-running ./l1_bw_32f_unroll microbenchmark
-L1 bandwidth = 54.837540 (byte/clk/SM)
-Total Clk number = 76486
-/////////////////////////////////
-running ./l1_bw_64f microbenchmark
-L1 bandwidth = 122.759(byte/clk/SM), 137.194(GB/s/SM)
-Total Clk number = 34167
-/////////////////////////////////
-running ./l1_bw_64v microbenchmark
-L1 bandwidth = 113.883(byte/clk/SM), 127.274(GB/s/SM)
-Total Clk number = 18415
-/////////////////////////////////
-running ./l1_config microbenchmark
-
-//Accel_Sim config:
--gpgpu_adaptive_cache_config 1
--gpgpu_l1_banks 4
--gpgpu_cache:dl1 S:4,128,64,L:L:m:N:L,A:512:64,16:0,32
--gpgpu_gmem_skip_L1D 0
-/////////////////////////////////
-running ./l1_lat microbenchmark
-L1 Latency  =      33.7331 cycles
-Total Clk number = 1105365
-
-//Accel_Sim config:
--gpgpu_l1_latency  = 33
-/////////////////////////////////
-running ./l1_mshr microbenchmark
-Launching L1 MSHR ubench
-Saving L1 MSHR data at MSHR100_array1073741824_shmem12288_itr6.csv
-/////////////////////////////////
-running ./l1_sector microbenchmark
-Launching L1 sector ubench
-Saving L1 sector data at data.csv
-/////////////////////////////////
-running ./l1_shared_bw microbenchmark
-Shared Memory Bandwidth = 99.708586 (byte/clk/SM)
-Total Clk number = 336525
-/////////////////////////////////
-running ./l1_write_policy microbenchmark
-
-This microbenchmark detects L1 write policy.
-check the nvprof or nvsight for received l1 reads and writes to detect the policy.
-see the code comments for further details
-to run the program with nvsight: make nvsight ./l1_write_policy
-stats to look at: l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum & l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum & l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_hit.sum & l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_hit.sum
-
-/////////////////////////////////
-running ./l2_access_grain microbenchmark
-
-This benchmark measures l2 access granularity for differnet strides.
-check the nvprof or nvsight for received l2 reads and write.
-to run the program with nsight: make nvsight ./l2_access_grain
-stats to look at: lts__t_sectors_srcunit_tex_op_read.sum and lts__t_sectors_srcunit_tex_op_write.sum
-
-/////////////////////////////////
-running ./l2_bw_128 microbenchmark
-L2 bandwidth = 1365.73(byte/clk), 1526.33(GB/s)
-Max Theortical L2 bandwidth = 1536(byte/clk), 1716.61(GB/s)
-L2 BW achievable = 88.9149%
-Total Clk number = 491376
-/////////////////////////////////
-running ./l2_bw_32f microbenchmark
-L2 bandwidth = 1365.42(byte/clk), 1525.97(GB/s)
-Max Theortical L2 bandwidth = 1536(byte/clk), 1716.61(GB/s)
-L2 BW achievable = 88.8942%
-Total Clk number = 982981
-/////////////////////////////////
-running ./l2_bw_64f microbenchmark
-L2 bandwidth = 1384.53(byte/clk), 1547.33(GB/s)
-Max Theortical L2 bandwidth = 1536(byte/clk), 1716.61(GB/s)
-L2 BW achievable = 90.1385%
-Total Clk number = 1938823
-/////////////////////////////////
-running ./l2_config microbenchmark
-L2 Cache Size = 4 MB
-L2 Banks number = 48
-
-//Accel_Sim config:
--gpgpu_n_sub_partition_per_mchannel 2
--icnt_flit_size 40
--gpgpu_memory_partition_indexing 0
--gpgpu_cache:dl2 S:32,128,24,L:B:m:L:P,A:192:4,32:0,32
-/////////////////////////////////
-running ./l2_copy_engine microbenchmark
-L2 Latency no-warmp up =     213.6997 cycles
-Total Clk number = 7002512
-L2 Hit Latency =     220.1863 cycles
-Total Clk number = 7215066
-Is memcpy cached in L2? Yes, error=2.9
-
-//Accel_Sim config:
--gpgpu_perf_sim_memcpy 1
-/////////////////////////////////
-running ./l2_lat microbenchmark
-L2 Hit Latency =     211.0720 cycles
-Total Clk number = 6916406
-L1 Latency  =      33.7729 cycles
-Total Clk number = 1106672
-
-//Accel_Sim config:
--gpgpu_l2_rop_latency 177
-/////////////////////////////////
-running ./l2_write_policy microbenchmark
-
-This microbenchmark detects L1 write policy.
-check the nvprof or nvsight for received l1 reads and writes to detect the policy.
-see the code comments for further details
-to run the program with nvsight: make nvsight ./l1_write_policy
-stats to look at: llts__t_sectors_srcunit_tex_op_read.sum & lts__t_sectors_srcunit_tex_op_write.sum & lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum & lts__t_sectors_srcunit_tex_op_write_lookup_hit.sum
-
-/////////////////////////////////
-running ./lat_double microbenchmark
-double-precision DPU latency = 8.075317 (clk)
-Total Clk number = 132306
-/////////////////////////////////
-running ./lat_float microbenchmark
-float-precision FPU latency = 4.128845 (clk)
-Total Clk number = 67647
-/////////////////////////////////
-running ./lat_half microbenchmark
-fpu16 latency = 6.180664 (clk)
-Total Clk number = 25316
-/////////////////////////////////
-running ./lat_int32 microbenchmark
-int32 latency = 4.349609 (clk)
-Total Clk number = 17816
-/////////////////////////////////
-running ./list_devices microbenchmark
-
-Device 0: "TITAN V sm_7.0"
-
-Device 1: "GeForce RTX 2060 sm_7.5"
-
-Device 2: "GeForce GTX TITAN X sm_5.2"
-
-Device 3: "Quadro P2200 sm_6.1"
-/////////////////////////////////
-running ./mem_atom_size microbenchmark
-
-This benchmark measures mem atom size granularity
-check the nvprof or nvsight for received mem reads and writes
-to run the program with nsight: make nvsight ./l2_access_grain
-stats to look at: dram__sectors_read.sum & dram__sectors_write.sum & dram__bytes_read.sum & dram__sectors_read.sum
-
-we launched 2359296 read memory reqs (1 req per thread) with a stride of 32 (128 bytes)
-if the number of memory reads is the same as read reqs, then mem atom size is 32B
-if the number of memory reads is 2X issued read reqs, then mem atom size is 64B, etc.
-
-/////////////////////////////////
-running ./mem_bw microbenchmark
-Mem BW= 445.770477 (Byte/Clk)
-Mem BW= 521.045920 (GB/sec)
-Max Theortical Mem BW= 652.799988 (GB/sec)
-Mem Efficiency = 68.285919 %
-Total Clk number = 127023
-/////////////////////////////////
-running ./mem_config microbenchmark
-Global memory size = 12 GB
-Memory Clock rate = 850 Mhz
-Memory Bus Width = 3072 bit
-Memory type = HBM
-Memory channels = 24
-
-//Accel_Sim config:
--gpgpu_n_mem 24
--gpgpu_n_mem_per_ctrlr 1
--gpgpu_dram_buswidth 16
--gpgpu_dram_burst_length 2
--dram_data_command_freq_ratio 2
--dram_dual_bus_interface 1
--gpgpu_dram_timing_opt nbk=16:CCD=1:RRD=4:RCD=12:RAS=29:RP=12:RC=40:CL=12:WL=2:CDLR=3:WR=11:nbkgrp=4:CCDL=2:RTPL=4
-/////////////////////////////////
-running ./mem_lat microbenchmark
-Mem latency =     313.4630 cycles
-Total Clk number = 2567889
-L2 Hit Latency =     209.9695 cycles
-Total Clk number = 6880281
-
-//Accel_Sim config:
--dram_latency 104
-/////////////////////////////////
-running ./regfile_bw microbenchmark
-wmma PTX issue bandwidth = 3.73473(thread/clk/SM)
-hmma SASS issue bandwidth = 59.7557(thread/clk/SM)
-FMA tensor bandwidth = 478.046(FMA/clk/SM)
-Total Clk number = 561527
-
-regfile_bw = 2048 (byte/SM)
-
-//Accel_Sim config:
--gpgpu_num_reg_banks 16
--gpgpu_reg_file_port_throughput 2
-/////////////////////////////////
-running ./sfu_bw_fsqrt microbenchmark
-SFU fast sqrt bw = 15.976(flops/clk/SM)
-Total Clk number = 262538
-/////////////////////////////////
-running ./sfu_lat_fsqrt microbenchmark
-SFU fast sqrt latency = 21.1453(clk)
-Total Clk number = 86611
-/////////////////////////////////
-running ./shared_bw microbenchmark
-Shared Memory Bandwidth = 126.48(byte/clk/SM), 141.353(GB/s/SM)
-Total Clk number = 132647
-/////////////////////////////////
-running ./shared_bw_64 microbenchmark
-Shared Memory Bandwidth = 127.932(byte/clk/SM), 142.975(GB/s/SM)
-Total Clk number = 262283
-/////////////////////////////////
-running ./shared_lat microbenchmark
-Shared Memory Latency  = 27.010254 cycles
-Total Clk number = 55317
-
-//Accel_Sim config:
--gpgpu_smem_latency 27
-/////////////////////////////////
-running ./shd_config microbenchmark
-Shared memory per multiprocessor = 98304 bytes
-Shared memory per block = 49152 bytes
-
-//Accel_Sim config:
--gpgpu_shmem_size 98304
--gpgpu_shmem_sizeDefault 98304
--gpgpu_shmem_per_block 49152
-/////////////////////////////////
-running ./system_config microbenchmark
-Device Name = TITAN V
-GPU Max Clock rate = 1455 MHz
-GPU Base Clock rate = 1200 MHz
-SM Count : 80
-CUDA version number = 7.0
-
-//Accel_Sim config:
--gpgpu_compute_capability_major 7
--gpgpu_compute_capability_minor 0
--gpgpu_n_clusters 80
--gpgpu_n_cores_per_cluster 1
--gpgpu_clock_domains 1200:1200:1200:850
-/////////////////////////////////
-running ./tensor_bw_half microbenchmark
-FP16 operand, FP32 accumalte:
-wmma PTX issue bandwidth = 3.74006(thread/clk/SM)
-hmma SASS issue bandwidth = 59.8409(thread/clk/SM)
-FMA tensor bandwidth = 478.728(FMA/clk/SM)
-Total Clk number = 560727
-
-FP16 operand, FP16 accumalte:
-wmma PTX issue bandwidth = 3.97989(thread/clk/SM)
-hmma SASS issue bandwidth = 63.6783(thread/clk/SM)
-FMA tensor bandwidth = 509.426(FMA/clk/SM)
-Total Clk number = 526937
-/////////////////////////////////
-running ./tensor_lat_half microbenchmark
-FP16 operand, FP32 accumalte:
-wmma latency = 35.3523(clk)
-hmma latency = 2.20952(clk)
-Total Clk number = 144803
-
-FP16 operand, FP16 accumalte:
-wmma latency = 33.0029(clk)
-hmma latency = 2.06268(clk)
-Total Clk number = 135180
-/////////////////////////////////
diff --git a/util/tuner/GPU_Microbenchmark/run_all.sh b/util/tuner/GPU_Microbenchmark/run_all.sh
deleted file mode 100755
index 40a924e5b..000000000
--- a/util/tuner/GPU_Microbenchmark/run_all.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#! /bin/sh
-
-THIS_DIR="$( cd "$( dirname "$BASH_SOURCE" )" && pwd )"
-SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
-
-cd ${SCRIPT_DIR}/bin/
-for f in ./*; do
-    echo "running $f microbenchmark"
-    $f
-    echo "/////////////////////////////////"
-done
diff --git a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/Makefile b/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/Makefile
deleted file mode 100644
index 032a57e18..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-
-SRC = atomic_add_bw.cu
-
-EXE = atomic_add_bw
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/atomic_add_bw.cu b/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/atomic_add_bw.cu
deleted file mode 100644
index eb26a7e68..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/atomic_add_bw.cu
+++ /dev/null
@@ -1,93 +0,0 @@
-#include <algorithm>
-#include <cuda.h>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_TIMES 2048
-
-template <class T>
-__global__ void atomic_bw(uint64_t *startClk, uint64_t *stopClk, T *data1,
-                          T *res) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  // register T s1 = data1[gid];
-  // register T s2 = data2[gid];
-  // register T result = 0;
-  // synchronize all threads
-  // int32_t res0, res1, res2, res3, res4, res5, res6, res7, res8, res9, res10,
-  // res11, res12, res13, res14, res15;
-  int32_t sum;
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint64_t start = clock64();
-
-  for (uint32_t i = 0; i < REPEAT_TIMES; i++) {
-    sum = sum + atomicAdd(&data1[(i * warpSize) + gid], 10);
-  }
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint64_t stop = clock64();
-
-  // write time and data back to memory
-  startClk[gid] = start;
-  stopClk[gid] = stop;
-  res[gid] = sum;
-}
-
-int main() {
-
-  intilizeDeviceProp(0);
-  unsigned ARRAY_SIZE = TOTAL_THREADS + (REPEAT_TIMES * WARP_SIZE);
-
-  uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-
-  int32_t *res = (int32_t *)malloc(TOTAL_THREADS * sizeof(int32_t));
-  int32_t *data1 = (int32_t *)malloc(ARRAY_SIZE * sizeof(int32_t));
-
-  uint64_t *startClk_g;
-  uint64_t *stopClk_g;
-  int32_t *data1_g;
-  int32_t *res_g;
-
-  for (uint32_t i = 0; i < ARRAY_SIZE; i++) {
-    data1[i] = (int32_t)i;
-  }
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&data1_g, ARRAY_SIZE * sizeof(int32_t)));
-  gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(int32_t)));
-
-  gpuErrchk(cudaMemcpy(data1_g, data1, ARRAY_SIZE * sizeof(int32_t),
-                       cudaMemcpyHostToDevice));
-
-  atomic_bw<int32_t><<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g,
-                                                        data1_g, res_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(int32_t),
-                       cudaMemcpyDeviceToHost));
-
-  float bw;
-  uint64_t total_time =
-      *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) -
-      *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]);
-  // uint64_t total_time = stopClk[0]-startClk[0];
-
-  bw = (((float)REPEAT_TIMES * (float)TOTAL_THREADS * 4 * 8) /
-        (float)(total_time));
-  printf("Atomic int32 bandwidth = %f (byte/clk)\n", bw);
-  printf("Total Clk number = %ld \n", total_time);
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/Makefile b/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/Makefile
deleted file mode 100644
index b85a3d827..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-
-SRC = atomic_add_bw_conflict.cu
-
-EXE = atomic_add_bw_conflict
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/atomic_add_bw_conflict.cu b/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/atomic_add_bw_conflict.cu
deleted file mode 100644
index 193c16b9c..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/atomic_add_bw_conflict.cu
+++ /dev/null
@@ -1,85 +0,0 @@
-#include <algorithm>
-#include <cuda.h>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_TIMES 1024
-
-template <class T>
-__global__ void atomic_bw(uint32_t *startClk, uint32_t *stopClk, T *data1,
-                          T *res) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  uint32_t sum;
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint32_t start = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-
-  for (int j = 0; j < REPEAT_TIMES; ++j) {
-    sum = sum + atomicAdd(&data1[0], 10);
-  }
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint32_t stop = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-  // write time and data back to memory
-  startClk[gid] = start;
-  stopClk[gid] = stop;
-  res[gid] = sum;
-}
-
-int main() {
-  intilizeDeviceProp(0);
-
-  uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  int32_t *data1 = (int32_t *)malloc(TOTAL_THREADS * sizeof(int32_t));
-  int32_t *res = (int32_t *)malloc(TOTAL_THREADS * sizeof(int32_t));
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  int32_t *data1_g;
-  int32_t *res_g;
-
-  for (uint32_t i = 0; i < TOTAL_THREADS; i++) {
-    data1[i] = (int32_t)i;
-  }
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(int32_t)));
-  gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(int32_t)));
-
-  gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(int32_t),
-                       cudaMemcpyHostToDevice));
-
-  atomic_bw<int32_t><<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g,
-                                                        data1_g, res_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(int32_t),
-                       cudaMemcpyDeviceToHost));
-
-  float bw;
-  uint32_t total_time =
-      *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) -
-      *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]);
-  // uint32_t total_time = stopClk[0] - startClk[0];
-  bw = ((float)(REPEAT_TIMES * TOTAL_THREADS * 4) / (float)(total_time));
-  printf("Atomic int32 bandwidth = %f (byte/clk)\n", bw);
-  printf("Total Clk number = %u \n", total_time);
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/Makefile b/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/Makefile
deleted file mode 100644
index 27e1f984f..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-
-SRC = atomic_add_lat.cu
-
-EXE = atomic_add_lat
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/atomic_add_lat.cu b/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/atomic_add_lat.cu
deleted file mode 100644
index d31cbcb48..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/atomic_add_lat.cu
+++ /dev/null
@@ -1,86 +0,0 @@
-#include <cuda.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_TIMES 1024
-
-template <class T>
-__global__ void atmoic_latency(uint32_t *startClk, uint32_t *stopClk, T *data1,
-                               T *res) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  // register T s1 = data1[gid];
-  // register T s2 = data2[gid];
-  // register T result = 0;
-  uint32_t index = 0;
-  int32_t offset = 10;
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint32_t start = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-  for (int j = 0; j < REPEAT_TIMES; ++j) {
-    index = atomicAdd(&data1[index], offset);
-  }
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint32_t stop = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-  // write time and data back to memory
-  startClk[gid] = start;
-  stopClk[gid] = stop;
-  res[gid] = data1[0];
-}
-
-int main() {
-  intilizeDeviceProp(0);
-
-  THREADS_PER_BLOCK = 1;
-  THREADS_PER_SM = 1;
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = 1;
-
-  uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  int32_t *data1 = (int32_t *)malloc(REPEAT_TIMES * sizeof(int32_t));
-  int32_t *res = (int32_t *)malloc(TOTAL_THREADS * sizeof(int32_t));
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  int32_t *data1_g;
-  int32_t *res_g;
-
-  int32_t stride = 1;
-
-  for (int32_t i = 0; i < (REPEAT_TIMES); i++)
-    data1[i] = (i + stride) % REPEAT_TIMES;
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&data1_g, REPEAT_TIMES * sizeof(int32_t)));
-  gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(int32_t)));
-  gpuErrchk(cudaMemcpy(data1_g, data1, REPEAT_TIMES * sizeof(int32_t),
-                       cudaMemcpyHostToDevice));
-
-  atmoic_latency<int32_t><<<1, 1>>>(startClk_g, stopClk_g, data1_g, res_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(int32_t),
-                       cudaMemcpyDeviceToHost));
-
-  float latency;
-  latency = ((float)(stopClk[0] - startClk[0])) / ((float)(REPEAT_TIMES));
-  printf("Atomic int32 latency = %f (clk)\n", latency);
-  printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_double/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_double/Makefile
deleted file mode 100644
index eccd44b1b..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_double/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = MaxFlops_double.cu
-
-EXE = MaxFlops_double
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_double/MaxFlops_double.cu b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_double/MaxFlops_double.cu
deleted file mode 100644
index 2b01f7cad..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_double/MaxFlops_double.cu
+++ /dev/null
@@ -1,9 +0,0 @@
-#include "MaxFlops_double.h"
-
-int main() {
-  intilizeDeviceProp(0);
-
-  dpu_max_flops();
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_double/MaxFlops_double.h b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_double/MaxFlops_double.h
deleted file mode 100644
index bcac5309a..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_double/MaxFlops_double.h
+++ /dev/null
@@ -1,103 +0,0 @@
-#ifndef MAXFLOPS_DOUBLE_DEF_H
-#define MAXFLOPS_DOUBLE_DEF_H
-
-#include <cuda.h>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_TIMES 4096
-
-template <class T>
-__global__ void max_flops(uint32_t *startClk, uint32_t *stopClk, T *data1,
-                          T *data2, T *res) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  register T s1 = data1[gid];
-  register T s2 = data2[gid];
-  register T result = 0;
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint32_t start = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-
-  for (int j = 0; j < REPEAT_TIMES; ++j) {
-    asm volatile("{\t\n"
-                 "fma.rn.f64 %0, %1, %2 , %0;\n\t"
-                 "fma.rn.f64 %0, %1, %2 , %0;\n\t"
-                 "fma.rn.f64 %0, %1, %2 , %0;\n\t"
-                 "fma.rn.f64 %0, %1, %2 , %0;\n\t"
-                 "}"
-                 : "+d"(result), "+d"(s1), "+d"(s2));
-  }
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint32_t stop = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-  // write time and data back to memory
-  startClk[gid] = start;
-  stopClk[gid] = stop;
-  res[gid] = result;
-}
-
-float dpu_max_flops() {
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM;
-
-  uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  double *data1 = (double *)malloc(TOTAL_THREADS * sizeof(double));
-  double *data2 = (double *)malloc(TOTAL_THREADS * sizeof(double));
-  double *res = (double *)malloc(TOTAL_THREADS * sizeof(double));
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  double *data1_g;
-  double *data2_g;
-  double *res_g;
-
-  for (uint32_t i = 0; i < TOTAL_THREADS; i++) {
-    data1[i] = (double)i;
-    data2[i] = (double)i;
-  }
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(double)));
-  gpuErrchk(cudaMalloc(&data2_g, TOTAL_THREADS * sizeof(double)));
-  gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(double)));
-
-  gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(double),
-                       cudaMemcpyHostToDevice));
-  gpuErrchk(cudaMemcpy(data2_g, data2, TOTAL_THREADS * sizeof(double),
-                       cudaMemcpyHostToDevice));
-
-  max_flops<double><<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g,
-                                                       data1_g, data2_g, res_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(double),
-                       cudaMemcpyDeviceToHost));
-
-  float flops;
-  flops = (float)(REPEAT_TIMES * TOTAL_THREADS * 8) /
-          ((float)(stopClk[0] - startClk[0]));
-  printf("DPU FLOP per SM = %f (flop/clk/SM)\n", flops);
-  printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);
-
-  return flops;
-}
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_float/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_float/Makefile
deleted file mode 100644
index 36acf26ef..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_float/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = MaxFlops_float.cu
-
-EXE = MaxFlops_float
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_float/MaxFlops_float.cu b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_float/MaxFlops_float.cu
deleted file mode 100644
index b9482fc96..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_float/MaxFlops_float.cu
+++ /dev/null
@@ -1,9 +0,0 @@
-#include "MaxFlops_float.h"
-
-int main() {
-  intilizeDeviceProp(0);
-
-  fpu_max_flops();
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_float/MaxFlops_float.h b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_float/MaxFlops_float.h
deleted file mode 100644
index 88c1a82f2..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_float/MaxFlops_float.h
+++ /dev/null
@@ -1,103 +0,0 @@
-#ifndef MAXFLOPS_FLOAT_DEF_H
-#define MAXFLOPS_FLOAT_DEF_H
-
-#include <cuda.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_ITERS 1024
-
-template <class T>
-__global__ void max_flops(uint32_t *startClk, uint32_t *stopClk, T *data1,
-                          T *data2, T *res) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  register T s1 = data1[gid];
-  register T s2 = data2[gid];
-  register T result = 0;
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint32_t start = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-
-  for (int j = 0; j < REPEAT_ITERS; ++j) {
-    asm volatile("{\t\n"
-                 "fma.rn.f32 %0, %1, %2 , %0;\n\t"
-                 "fma.rn.f32 %0, %1, %2 , %0;\n\t"
-                 "fma.rn.f32 %0, %1, %2 , %0;\n\t"
-                 "fma.rn.f32 %0, %1, %2 , %0;\n\t"
-                 "}"
-                 : "+f"(result), "+f"(s1), "+f"(s2));
-  }
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint32_t stop = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-  // write time and data back to memory
-  startClk[gid] = start;
-  stopClk[gid] = stop;
-  res[gid] = result;
-}
-
-int fpu_max_flops() {
-  intilizeDeviceProp(0);
-
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM;
-
-  uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  float *data1 = (float *)malloc(TOTAL_THREADS * sizeof(float));
-  float *data2 = (float *)malloc(TOTAL_THREADS * sizeof(float));
-  float *res = (float *)malloc(TOTAL_THREADS * sizeof(float));
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  float *data1_g;
-  float *data2_g;
-  float *res_g;
-
-  for (uint32_t i = 0; i < TOTAL_THREADS; i++) {
-    data1[i] = (float)i;
-    data2[i] = (float)i;
-  }
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(float)));
-  gpuErrchk(cudaMalloc(&data2_g, TOTAL_THREADS * sizeof(float)));
-  gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(float)));
-
-  gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(float),
-                       cudaMemcpyHostToDevice));
-  gpuErrchk(cudaMemcpy(data2_g, data2, TOTAL_THREADS * sizeof(float),
-                       cudaMemcpyHostToDevice));
-
-  max_flops<float><<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g,
-                                                      data1_g, data2_g, res_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(float),
-                       cudaMemcpyDeviceToHost));
-
-  float flops;
-  flops = (float)(REPEAT_ITERS * TOTAL_THREADS * 8) /
-          ((float)(stopClk[0] - startClk[0]));
-  printf("FLOP per SM = %f (flop/clk/SM)\n", flops);
-  printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);
-
-  return flops;
-}
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_half/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_half/Makefile
deleted file mode 100644
index db878492c..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_half/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-GENCODE_SM30 :=
-GENCODE_SM35 :=
-GENCODE_SM50 :=
-
-SRC = MaxFlops_half.cu
-
-EXE = MaxFlops_half
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_half/MaxFlops_half.cu b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_half/MaxFlops_half.cu
deleted file mode 100644
index 024d442df..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_half/MaxFlops_half.cu
+++ /dev/null
@@ -1,9 +0,0 @@
-#include "MaxFlops_half.h"
-
-int main() {
-  intilizeDeviceProp(0);
-
-  fpu16_max_flops();
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_half/MaxFlops_half.h b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_half/MaxFlops_half.h
deleted file mode 100644
index 15b2b803d..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_half/MaxFlops_half.h
+++ /dev/null
@@ -1,101 +0,0 @@
-#ifndef MAXFLOPS_FP16_DEF_H
-#define MAXFLOPS_FP16_DEF_H
-
-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_TIMES 1024
-
-__global__ void fpu16_max_flops(uint32_t *startClk, uint32_t *stopClk,
-                                half *data1, half *data2, half *data3,
-                                half *data4, half *res) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  half s2 = data2[gid];
-  half s4 = data4[gid];
-  half2 mult = __halves2half2(s2, s4);
-  half result1 = data1[gid];
-  half result2 = data3[gid];
-  half2 result = __halves2half2(result1, result2);
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint32_t start = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-
-  for (int j = 0; j < REPEAT_TIMES; ++j) {
-    result = result * mult + result;
-  }
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint32_t stop = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-  // write time and data back to memory
-  startClk[gid] = start;
-  stopClk[gid] = stop;
-  res[gid] = __high2half(result) + __low2half(result);
-}
-
-float fpu16_max_flops() {
-  intilizeDeviceProp(0);
-
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM;
-
-  uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  half *data1 = (half *)malloc(TOTAL_THREADS * sizeof(half));
-  half *data2 = (half *)malloc(TOTAL_THREADS * sizeof(half));
-  half *res = (half *)malloc(TOTAL_THREADS * sizeof(half));
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  half *data1_g;
-  half *data2_g;
-  half *res_g;
-
-  for (uint32_t i = 0; i < TOTAL_THREADS; i++) {
-    data1[i] = (half)i;
-    data2[i] = (half)i;
-  }
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(half)));
-  gpuErrchk(cudaMalloc(&data2_g, TOTAL_THREADS * sizeof(half)));
-  gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(half)));
-
-  gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(half),
-                       cudaMemcpyHostToDevice));
-  gpuErrchk(cudaMemcpy(data2_g, data2, TOTAL_THREADS * sizeof(half),
-                       cudaMemcpyHostToDevice));
-
-  fpu16_max_flops<<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(
-      startClk_g, stopClk_g, data1_g, data2_g, data1_g, data2_g, res_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(half),
-                       cudaMemcpyDeviceToHost));
-
-  float flops;
-  flops = (float)(REPEAT_TIMES * TOTAL_THREADS * 4) /
-          ((float)(stopClk[0] - startClk[0]));
-  printf("half FLOP per SM = %f (flop/clk/SM)\n", flops);
-  printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);
-
-  return flops;
-}
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_int32/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_int32/Makefile
deleted file mode 100644
index 63d6655f4..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_int32/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = MaxFlops_int32.cu
-
-EXE = MaxFlops_int32
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_int32/MaxFlops_int32.cu b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_int32/MaxFlops_int32.cu
deleted file mode 100644
index 01ab4e9b7..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_int32/MaxFlops_int32.cu
+++ /dev/null
@@ -1,10 +0,0 @@
-#include "MaxFlops_int32.h"
-
-int main() {
-
-  intilizeDeviceProp(0);
-
-  max_int32_flops();
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_int32/MaxFlops_int32.h b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_int32/MaxFlops_int32.h
deleted file mode 100644
index 55b5aa585..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_int32/MaxFlops_int32.h
+++ /dev/null
@@ -1,103 +0,0 @@
-#ifndef MAXFLOPS_INT32_DEF_H
-#define MAXFLOPS_INT32_DEF_H
-
-#include <cuda.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_TIMES 1024
-
-template <class T>
-__global__ void max_flops(uint32_t *startClk, uint32_t *stopClk, T *data1,
-                          T *data2, T *res) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  register T s1 = data1[gid];
-  register T s2 = data2[gid];
-  register T result = 0;
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint32_t start = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-
-  for (int j = 0; j < REPEAT_TIMES; ++j) {
-    asm volatile("{\t\n"
-                 "mad.lo.s32 %0, %1, %2 , %0;\n\t"
-                 "mad.lo.s32 %0, %1, %2 , %0;\n\t"
-                 "mad.lo.s32 %0, %1, %2 , %0;\n\t"
-                 "mad.lo.s32 %0, %1, %2 , %0;\n\t"
-                 "}"
-                 : "+r"(result), "+r"(s1), "+r"(s2));
-  }
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint32_t stop = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-  // write time and data back to memory
-  startClk[gid] = start;
-  stopClk[gid] = stop;
-  res[gid] = result;
-}
-
-float max_int32_flops() {
-  intilizeDeviceProp(0);
-
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM;
-
-  uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  int32_t *data1 = (int32_t *)malloc(TOTAL_THREADS * sizeof(int32_t));
-  int32_t *data2 = (int32_t *)malloc(TOTAL_THREADS * sizeof(int32_t));
-  int32_t *res = (int32_t *)malloc(TOTAL_THREADS * sizeof(int32_t));
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  int32_t *data1_g;
-  int32_t *data2_g;
-  int32_t *res_g;
-
-  for (uint32_t i = 0; i < TOTAL_THREADS; i++) {
-    data1[i] = (int32_t)i;
-    data2[i] = (int32_t)i;
-  }
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(int32_t)));
-  gpuErrchk(cudaMalloc(&data2_g, TOTAL_THREADS * sizeof(int32_t)));
-  gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(int32_t)));
-
-  gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(int32_t),
-                       cudaMemcpyHostToDevice));
-  gpuErrchk(cudaMemcpy(data2_g, data2, TOTAL_THREADS * sizeof(int32_t),
-                       cudaMemcpyHostToDevice));
-
-  max_flops<int32_t><<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(
-      startClk_g, stopClk_g, data1_g, data2_g, res_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(int32_t),
-                       cudaMemcpyDeviceToHost));
-
-  float flops;
-  flops = (float)(REPEAT_TIMES * TOTAL_THREADS * 8) /
-          ((float)(stopClk[0] - startClk[0]));
-  printf("int32 FLOP per SM = %f (flop/clk/SM)\n", flops);
-  printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);
-
-  return flops;
-}
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_dpu/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/config_dpu/Makefile
deleted file mode 100644
index 2264d7d30..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/config_dpu/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = config_dpu.cu
-
-EXE = config_dpu
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_dpu/config_dpu.cu b/util/tuner/GPU_Microbenchmark/ubench/core/config_dpu/config_dpu.cu
deleted file mode 100644
index ffdbe8f81..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/config_dpu/config_dpu.cu
+++ /dev/null
@@ -1,35 +0,0 @@
-
-#include "../../../hw_def/hw_def.h"
-#include "../MaxFlops_double/MaxFlops_double.h"
-#include "../lat_double/lat_double.h"
-
-int main() {
-  intilizeDeviceProp(0);
-
-  float flops = dpu_max_flops();
-  float latency = dpu_latency();
-
-  if (ACCEL_SIM_MODE) {
-    unsigned lat = (unsigned)latency;
-    // divide flops by 2 as we need FMA throughput
-    unsigned throughput_per_SM = round_up_2n(flops / 2);
-    float throughput_per_sched = (float)throughput_per_SM / WARP_SCHEDS_PER_SM;
-
-    unsigned init = WARP_SIZE / throughput_per_sched;
-
-    //init cannot be larger than latency
-    if(init > latency)
-      latency = init;
-
-    std::cout << "\n//Accel_Sim config: \n";
-    std::cout << "-gpgpu_num_dp_units " << WARP_SCHEDS_PER_SM << std::endl;
-    std::cout << "-ptx_opcode_latency_dp " << lat << "," << lat << "," << lat
-              << "," << lat << ",330" << std::endl;
-    std::cout << "-ptx_opcode_initiation_dp " << init << "," << init << ","
-              << init << "," << init << ",130" << std::endl;
-    std::cout << "-trace_opcode_latency_initiation_dp " << lat << "," << init
-              << std::endl;
-  }
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_fpu/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/config_fpu/Makefile
deleted file mode 100644
index b7a99d190..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/config_fpu/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = config_fpu.cu
-
-EXE = config_fpu
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_fpu/config_fpu.cu b/util/tuner/GPU_Microbenchmark/ubench/core/config_fpu/config_fpu.cu
deleted file mode 100644
index 44ed33b73..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/config_fpu/config_fpu.cu
+++ /dev/null
@@ -1,34 +0,0 @@
-
-#include <iostream>
-using namespace std;
-
-#include "../../../hw_def/hw_def.h"
-#include "../MaxFlops_float/MaxFlops_float.h"
-#include "../lat_float/lat_float.h"
-
-int main() {
-  intilizeDeviceProp(0);
-
-  float flops = fpu_max_flops();
-  float latency = fpu_latency();
-
-  if (ACCEL_SIM_MODE) {
-    unsigned lat = (unsigned)latency;
-    // divide flops by 2 as we need FMA thoughput
-    unsigned throughput_per_SM = round_up_2n(flops / 2);
-    float throughput_per_sched = (float)throughput_per_SM / WARP_SCHEDS_PER_SM;
-
-    unsigned init = WARP_SIZE / throughput_per_sched;
-
-    std::cout << "\n//Accel_Sim config: \n";
-    std::cout << "-gpgpu_num_sp_units " << WARP_SCHEDS_PER_SM << std::endl;
-    std::cout << "-ptx_opcode_latency_fp " << lat << "," << lat << "," << lat
-              << "," << lat << ",39" << std::endl;
-    std::cout << "-ptx_opcode_initiation_fp " << init << "," << init << ","
-              << init << "," << init << "," << init * 2 << std::endl;
-    std::cout << "-trace_opcode_latency_initiation_sp " << lat << "," << init
-              << std::endl;
-  }
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_int/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/config_int/Makefile
deleted file mode 100644
index f8a024a41..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/config_int/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = config_int.cu
-
-EXE = config_int
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_int/config_int.cu b/util/tuner/GPU_Microbenchmark/ubench/core/config_int/config_int.cu
deleted file mode 100644
index 4bb925872..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/config_int/config_int.cu
+++ /dev/null
@@ -1,38 +0,0 @@
-
-#include <iostream>
-using namespace std;
-
-#include "../../../hw_def/hw_def.h"
-#include "../MaxFlops_int32/MaxFlops_int32.h"
-#include "../lat_int32/lat_int32.h"
-
-int main() {
-  intilizeDeviceProp(0);
-
-  float flops = max_int32_flops();
-  float latency = int32_latency();
-
-  if (ACCEL_SIM_MODE) {
-    unsigned lat = (unsigned)latency;
-    // divide by 2 as we need FMA thoughput
-    unsigned throughput_per_SM = round_up_2n(flops / 2);
-    float throughput_per_sched = (float)throughput_per_SM / WARP_SCHEDS_PER_SM;
-
-    unsigned init = WARP_SIZE / throughput_per_sched;
-
-    std::cout << "\n//Accel_Sim config: \n";
-    if (deviceProp.major < 6) { // detecaited integer unit was added since Volta
-      std::cout << "-gpgpu_num_int_units 0" << std::endl;
-    } else {
-      std::cout << "-gpgpu_num_int_units " << WARP_SCHEDS_PER_SM << std::endl;
-      std::cout << "-ptx_opcode_latency_int " << lat << "," << lat << "," << lat
-                << "," << lat << ",21" << std::endl;
-      std::cout << "-ptx_opcode_initiation_int " << init << "," << init << ","
-                << init << "," << init << "," << init << std::endl;
-      std::cout << "-trace_opcode_latency_initiation_int " << lat << "," << init
-                << std::endl;
-    }
-  }
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_sfu/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/config_sfu/Makefile
deleted file mode 100644
index c53bf55a1..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/config_sfu/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = config_sfu.cu
-
-EXE = config_sfu
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_sfu/config_sfu.cu b/util/tuner/GPU_Microbenchmark/ubench/core/config_sfu/config_sfu.cu
deleted file mode 100644
index ac1b21fd1..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/config_sfu/config_sfu.cu
+++ /dev/null
@@ -1,28 +0,0 @@
-
-#include "../../../hw_def/hw_def.h"
-#include "../sfu_bw_fsqrt/sfu_bw_fsqrt.h"
-#include "../sfu_lat_fsqrt/sfu_lat_fsqrt.h"
-
-int main() {
-  intilizeDeviceProp(0);
-
-  float flops = sfu_max_flops();
-  float latency = sfu_latency();
-
-  if (ACCEL_SIM_MODE) {
-    unsigned lat = (unsigned)latency;
-    unsigned throughput_per_SM = round_up_2n(flops);
-    float throughput_per_sched = (float)throughput_per_SM / WARP_SCHEDS_PER_SM;
-
-    unsigned init = WARP_SIZE / throughput_per_sched;
-
-    std::cout << "\n//Accel_Sim config: \n";
-    std::cout << "-gpgpu_num_sfu_units " << WARP_SCHEDS_PER_SM << std::endl;
-    std::cout << "-ptx_opcode_latency_sfu " << lat << std::endl;
-    std::cout << "-ptx_opcode_initiation_sfu " << init << std::endl;
-    std::cout << "-trace_opcode_latency_initiation_sfu " << lat << "," << init
-              << std::endl;
-  }
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_tensor/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/config_tensor/Makefile
deleted file mode 100644
index d29e58ec8..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/config_tensor/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-GENCODE_SM50 :=
-GENCODE_SM61 :=
-GENCODE_SM30 :=
-GENCODE_SM35 :=
-GENCODE_SM60 :=
-GENCODE_SM62 :=
-
-SRC = config_tensor.cu
-
-EXE = config_tensor
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_tensor/config_tensor.cu b/util/tuner/GPU_Microbenchmark/ubench/core/config_tensor/config_tensor.cu
deleted file mode 100644
index 647221df5..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/config_tensor/config_tensor.cu
+++ /dev/null
@@ -1,44 +0,0 @@
-#include "../tensor_bw_half/tensor_bw_half.h"
-#include "../tensor_lat_half/tensor_lat_half.h"
-
-int main() {
-  intilizeDeviceProp(0);
-
-  // measure the flops and lat based on half operand and float accumlate
-  float flops = tensor_max_flops<half, float>();
-  float latency = tensor_lat<half, float>();
-
-  if (ACCEL_SIM_MODE) {
-    unsigned lat = (unsigned)latency;
-    unsigned throughput_per_SM = round_up_2n(flops);
-    float throughput_per_sched = (float)throughput_per_SM / WARP_SCHEDS_PER_SM;
-
-    unsigned init = WARP_SIZE / throughput_per_sched;
-
-    std::cout << "\n//Accel_Sim config: \n";
-    if (deviceProp.major < 6) { // tensor core was added since Volta
-      std::cout << "-gpgpu_tensor_core_avail 0" << std::endl;
-      std::cout << "-gpgpu_num_tensor_core_units 0" << std::endl;
-    } else {
-      std::cout << "-gpgpu_tensor_core_avail 1" << std::endl;
-      std::cout << "-gpgpu_num_tensor_core_units " << WARP_SCHEDS_PER_SM
-                << std::endl;
-      std::cout << "-ptx_opcode_latency_tesnor " << lat << std::endl;
-      std::cout << "-ptx_opcode_initiation_tensor " << init << std::endl;
-
-      // trace mode
-      // assume tesnor is on spec unit 3
-      std::cout << "-trace_opcode_latency_initiation_tensor "
-                << lat / SASS_hmma_per_PTX_wmma << ","
-                << init / SASS_hmma_per_PTX_wmma << std::endl;
-      std::cout << "-specialized_unit_3 1," << WARP_SCHEDS_PER_SM << ","
-                << lat / SASS_hmma_per_PTX_wmma << "," << WARP_SCHEDS_PER_SM
-                << "," << WARP_SCHEDS_PER_SM << ",TENSOR" << std::endl;
-      std::cout << "-trace_opcode_latency_initiation_spec_op_3 "
-                << lat / SASS_hmma_per_PTX_wmma << ","
-                << init / SASS_hmma_per_PTX_wmma << std::endl;
-    }
-  }
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_udp/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/config_udp/Makefile
deleted file mode 100644
index 484be447d..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/config_udp/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = config_udp.cu
-
-EXE = config_udp
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_udp/config_udp.cu b/util/tuner/GPU_Microbenchmark/ubench/core/config_udp/config_udp.cu
deleted file mode 100644
index 68b7a8162..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/config_udp/config_udp.cu
+++ /dev/null
@@ -1,29 +0,0 @@
-#include <iostream>
-using namespace std;
-
-#include "../../../hw_def/hw_def.h"
-
-int main() {
-  intilizeDeviceProp(0);
-
-  if (ACCEL_SIM_MODE) {
-
-    /* we cannot meaure uniform instrcution for now as they only exist at
-      SASS level not at PTX nor CUDA level, so assume constant latency and BW
-      for now
-
-      dedicated uniform unit was added since Turing SM 7.0
-      */
-    if (deviceProp.major >= 7) {
-      // assume UDP unit is on spec unit 4
-      std::cout << "-specialized_unit_4 1," << WARP_SCHEDS_PER_SM << ",4,"
-                << WARP_SCHEDS_PER_SM << "," << WARP_SCHEDS_PER_SM << ",UDP"
-                << std::endl;
-
-      std::cout << "-trace_opcode_latency_initiation_spec_op_4 4,1"
-                << std::endl;
-    }
-  }
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/core_config/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/core_config/Makefile
deleted file mode 100644
index 8bf0f8857..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/core_config/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = core_config.cu
-
-EXE = core_config
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/core_config/core_config.cu b/util/tuner/GPU_Microbenchmark/ubench/core/core_config/core_config.cu
deleted file mode 100644
index ea0eb047b..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/core_config/core_config.cu
+++ /dev/null
@@ -1,84 +0,0 @@
-#include <iostream>
-using namespace std;
-
-#include "../../../hw_def/hw_def.h"
-
-int main() {
-  intilizeDeviceProp(0);
-
-  printf("CUDA version number = %d.%d\n", deviceProp.major, deviceProp.minor);
-
-  if (ACCEL_SIM_MODE) {
-    std::cout << "\n//Accel_Sim config: \n";
-
-    std::cout << "-gpgpu_ptx_force_max_capability " << deviceProp.major
-              << deviceProp.minor << std::endl;
-
-    std::cout << "-gpgpu_shader_registers " << deviceProp.regsPerMultiprocessor
-              << std::endl;
-    std::cout << "-gpgpu_registers_per_block " << deviceProp.regsPerBlock
-              << std::endl;
-    std::cout << "-gpgpu_occupancy_sm_number " << deviceProp.major
-              << deviceProp.minor << std::endl;
-    std::cout << "-gpgpu_coalesce_arch " << deviceProp.major << deviceProp.minor
-              << std::endl;
-
-    unsigned ID_OC_SP, ID_OC_DP, ID_OC_INT, ID_OC_SFU, ID_OC_MEM, OC_EX_SP,
-        OC_EX_DP, OC_EX_INT, OC_EX_SFU, OC_EX_MEM, EX_WB, ID_OC_TENSOR_CORE,
-        OC_EX_TENSOR_CORE;
-    ID_OC_SFU = OC_EX_SFU = WARP_SCHEDS_PER_SM;
-    ID_OC_MEM = OC_EX_MEM = WARP_SCHEDS_PER_SM;
-    ID_OC_SP = OC_EX_SP = WARP_SCHEDS_PER_SM;
-    ID_OC_DP = OC_EX_DP = WARP_SCHEDS_PER_SM;
-    EX_WB = WARP_SCHEDS_PER_SM * 2;
-    if (deviceProp.major < 6) { // no integer or tensor cores before volta
-      ID_OC_INT = OC_EX_INT = 0;
-      ID_OC_TENSOR_CORE = OC_EX_TENSOR_CORE = 0;
-    } else {
-      ID_OC_INT = OC_EX_INT = WARP_SCHEDS_PER_SM;
-      ID_OC_TENSOR_CORE = OC_EX_TENSOR_CORE = WARP_SCHEDS_PER_SM;
-    }
-
-    //#
-    // ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE
-    std::cout << "-gpgpu_pipeline_widths " << ID_OC_SP << "," << ID_OC_DP << ","
-              << ID_OC_INT << "," << ID_OC_SFU << "," << ID_OC_MEM << ","
-              << OC_EX_SP << "," << OC_EX_DP << "," << OC_EX_INT << ","
-              << OC_EX_SFU << "," << OC_EX_MEM << "," << EX_WB;
-    if (deviceProp.major < 6)
-      std::cout << std::endl;
-    else
-      std::cout << "," << ID_OC_TENSOR_CORE << "," << OC_EX_TENSOR_CORE
-                << std::endl;
-
-    std::cout << "-gpgpu_sub_core_model " << CORE_MODEL << std::endl;
-
-    std::cout << "-gpgpu_enable_specialized_operand_collector 0" << std::endl;
-    std::cout << "-gpgpu_operand_collector_num_units_gen "
-              << WARP_SCHEDS_PER_SM * 2 << std::endl;
-    std::cout << "-gpgpu_operand_collector_num_in_ports_gen "
-              << WARP_SCHEDS_PER_SM * 2 << std::endl;
-    std::cout << "-gpgpu_operand_collector_num_out_ports_gen "
-              << WARP_SCHEDS_PER_SM * 2 << std::endl;
-
-    std::cout << "-gpgpu_num_sched_per_core " << WARP_SCHEDS_PER_SM
-              << std::endl;
-
-    std::cout << "-gpgpu_max_insn_issue_per_warp " << ISSUE_MODEL << std::endl;
-    std::cout << "-gpgpu_dual_issue_diff_exec_units " << (deviceProp.major > 3)
-              << std::endl;
-
-    std::cout << "-gpgpu_inst_fetch_throughput " << WARP_SCHEDS_PER_SM
-              << std::endl;
-
-    std::cout << "-gpgpu_shader_core_pipeline "
-              << deviceProp.maxThreadsPerMultiProcessor << ":"
-              << deviceProp.warpSize << std::endl;
-    std::cout << "-gpgpu_shader_cta "
-              << round_up_2n((unsigned)deviceProp.maxThreadsPerMultiProcessor /
-                             deviceProp.warpSize / 2)
-              << std::endl;
-  }
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_double/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/lat_double/Makefile
deleted file mode 100644
index 730b2803e..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_double/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = lat_double.cu
-
-EXE = lat_double
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_double/lat_double.cu b/util/tuner/GPU_Microbenchmark/ubench/core/lat_double/lat_double.cu
deleted file mode 100644
index 48f8f31c5..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_double/lat_double.cu
+++ /dev/null
@@ -1,10 +0,0 @@
-#include "lat_double.h"
-
-int main() {
-
-  intilizeDeviceProp(0);
-
-  dpu_latency();
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_double/lat_double.h b/util/tuner/GPU_Microbenchmark/ubench/core/lat_double/lat_double.h
deleted file mode 100644
index 7a0c29ed1..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_double/lat_double.h
+++ /dev/null
@@ -1,101 +0,0 @@
-#ifndef LAT_DOUBLE_DEF_H
-#define LAT_DOUBLE_DEF_H
-
-#include <cuda.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_TIMES 4096
-
-template <class T>
-__global__ void dpu_latency(uint32_t *startClk, uint32_t *stopClk, T *data1,
-                            T *data2, T *res) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  register T s1 = data1[gid];
-  register T s2 = data2[gid];
-  register T result = 0;
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint32_t start = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-
-  for (int j = 0; j < REPEAT_TIMES; ++j) {
-    asm volatile("{\t\n"
-                 "fma.rn.f64 %0, %1, %2 , %0;\n\t"
-                 "fma.rn.f64 %0, %1, %2 , %0;\n\t"
-                 "fma.rn.f64 %0, %1, %2 , %0;\n\t"
-                 "fma.rn.f64 %0, %1, %2 , %0;\n\t"
-                 "}"
-                 : "+d"(result), "+d"(s1), "+d"(s2));
-  }
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint32_t stop = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-  // write time and data back to memory
-  startClk[gid] = start;
-  stopClk[gid] = stop;
-  res[gid] = result;
-}
-
-float dpu_latency() {
-  THREADS_PER_BLOCK = 1;
-  THREADS_PER_SM = 1;
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = 1;
-
-  uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  double *data1 = (double *)malloc(TOTAL_THREADS * sizeof(double));
-  double *data2 = (double *)malloc(TOTAL_THREADS * sizeof(double));
-  double *res = (double *)malloc(TOTAL_THREADS * sizeof(double));
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  double *data1_g;
-  double *data2_g;
-  double *res_g;
-
-  for (uint32_t i = 0; i < TOTAL_THREADS; i++) {
-    data1[i] = (double)i;
-    data2[i] = (double)i;
-  }
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(double)));
-  gpuErrchk(cudaMalloc(&data2_g, TOTAL_THREADS * sizeof(double)));
-  gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(double)));
-
-  gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(double),
-                       cudaMemcpyHostToDevice));
-  gpuErrchk(cudaMemcpy(data2_g, data2, TOTAL_THREADS * sizeof(double),
-                       cudaMemcpyHostToDevice));
-
-  dpu_latency<double><<<1, 1>>>(startClk_g, stopClk_g, data1_g, data2_g, res_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(double),
-                       cudaMemcpyDeviceToHost));
-
-  float latency;
-  latency = ((float)(stopClk[0] - startClk[0])) / ((float)(REPEAT_TIMES * 4));
-  printf("double-precision DPU latency = %f (clk)\n", latency);
-  printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);
-
-  return latency;
-}
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_float/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/lat_float/Makefile
deleted file mode 100644
index 8fffb3eb8..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_float/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = lat_float.cu
-
-EXE = lat_float
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_float/lat_float.cu b/util/tuner/GPU_Microbenchmark/ubench/core/lat_float/lat_float.cu
deleted file mode 100644
index c1b76a79d..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_float/lat_float.cu
+++ /dev/null
@@ -1,10 +0,0 @@
-#include "lat_float.h"
-
-int main() {
-
-  intilizeDeviceProp(0);
-
-  fpu_latency();
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_float/lat_float.h b/util/tuner/GPU_Microbenchmark/ubench/core/lat_float/lat_float.h
deleted file mode 100644
index d19926244..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_float/lat_float.h
+++ /dev/null
@@ -1,104 +0,0 @@
-#ifndef LAT_FLOAT_DEF_H
-#define LAT_FLOAT_DEF_H
-
-#include <cuda.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_TIMES 4096
-
-template <class T>
-__global__ void fpu_latency(uint32_t *startClk, uint32_t *stopClk, T *data1,
-                            T *data2, T *res) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  register T s1 = data1[gid];
-  register T s2 = data2[gid];
-  register T result = 0;
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint32_t start = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-
-  for (int j = 0; j < REPEAT_TIMES; ++j) {
-    asm volatile("{\t\n"
-                 "fma.rn.f32 %0, %1, %2 , %0;\n\t"
-                 "fma.rn.f32 %0, %1, %2 , %0;\n\t"
-                 "fma.rn.f32 %0, %1, %2 , %0;\n\t"
-                 "fma.rn.f32 %0, %1, %2 , %0;\n\t"
-                 "}"
-                 : "+f"(result), "+f"(s1), "+f"(s2));
-  }
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint32_t stop = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-  // write time and data back to memory
-  startClk[gid] = start;
-  stopClk[gid] = stop;
-  res[gid] = result;
-}
-
-float fpu_latency() {
-  intilizeDeviceProp(0);
-
-  THREADS_PER_BLOCK = 1;
-  THREADS_PER_SM = 1;
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = 1;
-
-  uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  float *data1 = (float *)malloc(TOTAL_THREADS * sizeof(float));
-  float *data2 = (float *)malloc(TOTAL_THREADS * sizeof(float));
-  float *res = (float *)malloc(TOTAL_THREADS * sizeof(float));
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  float *data1_g;
-  float *data2_g;
-  float *res_g;
-
-  for (uint32_t i = 0; i < TOTAL_THREADS; i++) {
-    data1[i] = (float)i;
-    data2[i] = (float)i;
-  }
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(float)));
-  gpuErrchk(cudaMalloc(&data2_g, TOTAL_THREADS * sizeof(float)));
-  gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(float)));
-
-  gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(float),
-                       cudaMemcpyHostToDevice));
-  gpuErrchk(cudaMemcpy(data2_g, data2, TOTAL_THREADS * sizeof(float),
-                       cudaMemcpyHostToDevice));
-
-  fpu_latency<float><<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(
-      startClk_g, stopClk_g, data1_g, data2_g, res_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(float),
-                       cudaMemcpyDeviceToHost));
-
-  float latency;
-  latency = ((float)(stopClk[0] - startClk[0])) / ((float)(REPEAT_TIMES * 4));
-  printf("float-precision FPU latency = %f (clk)\n", latency);
-  printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);
-
-  return latency;
-}
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_half/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/lat_half/Makefile
deleted file mode 100644
index c8f97daed..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_half/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-GENCODE_SM30 :=
-GENCODE_SM35 :=
-GENCODE_SM50 :=
-
-SRC = lat_half.cu
-
-EXE = lat_half
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_half/lat_half.cu b/util/tuner/GPU_Microbenchmark/ubench/core/lat_half/lat_half.cu
deleted file mode 100644
index 36184a04e..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_half/lat_half.cu
+++ /dev/null
@@ -1,10 +0,0 @@
-#include "lat_half.h"
-
-int main() {
-
-  intilizeDeviceProp(0);
-
-  fpu16_latency();
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_half/lat_half.h b/util/tuner/GPU_Microbenchmark/ubench/core/lat_half/lat_half.h
deleted file mode 100644
index 8c1a50549..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_half/lat_half.h
+++ /dev/null
@@ -1,102 +0,0 @@
-#ifndef LAT_FP16_DEF_H
-#define LAT_FP16_DEF_H
-
-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_TIMES 4096
-
-__global__ void fpu16_latency(uint32_t *startClk, uint32_t *stopClk,
-                              half *data1, half *data2, half *data3,
-                              half *data4, half *res) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  half s2 = data2[gid];
-  half s4 = data4[gid];
-  half2 mult = __halves2half2(s2, s4);
-  half result1 = data1[gid];
-  half result2 = data3[gid];
-  half2 result = __halves2half2(result1, result2);
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint32_t start = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-
-  for (int j = 0; j < REPEAT_TIMES; ++j) {
-    result = result * mult + result;
-  }
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint32_t stop = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-  // write time and data back to memory
-  startClk[gid] = start;
-  stopClk[gid] = stop;
-  res[gid] = __high2half(result) + __low2half(result);
-}
-
-float fpu16_latency() {
-  intilizeDeviceProp(0);
-
-  THREADS_PER_BLOCK = 1;
-  THREADS_PER_SM = 1;
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = 1;
-
-  uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  half *data1 = (half *)malloc(TOTAL_THREADS * sizeof(half));
-  half *data2 = (half *)malloc(TOTAL_THREADS * sizeof(half));
-  half *res = (half *)malloc(TOTAL_THREADS * sizeof(half));
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  half *data1_g;
-  half *data2_g;
-  half *res_g;
-
-  for (uint32_t i = 0; i < TOTAL_THREADS; i++) {
-    data1[i] = (half)i;
-    data2[i] = (half)i;
-  }
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(half)));
-  gpuErrchk(cudaMalloc(&data2_g, TOTAL_THREADS * sizeof(half)));
-  gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(half)));
-
-  gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(half),
-                       cudaMemcpyHostToDevice));
-  gpuErrchk(cudaMemcpy(data2_g, data2, TOTAL_THREADS * sizeof(half),
-                       cudaMemcpyHostToDevice));
-
-  fpu16_latency<<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(
-      startClk_g, stopClk_g, data1_g, data2_g, data1_g, data2_g, res_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(half),
-                       cudaMemcpyDeviceToHost));
-
-  float latency;
-  latency = ((float)(stopClk[0] - startClk[0])) / ((float)(REPEAT_TIMES));
-  printf("fpu16 latency = %f (clk)\n", latency);
-  printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);
-
-  return latency;
-}
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_int32/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/lat_int32/Makefile
deleted file mode 100644
index 311af1ffe..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_int32/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = lat_int32.cu
-
-EXE = lat_int32
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_int32/lat_int32.cu b/util/tuner/GPU_Microbenchmark/ubench/core/lat_int32/lat_int32.cu
deleted file mode 100644
index bd926104c..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_int32/lat_int32.cu
+++ /dev/null
@@ -1,10 +0,0 @@
-#include "lat_int32.h"
-
-int main() {
-
-  intilizeDeviceProp(0);
-
-  int32_latency();
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_int32/lat_int32.h b/util/tuner/GPU_Microbenchmark/ubench/core/lat_int32/lat_int32.h
deleted file mode 100644
index cacd3a86a..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_int32/lat_int32.h
+++ /dev/null
@@ -1,103 +0,0 @@
-#ifndef LAT_INT32_DEF_H
-#define LAT_INT32_DEF_H
-
-#include <cuda.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-#define REPEAT_TIMES 1024
-
-template <class T>
-__global__ void int32_latency(uint32_t *startClk, uint32_t *stopClk, T *data1,
-                              T *data2, T *res) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  register T s1 = data1[gid];
-  register T s2 = data2[gid];
-  register T result = 0;
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint32_t start = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-
-  for (int j = 0; j < REPEAT_TIMES; ++j) {
-    asm volatile("{\t\n"
-                 "mad.lo.s32 %0, %1, %2 , %0;\n\t"
-                 "mad.lo.s32 %0, %1, %2 , %0;\n\t"
-                 "mad.lo.s32 %0, %1, %2 , %0;\n\t"
-                 "mad.lo.s32 %0, %1, %2 , %0;\n\t"
-                 "}"
-                 : "+r"(result), "+r"(s1), "+r"(s2));
-  }
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint32_t stop = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-  // write time and data back to memory
-  startClk[gid] = start;
-  stopClk[gid] = stop;
-  res[gid] = result;
-}
-
-float int32_latency() {
-  intilizeDeviceProp(0);
-
-  THREADS_PER_BLOCK = 1;
-  THREADS_PER_SM = 1;
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = 1;
-
-  uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  int32_t *data1 = (int32_t *)malloc(TOTAL_THREADS * sizeof(int32_t));
-  int32_t *data2 = (int32_t *)malloc(TOTAL_THREADS * sizeof(int32_t));
-  int32_t *res = (int32_t *)malloc(TOTAL_THREADS * sizeof(int32_t));
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  int32_t *data1_g;
-  int32_t *data2_g;
-  int32_t *res_g;
-
-  for (uint32_t i = 0; i < TOTAL_THREADS; i++) {
-    data1[i] = (int32_t)i;
-    data2[i] = (int32_t)i;
-  }
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(int32_t)));
-  gpuErrchk(cudaMalloc(&data2_g, TOTAL_THREADS * sizeof(int32_t)));
-  gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(int32_t)));
-
-  gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(int32_t),
-                       cudaMemcpyHostToDevice));
-  gpuErrchk(cudaMemcpy(data2_g, data2, TOTAL_THREADS * sizeof(int32_t),
-                       cudaMemcpyHostToDevice));
-
-  int32_latency<int32_t><<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(
-      startClk_g, stopClk_g, data1_g, data2_g, res_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(int32_t),
-                       cudaMemcpyDeviceToHost));
-
-  float latency;
-  latency = ((float)(stopClk[0] - startClk[0])) / ((float)(REPEAT_TIMES * 4));
-  printf("int32 latency = %f (clk)\n", latency);
-  printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);
-
-  return latency;
-}
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/regfile_bw/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/regfile_bw/Makefile
deleted file mode 100644
index 755f2266d..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/regfile_bw/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-GENCODE_SM50 :=
-GENCODE_SM61 :=
-GENCODE_SM30 :=
-GENCODE_SM35 :=
-GENCODE_SM60 :=
-GENCODE_SM62 :=
-
-SRC = regfile_bw.cu
-
-EXE = regfile_bw
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/regfile_bw/regfile_bw.cu b/util/tuner/GPU_Microbenchmark/ubench/core/regfile_bw/regfile_bw.cu
deleted file mode 100644
index 723e01f4b..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/regfile_bw/regfile_bw.cu
+++ /dev/null
@@ -1,59 +0,0 @@
-#include "../MaxFlops_float/MaxFlops_float.h"
-#include "../tensor_bw_half/tensor_bw_half.h"
-
-int main() {
-  intilizeDeviceProp(0);
-
-  unsigned regfile_bw;
-  /* we measure the reg file BW based on the most demanding data instruction,
-   i.e. tensor cores. See slide 20 from Nvidia for more details at
-   https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf
-  */
-  if (deviceProp.major >= 6) { // tesnor core unit was added since Volta
-    float fma_bw = tensor_max_flops<half, float>(true);
-
-    unsigned tensor_MACs_per_SM = round_up_2n(fma_bw);
-
-    /*
-    two operands needs per MAC each cycle (A, B), C will be saved at the tensor
-    core accuamlte register
-    */
-    regfile_bw = tensor_MACs_per_SM * sizeof(half) * 2;
-  } else {
-    // if less than volta calculate based on FP32 FMA
-    float flops = fpu_max_flops();
-
-    // divide by 2 as we need FMA throughput
-    unsigned FMA_throughput_per_SM = round_up_2n(flops / 2);
-
-    // three operands needs per FMA each cycle (A, B, C)
-    regfile_bw = round_up_2n((float)FMA_throughput_per_SM * sizeof(float) * 3);
-  }
-  std::cout << "\nregfile_bw = " << regfile_bw << " (byte/SM)" << std::endl;
-
-  if (ACCEL_SIM_MODE) {
-
-    unsigned reg_ports;
-    // Nvidia starts to have dual port register file since volta
-    if (deviceProp.major < 6)
-      reg_ports = 1;
-    else
-      reg_ports = 2;
-
-    // WARP_SIZE*4 bytes, as registers are 32-bit width
-    unsigned banks_num = regfile_bw / (WARP_SIZE * 4) / reg_ports;
-
-    /* we multiply by two as accel-sim does not model register file cache (added
-     since kepler) so to mitigate, the reg file bw comes from RFC, we
-     conservatively multiply the banks by 2 (to fix)
-     */
-    if (deviceProp.major > 3)
-      banks_num = banks_num * 2;
-
-    std::cout << "\n//Accel_Sim config: \n";
-    std::cout << "-gpgpu_num_reg_banks " << banks_num << std::endl;
-    std::cout << "-gpgpu_reg_file_port_throughput " << reg_ports << std::endl;
-  }
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/Makefile
deleted file mode 100644
index d5aa2f3f0..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = sfu_bw_fsqrt.cu
-
-EXE = sfu_bw_fsqrt
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt.cu b/util/tuner/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt.cu
deleted file mode 100644
index 023799f04..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt.cu
+++ /dev/null
@@ -1,10 +0,0 @@
-#include "sfu_bw_fsqrt.h"
-
-int main() {
-
-  intilizeDeviceProp(0);
-
-  sfu_max_flops();
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt.h b/util/tuner/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt.h
deleted file mode 100644
index 922c72dff..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt.h
+++ /dev/null
@@ -1,97 +0,0 @@
-#ifndef MAXFLOPS_SFU_DEF_H
-#define MAXFLOPS_SFU_DEF_H
-
-#include <algorithm>
-#include <cuda.h>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_TIMES 1024
-
-__global__ void max_flops(uint64_t *startClk, uint64_t *stopClk, float *data1,
-                          float *res) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  register float s1 = data1[gid];
-  register float result = s1;
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint64_t start = 0;
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory");
-
-  for (int j = 0; j < REPEAT_TIMES; ++j) {
-    asm volatile("{\t\n"
-                 "sqrt.approx.ftz.f32 %0, %0;\n\t"
-                 "sqrt.approx.ftz.f32 %0, %0;\n\t"
-                 "sqrt.approx.ftz.f32 %0, %0;\n\t"
-                 "sqrt.approx.ftz.f32 %0, %0;\n\t"
-                 "}"
-                 : "+f"(result));
-  }
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint64_t stop = 0;
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory");
-
-  // write time and data back to memory
-  startClk[gid] = start;
-  stopClk[gid] = stop;
-  res[gid] = result;
-}
-
-float sfu_max_flops() {
-  intilizeDeviceProp(0);
-
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM;
-
-  uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  float *data1 = (float *)malloc(TOTAL_THREADS * sizeof(float));
-  float *res = (float *)malloc(TOTAL_THREADS * sizeof(float));
-
-  uint64_t *startClk_g;
-  uint64_t *stopClk_g;
-  float *data1_g;
-  float *res_g;
-
-  for (uint32_t i = 0; i < TOTAL_THREADS; i++) {
-    data1[i] = 987654321.789456 + (float)i;
-  }
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(float)));
-  gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(float)));
-
-  gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(float),
-                       cudaMemcpyHostToDevice));
-
-  max_flops<<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, data1_g,
-                                               res_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(float),
-                       cudaMemcpyDeviceToHost));
-
-  float flops;
-  flops = (float)(REPEAT_TIMES * TOTAL_THREADS * 4) /
-          ((float)(stopClk[0] - startClk[0]));
-  std::cout << "SFU fast sqrt bw = " << flops << "(flops/clk/SM) \n";
-  std::cout << "Total Clk number = " << (stopClk[0] - startClk[0]) << "\n";
-
-  return flops;
-}
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/Makefile
deleted file mode 100644
index eff045c24..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = sfu_lat_fsqrt.cu
-
-EXE = sfu_lat_fsqrt
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/sfu_lat_fsqrt.cu b/util/tuner/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/sfu_lat_fsqrt.cu
deleted file mode 100644
index a9fed49ec..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/sfu_lat_fsqrt.cu
+++ /dev/null
@@ -1,10 +0,0 @@
-#include "sfu_lat_fsqrt.h"
-
-int main() {
-
-  intilizeDeviceProp(0);
-
-  sfu_latency();
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/sfu_lat_fsqrt.h b/util/tuner/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/sfu_lat_fsqrt.h
deleted file mode 100644
index df004f3f2..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/sfu_lat_fsqrt.h
+++ /dev/null
@@ -1,98 +0,0 @@
-#ifndef LAT_SFU_DEF_H
-#define LAT_SFU_DEF_H
-
-#include <algorithm>
-#include <cuda.h>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_TIMES 1024
-
-__global__ void sfu_latency(uint64_t *startClk, uint64_t *stopClk, float *data1,
-                            float *res) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  register float s1 = data1[gid];
-  register float result = s1;
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint64_t start = 0;
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory");
-
-  for (int j = 0; j < REPEAT_TIMES; ++j) {
-    asm volatile("{\t\n"
-                 "sin.approx.ftz.f32 %0, %0;\n\t"
-                 "sin.approx.ftz.f32 %0, %0;\n\t"
-                 "sin.approx.ftz.f32 %0, %0;\n\t"
-                 "sin.approx.ftz.f32 %0, %0;\n\t"
-                 "}"
-                 : "+f"(result));
-  }
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint64_t stop = 0;
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory");
-
-  // write time and data back to memory
-  startClk[gid] = start;
-  stopClk[gid] = stop;
-  res[gid] = result;
-}
-
-float sfu_latency() {
-  intilizeDeviceProp(0);
-
-  THREADS_PER_BLOCK = 1;
-  THREADS_PER_SM = 1;
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = 1;
-
-  uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  float *data1 = (float *)malloc(TOTAL_THREADS * sizeof(float));
-  float *res = (float *)malloc(TOTAL_THREADS * sizeof(float));
-
-  uint64_t *startClk_g;
-  uint64_t *stopClk_g;
-  float *data1_g;
-  float *res_g;
-
-  for (uint32_t i = 0; i < TOTAL_THREADS; i++) {
-    data1[i] = 10.124234521;
-  }
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(float)));
-  gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(float)));
-
-  gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(float),
-                       cudaMemcpyHostToDevice));
-
-  sfu_latency<<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, data1_g,
-                                                 res_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(float),
-                       cudaMemcpyDeviceToHost));
-
-  float latency;
-  latency = ((float)(stopClk[0] - startClk[0])) / ((float)(REPEAT_TIMES * 4));
-  std::cout << "SFU fast sqrt latency = " << latency << "(clk) \n";
-  std::cout << "Total Clk number = " << (stopClk[0] - startClk[0]) << "\n";
-
-  return latency;
-}
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/Makefile
deleted file mode 100644
index c55e26c36..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-GENCODE_SM50 :=
-GENCODE_SM61 :=
-GENCODE_SM30 :=
-GENCODE_SM35 :=
-GENCODE_SM60 :=
-GENCODE_SM62 :=
-
-SRC = tensor_bw_half.cu
-
-EXE = tensor_bw_half
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/sass.txt b/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/sass.txt
deleted file mode 100644
index 53977dfec..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/sass.txt
+++ /dev/null
@@ -1,632 +0,0 @@
-
-Fatbin elf code:
-================
-arch = sm_70
-code version = [1,7]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-
-	code for sm_70
-
-Fatbin elf code:
-================
-arch = sm_75
-code version = [1,7]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-
-	code for sm_75
-
-Fatbin elf code:
-================
-arch = sm_80
-code version = [1,7]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-
-	code for sm_80
-
-Fatbin elf code:
-================
-arch = sm_86
-code version = [1,7]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-
-	code for sm_86
-
-Fatbin elf code:
-================
-arch = sm_70
-code version = [1,7]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-
-	code for sm_70
-		Function : _Z9max_flopsI6__halfEvPmS1_PT_S3_S3_j
-	.headerflags    @"EF_CUDA_SM70 EF_CUDA_PTX_SM(EF_CUDA_SM70)"
-        /*0000*/                   IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;                          /* 0x00000a00ff017624 */
-                                                                                                    /* 0x000fd000078e00ff */
-        /*0010*/              @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;                                    /* 0x000000fffffff389 */
-                                                                                                    /* 0x000fe200000e00ff */
-        /*0020*/                   S2R R5, SR_LANEID ;                                              /* 0x0000000000057919 */
-                                                                                                    /* 0x000e220000000000 */
-        /*0030*/                   MOV R26, 0x2 ;                                                   /* 0x00000002001a7802 */
-                                                                                                    /* 0x000fe20000000f00 */
-        /*0040*/                   IMAD.MOV.U32 R22, RZ, RZ, 0x10 ;                                 /* 0x00000010ff167424 */
-                                                                                                    /* 0x000fe400078e00ff */
-        /*0050*/                   S2R R0, SR_TID.X ;                                               /* 0x0000000000007919 */
-                                                                                                    /* 0x000e680000002100 */
-        /*0060*/                   S2R R3, SR_CTAID.X ;                                             /* 0x0000000000037919 */
-                                                                                                    /* 0x000e620000002500 */
-        /*0070*/                   SHF.R.U32.HI R2, RZ, 0x2, R5.reuse ;                             /* 0x00000002ff027819 */
-                                                                                                    /* 0x101fe40000011605 */
-        /*0080*/                   SHF.R.U32.HI R4, RZ, 0x4, R5 ;                                   /* 0x00000004ff047819 */
-                                                                                                    /* 0x000fc40000011605 */
-        /*0090*/                   LOP3.LUT R2, R2, 0x3, RZ, 0xc0, !PT ;                            /* 0x0000000302027812 */
-                                                                                                    /* 0x000fe400078ec0ff */
-        /*00a0*/                   LOP3.LUT R6, R4, 0x1, RZ, 0xc0, !PT ;                            /* 0x0000000104067812 */
-                                                                                                    /* 0x000fe200078ec0ff */
-        /*00b0*/                   IMAD R0, R3, c[0x0][0x0], R0 ;                                   /* 0x0000000003007a24 */
-                                                                                                    /* 0x002fe200078e0200 */
-        /*00c0*/                   LOP3.LUT R3, R5, 0x3, RZ, 0xc0, !PT ;                            /* 0x0000000305037812 */
-                                                                                                    /* 0x000fe200078ec0ff */
-        /*00d0*/                   IMAD.SHL.U32 R8, R2, 0x8, RZ ;                                   /* 0x0000000802087824 */
-                                                                                                    /* 0x000fe200078e00ff */
-        /*00e0*/                   SHF.R.U32.HI R2, RZ, 0x1, R2 ;                                   /* 0x00000001ff027819 */
-                                                                                                    /* 0x000fe40000011602 */
-        /*00f0*/                   SHF.L.U32 R27, R0, 0xc, RZ ;                                     /* 0x0000000c001b7819 */
-                                                                                                    /* 0x000fe400000006ff */
-        /*0100*/                   LOP3.LUT R5, R8, 0x8, R3.reuse, 0xe2, !PT ;                      /* 0x0000000808057812 */
-                                                                                                    /* 0x100fe200078ee203 */
-        /*0110*/                   IMAD R7, R2, 0x8, R3 ;                                           /* 0x0000000802077824 */
-                                                                                                    /* 0x000fc600078e0203 */
-        /*0120*/                   LEA R3, R6.reuse, R5, 0x2 ;                                      /* 0x0000000506037211 */
-                                                                                                    /* 0x040fe200078e10ff */
-        /*0130*/                   IMAD R7, R6, 0x4, R7 ;                                           /* 0x0000000406077824 */
-                                                                                                    /* 0x000fe400078e0207 */
-        /*0140*/                   IMAD.WIDE.U32 R4, R27, R26, c[0x0][0x170] ;                      /* 0x00005c001b047625 */
-                                                                                                    /* 0x000fe200078e001a */
-        /*0150*/                   SHF.L.U32 R21, R3, 0x1, RZ ;                                     /* 0x0000000103157819 */
-                                                                                                    /* 0x000fe400000006ff */
-        /*0160*/                   SHF.L.U32 R7, R7, 0x1, RZ ;                                      /* 0x0000000107077819 */
-                                                                                                    /* 0x000fca00000006ff */
-        /*0170*/                   IMAD.WIDE.U32 R20, R21, 0x10, R4 ;                               /* 0x0000001015147825 */
-                                                                                                    /* 0x000fc800078e0004 */
-        /*0180*/                   IMAD.WIDE.U32 R22, R7, R22, c[0x0][0x178] ;                      /* 0x00005e0007167625 */
-                                                                                                    /* 0x000fc800078e0016 */
-        /*0190*/                   LDG.E.128.SYS R12, [R20] ;                                       /* 0x00000000140c7381 */
-                                                                                                    /* 0x00012800001eed00 */
-        /*01a0*/                   LDG.E.128.SYS R4, [R20+0x10] ;                                   /* 0x0000100014047381 */
-                                                                                                    /* 0x00012800001eed00 */
-        /*01b0*/                   LDG.E.128.SYS R16, [R22] ;                                       /* 0x0000000016107381 */
-                                                                                                    /* 0x00012800001eed00 */
-        /*01c0*/                   LDG.E.128.SYS R8, [R22+0x10] ;                                   /* 0x0000100016087381 */
-                                                                                                    /* 0x00012200001eed00 */
-        /*01d0*/                   IMAD.WIDE.U32 R26, R27, R26, c[0x0][0x180] ;                     /* 0x000060001b1a7625 */
-                                                                                                    /* 0x000fc600078e001a */
-        /*01e0*/                   NOP ;                                                            /* 0x0000000000007918 */
-                                                                                                    /* 0x000fe20000000000 */
-        /*01f0*/                   BAR.SYNC 0x0 ;                                                   /* 0x0000000000007b1d */
-                                                                                                    /* 0x000fea0000000000 */
-        /*0200*/                   CS2R R24, SR_CLOCKLO ;                                           /* 0x0000000000187805 */
-                                                                                                    /* 0x000fd00000015000 */
-        /*0210*/                   NOP ;                                                            /* 0x0000000000007918 */
-                                                                                                    /* 0x000fe20000000000 */
-        /*0220*/                   BAR.SYNC 0x0 ;                                                   /* 0x0000000000007b1d */
-                                                                                                    /* 0x000fea0000000000 */
-        /*0230*/                   CS2R R20, SRZ ;                                                  /* 0x0000000000147805 */
-                                                                                                    /* 0x001fe2000001ff00 */
-        /*0240*/                   CS2R R22, SRZ ;                                                  /* 0x0000000000167805 */
-                                                                                                    /* 0x000fca000001ff00 */
-        /*0250*/                   HMMA.884.F16.F16.STEP0 R20, R12.reuse.ROW, R16.reuse.COL, R20 ;  /* 0x000000100c147236 */
-                                                                                                    /* 0x0d0fe80000000414 */
-        /*0260*/                   HMMA.884.F16.F16.STEP1 R22, R12.ROW, R16.COL, R22 ;              /* 0x000000100c167236 */
-                                                                                                    /* 0x000f680000008416 */
-        /*0270*/                   HMMA.884.F16.F16.STEP0 R20, R14.reuse.ROW, R18.reuse.COL, R20 ;  /* 0x000000120e147236 */
-                                                                                                    /* 0x0e0fe80000000414 */
-        /*0280*/                   HMMA.884.F16.F16.STEP1 R22, R14.ROW, R18.COL, R22 ;              /* 0x000000120e167236 */
-                                                                                                    /* 0x000f680000008416 */
-        /*0290*/                   HMMA.884.F16.F16.STEP0 R20, R4.reuse.ROW, R8.reuse.COL, R20 ;    /* 0x0000000804147236 */
-                                                                                                    /* 0x0e0fe80000000414 */
-        /*02a0*/                   HMMA.884.F16.F16.STEP1 R22, R4.ROW, R8.COL, R22 ;                /* 0x0000000804167236 */
-                                                                                                    /* 0x000f680000008416 */
-        /*02b0*/                   HMMA.884.F16.F16.STEP0 R20, R6.reuse.ROW, R10.reuse.COL, R20 ;   /* 0x0000000a06147236 */
-                                                                                                    /* 0x0e0b680000000414 */
-        /*02c0*/                   HMMA.884.F16.F16.STEP1 R22, R6.ROW, R10.COL, R22 ;               /* 0x0000000a06167236 */
-                                                                                                    /* 0x000b5a0000008416 */
-        /*02d0*/                   CS2R R8, SR_CLOCKLO ;                                            /* 0x0000000000087805 */
-                                                                                                    /* 0x000fd00000015000 */
-        /*02e0*/                   MOV R5, RZ ;                                                     /* 0x000000ff00057202 */
-                                                                                                    /* 0x000fe20000000f00 */
-        /*02f0*/                   IMAD.SHL.U32 R4, R2, 0x8, RZ ;                                   /* 0x0000000802047824 */
-                                                                                                    /* 0x000fc400078e00ff */
-        /*0300*/                   IMAD.MOV.U32 R7, RZ, RZ, 0x8 ;                                   /* 0x00000008ff077424 */
-                                                                                                    /* 0x020fe400078e00ff */
-        /*0310*/                   IMAD.WIDE.U32 R4, R3, 0x10, R4 ;                                 /* 0x0000001003047825 */
-                                                                                                    /* 0x000fc800078e0004 */
-        /*0320*/                   IMAD.WIDE.U32 R2, R0, R7, c[0x0][0x160] ;                        /* 0x0000580000027625 */
-                                                                                                    /* 0x000fc600078e0007 */
-        /*0330*/                   LEA R10, P0, R4, R26, 0x1 ;                                      /* 0x0000001a040a7211 */
-                                                                                                    /* 0x000fe200078008ff */
-        /*0340*/                   IMAD.WIDE.U32 R6, R0, R7, c[0x0][0x168] ;                        /* 0x00005a0000067625 */
-                                                                                                    /* 0x000fc600078e0007 */
-        /*0350*/                   LEA.HI.X R11, R4, R27, R5, 0x1, P0 ;                             /* 0x0000001b040b7211 */
-                                                                                                    /* 0x000fd000000f0c05 */
-        /*0360*/                   STG.E.128.SYS [R10], R20 ;                                       /* 0x000000140a007386 */
-                                                                                                    /* 0x000fe8000010ed00 */
-        /*0370*/                   STG.E.64.SYS [R2], R24 ;                                         /* 0x0000001802007386 */
-                                                                                                    /* 0x000fe8000010eb00 */
-        /*0380*/                   STG.E.64.SYS [R6], R8 ;                                          /* 0x0000000806007386 */
-                                                                                                    /* 0x000fe2000010eb00 */
-        /*0390*/                   EXIT ;                                                           /* 0x000000000000794d */
-                                                                                                    /* 0x000fea0003800000 */
-        /*03a0*/                   BRA 0x3a0;                                                       /* 0xfffffff000007947 */
-                                                                                                    /* 0x000fc0000383ffff */
-        /*03b0*/                   NOP;                                                             /* 0x0000000000007918 */
-                                                                                                    /* 0x000fc00000000000 */
-        /*03c0*/                   NOP;                                                             /* 0x0000000000007918 */
-                                                                                                    /* 0x000fc00000000000 */
-        /*03d0*/                   NOP;                                                             /* 0x0000000000007918 */
-                                                                                                    /* 0x000fc00000000000 */
-        /*03e0*/                   NOP;                                                             /* 0x0000000000007918 */
-                                                                                                    /* 0x000fc00000000000 */
-        /*03f0*/                   NOP;                                                             /* 0x0000000000007918 */
-                                                                                                    /* 0x000fc00000000000 */
-		..........
-
-
-
-Fatbin ptx code:
-================
-arch = sm_70
-code version = [7,1]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-compressed
-
-Fatbin elf code:
-================
-arch = sm_75
-code version = [1,7]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-
-	code for sm_75
-		Function : _Z9max_flopsI6__halfEvPmS1_PT_S3_S3_j
-	.headerflags    @"EF_CUDA_SM75 EF_CUDA_PTX_SM(EF_CUDA_SM75)"
-        /*0000*/                   MOV R1, c[0x0][0x28] ;                                      /* 0x00000a0000017a02 */
-                                                                                               /* 0x000fd00000000f00 */
-        /*0010*/                   S2R R0, SR_TID.X ;                                          /* 0x0000000000007919 */
-                                                                                               /* 0x000e220000002100 */
-        /*0020*/                   MOV R9, 0x2 ;                                               /* 0x0000000200097802 */
-                                                                                               /* 0x000fe20000000f00 */
-        /*0030*/                   IMAD.MOV.U32 R7, RZ, RZ, 0x10 ;                             /* 0x00000010ff077424 */
-                                                                                               /* 0x000fe400078e00ff */
-        /*0040*/                   S2R R3, SR_CTAID.X ;                                        /* 0x0000000000037919 */
-                                                                                               /* 0x000e280000002500 */
-        /*0050*/                   S2R R5, SR_LANEID ;                                         /* 0x0000000000057919 */
-                                                                                               /* 0x000e620000000000 */
-        /*0060*/                   IMAD R0, R3, c[0x0][0x0], R0 ;                              /* 0x0000000003007a24 */
-                                                                                               /* 0x001fe200078e0200 */
-        /*0070*/                   MOV R3, RZ ;                                                /* 0x000000ff00037202 */
-                                                                                               /* 0x000fc40000000f00 */
-        /*0080*/                   LOP3.LUT R2, R5, 0x3, RZ, 0xc0, !PT ;                       /* 0x0000000305027812 */
-                                                                                               /* 0x002fe200078ec0ff */
-        /*0090*/                   IMAD.SHL.U32 R8, R0, 0x1000, RZ ;                           /* 0x0000100000087824 */
-                                                                                               /* 0x000fe200078e00ff */
-        /*00a0*/                   SHF.R.U32.HI R5, RZ, 0x2, R5 ;                              /* 0x00000002ff057819 */
-                                                                                               /* 0x000fca0000011605 */
-        /*00b0*/                   IMAD.WIDE.U32 R4, R5, 0x8, R2 ;                             /* 0x0000000805047825 */
-                                                                                               /* 0x000fc800078e0002 */
-        /*00c0*/                   IMAD.WIDE.U32 R2, R8, R9, c[0x0][0x170] ;                   /* 0x00005c0008027625 */
-                                                                                               /* 0x000fc600078e0009 */
-        /*00d0*/                   LEA R10, P1, R4, c[0x0][0x178], 0x2 ;                       /* 0x00005e00040a7a11 */
-                                                                                               /* 0x000fc800078210ff */
-        /*00e0*/                   LEA R2, P0, R4.reuse, R2, 0x2 ;                             /* 0x0000000204027211 */
-                                                                                               /* 0x040fe400078010ff */
-        /*00f0*/                   LEA.HI.X R11, R4.reuse, c[0x0][0x17c], R5.reuse, 0x2, P1 ;  /* 0x00005f00040b7a11 */
-                                                                                               /* 0x140fe400008f1405 */
-        /*0100*/                   LEA.HI.X R3, R4, R3, R5, 0x2, P0 ;                          /* 0x0000000304037211 */
-                                                                                               /* 0x000fc600000f1405 */
-        /*0110*/                   IMAD.WIDE.U32 R12, R7, 0x10, R10 ;                          /* 0x00000010070c7825 */
-                                                                                               /* 0x000fc600078e000a */
-        /*0120*/                   LDG.E.SYS R18, [R10] ;                                      /* 0x000000000a127381 */
-                                                                                               /* 0x00012200001ee900 */
-        /*0130*/                   IMAD.WIDE.U32 R6, R7, 0x10, R2 ;                            /* 0x0000001007067825 */
-                                                                                               /* 0x000fc600078e0002 */
-        /*0140*/                   LDG.E.SYS R19, [R10+0x10] ;                                 /* 0x000010000a137381 */
-                                                                                               /* 0x00012800001ee900 */
-        /*0150*/                   LDG.E.SYS R20, [R12] ;                                      /* 0x000000000c147381 */
-                                                                                               /* 0x00012800001ee900 */
-        /*0160*/                   LDG.E.SYS R21, [R12+0x10] ;                                 /* 0x000010000c157381 */
-                                                                                               /* 0x00012800001ee900 */
-        /*0170*/                   LDG.E.SYS R14, [R2] ;                                       /* 0x00000000020e7381 */
-                                                                                               /* 0x00012800001ee900 */
-        /*0180*/                   LDG.E.SYS R16, [R2+0x10] ;                                  /* 0x0000100002107381 */
-                                                                                               /* 0x00012800001ee900 */
-        /*0190*/                   LDG.E.SYS R15, [R6] ;                                       /* 0x00000000060f7381 */
-                                                                                               /* 0x00012800001ee900 */
-        /*01a0*/                   LDG.E.SYS R17, [R6+0x10] ;                                  /* 0x0000100006117381 */
-                                                                                               /* 0x00012200001ee900 */
-        /*01b0*/                   IMAD.WIDE.U32 R8, R8, R9, c[0x0][0x180] ;                   /* 0x0000600008087625 */
-                                                                                               /* 0x000fc600078e0009 */
-        /*01c0*/                   BAR.SYNC 0x0 ;                                              /* 0x0000000000007b1d */
-                                                                                               /* 0x000fea0000000000 */
-        /*01d0*/                   CS2R R10, SR_CLOCKLO ;                                      /* 0x00000000000a7805 */
-                                                                                               /* 0x001fd00000015000 */
-        /*01e0*/                   CS2R R2, SRZ ;                                              /* 0x0000000000027805 */
-                                                                                               /* 0x000fe2000001ff00 */
-        /*01f0*/                   CS2R R6, SRZ ;                                              /* 0x0000000000067805 */
-                                                                                               /* 0x000fe2000001ff00 */
-        /*0200*/                   BAR.SYNC 0x0 ;                                              /* 0x0000000000007b1d */
-                                                                                               /* 0x000fea0000000000 */
-        /*0210*/                   HMMA.1688.F16 R2, R14, R18, R2 ;                            /* 0x000000120e02723c */
-                                                                                               /* 0x010f700000000002 */
-        /*0220*/                   HMMA.1688.F16 R6, R14, R20, R6 ;                            /* 0x000000140e06723c */
-                                                                                               /* 0x000f700000000006 */
-        /*0230*/                   HMMA.1688.F16 R18, R16, R19, R2 ;                           /* 0x000000131012723c */
-                                                                                               /* 0x020b700000000002 */
-        /*0240*/                   HMMA.1688.F16 R20, R16, R21, R6 ;                           /* 0x000000151014723c */
-                                                                                               /* 0x000b5c0000000006 */
-        /*0250*/                   CS2R R12, SR_CLOCKLO ;                                      /* 0x00000000000c7805 */
-                                                                                               /* 0x000fd00000015000 */
-        /*0260*/                   LEA R2, P0, R4, R8, 0x2 ;                                   /* 0x0000000804027211 */
-                                                                                               /* 0x020fc800078010ff */
-        /*0270*/                   LEA.HI.X R3, R4, R9, R5, 0x2, P0 ;                          /* 0x0000000904037211 */
-                                                                                               /* 0x000fe200000f1405 */
-        /*0280*/                   IMAD.MOV.U32 R9, RZ, RZ, 0x8 ;                              /* 0x00000008ff097424 */
-                                                                                               /* 0x000fe200078e00ff */
-        /*0290*/                   MOV R5, 0x20 ;                                              /* 0x0000002000057802 */
-                                                                                               /* 0x000fc60000000f00 */
-        /*02a0*/                   IMAD.WIDE.U32 R6, R0, R9, c[0x0][0x160] ;                   /* 0x0000580000067625 */
-                                                                                               /* 0x000fc600078e0009 */
-        /*02b0*/                   STG.E.SYS [R2], R18 ;                                       /* 0x0000001202007386 */
-                                                                                               /* 0x000fe2000010e900 */
-        /*02c0*/                   IMAD.WIDE.U32 R4, R5, 0x8, R2 ;                             /* 0x0000000805047825 */
-                                                                                               /* 0x000fc800078e0002 */
-        /*02d0*/                   IMAD.WIDE.U32 R8, R0, R9, c[0x0][0x168] ;                   /* 0x00005a0000087625 */
-                                                                                               /* 0x000fc800078e0009 */
-        /*02e0*/                   STG.E.SYS [R4], R19 ;                                       /* 0x0000001304007386 */
-                                                                                               /* 0x000fe8000010e900 */
-        /*02f0*/                   STG.E.SYS [R2+0x10], R20 ;                                  /* 0x0000101402007386 */
-                                                                                               /* 0x000fe8000010e900 */
-        /*0300*/                   STG.E.SYS [R4+0x10], R21 ;                                  /* 0x0000101504007386 */
-                                                                                               /* 0x000fe8000010e900 */
-        /*0310*/                   STG.E.64.SYS [R6], R10 ;                                    /* 0x0000000a06007386 */
-                                                                                               /* 0x000fe8000010eb00 */
-        /*0320*/                   STG.E.64.SYS [R8], R12 ;                                    /* 0x0000000c08007386 */
-                                                                                               /* 0x000fe2000010eb00 */
-        /*0330*/                   EXIT ;                                                      /* 0x000000000000794d */
-                                                                                               /* 0x000fea0003800000 */
-        /*0340*/                   BRA 0x340;                                                  /* 0xfffffff000007947 */
-                                                                                               /* 0x000fc0000383ffff */
-        /*0350*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*0360*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*0370*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-		..........
-
-
-
-Fatbin ptx code:
-================
-arch = sm_75
-code version = [7,1]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-compressed
-
-Fatbin elf code:
-================
-arch = sm_80
-code version = [1,7]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-
-	code for sm_80
-		Function : _Z9max_flopsI6__halfEvPmS1_PT_S3_S3_j
-	.headerflags    @"EF_CUDA_SM80 EF_CUDA_PTX_SM(EF_CUDA_SM80)"
-        /*0000*/                   MOV R1, c[0x0][0x28] ;                                      /* 0x00000a0000017a02 */
-                                                                                               /* 0x000fce0000000f00 */
-        /*0010*/                   S2R R0, SR_TID.X ;                                          /* 0x0000000000007919 */
-                                                                                               /* 0x000e220000002100 */
-        /*0020*/                   MOV R17, 0x2 ;                                              /* 0x0000000200117802 */
-                                                                                               /* 0x000fe20000000f00 */
-        /*0030*/                   IMAD.MOV.U32 R15, RZ, RZ, 0x10 ;                            /* 0x00000010ff0f7424 */
-                                                                                               /* 0x000fe200078e00ff */
-        /*0040*/                   ULDC.64 UR4, c[0x0][0x118] ;                                /* 0x0000460000047ab9 */
-                                                                                               /* 0x000fe20000000a00 */
-        /*0050*/                   S2R R3, SR_CTAID.X ;                                        /* 0x0000000000037919 */
-                                                                                               /* 0x000e280000002500 */
-        /*0060*/                   S2R R5, SR_LANEID ;                                         /* 0x0000000000057919 */
-                                                                                               /* 0x000e620000000000 */
-        /*0070*/                   IMAD R0, R3, c[0x0][0x0], R0 ;                              /* 0x0000000003007a24 */
-                                                                                               /* 0x001fe200078e0200 */
-        /*0080*/                   MOV R3, RZ ;                                                /* 0x000000ff00037202 */
-                                                                                               /* 0x000fc40000000f00 */
-        /*0090*/                   LOP3.LUT R2, R5, 0x3, RZ, 0xc0, !PT ;                       /* 0x0000000305027812 */
-                                                                                               /* 0x002fe200078ec0ff */
-        /*00a0*/                   IMAD.SHL.U32 R16, R0, 0x1000, RZ ;                          /* 0x0000100000107824 */
-                                                                                               /* 0x000fe200078e00ff */
-        /*00b0*/                   SHF.R.U32.HI R5, RZ, 0x2, R5 ;                              /* 0x00000002ff057819 */
-                                                                                               /* 0x000fca0000011605 */
-        /*00c0*/                   IMAD.WIDE.U32 R2, R5, 0x8, R2 ;                             /* 0x0000000805027825 */
-                                                                                               /* 0x000fc800078e0002 */
-        /*00d0*/                   IMAD.WIDE.U32 R4, R16, R17, c[0x0][0x170] ;                 /* 0x00005c0010047625 */
-                                                                                               /* 0x000fe200078e0011 */
-        /*00e0*/                   LEA R18, P1, R2, c[0x0][0x178], 0x2 ;                       /* 0x00005e0002127a11 */
-                                                                                               /* 0x000fc800078210ff */
-        /*00f0*/                   LEA R8, P0, R2.reuse, R4, 0x2 ;                             /* 0x0000000402087211 */
-                                                                                               /* 0x040fe400078010ff */
-        /*0100*/                   LEA.HI.X R19, R2.reuse, c[0x0][0x17c], R3.reuse, 0x2, P1 ;  /* 0x00005f0002137a11 */
-                                                                                               /* 0x140fe400008f1403 */
-        /*0110*/                   LEA.HI.X R9, R2, R5, R3, 0x2, P0 ;                          /* 0x0000000502097211 */
-                                                                                               /* 0x000fc600000f1403 */
-        /*0120*/                   IMAD.WIDE.U32 R20, R15.reuse, 0x10, R18 ;                   /* 0x000000100f147825 */
-                                                                                               /* 0x040fe200078e0012 */
-        /*0130*/                   LDG.E R10, [R18.64] ;                                       /* 0x00000004120a7981 */
-                                                                                               /* 0x000126000c1e1900 */
-        /*0140*/                   IMAD.WIDE.U32 R14, R15, 0x10, R8 ;                          /* 0x000000100f0e7825 */
-                                                                                               /* 0x000fe200078e0008 */
-        /*0150*/                   LDG.E R11, [R18.64+0x10] ;                                  /* 0x00001004120b7981 */
-                                                                                               /* 0x000128000c1e1900 */
-        /*0160*/                   LDG.E R12, [R20.64] ;                                       /* 0x00000004140c7981 */
-                                                                                               /* 0x000128000c1e1900 */
-        /*0170*/                   LDG.E R13, [R20.64+0x10] ;                                  /* 0x00001004140d7981 */
-                                                                                               /* 0x000128000c1e1900 */
-        /*0180*/                   LDG.E R4, [R8.64] ;                                         /* 0x0000000408047981 */
-                                                                                               /* 0x000128000c1e1900 */
-        /*0190*/                   LDG.E R6, [R8.64+0x10] ;                                    /* 0x0000100408067981 */
-                                                                                               /* 0x000128000c1e1900 */
-        /*01a0*/                   LDG.E R5, [R14.64] ;                                        /* 0x000000040e057981 */
-                                                                                               /* 0x000128000c1e1900 */
-        /*01b0*/                   LDG.E R7, [R14.64+0x10] ;                                   /* 0x000010040e077981 */
-                                                                                               /* 0x000122000c1e1900 */
-        /*01c0*/                   IMAD.WIDE.U32 R16, R16, R17, c[0x0][0x180] ;                /* 0x0000600010107625 */
-                                                                                               /* 0x000fc600078e0011 */
-        /*01d0*/                   BAR.SYNC 0x0 ;                                              /* 0x0000000000007b1d */
-                                                                                               /* 0x000fec0000000000 */
-        /*01e0*/                   CS2R R8, SR_CLOCKLO ;                                       /* 0x0000000000087805 */
-                                                                                               /* 0x001fce0000015000 */
-        /*01f0*/                   CS2R R14, SRZ ;                                             /* 0x00000000000e7805 */
-                                                                                               /* 0x000fe2000001ff00 */
-        /*0200*/                   CS2R R18, SRZ ;                                             /* 0x0000000000127805 */
-                                                                                               /* 0x000fe2000001ff00 */
-        /*0210*/                   BAR.SYNC 0x0 ;                                              /* 0x0000000000007b1d */
-                                                                                               /* 0x000fec0000000000 */
-        /*0220*/                   HMMA.16816.F16 R10, R4.reuse, R10, R14 ;                    /* 0x0000000a040a723c */
-                                                                                               /* 0x050b70000000080e */
-        /*0230*/                   HMMA.16816.F16 R18, R4, R12, R18 ;                          /* 0x0000000c0412723c */
-                                                                                               /* 0x000b5e0000000812 */
-        /*0240*/                   NOP ;                                                       /* 0x0000000000007918 */
-                                                                                               /* 0x000fd20000000000 */
-        /*0250*/                   CS2R R14, SR_CLOCKLO ;                                      /* 0x00000000000e7805 */
-                                                                                               /* 0x020fce0000015000 */
-        /*0260*/                   LEA R4, P0, R2, R16, 0x2 ;                                  /* 0x0000001002047211 */
-                                                                                               /* 0x000fe200078010ff */
-        /*0270*/                   IMAD.MOV.U32 R13, RZ, RZ, 0x8 ;                             /* 0x00000008ff0d7424 */
-                                                                                               /* 0x000fc600078e00ff */
-        /*0280*/                   LEA.HI.X R5, R2, R17, R3, 0x2, P0 ;                         /* 0x0000001102057211 */
-                                                                                               /* 0x000fe200000f1403 */
-        /*0290*/                   IMAD.WIDE.U32 R6, R0, R13, c[0x0][0x160] ;                  /* 0x0000580000067625 */
-                                                                                               /* 0x000fe200078e000d */
-        /*02a0*/                   MOV R3, 0x20 ;                                              /* 0x0000002000037802 */
-                                                                                               /* 0x000fc60000000f00 */
-        /*02b0*/                   STG.E [R4.64], R10 ;                                        /* 0x0000000a04007986 */
-                                                                                               /* 0x000fe2000c101904 */
-        /*02c0*/                   IMAD.WIDE.U32 R12, R0, R13, c[0x0][0x168] ;                 /* 0x00005a00000c7625 */
-                                                                                               /* 0x000fc800078e000d */
-        /*02d0*/                   IMAD.WIDE.U32 R2, R3, 0x8, R4 ;                             /* 0x0000000803027825 */
-                                                                                               /* 0x000fca00078e0004 */
-        /*02e0*/                   STG.E [R2.64], R11 ;                                        /* 0x0000000b02007986 */
-                                                                                               /* 0x000fe8000c101904 */
-        /*02f0*/                   STG.E [R4.64+0x10], R18 ;                                   /* 0x0000101204007986 */
-                                                                                               /* 0x000fe8000c101904 */
-        /*0300*/                   STG.E [R2.64+0x10], R19 ;                                   /* 0x0000101302007986 */
-                                                                                               /* 0x000fe8000c101904 */
-        /*0310*/                   STG.E.64 [R6.64], R8 ;                                      /* 0x0000000806007986 */
-                                                                                               /* 0x000fe8000c101b04 */
-        /*0320*/                   STG.E.64 [R12.64], R14 ;                                    /* 0x0000000e0c007986 */
-                                                                                               /* 0x000fe2000c101b04 */
-        /*0330*/                   EXIT ;                                                      /* 0x000000000000794d */
-                                                                                               /* 0x000fea0003800000 */
-        /*0340*/                   BRA 0x340;                                                  /* 0xfffffff000007947 */
-                                                                                               /* 0x000fc0000383ffff */
-        /*0350*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*0360*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*0370*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*0380*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*0390*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*03a0*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*03b0*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*03c0*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*03d0*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*03e0*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*03f0*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-		..........
-
-
-
-Fatbin ptx code:
-================
-arch = sm_80
-code version = [7,1]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-compressed
-
-Fatbin elf code:
-================
-arch = sm_86
-code version = [1,7]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-
-	code for sm_86
-		Function : _Z9max_flopsI6__halfEvPmS1_PT_S3_S3_j
-	.headerflags    @"EF_CUDA_SM86 EF_CUDA_PTX_SM(EF_CUDA_SM86)"
-        /*0000*/                   MOV R1, c[0x0][0x28] ;                                      /* 0x00000a0000017a02 */
-                                                                                               /* 0x000fce0000000f00 */
-        /*0010*/                   S2R R0, SR_TID.X ;                                          /* 0x0000000000007919 */
-                                                                                               /* 0x000e220000002100 */
-        /*0020*/                   MOV R13, 0x2 ;                                              /* 0x00000002000d7802 */
-                                                                                               /* 0x000fe20000000f00 */
-        /*0030*/                   IMAD.MOV.U32 R11, RZ, RZ, 0x10 ;                            /* 0x00000010ff0b7424 */
-                                                                                               /* 0x000fe200078e00ff */
-        /*0040*/                   ULDC.64 UR4, c[0x0][0x118] ;                                /* 0x0000460000047ab9 */
-                                                                                               /* 0x000fe20000000a00 */
-        /*0050*/                   S2R R3, SR_CTAID.X ;                                        /* 0x0000000000037919 */
-                                                                                               /* 0x000e280000002500 */
-        /*0060*/                   S2R R5, SR_LANEID ;                                         /* 0x0000000000057919 */
-                                                                                               /* 0x000e620000000000 */
-        /*0070*/                   IMAD R0, R3, c[0x0][0x0], R0 ;                              /* 0x0000000003007a24 */
-                                                                                               /* 0x001fe200078e0200 */
-        /*0080*/                   MOV R3, RZ ;                                                /* 0x000000ff00037202 */
-                                                                                               /* 0x000fc40000000f00 */
-        /*0090*/                   LOP3.LUT R2, R5, 0x3, RZ, 0xc0, !PT ;                       /* 0x0000000305027812 */
-                                                                                               /* 0x002fe200078ec0ff */
-        /*00a0*/                   IMAD.SHL.U32 R12, R0, 0x1000, RZ ;                          /* 0x00001000000c7824 */
-                                                                                               /* 0x000fe200078e00ff */
-        /*00b0*/                   SHF.R.U32.HI R5, RZ, 0x2, R5 ;                              /* 0x00000002ff057819 */
-                                                                                               /* 0x000fca0000011605 */
-        /*00c0*/                   IMAD.WIDE.U32 R2, R5, 0x8, R2 ;                             /* 0x0000000805027825 */
-                                                                                               /* 0x000fc800078e0002 */
-        /*00d0*/                   IMAD.WIDE.U32 R4, R12, R13, c[0x0][0x170] ;                 /* 0x00005c000c047625 */
-                                                                                               /* 0x000fe200078e000d */
-        /*00e0*/                   LEA R14, P1, R2, c[0x0][0x178], 0x2 ;                       /* 0x00005e00020e7a11 */
-                                                                                               /* 0x000fc800078210ff */
-        /*00f0*/                   LEA R8, P0, R2.reuse, R4, 0x2 ;                             /* 0x0000000402087211 */
-                                                                                               /* 0x040fe400078010ff */
-        /*0100*/                   LEA.HI.X R15, R2.reuse, c[0x0][0x17c], R3.reuse, 0x2, P1 ;  /* 0x00005f00020f7a11 */
-                                                                                               /* 0x140fe400008f1403 */
-        /*0110*/                   LEA.HI.X R9, R2, R5, R3, 0x2, P0 ;                          /* 0x0000000502097211 */
-                                                                                               /* 0x000fc600000f1403 */
-        /*0120*/                   IMAD.WIDE.U32 R16, R11.reuse, 0x10, R14 ;                   /* 0x000000100b107825 */
-                                                                                               /* 0x040fe200078e000e */
-        /*0130*/                   LDG.E R18, [R14.64] ;                                       /* 0x000000040e127981 */
-                                                                                               /* 0x000126000c1e1900 */
-        /*0140*/                   IMAD.WIDE.U32 R10, R11, 0x10, R8 ;                          /* 0x000000100b0a7825 */
-                                                                                               /* 0x000fe200078e0008 */
-        /*0150*/                   LDG.E R19, [R14.64+0x10] ;                                  /* 0x000010040e137981 */
-                                                                                               /* 0x000128000c1e1900 */
-        /*0160*/                   LDG.E R20, [R16.64] ;                                       /* 0x0000000410147981 */
-                                                                                               /* 0x000128000c1e1900 */
-        /*0170*/                   LDG.E R21, [R16.64+0x10] ;                                  /* 0x0000100410157981 */
-                                                                                               /* 0x000128000c1e1900 */
-        /*0180*/                   LDG.E R4, [R8.64] ;                                         /* 0x0000000408047981 */
-                                                                                               /* 0x000128000c1e1900 */
-        /*0190*/                   LDG.E R6, [R8.64+0x10] ;                                    /* 0x0000100408067981 */
-                                                                                               /* 0x000128000c1e1900 */
-        /*01a0*/                   LDG.E R5, [R10.64] ;                                        /* 0x000000040a057981 */
-                                                                                               /* 0x000128000c1e1900 */
-        /*01b0*/                   LDG.E R7, [R10.64+0x10] ;                                   /* 0x000010040a077981 */
-                                                                                               /* 0x000122000c1e1900 */
-        /*01c0*/                   IMAD.WIDE.U32 R12, R12, R13, c[0x0][0x180] ;                /* 0x000060000c0c7625 */
-                                                                                               /* 0x000fc600078e000d */
-        /*01d0*/                   BAR.SYNC 0x0 ;                                              /* 0x0000000000007b1d */
-                                                                                               /* 0x000fec0000000000 */
-        /*01e0*/                   CS2R R8, SR_CLOCKLO ;                                       /* 0x0000000000087805 */
-                                                                                               /* 0x001fce0000015000 */
-        /*01f0*/                   CS2R R10, SRZ ;                                             /* 0x00000000000a7805 */
-                                                                                               /* 0x000fe2000001ff00 */
-        /*0200*/                   CS2R R14, SRZ ;                                             /* 0x00000000000e7805 */
-                                                                                               /* 0x000fe2000001ff00 */
-        /*0210*/                   BAR.SYNC 0x0 ;                                              /* 0x0000000000007b1d */
-                                                                                               /* 0x000fec0000000000 */
-        /*0220*/                   HMMA.16816.F16 R18, R4.reuse, R18, R10 ;                    /* 0x000000120412723c */
-                                                                                               /* 0x050b70000000080a */
-        /*0230*/                   HMMA.16816.F16 R20, R4, R20, R14 ;                          /* 0x000000140414723c */
-                                                                                               /* 0x000b5e000000080e */
-        /*0240*/                   NOP ;                                                       /* 0x0000000000007918 */
-                                                                                               /* 0x000fd20000000000 */
-        /*0250*/                   CS2R R14, SR_CLOCKLO ;                                      /* 0x00000000000e7805 */
-                                                                                               /* 0x020fce0000015000 */
-        /*0260*/                   LEA R4, P0, R2, R12, 0x2 ;                                  /* 0x0000000c02047211 */
-                                                                                               /* 0x000fe200078010ff */
-        /*0270*/                   IMAD.MOV.U32 R11, RZ, RZ, 0x8 ;                             /* 0x00000008ff0b7424 */
-                                                                                               /* 0x000fc600078e00ff */
-        /*0280*/                   LEA.HI.X R5, R2, R13, R3, 0x2, P0 ;                         /* 0x0000000d02057211 */
-                                                                                               /* 0x000fe200000f1403 */
-        /*0290*/                   IMAD.WIDE.U32 R6, R0, R11, c[0x0][0x160] ;                  /* 0x0000580000067625 */
-                                                                                               /* 0x000fe200078e000b */
-        /*02a0*/                   MOV R3, 0x20 ;                                              /* 0x0000002000037802 */
-                                                                                               /* 0x000fc60000000f00 */
-        /*02b0*/                   STG.E [R4.64], R18 ;                                        /* 0x0000001204007986 */
-                                                                                               /* 0x000fe2000c101904 */
-        /*02c0*/                   IMAD.WIDE.U32 R10, R0, R11, c[0x0][0x168] ;                 /* 0x00005a00000a7625 */
-                                                                                               /* 0x000fc800078e000b */
-        /*02d0*/                   IMAD.WIDE.U32 R2, R3, 0x8, R4 ;                             /* 0x0000000803027825 */
-                                                                                               /* 0x000fca00078e0004 */
-        /*02e0*/                   STG.E [R2.64], R19 ;                                        /* 0x0000001302007986 */
-                                                                                               /* 0x000fe8000c101904 */
-        /*02f0*/                   STG.E [R4.64+0x10], R20 ;                                   /* 0x0000101404007986 */
-                                                                                               /* 0x000fe8000c101904 */
-        /*0300*/                   STG.E [R2.64+0x10], R21 ;                                   /* 0x0000101502007986 */
-                                                                                               /* 0x000fe8000c101904 */
-        /*0310*/                   STG.E.64 [R6.64], R8 ;                                      /* 0x0000000806007986 */
-                                                                                               /* 0x000fe8000c101b04 */
-        /*0320*/                   STG.E.64 [R10.64], R14 ;                                    /* 0x0000000e0a007986 */
-                                                                                               /* 0x000fe2000c101b04 */
-        /*0330*/                   EXIT ;                                                      /* 0x000000000000794d */
-                                                                                               /* 0x000fea0003800000 */
-        /*0340*/                   BRA 0x340;                                                  /* 0xfffffff000007947 */
-                                                                                               /* 0x000fc0000383ffff */
-        /*0350*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*0360*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*0370*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*0380*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*0390*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*03a0*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*03b0*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*03c0*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*03d0*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*03e0*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-        /*03f0*/                   NOP;                                                        /* 0x0000000000007918 */
-                                                                                               /* 0x000fc00000000000 */
-		..........
-
-
-
-Fatbin ptx code:
-================
-arch = sm_86
-code version = [7,1]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-compressed
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/sassfloat.txt b/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/sassfloat.txt
deleted file mode 100644
index b453784a6..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/sassfloat.txt
+++ /dev/null
@@ -1,502 +0,0 @@
-
-Fatbin elf code:
-================
-arch = sm_70
-code version = [1,7]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-
-	code for sm_70
-
-Fatbin elf code:
-================
-arch = sm_75
-code version = [1,7]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-
-	code for sm_75
-
-Fatbin elf code:
-================
-arch = sm_80
-code version = [1,7]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-
-	code for sm_80
-
-Fatbin elf code:
-================
-arch = sm_70
-code version = [1,7]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-
-	code for sm_70
-		Function : _Z9max_flopsI6__halfEvPmS1_PT_S3_Pfj
-	.headerflags    @"EF_CUDA_SM70 EF_CUDA_PTX_SM(EF_CUDA_SM70)"
-        /*0000*/                   IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;                          /* 0x00000a00ff017624 */
-                                                                                                    /* 0x000fd000078e00ff */
-        /*0010*/              @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;                                    /* 0x000000fffffff389 */
-                                                                                                    /* 0x000fe200000e00ff */
-        /*0020*/                   S2R R29, SR_LANEID ;                                             /* 0x00000000001d7919 */
-                                                                                                    /* 0x000e220000000000 */
-        /*0030*/                   IMAD.MOV.U32 R11, RZ, RZ, 0x2 ;                                  /* 0x00000002ff0b7424 */
-                                                                                                    /* 0x000fe400078e00ff */
-        /*0040*/                   IMAD.MOV.U32 R8, RZ, RZ, 0x10 ;                                  /* 0x00000010ff087424 */
-                                                                                                    /* 0x000fe200078e00ff */
-        /*0050*/                   S2R R28, SR_TID.X ;                                              /* 0x00000000001c7919 */
-                                                                                                    /* 0x000e680000002100 */
-        /*0060*/                   S2R R5, SR_CTAID.X ;                                             /* 0x0000000000057919 */
-                                                                                                    /* 0x000e620000002500 */
-        /*0070*/                   SHF.R.U32.HI R3, RZ, 0x2, R29.reuse ;                            /* 0x00000002ff037819 */
-                                                                                                    /* 0x101fe4000001161d */
-        /*0080*/                   SHF.R.U32.HI R2, RZ, 0x4, R29 ;                                  /* 0x00000004ff027819 */
-                                                                                                    /* 0x000fc4000001161d */
-        /*0090*/                   LOP3.LUT R3, R3, 0x3, RZ, 0xc0, !PT ;                            /* 0x0000000303037812 */
-                                                                                                    /* 0x000fe400078ec0ff */
-        /*00a0*/                   LOP3.LUT R29, R29, 0x3, RZ, 0xc0, !PT ;                          /* 0x000000031d1d7812 */
-                                                                                                    /* 0x000fe400078ec0ff */
-        /*00b0*/                   SHF.R.U32.HI R0, RZ, 0x1, R3 ;                                   /* 0x00000001ff007819 */
-                                                                                                    /* 0x000fe20000011603 */
-        /*00c0*/                   IMAD R28, R5, c[0x0][0x0], R28 ;                                 /* 0x00000000051c7a24 */
-                                                                                                    /* 0x002fe200078e021c */
-        /*00d0*/                   SHF.L.U32 R4, R3, 0x3, RZ ;                                      /* 0x0000000303047819 */
-                                                                                                    /* 0x000fe400000006ff */
-        /*00e0*/                   LOP3.LUT R6, R2, 0x1, RZ, 0xc0, !PT ;                            /* 0x0000000102067812 */
-                                                                                                    /* 0x000fe200078ec0ff */
-        /*00f0*/                   IMAD R9, R0, 0x8, R29.reuse ;                                    /* 0x0000000800097824 */
-                                                                                                    /* 0x100fe200078e021d */
-        /*0100*/                   LOP3.LUT R5, R4, 0x8, R29, 0xe2, !PT ;                           /* 0x0000000804057812 */
-                                                                                                    /* 0x000fe200078ee21d */
-        /*0110*/                   IMAD.SHL.U32 R32, R28, 0x1000, RZ ;                              /* 0x000010001c207824 */
-                                                                                                    /* 0x000fc400078e00ff */
-        /*0120*/                   IMAD R9, R6.reuse, 0x4, R9 ;                                     /* 0x0000000406097824 */
-                                                                                                    /* 0x040fe200078e0209 */
-        /*0130*/                   LEA R7, R6, R5, 0x2 ;                                            /* 0x0000000506077211 */
-                                                                                                    /* 0x000fe200078e10ff */
-        /*0140*/                   IMAD.WIDE.U32 R4, R32, R11, c[0x0][0x170] ;                      /* 0x00005c0020047625 */
-                                                                                                    /* 0x000fc600078e000b */
-        /*0150*/                   SHF.L.U32 R7, R7, 0x1, RZ ;                                      /* 0x0000000107077819 */
-                                                                                                    /* 0x000fe400000006ff */
-        /*0160*/                   SHF.L.U32 R9, R9, 0x1, RZ ;                                      /* 0x0000000109097819 */
-                                                                                                    /* 0x000fc600000006ff */
-        /*0170*/                   IMAD.WIDE.U32 R6, R7, 0x10, R4 ;                                 /* 0x0000001007067825 */
-                                                                                                    /* 0x000fc800078e0004 */
-        /*0180*/                   IMAD.WIDE.U32 R4, R9, R8, c[0x0][0x178] ;                        /* 0x00005e0009047625 */
-                                                                                                    /* 0x000fcc00078e0008 */
-        /*0190*/                   LDG.E.128.SYS R24, [R6] ;                                        /* 0x0000000006187381 */
-                                                                                                    /* 0x00012800001eed00 */
-        /*01a0*/                   LDG.E.128.SYS R20, [R6+0x10] ;                                   /* 0x0000100006147381 */
-                                                                                                    /* 0x00012800001eed00 */
-        /*01b0*/                   LDG.E.128.SYS R16, [R4] ;                                        /* 0x0000000004107381 */
-                                                                                                    /* 0x00012800001eed00 */
-        /*01c0*/                   LDG.E.128.SYS R12, [R4+0x10] ;                                   /* 0x00001000040c7381 */
-                                                                                                    /* 0x00012200001eed00 */
-        /*01d0*/                   IMAD.MOV.U32 R33, RZ, RZ, 0x4 ;                                  /* 0x00000004ff217424 */
-                                                                                                    /* 0x000fc800078e00ff */
-        /*01e0*/                   IMAD.WIDE.U32 R32, R32, R33, c[0x0][0x180] ;                     /* 0x0000600020207625 */
-                                                                                                    /* 0x000fe200078e0021 */
-        /*01f0*/                   NOP ;                                                            /* 0x0000000000007918 */
-                                                                                                    /* 0x000fe20000000000 */
-        /*0200*/                   BAR.SYNC 0x0 ;                                                   /* 0x0000000000007b1d */
-                                                                                                    /* 0x000fee0000000000 */
-        /*0210*/                   CS2R R30, SR_CLOCKLO ;                                           /* 0x00000000001e7805 */
-                                                                                                    /* 0x000fd00000015000 */
-        /*0220*/                   CS2R R8, SRZ ;                                                   /* 0x0000000000087805 */
-                                                                                                    /* 0x000fe2000001ff00 */
-        /*0230*/                   CS2R R10, SRZ ;                                                  /* 0x00000000000a7805 */
-                                                                                                    /* 0x000fe2000001ff00 */
-        /*0240*/                   NOP ;                                                            /* 0x0000000000007918 */
-                                                                                                    /* 0x000fe20000000000 */
-        /*0250*/                   BAR.SYNC 0x0 ;                                                   /* 0x0000000000007b1d */
-                                                                                                    /* 0x000fea0000000000 */
-        /*0260*/                   CS2R R4, SRZ ;                                                   /* 0x0000000000047805 */
-                                                                                                    /* 0x001fe2000001ff00 */
-        /*0270*/                   CS2R R6, SRZ ;                                                   /* 0x0000000000067805 */
-                                                                                                    /* 0x000fe2000001ff00 */
-        /*0280*/                   HMMA.884.F32.F32.STEP0 R8, R24.reuse.ROW, R16.reuse.COL, R8 ;    /* 0x0000001018087236 */
-                                                                                                    /* 0x0d0fe40000005408 */
-        /*0290*/                   HMMA.884.F32.F32.STEP1 R10, R24.reuse.ROW, R16.reuse.COL, R10 ;  /* 0x00000010180a7236 */
-                                                                                                    /* 0x0c0fe4000000d40a */
-        /*02a0*/                   HMMA.884.F32.F32.STEP2 R4, R24.reuse.ROW, R16.reuse.COL, R4 ;    /* 0x0000001018047236 */
-                                                                                                    /* 0x0c0fe40000015404 */
-        /*02b0*/                   HMMA.884.F32.F32.STEP3 R6, R24.ROW, R16.COL, R6 ;                /* 0x0000001018067236 */
-                                                                                                    /* 0x000f64000001d406 */
-        /*02c0*/                   HMMA.884.F32.F32.STEP0 R8, R26.reuse.ROW, R18.reuse.COL, R8 ;    /* 0x000000121a087236 */
-                                                                                                    /* 0x0e0fe40000005408 */
-        /*02d0*/                   HMMA.884.F32.F32.STEP1 R10, R26.reuse.ROW, R18.reuse.COL, R10 ;  /* 0x000000121a0a7236 */
-                                                                                                    /* 0x0c0fe4000000d40a */
-        /*02e0*/                   HMMA.884.F32.F32.STEP2 R4, R26.reuse.ROW, R18.reuse.COL, R4 ;    /* 0x000000121a047236 */
-                                                                                                    /* 0x0c0fe40000015404 */
-        /*02f0*/                   HMMA.884.F32.F32.STEP3 R6, R26.ROW, R18.COL, R6 ;                /* 0x000000121a067236 */
-                                                                                                    /* 0x000f64000001d406 */
-        /*0300*/                   HMMA.884.F32.F32.STEP0 R8, R20.reuse.ROW, R12.reuse.COL, R8 ;    /* 0x0000000c14087236 */
-                                                                                                    /* 0x0e0fe40000005408 */
-        /*0310*/                   HMMA.884.F32.F32.STEP1 R10, R20.reuse.ROW, R12.reuse.COL, R10 ;  /* 0x0000000c140a7236 */
-                                                                                                    /* 0x0c0fe4000000d40a */
-        /*0320*/                   HMMA.884.F32.F32.STEP2 R4, R20.reuse.ROW, R12.reuse.COL, R4 ;    /* 0x0000000c14047236 */
-                                                                                                    /* 0x0c0fe40000015404 */
-        /*0330*/                   HMMA.884.F32.F32.STEP3 R6, R20.ROW, R12.COL, R6 ;                /* 0x0000000c14067236 */
-                                                                                                    /* 0x000f64000001d406 */
-        /*0340*/                   HMMA.884.F32.F32.STEP0 R8, R22.reuse.ROW, R14.reuse.COL, R8 ;    /* 0x0000000e16087236 */
-                                                                                                    /* 0x0e0b640000005408 */
-        /*0350*/                   HMMA.884.F32.F32.STEP1 R10, R22.reuse.ROW, R14.reuse.COL, R10 ;  /* 0x0000000e160a7236 */
-                                                                                                    /* 0x0c0b64000000d40a */
-        /*0360*/                   HMMA.884.F32.F32.STEP2 R4, R22.reuse.ROW, R14.reuse.COL, R4 ;    /* 0x0000000e16047236 */
-                                                                                                    /* 0x0c0b640000015404 */
-        /*0370*/                   HMMA.884.F32.F32.STEP3 R6, R22.ROW, R14.COL, R6 ;                /* 0x0000000e16067236 */
-                                                                                                    /* 0x000b74000001d406 */
-        /*0380*/                   CS2R R12, SR_CLOCKLO ;                                           /* 0x00000000000c7805 */
-                                                                                                    /* 0x000fd00000015000 */
-        /*0390*/                   SHF.L.U32 R2, R2, 0x2, RZ ;                                      /* 0x0000000202027819 */
-                                                                                                    /* 0x000fe200000006ff */
-        /*03a0*/                   IMAD.SHL.U32 R14, R3, 0x8, RZ ;                                  /* 0x00000008030e7824 */
-                                                                                                    /* 0x020fc600078e00ff */
-        /*03b0*/                   LOP3.LUT R29, R2, 0x4, R29, 0xe2, !PT ;                          /* 0x00000004021d7812 */
-                                                                                                    /* 0x000fc800078ee21d */
-        /*03c0*/                   LOP3.LUT R3, R29.reuse, 0x2, RZ, 0xc0, !PT ;                     /* 0x000000021d037812 */
-                                                                                                    /* 0x040fe400078ec0ff */
-        /*03d0*/                   LOP3.LUT R29, R29, 0x5, RZ, 0xc0, !PT ;                          /* 0x000000051d1d7812 */
-                                                                                                    /* 0x000fe400078ec0ff */
-        /*03e0*/                   LEA R2, R0, R3, 0x3 ;                                            /* 0x0000000300027211 */
-                                                                                                    /* 0x000fe200078e18ff */
-        /*03f0*/                   IMAD.MOV.U32 R3, RZ, RZ, RZ ;                                    /* 0x000000ffff037224 */
-                                                                                                    /* 0x000fe200078e00ff */
-        /*0400*/                   LOP3.LUT R29, R14, 0x8, R29, 0xe2, !PT ;                         /* 0x000000080e1d7812 */
-                                                                                                    /* 0x000fca00078ee21d */
-        /*0410*/                   IMAD.WIDE.U32 R2, R29, 0x10, R2 ;                                /* 0x000000101d027825 */
-                                                                                                    /* 0x000fe200078e0002 */
-        /*0420*/                   MOV R29, 0x8 ;                                                   /* 0x00000008001d7802 */
-                                                                                                    /* 0x000fca0000000f00 */
-        /*0430*/                   IMAD.WIDE.U32 R16, R28, R29, c[0x0][0x160] ;                     /* 0x000058001c107625 */
-                                                                                                    /* 0x000fe200078e001d */
-        /*0440*/                   LEA R14, P0, R2, R32, 0x2 ;                                      /* 0x00000020020e7211 */
-                                                                                                    /* 0x000fc600078010ff */
-        /*0450*/                   IMAD.WIDE.U32 R28, R28, R29, c[0x0][0x168] ;                     /* 0x00005a001c1c7625 */
-                                                                                                    /* 0x000fe200078e001d */
-        /*0460*/                   LEA.HI.X R15, R2, R33, R3, 0x2, P0 ;                             /* 0x00000021020f7211 */
-                                                                                                    /* 0x000fd000000f1403 */
-        /*0470*/                   STG.E.64.SYS [R14], R8 ;                                         /* 0x000000080e007386 */
-                                                                                                    /* 0x000fe8000010eb00 */
-        /*0480*/                   STG.E.64.SYS [R14+0x10], R4 ;                                    /* 0x000010040e007386 */
-                                                                                                    /* 0x000fe8000010eb00 */
-        /*0490*/                   STG.E.64.SYS [R14+0x80], R10 ;                                   /* 0x0000800a0e007386 */
-                                                                                                    /* 0x000fe8000010eb00 */
-        /*04a0*/                   STG.E.64.SYS [R14+0x90], R6 ;                                    /* 0x000090060e007386 */
-                                                                                                    /* 0x000fe8000010eb00 */
-        /*04b0*/                   STG.E.64.SYS [R16], R30 ;                                        /* 0x0000001e10007386 */
-                                                                                                    /* 0x000fe8000010eb00 */
-        /*04c0*/                   STG.E.64.SYS [R28], R12 ;                                        /* 0x0000000c1c007386 */
-                                                                                                    /* 0x000fe2000010eb00 */
-        /*04d0*/                   EXIT ;                                                           /* 0x000000000000794d */
-                                                                                                    /* 0x000fea0003800000 */
-        /*04e0*/                   BRA 0x4e0;                                                       /* 0xfffffff000007947 */
-                                                                                                    /* 0x000fc0000383ffff */
-        /*04f0*/                   NOP;                                                             /* 0x0000000000007918 */
-                                                                                                    /* 0x000fc00000000000 */
-		...............................................
-
-
-
-Fatbin ptx code:
-================
-arch = sm_70
-code version = [7,0]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-compressed
-
-Fatbin elf code:
-================
-arch = sm_75
-code version = [1,7]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-
-	code for sm_75
-		Function : _Z9max_flopsI6__halfEvPmS1_PT_S3_Pfj
-	.headerflags    @"EF_CUDA_SM75 EF_CUDA_PTX_SM(EF_CUDA_SM75)"
-        /*0000*/                   MOV R1, c[0x0][0x28] ;                                       /* 0x00000a0000017a02 */
-                                                                                                /* 0x000fd00000000f00 */
-        /*0010*/                   S2R R0, SR_TID.X ;                                           /* 0x0000000000007919 */
-                                                                                                /* 0x000e220000002100 */
-        /*0020*/                   MOV R19, RZ ;                                                /* 0x000000ff00137202 */
-                                                                                                /* 0x000fe20000000f00 */
-        /*0030*/                   IMAD.MOV.U32 R7, RZ, RZ, 0x10 ;                              /* 0x00000010ff077424 */
-                                                                                                /* 0x000fe200078e00ff */
-        /*0040*/                   MOV R5, 0x2 ;                                                /* 0x0000000200057802 */
-                                                                                                /* 0x000fe20000000f00 */
-        /*0050*/                   S2R R3, SR_CTAID.X ;                                         /* 0x0000000000037919 */
-                                                                                                /* 0x000e280000002500 */
-        /*0060*/                   S2R R2, SR_LANEID ;                                          /* 0x0000000000027919 */
-                                                                                                /* 0x000e620000000000 */
-        /*0070*/                   IMAD R0, R3, c[0x0][0x0], R0 ;                               /* 0x0000000003007a24 */
-                                                                                                /* 0x001fe200078e0200 */
-        /*0080*/                   LOP3.LUT R18, R2, 0x3, RZ, 0xc0, !PT ;                       /* 0x0000000302127812 */
-                                                                                                /* 0x002fc600078ec0ff */
-        /*0090*/                   IMAD.SHL.U32 R14, R0, 0x1000, RZ ;                           /* 0x00001000000e7824 */
-                                                                                                /* 0x000fe200078e00ff */
-        /*00a0*/                   SHF.R.U32.HI R3, RZ, 0x2, R2 ;                               /* 0x00000002ff037819 */
-                                                                                                /* 0x000fca0000011602 */
-        /*00b0*/                   IMAD.WIDE.U32 R18, R3, 0x8, R18 ;                            /* 0x0000000803127825 */
-                                                                                                /* 0x000fc800078e0012 */
-        /*00c0*/                   IMAD.WIDE.U32 R2, R14, R5, c[0x0][0x170] ;                   /* 0x00005c000e027625 */
-                                                                                                /* 0x000fc600078e0005 */
-        /*00d0*/                   LEA R8, P1, R18, c[0x0][0x178], 0x2 ;                        /* 0x00005e0012087a11 */
-                                                                                                /* 0x000fc800078210ff */
-        /*00e0*/                   LEA R4, P0, R18.reuse, R2, 0x2 ;                             /* 0x0000000212047211 */
-                                                                                                /* 0x040fe400078010ff */
-        /*00f0*/                   LEA.HI.X R9, R18.reuse, c[0x0][0x17c], R19.reuse, 0x2, P1 ;  /* 0x00005f0012097a11 */
-                                                                                                /* 0x140fe400008f1413 */
-        /*0100*/                   LEA.HI.X R5, R18, R3, R19, 0x2, P0 ;                         /* 0x0000000312057211 */
-                                                                                                /* 0x000fc600000f1413 */
-        /*0110*/                   IMAD.WIDE.U32 R10, R7, 0x10, R8 ;                            /* 0x00000010070a7825 */
-                                                                                                /* 0x000fc600078e0008 */
-        /*0120*/                   LDG.E.SYS R20, [R8] ;                                        /* 0x0000000008147381 */
-                                                                                                /* 0x00012200001ee900 */
-        /*0130*/                   IMAD.WIDE.U32 R6, R7, 0x10, R4 ;                             /* 0x0000001007067825 */
-                                                                                                /* 0x000fc600078e0004 */
-        /*0140*/                   LDG.E.SYS R21, [R8+0x10] ;                                   /* 0x0000100008157381 */
-                                                                                                /* 0x00012800001ee900 */
-        /*0150*/                   LDG.E.SYS R22, [R10] ;                                       /* 0x000000000a167381 */
-                                                                                                /* 0x00012800001ee900 */
-        /*0160*/                   LDG.E.SYS R23, [R10+0x10] ;                                  /* 0x000010000a177381 */
-                                                                                                /* 0x00012800001ee900 */
-        /*0170*/                   LDG.E.SYS R12, [R4] ;                                        /* 0x00000000040c7381 */
-                                                                                                /* 0x00012800001ee900 */
-        /*0180*/                   LDG.E.SYS R2, [R4+0x10] ;                                    /* 0x0000100004027381 */
-                                                                                                /* 0x00012800001ee900 */
-        /*0190*/                   LDG.E.SYS R13, [R6] ;                                        /* 0x00000000060d7381 */
-                                                                                                /* 0x00012800001ee900 */
-        /*01a0*/                   LDG.E.SYS R3, [R6+0x10] ;                                    /* 0x0000100006037381 */
-                                                                                                /* 0x00012200001ee900 */
-        /*01b0*/                   MOV R15, 0x4 ;                                               /* 0x00000004000f7802 */
-                                                                                                /* 0x000fca0000000f00 */
-        /*01c0*/                   IMAD.WIDE.U32 R14, R14, R15, c[0x0][0x180] ;                 /* 0x000060000e0e7625 */
-                                                                                                /* 0x000fe200078e000f */
-        /*01d0*/                   BAR.SYNC 0x0 ;                                               /* 0x0000000000007b1d */
-                                                                                                /* 0x000fee0000000000 */
-        /*01e0*/                   CS2R R16, SR_CLOCKLO ;                                       /* 0x0000000000107805 */
-                                                                                                /* 0x000fd00000015000 */
-        /*01f0*/                   CS2R R8, SRZ ;                                               /* 0x0000000000087805 */
-                                                                                                /* 0x001fe2000001ff00 */
-        /*0200*/                   CS2R R10, SRZ ;                                              /* 0x00000000000a7805 */
-                                                                                                /* 0x000fe2000001ff00 */
-        /*0210*/                   CS2R R4, SRZ ;                                               /* 0x0000000000047805 */
-                                                                                                /* 0x000fe2000001ff00 */
-        /*0220*/                   CS2R R6, SRZ ;                                               /* 0x0000000000067805 */
-                                                                                                /* 0x000fe2000001ff00 */
-        /*0230*/                   BAR.SYNC 0x0 ;                                               /* 0x0000000000007b1d */
-                                                                                                /* 0x000fea0000000000 */
-        /*0240*/                   HMMA.1688.F32 R8, R12, R20, R8 ;                             /* 0x000000140c08723c */
-                                                                                                /* 0x010f700000001008 */
-        /*0250*/                   HMMA.1688.F32 R4, R12, R22, R4 ;                             /* 0x000000160c04723c */
-                                                                                                /* 0x000f700000001004 */
-        /*0260*/                   HMMA.1688.F32 R8, R2, R21, R8 ;                              /* 0x000000150208723c */
-                                                                                                /* 0x020b700000001008 */
-        /*0270*/                   HMMA.1688.F32 R4, R2, R23, R4 ;                              /* 0x000000170204723c */
-                                                                                                /* 0x000b5c0000001004 */
-        /*0280*/                   CS2R R20, SR_CLOCKLO ;                                       /* 0x0000000000147805 */
-                                                                                                /* 0x020fd00000015000 */
-        /*0290*/                   LEA R2, P0, R18, R14, 0x3 ;                                  /* 0x0000000e12027211 */
-                                                                                                /* 0x000fc800078018ff */
-        /*02a0*/                   LEA.HI.X R3, R18, R15, R19, 0x3, P0 ;                        /* 0x0000000f12037211 */
-                                                                                                /* 0x000fe200000f1c13 */
-        /*02b0*/                   IMAD.MOV.U32 R19, RZ, RZ, 0x8 ;                              /* 0x00000008ff137424 */
-                                                                                                /* 0x000fc800078e00ff */
-        /*02c0*/                   IMAD.WIDE.U32 R12, R19, 0x40, R2 ;                           /* 0x00000040130c7825 */
-                                                                                                /* 0x000fc600078e0002 */
-        /*02d0*/                   STG.E.64.SYS [R2], R8 ;                                      /* 0x0000000802007386 */
-                                                                                                /* 0x000fe2000010eb00 */
-        /*02e0*/                   IMAD.WIDE.U32 R14, R0, R19, c[0x0][0x160] ;                  /* 0x00005800000e7625 */
-                                                                                                /* 0x000fc800078e0013 */
-        /*02f0*/                   IMAD.WIDE.U32 R18, R0, R19, c[0x0][0x168] ;                  /* 0x00005a0000127625 */
-                                                                                                /* 0x000fe400078e0013 */
-        /*0300*/                   STG.E.64.SYS [R12], R10 ;                                    /* 0x0000000a0c007386 */
-                                                                                                /* 0x000fe8000010eb00 */
-        /*0310*/                   STG.E.64.SYS [R2+0x20], R4 ;                                 /* 0x0000200402007386 */
-                                                                                                /* 0x000fe8000010eb00 */
-        /*0320*/                   STG.E.64.SYS [R12+0x20], R6 ;                                /* 0x000020060c007386 */
-                                                                                                /* 0x000fe8000010eb00 */
-        /*0330*/                   STG.E.64.SYS [R14], R16 ;                                    /* 0x000000100e007386 */
-                                                                                                /* 0x000fe8000010eb00 */
-        /*0340*/                   STG.E.64.SYS [R18], R20 ;                                    /* 0x0000001412007386 */
-                                                                                                /* 0x000fe2000010eb00 */
-        /*0350*/                   EXIT ;                                                       /* 0x000000000000794d */
-                                                                                                /* 0x000fea0003800000 */
-        /*0360*/                   BRA 0x360;                                                   /* 0xfffffff000007947 */
-                                                                                                /* 0x000fc0000383ffff */
-        /*0370*/                   NOP;                                                         /* 0x0000000000007918 */
-                                                                                                /* 0x000fc00000000000 */
-		...............................................
-
-
-
-Fatbin ptx code:
-================
-arch = sm_75
-code version = [7,0]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-compressed
-
-Fatbin elf code:
-================
-arch = sm_80
-code version = [1,7]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-
-	code for sm_80
-		Function : _Z9max_flopsI6__halfEvPmS1_PT_S3_Pfj
-	.headerflags    @"EF_CUDA_SM80 EF_CUDA_PTX_SM(EF_CUDA_SM80)"
-        /*0000*/                   MOV R1, c[0x0][0x28] ;                                        /* 0x00000a0000017a02 */
-                                                                                                 /* 0x000fce0000000f00 */
-        /*0010*/                   S2R R0, SR_TID.X ;                                            /* 0x0000000000007919 */
-                                                                                                 /* 0x000e220000002100 */
-        /*0020*/                   MOV R21, RZ ;                                                 /* 0x000000ff00157202 */
-                                                                                                 /* 0x000fe20000000f00 */
-        /*0030*/                   IMAD.MOV.U32 R9, RZ, RZ, 0x10 ;                               /* 0x00000010ff097424 */
-                                                                                                 /* 0x000fe200078e00ff */
-        /*0040*/                   MOV R5, 0x2 ;                                                 /* 0x0000000200057802 */
-                                                                                                 /* 0x000fe20000000f00 */
-        /*0050*/                   S2R R3, SR_CTAID.X ;                                          /* 0x0000000000037919 */
-                                                                                                 /* 0x000e220000002500 */
-        /*0060*/                   ULDC.64 UR4, c[0x0][0x118] ;                                  /* 0x0000460000047ab9 */
-                                                                                                 /* 0x000fc60000000a00 */
-        /*0070*/                   S2R R2, SR_LANEID ;                                           /* 0x0000000000027919 */
-                                                                                                 /* 0x000e620000000000 */
-        /*0080*/                   IMAD R0, R3, c[0x0][0x0], R0 ;                                /* 0x0000000003007a24 */
-                                                                                                 /* 0x001fe200078e0200 */
-        /*0090*/                   LOP3.LUT R20, R2, 0x3, RZ, 0xc0, !PT ;                        /* 0x0000000302147812 */
-                                                                                                 /* 0x002fc600078ec0ff */
-        /*00a0*/                   IMAD.SHL.U32 R18, R0, 0x1000, RZ ;                            /* 0x0000100000127824 */
-                                                                                                 /* 0x000fe200078e00ff */
-        /*00b0*/                   SHF.R.U32.HI R3, RZ, 0x2, R2 ;                                /* 0x00000002ff037819 */
-                                                                                                 /* 0x000fca0000011602 */
-        /*00c0*/                   IMAD.WIDE.U32 R20, R3, 0x8, R20 ;                             /* 0x0000000803147825 */
-                                                                                                 /* 0x000fc800078e0014 */
-        /*00d0*/                   IMAD.WIDE.U32 R2, R18, R5, c[0x0][0x170] ;                    /* 0x00005c0012027625 */
-                                                                                                 /* 0x000fc600078e0005 */
-        /*00e0*/                   LEA R10, P1, R20, c[0x0][0x178], 0x2 ;                        /* 0x00005e00140a7a11 */
-                                                                                                 /* 0x000fc800078210ff */
-        /*00f0*/                   LEA R4, P0, R20.reuse, R2, 0x2 ;                              /* 0x0000000214047211 */
-                                                                                                 /* 0x040fe400078010ff */
-        /*0100*/                   LEA.HI.X R11, R20.reuse, c[0x0][0x17c], R21.reuse, 0x2, P1 ;  /* 0x00005f00140b7a11 */
-                                                                                                 /* 0x140fe400008f1415 */
-        /*0110*/                   LEA.HI.X R5, R20, R3, R21, 0x2, P0 ;                          /* 0x0000000314057211 */
-                                                                                                 /* 0x000fc600000f1415 */
-        /*0120*/                   IMAD.WIDE.U32 R16, R9.reuse, 0x10, R10 ;                      /* 0x0000001009107825 */
-                                                                                                 /* 0x040fe200078e000a */
-        /*0130*/                   LDG.E R6, [R10.64] ;                                          /* 0x000000040a067981 */
-                                                                                                 /* 0x000126000c1e1900 */
-        /*0140*/                   IMAD.WIDE.U32 R8, R9, 0x10, R4 ;                              /* 0x0000001009087825 */
-                                                                                                 /* 0x000fe200078e0004 */
-        /*0150*/                   LDG.E R7, [R10.64+0x10] ;                                     /* 0x000010040a077981 */
-                                                                                                 /* 0x000128000c1e1900 */
-        /*0160*/                   LDG.E R2, [R16.64] ;                                          /* 0x0000000410027981 */
-                                                                                                 /* 0x000128000c1e1900 */
-        /*0170*/                   LDG.E R3, [R16.64+0x10] ;                                     /* 0x0000100410037981 */
-                                                                                                 /* 0x000128000c1e1900 */
-        /*0180*/                   LDG.E R12, [R4.64] ;                                          /* 0x00000004040c7981 */
-                                                                                                 /* 0x000128000c1e1900 */
-        /*0190*/                   LDG.E R14, [R4.64+0x10] ;                                     /* 0x00001004040e7981 */
-                                                                                                 /* 0x000128000c1e1900 */
-        /*01a0*/                   LDG.E R13, [R8.64] ;                                          /* 0x00000004080d7981 */
-                                                                                                 /* 0x000128000c1e1900 */
-        /*01b0*/                   LDG.E R15, [R8.64+0x10] ;                                     /* 0x00001004080f7981 */
-                                                                                                 /* 0x000122000c1e1900 */
-        /*01c0*/                   MOV R19, 0x4 ;                                                /* 0x0000000400137802 */
-                                                                                                 /* 0x000fca0000000f00 */
-        /*01d0*/                   IMAD.WIDE.U32 R18, R18, R19, c[0x0][0x180] ;                  /* 0x0000600012127625 */
-                                                                                                 /* 0x000fe200078e0013 */
-        /*01e0*/                   BAR.SYNC 0x0 ;                                                /* 0x0000000000007b1d */
-                                                                                                 /* 0x000fec0000000000 */
-        /*01f0*/                   CS2R R16, SR_CLOCKLO ;                                        /* 0x0000000000107805 */
-                                                                                                 /* 0x001fce0000015000 */
-        /*0200*/                   CS2R R8, SRZ ;                                                /* 0x0000000000087805 */
-                                                                                                 /* 0x000fe2000001ff00 */
-        /*0210*/                   CS2R R10, SRZ ;                                               /* 0x00000000000a7805 */
-                                                                                                 /* 0x000fe2000001ff00 */
-        /*0220*/                   CS2R R4, SRZ ;                                                /* 0x0000000000047805 */
-                                                                                                 /* 0x000fe2000001ff00 */
-        /*0230*/                   BAR.SYNC 0x0 ;                                                /* 0x0000000000007b1d */
-                                                                                                 /* 0x000fec0000000000 */
-        /*0240*/                   HMMA.16816.F32 R8, R12, R6, R8 ;                              /* 0x000000060c08723c */
-                                                                                                 /* 0x010b6e0000001808 */
-        /*0250*/                   CS2R R6, SRZ ;                                                /* 0x0000000000067805 */
-                                                                                                 /* 0x020fce000001ff00 */
-        /*0260*/                   HMMA.16816.F32 R4, R12, R2, R4 ;                              /* 0x000000020c04723c */
-                                                                                                 /* 0x000b5e0000001804 */
-        /*0270*/                   NOP ;                                                         /* 0x0000000000007918 */
-                                                                                                 /* 0x000fd20000000000 */
-        /*0280*/                   CS2R R2, SR_CLOCKLO ;                                         /* 0x0000000000027805 */
-                                                                                                 /* 0x020fce0000015000 */
-        /*0290*/                   LEA R12, P0, R20, R18, 0x3 ;                                  /* 0x00000012140c7211 */
-                                                                                                 /* 0x000fc800078018ff */
-        /*02a0*/                   LEA.HI.X R13, R20, R19, R21, 0x3, P0 ;                        /* 0x00000013140d7211 */
-                                                                                                 /* 0x000fe200000f1c15 */
-        /*02b0*/                   IMAD.MOV.U32 R21, RZ, RZ, 0x8 ;                               /* 0x00000008ff157424 */
-                                                                                                 /* 0x000fc800078e00ff */
-        /*02c0*/                   IMAD.WIDE.U32 R18, R21, 0x40, R12 ;                           /* 0x0000004015127825 */
-                                                                                                 /* 0x000fe200078e000c */
-        /*02d0*/                   STG.E.64 [R12.64], R8 ;                                       /* 0x000000080c007986 */
-                                                                                                 /* 0x000fe6000c101b04 */
-        /*02e0*/                   IMAD.WIDE.U32 R14, R0, R21, c[0x0][0x160] ;                   /* 0x00005800000e7625 */
-                                                                                                 /* 0x000fc600078e0015 */
-        /*02f0*/                   STG.E.64 [R18.64], R10 ;                                      /* 0x0000000a12007986 */
-                                                                                                 /* 0x000fe2000c101b04 */
-        /*0300*/                   IMAD.WIDE.U32 R20, R0, R21, c[0x0][0x168] ;                   /* 0x00005a0000147625 */
-                                                                                                 /* 0x000fc600078e0015 */
-        /*0310*/                   STG.E.64 [R12.64+0x20], R4 ;                                  /* 0x000020040c007986 */
-                                                                                                 /* 0x000fe8000c101b04 */
-        /*0320*/                   STG.E.64 [R18.64+0x20], R6 ;                                  /* 0x0000200612007986 */
-                                                                                                 /* 0x000fe8000c101b04 */
-        /*0330*/                   STG.E.64 [R14.64], R16 ;                                      /* 0x000000100e007986 */
-                                                                                                 /* 0x000fe8000c101b04 */
-        /*0340*/                   STG.E.64 [R20.64], R2 ;                                       /* 0x0000000214007986 */
-                                                                                                 /* 0x000fe2000c101b04 */
-        /*0350*/                   EXIT ;                                                        /* 0x000000000000794d */
-                                                                                                 /* 0x000fea0003800000 */
-        /*0360*/                   BRA 0x360;                                                    /* 0xfffffff000007947 */
-                                                                                                 /* 0x000fc0000383ffff */
-        /*0370*/                   NOP;                                                          /* 0x0000000000007918 */
-                                                                                                 /* 0x000fc00000000000 */
-        /*0380*/                   NOP;                                                          /* 0x0000000000007918 */
-                                                                                                 /* 0x000fc00000000000 */
-        /*0390*/                   NOP;                                                          /* 0x0000000000007918 */
-                                                                                                 /* 0x000fc00000000000 */
-        /*03a0*/                   NOP;                                                          /* 0x0000000000007918 */
-                                                                                                 /* 0x000fc00000000000 */
-        /*03b0*/                   NOP;                                                          /* 0x0000000000007918 */
-                                                                                                 /* 0x000fc00000000000 */
-        /*03c0*/                   NOP;                                                          /* 0x0000000000007918 */
-                                                                                                 /* 0x000fc00000000000 */
-        /*03d0*/                   NOP;                                                          /* 0x0000000000007918 */
-                                                                                                 /* 0x000fc00000000000 */
-        /*03e0*/                   NOP;                                                          /* 0x0000000000007918 */
-                                                                                                 /* 0x000fc00000000000 */
-        /*03f0*/                   NOP;                                                          /* 0x0000000000007918 */
-                                                                                                 /* 0x000fc00000000000 */
-		...............................................
-
-
-
-Fatbin ptx code:
-================
-arch = sm_80
-code version = [7,0]
-producer = <unknown>
-host = linux
-compile_size = 64bit
-compressed
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/tensor_bw_half.cu b/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/tensor_bw_half.cu
deleted file mode 100644
index 79a2b739e..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/tensor_bw_half.cu
+++ /dev/null
@@ -1,19 +0,0 @@
-#include "tensor_bw_half.h"
-
-int main() {
-
-  intilizeDeviceProp(0);
-
-  if (deviceProp.major < 6) // tesnore unit was added since Volta
-    return 1;
-
-  std::cout << "FP16 operand, FP32 accumalte:\n";
-  tensor_max_flops<half, float>();
-
-  std::cout << "\nFP16 operand, FP16 accumalte:\n";
-  tensor_max_flops<half, half>();
-
-  // tensor_max_flops<char,int>();
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/tensor_bw_half.h b/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/tensor_bw_half.h
deleted file mode 100644
index 2064c4df4..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/tensor_bw_half.h
+++ /dev/null
@@ -1,150 +0,0 @@
-#ifndef MAXFLOPS_TENSOR_DEF_H
-#define MAXFLOPS_TENSOR_DEF_H
-
-#include <algorithm>
-#include <cuda.h>
-#include <iostream>
-#include <mma.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_TIMES 2048
-#define WMMA_M 16
-#define WMMA_N 16
-#define WMMA_K 16
-#define A_SIZE WMMA_M *WMMA_K
-#define B_SIZE WMMA_N *WMMA_K
-#define R_SIZE WMMA_M *WMMA_N
-
-using namespace nvcuda;
-
-template <class T, class R>
-__global__ void
-max_flops(uint64_t *startClk, uint64_t *stopClk, T *a, T *b, R *res,
-          uint32_t strid) { // strid set to 0 used to prevent optimization
-
-  // thread index
-  uint32_t tid = threadIdx.x;
-  uint32_t gid = blockIdx.x * blockDim.x + tid;
-  uint32_t warpid = gid / warpSize;
-
-  a = a + warpid * A_SIZE;
-  b = b + warpid * B_SIZE;
-  res = res + warpid * R_SIZE;
-
-  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, T, wmma::row_major>
-      a_frag;
-  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, T, wmma::col_major>
-      b_frag;
-  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, R> c_frag;
-
-  wmma::load_matrix_sync(a_frag, a, 16);
-  wmma::fill_fragment(c_frag, 0.0f);
-  wmma::load_matrix_sync(b_frag, b, 16);
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint64_t start = 0;
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory");
-
-  for (int j = 0; j < REPEAT_TIMES; ++j) {
-    wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
-  }
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint64_t stop = 0;
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory");
-
-  wmma::store_matrix_sync(res, c_frag, WMMA_N, wmma::mem_row_major);
-
-  // write time and data back to memory
-  startClk[gid] = start;
-  stopClk[gid] = stop;
-}
-
-template <class T, class R> float tensor_max_flops(bool report_fma_bw = false) {
-  intilizeDeviceProp(0);
-
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM;
-
-  unsigned total_A_SIZE =
-      A_SIZE * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp
-  unsigned total_B_SIZE =
-      B_SIZE * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp
-  unsigned total_R_SIZE =
-      R_SIZE * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp
-
-  uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  T *data1 = (T *)malloc(total_A_SIZE * sizeof(T));
-  T *data2 = (T *)malloc(total_B_SIZE * sizeof(T));
-  R *res = (R *)malloc(total_R_SIZE * sizeof(R));
-
-  uint64_t *startClk_g;
-  uint64_t *stopClk_g;
-  T *data1_g;
-  T *data2_g;
-  R *res_g;
-
-  for (uint32_t i = 0; i < A_SIZE; i++) {
-    data1[i] = (T)i;
-  }
-
-  for (uint32_t i = 0; i < B_SIZE; i++) {
-    data2[i] = (T)i;
-  }
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&data1_g, total_A_SIZE * sizeof(T)));
-  gpuErrchk(cudaMalloc(&data2_g, total_B_SIZE * sizeof(T)));
-  gpuErrchk(cudaMalloc(&res_g, total_R_SIZE * sizeof(R)));
-
-  gpuErrchk(cudaMemcpy(data1_g, data1, total_A_SIZE * sizeof(T),
-                       cudaMemcpyHostToDevice));
-  gpuErrchk(cudaMemcpy(data2_g, data2, total_B_SIZE * sizeof(T),
-                       cudaMemcpyHostToDevice));
-
-  max_flops<T, R><<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(
-      startClk_g, stopClk_g, data1_g, data2_g, res_g, 0);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(
-      cudaMemcpy(res, res_g, total_R_SIZE * sizeof(R), cudaMemcpyDeviceToHost));
-
-  float wmma_bw, hmma_bw, fma_bw;
-  uint64_t total_time =
-      *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) -
-      *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]);
-  wmma_bw = ((float)(REPEAT_TIMES * TOTAL_THREADS)) / (float)total_time;
-  hmma_bw = ((float)(REPEAT_TIMES * TOTAL_THREADS * SASS_hmma_per_PTX_wmma)) /
-            (float)total_time;
-  fma_bw = ((float)(REPEAT_TIMES * WMMA_M * WMMA_N * WMMA_K *
-                    (TOTAL_THREADS / WARP_SIZE))) /
-           (float)total_time;
-
-  std::cout << "wmma PTX issue bandwidth = " << wmma_bw << "(thread/clk/SM) \n";
-  std::cout << "hmma SASS issue bandwidth = " << hmma_bw << "(thread/clk/SM)\n";
-  std::cout << "FMA tensor bandwidth = " << fma_bw << "(FMA/clk/SM)\n";
-
-  std::cout << "Total Clk number = " << total_time << "\n";
-
-  if (report_fma_bw)
-    return fma_bw;
-  else
-    return wmma_bw;
-}
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_lat_half/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/tensor_lat_half/Makefile
deleted file mode 100644
index 03b7e7ec8..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_lat_half/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-GENCODE_SM50 :=
-GENCODE_SM61 :=
-GENCODE_SM30 :=
-GENCODE_SM35 :=
-GENCODE_SM60 :=
-GENCODE_SM62 :=
-
-SRC = tensor_lat_half.cu
-
-EXE = tensor_lat_half
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_lat_half/tensor_lat_half.cu b/util/tuner/GPU_Microbenchmark/ubench/core/tensor_lat_half/tensor_lat_half.cu
deleted file mode 100644
index 32245f805..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_lat_half/tensor_lat_half.cu
+++ /dev/null
@@ -1,19 +0,0 @@
-#include "tensor_lat_half.h"
-
-int main() {
-
-  intilizeDeviceProp(0);
-
-  if (deviceProp.major < 6) // tesnore unit was added since Volta
-    return 1;
-
-  std::cout << "FP16 operand, FP32 accumalte:\n";
-  tensor_lat<half, float>();
-
-  std::cout << "\nFP16 operand, FP16 accumalte:\n";
-  tensor_lat<half, half>();
-
-  // tensor_lat<char,int>();
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_lat_half/tensor_lat_half.h b/util/tuner/GPU_Microbenchmark/ubench/core/tensor_lat_half/tensor_lat_half.h
deleted file mode 100644
index 5e8c3cecc..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_lat_half/tensor_lat_half.h
+++ /dev/null
@@ -1,120 +0,0 @@
-#ifndef LAT_TENSOR_DEF_H
-#define LAT_TENSOR_DEF_H
-
-#include <algorithm>
-#include <cuda.h>
-#include <iostream>
-#include <mma.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_ITERS 4096
-
-#define M_SIZE 16 * 16
-
-using namespace nvcuda;
-
-template <class T, class R>
-__global__ void tensor_latency(uint64_t *startClk, uint64_t *stopClk, T *a,
-                               T *b, R *res) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-
-  // register T result = 0;
-
-  wmma::fragment<wmma::matrix_a, 16, 16, 16, T, wmma::row_major> a_frag;
-  wmma::fragment<wmma::matrix_b, 16, 16, 16, T, wmma::col_major> b_frag;
-  wmma::fragment<wmma::accumulator, 16, 16, 16, R> c_frag;
-
-  wmma::load_matrix_sync(a_frag, a, 16);
-  wmma::fill_fragment(c_frag, 0.0f);
-  wmma::load_matrix_sync(b_frag, b, 16);
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint64_t start = 0;
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory");
-
-  for (int j = 0; j < REPEAT_ITERS; ++j) {
-    wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
-  }
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint64_t stop = 0;
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory");
-
-  wmma::store_matrix_sync(res, c_frag, 16, wmma::mem_row_major);
-
-  // write time and data back to memory
-  startClk[gid] = start;
-  stopClk[gid] = stop;
-}
-
-template <class T, class R> float tensor_lat() {
-
-  intilizeDeviceProp(0);
-
-  THREADS_PER_BLOCK = 1;
-  THREADS_PER_SM = 1;
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = 1;
-
-  uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  T *data1 = (T *)malloc(M_SIZE * sizeof(T));
-  T *data2 = (T *)malloc(M_SIZE * sizeof(T));
-  R *res = (R *)malloc(TOTAL_THREADS * sizeof(R));
-
-  uint64_t *startClk_g;
-  uint64_t *stopClk_g;
-  T *data1_g;
-  T *data2_g;
-  R *res_g;
-
-  for (uint32_t i = 0; i < M_SIZE; i++) {
-    data1[i] = (T)i;
-    data2[i] = (T)i;
-  }
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&data1_g, M_SIZE * sizeof(T)));
-  gpuErrchk(cudaMalloc(&data2_g, M_SIZE * sizeof(T)));
-  gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(R)));
-
-  gpuErrchk(
-      cudaMemcpy(data1_g, data1, M_SIZE * sizeof(T), cudaMemcpyHostToDevice));
-  gpuErrchk(
-      cudaMemcpy(data2_g, data2, M_SIZE * sizeof(T), cudaMemcpyHostToDevice));
-
-  tensor_latency<T, R><<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(
-      startClk_g, stopClk_g, data1_g, data2_g, res_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  // gpuErrchk( cudaMemcpy(res, res_g, M_SIZE*sizeof(R), cudaMemcpyDeviceToHost)
-  // );
-
-  float wmma, hmma;
-  uint64_t total_time = stopClk[0] - startClk[0];
-  wmma = ((float)(total_time)) / ((float)(REPEAT_ITERS));
-  hmma =
-      ((float)(total_time)) / ((float)(REPEAT_ITERS * SASS_hmma_per_PTX_wmma));
-
-  std::cout << "wmma latency = " << wmma << "(clk)\n";
-  std::cout << "hmma latency = " << hmma << "(clk)\n";
-  std::cout << "Total Clk number = " << total_time << "\n";
-
-  return wmma;
-}
-
-#endif
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_access_grain/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_access_grain/Makefile
deleted file mode 100644
index 2477bc631..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_access_grain/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-
-SRC = l1_access_grain.cu
-
-EXE = l1_access_grain
-
-NVCC_FLGAS = -Xptxas -dlcm=ca
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_access_grain/l1_access_grain.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_access_grain/l1_access_grain.cu
deleted file mode 100644
index ba1ee1524..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_access_grain/l1_access_grain.cu
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
-This benchmark measures L1 coalescing granularity for different strides.
-Check nvprof or Nsight for the L1 reads and writes received.
-For further details, see our arXiv paper: https://arxiv.org/pdf/1810.07269.pdf
-
-Run the program with Nsight:
- make nvsight ./l1_access_grain
-
-*/
-
-#include <cstdio>
-#include <iostream>
-#include <stdint.h>
-
-
-using namespace std;
-
-#include "../../../hw_def/hw_def.h"
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Device code
-__global__ void l1_stride_cons(const float *A, float *C, int stride)
-// Copies one float per thread from A to C at element i * stride. Not launched by the host code in this file.
-{
-  // Global thread index across the launch.
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-  // No bounds check: both arrays must cover the largest i * stride that is launched.
-  C[i * stride] = A[i * stride];
-}
-
-__global__ void l1_stride(const float *A, float *C, int stride)
-// Copies one float per thread from A to C; `stride` consecutive threads share one 32-float segment.
-{
-  // Global thread index across the launch.
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-  // Thread i touches element (i / stride) * 32 + (i % stride); stride must be non-zero and there is no bounds check.
-  C[((i / stride) * 32) + (i % stride)] = A[((i / stride) * 32) + (i % stride)];
-}
-
-// Host code
-void coaslescer_stride(int N, int threadsPerBlock, int stride) {
-  // Launches l1_stride once over N threads (threadsPerBlock per block) with the given stride, then releases all buffers.
-  float *h_A;
-  float *h_C;
-
-  float *d_A;
-  float *d_C;
-
-  size_t size = N * sizeof(float) * 32;
-  // Every buffer below is `size` bytes, i.e. N * 32 floats.
-  // Allocate host buffers h_A and h_C (the malloc results are not checked)
-  h_A = (float *)malloc(size);
-  h_C = (float *)malloc(size);
-
-  // Initialize only the first N of the N * 32 input elements; the rest stay uninitialized
-  for (uint32_t i = 0; i < N; i++)
-    h_A[i] = (float)i;
-
-  // Allocate vectors in device memory
-  gpuErrchk(cudaMalloc((void **)&d_A, size));
-  gpuErrchk(cudaMalloc((void **)&d_C, size));
-
-  // Copy the input vector from host memory to device memory
-  gpuErrchk(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
-
-  // Invoke kernel; the block count is rounded up and the kernel itself has no bounds check
-  int blocksPerGrid = ((N + threadsPerBlock - 1) / threadsPerBlock);
-
-  l1_stride<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, stride);
-  gpuErrchk(cudaPeekAtLastError());
-
-  // Copy result from device memory to host memory
-  // h_C receives the result but is not inspected afterwards
-  gpuErrchk(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));
-
-  // Free device memory (return codes of cudaFree are ignored here)
-  if (d_A)
-    cudaFree(d_A);
-  if (d_C)
-    cudaFree(d_C);
-
-  // Free host memory
-  if (h_A)
-    free(h_A);
-  if (h_C)
-    free(h_C);
-}
-//////////////////////////////////////////////////////
-int main(int argc, char *argv[]) {
-  intilizeDeviceProp(0); // helper from hw_def.h; presumably sets up device 0 and the hardware constants -- confirm there
-  // One launch per stride from 1 to WARP_SIZE, each with N = threadsPerBlock = WARP_SIZE.
-  for (int i = 1; i <= WARP_SIZE; ++i) {
-    coaslescer_stride(WARP_SIZE, WARP_SIZE, i);
-  }
-  // Nothing is measured in-process; the messages below point the user at profiler counters.
-  std::cout << "\nThis benchmark measures coalescing granularity for differnet "
-               "strides.\n";
-  std::cout
-      << "check the nvprof or nvsight for received l1 reads and writes.\n";
-  std::cout
-      << "to run the program with nsight: make nvsight ./l1_access_grain\n";
-  std::cout
-      << "stats to look at: l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum & "
-         "l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum\n\n";
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_adaptive/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_adaptive/Makefile
deleted file mode 100644
index eede7448b..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_adaptive/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = l1_adaptive.cu
-# SRC/EXE are presumably consumed by the included common.mk -- confirm there.
-EXE = l1_adaptive
-# Shared build logic lives three directories up (not visible here).
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_adaptive/l1_adaptive.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_adaptive/l1_adaptive.cu
deleted file mode 100644
index 3e7bb8ea4..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_adaptive/l1_adaptive.cu
+++ /dev/null
@@ -1,21 +0,0 @@
-// Placeholder for the L1 adaptive ubench (not implemented yet).
-
-#include <cuda.h>
-#include <fstream>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-using namespace std;
-
-#include "../../../hw_def/hw_def.h"
-
-__global__ void l1_adaptive() {} // empty placeholder kernel; never launched in this file
-
-int main() {
-  intilizeDeviceProp(0); // helper from hw_def.h; presumably sets up device 0 -- confirm there
-  // No measurement is performed: the program only reports that it is a stub.
-  // TO DO
-  std::cout << "The ubench is not imepleneted yet.\n";
-  // Note: the process exit status is 1 (non-zero).
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/Makefile
deleted file mode 100644
index 9447eef75..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-SRC = l1_associativity.cu
-# SRC/EXE are presumably consumed by the included common.mk -- confirm there.
-EXE = l1_associativity
-# Disabled override that would pass -O0 to ptxas; left commented out.
-#NVCC_FLGAS = -Xptxas -O0
-# Shared build logic lives three directories up (not visible here).
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/l1_associativity.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/l1_associativity.cu
deleted file mode 100644
index dbc4e9e1c..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/l1_associativity.cu
+++ /dev/null
@@ -1,187 +0,0 @@
-#include <cuda.h>
-#include <curand_kernel.h>
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string>
-#include <time.h>
-
-#include "../../../hw_def/hw_def.h"
-
-class chaserParam { // inputs and results of one pointer-chase run performed by l1_structure
-public:
-  uint32_t stride, array_size, iteration, l1_cache_size; // stride and array_size count uint32_t elements; l1_cache_size is not used in this file
-  int shared_mem_size_byte; // set to 0 by main and not read anywhere in this file
-  bool sequential; // true: launch l1_squential; false: launch l1_random
-  // Clock values copied back from the kernel's clock64() samples.
-  uint64_t start, stop;
-};
-
-__global__ void setup_kernel(curandStateMRG32k3a *state) {
-  int id = 0; // only state[0] is initialized, whatever the launch size
-  // Fixed seed 1234, subsequence id, offset 0.
-  curand_init(1234, id, 0, &state[id]);
-}
-
-__global__ void l1_squential(uint64_t *startCLK, uint64_t *stopCLK,
-                             uint32_t *dsink, uint32_t *posArray,
-                             uint32_t stride, uint32_t array_size,
-                             uint32_t iteration) {
-  // Pointer chase through posArray, bracketed by clock64() samples.
-  // Each pass follows array_size / stride links, continuing from the current pointer.
-  // All outputs go to element 0; the host launches this kernel as <<<1, 1>>>.
-  // start/stop are overwritten on every pass, so only the last pass is reported.
-  // NOTE(review): start/stop are left uninitialized when iteration == 0.
-  uint64_t start, stop;
-  uint32_t pointer;
-  pointer = 0;
-
-  for (int itr = 0; itr < iteration; itr++) {
-    start = clock64();
-    for (uint32_t i = 0; i < (array_size / stride); i++) {
-      pointer = posArray[pointer];
-    }
-    stop = clock64();
-  }
-  // Publish the samples; storing the final pointer presumably keeps the chase from being optimized away.
-  startCLK[0] = start;
-  stopCLK[0] = stop;
-  dsink[0] = pointer;
-}
-
-__global__ void l1_random(uint64_t *startCLK, uint64_t *stopCLK,
-                          uint32_t *dsink, uint32_t *posArray,
-                          curandStateMRG32k3a *state, uint32_t stride,
-                          uint32_t array_size, uint32_t iteration) {
-  // Randomized variant of the pointer chase, bracketed by clock64() samples.
-  // Each step reads posArray at ((pointer + curand(state)) % array_size) / stride * stride,
-  // i.e. a pseudo-random index rounded down to a multiple of stride.
-  // The curand() call sits inside the timed region; only the last pass is reported.
-  uint64_t start, stop;
-  uint32_t pointer;
-  pointer = 0;
-
-  for (int itr = 0; itr < iteration; itr++) {
-    start = clock64();
-    for (uint32_t i = 0; i < (array_size / stride); i++) {
-      pointer =
-          posArray[(pointer + curand(state)) % array_size / stride * stride];
-    }
-    stop = clock64();
-  }
-  // All outputs go to element 0; the host launches this kernel as <<<1, 1>>>.
-  startCLK[0] = start;
-  stopCLK[0] = stop;
-  dsink[0] = pointer;
-}
-
-void l1_structure(chaserParam &chaser) {
-  // Runs one single-thread pointer-chase kernel and stores its clock64() samples in chaser.start / chaser.stop.
-  uint64_t *startCLK = (uint64_t *)malloc(1 * sizeof(uint64_t));
-  uint64_t *stopCLK = (uint64_t *)malloc(1 * sizeof(uint64_t));
-  uint32_t *dsink = (uint32_t *)malloc(1 * sizeof(uint32_t));
-  uint32_t *posArray = (uint32_t *)malloc(chaser.array_size * sizeof(uint32_t));
-  // Build the chain: element i links to element (i + stride), wrapping at array_size.
-  for (uint32_t i = 0; i < chaser.array_size; i++)
-    posArray[i] = (i + chaser.stride) % chaser.array_size;
-
-  uint64_t *startCLK_g;
-  uint64_t *stopCLK_g;
-  uint32_t *dsink_g;
-  uint32_t *posArray_g;
-
-  gpuErrchk(cudaMalloc(&startCLK_g, 1 * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&stopCLK_g, 1 * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&dsink_g, 1 * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&posArray_g, chaser.array_size * sizeof(uint32_t)));
-
-  gpuErrchk(cudaMemcpy(posArray_g, posArray,
-                       chaser.array_size * sizeof(uint32_t),
-                       cudaMemcpyHostToDevice));
-  curandStateMRG32k3a *devMRGStates = NULL; // RNG state; allocated only for the random chase
-  if (chaser.sequential) {
-    l1_squential<<<1, 1>>>(startCLK_g, stopCLK_g, dsink_g, posArray_g,
-                           chaser.stride, chaser.array_size, chaser.iteration);
-  } else {
-    gpuErrchk(cudaMalloc((void **)&devMRGStates, sizeof(curandStateMRG32k3a)));
-    setup_kernel<<<1, 1>>>(devMRGStates);
-    l1_random<<<1, 1>>>(startCLK_g, stopCLK_g, dsink_g, posArray_g,
-                        devMRGStates, chaser.stride, chaser.array_size,
-                        chaser.iteration);
-  }
-
-  // gpuErrchk( cudaPeekAtLastError() );
-
-  gpuErrchk(cudaMemcpy(startCLK, startCLK_g, 1 * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopCLK, stopCLK_g, 1 * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(
-      cudaMemcpy(dsink, dsink_g, 1 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
-
-  chaser.start = startCLK[0];
-  chaser.stop = stopCLK[0];
-
-  free(startCLK);
-  free(stopCLK);
-  free(dsink);
-  free(posArray);
-  gpuErrchk(cudaFree(startCLK_g));
-  gpuErrchk(cudaFree(stopCLK_g));
-  gpuErrchk(cudaFree(dsink_g));
-  gpuErrchk(cudaFree(posArray_g));
-  gpuErrchk(cudaFree(devMRGStates)); // was leaked on every random run; cudaFree(NULL) is a no-op on the sequential path
-
-  return;
-}
-
-int main() {
-  intilizeDeviceProp(0); // helper from hw_def.h; presumably sets up device 0 and L1_SIZE -- confirm there
-  // Phase 1: sequential chase, results written to L1line.csv.
-  std::cout << "Launching L1 cache line size ubench" << std::endl;
-  std::ostringstream oss;
-  oss << "L1line.csv";
-  std::string filename = oss.str();
-  std::ofstream myfile1(filename);
-  // array_size counts uint32_t elements, so L1_SIZE / 4 elements cover L1_SIZE bytes (assumes L1_SIZE is in bytes -- TODO confirm).
-  chaserParam chaser1;
-  chaser1.shared_mem_size_byte = 0;
-  chaser1.iteration = 1;
-  chaser1.array_size = L1_SIZE / 4;
-  chaser1.sequential = true;
-  // One CSV row per stride in {1, 2, 4, 8, 16, 32} elements.
-  myfile1 << "chaser.stride,chaser.start,chaser.stop\n";
-  for (uint32_t i = 1; i <= 32; i *= 2) {
-    chaser1.stride = i;
-    l1_structure(chaser1);
-    myfile1 << chaser1.stride << "," << chaser1.start << "," << chaser1.stop
-            << "\n";
-  }
-
-  std::cout << "Saving L1 cache line size data at L1line.csv" << std::endl;
-  // Phase 2: random chase with two passes, results written to L1asso.csv.
-  std::cout << "Launching L1 cache assoc ubench" << std::endl;
-  std::ostringstream string;
-  string << "L1asso.csv";
-  filename = string.str();
-  std::ofstream myfile2(filename);
-
-  chaser1.iteration = 2;
-  chaser1.sequential = false;
-  // Here the array grows with the stride (L1_SIZE / 16 * i elements) for i in {8, 16, 32, 64, 128}.
-  myfile2 << "chaser.stride,chaser.start,chaser.stop\n";
-  for (uint32_t i = 8; i <= 128; i *= 2) {
-    chaser1.stride = i;
-    chaser1.array_size = L1_SIZE / 16 * i;
-    l1_structure(chaser1);
-    myfile2 << chaser1.stride << "," << chaser1.start << "," << chaser1.stop
-            << "\n";
-  }
-  // Note: the process exit status is 1 (non-zero) even when everything ran.
-  std::cout << "Saving L1 cache assoc data at L1asso.csv" << std::endl;
-  myfile1.close();
-  myfile2.close();
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/linesize.xlsx b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/linesize.xlsx
deleted file mode 100644
index 72e57db36..000000000
Binary files a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/linesize.xlsx and /dev/null differ
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_banks/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_banks/Makefile
deleted file mode 100644
index 726f7eecd..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_banks/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = l1_banks.cu
-# SRC/EXE are presumably consumed by the included common.mk -- confirm there.
-EXE = l1_banks
-# Shared build logic lives three directories up (not visible here).
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_banks/l1_banks.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_banks/l1_banks.cu
deleted file mode 100644
index 5b0f388dc..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_banks/l1_banks.cu
+++ /dev/null
@@ -1,21 +0,0 @@
-// Placeholder for the L1 banks ubench (not implemented yet).
-
-#include <cuda.h>
-#include <fstream>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-using namespace std;
-
-#include "../../../hw_def/hw_def.h"
-
-__global__ void l1_banks() {} // empty placeholder kernel; never launched in this file
-
-int main() {
-  intilizeDeviceProp(0); // helper from hw_def.h; presumably sets up device 0 -- confirm there
-  // No measurement is performed: the program only reports that it is a stub.
-  // TO DO
-  std::cout << "The ubench is not imepleneted yet.\n";
-  // Note: the process exit status is 1 (non-zero).
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_128/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_128/Makefile
deleted file mode 100644
index bc762132f..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_128/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = l1_bw_128.cu
-# SRC/EXE are presumably consumed by the included common.mk -- confirm there.
-EXE = l1_bw_128
-# Shared build logic lives three directories up (not visible here).
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_128/l1_bw_128.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_128/l1_bw_128.cu
deleted file mode 100644
index 3bc7d3efd..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_128/l1_bw_128.cu
+++ /dev/null
@@ -1,137 +0,0 @@
-// This code is a modification of L1 cache benchmark from
-//"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking":
-// https://arxiv.org/pdf/1804.06826.pdf
-
-// This benchmark measures the maximum read bandwidth of the L1 cache using 128-bit (v4.f32) vector loads
-
-#include <algorithm>
-#include <assert.h>
-#include <cuda.h>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_TIMES 256
-// Array length in floats: L1_SIZE / 2 / sizeof(float), i.e. half of L1_SIZE (assumes L1_SIZE is in bytes -- TODO confirm)
-#define ARRAY_SIZE (L1_SIZE / 8)
-
-__global__ void l1_bw(uint64_t *startClk, uint64_t *stopClk, float *dsink,
-                      float *posArray) {
-  // Times REPEAT_TIMES 128-bit (v4.f32) loads per thread and records each thread's clock64 samples.
-  // thread index within the block (tid) and within the whole launch (uid)
-  uint32_t tid = threadIdx.x;
-  uint32_t uid = blockIdx.x * blockDim.x + tid;
-
-  // accumulators that consume the loaded values so the loads are not optimized away
-  float sink0 = 0;
-  float sink1 = 0;
-  float sink2 = 0;
-  float sink3 = 0;
-
-  // warm up the L1 cache: the block walks the whole array, four floats per thread per step
-  for (uint32_t i = tid * 4; i < ARRAY_SIZE; i += blockDim.x * 4) {
-    float *ptr = posArray + i;
-    // use ca modifier to cache the load in L1
-    asm volatile("{\t\n"
-                 ".reg .f32 data<4>;\n\t"
-                 "ld.global.ca.v4.f32 {data0,data1,data2,data3}, [%4];\n\t"
-                 "add.f32 %0, data0, %0;\n\t"
-                 "add.f32 %1, data1, %1;\n\t"
-                 "add.f32 %2, data2, %2;\n\t"
-                 "add.f32 %3, data3, %3;\n\t"
-                 "}"
-                 : "+f"(sink0), "+f"(sink1), "+f"(sink2), "+f"(sink3)
-                 : "l"(ptr)
-                 : "memory");
-  }
-
-  // barrier so that every thread has finished warming up before timing starts
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint64_t start = 0;
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory");
-
-  // timed loop: one 16-byte load per iteration at index (tid * 4 + j * warpSize * 4) wrapped to ARRAY_SIZE
-  for (uint32_t j = 0; j < REPEAT_TIMES; j++) {
-    float *ptr = posArray + ((tid * 4 + (j * warpSize * 4)) % ARRAY_SIZE);
-    asm volatile("{\t\n"
-                 ".reg .f32 data<4>;\n\t"
-                 "ld.global.ca.v4.f32 {data0,data1,data2,data3}, [%4];\n\t"
-                 "add.f32 %0, data0, %0;\n\t"
-                 "add.f32 %1, data1, %1;\n\t"
-                 "add.f32 %2, data2, %2;\n\t"
-                 "add.f32 %3, data3, %3;\n\t"
-                 "}"
-                 : "+f"(sink0), "+f"(sink1), "+f"(sink2), "+f"(sink3)
-                 : "l"(ptr)
-                 : "memory");
-  }
-
-  // barrier so that the stop sample is taken after every thread has finished loading
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint64_t stop = 0;
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory");
-
-  // write the per-thread clock samples and the accumulated sum back to memory
-  startClk[uid] = start;
-  stopClk[uid] = stop;
-  dsink[uid] = sink0 + sink1 + sink2 + sink3;
-}
-
-int main() {
-  intilizeDeviceProp(0); // helper from hw_def.h; presumably sets up device 0 and the hardware constants -- confirm there
-  // Single block, so the per-SM thread count equals the block size.
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM;
-  THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM;
-
-  // ARRAY_SIZE has to be less than L1_SIZE
-  assert(ARRAY_SIZE * sizeof(float) < L1_SIZE);
-
-  uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  float *posArray = (float *)malloc(ARRAY_SIZE * sizeof(float));
-  float *dsink = (float *)malloc(TOTAL_THREADS * sizeof(float));
-
-  uint64_t *startClk_g;
-  uint64_t *stopClk_g;
-  float *posArray_g;
-  float *dsink_g;
-
-  for (uint32_t i = 0; i < ARRAY_SIZE; i++)
-    posArray[i] = (float)i;
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(float)));
-  gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(float)));
-  gpuErrchk(cudaMemcpy(posArray_g, posArray, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice)); // fix: the kernel used to read uninitialized device memory
-  l1_bw<<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, dsink_g,
-                                           posArray_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(float),
-                       cudaMemcpyDeviceToHost));
-  // Elapsed clocks span from the earliest start to the latest stop over all threads.
-  double bw, BW;
-  uint64_t total_time =
-      *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) -
-      *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]);
-  bw = (double)(REPEAT_TIMES * THREADS_PER_SM * sizeof(float) * 4) /
-       ((double)total_time);
-  BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024;
-  std::cout << "L1 bandwidth = " << bw << "(byte/clk/SM), " << BW
-            << "(GB/s/SM)\n";
-  std::cout << "Total Clk number = " << total_time << "\n";
-  // Note: the process exit status is 1 (non-zero) even on success; host and device buffers are not freed.
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f/Makefile
deleted file mode 100644
index 51c68cf48..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-# SRC/EXE are presumably consumed by the included common.mk -- confirm there.
-SRC = l1_bw_32f.cu
-
-EXE = l1_bw_32f
-# Shared build logic lives three directories up (not visible here).
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f/l1_bw_32f.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f/l1_bw_32f.cu
deleted file mode 100644
index 6abaac82b..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f/l1_bw_32f.cu
+++ /dev/null
@@ -1,142 +0,0 @@
-/* This code is a modification of L1 cache benchmark from
-"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking":
- https://arxiv.org/pdf/1804.06826.pdf
-
- This benchmark measures the maximum read bandwidth of the L1 cache using 32-bit (f32) loads
-*/
-
-#include <algorithm>
-#include <assert.h>
-#include <cuda.h>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_TIMES 256
-// Array length in floats: L1_SIZE / 2 / sizeof(float). Parenthesized so that `x % ARRAY_SIZE` wraps at ARRAY_SIZE instead of expanding to `(x % L1_SIZE) / 8`.
-#define ARRAY_SIZE (L1_SIZE / 8)
-
-__global__ void l1_bw(uint64_t *__restrict__ startClk,
-                      uint64_t *__restrict__ stopClk, float *__restrict__ dsink,
-                      const float *__restrict__ posArray) {
-  // Times REPEAT_TIMES groups of four 32-bit loads per thread and records each thread's clock64 samples.
-  // thread index within the block (tid) and within the whole launch (uid)
-  uint32_t tid = threadIdx.x;
-  uint32_t uid = blockIdx.x * blockDim.x + tid;
-
-  // accumulators that consume the loaded values so the loads are not optimized away
-  float sink0 = 0;
-  float sink1 = 0;
-  float sink2 = 0;
-  float sink3 = 0;
-
-  // populate l1 cache to warm up
-  for (uint32_t i = tid; i < ARRAY_SIZE; i += blockDim.x) {
-    // one float per thread per step, address taken directly from posArray[i]
-    // use ca modifier to cache the load in L1
-    asm volatile("{\t\n"
-                 ".reg .f32 data;\n\t"
-                 "ld.global.ca.f32 data, [%1];\n\t"
-                 "add.f32 %0, data, %0;\n\t"
-                 "}"
-                 : "+f"(sink0)
-                 : "l"(&posArray[i])
-                 : "memory");
-  }
-
-  // barrier so that every thread has finished warming up before timing starts
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint64_t start = 0;
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory");
-
-  // timed loop: four loads per iteration at byte offsets 0, 128, 256 and 384 from the base address
-  for (uint32_t j = 0; j < REPEAT_TIMES; j++) {
-    // NOTE(review): the `% ARRAY_SIZE` below only wraps correctly if the macro is a parenthesized expression; the +384 load can also reach past the wrapped index -- verify
-    asm volatile("{\t\n"
-                 ".reg .f32 data<4>;\n\t"
-                 "ld.global.ca.f32 data0, [%4+0];\n\t"
-                 "ld.global.ca.f32 data1, [%4+128];\n\t"
-                 "ld.global.ca.f32 data2, [%4+256];\n\t"
-                 "ld.global.ca.f32 data3, [%4+384];\n\t"
-                 "add.f32 %0, data0, %0;\n\t"
-                 "add.f32 %1, data1, %1;\n\t"
-                 "add.f32 %2, data2, %2;\n\t"
-                 "add.f32 %3, data3, %3;\n\t"
-                 "}"
-                 : "+f"(sink0), "+f"(sink1), "+f"(sink2), "+f"(sink3)
-                 : "l"(&posArray[(tid + (j * warpSize * 4)) % ARRAY_SIZE])
-                 : "memory");
-  }
-
-  // barrier so that the stop sample is taken after every thread has finished loading
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint64_t stop = 0;
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory");
-
-  // write the per-thread clock samples and the accumulated sum back to memory
-  startClk[uid] = start;
-  stopClk[uid] = stop;
-  dsink[uid] = sink0 + sink1 + sink2 + sink3;
-}
-
-int main() {
-  intilizeDeviceProp(0); // helper from hw_def.h; presumably sets up device 0 and the hardware constants -- confirm there
-  // Single block, so the per-SM thread count equals the block size.
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM;
-  THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM;
-
-  assert(ARRAY_SIZE * sizeof(float) <
-         L1_SIZE); // ARRAY_SIZE has to be less than L1_SIZE
-
-  uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  float *posArray = (float *)malloc(ARRAY_SIZE * sizeof(float));
-  float *dsink = (float *)malloc(TOTAL_THREADS * sizeof(float));
-
-  uint64_t *startClk_g;
-  uint64_t *stopClk_g;
-  float *posArray_g;
-  float *dsink_g;
-  // Fill the source array with its own indices.
-  for (uint32_t i = 0; i < ARRAY_SIZE; i++)
-    posArray[i] = (float)i;
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(float)));
-  gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(float)));
-
-  gpuErrchk(cudaMemcpy(posArray_g, posArray, ARRAY_SIZE * sizeof(float),
-                       cudaMemcpyHostToDevice));
-
-  l1_bw<<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, dsink_g,
-                                           posArray_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(float),
-                       cudaMemcpyDeviceToHost));
-  // Elapsed clocks span from the earliest start to the latest stop over all threads.
-  float bw, BW;
-  uint64_t total_time;
-  total_time = *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) -
-               *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]);
-  // bytes moved = iterations * threads * 4 loads * 4 bytes per float
-  bw = (float)(REPEAT_TIMES * THREADS_PER_SM * 4 * 4) / ((float)total_time);
-  BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024;
-  std::cout << "L1 bandwidth = " << bw << "(byte/clk/SM), " << BW
-            << "(GB/s/SM)\n";
-  std::cout << "Total Clk number = " << total_time << "\n";
-  // Note: the process exit status is 1 (non-zero) even on success; host and device buffers are not freed.
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll/Makefile
deleted file mode 100644
index 487ecccfc..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = l1_bw_32f_unroll.cu
-# SRC/EXE are presumably consumed by the included common.mk -- confirm there.
-EXE = l1_bw_32f_unroll
-# Shared build logic lives three directories up (not visible here).
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll/l1_bw_32f_unroll.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll/l1_bw_32f_unroll.cu
deleted file mode 100644
index b2be1f138..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll/l1_bw_32f_unroll.cu
+++ /dev/null
@@ -1,127 +0,0 @@
-// This code is a modification of L1 cache benchmark from
-//"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking":
-// https://arxiv.org/pdf/1804.06826.pdf
-
-// This benchmark measures the maximum read bandwidth of the L1 cache using 32-bit (f32) loads
-
-#include <assert.h>
-#include <cuda.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-// Array length in floats: L1_SIZE / 2 / sizeof(float). Parenthesized so that `x % ARRAY_SIZE` wraps at ARRAY_SIZE instead of expanding to `(x % L1_SIZE) / 8`.
-#define ARRAY_SIZE (L1_SIZE / 8)
-#define REPEAT_TIMES 1024
-
-__global__ void l1_bw(uint32_t *startClk, uint32_t *stopClk, float *dsink,
-                      float *posArray) {
-  // Times REPEAT_TIMES single 32-bit loads per thread using the 32-bit %clock register.
-  // thread index within the block (tid) and within the whole launch (uid)
-  uint32_t tid = threadIdx.x;
-  uint32_t uid = blockIdx.x * blockDim.x + tid;
-
-  // accumulators; only sink0 is written by the loads below, sink1..sink3 stay 0
-  float sink0 = 0;
-  float sink1 = 0;
-  float sink2 = 0;
-  float sink3 = 0;
-
-  // populate l1 cache to warm up
-  for (uint32_t i = tid; i < ARRAY_SIZE; i += blockDim.x) {
-    float *ptr = posArray + i;
-    // use ca modifier to cache the load in L1
-    asm volatile("{\t\n"
-                 ".reg .f32 data;\n\t"
-                 "ld.global.ca.f32 data, [%1];\n\t"
-                 "add.f32 %0, data, %0;\n\t"
-                 "}"
-                 : "+f"(sink0)
-                 : "l"(ptr)
-                 : "memory");
-  }
-
-  // barrier so that every thread has finished warming up before timing starts
-  asm volatile("bar.sync 0;");
-
-  // start timing (32-bit counter)
-  uint32_t start = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-
-  // timed loop: one load per iteration; NOTE(review): `% ARRAY_SIZE` only wraps correctly if the macro is parenthesized -- verify its #define
-  for (uint32_t j = 0; j < REPEAT_TIMES; j++) {
-    float *ptr = posArray + ((tid + (j * warpSize)) % ARRAY_SIZE);
-    asm volatile("{\t\n"
-                 ".reg .f32 data;\n\t"
-                 "ld.global.ca.f32 data, [%1+0];\n\t"
-                 "add.f32 %0, data, %0;\n\t"
-                 "}"
-                 : "+f"(sink0)
-                 : "l"(ptr)
-                 : "memory");
-  }
-
-  // barrier so that the stop sample is taken after every thread has finished loading
-  asm volatile("bar.sync 0;");
-
-  // stop timing (32-bit counter)
-  uint32_t stop = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-  // write the per-thread clock samples and the accumulated sum back to memory
-  startClk[uid] = start;
-  stopClk[uid] = stop;
-  dsink[uid] = sink0 + sink1 + sink2 + sink3;
-}
-
-int main() {
-  intilizeDeviceProp(0); // helper from hw_def.h; presumably sets up device 0 and the hardware constants -- confirm there
-  // Single block, so the per-SM thread count equals the block size.
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM;
-  THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM;
-
-  assert(ARRAY_SIZE * sizeof(float) <
-         L1_SIZE); // ARRAY_SIZE has to be less than L1_SIZE
-
-  uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  float *posArray = (float *)malloc(ARRAY_SIZE * sizeof(float));
-  float *dsink = (float *)malloc(TOTAL_THREADS * sizeof(float));
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  float *posArray_g;
-  float *dsink_g;
-  // Fill the source array with its own indices.
-  for (uint32_t i = 0; i < ARRAY_SIZE; i++)
-    posArray[i] = (float)i;
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(float)));
-  gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(float)));
-
-  gpuErrchk(cudaMemcpy(posArray_g, posArray, ARRAY_SIZE * sizeof(float),
-                       cudaMemcpyHostToDevice));
-
-  l1_bw<<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, dsink_g,
-                                           posArray_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(float),
-                       cudaMemcpyDeviceToHost));
-  // Unlike the sibling benchmarks, only thread 0's clock samples are used; bytes = iterations * threads * 4.
-  float bw;
-  bw = (float)(REPEAT_TIMES * THREADS_PER_SM * 4) /
-       ((float)(stopClk[0] - startClk[0]));
-  printf("L1 bandwidth = %f (byte/clk/SM)\n", bw);
-  printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);
-  // Note: the process exit status is 1 (non-zero) even on success; host and device buffers are not freed.
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64f/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64f/Makefile
deleted file mode 100644
index 67df7821a..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64f/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = l1_bw_64f.cu
-# SRC/EXE are presumably consumed by the included common.mk -- confirm there.
-EXE = l1_bw_64f
-# Shared build logic lives three directories up (not visible here).
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64f/l1_bw_64f.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64f/l1_bw_64f.cu
deleted file mode 100644
index 78097b9fc..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64f/l1_bw_64f.cu
+++ /dev/null
@@ -1,135 +0,0 @@
-// This code is a modification of L1 cache benchmark from
-//"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking":
-// https://arxiv.org/pdf/1804.06826.pdf
-
-// This benchmark measures the maximum read bandwidth of L1 cache for 64 bit
-
-#include <algorithm>
-#include <assert.h>
-#include <cuda.h>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_TIMES 256
-// Array length in doubles: L1_SIZE / 2 / sizeof(double), i.e. half of L1_SIZE (assumes L1_SIZE is in bytes -- TODO confirm)
-#define ARRAY_SIZE (L1_SIZE / 16)
-
-__global__ void l1_bw(uint64_t *startClk, uint64_t *stopClk, double *dsink,
-                      double *posArray) {
-  // Times REPEAT_TIMES pairs of 64-bit (f64) loads per thread and records each thread's clock64 samples.
-  // thread index within the block (tid) and within the whole launch (uid)
-  uint32_t tid = threadIdx.x;
-  uint32_t uid = blockIdx.x * blockDim.x + tid;
-
-  // accumulators that consume the loaded values so the loads are not optimized away
-  double sink0 = 0;
-  double sink1 = 0;
-
-  // populate l1 cache to warm up
-  for (uint32_t i = tid; i < ARRAY_SIZE; i += blockDim.x) {
-    double *ptr = posArray + i;
-    // use ca modifier to cache the load in L1
-    asm volatile("{\t\n"
-                 ".reg .f64 data;\n\t"
-                 "ld.global.ca.f64 data, [%1];\n\t"
-                 "add.f64 %0, data, %0;\n\t"
-                 "}"
-                 : "+d"(sink0)
-                 : "l"(ptr)
-                 : "memory");
-  }
-
-  // barrier so that every thread has finished warming up before timing starts
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint64_t start = 0;
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory");
-
-  // timed loop: two loads per iteration at byte offsets 0 and 256; NOTE(review): the +256 load can reach past the wrapped index -- verify bounds
-  for (uint32_t j = 0; j < REPEAT_TIMES; j++) {
-    double *ptr = posArray + ((tid + (j * warpSize * 2)) % ARRAY_SIZE);
-    asm volatile("{\t\n"
-                 ".reg .f64 data<2>;\n\t"
-                 "ld.global.ca.f64 data0, [%2+0];\n\t"
-                 "ld.global.ca.f64 data1, [%2+256];\n\t"
-                 "add.f64 %0, data0, %0;\n\t"
-                 "add.f64 %1, data1, %1;\n\t"
-                 "}"
-                 : "+d"(sink0), "+d"(sink1)
-                 : "l"(ptr)
-                 : "memory");
-  }
-
-  // barrier so that the stop sample is taken after every thread has finished loading
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint64_t stop = 0;
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory");
-
-  // write the per-thread clock samples and the accumulated sum back to memory
-  startClk[uid] = start;
-  stopClk[uid] = stop;
-  dsink[uid] = sink0 + sink1;
-}
-
-int main() {
-  // Host driver: launch l1_bw in one block and print bytes per clock and GB/s.
-  intilizeDeviceProp(0);
-  // Single block, so the per-SM thread count equals the block size.
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM;
-  THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM;
-
-  // ARRAY_SIZE has to be less than L1_SIZE
-  assert(ARRAY_SIZE * sizeof(double) < L1_SIZE);
-
-  uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  double *posArray = (double *)malloc(ARRAY_SIZE * sizeof(double));
-  double *dsink = (double *)malloc(TOTAL_THREADS * sizeof(double));
-
-  uint64_t *startClk_g;
-  uint64_t *stopClk_g;
-  double *posArray_g;
-  double *dsink_g;
-  // Fill the source array with its own indices.
-  for (uint32_t i = 0; i < ARRAY_SIZE; i++)
-    posArray[i] = (double)i;
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(double)));
-  gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(double)));
-
-  gpuErrchk(cudaMemcpy(posArray_g, posArray, ARRAY_SIZE * sizeof(double),
-                       cudaMemcpyHostToDevice));
-
-  l1_bw<<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, dsink_g,
-                                           posArray_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(double),
-                       cudaMemcpyDeviceToHost));
-  // Elapsed clocks span from the earliest start to the latest stop; bytes = iterations * threads * 2 loads * sizeof(double).
-  double bw, BW;
-  uint64_t total_time =
-      *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) -
-      *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]);
-  bw = (double)(REPEAT_TIMES * THREADS_PER_SM * sizeof(double) * 2) /
-       ((double)total_time);
-  BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024;
-  std::cout << "L1 bandwidth = " << bw << "(byte/clk/SM), " << BW
-            << "(GB/s/SM)\n";
-  std::cout << "Total Clk number = " << total_time << "\n";
-  // Note: the process exit status is 1 (non-zero) even on success; host and device buffers are not freed.
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64v/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64v/Makefile
deleted file mode 100644
index 31b27b0b1..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64v/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = l1_bw_64v.cu
-
-EXE = l1_bw_64v
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64v/l1_bw_64v.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64v/l1_bw_64v.cu
deleted file mode 100644
index 8f8b13ff0..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64v/l1_bw_64v.cu
+++ /dev/null
@@ -1,136 +0,0 @@
-/* This code is a modification of L1 cache benchmark from
-"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking":
- https://arxiv.org/pdf/1804.06826.pdf
-
-This benchmark measures the maximum read bandwidth of L1 cache for 64-bit vector
-*/
-
-#include <algorithm>
-#include <assert.h>
-#include <cuda.h>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_TIMES 256
-// array size is half the L1 size (2) * float size (4)
-#define ARRAY_SIZE (L1_SIZE / 8)
-
-__global__ void l1_bw(uint64_t *startClk, uint64_t *stopClk, float *dsink,
-                      float *posArray) {
-
-  // thread index
-  uint32_t tid = threadIdx.x;
-  uint32_t uid = blockIdx.x * blockDim.x + tid;
-
-  // a register to avoid compiler optimization
-  float sink0 = 0;
-  float sink1 = 0;
-
-  // populate l1 cache to warm up
-  for (uint32_t i = tid * 2; i < ARRAY_SIZE; i += blockDim.x * 2) {
-    float *ptr = posArray + i;
-    // use ca modifier to cache the load in L1
-    asm volatile("{\t\n"
-                 ".reg .f32 data<2>;\n\t"
-                 "ld.global.ca.v2.f32 {data0,data1}, [%2];\n\t"
-                 "add.f32 %0, data0, %0;\n\t"
-                 "add.f32 %1, data1, %1;\n\t"
-                 "}"
-                 : "+f"(sink0), "+f"(sink1)
-                 : "l"(ptr)
-                 : "memory");
-  }
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint64_t start = 0;
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory");
-
-  // load data from l1 cache and accumulate
-  for (uint32_t j = 0; j < REPEAT_TIMES; j++) {
-    float *ptr = posArray + ((tid * 2 + (j * warpSize * 2)) % ARRAY_SIZE);
-    asm volatile("{\t\n"
-                 ".reg .f32 data<2>;\n\t"
-                 "ld.global.ca.v2.f32 {data0,data1}, [%2];\n\t"
-                 "add.f32 %0, data0, %0;\n\t"
-                 "add.f32 %1, data1, %1;\n\t"
-                 "}"
-                 : "+f"(sink0), "+f"(sink1)
-                 : "l"(ptr)
-                 : "memory");
-  }
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint64_t stop = 0;
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory");
-
-  // write time and data back to memory
-  startClk[uid] = start;
-  stopClk[uid] = stop;
-  dsink[uid] = sink0 + sink1;
-}
-
-int main() {
-
-  intilizeDeviceProp(0);
-
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM;
-  THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM;
-
-  // ARRAY_SIZE has to be less than L1_SIZE
-  assert(ARRAY_SIZE * sizeof(float) < L1_SIZE);
-
-  uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  float *posArray = (float *)malloc(ARRAY_SIZE * sizeof(float));
-  float *dsink = (float *)malloc(TOTAL_THREADS * sizeof(float));
-
-  uint64_t *startClk_g;
-  uint64_t *stopClk_g;
-  float *posArray_g;
-  float *dsink_g;
-
-  for (uint32_t i = 0; i < ARRAY_SIZE; i++)
-    posArray[i] = (float)i;
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(float)));
-  gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(float)));
-
-  gpuErrchk(cudaMemcpy(posArray_g, posArray, ARRAY_SIZE * sizeof(float),
-                       cudaMemcpyHostToDevice));
-
-  l1_bw<<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, dsink_g,
-                                           posArray_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(float),
-                       cudaMemcpyDeviceToHost));
-
-  double bw, BW;
-  uint64_t total_time =
-      *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) -
-      *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]);
-  bw = (double)(REPEAT_TIMES * THREADS_PER_SM * sizeof(float) * 2) /
-       ((double)total_time);
-  BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024;
-  std::cout << "L1 bandwidth = " << bw << "(byte/clk/SM), " << BW
-            << "(GB/s/SM)\n";
-  std::cout << "Total Clk number = " << total_time << "\n";
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_config/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_config/Makefile
deleted file mode 100644
index 634228307..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_config/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = l1_config.cu
-
-EXE = l1_config
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_config/l1_config.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_config/l1_config.cu
deleted file mode 100644
index 0b3bc9376..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_config/l1_config.cu
+++ /dev/null
@@ -1,114 +0,0 @@
-#include <iostream>
-#include <sstream>
-using namespace std;
-
-#include "../../../hw_def/hw_def.h"
-
-/*
-We know the below information from running our ubench, we copy and paste the
-ubench results below manullay
-TODO: we will automate this process
-*/
-
-// We know cache line size from l1_assoc ubench
-#define L1_CACHE_LINE_SIZE 128
-
-// We know #sets from l1_assoc ubench (the l1 cache has 4 sets, since kepler
-// and in volta and turing)
-#define L1_CACHE_SETS 4
-
-// we know sector size from l1_assoc and l1_acces_grain ubenches and has
-// been consistent over generations, change it accordingly
-#define L1_SECTOR_SIZE 32
-
-// we know the mshr throughput from l1_mshr ubench
-// we find that each warp can issue up to two pending cache lines (8 sector
-// reqs)
-#define L1_ACCESS_FACTOR L1_CACHE_LINE_SIZE / L1_SECTOR_SIZE
-#define L1_MSHR_ENTRIES_PER_WARP L1_ACCESS_FACTOR * 2
-
-// L1 cache cache in Volta and above is write allocate, subsector write, write-
-// through we know that from l1_write_policy ubench and has been consistent
-// after Volta. Change it accordingly if it changes in new generations
-static const char *After_Volta_L1_Cache_Write_Policy = ",L:T:m:L:L,";
-
-// L1 cache bfore Volta was write-no-allocate, write-evict with only local
-// accsses to be write-back
-static const char *Before_Volta_L1_Cache_Write_Policy = ",L:L:m:N:L,";
-
-// Adaptive cache config option
-static const char *SHMEM_ADAPTIVE_OPTION = "0,8,16,32,64";
-
-int main() {
-  intilizeDeviceProp(0);
-
-  if (ACCEL_SIM_MODE) {
-
-    std::cout << "\n//Accel_Sim config: \n";
-
-    bool adaptive_cache;
-    string cache_write_string;
-    string adaptive_shmem_option_string;
-    unsigned write_cache_ratio;
-    unsigned unified_l1d_size_inKB;
-    unsigned config_l1_size;
-    // l1 cache is sector since pascal
-    char is_sector = (deviceProp.major >= 6) ? 'S' : 'N';
-    // for volta and above, l1 is write allocate and adative
-    if (deviceProp.major >= 7) {
-      // configure based on min l1 cache
-      // l1 cache is adpative
-      adaptive_cache = true;
-      adaptive_shmem_option_string = SHMEM_ADAPTIVE_OPTION;
-      std::stringstream large_shmem_size;
-      unsigned shd_mem_inKB = deviceProp.sharedMemPerMultiprocessor / 1024;
-      large_shmem_size << "," << shd_mem_inKB;
-      adaptive_shmem_option_string += large_shmem_size.str();
-      unified_l1d_size_inKB = L1_SIZE / 1024;
-      //increase unified cache by 32KB in case the shd is larger
-      //this case happens in Turing, we need to write ubench to get the exact size
-      if(unified_l1d_size_inKB <= shd_mem_inKB)
-        unified_l1d_size_inKB = unified_l1d_size_inKB + 32;
-      // set l1 write allocation policy (write allocate, write through)
-      cache_write_string = After_Volta_L1_Cache_Write_Policy;
-      // L1 write-to-read ratio (25%) based on rodinia kmeans workload
-      // benchmarking
-      write_cache_ratio = 25;
-      //always configure l1 as 32KB in adaptive cache
-      //accel-sim will adjust the assoc adpatively during run-time
-      config_l1_size = 32*1024;
-      //ensure unified cache is multiple of l1 cache size
-      assert((unified_l1d_size_inKB*1024) % config_l1_size == 0);
-    } else {
-      adaptive_cache = false;
-      cache_write_string = Before_Volta_L1_Cache_Write_Policy;
-      write_cache_ratio = 0;
-      unified_l1d_size_inKB = L1_SIZE / 1024;
-      config_l1_size = L1_SIZE;
-    }
-
-    // lines per set
-    unsigned assoc = config_l1_size / L1_CACHE_LINE_SIZE / L1_CACHE_SETS;
-
-    unsigned warps_num_per_sm = MAX_THREADS_PER_SM / WARP_SIZE;
-    // each warp can issue up to two pending cache lines (this is based on our
-    // l1_mshr ubench)
-    unsigned mshr = warps_num_per_sm * L1_MSHR_ENTRIES_PER_WARP;
-
-    std::cout << "-gpgpu_adaptive_cache_config " << adaptive_cache << std::endl;
-    std::cout << "-gpgpu_shmem_option " << adaptive_shmem_option_string
-              << std::endl;
-    std::cout << "-gpgpu_unified_l1d_size " << unified_l1d_size_inKB << std::endl;
-    std::cout << "-gpgpu_l1_banks " << WARP_SCHEDS_PER_SM << std::endl;
-    std::cout << "-gpgpu_cache:dl1 " << is_sector << ":" << L1_CACHE_SETS << ":"
-              << L1_CACHE_LINE_SIZE << ":" << assoc << cache_write_string
-              << "A:" << mshr << ":" << warps_num_per_sm << ",16:0,32"
-              << std::endl;
-    std::cout << "-gpgpu_gmem_skip_L1D " << !deviceProp.globalL1CacheSupported
-              << std::endl;
-    std::cout << "-gpgpu_l1_cache_write_ratio " << write_cache_ratio
-              << std::endl;
-  }
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_lat/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_lat/Makefile
deleted file mode 100644
index 31a4026db..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_lat/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-
-SRC = l1_lat.cu
-
-EXE = l1_lat
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_lat/l1_lat.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_lat/l1_lat.cu
deleted file mode 100644
index 2ccae4806..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_lat/l1_lat.cu
+++ /dev/null
@@ -1,15 +0,0 @@
-#include "l1_lat.h"
-
-int main() {
-
-  intilizeDeviceProp(0);
-
-  float lat = l1_lat();
-
-  if (ACCEL_SIM_MODE) {
-    std::cout << "\n//Accel_Sim config: \n";
-    std::cout << "-gpgpu_l1_latency " << (unsigned)lat << std::endl;
-  }
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_lat/l1_lat.h b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_lat/l1_lat.h
deleted file mode 100644
index 408a50c54..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_lat/l1_lat.h
+++ /dev/null
@@ -1,119 +0,0 @@
-// This code is a modification of L1 cache benchmark from
-//"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking":
-// https://arxiv.org/pdf/1804.06826.pdf
-
-// This benchmark measures the latency of L1 cache
-
-#include <assert.h>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <cuda.h>
-
-#include "../../../hw_def/hw_def.h"
-
-// Launch only one thread to calcaulte the latency using a pointer-chasing
-// array technique
-#define THREADS_NUM 1
-#define REPEAT_TIMES 32768 // iterate over the array ITERS times
-#define ARRAY_SIZE 4096    // size of the array
-
-// Measure latency of ITERS reads.
-__global__ void l1_lat(uint32_t *startClk, uint32_t *stopClk,
-                       uint64_t *posArray, uint64_t *dsink) {
-
-  // thread index
-  uint32_t tid = threadIdx.x;
-
-  // one thread to initialize the pointer-chasing array
-  if (tid == 0) {
-    for (uint32_t i = 0; i < (ARRAY_SIZE - 1); i++)
-      posArray[i] = (uint64_t)(posArray + i + 1);
-
-    posArray[ARRAY_SIZE - 1] = (uint64_t)posArray;
-  }
-
-  if (tid < THREADS_NUM) {
-    // a register to avoid compiler optimization
-    uint64_t *ptr = posArray + tid;
-    uint64_t ptr1, ptr0;
-
-    // initialize the thread pointer with the start address of the array
-    // use ca modifier to cache the in L1
-    asm volatile("{\t\n"
-                 "ld.global.ca.u64 %0, [%1];\n\t"
-                 "}"
-                 : "=l"(ptr1)
-                 : "l"(ptr)
-                 : "memory");
-
-    // synchronize all threads
-    asm volatile("bar.sync 0;");
-
-    // start timing
-    uint32_t start = 0;
-    asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-
-    // pointer-chasing ITERS times
-    // use ca modifier to cache the load in L1
-    for (uint32_t i = 0; i < REPEAT_TIMES; ++i) {
-      asm volatile("{\t\n"
-                   "ld.global.ca.u64 %0, [%1];\n\t"
-                   "}"
-                   : "=l"(ptr0)
-                   : "l"((uint64_t *)ptr1)
-                   : "memory");
-      ptr1 = ptr0; // swap the register for the next load
-    }
-
-    // stop timing
-    uint32_t stop = 0;
-    asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-    // write time and data back to memory
-    startClk[tid] = start;
-    stopClk[tid] = stop;
-    dsink[tid] = ptr1;
-  }
-}
-
-float l1_lat() {
-  intilizeDeviceProp(0);
-
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM;
-  THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM;
-
-  assert(ARRAY_SIZE * sizeof(uint64_t) < L1_SIZE);
-
-  uint32_t *startClk = (uint32_t *)malloc(THREADS_NUM * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(THREADS_NUM * sizeof(uint32_t));
-  uint64_t *dsink = (uint64_t *)malloc(THREADS_NUM * sizeof(uint64_t));
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  uint64_t *posArray_g;
-  uint64_t *dsink_g;
-
-  gpuErrchk(cudaMalloc(&startClk_g, THREADS_NUM * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, THREADS_NUM * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&dsink_g, THREADS_NUM * sizeof(uint64_t)));
-
-  l1_lat<<<1, THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g, dsink_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, THREADS_NUM * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, THREADS_NUM * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(dsink, dsink_g, THREADS_NUM * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-
-  float lat = (float)(stopClk[0] - startClk[0]) / REPEAT_TIMES;
-  printf("L1 Latency  = %12.4f cycles\n", lat);
-  printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);
-
-  return lat;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/Makefile
deleted file mode 100644
index 9fdfa43be..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = l1_mshr.cu
-
-EXE = l1_mshr
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/l1_mshr.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/l1_mshr.cu
deleted file mode 100644
index 8d0418fda..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/l1_mshr.cu
+++ /dev/null
@@ -1,147 +0,0 @@
-#include <cuda.h>
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string>
-
-#include "../../../hw_def/hw_def.h"
-
-__global__ void l1_mshr(uint64_t *timing, uint32_t *dsink, uint32_t *posArray,
-                        uint32_t stride, uint64_t array_size,
-                        uint32_t iteration) {
-
-  // thread index
-  uint32_t tid = threadIdx.x;
-  uint32_t bid = blockIdx.x;
-  uint32_t uid = bid * blockDim.x + tid;
-  // uint32_t n_threads = blockDim.x * gridDim.x;
-  // uint32_t record_length = MAX_SHARED_MEM_SIZE_PER_BLOCK/8;
-
-  extern __shared__ uint32_t t_val[]; // size of shared memory
-
-  uint32_t pointer;
-  pointer = uid * 1024;
-  asm volatile("bar.sync 0;");
-
-  // pointer chasing
-  for (int itr = 0; itr < iteration; itr++) {
-    pointer = posArray[pointer];
-    t_val[uid * iteration + itr] = clock64();
-  }
-  // pointer chasing completed
-  for (uint32_t i = 0; i < iteration; i++) {
-    timing[uid * iteration + i] = t_val[uid * iteration + i];
-  }
-
-  dsink[uid] = pointer;
-}
-
-void l1_structure(uint32_t stride, uint64_t array_size,
-                  int shared_mem_size_byte, uint32_t iteration) {
-
-  std::ostringstream oss;
-  oss << "MSHR" << stride << "_array" << array_size << "_shmem"
-      << (shared_mem_size_byte / 4) << "_itr" << iteration << ".csv";
-  std::string filename = oss.str();
-  std::ofstream myfile(filename);
-
-  std::cout << "Launching L1 MSHR ubench" << std::endl;
-
-  uint64_t *timing =
-      (uint64_t *)malloc(TOTAL_THREADS * iteration * sizeof(uint64_t));
-  uint32_t *dsink = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint32_t *posArray = (uint32_t *)malloc(array_size * sizeof(uint32_t));
-  // uint32_t *val_array = (uint32_t*) malloc(array_size*sizeof(uint32_t));
-
-  for (uint32_t i = 0; i < array_size; i++)
-    posArray[i] = (i + stride) % array_size;
-
-  uint64_t *timing_g;
-  uint32_t *dsink_g;
-  uint32_t *posArray_g;
-
-  gpuErrchk(
-      cudaMalloc(&timing_g, TOTAL_THREADS * iteration * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&posArray_g, array_size * sizeof(uint32_t)));
-
-  gpuErrchk(cudaMemcpy(posArray_g, posArray, array_size * sizeof(uint32_t),
-                       cudaMemcpyHostToDevice));
-
-  // cudaFuncSetAttribute(l1_mshr,
-  // cudaFuncAttributePreferredSharedMemoryCarveout, 100); //set shared memory
-  // size
-  cudaFuncSetAttribute(l1_mshr, cudaFuncAttributeMaxDynamicSharedMemorySize,
-                       shared_mem_size_byte);
-  l1_mshr<<<BLOCKS_NUM, THREADS_PER_BLOCK, shared_mem_size_byte>>>(
-      timing_g, dsink_g, posArray_g, stride, array_size, iteration);
-
-  // gpuErrchk( cudaPeekAtLastError() );
-
-  gpuErrchk(cudaMemcpy(timing, timing_g,
-                       TOTAL_THREADS * iteration * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-
-  myfile << "thread_num,timing1,timing2,timing3,timing4,timing5,timing6\n";
-  for (uint32_t thr = 0; thr < TOTAL_THREADS; thr += 32) {
-    for (uint32_t itr = 0; itr < iteration; itr++) {
-      if (itr != 0) {
-        myfile << ",";
-
-      } else {
-        myfile << thr << ",";
-      }
-      myfile << timing[thr * iteration + itr];
-    }
-    myfile << "\n";
-  }
-
-  free(timing);
-  free(dsink);
-  free(posArray);
-  gpuErrchk(cudaFree(timing_g));
-  gpuErrchk(cudaFree(dsink_g));
-  gpuErrchk(cudaFree(posArray_g));
-
-  myfile.close();
-  std::cout << "Saving L1 MSHR data at " << filename << std::endl;
-
-  return;
-}
-
-int main() {
-  intilizeDeviceProp(0);
-
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM;
-  THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM;
-
-  uint32_t stride, iteration;
-  int shared_mem_size_byte = MAX_SHARED_MEM_SIZE_PER_BLOCK;
-  /*
-          #ifdef VOLTA_HW_DEF_H
-          uint32_t l1_cache_size = L1_SIZE-shared_mem_size_byte; //volta
-     sharedmem is a partition of L1 #else uint32_t l1_cache_size = L1_SIZE;
-          #endif
-  */
-  // measure line size and mshr
-  stride = 100;
-  iteration = 6;
-  uint64_t array_size = 1024 * 1024 * 1024;
-  l1_structure(stride, array_size, shared_mem_size_byte, iteration);
-
-  /*
-  //measure associativity
-  stride = 8;
-  iteration = 1;
-  for (array_size=l1_cache_size/4; array_size<(l1_cache_size+512)/4;
-  array_size++){
-          //l1_structure (stride, array_size, shared_mem_size_byte, iteration);
-  }
-  */
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/mshr.xlsx b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/mshr.xlsx
deleted file mode 100644
index a8a130503..000000000
Binary files a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/mshr.xlsx and /dev/null differ
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_sector/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_sector/Makefile
deleted file mode 100644
index c865f2de3..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_sector/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = l1_sector.cu
-
-EXE = l1_sector
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_sector/l1_sector.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_sector/l1_sector.cu
deleted file mode 100644
index 0d2aaedbf..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_sector/l1_sector.cu
+++ /dev/null
@@ -1,134 +0,0 @@
-// Is L1 sector?
-
-#include <cuda.h>
-#include <fstream>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-using namespace std;
-
-#define L1_SIZE_FLOAT L1_SIZE / 4
-// allocate large array size, larger than L1 size
-#define ARRAY_SIZE L1_SIZE_FLOAT * 2
-// we know the sector size is 8 floats (32B) from the l1_access_grain ubench
-#define SECTOR_SIZE 8
-
-#include "../../../hw_def/hw_def.h"
-
-__global__ void l1_sector(uint32_t *startClk, uint32_t *stopClk, float *dsink,
-                          float *posArray) {
-
-  // thread index
-  uint32_t tid = threadIdx.x;
-  uint32_t uid = blockIdx.x * blockDim.x + tid;
-
-  // a register to avoid compiler optimization
-  float sink0 = 0;
-
-  // populate l1 cache to warm up
-  for (uint32_t i = tid; i < L1_SIZE_FLOAT; i += blockDim.x) {
-    float *ptr = posArray + i;
-    // use ca modifier to cache the load in L1
-    asm volatile("{\t\n"
-                 ".reg .f32 data;\n\t"
-                 "ld.global.ca.f32 data, [%1];\n\t"
-                 "add.f32 %0, data, %0;\n\t"
-                 "}"
-                 : "+f"(sink0)
-                 : "l"(ptr)
-                 : "memory");
-  }
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // kicks out one of the cache line and read a sector
-  if (uid == 0) {
-    sink0 += posArray[L1_SIZE_FLOAT + 1];
-  }
-
-  asm volatile("bar.sync 0;");
-
-  uint32_t start = 0;
-  uint32_t stop = 0;
-
-  // start timing
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-
-  // load data from l1 cache and accumulate
-  float *ptr = posArray + tid * SECTOR_SIZE;
-  asm volatile("{\t\n"
-               ".reg .f32 data;\n\t"
-               "ld.global.ca.f32 data, [%1];\n\t"
-               "add.f32 %0, data, %0;\n\t"
-               "}"
-               : "+f"(sink0)
-               : "l"(ptr)
-               : "memory");
-
-  // stop timing
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // write time and data back to memory
-  startClk[uid] = start;
-  stopClk[uid] = stop;
-  dsink[uid] = sink0;
-}
-
-int main() {
-  intilizeDeviceProp(0);
-
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM;
-  THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM;
-
-  uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  float *posArray = (float *)malloc(ARRAY_SIZE * sizeof(float));
-  float *dsink = (float *)malloc(TOTAL_THREADS * sizeof(float));
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  float *posArray_g;
-  float *dsink_g;
-
-  for (uint32_t i = 0; i < ARRAY_SIZE; i++)
-    posArray[i] = (float)i;
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(float)));
-  gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(float)));
-
-  gpuErrchk(cudaMemcpy(posArray_g, posArray, ARRAY_SIZE * sizeof(float),
-                       cudaMemcpyHostToDevice));
-
-  std::cout << "Launching L1 sector ubench" << std::endl;
-
-  l1_sector<<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, dsink_g,
-                                               posArray_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(float),
-                       cudaMemcpyDeviceToHost));
-
-  ofstream myfile;
-  myfile.open("data.csv");
-  myfile << "sectror_id, lat" << endl;
-  for (unsigned i = 0; i < TOTAL_THREADS; i++) {
-    myfile << i << "," << stopClk[i] - startClk[i] << endl;
-  }
-
-  std::cout << "Saving L1 sector data at data.csv" << std::endl;
-
-  myfile.close();
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_shared_bw/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_shared_bw/Makefile
deleted file mode 100644
index e5c3c78ec..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_shared_bw/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = l1_shared_bw.cu
-
-EXE = l1_shared_bw
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_shared_bw/l1_shared_bw.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_shared_bw/l1_shared_bw.cu
deleted file mode 100644
index e16b2124d..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_shared_bw/l1_shared_bw.cu
+++ /dev/null
@@ -1,119 +0,0 @@
-// This benchmark measures the maximum read bandwidth of shared memory and L1 at
-// the same time
-
-#include <assert.h>
-#include <cuda.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-// array size is half the L1 size (2) * float size (4)
-#define ARRAY_SIZE (L1_SIZE / 8)
-// 32 KB of shd memory
-#define SHARED_MEM_SIZE (32 * 1024 / 4)
-#define ITERS 4096
-
-__global__ void shared_bw(uint32_t *startClk, uint32_t *stopClk,
-                          uint32_t *dsink, uint32_t *l1, uint32_t stride) {
-
-  // thread index
-  uint32_t tid = threadIdx.x;
-  uint32_t bid = blockIdx.x;
-  uint32_t uid = bid * blockDim.x + tid;
-  uint32_t n_threads = blockDim.x * gridDim.x;
-
-  register uint32_t tmp_s = uid;
-  register uint32_t tmp_l1 = uid;
-  uint32_t start = 0;
-  uint32_t stop = 0;
-
-  __shared__ uint32_t s[SHARED_MEM_SIZE]; // static shared memory
-  // uint32_t s[SHARED_MEM_SIZE];
-  // one thread to initialize the pointer-chasing array
-  for (uint32_t i = uid; i < (SHARED_MEM_SIZE); i += n_threads)
-    s[i] = (i + stride + 7) % SHARED_MEM_SIZE;
-
-  // warmup l1 cache
-  for (uint32_t i = 0; i < ARRAY_SIZE; ++i) {
-    tmp_l1 = l1[tmp_l1];
-  }
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-
-  // load data from l1 cache and accumulate
-  for (uint32_t i = 0; i < ITERS; ++i) {
-    tmp_s = s[tmp_s];
-    tmp_l1 = l1[tmp_l1];
-  }
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-  // sink0 = tmp;
-  // write time and data back to memory
-  startClk[uid] = start;
-  stopClk[uid] = stop;
-  dsink[uid] = tmp_s + tmp_l1;
-}
-
-int main() {
-  intilizeDeviceProp(0);
-
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM;
-  THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM;
-
-  assert(SHARED_MEM_SIZE * sizeof(uint32_t) < MAX_SHARED_MEM_SIZE_PER_BLOCK);
-
-  uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint32_t *dsink = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-
-  uint32_t *posArray = (uint32_t *)malloc(ARRAY_SIZE * sizeof(uint32_t));
-
-  uint32_t stride = 1024;
-
-  for (uint32_t i = 0; i < ARRAY_SIZE; i++)
-    posArray[i] = (i + stride + 1) % ARRAY_SIZE;
-
-  uint32_t *posArray_g;
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  uint32_t *dsink_g;
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(uint32_t)));
-
-  gpuErrchk(cudaMemcpy(posArray_g, posArray, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyHostToDevice));
-
-  shared_bw<<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, dsink_g,
-                                               posArray_g, stride);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-
-  double bw;
-  bw = (double)(ITERS * TOTAL_THREADS * 4 * 2) /
-       ((double)(stopClk[0] - startClk[0]));
-  printf("Shared Memory Bandwidth = %f (byte/clk/SM)\n", bw);
-  printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_write_policy/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_write_policy/Makefile
deleted file mode 100644
index 7855682c9..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_write_policy/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-SRC = l1_write_policy.cu
-
-EXE = l1_write_policy
-
-NVCC_FLGAS = -Xptxas -dlcm=ca
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_write_policy/l1_write_policy.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_write_policy/l1_write_policy.cu
deleted file mode 100644
index c92ef1572..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_write_policy/l1_write_policy.cu
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
-This microbenchmark detects L1 write policy
-check the nvprof or nvsight for received l1 reads and writes to detect the
-policy check the comments below for further details and also see our arvix
-paper: https://arxiv.org/pdf/1810.07269.pdf
-
- to run the program with nvsight
- make nvsight ./l1_write_policy
-*/
-
-#include <assert.h>
-#include <cstdio>
-#include <iostream>
-#include <stdint.h>
-using namespace std;
-
-#include "../../../hw_def/hw_def.h"
-
-#define THREADS_NUM 1   // Launch only one thread
-#define ARRAY_SIZE 1024 // size of the array
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Device code
-
-/*
-check the nvprof or nvsight to see the L1 reads and write hits
-in the below mb, we have 6 reads and 4 writes
-
-1. Check the write allocation policy
- we have three policies: write no-allocate vs write-allocate fetch-on-write vs
-vs write-allocate sub-sector write?? if only two write hits (C[i] and A[i] at
-lines 3&4) ==> then write no-allocate else if three write hits (C[i+1], C[i] and
-A[i] at lines 2&3&4) then it is write-allocate But is it fect-on-write or
-sub-sector fetch-on-read ? if one read miss (A[i] at line1) and 5 reads hits ==>
-then fetch-on-write, as the miss at line1 will fetch the whole sector, and C[i]
-and line 3 is hit else if two read misses (A[i] at lines 1 and C[i] at line 3)
-==> then sub-sector write with write bit-mask, the sector will be fetched on
-read miss, not write miss
-
-2. check if write-back or write-through
-check the L2 writes, if four writes are received ==> then write-through
-if less than four writes ==> then write-back
-
-to run the program with nvsight
- make nvsight ./l1_write_policy
-stats to look at:
-l1 reads: l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum
-l1 writes:l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum
-l1 read hits:l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_hit.sum
-l1 write hits: l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_hit.sum
-
-The comments below shows a case of write-allocate with sub-sector mask as in
-Pascal, Volta, Turing and Ampere HW Results found: Pascal: write-no allocate +
-write-through Volta, Turing and Ampere: write allocate & sub-sector write +
-write-through
-*/
-
-__global__ void write_policy_mb(float *A, float *C) {
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i == 0) {
-    C[i] = A[i];        // write to C[i] is a miss (cache line is missing)
-    C[i + 1] = A[i];    // write to C[i+1] is a hit (cache line is found)
-    C[i] = C[i] + A[i]; // read of C[i] is a miss (entire sector is missing,
-                        // fetch it from memory)
-    A[i] =
-        C[i] + C[i + 1]; // read C[i] and C[i+1] are hits (entire sector exists)
-  }
-}
-
-//////////////////////////////////////////////////////
-int main(int argc, char *argv[]) {
-  intilizeDeviceProp(0);
-
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM;
-  THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM;
-
-  // create 4KB buffers of A&C
-  assert(ARRAY_SIZE * sizeof(float) < L1_SIZE);
-
-  float *A = (float *)malloc(ARRAY_SIZE * sizeof(float));
-  float *C = (float *)malloc(ARRAY_SIZE * sizeof(float));
-
-  float *A_g;
-  float *C_g;
-
-  gpuErrchk(cudaMalloc(&A_g, ARRAY_SIZE * sizeof(float)));
-  gpuErrchk(cudaMalloc(&C_g, ARRAY_SIZE * sizeof(float)));
-
-  for (uint32_t i = 0; i < ARRAY_SIZE; i++)
-    A[i] = (float)i;
-
-  gpuErrchk(
-      cudaMemcpy(A_g, A, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice));
-
-  write_policy_mb<<<1, THREADS_NUM>>>(A_g, C_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(C, C_g, ARRAY_SIZE * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-
-  std::cout << "\nThis microbenchmark detects L1 write policy.\n";
-  std::cout << "check the nvprof or nvsight for received l1 reads and writes "
-               "to detect the policy.\n";
-  std::cout << "see the code comments for further details\n";
-  std::cout
-      << "to run the program with nvsight: make nvsight ./l1_write_policy\n";
-  std::cout
-      << "stats to look at: l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum & "
-         "l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum & "
-         "l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_hit.sum & "
-         "l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_hit.sum \n\n";
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_access_grain/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_access_grain/Makefile
deleted file mode 100644
index 810ff0e80..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_access_grain/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-
-SRC = l2_access_grain.cu
-
-EXE = l2_access_grain
-
-NVCC_FLGAS = -Xptxas -dlcm=cg
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_access_grain/l2_access_grain.cu b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_access_grain/l2_access_grain.cu
deleted file mode 100644
index 1c6bac182..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_access_grain/l2_access_grain.cu
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
-This benchmark measures l2 access granularity for differnet strides
-check the nvprof or nvsight for received l2 reads and writes
-for further details, see our arvix paper: https://arxiv.org/pdf/1810.07269.pdf
-
-Compile this file using the following command to disable L1 cache:
-    nvcc -Xptxas -dlcm=cg l2_sector_grain.cu
-
-run the program with nsight
- make nvsight ./l2_access_grain
-
- Result: All Nvidia HW generation since kepler has 32B access granularity
- */
-
-#include <assert.h>
-#include <cstdio>
-#include <iostream>
-#include <stdint.h>
-using namespace std;
-
-#include "../../../hw_def/hw_def.h"
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Device code
-__global__ void l2_stride_cons(const float *A, float *C, int stride)
-
-{
-
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  C[i * stride] = A[i * stride];
-}
-
-__global__ void l2_stride(const float *A, float *C, int stride)
-
-{
-
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  C[((i / stride) * 32) + (i % stride)] = A[((i / stride) * 32) + (i % stride)];
-}
-
-// Host code
-void l2_stride(int N, int threadsPerBlock, int stride) {
-  // Variables
-  float *h_A;
-  float *h_C;
-
-  float *d_A;
-  float *d_C;
-
-  size_t size = N * sizeof(float) * 32;
-
-  // Allocate input vectors h_A and h_B in host memory
-  h_A = (float *)malloc(size);
-  h_C = (float *)malloc(size);
-
-  // fill array
-  for (uint32_t i = 0; i < N; i++)
-    h_A[i] = (float)i;
-
-  // Allocate vectors in device memory
-  gpuErrchk(cudaMalloc((void **)&d_A, size));
-  gpuErrchk(cudaMalloc((void **)&d_C, size));
-
-  // Copy vectors from host memory to device memory
-  gpuErrchk(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
-
-  // Invoke kernel
-  int blocksPerGrid = ((N + threadsPerBlock - 1) / threadsPerBlock);
-
-  l2_stride<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, stride);
-  gpuErrchk(cudaPeekAtLastError());
-
-  // Copy result from device memory to host memory
-  // h_C contains the result in host memory
-  gpuErrchk(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));
-
-  // Free device memory
-  if (d_A)
-    cudaFree(d_A);
-  if (d_C)
-    cudaFree(d_C);
-
-  // Free host memory
-  if (h_A)
-    free(h_A);
-  if (h_C)
-    free(h_C);
-}
-//////////////////////////////////////////////////////
-int main(int argc, char *argv[]) {
-  intilizeDeviceProp(0);
-
-  for (int i = 1; i <= WARP_SIZE; ++i)
-    l2_stride(WARP_SIZE, WARP_SIZE, i);
-
-  std::cout << "\nThis benchmark measures l2 access granularity for differnet "
-               "strides.\n";
-  std::cout << "check the nvprof or nvsight for received l2 reads and write.\n";
-  std::cout
-      << "to run the program with nsight: make nvsight ./l2_access_grain\n";
-  std::cout << "stats to look at: lts__t_sectors_srcunit_tex_op_read.sum and "
-               "lts__t_sectors_srcunit_tex_op_write.sum \n\n";
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_128/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_128/Makefile
deleted file mode 100644
index 39ad775b7..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_128/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-
-SRC = l2_bw_128.cu
-
-EXE = l2_bw_128
-
-NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_128/l2_bw_128.cu b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_128/l2_bw_128.cu
deleted file mode 100644
index f12591f45..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_128/l2_bw_128.cu
+++ /dev/null
@@ -1,148 +0,0 @@
-// This code is a modification of L2 cache benchmark from
-//"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking":
-// https://arxiv.org/pdf/1804.06826.pdf
-
-// This benchmark measures the maximum read bandwidth of L2 cache for 32f
-// Compile this file using the following command to disable L1 cache:
-//    nvcc -Xptxas -dlcm=cg -Xptxas -dscm=wt l2_bw.cu
-
-#include <algorithm>
-#include <assert.h>
-#include <cuda.h>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_TIMES 256
-
-/*
-L2 cache is warmed up by loading posArray and adding sink
-Start timing after warming up
-Load posArray and add sink to generate read traffic
-Repeat the previous step while offsetting posArray by one each iteration
-Stop timing and store data
-*/
-
-__global__ void l2_bw(uint64_t *startClk, uint64_t *stopClk, float *dsink,
-                      float *posArray, unsigned ARRAY_SIZE) {
-  // block and thread index
-  uint32_t tid = threadIdx.x;
-  uint32_t bid = blockIdx.x;
-  uint32_t uid = bid * blockDim.x + tid;
-
-  // a register to avoid compiler optimization
-  float sink0 = 0;
-  float sink1 = 0;
-  float sink2 = 0;
-  float sink3 = 0;
-
-  // warm up l2 cache
-  for (uint32_t i = uid; i < ARRAY_SIZE; i += blockDim.x * gridDim.x) {
-    float *ptr = posArray + i;
-    // every warp loads all data in l2 cache
-    // use cg modifier to cache the load in L2 and bypass L1
-    asm volatile("{\t\n"
-                 ".reg .f32 data;\n\t"
-                 "ld.global.cg.f32 data, [%1];\n\t"
-                 "add.f32 %0, data, %0;\n\t"
-                 "}"
-                 : "+f"(sink0)
-                 : "l"(ptr)
-                 : "memory");
-  }
-
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint64_t start = 0;
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory");
-
-  // load data from l2 cache and accumulate,
-  for (uint32_t i = 0; i < REPEAT_TIMES; i++) {
-    float *ptr = posArray + (((i * warpSize * 4) + uid * 4) % ARRAY_SIZE);
-    asm volatile("{\t\n"
-                 ".reg .f32 data<4>;\n\t"
-                 "ld.global.cg.v4.f32 {data0,data1,data2,data3}, [%4];\n\t"
-                 "add.f32 %0, data0, %0;\n\t"
-                 "add.f32 %1, data1, %1;\n\t"
-                 "add.f32 %2, data2, %2;\n\t"
-                 "add.f32 %3, data3, %3;\n\t"
-                 "}"
-                 : "+f"(sink0), "+f"(sink1), "+f"(sink2), "+f"(sink3)
-                 : "l"(ptr)
-                 : "memory");
-  }
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint64_t stop = 0;
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory");
-
-  // store the result
-  startClk[uid] = start;
-  stopClk[uid] = stop;
-  dsink[uid] = sink0 + sink1 + sink2 + sink3;
-}
-
-int main() {
-
-  intilizeDeviceProp(0);
-
-  unsigned ARRAY_SIZE = TOTAL_THREADS * 4 + REPEAT_TIMES * WARP_SIZE * 4;
-  // Array size must not exceed L2 size
-  assert(ARRAY_SIZE * sizeof(float) < L2_SIZE);
-
-  uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-
-  float *posArray = (float *)malloc(ARRAY_SIZE * sizeof(float));
-  float *dsink = (float *)malloc(TOTAL_THREADS * sizeof(float));
-
-  float *posArray_g;
-  float *dsink_g;
-  uint64_t *startClk_g;
-  uint64_t *stopClk_g;
-
-  assert(ARRAY_SIZE < L2_SIZE);
-
-  for (int i = 0; i < ARRAY_SIZE; i++)
-    posArray[i] = (float)i;
-
-  gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(float)));
-  gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(float)));
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-
-  gpuErrchk(cudaMemcpy(posArray_g, posArray, ARRAY_SIZE * sizeof(float),
-                       cudaMemcpyHostToDevice));
-
-  l2_bw<<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, dsink_g,
-                                           posArray_g, ARRAY_SIZE);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(float),
-                       cudaMemcpyDeviceToHost));
-
-  float bw, BW;
-  unsigned long long data =
-      (unsigned long long)TOTAL_THREADS * REPEAT_TIMES * sizeof(float) * 4;
-  uint64_t total_time = stopClk[0] - startClk[0];
-  bw = (float)(data) / ((float)(stopClk[0] - startClk[0]));
-  BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024;
-  std::cout << "L2 bandwidth = " << bw << "(byte/clk), " << BW << "(GB/s)\n";
-  float max_bw = get_num_channels(MEM_BITWIDTH, DRAM_MODEL) *
-                 L2_BANKS_PER_MEM_CHANNEL * L2_BANK_WIDTH_in_BYTE;
-  BW = max_bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024;
-  std::cout << "Max Theortical L2 bandwidth = " << max_bw << "(byte/clk), "
-            << BW << "(GB/s)\n";
-  std::cout << "L2 BW achievable = " << (bw / max_bw) * 100 << "%\n";
-  std::cout << "Total Clk number = " << total_time << "\n";
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/Makefile
deleted file mode 100644
index 37f8c3a92..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-SRC = l2_bw_32f.cu
-
-EXE = l2_bw_32f
-
-NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu
deleted file mode 100644
index 9d4aa80d7..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu
+++ /dev/null
@@ -1,141 +0,0 @@
-// This code is a modification of L2 cache benchmark from
-//"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking":
-// https://arxiv.org/pdf/1804.06826.pdf
-
-// This benchmark measures the maximum read bandwidth of L2 cache for 32f
-// Compile this file using the following command to disable L1 cache:
-//    nvcc -Xptxas -dlcm=cg -Xptxas -dscm=wt l2_bw.cu
-
-#include <algorithm>
-#include <assert.h>
-#include <cuda.h>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_TIMES 2048
-
-/*
-L2 cache is warmed up by loading posArray and adding sink
-Start timing after warming up
-Load posArray and add sink to generate read traffic
-Repeat the previous step while offsetting posArray by one each iteration
-Stop timing and store data
-*/
-
-__global__ void l2_bw(uint64_t *startClk, uint64_t *stopClk, float *dsink,
-                      float *posArray, unsigned ARRAY_SIZE) {
-  // block and thread index
-  uint32_t tid = threadIdx.x;
-  uint32_t bid = blockIdx.x;
-  uint32_t uid = bid * blockDim.x + tid;
-
-  // a register to avoid compiler optimization
-  float sink = 0;
-
-  // warm up l2 cache
-  for (uint32_t i = uid; i < ARRAY_SIZE; i += blockDim.x * gridDim.x) {
-    float *ptr = posArray + i;
-    // every warp loads all data in l2 cache
-    // use cg modifier to cache the load in L2 and bypass L1
-    asm volatile("{\t\n"
-                 ".reg .f32 data;\n\t"
-                 "ld.global.cg.f32 data, [%1];\n\t"
-                 "add.f32 %0, data, %0;\n\t"
-                 "}"
-                 : "+f"(sink)
-                 : "l"(ptr)
-                 : "memory");
-  }
-
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint64_t start = 0;
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory");
-
-  // load data from l2 cache and accumulate,
-  for (uint32_t i = 0; i < REPEAT_TIMES; i++) {
-    float *ptr = posArray + (i * warpSize) + uid;
-    asm volatile("{\t\n"
-                 ".reg .f32 data;\n\t"
-                 "ld.global.cg.f32 data, [%1];\n\t"
-                 "add.f32 %0, data, %0;\n\t"
-                 "}"
-                 : "+f"(sink)
-                 : "l"(ptr)
-                 : "memory");
-  }
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint64_t stop = 0;
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory");
-
-  // store the result
-  startClk[bid * blockDim.x + tid] = start;
-  stopClk[bid * blockDim.x + tid] = stop;
-  dsink[bid * blockDim.x + tid] = sink;
-}
-
-int main() {
-  intilizeDeviceProp(0);
-
-  unsigned ARRAY_SIZE = TOTAL_THREADS + REPEAT_TIMES * WARP_SIZE;
-  assert(ARRAY_SIZE * sizeof(float) <
-         L2_SIZE); // Array size must not exceed L2 size
-
-  uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-
-  float *posArray = (float *)malloc(ARRAY_SIZE * sizeof(float));
-  float *dsink = (float *)malloc(TOTAL_THREADS * sizeof(float));
-
-  float *posArray_g;
-  float *dsink_g;
-  uint64_t *startClk_g;
-  uint64_t *stopClk_g;
-
-  for (int i = 0; i < ARRAY_SIZE; i++)
-    posArray[i] = (float)i;
-
-  gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(float)));
-  gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(float)));
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-
-  gpuErrchk(cudaMemcpy(posArray_g, posArray, ARRAY_SIZE * sizeof(float),
-                       cudaMemcpyHostToDevice));
-
-  l2_bw<<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, dsink_g,
-                                           posArray_g, ARRAY_SIZE);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(float),
-                       cudaMemcpyDeviceToHost));
-
-  float bw, BW;
-  unsigned long long data =
-      (unsigned long long)TOTAL_THREADS * REPEAT_TIMES * sizeof(float);
-  uint64_t total_time = stopClk[0] - startClk[0];
-  // uint64_t total_time =
-  // *std::max_element(&stopClk[0],&stopClk[TOTAL_THREADS])-*std::min_element(&startClk[0],&startClk[TOTAL_THREADS]);
-  bw = (float)(data) / ((float)(total_time));
-  BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024;
-  std::cout << "L2 bandwidth = " << bw << "(byte/clk), " << BW << "(GB/s)\n";
-  float max_bw = get_num_channels(MEM_BITWIDTH, DRAM_MODEL) *
-                 L2_BANKS_PER_MEM_CHANNEL * L2_BANK_WIDTH_in_BYTE;
-  BW = max_bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024;
-  std::cout << "Max Theortical L2 bandwidth = " << max_bw << "(byte/clk), "
-            << BW << "(GB/s)\n";
-  std::cout << "L2 BW achievable = " << (bw / max_bw) * 100 << "%\n";
-  std::cout << "Total Clk number = " << total_time << "\n";
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_64f/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_64f/Makefile
deleted file mode 100644
index 131ec359c..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_64f/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-
-SRC = l2_bw_64f.cu
-
-EXE = l2_bw_64f
-
-NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_64f/l2_bw_64f.cu b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_64f/l2_bw_64f.cu
deleted file mode 100644
index 64f69c506..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_64f/l2_bw_64f.cu
+++ /dev/null
@@ -1,140 +0,0 @@
-// This code is a modification of L2 cache benchmark from
-//"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking":
-// https://arxiv.org/pdf/1804.06826.pdf
-
-// This benchmark measures the maximum read bandwidth of L2 cache for 64 bit
-// Compile this file using the following command to disable L1 cache:
-//    nvcc -Xptxas -dlcm=cg -Xptxas -dscm=wt l2_bw.cu
-
-#include <algorithm>
-#include <assert.h>
-#include <cuda.h>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define REPEAT_TIMES 2048
-
-/*
-L2 cache is warmed up by loading posArray and adding sink
-Start timing after warming up
-Load posArray and add sink to generate read traffic
-Repeat the previous step while offsetting posArray by one each iteration
-Stop timing and store data
-*/
-
-__global__ void l2_bw(uint32_t *startClk, uint32_t *stopClk, double *dsink,
-                      double *posArray, unsigned ARRAY_SIZE) {
-  // block and thread index
-  uint32_t tid = threadIdx.x;
-  uint32_t bid = blockIdx.x;
-  uint32_t uid = bid * blockDim.x + tid;
-
-  // a register to avoid compiler optimization
-  double sink = 0;
-
-  // warm up l2 cache
-  for (uint32_t i = uid; i < ARRAY_SIZE; i += blockDim.x * gridDim.x) {
-    double *ptr = posArray + i;
-    // every warp loads all data in l2 cache
-    // use cg modifier to cache the load in L2 and bypass L1
-    asm volatile("{\t\n"
-                 ".reg .f64 data;\n\t"
-                 "ld.global.cg.f64 data, [%1];\n\t"
-                 "add.f64 %0, data, %0;\n\t"
-                 "}"
-                 : "+d"(sink)
-                 : "l"(ptr)
-                 : "memory");
-  }
-
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint32_t start = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-
-  // benchmark starts
-  // load data from l2 cache and accumulate,
-  for (uint32_t i = 0; i < REPEAT_TIMES; i++) {
-    double *ptr = posArray + (i * warpSize) + uid;
-    asm volatile("{\t\n"
-                 ".reg .f64 data;\n\t"
-                 "ld.global.cg.f64 data, [%1];\n\t"
-                 "add.f64 %0, data, %0;\n\t"
-                 "}"
-                 : "+d"(sink)
-                 : "l"(ptr)
-                 : "memory");
-  }
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint32_t stop = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-  // store the result
-  startClk[bid * blockDim.x + tid] = start;
-  stopClk[bid * blockDim.x + tid] = stop;
-  dsink[bid * blockDim.x + tid] = sink;
-}
-
-int main() {
-
-  intilizeDeviceProp(0);
-
-  unsigned ARRAY_SIZE = TOTAL_THREADS + REPEAT_TIMES * WARP_SIZE;
-  // Array size must not exceed L2 size
-  assert(ARRAY_SIZE * sizeof(double) < L2_SIZE);
-
-  uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-
-  double *posArray = (double *)malloc(ARRAY_SIZE * sizeof(double));
-  double *dsink = (double *)malloc(TOTAL_THREADS * sizeof(double));
-
-  double *posArray_g;
-  double *dsink_g;
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-
-  for (int i = 0; i < ARRAY_SIZE; i++)
-    posArray[i] = (double)i;
-
-  gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(double)));
-  gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(double)));
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-
-  gpuErrchk(cudaMemcpy(posArray_g, posArray, ARRAY_SIZE * sizeof(double),
-                       cudaMemcpyHostToDevice));
-
-  l2_bw<<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, dsink_g,
-                                           posArray_g, ARRAY_SIZE);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(double),
-                       cudaMemcpyDeviceToHost));
-
-  float bw, BW;
-  unsigned long long data =
-      (unsigned long long)TOTAL_THREADS * REPEAT_TIMES * sizeof(double);
-  uint64_t total_time = stopClk[0] - startClk[0];
-  bw = (float)(data) / ((float)(total_time));
-  BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024;
-  std::cout << "L2 bandwidth = " << bw << "(byte/clk), " << BW << "(GB/s)\n";
-  float max_bw = get_num_channels(MEM_BITWIDTH, DRAM_MODEL) *
-                 L2_BANKS_PER_MEM_CHANNEL * L2_BANK_WIDTH_in_BYTE;
-  BW = max_bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024;
-  std::cout << "Max Theortical L2 bandwidth = " << max_bw << "(byte/clk), "
-            << BW << "(GB/s)\n";
-  std::cout << "L2 BW achievable = " << (bw / max_bw) * 100 << "%\n";
-  std::cout << "Total Clk number = " << total_time << "\n";
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_config/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_config/Makefile
deleted file mode 100644
index 281ce63e2..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_config/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = l2_config.cu
-
-EXE = l2_config
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_config/l2_config.cu b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_config/l2_config.cu
deleted file mode 100644
index 677bc2ba5..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_config/l2_config.cu
+++ /dev/null
@@ -1,89 +0,0 @@
-#include <iostream>
-using namespace std;
-
-#include "../../../hw_def/hw_def.h"
-
-// We know the below information from running our ubench, we copy and paste the
-// ubench results below manullay
-// TODO: we will automate this process
-
-// we know sector size from l2_access_grain ubench
-#define L2_CACHE_LINE_SIZE 128
-#define L2_SECTOR_SIZE 32
-#define IS_SECTOR 1
-
-// It is hard to know the exact l2 assoc from ubenhmarking
-// Thus, based on previous work, we assume assoc is constant and = 16
-// similar to AMD GPU:
-// https://www.techpowerup.com/gpu-specs/docs/amd-gcn1-architecture.pdf
-#define L2_CACHE_ASSOC 16
-
-// L2 cache cache since kepler and above is write-allocate, subsector-write,
-// write-back. We know that from l2_write_policy ubench and has been consistent
-// since kepler. Change it accordingly if it changes in new generations
-static const char *L2_Cache_Write_Policy = ",L:B:m:L:";
-
-// For now, accel-sim only supoprts ipoly for 64 and less
-#define ACCELSIM_IPOLY_HASH_SUPPORT 64
-// 8 byte for icnt control
-#define ACCELSIM_ICNT_CONTROL 8
-
-int main() {
-  intilizeDeviceProp(0);
-
-  if (deviceProp.l2CacheSize) {
-    printf("L2 Cache Size = %.0f MB\n",
-           static_cast<float>(deviceProp.l2CacheSize / 1048576.0f));
-  }
-
-  unsigned mem_channel = get_num_channels(MEM_BITWIDTH, DRAM_MODEL);
-  unsigned l2_banks_num = mem_channel * L2_BANKS_PER_MEM_CHANNEL;
-
-  std::cout << "L2 Banks number = " << l2_banks_num << std::endl;
-
-  if (ACCEL_SIM_MODE) {
-
-    std::cout << "\n//Accel_Sim config: \n";
-
-    unsigned l2_size_per_bank = L2_SIZE / l2_banks_num;
-    unsigned assoc, sets_num;
-    char set_indexing = 'L'; // by default assume linear indexing
-    char is_sector = IS_SECTOR ? 'S' : 'N';
-    if (isPowerOfTwo(l2_size_per_bank)) {
-      assoc = L2_CACHE_ASSOC;
-      sets_num = l2_size_per_bank / L2_CACHE_LINE_SIZE / assoc;
-      if (sets_num <= ACCELSIM_IPOLY_HASH_SUPPORT)
-        set_indexing = 'P';
-      else
-        set_indexing = 'X'; // bitwise xoring
-    } else {
-      // if not power of two, assume it is 24, as most NVidia GPU L2 cache size
-      // that is not power of two, is actually divisble by 24
-      assoc = 24;
-      // ensure that our assumption is true
-      assert((l2_size_per_bank / L2_CACHE_LINE_SIZE) % assoc == 0);
-      sets_num = l2_size_per_bank / L2_CACHE_LINE_SIZE / assoc;
-      if (isPowerOfTwo(sets_num) && l2_banks_num <= ACCELSIM_IPOLY_HASH_SUPPORT)
-        set_indexing = 'P';
-      else if (isPowerOfTwo(sets_num))
-        set_indexing = 'X'; // bitwise xoring
-    }
-
-    std::cout << "-gpgpu_n_sub_partition_per_mchannel "
-              << L2_BANKS_PER_MEM_CHANNEL << std::endl;
-    std::cout << "-icnt_flit_size "
-              << L2_BANK_WIDTH_in_BYTE + ACCELSIM_ICNT_CONTROL
-              << std::endl; // 8bytes for control
-    if (isPowerOfTwo(l2_banks_num) &&
-        l2_banks_num <= ACCELSIM_IPOLY_HASH_SUPPORT)
-      std::cout << "-gpgpu_memory_partition_indexing 2" << std::endl;
-    else
-      std::cout << "-gpgpu_memory_partition_indexing 0" << std::endl;
-    std::cout << "-gpgpu_cache:dl2 " << is_sector << ":" << sets_num << ":"
-              << L2_CACHE_LINE_SIZE << ":" << assoc << L2_Cache_Write_Policy
-              << set_indexing << ","
-              << "A:192:4,32:0,32" << std::endl;
-  }
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_copy_engine/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_copy_engine/Makefile
deleted file mode 100644
index 784f28d0e..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_copy_engine/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-
-SRC = l2_copy_engine.cu
-
-EXE = l2_copy_engine
-
-NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_copy_engine/l2_copy_engine.cu b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_copy_engine/l2_copy_engine.cu
deleted file mode 100644
index bcb4988b3..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_copy_engine/l2_copy_engine.cu
+++ /dev/null
@@ -1,136 +0,0 @@
-// This ubench meaures if DMA memory copy is cached in L2 by default
-
-#include <assert.h>
-#include <cuda.h>
-#include <iostream> // std::abs
-#include <math.h>   // std::abs
-#include <numeric>  // std::accumulate
-#include <stdio.h>
-#include <stdlib.h>
-using namespace std;
-
-#include "../../../hw_def/hw_def.h"
-#include "../l2_lat/l2_lat.h"
-
-#define REPEAT_TIMES 32768 // iterate over the array ITERS times
-#define ARRAY_SIZE_L2 32768
-
-__global__ void l2_lat_no_warmpu(uint32_t *startClk, uint32_t *stopClk,
-                                 uint64_t *posArray, uint64_t *dsink) {
-
-  // thread index
-  uint32_t tid = threadIdx.x;
-
-  // do pointer-chasing without warmpup
-  if (tid == 0) {
-
-    uint64_t *ptr = posArray + tid;
-    uint64_t ptr1, ptr0;
-
-    // initialize the pointers with the start address
-    // use cg modifier to cache the load in L2 and bypass L1
-    asm volatile("{\t\n"
-                 "ld.global.cg.u64 %0, [%1];\n\t"
-                 "}"
-                 : "=l"(ptr1)
-                 : "l"(ptr)
-                 : "memory");
-
-    // synchronize all threads
-    asm volatile("bar.sync 0;");
-
-    // start timing
-    uint32_t start = 0;
-    asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-
-    // pointer-chasing ITERS times
-    // use cg modifier to cache the load in L2 and bypass L1
-    for (uint32_t i = 0; i < REPEAT_TIMES; ++i) {
-      asm volatile("{\t\n"
-                   "ld.global.cg.u64 %0, [%1];\n\t"
-                   "}"
-                   : "=l"(ptr0)
-                   : "l"((uint64_t *)ptr1)
-                   : "memory");
-      ptr1 = ptr0; // swap the register for the next load
-    }
-
-    // stop timing
-    uint32_t stop = 0;
-    asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-    // write time and data back to memory
-    startClk[tid] = start;
-    stopClk[tid] = stop;
-    dsink[tid] = ptr1;
-  }
-}
-
-int main() {
-  intilizeDeviceProp(0);
-
-  unsigned THREADS_NUM = 1;
-
-  // Array size must not exceed L2 size
-  assert(ARRAY_SIZE_L2 * sizeof(uint64_t) < L2_SIZE);
-
-  uint64_t *posArray = (uint64_t *)malloc(ARRAY_SIZE_L2 * sizeof(uint64_t));
-  uint32_t *startClk = (uint32_t *)malloc(THREADS_NUM * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(THREADS_NUM * sizeof(uint32_t));
-  uint64_t *dsink = (uint64_t *)malloc(THREADS_NUM * sizeof(uint64_t));
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  uint64_t *posArray_g;
-  uint64_t *dsink_g;
-
-  uint64_t stride = 1;
-
-  gpuErrchk(cudaMalloc(&startClk_g, THREADS_NUM * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, THREADS_NUM * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE_L2 * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&dsink_g, THREADS_NUM * sizeof(uint64_t)));
-
-  // initilze pointer-chasing on the CPU side
-  for (uint64_t i = 0; i < ARRAY_SIZE_L2; i++) {
-    uint64_t *tmp = posArray_g + ((i + stride) % ARRAY_SIZE_L2);
-    posArray[i] = (uint64_t)tmp;
-  }
-
-  gpuErrchk(cudaMemcpy(posArray_g, posArray, sizeof(uint64_t) * ARRAY_SIZE_L2,
-                       cudaMemcpyHostToDevice));
-
-  // here we measure the latency of the request without warmup
-  l2_lat_no_warmpu<<<1, THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g,
-                                       dsink_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, THREADS_NUM * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, THREADS_NUM * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(dsink, dsink_g, THREADS_NUM * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  float l2_nowarmp_lat = (float)(stopClk[0] - startClk[0]) / REPEAT_TIMES;
-  printf("L2 Latency no-warmp up = %12.4f cycles \n", l2_nowarmp_lat);
-  printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);
-
-  // then we measure L2 hit latncy with warmpup
-  float l2_hit_lat2 = l2_hit_lat();
-
-  // if the latency is close to the l2 hit latency, then the memcpy are cached
-  // by default at L2
-  float error = (abs(l2_nowarmp_lat - l2_hit_lat2) / l2_hit_lat2) * 100;
-  bool cached = (error < 10.0f);
-  if (cached)
-    printf("Is memcpy cached in L2? Yes, error=%2.1f\n", error);
-  else
-    printf("Is memcpy cached in L2? No, error=%2.1f\n", error);
-
-  if (ACCEL_SIM_MODE) {
-    std::cout << "\n//Accel_Sim config: \n";
-    std::cout << "-gpgpu_perf_sim_memcpy " << cached << std::endl;
-  }
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_lat/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_lat/Makefile
deleted file mode 100644
index 13a411fdf..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_lat/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-
-SRC = l2_lat.cu
-
-EXE = l2_lat
-
-NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.cu b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.cu
deleted file mode 100644
index d54508fcd..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.cu
+++ /dev/null
@@ -1,19 +0,0 @@
-#include "../../l1_cache/l1_lat/l1_lat.h"
-#include "l2_lat.h"
-
-int main() {
-
-  intilizeDeviceProp(0);
-
-  float lat2 = l2_hit_lat();
-
-  if (ACCEL_SIM_MODE) {
-    float lat1 = l1_lat();
-
-    std::cout << "\n//Accel_Sim config: \n";
-    std::cout << "-gpgpu_l2_rop_latency " << (unsigned)(lat2 - lat1)
-              << std::endl;
-  }
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.h b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.h
deleted file mode 100644
index d09381516..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.h
+++ /dev/null
@@ -1,117 +0,0 @@
-// This code is a modification of L1 cache benchmark from
-//"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking":
-// https://arxiv.org/pdf/1804.06826.pdf
-
-// This benchmark measures the latency of L2 latency using pointer-chasing
-
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <cuda.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define ITERS 32768 // iterate over the array ITERS times
-#define ARRAY_SIZE 4096
-
-__global__ void l2_hit_lat(uint32_t *startClk, uint32_t *stopClk,
-                           uint64_t *posArray, uint64_t *dsink) {
-
-  // thread index
-  uint32_t tid = threadIdx.x;
-
-  // initialize pointer-chasing array with just one thread
-  // warp up L2 cache and ensure all next accesses hit
-  if (tid == 0) {
-    for (uint32_t i = 0; i < (ARRAY_SIZE - 1); i++)
-      posArray[i] = (uint64_t)(posArray + i + 1);
-
-    posArray[ARRAY_SIZE - 1] = (uint64_t)posArray;
-  }
-
-  if (tid == 0) {
-
-    uint64_t *ptr = posArray + tid;
-    uint64_t ptr1, ptr0;
-
-    // initialize the pointers with the start address
-    // use cg modifier to cache the load in L2 and bypass L1
-    asm volatile("{\t\n"
-                 "ld.global.cg.u64 %0, [%1];\n\t"
-                 "}"
-                 : "=l"(ptr1)
-                 : "l"(ptr)
-                 : "memory");
-
-    // synchronize all threads
-    asm volatile("bar.sync 0;");
-
-    // start timing
-    uint32_t start = 0;
-    asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-
-    // pointer-chasing ITERS times
-    // use cg modifier to cache the load in L2 and bypass L1
-    for (uint32_t i = 0; i < ITERS; ++i) {
-      asm volatile("{\t\n"
-                   "ld.global.cg.u64 %0, [%1];\n\t"
-                   "}"
-                   : "=l"(ptr0)
-                   : "l"((uint64_t *)ptr1)
-                   : "memory");
-      ptr1 = ptr0; // swap the register for the next load
-    }
-
-    // stop timing
-    uint32_t stop = 0;
-    asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-    // write time and data back to memory
-    startClk[tid] = start;
-    stopClk[tid] = stop;
-    dsink[tid] = ptr1;
-  }
-}
-
-int l2_hit_lat() {
-  intilizeDeviceProp(0);
-
-  BLOCKS_NUM = 1;
-  THREADS_PER_BLOCK = 1;
-  TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM;
-
-  // Array size must not exceed L2 size
-  assert(ARRAY_SIZE * sizeof(uint64_t) < L2_SIZE);
-
-  uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint64_t *dsink = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  uint64_t *posArray_g;
-  uint64_t *dsink_g;
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(uint64_t)));
-
-  l2_hit_lat<<<1, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, posArray_g,
-                                       dsink_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-
-  float lat = (float)(stopClk[0] - startClk[0]) / ITERS;
-  printf("L2 Hit Latency = %12.4f cycles \n", lat);
-  printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);
-
-  return lat;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_write_policy/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_write_policy/Makefile
deleted file mode 100644
index 5e9ac9ebf..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_write_policy/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-SRC = l2_write_policy.cu
-
-EXE = l2_write_policy
-
-NVCC_FLGAS = -Xptxas -dlcm=cg
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_write_policy/l2_write_policy.cu b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_write_policy/l2_write_policy.cu
deleted file mode 100644
index 3514782e5..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_write_policy/l2_write_policy.cu
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
-This microbenchmark detects L2 write policy
-check the nvprof or nvsight for received l2 reads and writes to detect the
-policy check the comments below for further details and also see our arvix
-paper: https://arxiv.org/pdf/1810.07269.pdf
-
-Compile this file using the following command to disable L1 cache:
-    nvcc -Xptxas -dlcm=cg l2_write_policy.cu
-
-to run the program with nvsight
- make nvsight ./l2_write_policy
-*/
-
-#include <assert.h>
-#include <cstdio>
-#include <iostream>
-#include <stdint.h>
-using namespace std;
-
-#include "../../../hw_def/hw_def.h"
-
-#define THREADS_NUM 1   // Launch only one thread
-#define ARRAY_SIZE 1024 // size of the array
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Device code
-
-/*
-check the nvprof or nvsight to see the L2/DRAM reads and write hits
-in the below mb, we have 6 reads and 4 writes
-
-1. Check the write allocation policy
- we have three policies: write no-allocate vs write-allocate fetch-on-write vs
-vs write-allocate sub-sector write?? if only two write hits (C[i] and A[i] at
-lines 3&4) ==> then write no-allocate, else if three write hits (C[i+1], C[i]
-and A[i] at lines 2&3&4) then it is write-allocate. if one read miss (A[i] at
-line1) and 5 reads hits ==> then fetch-on-write, as the miss at line1 will fetch
-the whole sector, and C[i] and line 3 is hit else if two read misses (A[i] at
-lines 1 and C[i] at line 3) ==> then sub-sector write with write bit-mask, the
-sector will be fetched on read miss, not write miss
-
-2. check if write-back or write-through
-check the DRAM writes, if four writes are received ==> then write-through
-if less than four writes ==> then write-back
-
-to run the program with nvsight:
- make nvsight ./l2_write_policy
-
- stats to look at:
-l2 reads: lts__t_sectors_srcunit_tex_op_read.sum
-l2 writes: lts__t_sectors_srcunit_tex_op_write.sum
-l2 read hits: lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum
-l2 write hits: lts__t_sectors_srcunit_tex_op_write_lookup_hit.
-
-The comments below shows a case of write-allocate with sub-sector mask as in
-Pascal, Volta, Turing and Ampere HW Results found: Pascal, Volta, Turing and
-Ampere: write allocate &  sub-sector write  + write-back
-*/
-
-__global__ void write_policy_mb(float *A, float *C) {
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i == 0) {
-    C[i] = A[i];        // write to C[i] is a miss (cache line is missing)
-    C[i + 1] = A[i];    // write to C[i+1] is a hit (cache line is found)
-    C[i] = C[i] + A[i]; // read of C[i] is a miss (entire sector is missing,
-                        // fetch it from memory)
-    A[i] =
-        C[i] + C[i + 1]; // read C[i] and C[i+1] are hits (entire sector exists)
-  }
-}
-
-//////////////////////////////////////////////////////
-int main(int argc, char *argv[]) {
-  intilizeDeviceProp(0);
-
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM;
-  THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM;
-
-  // create 4KB buffers of A&C
-  assert(ARRAY_SIZE * sizeof(float) < L2_SIZE);
-
-  float *A = (float *)malloc(ARRAY_SIZE * sizeof(float));
-  float *C = (float *)malloc(ARRAY_SIZE * sizeof(float));
-
-  float *A_g;
-  float *C_g;
-
-  gpuErrchk(cudaMalloc(&A_g, ARRAY_SIZE * sizeof(float)));
-  gpuErrchk(cudaMalloc(&C_g, ARRAY_SIZE * sizeof(float)));
-
-  for (uint32_t i = 0; i < ARRAY_SIZE; i++)
-    A[i] = (float)i;
-
-  gpuErrchk(
-      cudaMemcpy(A_g, A, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice));
-
-  write_policy_mb<<<1, THREADS_NUM>>>(A_g, C_g);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(C, C_g, ARRAY_SIZE * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-
-  std::cout << "\nThis microbenchmark detects L2 write policy.\n";
-  std::cout << "check the nvprof or nvsight for received L2 reads and writes "
-               "to detect the policy.\n";
-  std::cout << "see the code comments for further details\n";
-  std::cout << "to run the program with nvsight: make nvsight ./2\n";
-  std::cout << "stats to look at: llts__t_sectors_srcunit_tex_op_read.sum & "
-               "lts__t_sectors_srcunit_tex_op_write.sum & "
-               "lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum & "
-               "lts__t_sectors_srcunit_tex_op_write_lookup_hit.sum \n\n";
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_atom_size/Makefile b/util/tuner/GPU_Microbenchmark/ubench/mem/mem_atom_size/Makefile
deleted file mode 100644
index 6a97c1634..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_atom_size/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-
-SRC = mem_atom_size.cu
-
-EXE = mem_atom_size
-
-NVCC_FLGAS = -Xptxas -dlcm=cg
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_atom_size/mem_atom_size.cu b/util/tuner/GPU_Microbenchmark/ubench/mem/mem_atom_size/mem_atom_size.cu
deleted file mode 100644
index 69b2370e1..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_atom_size/mem_atom_size.cu
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
-This benchmark measures mem atom size
-check the nvprof or nvsight for received mem reads and writes
-for further details, see our arvix paper: https://arxiv.org/pdf/1810.07269.pdf
-
-Compile this file using the following command to disable L1 cache:
-    nvcc -Xptxas -dlcm=cg mem_atom_size.cu
-
-run the program with nsight
- make nvsight ./mem_atom_size
-
-Result: many Nvidia HW generation since kepler has 32B mem_atom_size granularity
-However, It seems some pascal and volta GPUs have atom size of 64B larger than
-the L2 access grain (32B). We asked Nvidia about this weird behavior, and that's
-their reply: https://forums.developer.nvidia.com/t/pascal-l1-cache/49571/15
-*/
-
-#include <assert.h>
-#include <cstdio>
-#include <iostream>
-#include <stdint.h>
-using namespace std;
-
-#include "../../../hw_def/hw_def.h"
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Device code
-__global__ void mem_stride_cons(const float *A, float *C, int stride)
-
-{
-
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  C[i * stride] = A[i * stride];
-}
-
-// Host code
-void mem_stride(int N, int threadsPerBlock, int stride) {
-  // Variables
-  float *h_A;
-  float *h_C;
-
-  float *d_A;
-  float *d_C;
-
-  size_t size = N * sizeof(float) * 32;
-
-  // Allocate input vectors h_A and h_B in host memory
-  h_A = (float *)malloc(size);
-  h_C = (float *)malloc(size);
-
-  // fill array
-  for (uint32_t i = 0; i < N; i++)
-    h_A[i] = (float)i;
-
-  // Allocate vectors in device memory
-  gpuErrchk(cudaMalloc((void **)&d_A, size));
-  gpuErrchk(cudaMalloc((void **)&d_C, size));
-
-  // Copy vectors from host memory to device memory
-  gpuErrchk(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
-
-  // Invoke kernel
-  int blocksPerGrid = ((N + threadsPerBlock - 1) / threadsPerBlock);
-
-  mem_stride_cons<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, stride);
-  gpuErrchk(cudaPeekAtLastError());
-
-  // Copy result from device memory to host memory
-  // h_C contains the result in host memory
-  gpuErrchk(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));
-
-  // Free device memory
-  if (d_A)
-    cudaFree(d_A);
-  if (d_C)
-    cudaFree(d_C);
-
-  // Free host memory
-  if (h_A)
-    free(h_A);
-  if (h_C)
-    free(h_C);
-}
-//////////////////////////////////////////////////////
-int main(int argc, char *argv[]) {
-  intilizeDeviceProp(0);
-
-  // make the array very large to avoid L2 cache resident
-  // run the threads with stride of 32 (128B) to avoid any coalescing
-  mem_stride((L2_SIZE / sizeof(float)) * 2, 256, 32);
-
-  std::cout << std::endl
-            << "This benchmark measures mem atom size granularity" << std::endl;
-
-  std::cout << "check the nvprof or nvsight for received mem reads and writes"
-            << std::endl;
-  std::cout << "to run the program with nsight: make nvsight ./l2_access_grain"
-            << std::endl;
-  std::cout
-      << "stats to look at: dram__sectors_read.sum & dram__sectors_write.sum & "
-         "dram__bytes_read.sum & dram__sectors_read.sum"
-      << std::endl
-      << std::endl;
-
-  std::cout
-      << "we launched " << (L2_SIZE / sizeof(float)) * 2
-      << " read memory reqs (1 req per thread) with a stride of 32 (128 bytes)"
-      << std::endl;
-  std::cout << "if the number of memory reads is the same as read reqs, then "
-               "mem atom size is 32B"
-            << std::endl;
-  std::cout << "if the number of memory reads is 2X issued read reqs, then mem "
-               "atom size is 64B, etc."
-            << std::endl
-            << std::endl;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_bw/Makefile b/util/tuner/GPU_Microbenchmark/ubench/mem/mem_bw/Makefile
deleted file mode 100644
index 9409ddff2..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_bw/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-SRC = mem_bw.cu
-
-EXE = mem_bw
-
-NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu b/util/tuner/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu
deleted file mode 100644
index 62da9f86b..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu
+++ /dev/null
@@ -1,148 +0,0 @@
-// This benchmark measures the maximum read bandwidth of GPU memory
-// Compile this file using the following command to disable L1 cache:
-//    nvcc -Xptxas -dlcm=cg -Xptxas -dscm=wt mem_bw.cu
-
-// This code have been tested on Volta V100 architecture
-// You can check the mem BW from the nvprof and nvsight
-// (dram_read_throughput+dram_write_throughput)
-
-// to run the program with nvsight
-// make nvsight ./mem_bw
-
-#include <cuda.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-/*
-Send as many as float4 read requests on the flight to increase DRAM row buffer
-locality and hit the max BW
-*/
-
-__global__ void mem_bw(float *A, float *B, float *C, float *D, float *E,
-                       float *F, uint32_t *startClk, uint32_t *stopClk,
-                       unsigned ARRAY_SIZE) {
-  // block and thread index
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  uint32_t start = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-
-  for (int i = idx; i < ARRAY_SIZE / 4; i += blockDim.x * gridDim.x) {
-    float4 a1 = reinterpret_cast<float4 *>(A)[i];
-    float4 b1 = reinterpret_cast<float4 *>(B)[i];
-    float4 d1 = reinterpret_cast<float4 *>(D)[i];
-    float4 e1 = reinterpret_cast<float4 *>(E)[i];
-    float4 f1 = reinterpret_cast<float4 *>(F)[i];
-    float4 c1;
-
-    c1.x = a1.x + b1.x + d1.x + e1.x + f1.x;
-    c1.y = a1.y + b1.y + d1.y + e1.y + f1.y;
-    c1.z = a1.z + b1.z + d1.z + e1.z + f1.z;
-    c1.w = a1.w + b1.w + d1.w + e1.w + f1.w;
-
-    reinterpret_cast<float4 *>(C)[i] = c1;
-  }
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  uint32_t stop = 0;
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-  // write time and data back to memory
-  startClk[idx] = start;
-  stopClk[idx] = stop;
-}
-
-int main() {
-  intilizeDeviceProp(0);
-
-  // Array size has to exceed L2 size to avoid L2 cache residence
-  unsigned ARRAY_SIZE = (L2_SIZE / sizeof(float)) * 2;
-
-  uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  float *A = (float *)malloc(ARRAY_SIZE * sizeof(float));
-  float *B = (float *)malloc(ARRAY_SIZE * sizeof(float));
-  float *C = (float *)malloc(ARRAY_SIZE * sizeof(float));
-  float *D = (float *)malloc(ARRAY_SIZE * sizeof(float));
-  float *E = (float *)malloc(ARRAY_SIZE * sizeof(float));
-  float *F = (float *)malloc(ARRAY_SIZE * sizeof(float));
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  float *A_g;
-  float *B_g;
-  float *C_g;
-  float *D_g;
-  float *E_g;
-  float *F_g;
-
-  for (uint32_t i = 0; i < ARRAY_SIZE; i++) {
-    A[i] = (float)i;
-    B[i] = (float)i;
-    D[i] = (float)i;
-    E[i] = (float)i;
-    F[i] = (float)i;
-  }
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&A_g, ARRAY_SIZE * sizeof(float)));
-  gpuErrchk(cudaMalloc(&B_g, ARRAY_SIZE * sizeof(float)));
-  gpuErrchk(cudaMalloc(&C_g, ARRAY_SIZE * sizeof(float)));
-  gpuErrchk(cudaMalloc(&D_g, ARRAY_SIZE * sizeof(float)));
-  gpuErrchk(cudaMalloc(&E_g, ARRAY_SIZE * sizeof(float)));
-  gpuErrchk(cudaMalloc(&F_g, ARRAY_SIZE * sizeof(float)));
-
-  gpuErrchk(
-      cudaMemcpy(A_g, A, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice));
-  gpuErrchk(
-      cudaMemcpy(B_g, B, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice));
-  gpuErrchk(
-      cudaMemcpy(D_g, D, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice));
-  gpuErrchk(
-      cudaMemcpy(E_g, E, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice));
-  gpuErrchk(
-      cudaMemcpy(F_g, F, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice));
-
-  cudaEvent_t start, stop;
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
-  cudaEventRecord(start);
-
-  mem_bw<<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(A_g, B_g, C_g, D_g, E_g, F_g,
-                                            startClk_g, stopClk_g, ARRAY_SIZE);
-  cudaEventRecord(stop);
-  cudaEventSynchronize(stop);
-
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(
-      cudaMemcpy(C, C_g, ARRAY_SIZE * sizeof(float), cudaMemcpyDeviceToHost));
-
-  float mem_bw;
-  float milliseconds = 0;
-  cudaEventElapsedTime(&milliseconds, start, stop);
-
-  unsigned N = ARRAY_SIZE * 6 * sizeof(float); // 6 arrays of floats types
-  float max_bw = (float)MEM_BITWIDTH * MEM_CLK_FREQUENCY * 2 / 1e3 / 8;
-  mem_bw = (float)(N) / ((float)(stopClk[0] - startClk[0]));
-  printf("Mem BW= %f (Byte/Clk)\n", mem_bw);
-  printf("Mem BW= %f (GB/sec)\n", (float)N / milliseconds / 1e6);
-  printf("Max Theortical Mem BW= %f (GB/sec)\n", max_bw);
-  printf("Mem Efficiency = %f %%\n", (mem_bw / max_bw) * 100);
-
-  printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_config/Makefile b/util/tuner/GPU_Microbenchmark/ubench/mem/mem_config/Makefile
deleted file mode 100644
index 35aa3045f..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_config/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = mem_config.cu
-
-EXE = mem_config
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu b/util/tuner/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu
deleted file mode 100644
index e74931098..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu
+++ /dev/null
@@ -1,79 +0,0 @@
-#include <iostream>
-using namespace std;
-
-#include "../../../hw_def/hw_def.h"
-
-int main() {
-  intilizeDeviceProp(0);
-
-  char msg[256];
-  snprintf(msg, sizeof(msg), "Global memory size = %.0f GB\n",
-           static_cast<float>(deviceProp.totalGlobalMem / 1073741824.0f));
-  std::cout << msg;
-  std::cout << "Memory Clock rate = " << deviceProp.memoryClockRate * 1e-3f
-            << " Mhz\n";
-  std::cout << "Memory Bus Width = " << deviceProp.memoryBusWidth << " bit\n";
-  std::cout << "Memory type = " << dram_model_str[DRAM_MODEL] << "\n";
-  std::cout << "Memory channels = "
-            << get_num_channels(deviceProp.memoryBusWidth, DRAM_MODEL) << "\n";
-
-  if (ACCEL_SIM_MODE) {
-
-    std::cout << "\n//Accel_Sim config: \n";
-
-    std::cout << "-gpgpu_n_mem "
-              << get_num_channels(deviceProp.memoryBusWidth, DRAM_MODEL)
-              << std::endl;
-
-    std::cout << "-gpgpu_n_mem_per_ctrlr "
-              << dram_model_mem_per_ctrlr[DRAM_MODEL] << std::endl;
-    std::cout << "-gpgpu_dram_buswidth " << dram_model_bus_width[DRAM_MODEL] / 8
-              << std::endl;
-    std::cout << "-gpgpu_dram_burst_length "
-              << dram_model_burst_length[DRAM_MODEL] << std::endl;
-    std::cout << "-dram_data_command_freq_ratio "
-              << dram_model_freq_ratio[DRAM_MODEL] << std::endl;
-
-    // timing
-    float device_freq_MHZ = (deviceProp.memoryClockRate * 1e-3f * 2) /
-                            dram_model_freq_ratio[DRAM_MODEL];
-    if (DRAM_MODEL == dram_model::HBM) {
-      // use HBM timing
-      DDR_Timing timing = HBM_Timing_1000MHZ;
-      timing.scale_timing_for_new_freq(device_freq_MHZ);
-      std::cout << "-dram_dual_bus_interface 1" << std::endl;
-      std::cout << "-gpgpu_dram_timing_opt nbk=" << timing.nbk
-                << ":CCD=" << get_adjusted_CCD(DRAM_MODEL)
-                << ":RRD=" << timing.RRD << ":RCD=" << timing.RCD
-                << ":RAS=" << timing.RAS << ":RP=" << timing.RP
-                << ":RC=" << timing.RC << ":CL=" << timing.CL
-                << ":WL=" << timing.WL << ":CDLR=" << timing.CDLR
-                << ":WR=" << timing.WR << ":nbkgrp=" << timing.nbkgrp
-                << ":CCDL=" << timing.CCDL << ":RTPL=" << timing.RTPL
-                << std::endl;
-    } else {
-      // use GDDR timing
-      DDR_Timing timing = GDDR5_Timing_1800MHZ;
-      timing.scale_timing_for_new_freq(device_freq_MHZ);
-      std::cout << "-dram_dual_bus_interface 0" << std::endl;
-      std::cout << "-gpgpu_dram_timing_opt nbk=" << timing.nbk
-                << ":CCD=" << get_adjusted_CCD(DRAM_MODEL)
-                << ":RRD=" << timing.RRD << ":RCD=" << timing.RCD
-                << ":RAS=" << timing.RAS << ":RP=" << timing.RP
-                << ":RC=" << timing.RC << ":CL=" << timing.CL
-                << ":WL=" << timing.WL << ":CDLR=" << timing.CDLR
-                << ":WR=" << timing.WR << ":nbkgrp=" << timing.nbkgrp
-                << ":CCDL=" << timing.CCDL << ":RTPL=" << timing.RTPL
-                << std::endl;
-    }
-
-    // leave the adddress mapping for now as it is
-    // the number of banks in HBM and GDDR are 16 and atom size is 32B, so the
-    // mapping should be okay. TODO: make this to be varibale based on memory
-    // model and size std::cout<<"-gpgpu_mem_address_mask 1"<<std::endl;
-    // std::cout<<"-gpgpu_mem_addr_mapping
-    // dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS"<<std::endl;
-  }
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_lat/Makefile b/util/tuner/GPU_Microbenchmark/ubench/mem/mem_lat/Makefile
deleted file mode 100644
index 18b1cef6a..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_lat/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-
-SRC = mem_lat.cu
-
-EXE = mem_lat
-
-NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_lat/mem_lat.cu b/util/tuner/GPU_Microbenchmark/ubench/mem/mem_lat/mem_lat.cu
deleted file mode 100644
index b5351eeb9..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_lat/mem_lat.cu
+++ /dev/null
@@ -1,19 +0,0 @@
-#include "../../l2_cache/l2_lat/l2_lat.h"
-#include "mem_lat.h"
-#include <iostream>
-
-int main() {
-
-  intilizeDeviceProp(0);
-
-  float lat_mem = mem_lat();
-
-  if (ACCEL_SIM_MODE) {
-    float lat2 = l2_hit_lat();
-
-    std::cout << "\n//Accel_Sim config: \n";
-    std::cout << "-dram_latency " << (unsigned)(lat_mem - lat2) << std::endl;
-  }
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_lat/mem_lat.h b/util/tuner/GPU_Microbenchmark/ubench/mem/mem_lat/mem_lat.h
deleted file mode 100644
index ad7119abd..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_lat/mem_lat.h
+++ /dev/null
@@ -1,118 +0,0 @@
-// This code is a modification of L1 cache benchmark from
-//"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking":
-// https://arxiv.org/pdf/1804.06826.pdf
-
-// This benchmark measures the latency of GPU memory
-
-// This code have been tested on Volta V100 architecture
-
-#include <cuda.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define THREADS_NUM                                                            \
-  4 // HERE, we launch four threads, to ensure that one request is equal to DRAM
-    // trascation, 4 thread * 8 bytes = 32 bytes (= min DRAM trascation)
-#define ITERS 32768 // 1MB of pointer chasing, ITERS*THREADS_NUM*8 bytes
-
-__global__ void mem_lat(uint32_t *startClk, uint32_t *stopClk,
-                        uint64_t *posArray, uint64_t *dsink,
-                        unsigned MEM_ARRAY_SIZE) {
-  // thread index
-  uint32_t tid = threadIdx.x;
-  uint32_t uid = blockIdx.x * blockDim.x + tid;
-
-  // initialize pointer-chasing array
-  for (uint32_t i = uid; i < (MEM_ARRAY_SIZE - THREADS_NUM);
-       i += blockDim.x * gridDim.x)
-    posArray[i] = (uint64_t)(posArray + i + THREADS_NUM);
-
-  if (uid < THREADS_NUM) { // only THREADS_NUM has to be active here
-
-    // initialize the tail to reference to the head of the array
-    posArray[MEM_ARRAY_SIZE - (THREADS_NUM - tid)] = (uint64_t)posArray + tid;
-
-    uint64_t *ptr = posArray + tid;
-    uint64_t ptr1, ptr0;
-
-    // initialize the pointers with the start address
-    // Here, we use cache volatile modifier to ignore the L2 cache
-    asm volatile("{\t\n"
-                 "ld.global.cv.u64 %0, [%1];\n\t"
-                 "}"
-                 : "=l"(ptr1)
-                 : "l"(ptr)
-                 : "memory");
-
-    // synchronize all threads
-    asm volatile("bar.sync 0;");
-
-    uint32_t start = 0;
-    uint32_t stop = 0;
-
-    // start timing
-    asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-
-    // pointer-chasing ITERS times
-    // Here, we use cache volatile modifier to ignore the L2 cache
-    for (uint32_t i = tid; i < ITERS - THREADS_NUM; i += THREADS_NUM) {
-      asm volatile("{\t\n"
-                   "ld.global.cv.u64 %0, [%1];\n\t"
-                   "}"
-                   : "=l"(ptr0)
-                   : "l"((uint64_t *)ptr1)
-                   : "memory");
-      ptr1 = ptr0; // swap the register for the next load
-    }
-
-    // stop timing
-    asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-    // write time and data back to memory
-    startClk[tid] = start;
-    stopClk[tid] = stop;
-    dsink[tid] = ptr1;
-  }
-}
-
-float mem_lat() {
-  intilizeDeviceProp(0);
-
-  unsigned MEM_ARRAY_SIZE =
-      (L2_SIZE / sizeof(uint64_t)) *
-      2; // pointer-chasing array size in 64-bit. total array size is 7 MB which
-         // larger than L2 cache size (6 MB in Volta) to avoid l2 cache resident
-         // from the copy engine
-
-  uint32_t *startClk = (uint32_t *)malloc(THREADS_NUM * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(THREADS_NUM * sizeof(uint32_t));
-  uint64_t *dsink = (uint64_t *)malloc(THREADS_NUM * sizeof(uint64_t));
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  uint64_t *posArray_g;
-  uint64_t *dsink_g;
-
-  gpuErrchk(cudaMalloc(&startClk_g, THREADS_NUM * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, THREADS_NUM * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&posArray_g, MEM_ARRAY_SIZE * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&dsink_g, THREADS_NUM * sizeof(uint64_t)));
-
-  mem_lat<<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, posArray_g,
-                                             dsink_g, MEM_ARRAY_SIZE);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, THREADS_NUM * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, THREADS_NUM * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(dsink, dsink_g, THREADS_NUM * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  float lat = (float)(stopClk[0] - startClk[0]) / (float)(ITERS / THREADS_NUM);
-  printf("Mem latency = %12.4f cycles \n", lat);
-  printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);
-
-  return lat;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw/Makefile b/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw/Makefile
deleted file mode 100644
index 33331b9ca..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-
-SRC = shared_bw.cu
-
-EXE = shared_bw
-
-NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw/shared_bw.cu b/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw/shared_bw.cu
deleted file mode 100644
index 203bf3aa8..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw/shared_bw.cu
+++ /dev/null
@@ -1,102 +0,0 @@
-#include <algorithm>
-#include <cuda.h>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define SHARED_MEM_SIZE (32 * 1024 / 4) // 32 KB
-#define ITERS 4096
-
-__global__ void shared_bw(uint64_t *startClk, uint64_t *stopClk,
-                          uint32_t *dsink, uint32_t stride) {
-
-  // thread index
-  uint32_t tid = threadIdx.x;
-  uint32_t bid = blockIdx.x;
-  uint32_t uid = bid * blockDim.x + tid;
-  uint32_t n_threads = blockDim.x * gridDim.x;
-
-  // a register to avoid compiler optimization
-  // uint32_t sink0 = 0;
-  register uint32_t tmp = uid;
-
-  uint64_t start = 0;
-  uint64_t stop = 0;
-
-  __shared__ uint32_t s[SHARED_MEM_SIZE]; // static shared memory
-  // uint32_t s[SHARED_MEM_SIZE];
-  // one thread to initialize the pointer-chasing array
-  for (uint32_t i = uid; i < (SHARED_MEM_SIZE); i += n_threads)
-    s[i] = (i + stride) % SHARED_MEM_SIZE;
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory");
-
-  // load data from shared memory
-  for (uint32_t i = 0; i < ITERS; ++i) {
-    tmp = s[tmp];
-  }
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory");
-
-  // sink0 = tmp;
-  // write time and data back to memory
-  startClk[uid] = start;
-  stopClk[uid] = stop;
-  dsink[uid] = tmp;
-}
-
-int main() {
-  intilizeDeviceProp(0);
-
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM;
-  THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM;
-
-  assert(SHARED_MEM_SIZE * sizeof(uint32_t) < MAX_SHARED_MEM_SIZE_PER_BLOCK);
-
-  uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-  uint32_t *dsink = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-
-  uint64_t *startClk_g;
-  uint64_t *stopClk_g;
-  uint32_t *dsink_g;
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(uint32_t)));
-
-  shared_bw<<<1, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, dsink_g,
-                                      THREADS_PER_BLOCK);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-
-  double bw, BW;
-  uint64_t total_time =
-      *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) -
-      *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]);
-  bw =
-      (double)(ITERS * TOTAL_THREADS * sizeof(uint32_t)) / ((double)total_time);
-  BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024;
-  std::cout << "Shared Memory Bandwidth = " << bw << "(byte/clk/SM), " << BW
-            << "(GB/s/SM)\n";
-  std::cout << "Total Clk number = " << total_time << "\n";
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw_64/Makefile b/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw_64/Makefile
deleted file mode 100644
index af9272e42..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw_64/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-
-SRC = shared_bw_64.cu
-
-EXE = shared_bw_64
-
-NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw_64/shared_bw_64.cu b/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw_64/shared_bw_64.cu
deleted file mode 100644
index 5d7dae05b..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw_64/shared_bw_64.cu
+++ /dev/null
@@ -1,102 +0,0 @@
-#include <algorithm>
-#include <cuda.h>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define SHARED_MEM_SIZE (32 * 1024 / 8) // 32KB
-#define ITERS (4096)
-
-__global__ void shared_bw(uint32_t *startClk, uint32_t *stopClk,
-                          uint64_t *dsink, uint32_t stride) {
-
-  // thread index
-  uint32_t tid = threadIdx.x;
-  uint32_t bid = blockIdx.x;
-  uint32_t uid = bid * blockDim.x + tid;
-  uint32_t n_threads = blockDim.x * gridDim.x;
-
-  // a register to avoid compiler optimization
-  // uint32_t sink0 = 0;
-  register uint64_t tmp = uid;
-
-  uint32_t start = 0;
-  uint32_t stop = 0;
-
-  __shared__ uint64_t s[SHARED_MEM_SIZE]; // static shared memory
-  // uint32_t s[SHARED_MEM_SIZE];
-  // one thread to initialize the pointer-chasing array
-  for (uint64_t i = uid; i < (SHARED_MEM_SIZE); i += n_threads)
-    s[i] = (i + stride) % SHARED_MEM_SIZE;
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // start timing
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-
-  // load data from shared memory
-  for (uint32_t i = 0; i < ITERS; ++i) {
-    tmp = s[tmp];
-  }
-
-  // synchronize all threads
-  asm volatile("bar.sync 0;");
-
-  // stop timing
-  asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-  // sink0 = tmp;
-  // write time and data back to memory
-  startClk[uid] = start;
-  stopClk[uid] = stop;
-  dsink[uid] = tmp;
-}
-
-int main() {
-  intilizeDeviceProp(0);
-
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM;
-  THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM;
-
-  assert(SHARED_MEM_SIZE * sizeof(uint64_t) < MAX_SHARED_MEM_SIZE_PER_BLOCK);
-
-  uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t));
-  uint64_t *dsink = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t));
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  uint64_t *dsink_g;
-
-  gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(uint64_t)));
-
-  shared_bw<<<BLOCKS_NUM, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, dsink_g,
-                                               THREADS_PER_BLOCK);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-
-  double bw, BW;
-  uint64_t total_time =
-      *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) -
-      *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]);
-  bw =
-      (double)(ITERS * TOTAL_THREADS * sizeof(uint64_t)) / ((double)total_time);
-  BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024;
-  std::cout << "Shared Memory Bandwidth = " << bw << "(byte/clk/SM), " << BW
-            << "(GB/s/SM)\n";
-  std::cout << "Total Clk number = " << total_time << "\n";
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_lat/Makefile b/util/tuner/GPU_Microbenchmark/ubench/shd/shared_lat/Makefile
deleted file mode 100644
index 8e85df98b..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_lat/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-SRC = shared_lat.cu
-
-EXE = shared_lat
-
-NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_lat/shared_lat.cu b/util/tuner/GPU_Microbenchmark/ubench/shd/shared_lat/shared_lat.cu
deleted file mode 100644
index 199392780..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_lat/shared_lat.cu
+++ /dev/null
@@ -1,96 +0,0 @@
-#include <cuda.h>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define SHARED_MEM_SIZE (32 * 1024 / 8)
-// Launch only one thread to calcaulte the latency using a pointer-chasing
-// array technique
-#define THREADS_NUM 1
-// iterate over the array ITERS times
-#define ITERS 2048
-
-// Measure latency of ITERS reads.
-__global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk,
-                           uint64_t *dsink, uint32_t stride) {
-
-  // thread index
-  uint32_t tid = threadIdx.x;
-  uint32_t bid = blockIdx.x;
-  uint32_t uid = bid * blockDim.x + tid;
-  uint32_t n_threads = blockDim.x * gridDim.x;
-
-  __shared__ uint64_t s[SHARED_MEM_SIZE]; // static shared memory
-
-  // one thread to initialize the pointer-chasing array
-  for (uint32_t i = uid; i < (SHARED_MEM_SIZE - stride); i += n_threads)
-    s[i] = (i + stride) % SHARED_MEM_SIZE;
-
-  if (uid == 0) {
-    // initalize pointer chaser
-    uint64_t p_chaser = 0;
-
-    // start timing
-    uint32_t start = 0;
-    asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");
-
-    // pointer-chasing ITERS times
-    for (uint32_t i = 0; i < ITERS; ++i) {
-      p_chaser = s[p_chaser];
-    }
-
-    // stop timing
-    uint32_t stop = 0;
-    asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory");
-
-    // write time and data back to memory
-    startClk[uid] = start;
-    stopClk[uid] = stop;
-    dsink[uid] = p_chaser;
-  }
-}
-
-int main() {
-  intilizeDeviceProp(0);
-
-  BLOCKS_NUM = 1;
-  TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM;
-  THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM;
-
-  assert(SHARED_MEM_SIZE * sizeof(uint64_t) < MAX_SHARED_MEM_SIZE_PER_BLOCK);
-
-  uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t));
-  uint64_t *dsink = (uint64_t *)malloc(sizeof(uint64_t));
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  uint64_t *dsink_g;
-
-  gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&dsink_g, sizeof(uint64_t)));
-
-  shared_lat<<<1, THREADS_NUM>>>(startClk_g, stopClk_g, dsink_g, 1);
-  gpuErrchk(cudaPeekAtLastError());
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(
-      cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost));
-  gpuErrchk(
-      cudaMemcpy(dsink, dsink_g, sizeof(uint64_t), cudaMemcpyDeviceToHost));
-
-  float lat = (float)(stopClk[0] - startClk[0]) / ITERS;
-  printf("Shared Memory Latency  = %f cycles\n", lat);
-  printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);
-
-  if (ACCEL_SIM_MODE) {
-    std::cout << "\n//Accel_Sim config: \n";
-    std::cout << "-gpgpu_smem_latency " << (unsigned)(lat) << std::endl;
-  }
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/shd/shd_config/Makefile b/util/tuner/GPU_Microbenchmark/ubench/shd/shd_config/Makefile
deleted file mode 100644
index 82e862792..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/shd/shd_config/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = shd_config.cu
-
-EXE = shd_config
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/shd/shd_config/shd_config.cu b/util/tuner/GPU_Microbenchmark/ubench/shd/shd_config/shd_config.cu
deleted file mode 100644
index 2009e8bac..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/shd/shd_config/shd_config.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-#include <iostream>
-using namespace std;
-
-#include "../../../hw_def/hw_def.h"
-
-int main() {
-  intilizeDeviceProp(0);
-
-  printf("Shared memory per multiprocessor = %lu bytes\n",
-         deviceProp.sharedMemPerMultiprocessor);
-
-  printf("Shared memory per block = %lu bytes\n", deviceProp.sharedMemPerBlock);
-
-  if (ACCEL_SIM_MODE) {
-
-    std::cout << "\n//Accel_Sim config: \n";
-
-    std::cout << "-gpgpu_shmem_size " << deviceProp.sharedMemPerMultiprocessor
-              << std::endl;
-    std::cout << "-gpgpu_shmem_sizeDefault "
-              << deviceProp.sharedMemPerMultiprocessor << std::endl;
-    std::cout << "-gpgpu_shmem_per_block " << deviceProp.sharedMemPerBlock
-              << std::endl;
-  }
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/system/deviceQuery/Makefile b/util/tuner/GPU_Microbenchmark/ubench/system/deviceQuery/Makefile
deleted file mode 100644
index 012ae48b7..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/system/deviceQuery/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-
-SRC = deviceQuery.cpp
-
-EXE = deviceQuery
-
-NVCC_FLGAS =
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/system/deviceQuery/deviceQuery.cpp b/util/tuner/GPU_Microbenchmark/ubench/system/deviceQuery/deviceQuery.cpp
deleted file mode 100644
index 4d8bb318a..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/system/deviceQuery/deviceQuery.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
-Some of the code is adopted from device query benchmark
-from CUDA SDK
-*/
-
-#include <cuda_runtime.h>
-#include <helper_cuda.h>
-
-#include <iostream>
-#include <memory>
-#include <string>
-
-int main(int argc, char **argv) {
-  int deviceCount = 0;
-  cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
-
-  if (error_id != cudaSuccess) {
-    printf("cudaGetDeviceCount returned %d\n-> %s\n",
-           static_cast<int>(error_id), cudaGetErrorString(error_id));
-    printf("Result = FAIL\n");
-    exit(EXIT_FAILURE);
-  }
-
-  // This function call returns 0 if there are no CUDA capable devices.
-  if (deviceCount == 0) {
-    printf("There are no available device(s) that support CUDA\n");
-  }
-
-  int dev, driverVersion = 0, runtimeVersion = 0;
-
-  for (dev = 0; dev < deviceCount; ++dev) {
-    cudaSetDevice(dev);
-    cudaDeviceProp deviceProp;
-    cudaGetDeviceProperties(&deviceProp, dev);
-
-    // device
-    printf("  Device : \"%s\"\n\n", deviceProp.name);
-    printf("  CUDA version number                         : %d.%d\n",
-           deviceProp.major, deviceProp.minor);
-
-    // core
-    printf("  GPU Max Clock rate                             : %.0f MHz \n",
-           deviceProp.clockRate * 1e-3f);
-    printf("  Multiprocessors Count                       : %d\n",
-           deviceProp.multiProcessorCount);
-    printf("  Maximum number of threads per multiprocessor: %d\n",
-           deviceProp.maxThreadsPerMultiProcessor);
-    printf("  CUDA Cores per multiprocessor               : %d \n",
-           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor));
-    printf("  Registers per multiprocessor                : %d\n",
-           deviceProp.regsPerMultiprocessor);
-    printf("  Shared memory per multiprocessor            : %lu bytes\n",
-           deviceProp.sharedMemPerMultiprocessor);
-    printf("  Warp size                                   : %d\n",
-           deviceProp.warpSize);
-
-    // threadblock config
-    printf("  Maximum number of threads per block         : %d\n",
-           deviceProp.maxThreadsPerBlock);
-    printf("  Shared memory per block                     : %lu bytes\n",
-           deviceProp.sharedMemPerBlock);
-    printf("  Registers per block                         : %d\n",
-           deviceProp.regsPerBlock);
-
-    // L1 cache
-    printf("  globalL1CacheSupported                      : %d\n",
-           deviceProp.globalL1CacheSupported);
-    printf("  localL1CacheSupported                       : %d\n",
-           deviceProp.localL1CacheSupported);
-
-    // L2 cache
-    if (deviceProp.l2CacheSize) {
-      printf("  L2 Cache Size                             : %.0f MB\n",
-             static_cast<float>(deviceProp.l2CacheSize / 1048576.0f));
-    }
-
-    // memory
-    char msg[256];
-    snprintf(msg, sizeof(msg),
-             "  Global memory size                        : %.0f GB\n",
-             static_cast<float>(deviceProp.totalGlobalMem / 1073741824.0f));
-    printf("%s", msg);
-    printf("  Memory Clock rate                           : %.0f Mhz\n",
-           deviceProp.memoryClockRate * 1e-3f);
-    printf("  Memory Bus Width                            : %d bit\n",
-           deviceProp.memoryBusWidth);
-
-    printf(" ////////////////////////// \n");
-  }
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/system/kernel_lat/Makefile b/util/tuner/GPU_Microbenchmark/ubench/system/kernel_lat/Makefile
deleted file mode 100644
index 204366319..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/system/kernel_lat/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-
-SRC = kernel_lat.cu
-
-EXE = kernel_lat
-
-NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/system/kernel_lat/kernel_lat.cu b/util/tuner/GPU_Microbenchmark/ubench/system/kernel_lat/kernel_lat.cu
deleted file mode 100644
index c538860e6..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/system/kernel_lat/kernel_lat.cu
+++ /dev/null
@@ -1,142 +0,0 @@
-
-// This benchmark measures the kernel overhead as linear function a + Xb where X
-// is the number of launched TBs, a is kernel launch latency and b is TB launch
-// latency
-
-#include <cuda.h>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "../../../hw_def/hw_def.h"
-
-#define THREADS_NUM 1024
-#define ARRAY_SIZE 4096
-
-__global__ void kernel_lat_1TB(uint32_t *startClk, uint32_t *stopClk,
-                               uint64_t *posArray, uint64_t *dsink) {}
-
-__global__ void kernel_lat_2TB(uint32_t *startClk, uint32_t *stopClk,
-                               uint64_t *posArray, uint64_t *dsink) {}
-
-__global__ void kernel_lat_4TB(uint32_t *startClk, uint32_t *stopClk,
-                               uint64_t *posArray, uint64_t *dsink) {}
-
-__global__ void kernel_lat_8TB(uint32_t *startClk, uint32_t *stopClk,
-                               uint64_t *posArray, uint64_t *dsink) {}
-
-__global__ void kernel_lat_16TB(uint32_t *startClk, uint32_t *stopClk,
-                                uint64_t *posArray, uint64_t *dsink) {}
-
-__global__ void kernel_lat_32TB(uint32_t *startClk, uint32_t *stopClk,
-                                uint64_t *posArray, uint64_t *dsink) {}
-
-__global__ void kernel_lat_64TB(uint32_t *startClk, uint32_t *stopClk,
-                                uint64_t *posArray, uint64_t *dsink) {}
-
-__global__ void kernel_lat_128TB(uint32_t *startClk, uint32_t *stopClk,
-                                 uint64_t *posArray, uint64_t *dsink) {}
-
-__global__ void kernel_lat_256TB(uint32_t *startClk, uint32_t *stopClk,
-                                 uint64_t *posArray, uint64_t *dsink) {}
-
-__global__ void kernel_lat_512TB(uint32_t *startClk, uint32_t *stopClk,
-                                 uint64_t *posArray, uint64_t *dsink) {}
-
-__global__ void kernel_lat_1024TB(uint32_t *startClk, uint32_t *stopClk,
-                                  uint64_t *posArray, uint64_t *dsink) {}
-
-__global__ void kernel_lat_2048TB(uint32_t *startClk, uint32_t *stopClk,
-                                  uint64_t *posArray, uint64_t *dsink) {}
-
-int main() {
-  intilizeDeviceProp(0);
-
-  uint32_t *startClk = (uint32_t *)malloc(THREADS_NUM * sizeof(uint32_t));
-  uint32_t *stopClk = (uint32_t *)malloc(THREADS_NUM * sizeof(uint32_t));
-  uint64_t *dsink = (uint64_t *)malloc(THREADS_NUM * sizeof(uint64_t));
-
-  uint32_t *startClk_g;
-  uint32_t *stopClk_g;
-  uint64_t *posArray_g;
-  uint64_t *dsink_g;
-
-  gpuErrchk(cudaMalloc(&startClk_g, THREADS_NUM * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&stopClk_g, THREADS_NUM * sizeof(uint32_t)));
-  gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(uint64_t)));
-  gpuErrchk(cudaMalloc(&dsink_g, THREADS_NUM * sizeof(uint64_t)));
-
-  cudaEvent_t start, stop;
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
-  cudaEventRecord(start);
-
-  kernel_lat_1TB<<<1, THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g,
-                                     dsink_g);
-
-  gpuErrchk(cudaPeekAtLastError());
-  cudaEventRecord(stop);
-  cudaEventSynchronize(stop);
-
-  /*
-  kernel_lat_2TB<<<2,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g, dsink_g);
-  gpuErrchk( cudaPeekAtLastError() );
-
-  kernel_lat_4TB<<<4,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g, dsink_g);
-  gpuErrchk( cudaPeekAtLastError() );
-
-  kernel_lat_8TB<<<8,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g, dsink_g);
-  gpuErrchk( cudaPeekAtLastError() );
-
-  kernel_lat_16TB<<<16,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g,
-  dsink_g); gpuErrchk( cudaPeekAtLastError() );
-
-  kernel_lat_32TB<<<32,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g,
-  dsink_g); gpuErrchk( cudaPeekAtLastError() );
-
-
-  kernel_lat_64TB<<<64,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g,
-  dsink_g); gpuErrchk( cudaPeekAtLastError() );
-
-  kernel_lat_128TB<<<128,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g,
-  dsink_g); gpuErrchk( cudaPeekAtLastError() );
-
-
-  kernel_lat_256TB<<<256,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g,
-  dsink_g); gpuErrchk( cudaPeekAtLastError() );
-
-  kernel_lat_512TB<<<1024,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g,
-  dsink_g); gpuErrchk( cudaPeekAtLastError() );
-
-  kernel_lat_1024TB<<<1024,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g,
-  dsink_g); gpuErrchk( cudaPeekAtLastError() );
-
-  kernel_lat_2048TB<<<2048,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g,
-  dsink_g); gpuErrchk( cudaPeekAtLastError() );
-
-*/
-
-  gpuErrchk(cudaMemcpy(startClk, startClk_g, THREADS_NUM * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(stopClk, stopClk_g, THREADS_NUM * sizeof(uint32_t),
-                       cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(dsink, dsink_g, THREADS_NUM * sizeof(uint64_t),
-                       cudaMemcpyDeviceToHost));
-
-  float milliseconds = 0;
-  cudaEventElapsedTime(&milliseconds, start, stop);
-
-  float lat = (milliseconds * 1000 * CLK_FREQUENCY) / 3;
-  std::cout << "Kernel Launch Latency = " << lat << " cycles\n";
-  std::cout << "The reported latency above can be slightly higher than real. "
-               "For accurate evaultion using nvprof event, exmaple: make "
-               "events ./kernel_lat\n";
-
-  if (ACCEL_SIM_MODE) {
-    std::cout << "\n//Accel_Sim config: \n";
-    std::cout << "-gpgpu_kernel_launch_latency  " << (unsigned)(lat)
-              << std::endl;
-  }
-
-  return 1;
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/system/list_devices/Makefile b/util/tuner/GPU_Microbenchmark/ubench/system/list_devices/Makefile
deleted file mode 100644
index a0e8ce8ff..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/system/list_devices/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-
-SRC = list_devices.cpp
-
-EXE = list_devices
-
-NVCC_FLGAS =
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/system/list_devices/list_devices.cpp b/util/tuner/GPU_Microbenchmark/ubench/system/list_devices/list_devices.cpp
deleted file mode 100644
index be29463e4..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/system/list_devices/list_devices.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
-Some of the code is adopted from device query benchmark
-from CUDA SDK
-*/
-
-// std::system includes
-
-#include <cuda_runtime.h>
-
-#include <iostream>
-#include <memory>
-#include <string>
-
-////////////////////////////////////////////////////////////////////////////////
-// Program main
-////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
-
-  int deviceCount = 0;
-  cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
-
-  if (error_id != cudaSuccess) {
-    printf("cudaGetDeviceCount returned %d\n-> %s\n",
-           static_cast<int>(error_id), cudaGetErrorString(error_id));
-    printf("Result = FAIL\n");
-    exit(EXIT_FAILURE);
-  }
-
-  // This function call returns 0 if there are no CUDA capable devices.
-  if (deviceCount == 0) {
-    printf("There are no available device(s) that support CUDA\n");
-  }
-
-  for (int dev = 0; dev < deviceCount; ++dev) {
-    cudaSetDevice(dev);
-    cudaDeviceProp deviceProp;
-    cudaGetDeviceProperties(&deviceProp, dev);
-
-    printf("\nDevice %d: \"%s sm_%d.%d\"\n", dev, deviceProp.name,
-           deviceProp.major, deviceProp.minor);
-  }
-}
diff --git a/util/tuner/GPU_Microbenchmark/ubench/system/system_config/Makefile b/util/tuner/GPU_Microbenchmark/ubench/system/system_config/Makefile
deleted file mode 100644
index e3a5c7a7f..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/system/system_config/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-SRC = system_config.cu
-
-EXE = system_config
-
-include ../../../common/common.mk
diff --git a/util/tuner/GPU_Microbenchmark/ubench/system/system_config/system_config.cu b/util/tuner/GPU_Microbenchmark/ubench/system/system_config/system_config.cu
deleted file mode 100644
index 152133edc..000000000
--- a/util/tuner/GPU_Microbenchmark/ubench/system/system_config/system_config.cu
+++ /dev/null
@@ -1,34 +0,0 @@
-#include <iostream>
-using namespace std;
-
-#include "../../../hw_def/hw_def.h"
-
-int main() {
-  intilizeDeviceProp(0);
-
-  printf("Device Name = %s\n", deviceProp.name);
-  printf("GPU Max Clock rate = %.0f MHz \n", deviceProp.clockRate * 1e-3f);
-  printf("GPU Base Clock rate = %d MHz \n", CLK_FREQUENCY);
-  printf("SM Count = %d\n", deviceProp.multiProcessorCount);
-  printf("CUDA version number = %d.%d\n", deviceProp.major, deviceProp.minor);
-
-  if (ACCEL_SIM_MODE) {
-
-    std::cout << "\n//Accel_Sim config: \n";
-
-    float mem_freq_MHZ = (deviceProp.memoryClockRate * 1e-3f * 2) /
-                         dram_model_freq_ratio[DRAM_MODEL];
-    std::cout << "-gpgpu_compute_capability_major " << deviceProp.major
-              << std::endl;
-    std::cout << "-gpgpu_compute_capability_minor " << deviceProp.minor
-              << std::endl;
-    std::cout << "-gpgpu_n_clusters " << deviceProp.multiProcessorCount
-              << std::endl;
-    std::cout << "-gpgpu_n_cores_per_cluster 1" << std::endl;
-    std::cout << "-gpgpu_clock_domains " << CLK_FREQUENCY << ":"
-              << CLK_FREQUENCY << ":" << CLK_FREQUENCY << ":" << mem_freq_MHZ
-              << std::endl;
-  }
-
-  return 1;
-}
diff --git a/util/tuner/README.md b/util/tuner/README.md
index 3f0ea34a6..a20da2474 100644
--- a/util/tuner/README.md
+++ b/util/tuner/README.md
@@ -17,21 +17,26 @@ cache hashing function), we do an extensive searching by simulating each possibl
 # Tuning Steps:
 The following steps demonstrate how to tune the Accel-Sim config files to a specific GPU hardware. We assume that you already have the GPU hardware in question.
 
+
+0. **Get the microbenchmarks**:
+  ```bash
+    ./get_ubench.sh
+  ```
 1. **Provide HW def file and run microbenchmarks**:
-You need to provide a C header file `hw_def` that contains minimal information about the hardware model. This file is used to configure and tune the microbenchmarks for the unduerline hardware. See an example of Ampere RTX 3060 card [here](https://github.com/accel-sim/accel-sim-framework/blob/dev/util/tuner/GPU_Microbenchmark/hw_def/ampere_RTX3070_hw_def.h). These information can be gathered from Nvidia whitepaper and public website.
-After you write the HW file for the underline card, ensure to add it in [/GPU_Microbenchmark/hw_def/hw_def.h](https://github.com/accel-sim/accel-sim-framework/blob/dev/util/tuner/GPU_Microbenchmark/hw_def/hw_def.h).
+You need to provide a C header file `hw_def` that contains minimal information about the hardware model. This file is used to configure and tune the microbenchmarks for the underlying hardware. See an example for the Ampere RTX 3070 card [here](gpu-app-collection-partial/src/cuda/GPU_Microbenchmark/hw_def/ampere_RTX3070_hw_def.h). This information can be gathered from Nvidia whitepapers and public websites.
+After you write the HW file for the underlying card, make sure to include it in [/GPU_Microbenchmark/hw_def/hw_def.h](gpu-app-collection-partial/src/cuda/GPU_Microbenchmark/hw_def/hw_def.h).
 Then, compile microbenchmarks and run:
 
   ```bash
   # Make sure PATH includes nvcc
-  # If your hardware has new compute capability, ensure to add it in the /GPU_Microbenchmark/common/common.mk
+  # If your hardware has a new compute capability, make sure to add it to ./gpu-app-collection-partial/src/cuda/GPU_Microbenchmark/common/common.mk
   # compile microbenchmarks
-  make -C ./GPU_Microbenchmark/
+  make -C ./gpu-app-collection-partial/src/cuda/GPU_Microbenchmark/
   # set the device id that you want to tune to
   # if you do not know the device id, run ./GPU_Microbenchmark/bin/list_devices
   export CUDA_VISIBLE_DEVICES=0
   #run the ubench and save output in stats.txt
-  ./GPU_Microbenchmark/run_all.sh | tee stats.txt
+  ./run_all.sh | tee stats.txt
   ```
 2. **Run the tuner**:
 The tuner.py script will parse the microbenchmarks output and generate a folder of the HW device name (e.g. "TITAN_V"). The folder will contain the config files for GPGPU-Sim performance model and Accel-Sim trace-driven front-end (gpgpusim.config and trace.config files)
diff --git a/util/tuner/get_ubench.sh b/util/tuner/get_ubench.sh
new file mode 100755
index 000000000..584879aef
--- /dev/null
+++ b/util/tuner/get_ubench.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Fetch a sparse, blob-filtered clone of gpu-app-collection into ./$CLONE_DIR (relative to the current directory).
+set -e  # Abort on the first failure so the later git commands never run outside the new clone.
+REPO_URL="https://github.com/accel-sim/gpu-app-collection.git"
+CLONE_DIR="gpu-app-collection-partial"
+BRANCH="dev"
+SPARSE_PATHS=(
+  "src/cuda/GPU_Microbenchmark"
+  "src/cuda/cuda-samples"
+)
+
+# Step 1: Clone without checking out any files; blobs are fetched lazily on demand
+git clone --recurse-submodules -j8 --filter=blob:none --no-checkout -b "$BRANCH" "$REPO_URL" "$CLONE_DIR"
+cd "$CLONE_DIR"
+
+# Step 2: Restrict the working tree to SPARSE_PATHS, then populate it
+git sparse-checkout init --cone
+git sparse-checkout set "${SPARSE_PATHS[@]}"
+git checkout
+
+# Step 3: Manually initialize the submodule (if not already checked out)
+git submodule update --init --recursive -- src/cuda/cuda-samples
diff --git a/util/tuner/run_all.sh b/util/tuner/run_all.sh
new file mode 100755
index 000000000..93dcf16e0
--- /dev/null
+++ b/util/tuner/run_all.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Build the "tuner" make target of the microbenchmarks, then run every binary in bin/.
+# SCRIPT_DIR is relative: run this from the directory that contains gpu-app-collection-partial/.
+SCRIPT_DIR="./gpu-app-collection-partial/src/cuda/GPU_Microbenchmark/"
+echo "Running make in $SCRIPT_DIR"
+make -C "$SCRIPT_DIR" tuner -j || { echo "make failed"; exit 1; }
+
+cd "${SCRIPT_DIR}/bin/" || { echo "cannot enter ${SCRIPT_DIR}/bin/"; exit 1; }
+for f in ./*; do
+    if [[ "$f" == *_corr || ! -f "$f" || ! -x "$f" ]]; then
+        continue  # skip *_corr entries, an unmatched glob, and anything that is not an executable file
+    fi
+
+    echo "running $f microbenchmark"
+    "$f"
+    echo "/////////////////////////////////"
+done
\ No newline at end of file