diff --git a/util/tuner/GPU_Microbenchmark/.gitignore b/util/tuner/GPU_Microbenchmark/.gitignore deleted file mode 100644 index 8200ece07..000000000 --- a/util/tuner/GPU_Microbenchmark/.gitignore +++ /dev/null @@ -1,60 +0,0 @@ -bin/ -*.csv -ubench/atomics/Atomic_add_bw/atomic_add_bw -ubench/atomics/Atomic_add_bw_conflict/atomic_add_bw_conflict -ubench/atomics/Atomic_add_lat/atomic_add_lat -ubench/core/MaxFlops_double/MaxFlops_double -ubench/core/MaxFlops_float/MaxFlops_float -ubench/core/MaxFlops_half/MaxFlops_half -ubench/core/MaxFlops_int32/MaxFlops_int32 -ubench/core/config_dpu/config_dpu -ubench/core/config_fpu/config_fpu -ubench/core/config_int/config_int -ubench/core/config_sfu/config_sfu -ubench/core/config_tensor/config_tensor -ubench/core/config_udp/config_udp -ubench/core/core_config/core_config -ubench/core/lat_double/lat_double -ubench/core/lat_float/lat_float -ubench/core/lat_half/lat_half -ubench/core/lat_int32/lat_int32 -ubench/core/regfile_bw/regfile_bw -ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt -ubench/core/sfu_lat_fsqrt/sfu_lat_fsqrt -ubench/core/tensor_bw_half/tensor_bw_half -ubench/core/tensor_lat_half/tensor_lat_half -ubench/l1_cache/l1_access_grain/l1_access_grain -ubench/l1_cache/l1_adaptive/l1_adaptive -ubench/l1_cache/l1_associativity/l1_associativity -ubench/l1_cache/l1_banks/l1_banks -ubench/l1_cache/l1_bw_128/l1_bw_128 -ubench/l1_cache/l1_bw_32f/l1_bw_32f -ubench/l1_cache/l1_bw_32f_unroll/l1_bw_32f_unroll -ubench/l1_cache/l1_bw_64f/l1_bw_64f -ubench/l1_cache/l1_bw_64v/l1_bw_64v -ubench/l1_cache/l1_config/l1_config -ubench/l1_cache/l1_lat/l1_lat -ubench/l1_cache/l1_mshr/l1_mshr -ubench/l1_cache/l1_sector/l1_sector -ubench/l1_cache/l1_shared_bw/l1_shared_bw -ubench/l1_cache/l1_write_policy/l1_write_policy -ubench/l2_cache/l2_access_grain/l2_access_grain -ubench/l2_cache/l2_bw_128/l2_bw_128 -ubench/l2_cache/l2_bw_32f/l2_bw_32f -ubench/l2_cache/l2_bw_64f/l2_bw_64f -ubench/l2_cache/l2_config/l2_config 
-ubench/l2_cache/l2_copy_engine/l2_copy_engine -ubench/l2_cache/l2_lat/l2_lat -ubench/l2_cache/l2_write_policy/l2_write_policy -ubench/mem/mem_atom_size/mem_atom_size -ubench/mem/mem_bw/mem_bw -ubench/mem/mem_config/mem_config -ubench/mem/mem_lat/mem_lat -ubench/shd/shared_bw/shared_bw -ubench/shd/shared_bw_64/shared_bw_64 -ubench/shd/shared_lat/shared_lat -ubench/shd/shd_config/shd_config -ubench/system/deviceQuery/deviceQuery -ubench/system/kernel_lat/kernel_lat -ubench/system/system_config/system_config -ubench/system/list_devices/list_devices diff --git a/util/tuner/GPU_Microbenchmark/Makefile b/util/tuner/GPU_Microbenchmark/Makefile deleted file mode 100755 index 5f901780d..000000000 --- a/util/tuner/GPU_Microbenchmark/Makefile +++ /dev/null @@ -1,22 +0,0 @@ - -BASE_DIR := $(shell pwd) -BIN_DIR := $(BASE_DIR)/bin -SUB_DIRS = $(wildcard ubench/*/*/) -SUB_DIRS_ALL = $(SUB_DIRS:%=all-%) -SUB_DIRS_CLEAN = $(SUB_DIRS:%=clean-%) - -all: create_dir $(SUB_DIRS_ALL) - -clean: delete_dir $(SUB_DIRS_CLEAN) - -$(SUB_DIRS_ALL): - $(MAKE) $(MAKE_FLAGS) -C $(@:all-%=%) - -$(SUB_DIRS_CLEAN): - $(MAKE) $(MAKE_FLAGS) -C $(@:clean-%=%) clean - -create_dir: - mkdir -p $(BIN_DIR) - -delete_dir: - cd $(BIN_DIR); rm -f * diff --git a/util/tuner/GPU_Microbenchmark/README.md b/util/tuner/GPU_Microbenchmark/README.md deleted file mode 100644 index 52c39a2a3..000000000 --- a/util/tuner/GPU_Microbenchmark/README.md +++ /dev/null @@ -1 +0,0 @@ -# GPU_Microbenchmark diff --git a/util/tuner/GPU_Microbenchmark/common/common.mk b/util/tuner/GPU_Microbenchmark/common/common.mk deleted file mode 100644 index 6c90a3f67..000000000 --- a/util/tuner/GPU_Microbenchmark/common/common.mk +++ /dev/null @@ -1,48 +0,0 @@ -BASE_DIR := $(shell pwd) -BIN_DIR := $(BASE_DIR)/../../../bin/ - -GENCODE_SM30 ?= -gencode=arch=compute_30,code=\"sm_30,compute_30\" -GENCODE_SM35 ?= -gencode=arch=compute_35,code=\"sm_35,compute_35\" -GENCODE_SM50 ?= -gencode=arch=compute_50,code=\"sm_50,compute_50\" -GENCODE_SM60 ?= 
-gencode=arch=compute_60,code=\"sm_60,compute_60\" -GENCODE_SM62 ?= -gencode=arch=compute_62,code=\"sm_62,compute_62\" -GENCODE_SM70 ?= -gencode=arch=compute_70,code=\"sm_70,compute_70\" -GENCODE_SM75 ?= -gencode=arch=compute_75,code=\"sm_75,compute_75\" -GENCODE_SM80 ?= -gencode=arch=compute_80,code=\"sm_80,compute_80\" -GENCODE_SM86 ?= -gencode=arch=compute_86,code=\"sm_86,compute_86\" - -CUOPTS = $(GENCODE_ARCH) $(GENCODE_SM50) $(GENCODE_SM60) $(GENCODE_SM62) $(GENCODE_SM70) $(GENCODE_SM75) $(GENCODE_SM80) - -CC := nvcc - -# CUDA_PATH ?= /use/local/cuda-10.1/ -INCLUDE := $(GPUAPPS_ROOT)/src/cuda/cuda-samples/Common/ -LIB := - -release: - $(CC) $(NVCC_FLGAS) $(CUOPTS) $(SRC) -o $(EXE) -I$(INCLUDE) -L$(LIB) -lcudart - cp $(EXE) $(BIN_DIR) - -clean: - rm -f *.o; rm -f $(EXE) - -run: - ./$(EXE) - -profile: - nvprof ./$(EXE) - -events: - nvprof --events elapsed_cycles_sm ./$(EXE) - -profileall: - nvprof --concurrent-kernels off --print-gpu-trace -u us --metrics all --demangling off --csv --log-file data.csv ./$(EXE) - -nvsight: - nv-nsight-cu-cli --metrics gpc__cycles_elapsed.avg,sm__cycles_elapsed.sum,smsp__inst_executed.sum,sm__warps_active.avg.pct_of_peak_sustained_active,l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum,lts__t_sectors_srcunit_tex_op_read.sum,lts__t_sectors_srcunit_tex_op_write.sum,lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum,lts__t_sectors_srcunit_tex_op_write_lookup_hit.sum,lts__t_sector_op_read_hit_rate.pct,lts__t_sector_op_write_hit_rate.pct,lts__t_sectors_srcunit_tex_op_read.sum.per_second,dram__sectors_read.sum,dram__sectors_write.sum,dram__bytes_read.sum --csv --page raw ./$(EXE) | tee nsight.csv - -ptx: - cuobjdump -ptx ./$(EXE) tee ptx.txt - -sass: - cuobjdump -sass ./$(EXE) tee sass.txt diff --git a/util/tuner/GPU_Microbenchmark/format-code.sh 
b/util/tuner/GPU_Microbenchmark/format-code.sh deleted file mode 100755 index f06cc7629..000000000 --- a/util/tuner/GPU_Microbenchmark/format-code.sh +++ /dev/null @@ -1,8 +0,0 @@ -#! /bin/sh - -THIS_DIR="$( cd "$( dirname "$BASH_SOURCE" )" && pwd )" -clang-format -i ${THIS_DIR}/ubench/*/*/*.cu -clang-format -i ${THIS_DIR}/ubench/*/*/*.h -clang-format -i ${THIS_DIR}/ubench/*/*/*.cpp -clang-format -i ${THIS_DIR}/hw_def/*/*.h -clang-format -i ${THIS_DIR}/hw_def/common/*.h diff --git a/util/tuner/GPU_Microbenchmark/hw_def/ampere_A100_hw_def.h b/util/tuner/GPU_Microbenchmark/hw_def/ampere_A100_hw_def.h deleted file mode 100644 index fc95b6dc2..000000000 --- a/util/tuner/GPU_Microbenchmark/hw_def/ampere_A100_hw_def.h +++ /dev/null @@ -1,33 +0,0 @@ -// These are the configration parameters that can be found publicly -// Sources: -// https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf -// https://en.wikipedia.org/wiki/GeForce_30_series -// https://en.wikipedia.org/wiki/CUDA - -#ifndef AMPERE_RTX3070_DEF_H -#define AMPERE_RTX3070_DEF_H - -#include "./common/common.h" -#include "./common/deviceQuery.h" - -#define L1_SIZE (192 * 1024) // Max L1 size in bytes - -#define CLK_FREQUENCY 1410 // frequency in MHz - -#define ISSUE_MODEL issue_model::single // single issue core or dual issue -#define CORE_MODEL core_model::subcore // subcore model or shared model -#define DRAM_MODEL dram_model::HBM // memory type -#define WARP_SCHEDS_PER_SM 4 // number of warp schedulers per SM - -// number of SASS HMMA per 16x16 PTX WMMA for FP16 - FP32 accumlate operation -// see slide 22 at -// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf -#define SASS_hmma_per_PTX_wmma 2 - -// These vars are almost constant between HW generation -// see slide 24 from Nvidia at -// 
https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf -#define L2_BANKS_PER_MEM_CHANNEL 2 -#define L2_BANK_WIDTH_in_BYTE 32 - -#endif diff --git a/util/tuner/GPU_Microbenchmark/hw_def/ampere_RTX3070_hw_def.h b/util/tuner/GPU_Microbenchmark/hw_def/ampere_RTX3070_hw_def.h deleted file mode 100644 index 133711416..000000000 --- a/util/tuner/GPU_Microbenchmark/hw_def/ampere_RTX3070_hw_def.h +++ /dev/null @@ -1,33 +0,0 @@ -// These are the configration parameters that can be found publicly -// Sources: -// https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf -// https://en.wikipedia.org/wiki/GeForce_30_series -// https://en.wikipedia.org/wiki/CUDA - -#ifndef AMPERE_RTX3070_DEF_H -#define AMPERE_RTX3070_DEF_H - -#include "./common/common.h" -#include "./common/deviceQuery.h" - -#define L1_SIZE (128 * 1024) // Max L1 size in bytes - -#define CLK_FREQUENCY 1132 // frequency in MHz - -#define ISSUE_MODEL issue_model::single // single issue core or dual issue -#define CORE_MODEL core_model::subcore // subcore model or shared model -#define DRAM_MODEL dram_model::GDDR6 // memory type -#define WARP_SCHEDS_PER_SM 4 // number of warp schedulers per SM - -// number of SASS HMMA per 16x16 PTX WMMA for FP16 - FP32 accumlate operation -// see slide 22 at -// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf -#define SASS_hmma_per_PTX_wmma 2 - -// These vars are almost constant between HW generation -// see slide 24 from Nvidia at -// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf -#define L2_BANKS_PER_MEM_CHANNEL 2 -#define L2_BANK_WIDTH_in_BYTE 32 - -#endif diff --git a/util/tuner/GPU_Microbenchmark/hw_def/common/common.h b/util/tuner/GPU_Microbenchmark/hw_def/common/common.h 
deleted file mode 100644 index bd07f5c6c..000000000 --- a/util/tuner/GPU_Microbenchmark/hw_def/common/common.h +++ /dev/null @@ -1,153 +0,0 @@ -#ifndef COMMON_H -#define COMMON_H - -#include -#include -#include -#include - -#define ACCEL_SIM_MODE 1 - -enum issue_model { single = 1, dual = 2 }; - -static const char *issue_model_str[] = {"none", "single", "dual"}; - -enum core_model { shared = 0, subcore = 1 }; - -static const char *core_model_str[] = {"none", "shared", "subcore"}; - -enum dram_model { GDDR5 = 1, GDDR5X = 2, GDDR6 = 3, HBM = 4 }; - -// GPU error check -#define gpuErrchk(ans) \ - { gpuAssert((ans), __FILE__, __LINE__); } -inline void gpuAssert(cudaError_t code, const char *file, int line, - bool abort = true) { - if (code != cudaSuccess) { - fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, - line); - if (abort) - exit(code); - } -} - -// source: -// https://stackoverflow.com/questions/466204/rounding-up-to-next-power-of-2 -unsigned round_up_2n(unsigned v) { - v--; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - v++; - - return v; -} - -unsigned round_up_2n(float n) { return round_up_2n((unsigned)ceil(n)); } - -bool isPowerOfTwo(int n) { - if (n == 0) - return false; - - return (ceil(log2(n)) == floor(log2(n))); -} - -static const char *dram_model_str[] = {"none", "GDDR5", "GDDR5X", "GDDR6", - "HBM"}; -static const unsigned dram_model_bus_width[] = {0, 32, 32, 16, 128}; // in bits -static const unsigned dram_model_mem_per_ctrlr[] = {0, 1, 1, 1, 1}; -static const unsigned dram_model_burst_length[] = {0, 8, 8, 16, 2}; -static const unsigned dram_model_freq_ratio[] = {0, 4, 4, 4, 2}; -// atom size = -// dram_model_channel_width*dram_model_mem_per_ctrlr*dram_model_burst_length -unsigned get_atom_size_inByte(enum dram_model model) { - return (dram_model_bus_width[model] / 8) * dram_model_mem_per_ctrlr[model] * - dram_model_burst_length[model]; -} -// CCD = dram_model_burst_length/dram_model_freq_ratio 
-unsigned get_adjusted_CCD(enum dram_model model) { - assert(dram_model_burst_length[model] % dram_model_freq_ratio[model] == 0); - return dram_model_burst_length[model] / dram_model_freq_ratio[model]; -} - -unsigned get_num_channels(unsigned total_memory_width, enum dram_model model) { - unsigned channel_width = - dram_model_bus_width[model] * dram_model_mem_per_ctrlr[model]; - assert(total_memory_width % channel_width == 0); - return total_memory_width / channel_width; -} - -// DDR timing struct -struct DDR_Timing { - unsigned freq; - unsigned nbk; - unsigned CCD; - unsigned RRD; - unsigned RCD; - unsigned RAS; - unsigned RP; - unsigned RC; - unsigned CL; - unsigned WL; - unsigned CDLR; - unsigned WR; - unsigned nbkgrp; - unsigned CCDL; - unsigned RTPL; - - DDR_Timing(unsigned mfreq, unsigned n_bk, unsigned tCCD, unsigned tRRD, - unsigned tRCD, unsigned tRAS, unsigned tRP, unsigned tRC, - unsigned tCL, unsigned tWL, unsigned tCDLR, unsigned tWR, - unsigned n_bkgrp, unsigned tCCDL, unsigned tRTPL) { - freq = mfreq; - nbk = n_bk; - CCD = tCCD; - RRD = tRRD; - RCD = tRCD; - RAS = tRAS; - RP = tRP; - RC = tRC; - CL = tCL; - WL = tWL; - CDLR = tCDLR; - WR = tWR; - nbkgrp = n_bkgrp; - CCDL = tCCDL; - RTPL = tRTPL; - } - - void scale_timing_for_new_freq(float newfreq) { - float freq_scale = freq / newfreq; - RRD = ceil(RRD / freq_scale); - RCD = ceil(RCD / freq_scale); - RAS = ceil(RAS / freq_scale); - RP = ceil(RP / freq_scale); - RC = ceil(RC / freq_scale); - CL = ceil(CL / freq_scale); - WL = ceil(WL / freq_scale); - CDLR = ceil(CDLR / freq_scale); - WR = ceil(WR / freq_scale); - CCDL = ceil(CCDL / freq_scale); - RTPL = ceil(RTPL / freq_scale); - } -}; - -// GDDR5 timing from hynix H5GQ1H24AFR -//-gpgpu_dram_timing_opt "nbk=16:CCD=2:RRD=6:RCD=12:RAS=28:RP=12:RC=40: -// CL=12:WL=4:CDLR=5:WR=12:nbkgrp=4:CCDL=3:RTPL=2" - -static const DDR_Timing GDDR5_Timing_1800MHZ(1800, 16, 2, 6, 12, 28, 12, 40, 12, - 4, 5, 12, 4, 3, 2); - -// HBM timing are adopted from hynix JESD235 
standered and nVidia HPCA 2017 -// paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) -// Timing for 1 GHZ: -//-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: -// CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" - -static const DDR_Timing HBM_Timing_1000MHZ(1000, 16, 1, 4, 14, 33, 14, 47, 14, - 2, 3, 12, 4, 2, 4); - -#endif diff --git a/util/tuner/GPU_Microbenchmark/hw_def/common/deviceQuery.h b/util/tuner/GPU_Microbenchmark/hw_def/common/deviceQuery.h deleted file mode 100644 index 56070e74f..000000000 --- a/util/tuner/GPU_Microbenchmark/hw_def/common/deviceQuery.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef DEVICE_QUERY_H -#define DEVICE_QUERY_H - -#include - -unsigned SM_NUMBER; // number of SMs -unsigned WARP_SIZE; // max threads per warp -unsigned MAX_THREADS_PER_SM; // max threads / sm -unsigned MAX_SHARED_MEM_SIZE; // Max configerable shared memory size in bytes -unsigned MAX_WARPS_PER_SM; // max warps / sm -unsigned MAX_REG_PER_SM; // max warps / sm - -unsigned MAX_THREAD_BLOCK_SIZE; // max threads per threadblock -unsigned MAX_SHARED_MEM_SIZE_PER_BLOCK; // Max configerable shared memory size - // per block in bytes -unsigned - MAX_REG_PER_BLOCK; // Max configerable shared memory size per block in bytes - -size_t L2_SIZE; // L2 size in bytes - -size_t MEM_SIZE; // Memory size in bytes -unsigned MEM_CLK_FREQUENCY; // Memory clock freq in MHZ -unsigned MEM_BITWIDTH; // Memory bit width - -// launched threadblocks -unsigned THREADS_PER_BLOCK; -unsigned BLOCKS_PER_SM; -unsigned THREADS_PER_SM; -unsigned BLOCKS_NUM; -unsigned TOTAL_THREADS; - -cudaDeviceProp deviceProp; - -unsigned intilizeDeviceProp(unsigned deviceID) { - cudaSetDevice(deviceID); - cudaGetDeviceProperties(&deviceProp, deviceID); - - // core stats - SM_NUMBER = deviceProp.multiProcessorCount; - MAX_THREADS_PER_SM = deviceProp.maxThreadsPerMultiProcessor; - MAX_SHARED_MEM_SIZE = deviceProp.sharedMemPerMultiprocessor; - WARP_SIZE = deviceProp.warpSize; - MAX_WARPS_PER_SM = - 
deviceProp.maxThreadsPerMultiProcessor / deviceProp.warpSize; - MAX_REG_PER_SM = deviceProp.regsPerMultiprocessor; - - // threadblock stats - MAX_THREAD_BLOCK_SIZE = deviceProp.maxThreadsPerBlock; - MAX_SHARED_MEM_SIZE_PER_BLOCK = deviceProp.sharedMemPerBlock; - MAX_REG_PER_BLOCK = deviceProp.regsPerBlock; - - // launched thread blocks to ensure GPU is fully occupied as much as possible - THREADS_PER_BLOCK = deviceProp.maxThreadsPerBlock; - BLOCKS_PER_SM = - deviceProp.maxThreadsPerMultiProcessor / deviceProp.maxThreadsPerBlock; - THREADS_PER_SM = BLOCKS_PER_SM * THREADS_PER_BLOCK; - BLOCKS_NUM = BLOCKS_PER_SM * SM_NUMBER; - TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; - - // L2 cache - L2_SIZE = deviceProp.l2CacheSize; - - // memory - MEM_SIZE = deviceProp.totalGlobalMem; - MEM_CLK_FREQUENCY = deviceProp.memoryClockRate * 1e-3f; - MEM_BITWIDTH = deviceProp.memoryBusWidth; - - return 1; -} - -#endif diff --git a/util/tuner/GPU_Microbenchmark/hw_def/hw_def.h b/util/tuner/GPU_Microbenchmark/hw_def/hw_def.h deleted file mode 100644 index 1fd2087c0..000000000 --- a/util/tuner/GPU_Microbenchmark/hw_def/hw_def.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef HW_DEF_H -#define HW_DEF_H - -//#include "kepler_TITAN_hw_def.h" - -//#include "pascal_TITANX_hw_def.h" - -//#include "volta_QV100_hw_def.h" - -//#include "turing_RTX2060_hw_def.h" - -//#include "ampere_RTX3070_hw_def.h" - -// #include "volta_TITANV_hw_def.h" - -#include "ampere_A100_hw_def.h" - -#endif diff --git a/util/tuner/GPU_Microbenchmark/hw_def/kepler_TITAN_hw_def.h b/util/tuner/GPU_Microbenchmark/hw_def/kepler_TITAN_hw_def.h deleted file mode 100644 index 1f4c6212e..000000000 --- a/util/tuner/GPU_Microbenchmark/hw_def/kepler_TITAN_hw_def.h +++ /dev/null @@ -1,25 +0,0 @@ -// Kepler TITAN HW def file -#ifndef KEPLER_TITAN_DEF_H -#define KEPLER_TITAN_DEF_H - -#include "./common/common.h" -#include "./common/deviceQuery.h" - -#define L1_SIZE (64 * 1024) // Max L1 size in bytes, when enabled - -#define CLK_FREQUENCY 
837 // frequency in MHz - -#define ISSUE_MODEL issue_model::dual -#define CORE_MODEL core_model::shared -#define DRAM_MODEL dram_model::GDDR5 - -#define WARP_SCHEDS_PER_SM 4 - -// no tensor cores in kepler -#define SASS_hmma_per_PTX_wmma 0 - -// These vars are almost constant between HW generations -#define L2_BANKS_PER_MEM_CHANNEL 2 -#define L2_BANK_WIDTH_in_BYTE 32 - -#endif diff --git a/util/tuner/GPU_Microbenchmark/hw_def/pascal_TITANX_hw_def.h b/util/tuner/GPU_Microbenchmark/hw_def/pascal_TITANX_hw_def.h deleted file mode 100644 index b68329a73..000000000 --- a/util/tuner/GPU_Microbenchmark/hw_def/pascal_TITANX_hw_def.h +++ /dev/null @@ -1,24 +0,0 @@ -// Pascal ITIANX HW def file -#ifndef PASCAL_TITANX_DEF_H -#define PASCAL_TITANX_DEF_H - -#include "./common/common.h" -#include "./common/deviceQuery.h" - -#define L1_SIZE (24 * 1024) // Max L1 size in bytes, when enabled - -#define CLK_FREQUENCY 1417 // frequency in MHz - -#define ISSUE_MODEL issue_model::dual -#define CORE_MODEL core_model::subcore -#define DRAM_MODEL dram_model::GDDR5X -#define WARP_SCHEDS_PER_SM 4 - -// no tensor cores in pascal -#define SASS_hmma_per_PTX_wmma 0 - -// These vars are almost constant between HW generations -#define L2_BANKS_PER_MEM_CHANNEL 2 -#define L2_BANK_WIDTH_in_BYTE 32 - -#endif diff --git a/util/tuner/GPU_Microbenchmark/hw_def/turing_RTX2060_hw_def.h b/util/tuner/GPU_Microbenchmark/hw_def/turing_RTX2060_hw_def.h deleted file mode 100644 index e7d728d15..000000000 --- a/util/tuner/GPU_Microbenchmark/hw_def/turing_RTX2060_hw_def.h +++ /dev/null @@ -1,27 +0,0 @@ -// These are the configration parameters that can be found publicly sources -// Turing HW def file -#ifndef TURING_RTX2070_DEF_H -#define TURING_RTX2070_DEF_H - -#include "./common/common.h" -#include "./common/deviceQuery.h" - -#define L1_SIZE (64 * 1024) // Max L1 size in bytes - -#define CLK_FREQUENCY 1365 // frequency in MHz - -#define ISSUE_MODEL issue_model::single // single issue core or dual issue -#define 
CORE_MODEL core_model::subcore // subcore model or shared model -#define DRAM_MODEL dram_model::GDDR6 // memory type -#define WARP_SCHEDS_PER_SM 4 // number of warp schedulers per SM - -// number of SASS HMMA per 16x16 PTX WMMA for FP16 - FP32 accumlate operation -#define SASS_hmma_per_PTX_wmma 4 - -// These vars are almost constant between HW generation -// see slide 24 from Nvidia at -// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf -#define L2_BANKS_PER_MEM_CHANNEL 2 -#define L2_BANK_WIDTH_in_BYTE 32 - -#endif diff --git a/util/tuner/GPU_Microbenchmark/hw_def/volta_QV100_hw_def.h b/util/tuner/GPU_Microbenchmark/hw_def/volta_QV100_hw_def.h deleted file mode 100644 index c307e3266..000000000 --- a/util/tuner/GPU_Microbenchmark/hw_def/volta_QV100_hw_def.h +++ /dev/null @@ -1,32 +0,0 @@ -// These are the configration parameters that can be found publicly -// Volta QV100 HW def file (sm_70) -// Data source: -// https://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf - -#ifndef VOLTA_QV100_HW_DEF_H -#define VOLTA_QV100_HW_DEF_H - -#include "./common/common.h" -#include "./common/deviceQuery.h" - -#define L1_SIZE (128 * 1024) // Max L1 size in bytes - -#define CLK_FREQUENCY 1132 // frequency in MHz - -#define ISSUE_MODEL issue_model::single -#define CORE_MODEL core_model::subcore -#define DRAM_MODEL dram_model::HBM -#define WARP_SCHEDS_PER_SM 4 - -// see slide 22 at -// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf -// number of SASS HMMA per 16x16 PTX WMMA for FP16 operands - FP32 accumlate operation -#define SASS_hmma_per_PTX_wmma 16 - -// These vars are almost constant between HW generation -// see slide 24 from Nvidia at -// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf -#define 
L2_BANKS_PER_MEM_CHANNEL 2 -#define L2_BANK_WIDTH_in_BYTE 32 - -#endif diff --git a/util/tuner/GPU_Microbenchmark/hw_def/volta_TITANV_hw_def.h b/util/tuner/GPU_Microbenchmark/hw_def/volta_TITANV_hw_def.h deleted file mode 100644 index 3548d66e1..000000000 --- a/util/tuner/GPU_Microbenchmark/hw_def/volta_TITANV_hw_def.h +++ /dev/null @@ -1,32 +0,0 @@ -// These are the configration parameters that can be found publicly -// Volta TITANV HW def file (sm_70) -// Data source: -// https://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf - -#ifndef VOLTA_TITANVV100_HW_DEF_H -#define VOLTA_TITANV_HW_DEF_H - -#include "./common/common.h" -#include "./common/deviceQuery.h" - -#define L1_SIZE (128 * 1024) // Max L1 size in bytes - -#define CLK_FREQUENCY 1200 // frequency in MHz - -#define ISSUE_MODEL issue_model::single -#define CORE_MODEL core_model::subcore -#define DRAM_MODEL dram_model::HBM -#define WARP_SCHEDS_PER_SM 4 - -// see slide 22 at -// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf - // number of SASS HMMA per 16x16 PTX WMMA for FP16 - FP32 accumlate operation -#define SASS_hmma_per_PTX_wmma 16 - -// These vars are almost constant between HW generation -// see slide 24 from Nvidia at -// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf -#define L2_BANKS_PER_MEM_CHANNEL 2 -#define L2_BANK_WIDTH_in_BYTE 32 - -#endif diff --git a/util/tuner/GPU_Microbenchmark/hw_def/volta_V100_hw_def.h b/util/tuner/GPU_Microbenchmark/hw_def/volta_V100_hw_def.h deleted file mode 100644 index d25b24efa..000000000 --- a/util/tuner/GPU_Microbenchmark/hw_def/volta_V100_hw_def.h +++ /dev/null @@ -1,33 +0,0 @@ -// These are the configration parameters that can be found publicly -// Volta QV100 HW def file (sm_70) -// Data source: -// 
https://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf - -#ifndef VOLTA_V100_HW_DEF_H -#define VOLTA_V100_HW_DEF_H - -#include "./common/common.h" -#include "./common/deviceQuery.h" - -#define L1_SIZE (128 * 1024) // Max L1 size in bytes - -#define CLK_FREQUENCY 1455 // frequency in MHz - -#define ISSUE_MODEL issue_model::single -#define CORE_MODEL core_model::subcore -#define DRAM_MODEL dram_model::HBM -#define WARP_SCHEDS_PER_SM 4 - -// see slide 22 at -// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf -// number of SASS HMMA per 16x16 PTX WMMA for FP16 - FP32 accumlate operation -#define SASS_hmma_per_PTX_wmma 16 - -// These vars are almost constant between HW generations -// see slide 24 at -// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf -// each memory channel is supported by 2 L2 banks -#define L2_BANKS_PER_MEM_CHANNEL 2 -#define L2_BANK_WIDTH_in_BYTE 32 - -#endif diff --git a/util/tuner/GPU_Microbenchmark/output.file b/util/tuner/GPU_Microbenchmark/output.file deleted file mode 100644 index 0a78d971a..000000000 --- a/util/tuner/GPU_Microbenchmark/output.file +++ /dev/null @@ -1,509 +0,0 @@ -running ./L1asso.csv microbenchmark -///////////////////////////////// -running ./L1line.csv microbenchmark -///////////////////////////////// -running ./MSHR100_array1073741824_shmem12288_itr6.csv microbenchmark -///////////////////////////////// -running ./MaxFlops_double microbenchmark -DPU FLOP per SM = 63.930252 (flop/clk/SM) -Total Clk number = 524860 -///////////////////////////////// -running ./MaxFlops_float microbenchmark -FLOP per SM = 126.861778 (flop/clk/SM) -Total Clk number = 66124 -///////////////////////////////// -running ./MaxFlops_half microbenchmark -half FLOP per SM = 249.334442 (flop/clk/SM) -Total Clk number = 16822 -///////////////////////////////// 
-running ./MaxFlops_int32 microbenchmark -int32 FLOP per SM = 126.886719 (flop/clk/SM) -Total Clk number = 66111 -///////////////////////////////// -running ./atomic_add_bw microbenchmark -Atomic int32 bandwidth = 0.000026 (byte/clk) -Total Clk number = 408780625932820 -///////////////////////////////// -running ./atomic_add_bw_conflict microbenchmark -Atomic int32 bandwidth = 0.464460 (byte/clk) -Total Clk number = 1444878939 -///////////////////////////////// -running ./atomic_add_lat microbenchmark -Atomic int32 latency = 243.626953 (clk) -Total Clk number = 249474 -///////////////////////////////// -running ./config_dpu microbenchmark -DPU FLOP per SM = 63.925381 (flop/clk/SM) -Total Clk number = 524900 -double-precision DPU latency = 8.064270 (clk) -Total Clk number = 132125 - -//Accel_Sim config: --gpgpu_num_dp_units 4 --ptx_opcode_latency_dp 8,8,8,8,330 --ptx_opcode_initiation_dp 4,4,4,4,130 --trace_opcode_latency_initiation_dp 8,4 -///////////////////////////////// -running ./config_fpu microbenchmark -FLOP per SM = 126.865616 (flop/clk/SM) -Total Clk number = 66122 -float-precision FPU latency = 4.119690 (clk) -Total Clk number = 67497 - -//Accel_Sim config: --gpgpu_num_sp_units 4 --ptx_opcode_latency_fp 4,4,4,4,39 --ptx_opcode_initiation_fp 2,2,2,2,4 --trace_opcode_latency_initiation_sp 4,2 -///////////////////////////////// -running ./config_int microbenchmark -int32 FLOP per SM = 126.886719 (flop/clk/SM) -Total Clk number = 66111 -int32 latency = 4.313965 (clk) -Total Clk number = 17670 - -//Accel_Sim config: --gpgpu_num_int_units 4 --ptx_opcode_latency_int 4,4,4,4,21 --ptx_opcode_initiation_int 2,2,2,2,2 --trace_opcode_latency_initiation_int 4,2 -///////////////////////////////// -running ./config_sfu microbenchmark -SFU fast sqrt bw = 15.9759(flops/clk/SM) -Total Clk number = 262539 -SFU fast sqrt latency = 21.1096(clk) -Total Clk number = 86465 - -//Accel_Sim config: --gpgpu_num_sfu_units 4 --ptx_opcode_latency_sfu 21 --ptx_opcode_initiation_sfu 8 
--trace_opcode_latency_initiation_sfu 21,8 -///////////////////////////////// -running ./config_tensor microbenchmark -wmma PTX issue bandwidth = 3.73122(thread/clk/SM) -hmma SASS issue bandwidth = 59.6994(thread/clk/SM) -FMA tensor bandwidth = 477.596(FMA/clk/SM) -Total Clk number = 562056 -wmma latency = 35.3401(clk) -hmma latency = 2.20876(clk) -Total Clk number = 144753 - -//Accel_Sim config: --gpgpu_tensor_core_avail 1 --gpgpu_num_tensor_core_units 4 --ptx_opcode_latency_tesnor 35 --ptx_opcode_initiation_tensor 32 --trace_opcode_latency_initiation_tensor 2,2 --specialized_unit_3 1,4,2,4,4,TENSOR --trace_opcode_latency_initiation_spec_op_3 2,2 -///////////////////////////////// -running ./config_udp microbenchmark --specialized_unit_4 1,4,4,4,4,UDP --trace_opcode_latency_initiation_spec_op_4 4,1 -///////////////////////////////// -running ./core_config microbenchmark -CUDA version number = 7.0 - -//Accel_Sim config: --gpgpu_ptx_force_max_capability 70 --gpgpu_shader_registers 65536 --gpgpu_registers_per_block 65536 --gpgpu_occupancy_sm_number 70 --gpgpu_coalesce_arch 70 --gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 --gpgpu_sub_core_model 1 --gpgpu_enable_specialized_operand_collector 0 --gpgpu_operand_collector_num_units_gen 8 --gpgpu_operand_collector_num_in_ports_gen 8 --gpgpu_operand_collector_num_out_ports_gen 8 --gpgpu_num_sched_per_core 4 --gpgpu_max_insn_issue_per_warp 1 --gpgpu_dual_issue_diff_exec_units 1 --gpgpu_inst_fetch_throughput 4 --gpgpu_shader_core_pipeline 2048:32 --gpgpu_shader_cta 32 -///////////////////////////////// -running ./data.csv microbenchmark -///////////////////////////////// -running ./deviceQuery microbenchmark - Device : "TITAN V" - - CUDA version number : 7.0 - GPU Max Clock rate : 1455 MHz - Multiprocessors Count : 80 - Maximum number of threads per multiprocessor: 2048 - CUDA Cores per multiprocessor : 64 - Registers per multiprocessor : 65536 - Shared memory per multiprocessor : 98304 bytes - Warp size : 32 - Maximum 
number of threads per block : 1024 - Shared memory per block : 49152 bytes - Registers per block : 65536 - globalL1CacheSupported : 1 - localL1CacheSupported : 1 - L2 Cache Size : 4 MB - Global memory size : 12 GB - Memory Clock rate : 850 Mhz - Memory Bus Width : 3072 bit - ////////////////////////// - Device : "GeForce RTX 2060" - - CUDA version number : 7.5 - GPU Max Clock rate : 1710 MHz - Multiprocessors Count : 30 - Maximum number of threads per multiprocessor: 1024 - CUDA Cores per multiprocessor : 64 - Registers per multiprocessor : 65536 - Shared memory per multiprocessor : 65536 bytes - Warp size : 32 - Maximum number of threads per block : 1024 - Shared memory per block : 49152 bytes - Registers per block : 65536 - globalL1CacheSupported : 1 - localL1CacheSupported : 1 - L2 Cache Size : 3 MB - Global memory size : 6 GB - Memory Clock rate : 7001 Mhz - Memory Bus Width : 192 bit - ////////////////////////// - Device : "GeForce GTX TITAN X" - - CUDA version number : 5.2 - GPU Max Clock rate : 1076 MHz - Multiprocessors Count : 24 - Maximum number of threads per multiprocessor: 2048 - CUDA Cores per multiprocessor : 128 - Registers per multiprocessor : 65536 - Shared memory per multiprocessor : 98304 bytes - Warp size : 32 - Maximum number of threads per block : 1024 - Shared memory per block : 49152 bytes - Registers per block : 65536 - globalL1CacheSupported : 1 - localL1CacheSupported : 1 - L2 Cache Size : 3 MB - Global memory size : 12 GB - Memory Clock rate : 3505 Mhz - Memory Bus Width : 384 bit - ////////////////////////// - Device : "Quadro P2200" - - CUDA version number : 6.1 - GPU Max Clock rate : 1493 MHz - Multiprocessors Count : 10 - Maximum number of threads per multiprocessor: 2048 - CUDA Cores per multiprocessor : 128 - Registers per multiprocessor : 65536 - Shared memory per multiprocessor : 98304 bytes - Warp size : 32 - Maximum number of threads per block : 1024 - Shared memory per block : 49152 bytes - Registers per block : 65536 - 
globalL1CacheSupported : 1 - localL1CacheSupported : 1 - L2 Cache Size : 1 MB - Global memory size : 5 GB - Memory Clock rate : 5005 Mhz - Memory Bus Width : 160 bit - ////////////////////////// -///////////////////////////////// -running ./kernel_lat microbenchmark -Kernel Launch Latency = 7257.6 cycles -The reported latency above can be slightly higher than real. For accurate evaultion using nvprof event, exmaple: make events ./kernel_lat - -//Accel_Sim config: --gpgpu_kernel_launch_latency 7257 -///////////////////////////////// -running ./l1_access_grain microbenchmark - -This benchmark measures coalescing granularity for differnet strides. -check the nvprof or nvsight for received l1 reads and writes. -to run the program with nsight: make nvsight ./l1_access_grain -stats to look at: l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum & l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum - -///////////////////////////////// -running ./l1_adaptive microbenchmark -The ubench is not imepleneted yet. -///////////////////////////////// -running ./l1_associativity microbenchmark -Launching L1 cache line size ubench -Saving L1 cache line size data at L1line.csv -Launching L1 cache assoc ubench -Saving L1 cache assoc data at L1asso.csv -///////////////////////////////// -running ./l1_banks microbenchmark -The ubench is not imepleneted yet. 
-///////////////////////////////// -running ./l1_bw_128 microbenchmark -L1 bandwidth = 116.437(byte/clk/SM), 130.129(GB/s/SM) -Total Clk number = 36022 -///////////////////////////////// -running ./l1_bw_32f microbenchmark -L1 bandwidth = 78.3484(byte/clk/SM), 87.5612(GB/s/SM) -Total Clk number = 53534 -///////////////////////////////// -running ./l1_bw_32f_unroll microbenchmark -L1 bandwidth = 54.837540 (byte/clk/SM) -Total Clk number = 76486 -///////////////////////////////// -running ./l1_bw_64f microbenchmark -L1 bandwidth = 122.759(byte/clk/SM), 137.194(GB/s/SM) -Total Clk number = 34167 -///////////////////////////////// -running ./l1_bw_64v microbenchmark -L1 bandwidth = 113.883(byte/clk/SM), 127.274(GB/s/SM) -Total Clk number = 18415 -///////////////////////////////// -running ./l1_config microbenchmark - -//Accel_Sim config: --gpgpu_adaptive_cache_config 1 --gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:4,128,64,L:L:m:N:L,A:512:64,16:0,32 --gpgpu_gmem_skip_L1D 0 -///////////////////////////////// -running ./l1_lat microbenchmark -L1 Latency = 33.7331 cycles -Total Clk number = 1105365 - -//Accel_Sim config: --gpgpu_l1_latency = 33 -///////////////////////////////// -running ./l1_mshr microbenchmark -Launching L1 MSHR ubench -Saving L1 MSHR data at MSHR100_array1073741824_shmem12288_itr6.csv -///////////////////////////////// -running ./l1_sector microbenchmark -Launching L1 sector ubench -Saving L1 sector data at data.csv -///////////////////////////////// -running ./l1_shared_bw microbenchmark -Shared Memory Bandwidth = 99.708586 (byte/clk/SM) -Total Clk number = 336525 -///////////////////////////////// -running ./l1_write_policy microbenchmark - -This microbenchmark detects L1 write policy. -check the nvprof or nvsight for received l1 reads and writes to detect the policy. 
-see the code comments for further details -to run the program with nvsight: make nvsight ./l1_write_policy -stats to look at: l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum & l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum & l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_hit.sum & l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_hit.sum - -///////////////////////////////// -running ./l2_access_grain microbenchmark - -This benchmark measures l2 access granularity for differnet strides. -check the nvprof or nvsight for received l2 reads and write. -to run the program with nsight: make nvsight ./l2_access_grain -stats to look at: lts__t_sectors_srcunit_tex_op_read.sum and lts__t_sectors_srcunit_tex_op_write.sum - -///////////////////////////////// -running ./l2_bw_128 microbenchmark -L2 bandwidth = 1365.73(byte/clk), 1526.33(GB/s) -Max Theortical L2 bandwidth = 1536(byte/clk), 1716.61(GB/s) -L2 BW achievable = 88.9149% -Total Clk number = 491376 -///////////////////////////////// -running ./l2_bw_32f microbenchmark -L2 bandwidth = 1365.42(byte/clk), 1525.97(GB/s) -Max Theortical L2 bandwidth = 1536(byte/clk), 1716.61(GB/s) -L2 BW achievable = 88.8942% -Total Clk number = 982981 -///////////////////////////////// -running ./l2_bw_64f microbenchmark -L2 bandwidth = 1384.53(byte/clk), 1547.33(GB/s) -Max Theortical L2 bandwidth = 1536(byte/clk), 1716.61(GB/s) -L2 BW achievable = 90.1385% -Total Clk number = 1938823 -///////////////////////////////// -running ./l2_config microbenchmark -L2 Cache Size = 4 MB -L2 Banks number = 48 - -//Accel_Sim config: --gpgpu_n_sub_partition_per_mchannel 2 --icnt_flit_size 40 --gpgpu_memory_partition_indexing 0 --gpgpu_cache:dl2 S:32,128,24,L:B:m:L:P,A:192:4,32:0,32 -///////////////////////////////// -running ./l2_copy_engine microbenchmark -L2 Latency no-warmp up = 213.6997 cycles -Total Clk number = 7002512 -L2 Hit Latency = 220.1863 cycles -Total Clk number = 7215066 -Is memcpy cached in L2? 
Yes, error=2.9 - -//Accel_Sim config: --gpgpu_perf_sim_memcpy 1 -///////////////////////////////// -running ./l2_lat microbenchmark -L2 Hit Latency = 211.0720 cycles -Total Clk number = 6916406 -L1 Latency = 33.7729 cycles -Total Clk number = 1106672 - -//Accel_Sim config: --gpgpu_l2_rop_latency 177 -///////////////////////////////// -running ./l2_write_policy microbenchmark - -This microbenchmark detects L1 write policy. -check the nvprof or nvsight for received l1 reads and writes to detect the policy. -see the code comments for further details -to run the program with nvsight: make nvsight ./l1_write_policy -stats to look at: llts__t_sectors_srcunit_tex_op_read.sum & lts__t_sectors_srcunit_tex_op_write.sum & lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum & lts__t_sectors_srcunit_tex_op_write_lookup_hit.sum - -///////////////////////////////// -running ./lat_double microbenchmark -double-precision DPU latency = 8.075317 (clk) -Total Clk number = 132306 -///////////////////////////////// -running ./lat_float microbenchmark -float-precision FPU latency = 4.128845 (clk) -Total Clk number = 67647 -///////////////////////////////// -running ./lat_half microbenchmark -fpu16 latency = 6.180664 (clk) -Total Clk number = 25316 -///////////////////////////////// -running ./lat_int32 microbenchmark -int32 latency = 4.349609 (clk) -Total Clk number = 17816 -///////////////////////////////// -running ./list_devices microbenchmark - -Device 0: "TITAN V sm_7.0" - -Device 1: "GeForce RTX 2060 sm_7.5" - -Device 2: "GeForce GTX TITAN X sm_5.2" - -Device 3: "Quadro P2200 sm_6.1" -///////////////////////////////// -running ./mem_atom_size microbenchmark - -This benchmark measures mem atom size granularity -check the nvprof or nvsight for received mem reads and writes -to run the program with nsight: make nvsight ./l2_access_grain -stats to look at: dram__sectors_read.sum & dram__sectors_write.sum & dram__bytes_read.sum & dram__sectors_read.sum - -we launched 2359296 read memory 
reqs (1 req per thread) with a stride of 32 (128 bytes) -if the number of memory reads is the same as read reqs, then mem atom size is 32B -if the number of memory reads is 2X issued read reqs, then mem atom size is 64B, etc. - -///////////////////////////////// -running ./mem_bw microbenchmark -Mem BW= 445.770477 (Byte/Clk) -Mem BW= 521.045920 (GB/sec) -Max Theortical Mem BW= 652.799988 (GB/sec) -Mem Efficiency = 68.285919 % -Total Clk number = 127023 -///////////////////////////////// -running ./mem_config microbenchmark -Global memory size = 12 GB -Memory Clock rate = 850 Mhz -Memory Bus Width = 3072 bit -Memory type = HBM -Memory channels = 24 - -//Accel_Sim config: --gpgpu_n_mem 24 --gpgpu_n_mem_per_ctrlr 1 --gpgpu_dram_buswidth 16 --gpgpu_dram_burst_length 2 --dram_data_command_freq_ratio 2 --dram_dual_bus_interface 1 --gpgpu_dram_timing_opt nbk=16:CCD=1:RRD=4:RCD=12:RAS=29:RP=12:RC=40:CL=12:WL=2:CDLR=3:WR=11:nbkgrp=4:CCDL=2:RTPL=4 -///////////////////////////////// -running ./mem_lat microbenchmark -Mem latency = 313.4630 cycles -Total Clk number = 2567889 -L2 Hit Latency = 209.9695 cycles -Total Clk number = 6880281 - -//Accel_Sim config: --dram_latency 104 -///////////////////////////////// -running ./regfile_bw microbenchmark -wmma PTX issue bandwidth = 3.73473(thread/clk/SM) -hmma SASS issue bandwidth = 59.7557(thread/clk/SM) -FMA tensor bandwidth = 478.046(FMA/clk/SM) -Total Clk number = 561527 - -regfile_bw = 2048 (byte/SM) - -//Accel_Sim config: --gpgpu_num_reg_banks 16 --gpgpu_reg_file_port_throughput 2 -///////////////////////////////// -running ./sfu_bw_fsqrt microbenchmark -SFU fast sqrt bw = 15.976(flops/clk/SM) -Total Clk number = 262538 -///////////////////////////////// -running ./sfu_lat_fsqrt microbenchmark -SFU fast sqrt latency = 21.1453(clk) -Total Clk number = 86611 -///////////////////////////////// -running ./shared_bw microbenchmark -Shared Memory Bandwidth = 126.48(byte/clk/SM), 141.353(GB/s/SM) -Total Clk number = 132647 
-///////////////////////////////// -running ./shared_bw_64 microbenchmark -Shared Memory Bandwidth = 127.932(byte/clk/SM), 142.975(GB/s/SM) -Total Clk number = 262283 -///////////////////////////////// -running ./shared_lat microbenchmark -Shared Memory Latency = 27.010254 cycles -Total Clk number = 55317 - -//Accel_Sim config: --gpgpu_smem_latency 27 -///////////////////////////////// -running ./shd_config microbenchmark -Shared memory per multiprocessor = 98304 bytes -Shared memory per block = 49152 bytes - -//Accel_Sim config: --gpgpu_shmem_size 98304 --gpgpu_shmem_sizeDefault 98304 --gpgpu_shmem_per_block 49152 -///////////////////////////////// -running ./system_config microbenchmark -Device Name = TITAN V -GPU Max Clock rate = 1455 MHz -GPU Base Clock rate = 1200 MHz -SM Count : 80 -CUDA version number = 7.0 - -//Accel_Sim config: --gpgpu_compute_capability_major 7 --gpgpu_compute_capability_minor 0 --gpgpu_n_clusters 80 --gpgpu_n_cores_per_cluster 1 --gpgpu_clock_domains 1200:1200:1200:850 -///////////////////////////////// -running ./tensor_bw_half microbenchmark -FP16 operand, FP32 accumalte: -wmma PTX issue bandwidth = 3.74006(thread/clk/SM) -hmma SASS issue bandwidth = 59.8409(thread/clk/SM) -FMA tensor bandwidth = 478.728(FMA/clk/SM) -Total Clk number = 560727 - -FP16 operand, FP16 accumalte: -wmma PTX issue bandwidth = 3.97989(thread/clk/SM) -hmma SASS issue bandwidth = 63.6783(thread/clk/SM) -FMA tensor bandwidth = 509.426(FMA/clk/SM) -Total Clk number = 526937 -///////////////////////////////// -running ./tensor_lat_half microbenchmark -FP16 operand, FP32 accumalte: -wmma latency = 35.3523(clk) -hmma latency = 2.20952(clk) -Total Clk number = 144803 - -FP16 operand, FP16 accumalte: -wmma latency = 33.0029(clk) -hmma latency = 2.06268(clk) -Total Clk number = 135180 -///////////////////////////////// diff --git a/util/tuner/GPU_Microbenchmark/run_all.sh b/util/tuner/GPU_Microbenchmark/run_all.sh deleted file mode 100755 index 40a924e5b..000000000 --- 
a/util/tuner/GPU_Microbenchmark/run_all.sh +++ /dev/null @@ -1,11 +0,0 @@ -#! /bin/sh - -THIS_DIR="$( cd "$( dirname "$BASH_SOURCE" )" && pwd )" -SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" - -cd ${SCRIPT_DIR}/bin/ -for f in ./*; do - echo "running $f microbenchmark" - $f - echo "/////////////////////////////////" -done diff --git a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/Makefile b/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/Makefile deleted file mode 100644 index 032a57e18..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/Makefile +++ /dev/null @@ -1,6 +0,0 @@ - -SRC = atomic_add_bw.cu - -EXE = atomic_add_bw - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/atomic_add_bw.cu b/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/atomic_add_bw.cu deleted file mode 100644 index eb26a7e68..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/atomic_add_bw.cu +++ /dev/null @@ -1,93 +0,0 @@ -#include -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_TIMES 2048 - -template -__global__ void atomic_bw(uint64_t *startClk, uint64_t *stopClk, T *data1, - T *res) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - // register T s1 = data1[gid]; - // register T s2 = data2[gid]; - // register T result = 0; - // synchronize all threads - // int32_t res0, res1, res2, res3, res4, res5, res6, res7, res8, res9, res10, - // res11, res12, res13, res14, res15; - int32_t sum; - asm volatile("bar.sync 0;"); - - // start timing - uint64_t start = clock64(); - - for (uint32_t i = 0; i < REPEAT_TIMES; i++) { - sum = sum + atomicAdd(&data1[(i * warpSize) + gid], 10); - } - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - uint64_t stop = clock64(); - - // write time and data back to memory - startClk[gid] = start; - stopClk[gid] = stop; - res[gid] = sum; -} - -int 
main() { - - intilizeDeviceProp(0); - unsigned ARRAY_SIZE = TOTAL_THREADS + (REPEAT_TIMES * WARP_SIZE); - - uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - - int32_t *res = (int32_t *)malloc(TOTAL_THREADS * sizeof(int32_t)); - int32_t *data1 = (int32_t *)malloc(ARRAY_SIZE * sizeof(int32_t)); - - uint64_t *startClk_g; - uint64_t *stopClk_g; - int32_t *data1_g; - int32_t *res_g; - - for (uint32_t i = 0; i < ARRAY_SIZE; i++) { - data1[i] = (int32_t)i; - } - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&data1_g, ARRAY_SIZE * sizeof(int32_t))); - gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(int32_t))); - - gpuErrchk(cudaMemcpy(data1_g, data1, ARRAY_SIZE * sizeof(int32_t), - cudaMemcpyHostToDevice)); - - atomic_bw<<>>(startClk_g, stopClk_g, - data1_g, res_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(int32_t), - cudaMemcpyDeviceToHost)); - - float bw; - uint64_t total_time = - *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - - *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); - // uint64_t total_time = stopClk[0]-startClk[0]; - - bw = (((float)REPEAT_TIMES * (float)TOTAL_THREADS * 4 * 8) / - (float)(total_time)); - printf("Atomic int32 bandwidth = %f (byte/clk)\n", bw); - printf("Total Clk number = %ld \n", total_time); - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/Makefile b/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/Makefile deleted file mode 100644 index b85a3d827..000000000 --- 
a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/Makefile +++ /dev/null @@ -1,6 +0,0 @@ - -SRC = atomic_add_bw_conflict.cu - -EXE = atomic_add_bw_conflict - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/atomic_add_bw_conflict.cu b/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/atomic_add_bw_conflict.cu deleted file mode 100644 index 193c16b9c..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/atomic_add_bw_conflict.cu +++ /dev/null @@ -1,85 +0,0 @@ -#include -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_TIMES 1024 - -template -__global__ void atomic_bw(uint32_t *startClk, uint32_t *stopClk, T *data1, - T *res) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - uint32_t sum; - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint32_t start = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - - for (int j = 0; j < REPEAT_TIMES; ++j) { - sum = sum + atomicAdd(&data1[0], 10); - } - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - uint32_t stop = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // write time and data back to memory - startClk[gid] = start; - stopClk[gid] = stop; - res[gid] = sum; -} - -int main() { - intilizeDeviceProp(0); - - uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - int32_t *data1 = (int32_t *)malloc(TOTAL_THREADS * sizeof(int32_t)); - int32_t *res = (int32_t *)malloc(TOTAL_THREADS * sizeof(int32_t)); - - uint32_t *startClk_g; - uint32_t *stopClk_g; - int32_t *data1_g; - int32_t *res_g; - - for (uint32_t i = 0; i < TOTAL_THREADS; i++) { - data1[i] = (int32_t)i; - } - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t))); - 
gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(int32_t))); - gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(int32_t))); - - gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(int32_t), - cudaMemcpyHostToDevice)); - - atomic_bw<<>>(startClk_g, stopClk_g, - data1_g, res_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(int32_t), - cudaMemcpyDeviceToHost)); - - float bw; - uint32_t total_time = - *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - - *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); - // uint32_t total_time = stopClk[0] - startClk[0]; - bw = ((float)(REPEAT_TIMES * TOTAL_THREADS * 4) / (float)(total_time)); - printf("Atomic int32 bandwidth = %f (byte/clk)\n", bw); - printf("Total Clk number = %u \n", total_time); - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/Makefile b/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/Makefile deleted file mode 100644 index 27e1f984f..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/Makefile +++ /dev/null @@ -1,6 +0,0 @@ - -SRC = atomic_add_lat.cu - -EXE = atomic_add_lat - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/atomic_add_lat.cu b/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/atomic_add_lat.cu deleted file mode 100644 index d31cbcb48..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/atomic_add_lat.cu +++ /dev/null @@ -1,86 +0,0 @@ -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_TIMES 1024 - -template -__global__ void 
atmoic_latency(uint32_t *startClk, uint32_t *stopClk, T *data1, - T *res) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - // register T s1 = data1[gid]; - // register T s2 = data2[gid]; - // register T result = 0; - uint32_t index = 0; - int32_t offset = 10; - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint32_t start = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - for (int j = 0; j < REPEAT_TIMES; ++j) { - index = atomicAdd(&data1[index], offset); - } - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - uint32_t stop = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // write time and data back to memory - startClk[gid] = start; - stopClk[gid] = stop; - res[gid] = data1[0]; -} - -int main() { - intilizeDeviceProp(0); - - THREADS_PER_BLOCK = 1; - THREADS_PER_SM = 1; - BLOCKS_NUM = 1; - TOTAL_THREADS = 1; - - uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - int32_t *data1 = (int32_t *)malloc(REPEAT_TIMES * sizeof(int32_t)); - int32_t *res = (int32_t *)malloc(TOTAL_THREADS * sizeof(int32_t)); - - uint32_t *startClk_g; - uint32_t *stopClk_g; - int32_t *data1_g; - int32_t *res_g; - - int32_t stride = 1; - - for (int32_t i = 0; i < (REPEAT_TIMES); i++) - data1[i] = (i + stride) % REPEAT_TIMES; - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&data1_g, REPEAT_TIMES * sizeof(int32_t))); - gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(int32_t))); - gpuErrchk(cudaMemcpy(data1_g, data1, REPEAT_TIMES * sizeof(int32_t), - cudaMemcpyHostToDevice)); - - atmoic_latency<<<1, 1>>>(startClk_g, stopClk_g, data1_g, res_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t), - 
cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(int32_t), - cudaMemcpyDeviceToHost)); - - float latency; - latency = ((float)(stopClk[0] - startClk[0])) / ((float)(REPEAT_TIMES)); - printf("Atomic int32 latency = %f (clk)\n", latency); - printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_double/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_double/Makefile deleted file mode 100644 index eccd44b1b..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_double/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = MaxFlops_double.cu - -EXE = MaxFlops_double - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_double/MaxFlops_double.cu b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_double/MaxFlops_double.cu deleted file mode 100644 index 2b01f7cad..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_double/MaxFlops_double.cu +++ /dev/null @@ -1,9 +0,0 @@ -#include "MaxFlops_double.h" - -int main() { - intilizeDeviceProp(0); - - dpu_max_flops(); - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_double/MaxFlops_double.h b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_double/MaxFlops_double.h deleted file mode 100644 index bcac5309a..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_double/MaxFlops_double.h +++ /dev/null @@ -1,103 +0,0 @@ -#ifndef MAXFLOPS_DOUBLE_DEF_H -#define MAXFLOPS_DOUBLE_DEF_H - -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_TIMES 4096 - -template -__global__ void max_flops(uint32_t *startClk, uint32_t *stopClk, T *data1, - T *data2, T *res) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - register T s1 = data1[gid]; - register 
T s2 = data2[gid]; - register T result = 0; - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint32_t start = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - - for (int j = 0; j < REPEAT_TIMES; ++j) { - asm volatile("{\t\n" - "fma.rn.f64 %0, %1, %2 , %0;\n\t" - "fma.rn.f64 %0, %1, %2 , %0;\n\t" - "fma.rn.f64 %0, %1, %2 , %0;\n\t" - "fma.rn.f64 %0, %1, %2 , %0;\n\t" - "}" - : "+d"(result), "+d"(s1), "+d"(s2)); - } - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - uint32_t stop = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // write time and data back to memory - startClk[gid] = start; - stopClk[gid] = stop; - res[gid] = result; -} - -float dpu_max_flops() { - BLOCKS_NUM = 1; - TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; - - uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - double *data1 = (double *)malloc(TOTAL_THREADS * sizeof(double)); - double *data2 = (double *)malloc(TOTAL_THREADS * sizeof(double)); - double *res = (double *)malloc(TOTAL_THREADS * sizeof(double)); - - uint32_t *startClk_g; - uint32_t *stopClk_g; - double *data1_g; - double *data2_g; - double *res_g; - - for (uint32_t i = 0; i < TOTAL_THREADS; i++) { - data1[i] = (double)i; - data2[i] = (double)i; - } - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(double))); - gpuErrchk(cudaMalloc(&data2_g, TOTAL_THREADS * sizeof(double))); - gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(double))); - - gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(double), - cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(data2_g, data2, TOTAL_THREADS * sizeof(double), - cudaMemcpyHostToDevice)); - - max_flops<<>>(startClk_g, stopClk_g, - 
data1_g, data2_g, res_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(double), - cudaMemcpyDeviceToHost)); - - float flops; - flops = (float)(REPEAT_TIMES * TOTAL_THREADS * 8) / - ((float)(stopClk[0] - startClk[0])); - printf("DPU FLOP per SM = %f (flop/clk/SM)\n", flops); - printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); - - return flops; -} - -#endif diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_float/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_float/Makefile deleted file mode 100644 index 36acf26ef..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_float/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = MaxFlops_float.cu - -EXE = MaxFlops_float - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_float/MaxFlops_float.cu b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_float/MaxFlops_float.cu deleted file mode 100644 index b9482fc96..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_float/MaxFlops_float.cu +++ /dev/null @@ -1,9 +0,0 @@ -#include "MaxFlops_float.h" - -int main() { - intilizeDeviceProp(0); - - fpu_max_flops(); - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_float/MaxFlops_float.h b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_float/MaxFlops_float.h deleted file mode 100644 index 88c1a82f2..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_float/MaxFlops_float.h +++ /dev/null @@ -1,103 +0,0 @@ -#ifndef MAXFLOPS_FLOAT_DEF_H -#define MAXFLOPS_FLOAT_DEF_H - -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_ITERS 1024 - -template -__global__ void max_flops(uint32_t 
*startClk, uint32_t *stopClk, T *data1, - T *data2, T *res) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - register T s1 = data1[gid]; - register T s2 = data2[gid]; - register T result = 0; - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint32_t start = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - - for (int j = 0; j < REPEAT_ITERS; ++j) { - asm volatile("{\t\n" - "fma.rn.f32 %0, %1, %2 , %0;\n\t" - "fma.rn.f32 %0, %1, %2 , %0;\n\t" - "fma.rn.f32 %0, %1, %2 , %0;\n\t" - "fma.rn.f32 %0, %1, %2 , %0;\n\t" - "}" - : "+f"(result), "+f"(s1), "+f"(s2)); - } - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - uint32_t stop = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // write time and data back to memory - startClk[gid] = start; - stopClk[gid] = stop; - res[gid] = result; -} - -int fpu_max_flops() { - intilizeDeviceProp(0); - - BLOCKS_NUM = 1; - TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; - - uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - float *data1 = (float *)malloc(TOTAL_THREADS * sizeof(float)); - float *data2 = (float *)malloc(TOTAL_THREADS * sizeof(float)); - float *res = (float *)malloc(TOTAL_THREADS * sizeof(float)); - - uint32_t *startClk_g; - uint32_t *stopClk_g; - float *data1_g; - float *data2_g; - float *res_g; - - for (uint32_t i = 0; i < TOTAL_THREADS; i++) { - data1[i] = (float)i; - data2[i] = (float)i; - } - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(float))); - gpuErrchk(cudaMalloc(&data2_g, TOTAL_THREADS * sizeof(float))); - gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(float))); - - gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(float), - 
cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(data2_g, data2, TOTAL_THREADS * sizeof(float), - cudaMemcpyHostToDevice)); - - max_flops<<>>(startClk_g, stopClk_g, - data1_g, data2_g, res_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(float), - cudaMemcpyDeviceToHost)); - - float flops; - flops = (float)(REPEAT_ITERS * TOTAL_THREADS * 8) / - ((float)(stopClk[0] - startClk[0])); - printf("FLOP per SM = %f (flop/clk/SM)\n", flops); - printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); - - return flops; -} - -#endif diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_half/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_half/Makefile deleted file mode 100644 index db878492c..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_half/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -GENCODE_SM30 := -GENCODE_SM35 := -GENCODE_SM50 := - -SRC = MaxFlops_half.cu - -EXE = MaxFlops_half - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_half/MaxFlops_half.cu b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_half/MaxFlops_half.cu deleted file mode 100644 index 024d442df..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_half/MaxFlops_half.cu +++ /dev/null @@ -1,9 +0,0 @@ -#include "MaxFlops_half.h" - -int main() { - intilizeDeviceProp(0); - - fpu16_max_flops(); - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_half/MaxFlops_half.h b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_half/MaxFlops_half.h deleted file mode 100644 index 15b2b803d..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_half/MaxFlops_half.h +++ /dev/null @@ -1,101 +0,0 @@ -#ifndef 
MAXFLOPS_FP16_DEF_H -#define MAXFLOPS_FP16_DEF_H - -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_TIMES 1024 - -__global__ void fpu16_max_flops(uint32_t *startClk, uint32_t *stopClk, - half *data1, half *data2, half *data3, - half *data4, half *res) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - half s2 = data2[gid]; - half s4 = data4[gid]; - half2 mult = __halves2half2(s2, s4); - half result1 = data1[gid]; - half result2 = data3[gid]; - half2 result = __halves2half2(result1, result2); - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint32_t start = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - - for (int j = 0; j < REPEAT_TIMES; ++j) { - result = result * mult + result; - } - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - uint32_t stop = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // write time and data back to memory - startClk[gid] = start; - stopClk[gid] = stop; - res[gid] = __high2half(result) + __low2half(result); -} - -float fpu16_max_flops() { - intilizeDeviceProp(0); - - BLOCKS_NUM = 1; - TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; - - uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - half *data1 = (half *)malloc(TOTAL_THREADS * sizeof(half)); - half *data2 = (half *)malloc(TOTAL_THREADS * sizeof(half)); - half *res = (half *)malloc(TOTAL_THREADS * sizeof(half)); - - uint32_t *startClk_g; - uint32_t *stopClk_g; - half *data1_g; - half *data2_g; - half *res_g; - - for (uint32_t i = 0; i < TOTAL_THREADS; i++) { - data1[i] = (half)i; - data2[i] = (half)i; - } - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(half))); - 
gpuErrchk(cudaMalloc(&data2_g, TOTAL_THREADS * sizeof(half))); - gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(half))); - - gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(half), - cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(data2_g, data2, TOTAL_THREADS * sizeof(half), - cudaMemcpyHostToDevice)); - - fpu16_max_flops<<>>( - startClk_g, stopClk_g, data1_g, data2_g, data1_g, data2_g, res_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(half), - cudaMemcpyDeviceToHost)); - - float flops; - flops = (float)(REPEAT_TIMES * TOTAL_THREADS * 4) / - ((float)(stopClk[0] - startClk[0])); - printf("half FLOP per SM = %f (flop/clk/SM)\n", flops); - printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); - - return flops; -} - -#endif diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_int32/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_int32/Makefile deleted file mode 100644 index 63d6655f4..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_int32/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = MaxFlops_int32.cu - -EXE = MaxFlops_int32 - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_int32/MaxFlops_int32.cu b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_int32/MaxFlops_int32.cu deleted file mode 100644 index 01ab4e9b7..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_int32/MaxFlops_int32.cu +++ /dev/null @@ -1,10 +0,0 @@ -#include "MaxFlops_int32.h" - -int main() { - - intilizeDeviceProp(0); - - max_int32_flops(); - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_int32/MaxFlops_int32.h 
b/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_int32/MaxFlops_int32.h deleted file mode 100644 index 55b5aa585..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/MaxFlops_int32/MaxFlops_int32.h +++ /dev/null @@ -1,103 +0,0 @@ -#ifndef MAXFLOPS_INT32_DEF_H -#define MAXFLOPS_INT32_DEF_H - -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_TIMES 1024 - -template -__global__ void max_flops(uint32_t *startClk, uint32_t *stopClk, T *data1, - T *data2, T *res) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - register T s1 = data1[gid]; - register T s2 = data2[gid]; - register T result = 0; - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint32_t start = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - - for (int j = 0; j < REPEAT_TIMES; ++j) { - asm volatile("{\t\n" - "mad.lo.s32 %0, %1, %2 , %0;\n\t" - "mad.lo.s32 %0, %1, %2 , %0;\n\t" - "mad.lo.s32 %0, %1, %2 , %0;\n\t" - "mad.lo.s32 %0, %1, %2 , %0;\n\t" - "}" - : "+r"(result), "+r"(s1), "+r"(s2)); - } - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - uint32_t stop = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // write time and data back to memory - startClk[gid] = start; - stopClk[gid] = stop; - res[gid] = result; -} - -float max_int32_flops() { - intilizeDeviceProp(0); - - BLOCKS_NUM = 1; - TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; - - uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - int32_t *data1 = (int32_t *)malloc(TOTAL_THREADS * sizeof(int32_t)); - int32_t *data2 = (int32_t *)malloc(TOTAL_THREADS * sizeof(int32_t)); - int32_t *res = (int32_t *)malloc(TOTAL_THREADS * sizeof(int32_t)); - - uint32_t *startClk_g; - uint32_t *stopClk_g; - int32_t *data1_g; - int32_t *data2_g; - int32_t *res_g; - - for (uint32_t i = 0; i < TOTAL_THREADS; i++) { 
- data1[i] = (int32_t)i; - data2[i] = (int32_t)i; - } - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(int32_t))); - gpuErrchk(cudaMalloc(&data2_g, TOTAL_THREADS * sizeof(int32_t))); - gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(int32_t))); - - gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(int32_t), - cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(data2_g, data2, TOTAL_THREADS * sizeof(int32_t), - cudaMemcpyHostToDevice)); - - max_flops<<>>( - startClk_g, stopClk_g, data1_g, data2_g, res_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(int32_t), - cudaMemcpyDeviceToHost)); - - float flops; - flops = (float)(REPEAT_TIMES * TOTAL_THREADS * 8) / - ((float)(stopClk[0] - startClk[0])); - printf("int32 FLOP per SM = %f (flop/clk/SM)\n", flops); - printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); - - return flops; -} - -#endif diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_dpu/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/config_dpu/Makefile deleted file mode 100644 index 2264d7d30..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/config_dpu/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = config_dpu.cu - -EXE = config_dpu - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_dpu/config_dpu.cu b/util/tuner/GPU_Microbenchmark/ubench/core/config_dpu/config_dpu.cu deleted file mode 100644 index ffdbe8f81..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/config_dpu/config_dpu.cu +++ /dev/null @@ -1,35 +0,0 @@ - -#include "../../../hw_def/hw_def.h" 
-#include "../MaxFlops_double/MaxFlops_double.h" -#include "../lat_double/lat_double.h" - -int main() { - intilizeDeviceProp(0); - - float flops = dpu_max_flops(); - float latency = dpu_latency(); - - if (ACCEL_SIM_MODE) { - unsigned lat = (unsigned)latency; - // divide flops by 2 as we need FMA throughput - unsigned throughput_per_SM = round_up_2n(flops / 2); - float throughput_per_sched = (float)throughput_per_SM / WARP_SCHEDS_PER_SM; - - unsigned init = WARP_SIZE / throughput_per_sched; - - //init cannot be larger than latency - if(init > latency) - latency = init; - - std::cout << "\n//Accel_Sim config: \n"; - std::cout << "-gpgpu_num_dp_units " << WARP_SCHEDS_PER_SM << std::endl; - std::cout << "-ptx_opcode_latency_dp " << lat << "," << lat << "," << lat - << "," << lat << ",330" << std::endl; - std::cout << "-ptx_opcode_initiation_dp " << init << "," << init << "," - << init << "," << init << ",130" << std::endl; - std::cout << "-trace_opcode_latency_initiation_dp " << lat << "," << init - << std::endl; - } - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_fpu/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/config_fpu/Makefile deleted file mode 100644 index b7a99d190..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/config_fpu/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = config_fpu.cu - -EXE = config_fpu - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_fpu/config_fpu.cu b/util/tuner/GPU_Microbenchmark/ubench/core/config_fpu/config_fpu.cu deleted file mode 100644 index 44ed33b73..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/config_fpu/config_fpu.cu +++ /dev/null @@ -1,34 +0,0 @@ - -#include -using namespace std; - -#include "../../../hw_def/hw_def.h" -#include "../MaxFlops_float/MaxFlops_float.h" -#include "../lat_float/lat_float.h" - -int main() { - intilizeDeviceProp(0); - - float flops = fpu_max_flops(); - float latency = fpu_latency(); - - if 
(ACCEL_SIM_MODE) { - unsigned lat = (unsigned)latency; - // divide flops by 2 as we need FMA thoughput - unsigned throughput_per_SM = round_up_2n(flops / 2); - float throughput_per_sched = (float)throughput_per_SM / WARP_SCHEDS_PER_SM; - - unsigned init = WARP_SIZE / throughput_per_sched; - - std::cout << "\n//Accel_Sim config: \n"; - std::cout << "-gpgpu_num_sp_units " << WARP_SCHEDS_PER_SM << std::endl; - std::cout << "-ptx_opcode_latency_fp " << lat << "," << lat << "," << lat - << "," << lat << ",39" << std::endl; - std::cout << "-ptx_opcode_initiation_fp " << init << "," << init << "," - << init << "," << init << "," << init * 2 << std::endl; - std::cout << "-trace_opcode_latency_initiation_sp " << lat << "," << init - << std::endl; - } - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_int/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/config_int/Makefile deleted file mode 100644 index f8a024a41..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/config_int/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = config_int.cu - -EXE = config_int - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_int/config_int.cu b/util/tuner/GPU_Microbenchmark/ubench/core/config_int/config_int.cu deleted file mode 100644 index 4bb925872..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/config_int/config_int.cu +++ /dev/null @@ -1,38 +0,0 @@ - -#include -using namespace std; - -#include "../../../hw_def/hw_def.h" -#include "../MaxFlops_int32/MaxFlops_int32.h" -#include "../lat_int32/lat_int32.h" - -int main() { - intilizeDeviceProp(0); - - float flops = max_int32_flops(); - float latency = int32_latency(); - - if (ACCEL_SIM_MODE) { - unsigned lat = (unsigned)latency; - // divide by 2 as we need FMA thoughput - unsigned throughput_per_SM = round_up_2n(flops / 2); - float throughput_per_sched = (float)throughput_per_SM / WARP_SCHEDS_PER_SM; - - unsigned init = WARP_SIZE / 
throughput_per_sched; - - std::cout << "\n//Accel_Sim config: \n"; - if (deviceProp.major < 6) { // detecaited integer unit was added since Volta - std::cout << "-gpgpu_num_int_units 0" << std::endl; - } else { - std::cout << "-gpgpu_num_int_units " << WARP_SCHEDS_PER_SM << std::endl; - std::cout << "-ptx_opcode_latency_int " << lat << "," << lat << "," << lat - << "," << lat << ",21" << std::endl; - std::cout << "-ptx_opcode_initiation_int " << init << "," << init << "," - << init << "," << init << "," << init << std::endl; - std::cout << "-trace_opcode_latency_initiation_int " << lat << "," << init - << std::endl; - } - } - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_sfu/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/config_sfu/Makefile deleted file mode 100644 index c53bf55a1..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/config_sfu/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = config_sfu.cu - -EXE = config_sfu - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_sfu/config_sfu.cu b/util/tuner/GPU_Microbenchmark/ubench/core/config_sfu/config_sfu.cu deleted file mode 100644 index ac1b21fd1..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/config_sfu/config_sfu.cu +++ /dev/null @@ -1,28 +0,0 @@ - -#include "../../../hw_def/hw_def.h" -#include "../sfu_bw_fsqrt/sfu_bw_fsqrt.h" -#include "../sfu_lat_fsqrt/sfu_lat_fsqrt.h" - -int main() { - intilizeDeviceProp(0); - - float flops = sfu_max_flops(); - float latency = sfu_latency(); - - if (ACCEL_SIM_MODE) { - unsigned lat = (unsigned)latency; - unsigned throughput_per_SM = round_up_2n(flops); - float throughput_per_sched = (float)throughput_per_SM / WARP_SCHEDS_PER_SM; - - unsigned init = WARP_SIZE / throughput_per_sched; - - std::cout << "\n//Accel_Sim config: \n"; - std::cout << "-gpgpu_num_sfu_units " << WARP_SCHEDS_PER_SM << std::endl; - std::cout << "-ptx_opcode_latency_sfu " << lat << std::endl; - std::cout << 
"-ptx_opcode_initiation_sfu " << init << std::endl; - std::cout << "-trace_opcode_latency_initiation_sfu " << lat << "," << init - << std::endl; - } - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_tensor/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/config_tensor/Makefile deleted file mode 100644 index d29e58ec8..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/config_tensor/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -GENCODE_SM50 := -GENCODE_SM61 := -GENCODE_SM30 := -GENCODE_SM35 := -GENCODE_SM60 := -GENCODE_SM62 := - -SRC = config_tensor.cu - -EXE = config_tensor - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_tensor/config_tensor.cu b/util/tuner/GPU_Microbenchmark/ubench/core/config_tensor/config_tensor.cu deleted file mode 100644 index 647221df5..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/config_tensor/config_tensor.cu +++ /dev/null @@ -1,44 +0,0 @@ -#include "../tensor_bw_half/tensor_bw_half.h" -#include "../tensor_lat_half/tensor_lat_half.h" - -int main() { - intilizeDeviceProp(0); - - // measure the flops and lat based on half operand and float accumlate - float flops = tensor_max_flops(); - float latency = tensor_lat(); - - if (ACCEL_SIM_MODE) { - unsigned lat = (unsigned)latency; - unsigned throughput_per_SM = round_up_2n(flops); - float throughput_per_sched = (float)throughput_per_SM / WARP_SCHEDS_PER_SM; - - unsigned init = WARP_SIZE / throughput_per_sched; - - std::cout << "\n//Accel_Sim config: \n"; - if (deviceProp.major < 6) { // tensor core was added since Volta - std::cout << "-gpgpu_tensor_core_avail 0" << std::endl; - std::cout << "-gpgpu_num_tensor_core_units 0" << std::endl; - } else { - std::cout << "-gpgpu_tensor_core_avail 1" << std::endl; - std::cout << "-gpgpu_num_tensor_core_units " << WARP_SCHEDS_PER_SM - << std::endl; - std::cout << "-ptx_opcode_latency_tesnor " << lat << std::endl; - std::cout << "-ptx_opcode_initiation_tensor 
" << init << std::endl; - - // trace mode - // assume tesnor is on spec unit 3 - std::cout << "-trace_opcode_latency_initiation_tensor " - << lat / SASS_hmma_per_PTX_wmma << "," - << init / SASS_hmma_per_PTX_wmma << std::endl; - std::cout << "-specialized_unit_3 1," << WARP_SCHEDS_PER_SM << "," - << lat / SASS_hmma_per_PTX_wmma << "," << WARP_SCHEDS_PER_SM - << "," << WARP_SCHEDS_PER_SM << ",TENSOR" << std::endl; - std::cout << "-trace_opcode_latency_initiation_spec_op_3 " - << lat / SASS_hmma_per_PTX_wmma << "," - << init / SASS_hmma_per_PTX_wmma << std::endl; - } - } - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_udp/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/config_udp/Makefile deleted file mode 100644 index 484be447d..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/config_udp/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = config_udp.cu - -EXE = config_udp - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/config_udp/config_udp.cu b/util/tuner/GPU_Microbenchmark/ubench/core/config_udp/config_udp.cu deleted file mode 100644 index 68b7a8162..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/config_udp/config_udp.cu +++ /dev/null @@ -1,29 +0,0 @@ -#include -using namespace std; - -#include "../../../hw_def/hw_def.h" - -int main() { - intilizeDeviceProp(0); - - if (ACCEL_SIM_MODE) { - - /* we cannot meaure uniform instrcution for now as they only exist at - SASS level not at PTX nor CUDA level, so assume constant latency and BW - for now - - dedicated uniform unit was added since Turing SM 7.0 - */ - if (deviceProp.major >= 7) { - // assume UDP unit is on spec unit 4 - std::cout << "-specialized_unit_4 1," << WARP_SCHEDS_PER_SM << ",4," - << WARP_SCHEDS_PER_SM << "," << WARP_SCHEDS_PER_SM << ",UDP" - << std::endl; - - std::cout << "-trace_opcode_latency_initiation_spec_op_4 4,1" - << std::endl; - } - } - - return 1; -} diff --git 
a/util/tuner/GPU_Microbenchmark/ubench/core/core_config/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/core_config/Makefile deleted file mode 100644 index 8bf0f8857..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/core_config/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = core_config.cu - -EXE = core_config - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/core_config/core_config.cu b/util/tuner/GPU_Microbenchmark/ubench/core/core_config/core_config.cu deleted file mode 100644 index ea0eb047b..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/core_config/core_config.cu +++ /dev/null @@ -1,84 +0,0 @@ -#include -using namespace std; - -#include "../../../hw_def/hw_def.h" - -int main() { - intilizeDeviceProp(0); - - printf("CUDA version number = %d.%d\n", deviceProp.major, deviceProp.minor); - - if (ACCEL_SIM_MODE) { - std::cout << "\n//Accel_Sim config: \n"; - - std::cout << "-gpgpu_ptx_force_max_capability " << deviceProp.major - << deviceProp.minor << std::endl; - - std::cout << "-gpgpu_shader_registers " << deviceProp.regsPerMultiprocessor - << std::endl; - std::cout << "-gpgpu_registers_per_block " << deviceProp.regsPerBlock - << std::endl; - std::cout << "-gpgpu_occupancy_sm_number " << deviceProp.major - << deviceProp.minor << std::endl; - std::cout << "-gpgpu_coalesce_arch " << deviceProp.major << deviceProp.minor - << std::endl; - - unsigned ID_OC_SP, ID_OC_DP, ID_OC_INT, ID_OC_SFU, ID_OC_MEM, OC_EX_SP, - OC_EX_DP, OC_EX_INT, OC_EX_SFU, OC_EX_MEM, EX_WB, ID_OC_TENSOR_CORE, - OC_EX_TENSOR_CORE; - ID_OC_SFU = OC_EX_SFU = WARP_SCHEDS_PER_SM; - ID_OC_MEM = OC_EX_MEM = WARP_SCHEDS_PER_SM; - ID_OC_SP = OC_EX_SP = WARP_SCHEDS_PER_SM; - ID_OC_DP = OC_EX_DP = WARP_SCHEDS_PER_SM; - EX_WB = WARP_SCHEDS_PER_SM * 2; - if (deviceProp.major < 6) { // no integer or tensor cores before volta - ID_OC_INT = OC_EX_INT = 0; - ID_OC_TENSOR_CORE = OC_EX_TENSOR_CORE = 0; - } else { - ID_OC_INT = OC_EX_INT = 
WARP_SCHEDS_PER_SM; - ID_OC_TENSOR_CORE = OC_EX_TENSOR_CORE = WARP_SCHEDS_PER_SM; - } - - //# - // ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE - std::cout << "-gpgpu_pipeline_widths " << ID_OC_SP << "," << ID_OC_DP << "," - << ID_OC_INT << "," << ID_OC_SFU << "," << ID_OC_MEM << "," - << OC_EX_SP << "," << OC_EX_DP << "," << OC_EX_INT << "," - << OC_EX_SFU << "," << OC_EX_MEM << "," << EX_WB; - if (deviceProp.major < 6) - std::cout << std::endl; - else - std::cout << "," << ID_OC_TENSOR_CORE << "," << OC_EX_TENSOR_CORE - << std::endl; - - std::cout << "-gpgpu_sub_core_model " << CORE_MODEL << std::endl; - - std::cout << "-gpgpu_enable_specialized_operand_collector 0" << std::endl; - std::cout << "-gpgpu_operand_collector_num_units_gen " - << WARP_SCHEDS_PER_SM * 2 << std::endl; - std::cout << "-gpgpu_operand_collector_num_in_ports_gen " - << WARP_SCHEDS_PER_SM * 2 << std::endl; - std::cout << "-gpgpu_operand_collector_num_out_ports_gen " - << WARP_SCHEDS_PER_SM * 2 << std::endl; - - std::cout << "-gpgpu_num_sched_per_core " << WARP_SCHEDS_PER_SM - << std::endl; - - std::cout << "-gpgpu_max_insn_issue_per_warp " << ISSUE_MODEL << std::endl; - std::cout << "-gpgpu_dual_issue_diff_exec_units " << (deviceProp.major > 3) - << std::endl; - - std::cout << "-gpgpu_inst_fetch_throughput " << WARP_SCHEDS_PER_SM - << std::endl; - - std::cout << "-gpgpu_shader_core_pipeline " - << deviceProp.maxThreadsPerMultiProcessor << ":" - << deviceProp.warpSize << std::endl; - std::cout << "-gpgpu_shader_cta " - << round_up_2n((unsigned)deviceProp.maxThreadsPerMultiProcessor / - deviceProp.warpSize / 2) - << std::endl; - } - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_double/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/lat_double/Makefile deleted file mode 100644 index 730b2803e..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_double/Makefile 
+++ /dev/null @@ -1,5 +0,0 @@ -SRC = lat_double.cu - -EXE = lat_double - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_double/lat_double.cu b/util/tuner/GPU_Microbenchmark/ubench/core/lat_double/lat_double.cu deleted file mode 100644 index 48f8f31c5..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_double/lat_double.cu +++ /dev/null @@ -1,10 +0,0 @@ -#include "lat_double.h" - -int main() { - - intilizeDeviceProp(0); - - dpu_latency(); - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_double/lat_double.h b/util/tuner/GPU_Microbenchmark/ubench/core/lat_double/lat_double.h deleted file mode 100644 index 7a0c29ed1..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_double/lat_double.h +++ /dev/null @@ -1,101 +0,0 @@ -#ifndef LAT_DOUBLE_DEF_H -#define LAT_DOUBLE_DEF_H - -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_TIMES 4096 - -template -__global__ void dpu_latency(uint32_t *startClk, uint32_t *stopClk, T *data1, - T *data2, T *res) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - register T s1 = data1[gid]; - register T s2 = data2[gid]; - register T result = 0; - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint32_t start = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - - for (int j = 0; j < REPEAT_TIMES; ++j) { - asm volatile("{\t\n" - "fma.rn.f64 %0, %1, %2 , %0;\n\t" - "fma.rn.f64 %0, %1, %2 , %0;\n\t" - "fma.rn.f64 %0, %1, %2 , %0;\n\t" - "fma.rn.f64 %0, %1, %2 , %0;\n\t" - "}" - : "+d"(result), "+d"(s1), "+d"(s2)); - } - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - uint32_t stop = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // write time and data back to memory - startClk[gid] = start; - stopClk[gid] = stop; - res[gid] = result; -} - -float dpu_latency() { - THREADS_PER_BLOCK = 1; - THREADS_PER_SM = 1; - 
BLOCKS_NUM = 1; - TOTAL_THREADS = 1; - - uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - double *data1 = (double *)malloc(TOTAL_THREADS * sizeof(double)); - double *data2 = (double *)malloc(TOTAL_THREADS * sizeof(double)); - double *res = (double *)malloc(TOTAL_THREADS * sizeof(double)); - - uint32_t *startClk_g; - uint32_t *stopClk_g; - double *data1_g; - double *data2_g; - double *res_g; - - for (uint32_t i = 0; i < TOTAL_THREADS; i++) { - data1[i] = (double)i; - data2[i] = (double)i; - } - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(double))); - gpuErrchk(cudaMalloc(&data2_g, TOTAL_THREADS * sizeof(double))); - gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(double))); - - gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(double), - cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(data2_g, data2, TOTAL_THREADS * sizeof(double), - cudaMemcpyHostToDevice)); - - dpu_latency<<<1, 1>>>(startClk_g, stopClk_g, data1_g, data2_g, res_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(double), - cudaMemcpyDeviceToHost)); - - float latency; - latency = ((float)(stopClk[0] - startClk[0])) / ((float)(REPEAT_TIMES * 4)); - printf("double-precision DPU latency = %f (clk)\n", latency); - printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); - - return latency; -} - -#endif diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_float/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/lat_float/Makefile deleted file mode 100644 index 
8fffb3eb8..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_float/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = lat_float.cu - -EXE = lat_float - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_float/lat_float.cu b/util/tuner/GPU_Microbenchmark/ubench/core/lat_float/lat_float.cu deleted file mode 100644 index c1b76a79d..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_float/lat_float.cu +++ /dev/null @@ -1,10 +0,0 @@ -#include "lat_float.h" - -int main() { - - intilizeDeviceProp(0); - - fpu_latency(); - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_float/lat_float.h b/util/tuner/GPU_Microbenchmark/ubench/core/lat_float/lat_float.h deleted file mode 100644 index d19926244..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_float/lat_float.h +++ /dev/null @@ -1,104 +0,0 @@ -#ifndef LAT_FLOAT_DEF_H -#define LAT_FLOAT_DEF_H - -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_TIMES 4096 - -template -__global__ void fpu_latency(uint32_t *startClk, uint32_t *stopClk, T *data1, - T *data2, T *res) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - register T s1 = data1[gid]; - register T s2 = data2[gid]; - register T result = 0; - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint32_t start = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - - for (int j = 0; j < REPEAT_TIMES; ++j) { - asm volatile("{\t\n" - "fma.rn.f32 %0, %1, %2 , %0;\n\t" - "fma.rn.f32 %0, %1, %2 , %0;\n\t" - "fma.rn.f32 %0, %1, %2 , %0;\n\t" - "fma.rn.f32 %0, %1, %2 , %0;\n\t" - "}" - : "+f"(result), "+f"(s1), "+f"(s2)); - } - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - uint32_t stop = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // write time and data back to memory - startClk[gid] = start; - stopClk[gid] = stop; - res[gid] = result; -} - 
-float fpu_latency() { - intilizeDeviceProp(0); - - THREADS_PER_BLOCK = 1; - THREADS_PER_SM = 1; - BLOCKS_NUM = 1; - TOTAL_THREADS = 1; - - uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - float *data1 = (float *)malloc(TOTAL_THREADS * sizeof(float)); - float *data2 = (float *)malloc(TOTAL_THREADS * sizeof(float)); - float *res = (float *)malloc(TOTAL_THREADS * sizeof(float)); - - uint32_t *startClk_g; - uint32_t *stopClk_g; - float *data1_g; - float *data2_g; - float *res_g; - - for (uint32_t i = 0; i < TOTAL_THREADS; i++) { - data1[i] = (float)i; - data2[i] = (float)i; - } - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(float))); - gpuErrchk(cudaMalloc(&data2_g, TOTAL_THREADS * sizeof(float))); - gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(float))); - - gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(float), - cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(data2_g, data2, TOTAL_THREADS * sizeof(float), - cudaMemcpyHostToDevice)); - - fpu_latency<<>>( - startClk_g, stopClk_g, data1_g, data2_g, res_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(float), - cudaMemcpyDeviceToHost)); - - float latency; - latency = ((float)(stopClk[0] - startClk[0])) / ((float)(REPEAT_TIMES * 4)); - printf("float-precision FPU latency = %f (clk)\n", latency); - printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); - - return latency; -} - -#endif diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_half/Makefile 
b/util/tuner/GPU_Microbenchmark/ubench/core/lat_half/Makefile deleted file mode 100644 index c8f97daed..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_half/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -GENCODE_SM30 := -GENCODE_SM35 := -GENCODE_SM50 := - -SRC = lat_half.cu - -EXE = lat_half - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_half/lat_half.cu b/util/tuner/GPU_Microbenchmark/ubench/core/lat_half/lat_half.cu deleted file mode 100644 index 36184a04e..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_half/lat_half.cu +++ /dev/null @@ -1,10 +0,0 @@ -#include "lat_half.h" - -int main() { - - intilizeDeviceProp(0); - - fpu16_latency(); - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_half/lat_half.h b/util/tuner/GPU_Microbenchmark/ubench/core/lat_half/lat_half.h deleted file mode 100644 index 8c1a50549..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_half/lat_half.h +++ /dev/null @@ -1,102 +0,0 @@ -#ifndef LAT_FP16_DEF_H -#define LAT_FP16_DEF_H - -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_TIMES 4096 - -__global__ void fpu16_latency(uint32_t *startClk, uint32_t *stopClk, - half *data1, half *data2, half *data3, - half *data4, half *res) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - half s2 = data2[gid]; - half s4 = data4[gid]; - half2 mult = __halves2half2(s2, s4); - half result1 = data1[gid]; - half result2 = data3[gid]; - half2 result = __halves2half2(result1, result2); - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint32_t start = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - - for (int j = 0; j < REPEAT_TIMES; ++j) { - result = result * mult + result; - } - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - uint32_t stop = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // write 
time and data back to memory - startClk[gid] = start; - stopClk[gid] = stop; - res[gid] = __high2half(result) + __low2half(result); -} - -float fpu16_latency() { - intilizeDeviceProp(0); - - THREADS_PER_BLOCK = 1; - THREADS_PER_SM = 1; - BLOCKS_NUM = 1; - TOTAL_THREADS = 1; - - uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - half *data1 = (half *)malloc(TOTAL_THREADS * sizeof(half)); - half *data2 = (half *)malloc(TOTAL_THREADS * sizeof(half)); - half *res = (half *)malloc(TOTAL_THREADS * sizeof(half)); - - uint32_t *startClk_g; - uint32_t *stopClk_g; - half *data1_g; - half *data2_g; - half *res_g; - - for (uint32_t i = 0; i < TOTAL_THREADS; i++) { - data1[i] = (half)i; - data2[i] = (half)i; - } - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(half))); - gpuErrchk(cudaMalloc(&data2_g, TOTAL_THREADS * sizeof(half))); - gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(half))); - - gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(half), - cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(data2_g, data2, TOTAL_THREADS * sizeof(half), - cudaMemcpyHostToDevice)); - - fpu16_latency<<>>( - startClk_g, stopClk_g, data1_g, data2_g, data1_g, data2_g, res_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(half), - cudaMemcpyDeviceToHost)); - - float latency; - latency = ((float)(stopClk[0] - startClk[0])) / ((float)(REPEAT_TIMES)); - printf("fpu16 latency = %f (clk)\n", latency); - printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); - - return 
latency; -} - -#endif diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_int32/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/lat_int32/Makefile deleted file mode 100644 index 311af1ffe..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_int32/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = lat_int32.cu - -EXE = lat_int32 - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_int32/lat_int32.cu b/util/tuner/GPU_Microbenchmark/ubench/core/lat_int32/lat_int32.cu deleted file mode 100644 index bd926104c..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_int32/lat_int32.cu +++ /dev/null @@ -1,10 +0,0 @@ -#include "lat_int32.h" - -int main() { - - intilizeDeviceProp(0); - - int32_latency(); - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/lat_int32/lat_int32.h b/util/tuner/GPU_Microbenchmark/ubench/core/lat_int32/lat_int32.h deleted file mode 100644 index cacd3a86a..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/lat_int32/lat_int32.h +++ /dev/null @@ -1,103 +0,0 @@ -#ifndef LAT_INT32_DEF_H -#define LAT_INT32_DEF_H - -#include -#include -#include - -#include "../../../hw_def/hw_def.h" -#define REPEAT_TIMES 1024 - -template -__global__ void int32_latency(uint32_t *startClk, uint32_t *stopClk, T *data1, - T *data2, T *res) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - register T s1 = data1[gid]; - register T s2 = data2[gid]; - register T result = 0; - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint32_t start = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - - for (int j = 0; j < REPEAT_TIMES; ++j) { - asm volatile("{\t\n" - "mad.lo.s32 %0, %1, %2 , %0;\n\t" - "mad.lo.s32 %0, %1, %2 , %0;\n\t" - "mad.lo.s32 %0, %1, %2 , %0;\n\t" - "mad.lo.s32 %0, %1, %2 , %0;\n\t" - "}" - : "+r"(result), "+r"(s1), "+r"(s2)); - } - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - 
uint32_t stop = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // write time and data back to memory - startClk[gid] = start; - stopClk[gid] = stop; - res[gid] = result; -} - -float int32_latency() { - intilizeDeviceProp(0); - - THREADS_PER_BLOCK = 1; - THREADS_PER_SM = 1; - BLOCKS_NUM = 1; - TOTAL_THREADS = 1; - - uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - int32_t *data1 = (int32_t *)malloc(TOTAL_THREADS * sizeof(int32_t)); - int32_t *data2 = (int32_t *)malloc(TOTAL_THREADS * sizeof(int32_t)); - int32_t *res = (int32_t *)malloc(TOTAL_THREADS * sizeof(int32_t)); - - uint32_t *startClk_g; - uint32_t *stopClk_g; - int32_t *data1_g; - int32_t *data2_g; - int32_t *res_g; - - for (uint32_t i = 0; i < TOTAL_THREADS; i++) { - data1[i] = (int32_t)i; - data2[i] = (int32_t)i; - } - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(int32_t))); - gpuErrchk(cudaMalloc(&data2_g, TOTAL_THREADS * sizeof(int32_t))); - gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(int32_t))); - - gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(int32_t), - cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(data2_g, data2, TOTAL_THREADS * sizeof(int32_t), - cudaMemcpyHostToDevice)); - - int32_latency<<>>( - startClk_g, stopClk_g, data1_g, data2_g, res_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(int32_t), - cudaMemcpyDeviceToHost)); - - float latency; - latency = ((float)(stopClk[0] - startClk[0])) / ((float)(REPEAT_TIMES * 4)); - printf("int32 
latency = %f (clk)\n", latency); - printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); - - return latency; -} - -#endif diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/regfile_bw/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/regfile_bw/Makefile deleted file mode 100644 index 755f2266d..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/regfile_bw/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -GENCODE_SM50 := -GENCODE_SM61 := -GENCODE_SM30 := -GENCODE_SM35 := -GENCODE_SM60 := -GENCODE_SM62 := - -SRC = regfile_bw.cu - -EXE = regfile_bw - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/regfile_bw/regfile_bw.cu b/util/tuner/GPU_Microbenchmark/ubench/core/regfile_bw/regfile_bw.cu deleted file mode 100644 index 723e01f4b..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/regfile_bw/regfile_bw.cu +++ /dev/null @@ -1,59 +0,0 @@ -#include "../MaxFlops_float/MaxFlops_float.h" -#include "../tensor_bw_half/tensor_bw_half.h" - -int main() { - intilizeDeviceProp(0); - - unsigned regfile_bw; - /* we measure the reg file BW based on the most demanding data instruction, - i.e. tensor cores. 
See slide 20 from Nvidia for more details at - https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf - */ - if (deviceProp.major >= 6) { // tesnor core unit was added since Volta - float fma_bw = tensor_max_flops(true); - - unsigned tensor_MACs_per_SM = round_up_2n(fma_bw); - - /* - two operands needs per MAC each cycle (A, B), C will be saved at the tensor - core accuamlte register - */ - regfile_bw = tensor_MACs_per_SM * sizeof(half) * 2; - } else { - // if less than volta calculate based on FP32 FMA - float flops = fpu_max_flops(); - - // divide by 2 as we need FMA throughput - unsigned FMA_throughput_per_SM = round_up_2n(flops / 2); - - // three operands needs per FMA each cycle (A, B, C) - regfile_bw = round_up_2n((float)FMA_throughput_per_SM * sizeof(float) * 3); - } - std::cout << "\nregfile_bw = " << regfile_bw << " (byte/SM)" << std::endl; - - if (ACCEL_SIM_MODE) { - - unsigned reg_ports; - // Nvidia starts to have dual port register file since volta - if (deviceProp.major < 6) - reg_ports = 1; - else - reg_ports = 2; - - // WARP_SIZE*4 bytes, as registers are 32-bit width - unsigned banks_num = regfile_bw / (WARP_SIZE * 4) / reg_ports; - - /* we multiply by two as accel-sim does not model register file cache (added - since kepler) so to mitigate, the reg file bw comes from RFC, we - conservatively multiply the banks by 2 (to fix) - */ - if (deviceProp.major > 3) - banks_num = banks_num * 2; - - std::cout << "\n//Accel_Sim config: \n"; - std::cout << "-gpgpu_num_reg_banks " << banks_num << std::endl; - std::cout << "-gpgpu_reg_file_port_throughput " << reg_ports << std::endl; - } - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/Makefile deleted file mode 100644 index d5aa2f3f0..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/Makefile +++ /dev/null @@ -1,5 +0,0 
@@ -SRC = sfu_bw_fsqrt.cu - -EXE = sfu_bw_fsqrt - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt.cu b/util/tuner/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt.cu deleted file mode 100644 index 023799f04..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt.cu +++ /dev/null @@ -1,10 +0,0 @@ -#include "sfu_bw_fsqrt.h" - -int main() { - - intilizeDeviceProp(0); - - sfu_max_flops(); - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt.h b/util/tuner/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt.h deleted file mode 100644 index 922c72dff..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt.h +++ /dev/null @@ -1,97 +0,0 @@ -#ifndef MAXFLOPS_SFU_DEF_H -#define MAXFLOPS_SFU_DEF_H - -#include -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_TIMES 1024 - -__global__ void max_flops(uint64_t *startClk, uint64_t *stopClk, float *data1, - float *res) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - register float s1 = data1[gid]; - register float result = s1; - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint64_t start = 0; - asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); - - for (int j = 0; j < REPEAT_TIMES; ++j) { - asm volatile("{\t\n" - "sqrt.approx.ftz.f32 %0, %0;\n\t" - "sqrt.approx.ftz.f32 %0, %0;\n\t" - "sqrt.approx.ftz.f32 %0, %0;\n\t" - "sqrt.approx.ftz.f32 %0, %0;\n\t" - "}" - : "+f"(result)); - } - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - uint64_t stop = 0; - asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); - - // write time and data back to memory - startClk[gid] = start; - stopClk[gid] = stop; - res[gid] = result; -} - -float sfu_max_flops() { - intilizeDeviceProp(0); - - BLOCKS_NUM = 1; - TOTAL_THREADS = 
THREADS_PER_BLOCK * BLOCKS_NUM; - - uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - float *data1 = (float *)malloc(TOTAL_THREADS * sizeof(float)); - float *res = (float *)malloc(TOTAL_THREADS * sizeof(float)); - - uint64_t *startClk_g; - uint64_t *stopClk_g; - float *data1_g; - float *res_g; - - for (uint32_t i = 0; i < TOTAL_THREADS; i++) { - data1[i] = 987654321.789456 + (float)i; - } - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(float))); - gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(float))); - - gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(float), - cudaMemcpyHostToDevice)); - - max_flops<<>>(startClk_g, stopClk_g, data1_g, - res_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(float), - cudaMemcpyDeviceToHost)); - - float flops; - flops = (float)(REPEAT_TIMES * TOTAL_THREADS * 4) / - ((float)(stopClk[0] - startClk[0])); - std::cout << "SFU fast sqrt bw = " << flops << "(flops/clk/SM) \n"; - std::cout << "Total Clk number = " << (stopClk[0] - startClk[0]) << "\n"; - - return flops; -} - -#endif diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/Makefile deleted file mode 100644 index eff045c24..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = sfu_lat_fsqrt.cu - -EXE = sfu_lat_fsqrt - -include ../../../common/common.mk diff --git 
a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/sfu_lat_fsqrt.cu b/util/tuner/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/sfu_lat_fsqrt.cu deleted file mode 100644 index a9fed49ec..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/sfu_lat_fsqrt.cu +++ /dev/null @@ -1,10 +0,0 @@ -#include "sfu_lat_fsqrt.h" - -int main() { - - intilizeDeviceProp(0); - - sfu_latency(); - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/sfu_lat_fsqrt.h b/util/tuner/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/sfu_lat_fsqrt.h deleted file mode 100644 index df004f3f2..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/sfu_lat_fsqrt.h +++ /dev/null @@ -1,98 +0,0 @@ -#ifndef LAT_SFU_DEF_H -#define LAT_SFU_DEF_H - -#include -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_TIMES 1024 - -__global__ void sfu_latency(uint64_t *startClk, uint64_t *stopClk, float *data1, - float *res) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - register float s1 = data1[gid]; - register float result = s1; - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint64_t start = 0; - asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); - - for (int j = 0; j < REPEAT_TIMES; ++j) { - asm volatile("{\t\n" - "sin.approx.ftz.f32 %0, %0;\n\t" - "sin.approx.ftz.f32 %0, %0;\n\t" - "sin.approx.ftz.f32 %0, %0;\n\t" - "sin.approx.ftz.f32 %0, %0;\n\t" - "}" - : "+f"(result)); - } - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - uint64_t stop = 0; - asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); - - // write time and data back to memory - startClk[gid] = start; - stopClk[gid] = stop; - res[gid] = result; -} - -float sfu_latency() { - intilizeDeviceProp(0); - - THREADS_PER_BLOCK = 1; - THREADS_PER_SM = 1; - BLOCKS_NUM = 1; - TOTAL_THREADS = 1; - - uint64_t *startClk = (uint64_t 
*)malloc(TOTAL_THREADS * sizeof(uint64_t)); - uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - float *data1 = (float *)malloc(TOTAL_THREADS * sizeof(float)); - float *res = (float *)malloc(TOTAL_THREADS * sizeof(float)); - - uint64_t *startClk_g; - uint64_t *stopClk_g; - float *data1_g; - float *res_g; - - for (uint32_t i = 0; i < TOTAL_THREADS; i++) { - data1[i] = 10.124234521; - } - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&data1_g, TOTAL_THREADS * sizeof(float))); - gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(float))); - - gpuErrchk(cudaMemcpy(data1_g, data1, TOTAL_THREADS * sizeof(float), - cudaMemcpyHostToDevice)); - - sfu_latency<<>>(startClk_g, stopClk_g, data1_g, - res_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(res, res_g, TOTAL_THREADS * sizeof(float), - cudaMemcpyDeviceToHost)); - - float latency; - latency = ((float)(stopClk[0] - startClk[0])) / ((float)(REPEAT_TIMES * 4)); - std::cout << "SFU fast sqrt latency = " << latency << "(clk) \n"; - std::cout << "Total Clk number = " << (stopClk[0] - startClk[0]) << "\n"; - - return latency; -} - -#endif diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/Makefile deleted file mode 100644 index c55e26c36..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -GENCODE_SM50 := -GENCODE_SM61 := -GENCODE_SM30 := -GENCODE_SM35 := -GENCODE_SM60 := -GENCODE_SM62 := - -SRC = tensor_bw_half.cu - -EXE = tensor_bw_half - -include ../../../common/common.mk diff --git 
a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/sass.txt b/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/sass.txt deleted file mode 100644 index 53977dfec..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/sass.txt +++ /dev/null @@ -1,632 +0,0 @@ - -Fatbin elf code: -================ -arch = sm_70 -code version = [1,7] -producer = -host = linux -compile_size = 64bit - - code for sm_70 - -Fatbin elf code: -================ -arch = sm_75 -code version = [1,7] -producer = -host = linux -compile_size = 64bit - - code for sm_75 - -Fatbin elf code: -================ -arch = sm_80 -code version = [1,7] -producer = -host = linux -compile_size = 64bit - - code for sm_80 - -Fatbin elf code: -================ -arch = sm_86 -code version = [1,7] -producer = -host = linux -compile_size = 64bit - - code for sm_86 - -Fatbin elf code: -================ -arch = sm_70 -code version = [1,7] -producer = -host = linux -compile_size = 64bit - - code for sm_70 - Function : _Z9max_flopsI6__halfEvPmS1_PT_S3_S3_j - .headerflags @"EF_CUDA_SM70 EF_CUDA_PTX_SM(EF_CUDA_SM70)" - /*0000*/ IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ; /* 0x00000a00ff017624 */ - /* 0x000fd000078e00ff */ - /*0010*/ @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ; /* 0x000000fffffff389 */ - /* 0x000fe200000e00ff */ - /*0020*/ S2R R5, SR_LANEID ; /* 0x0000000000057919 */ - /* 0x000e220000000000 */ - /*0030*/ MOV R26, 0x2 ; /* 0x00000002001a7802 */ - /* 0x000fe20000000f00 */ - /*0040*/ IMAD.MOV.U32 R22, RZ, RZ, 0x10 ; /* 0x00000010ff167424 */ - /* 0x000fe400078e00ff */ - /*0050*/ S2R R0, SR_TID.X ; /* 0x0000000000007919 */ - /* 0x000e680000002100 */ - /*0060*/ S2R R3, SR_CTAID.X ; /* 0x0000000000037919 */ - /* 0x000e620000002500 */ - /*0070*/ SHF.R.U32.HI R2, RZ, 0x2, R5.reuse ; /* 0x00000002ff027819 */ - /* 0x101fe40000011605 */ - /*0080*/ SHF.R.U32.HI R4, RZ, 0x4, R5 ; /* 0x00000004ff047819 */ - /* 0x000fc40000011605 */ - /*0090*/ LOP3.LUT R2, R2, 0x3, RZ, 0xc0, !PT ; /* 0x0000000302027812 */ 
- /* 0x000fe400078ec0ff */ - /*00a0*/ LOP3.LUT R6, R4, 0x1, RZ, 0xc0, !PT ; /* 0x0000000104067812 */ - /* 0x000fe200078ec0ff */ - /*00b0*/ IMAD R0, R3, c[0x0][0x0], R0 ; /* 0x0000000003007a24 */ - /* 0x002fe200078e0200 */ - /*00c0*/ LOP3.LUT R3, R5, 0x3, RZ, 0xc0, !PT ; /* 0x0000000305037812 */ - /* 0x000fe200078ec0ff */ - /*00d0*/ IMAD.SHL.U32 R8, R2, 0x8, RZ ; /* 0x0000000802087824 */ - /* 0x000fe200078e00ff */ - /*00e0*/ SHF.R.U32.HI R2, RZ, 0x1, R2 ; /* 0x00000001ff027819 */ - /* 0x000fe40000011602 */ - /*00f0*/ SHF.L.U32 R27, R0, 0xc, RZ ; /* 0x0000000c001b7819 */ - /* 0x000fe400000006ff */ - /*0100*/ LOP3.LUT R5, R8, 0x8, R3.reuse, 0xe2, !PT ; /* 0x0000000808057812 */ - /* 0x100fe200078ee203 */ - /*0110*/ IMAD R7, R2, 0x8, R3 ; /* 0x0000000802077824 */ - /* 0x000fc600078e0203 */ - /*0120*/ LEA R3, R6.reuse, R5, 0x2 ; /* 0x0000000506037211 */ - /* 0x040fe200078e10ff */ - /*0130*/ IMAD R7, R6, 0x4, R7 ; /* 0x0000000406077824 */ - /* 0x000fe400078e0207 */ - /*0140*/ IMAD.WIDE.U32 R4, R27, R26, c[0x0][0x170] ; /* 0x00005c001b047625 */ - /* 0x000fe200078e001a */ - /*0150*/ SHF.L.U32 R21, R3, 0x1, RZ ; /* 0x0000000103157819 */ - /* 0x000fe400000006ff */ - /*0160*/ SHF.L.U32 R7, R7, 0x1, RZ ; /* 0x0000000107077819 */ - /* 0x000fca00000006ff */ - /*0170*/ IMAD.WIDE.U32 R20, R21, 0x10, R4 ; /* 0x0000001015147825 */ - /* 0x000fc800078e0004 */ - /*0180*/ IMAD.WIDE.U32 R22, R7, R22, c[0x0][0x178] ; /* 0x00005e0007167625 */ - /* 0x000fc800078e0016 */ - /*0190*/ LDG.E.128.SYS R12, [R20] ; /* 0x00000000140c7381 */ - /* 0x00012800001eed00 */ - /*01a0*/ LDG.E.128.SYS R4, [R20+0x10] ; /* 0x0000100014047381 */ - /* 0x00012800001eed00 */ - /*01b0*/ LDG.E.128.SYS R16, [R22] ; /* 0x0000000016107381 */ - /* 0x00012800001eed00 */ - /*01c0*/ LDG.E.128.SYS R8, [R22+0x10] ; /* 0x0000100016087381 */ - /* 0x00012200001eed00 */ - /*01d0*/ IMAD.WIDE.U32 R26, R27, R26, c[0x0][0x180] ; /* 0x000060001b1a7625 */ - /* 0x000fc600078e001a */ - /*01e0*/ NOP ; /* 0x0000000000007918 */ - /* 
0x000fe20000000000 */ - /*01f0*/ BAR.SYNC 0x0 ; /* 0x0000000000007b1d */ - /* 0x000fea0000000000 */ - /*0200*/ CS2R R24, SR_CLOCKLO ; /* 0x0000000000187805 */ - /* 0x000fd00000015000 */ - /*0210*/ NOP ; /* 0x0000000000007918 */ - /* 0x000fe20000000000 */ - /*0220*/ BAR.SYNC 0x0 ; /* 0x0000000000007b1d */ - /* 0x000fea0000000000 */ - /*0230*/ CS2R R20, SRZ ; /* 0x0000000000147805 */ - /* 0x001fe2000001ff00 */ - /*0240*/ CS2R R22, SRZ ; /* 0x0000000000167805 */ - /* 0x000fca000001ff00 */ - /*0250*/ HMMA.884.F16.F16.STEP0 R20, R12.reuse.ROW, R16.reuse.COL, R20 ; /* 0x000000100c147236 */ - /* 0x0d0fe80000000414 */ - /*0260*/ HMMA.884.F16.F16.STEP1 R22, R12.ROW, R16.COL, R22 ; /* 0x000000100c167236 */ - /* 0x000f680000008416 */ - /*0270*/ HMMA.884.F16.F16.STEP0 R20, R14.reuse.ROW, R18.reuse.COL, R20 ; /* 0x000000120e147236 */ - /* 0x0e0fe80000000414 */ - /*0280*/ HMMA.884.F16.F16.STEP1 R22, R14.ROW, R18.COL, R22 ; /* 0x000000120e167236 */ - /* 0x000f680000008416 */ - /*0290*/ HMMA.884.F16.F16.STEP0 R20, R4.reuse.ROW, R8.reuse.COL, R20 ; /* 0x0000000804147236 */ - /* 0x0e0fe80000000414 */ - /*02a0*/ HMMA.884.F16.F16.STEP1 R22, R4.ROW, R8.COL, R22 ; /* 0x0000000804167236 */ - /* 0x000f680000008416 */ - /*02b0*/ HMMA.884.F16.F16.STEP0 R20, R6.reuse.ROW, R10.reuse.COL, R20 ; /* 0x0000000a06147236 */ - /* 0x0e0b680000000414 */ - /*02c0*/ HMMA.884.F16.F16.STEP1 R22, R6.ROW, R10.COL, R22 ; /* 0x0000000a06167236 */ - /* 0x000b5a0000008416 */ - /*02d0*/ CS2R R8, SR_CLOCKLO ; /* 0x0000000000087805 */ - /* 0x000fd00000015000 */ - /*02e0*/ MOV R5, RZ ; /* 0x000000ff00057202 */ - /* 0x000fe20000000f00 */ - /*02f0*/ IMAD.SHL.U32 R4, R2, 0x8, RZ ; /* 0x0000000802047824 */ - /* 0x000fc400078e00ff */ - /*0300*/ IMAD.MOV.U32 R7, RZ, RZ, 0x8 ; /* 0x00000008ff077424 */ - /* 0x020fe400078e00ff */ - /*0310*/ IMAD.WIDE.U32 R4, R3, 0x10, R4 ; /* 0x0000001003047825 */ - /* 0x000fc800078e0004 */ - /*0320*/ IMAD.WIDE.U32 R2, R0, R7, c[0x0][0x160] ; /* 0x0000580000027625 */ - /* 0x000fc600078e0007 
*/ - /*0330*/ LEA R10, P0, R4, R26, 0x1 ; /* 0x0000001a040a7211 */ - /* 0x000fe200078008ff */ - /*0340*/ IMAD.WIDE.U32 R6, R0, R7, c[0x0][0x168] ; /* 0x00005a0000067625 */ - /* 0x000fc600078e0007 */ - /*0350*/ LEA.HI.X R11, R4, R27, R5, 0x1, P0 ; /* 0x0000001b040b7211 */ - /* 0x000fd000000f0c05 */ - /*0360*/ STG.E.128.SYS [R10], R20 ; /* 0x000000140a007386 */ - /* 0x000fe8000010ed00 */ - /*0370*/ STG.E.64.SYS [R2], R24 ; /* 0x0000001802007386 */ - /* 0x000fe8000010eb00 */ - /*0380*/ STG.E.64.SYS [R6], R8 ; /* 0x0000000806007386 */ - /* 0x000fe2000010eb00 */ - /*0390*/ EXIT ; /* 0x000000000000794d */ - /* 0x000fea0003800000 */ - /*03a0*/ BRA 0x3a0; /* 0xfffffff000007947 */ - /* 0x000fc0000383ffff */ - /*03b0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03c0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03d0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03e0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03f0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - .......... 
- - - -Fatbin ptx code: -================ -arch = sm_70 -code version = [7,1] -producer = -host = linux -compile_size = 64bit -compressed - -Fatbin elf code: -================ -arch = sm_75 -code version = [1,7] -producer = -host = linux -compile_size = 64bit - - code for sm_75 - Function : _Z9max_flopsI6__halfEvPmS1_PT_S3_S3_j - .headerflags @"EF_CUDA_SM75 EF_CUDA_PTX_SM(EF_CUDA_SM75)" - /*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */ - /* 0x000fd00000000f00 */ - /*0010*/ S2R R0, SR_TID.X ; /* 0x0000000000007919 */ - /* 0x000e220000002100 */ - /*0020*/ MOV R9, 0x2 ; /* 0x0000000200097802 */ - /* 0x000fe20000000f00 */ - /*0030*/ IMAD.MOV.U32 R7, RZ, RZ, 0x10 ; /* 0x00000010ff077424 */ - /* 0x000fe400078e00ff */ - /*0040*/ S2R R3, SR_CTAID.X ; /* 0x0000000000037919 */ - /* 0x000e280000002500 */ - /*0050*/ S2R R5, SR_LANEID ; /* 0x0000000000057919 */ - /* 0x000e620000000000 */ - /*0060*/ IMAD R0, R3, c[0x0][0x0], R0 ; /* 0x0000000003007a24 */ - /* 0x001fe200078e0200 */ - /*0070*/ MOV R3, RZ ; /* 0x000000ff00037202 */ - /* 0x000fc40000000f00 */ - /*0080*/ LOP3.LUT R2, R5, 0x3, RZ, 0xc0, !PT ; /* 0x0000000305027812 */ - /* 0x002fe200078ec0ff */ - /*0090*/ IMAD.SHL.U32 R8, R0, 0x1000, RZ ; /* 0x0000100000087824 */ - /* 0x000fe200078e00ff */ - /*00a0*/ SHF.R.U32.HI R5, RZ, 0x2, R5 ; /* 0x00000002ff057819 */ - /* 0x000fca0000011605 */ - /*00b0*/ IMAD.WIDE.U32 R4, R5, 0x8, R2 ; /* 0x0000000805047825 */ - /* 0x000fc800078e0002 */ - /*00c0*/ IMAD.WIDE.U32 R2, R8, R9, c[0x0][0x170] ; /* 0x00005c0008027625 */ - /* 0x000fc600078e0009 */ - /*00d0*/ LEA R10, P1, R4, c[0x0][0x178], 0x2 ; /* 0x00005e00040a7a11 */ - /* 0x000fc800078210ff */ - /*00e0*/ LEA R2, P0, R4.reuse, R2, 0x2 ; /* 0x0000000204027211 */ - /* 0x040fe400078010ff */ - /*00f0*/ LEA.HI.X R11, R4.reuse, c[0x0][0x17c], R5.reuse, 0x2, P1 ; /* 0x00005f00040b7a11 */ - /* 0x140fe400008f1405 */ - /*0100*/ LEA.HI.X R3, R4, R3, R5, 0x2, P0 ; /* 0x0000000304037211 */ - /* 0x000fc600000f1405 */ - /*0110*/ IMAD.WIDE.U32 
R12, R7, 0x10, R10 ; /* 0x00000010070c7825 */ - /* 0x000fc600078e000a */ - /*0120*/ LDG.E.SYS R18, [R10] ; /* 0x000000000a127381 */ - /* 0x00012200001ee900 */ - /*0130*/ IMAD.WIDE.U32 R6, R7, 0x10, R2 ; /* 0x0000001007067825 */ - /* 0x000fc600078e0002 */ - /*0140*/ LDG.E.SYS R19, [R10+0x10] ; /* 0x000010000a137381 */ - /* 0x00012800001ee900 */ - /*0150*/ LDG.E.SYS R20, [R12] ; /* 0x000000000c147381 */ - /* 0x00012800001ee900 */ - /*0160*/ LDG.E.SYS R21, [R12+0x10] ; /* 0x000010000c157381 */ - /* 0x00012800001ee900 */ - /*0170*/ LDG.E.SYS R14, [R2] ; /* 0x00000000020e7381 */ - /* 0x00012800001ee900 */ - /*0180*/ LDG.E.SYS R16, [R2+0x10] ; /* 0x0000100002107381 */ - /* 0x00012800001ee900 */ - /*0190*/ LDG.E.SYS R15, [R6] ; /* 0x00000000060f7381 */ - /* 0x00012800001ee900 */ - /*01a0*/ LDG.E.SYS R17, [R6+0x10] ; /* 0x0000100006117381 */ - /* 0x00012200001ee900 */ - /*01b0*/ IMAD.WIDE.U32 R8, R8, R9, c[0x0][0x180] ; /* 0x0000600008087625 */ - /* 0x000fc600078e0009 */ - /*01c0*/ BAR.SYNC 0x0 ; /* 0x0000000000007b1d */ - /* 0x000fea0000000000 */ - /*01d0*/ CS2R R10, SR_CLOCKLO ; /* 0x00000000000a7805 */ - /* 0x001fd00000015000 */ - /*01e0*/ CS2R R2, SRZ ; /* 0x0000000000027805 */ - /* 0x000fe2000001ff00 */ - /*01f0*/ CS2R R6, SRZ ; /* 0x0000000000067805 */ - /* 0x000fe2000001ff00 */ - /*0200*/ BAR.SYNC 0x0 ; /* 0x0000000000007b1d */ - /* 0x000fea0000000000 */ - /*0210*/ HMMA.1688.F16 R2, R14, R18, R2 ; /* 0x000000120e02723c */ - /* 0x010f700000000002 */ - /*0220*/ HMMA.1688.F16 R6, R14, R20, R6 ; /* 0x000000140e06723c */ - /* 0x000f700000000006 */ - /*0230*/ HMMA.1688.F16 R18, R16, R19, R2 ; /* 0x000000131012723c */ - /* 0x020b700000000002 */ - /*0240*/ HMMA.1688.F16 R20, R16, R21, R6 ; /* 0x000000151014723c */ - /* 0x000b5c0000000006 */ - /*0250*/ CS2R R12, SR_CLOCKLO ; /* 0x00000000000c7805 */ - /* 0x000fd00000015000 */ - /*0260*/ LEA R2, P0, R4, R8, 0x2 ; /* 0x0000000804027211 */ - /* 0x020fc800078010ff */ - /*0270*/ LEA.HI.X R3, R4, R9, R5, 0x2, P0 ; /* 
0x0000000904037211 */ - /* 0x000fe200000f1405 */ - /*0280*/ IMAD.MOV.U32 R9, RZ, RZ, 0x8 ; /* 0x00000008ff097424 */ - /* 0x000fe200078e00ff */ - /*0290*/ MOV R5, 0x20 ; /* 0x0000002000057802 */ - /* 0x000fc60000000f00 */ - /*02a0*/ IMAD.WIDE.U32 R6, R0, R9, c[0x0][0x160] ; /* 0x0000580000067625 */ - /* 0x000fc600078e0009 */ - /*02b0*/ STG.E.SYS [R2], R18 ; /* 0x0000001202007386 */ - /* 0x000fe2000010e900 */ - /*02c0*/ IMAD.WIDE.U32 R4, R5, 0x8, R2 ; /* 0x0000000805047825 */ - /* 0x000fc800078e0002 */ - /*02d0*/ IMAD.WIDE.U32 R8, R0, R9, c[0x0][0x168] ; /* 0x00005a0000087625 */ - /* 0x000fc800078e0009 */ - /*02e0*/ STG.E.SYS [R4], R19 ; /* 0x0000001304007386 */ - /* 0x000fe8000010e900 */ - /*02f0*/ STG.E.SYS [R2+0x10], R20 ; /* 0x0000101402007386 */ - /* 0x000fe8000010e900 */ - /*0300*/ STG.E.SYS [R4+0x10], R21 ; /* 0x0000101504007386 */ - /* 0x000fe8000010e900 */ - /*0310*/ STG.E.64.SYS [R6], R10 ; /* 0x0000000a06007386 */ - /* 0x000fe8000010eb00 */ - /*0320*/ STG.E.64.SYS [R8], R12 ; /* 0x0000000c08007386 */ - /* 0x000fe2000010eb00 */ - /*0330*/ EXIT ; /* 0x000000000000794d */ - /* 0x000fea0003800000 */ - /*0340*/ BRA 0x340; /* 0xfffffff000007947 */ - /* 0x000fc0000383ffff */ - /*0350*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*0360*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*0370*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - .......... 
- - - -Fatbin ptx code: -================ -arch = sm_75 -code version = [7,1] -producer = -host = linux -compile_size = 64bit -compressed - -Fatbin elf code: -================ -arch = sm_80 -code version = [1,7] -producer = -host = linux -compile_size = 64bit - - code for sm_80 - Function : _Z9max_flopsI6__halfEvPmS1_PT_S3_S3_j - .headerflags @"EF_CUDA_SM80 EF_CUDA_PTX_SM(EF_CUDA_SM80)" - /*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */ - /* 0x000fce0000000f00 */ - /*0010*/ S2R R0, SR_TID.X ; /* 0x0000000000007919 */ - /* 0x000e220000002100 */ - /*0020*/ MOV R17, 0x2 ; /* 0x0000000200117802 */ - /* 0x000fe20000000f00 */ - /*0030*/ IMAD.MOV.U32 R15, RZ, RZ, 0x10 ; /* 0x00000010ff0f7424 */ - /* 0x000fe200078e00ff */ - /*0040*/ ULDC.64 UR4, c[0x0][0x118] ; /* 0x0000460000047ab9 */ - /* 0x000fe20000000a00 */ - /*0050*/ S2R R3, SR_CTAID.X ; /* 0x0000000000037919 */ - /* 0x000e280000002500 */ - /*0060*/ S2R R5, SR_LANEID ; /* 0x0000000000057919 */ - /* 0x000e620000000000 */ - /*0070*/ IMAD R0, R3, c[0x0][0x0], R0 ; /* 0x0000000003007a24 */ - /* 0x001fe200078e0200 */ - /*0080*/ MOV R3, RZ ; /* 0x000000ff00037202 */ - /* 0x000fc40000000f00 */ - /*0090*/ LOP3.LUT R2, R5, 0x3, RZ, 0xc0, !PT ; /* 0x0000000305027812 */ - /* 0x002fe200078ec0ff */ - /*00a0*/ IMAD.SHL.U32 R16, R0, 0x1000, RZ ; /* 0x0000100000107824 */ - /* 0x000fe200078e00ff */ - /*00b0*/ SHF.R.U32.HI R5, RZ, 0x2, R5 ; /* 0x00000002ff057819 */ - /* 0x000fca0000011605 */ - /*00c0*/ IMAD.WIDE.U32 R2, R5, 0x8, R2 ; /* 0x0000000805027825 */ - /* 0x000fc800078e0002 */ - /*00d0*/ IMAD.WIDE.U32 R4, R16, R17, c[0x0][0x170] ; /* 0x00005c0010047625 */ - /* 0x000fe200078e0011 */ - /*00e0*/ LEA R18, P1, R2, c[0x0][0x178], 0x2 ; /* 0x00005e0002127a11 */ - /* 0x000fc800078210ff */ - /*00f0*/ LEA R8, P0, R2.reuse, R4, 0x2 ; /* 0x0000000402087211 */ - /* 0x040fe400078010ff */ - /*0100*/ LEA.HI.X R19, R2.reuse, c[0x0][0x17c], R3.reuse, 0x2, P1 ; /* 0x00005f0002137a11 */ - /* 0x140fe400008f1403 */ - /*0110*/ LEA.HI.X R9, 
R2, R5, R3, 0x2, P0 ; /* 0x0000000502097211 */ - /* 0x000fc600000f1403 */ - /*0120*/ IMAD.WIDE.U32 R20, R15.reuse, 0x10, R18 ; /* 0x000000100f147825 */ - /* 0x040fe200078e0012 */ - /*0130*/ LDG.E R10, [R18.64] ; /* 0x00000004120a7981 */ - /* 0x000126000c1e1900 */ - /*0140*/ IMAD.WIDE.U32 R14, R15, 0x10, R8 ; /* 0x000000100f0e7825 */ - /* 0x000fe200078e0008 */ - /*0150*/ LDG.E R11, [R18.64+0x10] ; /* 0x00001004120b7981 */ - /* 0x000128000c1e1900 */ - /*0160*/ LDG.E R12, [R20.64] ; /* 0x00000004140c7981 */ - /* 0x000128000c1e1900 */ - /*0170*/ LDG.E R13, [R20.64+0x10] ; /* 0x00001004140d7981 */ - /* 0x000128000c1e1900 */ - /*0180*/ LDG.E R4, [R8.64] ; /* 0x0000000408047981 */ - /* 0x000128000c1e1900 */ - /*0190*/ LDG.E R6, [R8.64+0x10] ; /* 0x0000100408067981 */ - /* 0x000128000c1e1900 */ - /*01a0*/ LDG.E R5, [R14.64] ; /* 0x000000040e057981 */ - /* 0x000128000c1e1900 */ - /*01b0*/ LDG.E R7, [R14.64+0x10] ; /* 0x000010040e077981 */ - /* 0x000122000c1e1900 */ - /*01c0*/ IMAD.WIDE.U32 R16, R16, R17, c[0x0][0x180] ; /* 0x0000600010107625 */ - /* 0x000fc600078e0011 */ - /*01d0*/ BAR.SYNC 0x0 ; /* 0x0000000000007b1d */ - /* 0x000fec0000000000 */ - /*01e0*/ CS2R R8, SR_CLOCKLO ; /* 0x0000000000087805 */ - /* 0x001fce0000015000 */ - /*01f0*/ CS2R R14, SRZ ; /* 0x00000000000e7805 */ - /* 0x000fe2000001ff00 */ - /*0200*/ CS2R R18, SRZ ; /* 0x0000000000127805 */ - /* 0x000fe2000001ff00 */ - /*0210*/ BAR.SYNC 0x0 ; /* 0x0000000000007b1d */ - /* 0x000fec0000000000 */ - /*0220*/ HMMA.16816.F16 R10, R4.reuse, R10, R14 ; /* 0x0000000a040a723c */ - /* 0x050b70000000080e */ - /*0230*/ HMMA.16816.F16 R18, R4, R12, R18 ; /* 0x0000000c0412723c */ - /* 0x000b5e0000000812 */ - /*0240*/ NOP ; /* 0x0000000000007918 */ - /* 0x000fd20000000000 */ - /*0250*/ CS2R R14, SR_CLOCKLO ; /* 0x00000000000e7805 */ - /* 0x020fce0000015000 */ - /*0260*/ LEA R4, P0, R2, R16, 0x2 ; /* 0x0000001002047211 */ - /* 0x000fe200078010ff */ - /*0270*/ IMAD.MOV.U32 R13, RZ, RZ, 0x8 ; /* 0x00000008ff0d7424 */ - /* 
0x000fc600078e00ff */ - /*0280*/ LEA.HI.X R5, R2, R17, R3, 0x2, P0 ; /* 0x0000001102057211 */ - /* 0x000fe200000f1403 */ - /*0290*/ IMAD.WIDE.U32 R6, R0, R13, c[0x0][0x160] ; /* 0x0000580000067625 */ - /* 0x000fe200078e000d */ - /*02a0*/ MOV R3, 0x20 ; /* 0x0000002000037802 */ - /* 0x000fc60000000f00 */ - /*02b0*/ STG.E [R4.64], R10 ; /* 0x0000000a04007986 */ - /* 0x000fe2000c101904 */ - /*02c0*/ IMAD.WIDE.U32 R12, R0, R13, c[0x0][0x168] ; /* 0x00005a00000c7625 */ - /* 0x000fc800078e000d */ - /*02d0*/ IMAD.WIDE.U32 R2, R3, 0x8, R4 ; /* 0x0000000803027825 */ - /* 0x000fca00078e0004 */ - /*02e0*/ STG.E [R2.64], R11 ; /* 0x0000000b02007986 */ - /* 0x000fe8000c101904 */ - /*02f0*/ STG.E [R4.64+0x10], R18 ; /* 0x0000101204007986 */ - /* 0x000fe8000c101904 */ - /*0300*/ STG.E [R2.64+0x10], R19 ; /* 0x0000101302007986 */ - /* 0x000fe8000c101904 */ - /*0310*/ STG.E.64 [R6.64], R8 ; /* 0x0000000806007986 */ - /* 0x000fe8000c101b04 */ - /*0320*/ STG.E.64 [R12.64], R14 ; /* 0x0000000e0c007986 */ - /* 0x000fe2000c101b04 */ - /*0330*/ EXIT ; /* 0x000000000000794d */ - /* 0x000fea0003800000 */ - /*0340*/ BRA 0x340; /* 0xfffffff000007947 */ - /* 0x000fc0000383ffff */ - /*0350*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*0360*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*0370*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*0380*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*0390*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03a0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03b0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03c0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03d0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03e0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03f0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - .......... 
- - - -Fatbin ptx code: -================ -arch = sm_80 -code version = [7,1] -producer = -host = linux -compile_size = 64bit -compressed - -Fatbin elf code: -================ -arch = sm_86 -code version = [1,7] -producer = -host = linux -compile_size = 64bit - - code for sm_86 - Function : _Z9max_flopsI6__halfEvPmS1_PT_S3_S3_j - .headerflags @"EF_CUDA_SM86 EF_CUDA_PTX_SM(EF_CUDA_SM86)" - /*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */ - /* 0x000fce0000000f00 */ - /*0010*/ S2R R0, SR_TID.X ; /* 0x0000000000007919 */ - /* 0x000e220000002100 */ - /*0020*/ MOV R13, 0x2 ; /* 0x00000002000d7802 */ - /* 0x000fe20000000f00 */ - /*0030*/ IMAD.MOV.U32 R11, RZ, RZ, 0x10 ; /* 0x00000010ff0b7424 */ - /* 0x000fe200078e00ff */ - /*0040*/ ULDC.64 UR4, c[0x0][0x118] ; /* 0x0000460000047ab9 */ - /* 0x000fe20000000a00 */ - /*0050*/ S2R R3, SR_CTAID.X ; /* 0x0000000000037919 */ - /* 0x000e280000002500 */ - /*0060*/ S2R R5, SR_LANEID ; /* 0x0000000000057919 */ - /* 0x000e620000000000 */ - /*0070*/ IMAD R0, R3, c[0x0][0x0], R0 ; /* 0x0000000003007a24 */ - /* 0x001fe200078e0200 */ - /*0080*/ MOV R3, RZ ; /* 0x000000ff00037202 */ - /* 0x000fc40000000f00 */ - /*0090*/ LOP3.LUT R2, R5, 0x3, RZ, 0xc0, !PT ; /* 0x0000000305027812 */ - /* 0x002fe200078ec0ff */ - /*00a0*/ IMAD.SHL.U32 R12, R0, 0x1000, RZ ; /* 0x00001000000c7824 */ - /* 0x000fe200078e00ff */ - /*00b0*/ SHF.R.U32.HI R5, RZ, 0x2, R5 ; /* 0x00000002ff057819 */ - /* 0x000fca0000011605 */ - /*00c0*/ IMAD.WIDE.U32 R2, R5, 0x8, R2 ; /* 0x0000000805027825 */ - /* 0x000fc800078e0002 */ - /*00d0*/ IMAD.WIDE.U32 R4, R12, R13, c[0x0][0x170] ; /* 0x00005c000c047625 */ - /* 0x000fe200078e000d */ - /*00e0*/ LEA R14, P1, R2, c[0x0][0x178], 0x2 ; /* 0x00005e00020e7a11 */ - /* 0x000fc800078210ff */ - /*00f0*/ LEA R8, P0, R2.reuse, R4, 0x2 ; /* 0x0000000402087211 */ - /* 0x040fe400078010ff */ - /*0100*/ LEA.HI.X R15, R2.reuse, c[0x0][0x17c], R3.reuse, 0x2, P1 ; /* 0x00005f00020f7a11 */ - /* 0x140fe400008f1403 */ - /*0110*/ LEA.HI.X R9, 
R2, R5, R3, 0x2, P0 ; /* 0x0000000502097211 */ - /* 0x000fc600000f1403 */ - /*0120*/ IMAD.WIDE.U32 R16, R11.reuse, 0x10, R14 ; /* 0x000000100b107825 */ - /* 0x040fe200078e000e */ - /*0130*/ LDG.E R18, [R14.64] ; /* 0x000000040e127981 */ - /* 0x000126000c1e1900 */ - /*0140*/ IMAD.WIDE.U32 R10, R11, 0x10, R8 ; /* 0x000000100b0a7825 */ - /* 0x000fe200078e0008 */ - /*0150*/ LDG.E R19, [R14.64+0x10] ; /* 0x000010040e137981 */ - /* 0x000128000c1e1900 */ - /*0160*/ LDG.E R20, [R16.64] ; /* 0x0000000410147981 */ - /* 0x000128000c1e1900 */ - /*0170*/ LDG.E R21, [R16.64+0x10] ; /* 0x0000100410157981 */ - /* 0x000128000c1e1900 */ - /*0180*/ LDG.E R4, [R8.64] ; /* 0x0000000408047981 */ - /* 0x000128000c1e1900 */ - /*0190*/ LDG.E R6, [R8.64+0x10] ; /* 0x0000100408067981 */ - /* 0x000128000c1e1900 */ - /*01a0*/ LDG.E R5, [R10.64] ; /* 0x000000040a057981 */ - /* 0x000128000c1e1900 */ - /*01b0*/ LDG.E R7, [R10.64+0x10] ; /* 0x000010040a077981 */ - /* 0x000122000c1e1900 */ - /*01c0*/ IMAD.WIDE.U32 R12, R12, R13, c[0x0][0x180] ; /* 0x000060000c0c7625 */ - /* 0x000fc600078e000d */ - /*01d0*/ BAR.SYNC 0x0 ; /* 0x0000000000007b1d */ - /* 0x000fec0000000000 */ - /*01e0*/ CS2R R8, SR_CLOCKLO ; /* 0x0000000000087805 */ - /* 0x001fce0000015000 */ - /*01f0*/ CS2R R10, SRZ ; /* 0x00000000000a7805 */ - /* 0x000fe2000001ff00 */ - /*0200*/ CS2R R14, SRZ ; /* 0x00000000000e7805 */ - /* 0x000fe2000001ff00 */ - /*0210*/ BAR.SYNC 0x0 ; /* 0x0000000000007b1d */ - /* 0x000fec0000000000 */ - /*0220*/ HMMA.16816.F16 R18, R4.reuse, R18, R10 ; /* 0x000000120412723c */ - /* 0x050b70000000080a */ - /*0230*/ HMMA.16816.F16 R20, R4, R20, R14 ; /* 0x000000140414723c */ - /* 0x000b5e000000080e */ - /*0240*/ NOP ; /* 0x0000000000007918 */ - /* 0x000fd20000000000 */ - /*0250*/ CS2R R14, SR_CLOCKLO ; /* 0x00000000000e7805 */ - /* 0x020fce0000015000 */ - /*0260*/ LEA R4, P0, R2, R12, 0x2 ; /* 0x0000000c02047211 */ - /* 0x000fe200078010ff */ - /*0270*/ IMAD.MOV.U32 R11, RZ, RZ, 0x8 ; /* 0x00000008ff0b7424 */ - /* 
0x000fc600078e00ff */ - /*0280*/ LEA.HI.X R5, R2, R13, R3, 0x2, P0 ; /* 0x0000000d02057211 */ - /* 0x000fe200000f1403 */ - /*0290*/ IMAD.WIDE.U32 R6, R0, R11, c[0x0][0x160] ; /* 0x0000580000067625 */ - /* 0x000fe200078e000b */ - /*02a0*/ MOV R3, 0x20 ; /* 0x0000002000037802 */ - /* 0x000fc60000000f00 */ - /*02b0*/ STG.E [R4.64], R18 ; /* 0x0000001204007986 */ - /* 0x000fe2000c101904 */ - /*02c0*/ IMAD.WIDE.U32 R10, R0, R11, c[0x0][0x168] ; /* 0x00005a00000a7625 */ - /* 0x000fc800078e000b */ - /*02d0*/ IMAD.WIDE.U32 R2, R3, 0x8, R4 ; /* 0x0000000803027825 */ - /* 0x000fca00078e0004 */ - /*02e0*/ STG.E [R2.64], R19 ; /* 0x0000001302007986 */ - /* 0x000fe8000c101904 */ - /*02f0*/ STG.E [R4.64+0x10], R20 ; /* 0x0000101404007986 */ - /* 0x000fe8000c101904 */ - /*0300*/ STG.E [R2.64+0x10], R21 ; /* 0x0000101502007986 */ - /* 0x000fe8000c101904 */ - /*0310*/ STG.E.64 [R6.64], R8 ; /* 0x0000000806007986 */ - /* 0x000fe8000c101b04 */ - /*0320*/ STG.E.64 [R10.64], R14 ; /* 0x0000000e0a007986 */ - /* 0x000fe2000c101b04 */ - /*0330*/ EXIT ; /* 0x000000000000794d */ - /* 0x000fea0003800000 */ - /*0340*/ BRA 0x340; /* 0xfffffff000007947 */ - /* 0x000fc0000383ffff */ - /*0350*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*0360*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*0370*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*0380*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*0390*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03a0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03b0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03c0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03d0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03e0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03f0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - .......... 
- - - -Fatbin ptx code: -================ -arch = sm_86 -code version = [7,1] -producer = -host = linux -compile_size = 64bit -compressed diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/sassfloat.txt b/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/sassfloat.txt deleted file mode 100644 index b453784a6..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/sassfloat.txt +++ /dev/null @@ -1,502 +0,0 @@ - -Fatbin elf code: -================ -arch = sm_70 -code version = [1,7] -producer = -host = linux -compile_size = 64bit - - code for sm_70 - -Fatbin elf code: -================ -arch = sm_75 -code version = [1,7] -producer = -host = linux -compile_size = 64bit - - code for sm_75 - -Fatbin elf code: -================ -arch = sm_80 -code version = [1,7] -producer = -host = linux -compile_size = 64bit - - code for sm_80 - -Fatbin elf code: -================ -arch = sm_70 -code version = [1,7] -producer = -host = linux -compile_size = 64bit - - code for sm_70 - Function : _Z9max_flopsI6__halfEvPmS1_PT_S3_Pfj - .headerflags @"EF_CUDA_SM70 EF_CUDA_PTX_SM(EF_CUDA_SM70)" - /*0000*/ IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ; /* 0x00000a00ff017624 */ - /* 0x000fd000078e00ff */ - /*0010*/ @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ; /* 0x000000fffffff389 */ - /* 0x000fe200000e00ff */ - /*0020*/ S2R R29, SR_LANEID ; /* 0x00000000001d7919 */ - /* 0x000e220000000000 */ - /*0030*/ IMAD.MOV.U32 R11, RZ, RZ, 0x2 ; /* 0x00000002ff0b7424 */ - /* 0x000fe400078e00ff */ - /*0040*/ IMAD.MOV.U32 R8, RZ, RZ, 0x10 ; /* 0x00000010ff087424 */ - /* 0x000fe200078e00ff */ - /*0050*/ S2R R28, SR_TID.X ; /* 0x00000000001c7919 */ - /* 0x000e680000002100 */ - /*0060*/ S2R R5, SR_CTAID.X ; /* 0x0000000000057919 */ - /* 0x000e620000002500 */ - /*0070*/ SHF.R.U32.HI R3, RZ, 0x2, R29.reuse ; /* 0x00000002ff037819 */ - /* 0x101fe4000001161d */ - /*0080*/ SHF.R.U32.HI R2, RZ, 0x4, R29 ; /* 0x00000004ff027819 */ - /* 0x000fc4000001161d */ - /*0090*/ LOP3.LUT R3, R3, 
0x3, RZ, 0xc0, !PT ; /* 0x0000000303037812 */ - /* 0x000fe400078ec0ff */ - /*00a0*/ LOP3.LUT R29, R29, 0x3, RZ, 0xc0, !PT ; /* 0x000000031d1d7812 */ - /* 0x000fe400078ec0ff */ - /*00b0*/ SHF.R.U32.HI R0, RZ, 0x1, R3 ; /* 0x00000001ff007819 */ - /* 0x000fe20000011603 */ - /*00c0*/ IMAD R28, R5, c[0x0][0x0], R28 ; /* 0x00000000051c7a24 */ - /* 0x002fe200078e021c */ - /*00d0*/ SHF.L.U32 R4, R3, 0x3, RZ ; /* 0x0000000303047819 */ - /* 0x000fe400000006ff */ - /*00e0*/ LOP3.LUT R6, R2, 0x1, RZ, 0xc0, !PT ; /* 0x0000000102067812 */ - /* 0x000fe200078ec0ff */ - /*00f0*/ IMAD R9, R0, 0x8, R29.reuse ; /* 0x0000000800097824 */ - /* 0x100fe200078e021d */ - /*0100*/ LOP3.LUT R5, R4, 0x8, R29, 0xe2, !PT ; /* 0x0000000804057812 */ - /* 0x000fe200078ee21d */ - /*0110*/ IMAD.SHL.U32 R32, R28, 0x1000, RZ ; /* 0x000010001c207824 */ - /* 0x000fc400078e00ff */ - /*0120*/ IMAD R9, R6.reuse, 0x4, R9 ; /* 0x0000000406097824 */ - /* 0x040fe200078e0209 */ - /*0130*/ LEA R7, R6, R5, 0x2 ; /* 0x0000000506077211 */ - /* 0x000fe200078e10ff */ - /*0140*/ IMAD.WIDE.U32 R4, R32, R11, c[0x0][0x170] ; /* 0x00005c0020047625 */ - /* 0x000fc600078e000b */ - /*0150*/ SHF.L.U32 R7, R7, 0x1, RZ ; /* 0x0000000107077819 */ - /* 0x000fe400000006ff */ - /*0160*/ SHF.L.U32 R9, R9, 0x1, RZ ; /* 0x0000000109097819 */ - /* 0x000fc600000006ff */ - /*0170*/ IMAD.WIDE.U32 R6, R7, 0x10, R4 ; /* 0x0000001007067825 */ - /* 0x000fc800078e0004 */ - /*0180*/ IMAD.WIDE.U32 R4, R9, R8, c[0x0][0x178] ; /* 0x00005e0009047625 */ - /* 0x000fcc00078e0008 */ - /*0190*/ LDG.E.128.SYS R24, [R6] ; /* 0x0000000006187381 */ - /* 0x00012800001eed00 */ - /*01a0*/ LDG.E.128.SYS R20, [R6+0x10] ; /* 0x0000100006147381 */ - /* 0x00012800001eed00 */ - /*01b0*/ LDG.E.128.SYS R16, [R4] ; /* 0x0000000004107381 */ - /* 0x00012800001eed00 */ - /*01c0*/ LDG.E.128.SYS R12, [R4+0x10] ; /* 0x00001000040c7381 */ - /* 0x00012200001eed00 */ - /*01d0*/ IMAD.MOV.U32 R33, RZ, RZ, 0x4 ; /* 0x00000004ff217424 */ - /* 0x000fc800078e00ff */ - /*01e0*/ 
IMAD.WIDE.U32 R32, R32, R33, c[0x0][0x180] ; /* 0x0000600020207625 */ - /* 0x000fe200078e0021 */ - /*01f0*/ NOP ; /* 0x0000000000007918 */ - /* 0x000fe20000000000 */ - /*0200*/ BAR.SYNC 0x0 ; /* 0x0000000000007b1d */ - /* 0x000fee0000000000 */ - /*0210*/ CS2R R30, SR_CLOCKLO ; /* 0x00000000001e7805 */ - /* 0x000fd00000015000 */ - /*0220*/ CS2R R8, SRZ ; /* 0x0000000000087805 */ - /* 0x000fe2000001ff00 */ - /*0230*/ CS2R R10, SRZ ; /* 0x00000000000a7805 */ - /* 0x000fe2000001ff00 */ - /*0240*/ NOP ; /* 0x0000000000007918 */ - /* 0x000fe20000000000 */ - /*0250*/ BAR.SYNC 0x0 ; /* 0x0000000000007b1d */ - /* 0x000fea0000000000 */ - /*0260*/ CS2R R4, SRZ ; /* 0x0000000000047805 */ - /* 0x001fe2000001ff00 */ - /*0270*/ CS2R R6, SRZ ; /* 0x0000000000067805 */ - /* 0x000fe2000001ff00 */ - /*0280*/ HMMA.884.F32.F32.STEP0 R8, R24.reuse.ROW, R16.reuse.COL, R8 ; /* 0x0000001018087236 */ - /* 0x0d0fe40000005408 */ - /*0290*/ HMMA.884.F32.F32.STEP1 R10, R24.reuse.ROW, R16.reuse.COL, R10 ; /* 0x00000010180a7236 */ - /* 0x0c0fe4000000d40a */ - /*02a0*/ HMMA.884.F32.F32.STEP2 R4, R24.reuse.ROW, R16.reuse.COL, R4 ; /* 0x0000001018047236 */ - /* 0x0c0fe40000015404 */ - /*02b0*/ HMMA.884.F32.F32.STEP3 R6, R24.ROW, R16.COL, R6 ; /* 0x0000001018067236 */ - /* 0x000f64000001d406 */ - /*02c0*/ HMMA.884.F32.F32.STEP0 R8, R26.reuse.ROW, R18.reuse.COL, R8 ; /* 0x000000121a087236 */ - /* 0x0e0fe40000005408 */ - /*02d0*/ HMMA.884.F32.F32.STEP1 R10, R26.reuse.ROW, R18.reuse.COL, R10 ; /* 0x000000121a0a7236 */ - /* 0x0c0fe4000000d40a */ - /*02e0*/ HMMA.884.F32.F32.STEP2 R4, R26.reuse.ROW, R18.reuse.COL, R4 ; /* 0x000000121a047236 */ - /* 0x0c0fe40000015404 */ - /*02f0*/ HMMA.884.F32.F32.STEP3 R6, R26.ROW, R18.COL, R6 ; /* 0x000000121a067236 */ - /* 0x000f64000001d406 */ - /*0300*/ HMMA.884.F32.F32.STEP0 R8, R20.reuse.ROW, R12.reuse.COL, R8 ; /* 0x0000000c14087236 */ - /* 0x0e0fe40000005408 */ - /*0310*/ HMMA.884.F32.F32.STEP1 R10, R20.reuse.ROW, R12.reuse.COL, R10 ; /* 0x0000000c140a7236 */ - /* 
0x0c0fe4000000d40a */ - /*0320*/ HMMA.884.F32.F32.STEP2 R4, R20.reuse.ROW, R12.reuse.COL, R4 ; /* 0x0000000c14047236 */ - /* 0x0c0fe40000015404 */ - /*0330*/ HMMA.884.F32.F32.STEP3 R6, R20.ROW, R12.COL, R6 ; /* 0x0000000c14067236 */ - /* 0x000f64000001d406 */ - /*0340*/ HMMA.884.F32.F32.STEP0 R8, R22.reuse.ROW, R14.reuse.COL, R8 ; /* 0x0000000e16087236 */ - /* 0x0e0b640000005408 */ - /*0350*/ HMMA.884.F32.F32.STEP1 R10, R22.reuse.ROW, R14.reuse.COL, R10 ; /* 0x0000000e160a7236 */ - /* 0x0c0b64000000d40a */ - /*0360*/ HMMA.884.F32.F32.STEP2 R4, R22.reuse.ROW, R14.reuse.COL, R4 ; /* 0x0000000e16047236 */ - /* 0x0c0b640000015404 */ - /*0370*/ HMMA.884.F32.F32.STEP3 R6, R22.ROW, R14.COL, R6 ; /* 0x0000000e16067236 */ - /* 0x000b74000001d406 */ - /*0380*/ CS2R R12, SR_CLOCKLO ; /* 0x00000000000c7805 */ - /* 0x000fd00000015000 */ - /*0390*/ SHF.L.U32 R2, R2, 0x2, RZ ; /* 0x0000000202027819 */ - /* 0x000fe200000006ff */ - /*03a0*/ IMAD.SHL.U32 R14, R3, 0x8, RZ ; /* 0x00000008030e7824 */ - /* 0x020fc600078e00ff */ - /*03b0*/ LOP3.LUT R29, R2, 0x4, R29, 0xe2, !PT ; /* 0x00000004021d7812 */ - /* 0x000fc800078ee21d */ - /*03c0*/ LOP3.LUT R3, R29.reuse, 0x2, RZ, 0xc0, !PT ; /* 0x000000021d037812 */ - /* 0x040fe400078ec0ff */ - /*03d0*/ LOP3.LUT R29, R29, 0x5, RZ, 0xc0, !PT ; /* 0x000000051d1d7812 */ - /* 0x000fe400078ec0ff */ - /*03e0*/ LEA R2, R0, R3, 0x3 ; /* 0x0000000300027211 */ - /* 0x000fe200078e18ff */ - /*03f0*/ IMAD.MOV.U32 R3, RZ, RZ, RZ ; /* 0x000000ffff037224 */ - /* 0x000fe200078e00ff */ - /*0400*/ LOP3.LUT R29, R14, 0x8, R29, 0xe2, !PT ; /* 0x000000080e1d7812 */ - /* 0x000fca00078ee21d */ - /*0410*/ IMAD.WIDE.U32 R2, R29, 0x10, R2 ; /* 0x000000101d027825 */ - /* 0x000fe200078e0002 */ - /*0420*/ MOV R29, 0x8 ; /* 0x00000008001d7802 */ - /* 0x000fca0000000f00 */ - /*0430*/ IMAD.WIDE.U32 R16, R28, R29, c[0x0][0x160] ; /* 0x000058001c107625 */ - /* 0x000fe200078e001d */ - /*0440*/ LEA R14, P0, R2, R32, 0x2 ; /* 0x00000020020e7211 */ - /* 0x000fc600078010ff */ - 
/*0450*/ IMAD.WIDE.U32 R28, R28, R29, c[0x0][0x168] ; /* 0x00005a001c1c7625 */ - /* 0x000fe200078e001d */ - /*0460*/ LEA.HI.X R15, R2, R33, R3, 0x2, P0 ; /* 0x00000021020f7211 */ - /* 0x000fd000000f1403 */ - /*0470*/ STG.E.64.SYS [R14], R8 ; /* 0x000000080e007386 */ - /* 0x000fe8000010eb00 */ - /*0480*/ STG.E.64.SYS [R14+0x10], R4 ; /* 0x000010040e007386 */ - /* 0x000fe8000010eb00 */ - /*0490*/ STG.E.64.SYS [R14+0x80], R10 ; /* 0x0000800a0e007386 */ - /* 0x000fe8000010eb00 */ - /*04a0*/ STG.E.64.SYS [R14+0x90], R6 ; /* 0x000090060e007386 */ - /* 0x000fe8000010eb00 */ - /*04b0*/ STG.E.64.SYS [R16], R30 ; /* 0x0000001e10007386 */ - /* 0x000fe8000010eb00 */ - /*04c0*/ STG.E.64.SYS [R28], R12 ; /* 0x0000000c1c007386 */ - /* 0x000fe2000010eb00 */ - /*04d0*/ EXIT ; /* 0x000000000000794d */ - /* 0x000fea0003800000 */ - /*04e0*/ BRA 0x4e0; /* 0xfffffff000007947 */ - /* 0x000fc0000383ffff */ - /*04f0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - ............................................... 
- - - -Fatbin ptx code: -================ -arch = sm_70 -code version = [7,0] -producer = -host = linux -compile_size = 64bit -compressed - -Fatbin elf code: -================ -arch = sm_75 -code version = [1,7] -producer = -host = linux -compile_size = 64bit - - code for sm_75 - Function : _Z9max_flopsI6__halfEvPmS1_PT_S3_Pfj - .headerflags @"EF_CUDA_SM75 EF_CUDA_PTX_SM(EF_CUDA_SM75)" - /*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */ - /* 0x000fd00000000f00 */ - /*0010*/ S2R R0, SR_TID.X ; /* 0x0000000000007919 */ - /* 0x000e220000002100 */ - /*0020*/ MOV R19, RZ ; /* 0x000000ff00137202 */ - /* 0x000fe20000000f00 */ - /*0030*/ IMAD.MOV.U32 R7, RZ, RZ, 0x10 ; /* 0x00000010ff077424 */ - /* 0x000fe200078e00ff */ - /*0040*/ MOV R5, 0x2 ; /* 0x0000000200057802 */ - /* 0x000fe20000000f00 */ - /*0050*/ S2R R3, SR_CTAID.X ; /* 0x0000000000037919 */ - /* 0x000e280000002500 */ - /*0060*/ S2R R2, SR_LANEID ; /* 0x0000000000027919 */ - /* 0x000e620000000000 */ - /*0070*/ IMAD R0, R3, c[0x0][0x0], R0 ; /* 0x0000000003007a24 */ - /* 0x001fe200078e0200 */ - /*0080*/ LOP3.LUT R18, R2, 0x3, RZ, 0xc0, !PT ; /* 0x0000000302127812 */ - /* 0x002fc600078ec0ff */ - /*0090*/ IMAD.SHL.U32 R14, R0, 0x1000, RZ ; /* 0x00001000000e7824 */ - /* 0x000fe200078e00ff */ - /*00a0*/ SHF.R.U32.HI R3, RZ, 0x2, R2 ; /* 0x00000002ff037819 */ - /* 0x000fca0000011602 */ - /*00b0*/ IMAD.WIDE.U32 R18, R3, 0x8, R18 ; /* 0x0000000803127825 */ - /* 0x000fc800078e0012 */ - /*00c0*/ IMAD.WIDE.U32 R2, R14, R5, c[0x0][0x170] ; /* 0x00005c000e027625 */ - /* 0x000fc600078e0005 */ - /*00d0*/ LEA R8, P1, R18, c[0x0][0x178], 0x2 ; /* 0x00005e0012087a11 */ - /* 0x000fc800078210ff */ - /*00e0*/ LEA R4, P0, R18.reuse, R2, 0x2 ; /* 0x0000000212047211 */ - /* 0x040fe400078010ff */ - /*00f0*/ LEA.HI.X R9, R18.reuse, c[0x0][0x17c], R19.reuse, 0x2, P1 ; /* 0x00005f0012097a11 */ - /* 0x140fe400008f1413 */ - /*0100*/ LEA.HI.X R5, R18, R3, R19, 0x2, P0 ; /* 0x0000000312057211 */ - /* 0x000fc600000f1413 */ - /*0110*/ 
IMAD.WIDE.U32 R10, R7, 0x10, R8 ; /* 0x00000010070a7825 */ - /* 0x000fc600078e0008 */ - /*0120*/ LDG.E.SYS R20, [R8] ; /* 0x0000000008147381 */ - /* 0x00012200001ee900 */ - /*0130*/ IMAD.WIDE.U32 R6, R7, 0x10, R4 ; /* 0x0000001007067825 */ - /* 0x000fc600078e0004 */ - /*0140*/ LDG.E.SYS R21, [R8+0x10] ; /* 0x0000100008157381 */ - /* 0x00012800001ee900 */ - /*0150*/ LDG.E.SYS R22, [R10] ; /* 0x000000000a167381 */ - /* 0x00012800001ee900 */ - /*0160*/ LDG.E.SYS R23, [R10+0x10] ; /* 0x000010000a177381 */ - /* 0x00012800001ee900 */ - /*0170*/ LDG.E.SYS R12, [R4] ; /* 0x00000000040c7381 */ - /* 0x00012800001ee900 */ - /*0180*/ LDG.E.SYS R2, [R4+0x10] ; /* 0x0000100004027381 */ - /* 0x00012800001ee900 */ - /*0190*/ LDG.E.SYS R13, [R6] ; /* 0x00000000060d7381 */ - /* 0x00012800001ee900 */ - /*01a0*/ LDG.E.SYS R3, [R6+0x10] ; /* 0x0000100006037381 */ - /* 0x00012200001ee900 */ - /*01b0*/ MOV R15, 0x4 ; /* 0x00000004000f7802 */ - /* 0x000fca0000000f00 */ - /*01c0*/ IMAD.WIDE.U32 R14, R14, R15, c[0x0][0x180] ; /* 0x000060000e0e7625 */ - /* 0x000fe200078e000f */ - /*01d0*/ BAR.SYNC 0x0 ; /* 0x0000000000007b1d */ - /* 0x000fee0000000000 */ - /*01e0*/ CS2R R16, SR_CLOCKLO ; /* 0x0000000000107805 */ - /* 0x000fd00000015000 */ - /*01f0*/ CS2R R8, SRZ ; /* 0x0000000000087805 */ - /* 0x001fe2000001ff00 */ - /*0200*/ CS2R R10, SRZ ; /* 0x00000000000a7805 */ - /* 0x000fe2000001ff00 */ - /*0210*/ CS2R R4, SRZ ; /* 0x0000000000047805 */ - /* 0x000fe2000001ff00 */ - /*0220*/ CS2R R6, SRZ ; /* 0x0000000000067805 */ - /* 0x000fe2000001ff00 */ - /*0230*/ BAR.SYNC 0x0 ; /* 0x0000000000007b1d */ - /* 0x000fea0000000000 */ - /*0240*/ HMMA.1688.F32 R8, R12, R20, R8 ; /* 0x000000140c08723c */ - /* 0x010f700000001008 */ - /*0250*/ HMMA.1688.F32 R4, R12, R22, R4 ; /* 0x000000160c04723c */ - /* 0x000f700000001004 */ - /*0260*/ HMMA.1688.F32 R8, R2, R21, R8 ; /* 0x000000150208723c */ - /* 0x020b700000001008 */ - /*0270*/ HMMA.1688.F32 R4, R2, R23, R4 ; /* 0x000000170204723c */ - /* 
0x000b5c0000001004 */ - /*0280*/ CS2R R20, SR_CLOCKLO ; /* 0x0000000000147805 */ - /* 0x020fd00000015000 */ - /*0290*/ LEA R2, P0, R18, R14, 0x3 ; /* 0x0000000e12027211 */ - /* 0x000fc800078018ff */ - /*02a0*/ LEA.HI.X R3, R18, R15, R19, 0x3, P0 ; /* 0x0000000f12037211 */ - /* 0x000fe200000f1c13 */ - /*02b0*/ IMAD.MOV.U32 R19, RZ, RZ, 0x8 ; /* 0x00000008ff137424 */ - /* 0x000fc800078e00ff */ - /*02c0*/ IMAD.WIDE.U32 R12, R19, 0x40, R2 ; /* 0x00000040130c7825 */ - /* 0x000fc600078e0002 */ - /*02d0*/ STG.E.64.SYS [R2], R8 ; /* 0x0000000802007386 */ - /* 0x000fe2000010eb00 */ - /*02e0*/ IMAD.WIDE.U32 R14, R0, R19, c[0x0][0x160] ; /* 0x00005800000e7625 */ - /* 0x000fc800078e0013 */ - /*02f0*/ IMAD.WIDE.U32 R18, R0, R19, c[0x0][0x168] ; /* 0x00005a0000127625 */ - /* 0x000fe400078e0013 */ - /*0300*/ STG.E.64.SYS [R12], R10 ; /* 0x0000000a0c007386 */ - /* 0x000fe8000010eb00 */ - /*0310*/ STG.E.64.SYS [R2+0x20], R4 ; /* 0x0000200402007386 */ - /* 0x000fe8000010eb00 */ - /*0320*/ STG.E.64.SYS [R12+0x20], R6 ; /* 0x000020060c007386 */ - /* 0x000fe8000010eb00 */ - /*0330*/ STG.E.64.SYS [R14], R16 ; /* 0x000000100e007386 */ - /* 0x000fe8000010eb00 */ - /*0340*/ STG.E.64.SYS [R18], R20 ; /* 0x0000001412007386 */ - /* 0x000fe2000010eb00 */ - /*0350*/ EXIT ; /* 0x000000000000794d */ - /* 0x000fea0003800000 */ - /*0360*/ BRA 0x360; /* 0xfffffff000007947 */ - /* 0x000fc0000383ffff */ - /*0370*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - ............................................... 
- - - -Fatbin ptx code: -================ -arch = sm_75 -code version = [7,0] -producer = -host = linux -compile_size = 64bit -compressed - -Fatbin elf code: -================ -arch = sm_80 -code version = [1,7] -producer = -host = linux -compile_size = 64bit - - code for sm_80 - Function : _Z9max_flopsI6__halfEvPmS1_PT_S3_Pfj - .headerflags @"EF_CUDA_SM80 EF_CUDA_PTX_SM(EF_CUDA_SM80)" - /*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */ - /* 0x000fce0000000f00 */ - /*0010*/ S2R R0, SR_TID.X ; /* 0x0000000000007919 */ - /* 0x000e220000002100 */ - /*0020*/ MOV R21, RZ ; /* 0x000000ff00157202 */ - /* 0x000fe20000000f00 */ - /*0030*/ IMAD.MOV.U32 R9, RZ, RZ, 0x10 ; /* 0x00000010ff097424 */ - /* 0x000fe200078e00ff */ - /*0040*/ MOV R5, 0x2 ; /* 0x0000000200057802 */ - /* 0x000fe20000000f00 */ - /*0050*/ S2R R3, SR_CTAID.X ; /* 0x0000000000037919 */ - /* 0x000e220000002500 */ - /*0060*/ ULDC.64 UR4, c[0x0][0x118] ; /* 0x0000460000047ab9 */ - /* 0x000fc60000000a00 */ - /*0070*/ S2R R2, SR_LANEID ; /* 0x0000000000027919 */ - /* 0x000e620000000000 */ - /*0080*/ IMAD R0, R3, c[0x0][0x0], R0 ; /* 0x0000000003007a24 */ - /* 0x001fe200078e0200 */ - /*0090*/ LOP3.LUT R20, R2, 0x3, RZ, 0xc0, !PT ; /* 0x0000000302147812 */ - /* 0x002fc600078ec0ff */ - /*00a0*/ IMAD.SHL.U32 R18, R0, 0x1000, RZ ; /* 0x0000100000127824 */ - /* 0x000fe200078e00ff */ - /*00b0*/ SHF.R.U32.HI R3, RZ, 0x2, R2 ; /* 0x00000002ff037819 */ - /* 0x000fca0000011602 */ - /*00c0*/ IMAD.WIDE.U32 R20, R3, 0x8, R20 ; /* 0x0000000803147825 */ - /* 0x000fc800078e0014 */ - /*00d0*/ IMAD.WIDE.U32 R2, R18, R5, c[0x0][0x170] ; /* 0x00005c0012027625 */ - /* 0x000fc600078e0005 */ - /*00e0*/ LEA R10, P1, R20, c[0x0][0x178], 0x2 ; /* 0x00005e00140a7a11 */ - /* 0x000fc800078210ff */ - /*00f0*/ LEA R4, P0, R20.reuse, R2, 0x2 ; /* 0x0000000214047211 */ - /* 0x040fe400078010ff */ - /*0100*/ LEA.HI.X R11, R20.reuse, c[0x0][0x17c], R21.reuse, 0x2, P1 ; /* 0x00005f00140b7a11 */ - /* 0x140fe400008f1415 */ - /*0110*/ LEA.HI.X 
R5, R20, R3, R21, 0x2, P0 ; /* 0x0000000314057211 */ - /* 0x000fc600000f1415 */ - /*0120*/ IMAD.WIDE.U32 R16, R9.reuse, 0x10, R10 ; /* 0x0000001009107825 */ - /* 0x040fe200078e000a */ - /*0130*/ LDG.E R6, [R10.64] ; /* 0x000000040a067981 */ - /* 0x000126000c1e1900 */ - /*0140*/ IMAD.WIDE.U32 R8, R9, 0x10, R4 ; /* 0x0000001009087825 */ - /* 0x000fe200078e0004 */ - /*0150*/ LDG.E R7, [R10.64+0x10] ; /* 0x000010040a077981 */ - /* 0x000128000c1e1900 */ - /*0160*/ LDG.E R2, [R16.64] ; /* 0x0000000410027981 */ - /* 0x000128000c1e1900 */ - /*0170*/ LDG.E R3, [R16.64+0x10] ; /* 0x0000100410037981 */ - /* 0x000128000c1e1900 */ - /*0180*/ LDG.E R12, [R4.64] ; /* 0x00000004040c7981 */ - /* 0x000128000c1e1900 */ - /*0190*/ LDG.E R14, [R4.64+0x10] ; /* 0x00001004040e7981 */ - /* 0x000128000c1e1900 */ - /*01a0*/ LDG.E R13, [R8.64] ; /* 0x00000004080d7981 */ - /* 0x000128000c1e1900 */ - /*01b0*/ LDG.E R15, [R8.64+0x10] ; /* 0x00001004080f7981 */ - /* 0x000122000c1e1900 */ - /*01c0*/ MOV R19, 0x4 ; /* 0x0000000400137802 */ - /* 0x000fca0000000f00 */ - /*01d0*/ IMAD.WIDE.U32 R18, R18, R19, c[0x0][0x180] ; /* 0x0000600012127625 */ - /* 0x000fe200078e0013 */ - /*01e0*/ BAR.SYNC 0x0 ; /* 0x0000000000007b1d */ - /* 0x000fec0000000000 */ - /*01f0*/ CS2R R16, SR_CLOCKLO ; /* 0x0000000000107805 */ - /* 0x001fce0000015000 */ - /*0200*/ CS2R R8, SRZ ; /* 0x0000000000087805 */ - /* 0x000fe2000001ff00 */ - /*0210*/ CS2R R10, SRZ ; /* 0x00000000000a7805 */ - /* 0x000fe2000001ff00 */ - /*0220*/ CS2R R4, SRZ ; /* 0x0000000000047805 */ - /* 0x000fe2000001ff00 */ - /*0230*/ BAR.SYNC 0x0 ; /* 0x0000000000007b1d */ - /* 0x000fec0000000000 */ - /*0240*/ HMMA.16816.F32 R8, R12, R6, R8 ; /* 0x000000060c08723c */ - /* 0x010b6e0000001808 */ - /*0250*/ CS2R R6, SRZ ; /* 0x0000000000067805 */ - /* 0x020fce000001ff00 */ - /*0260*/ HMMA.16816.F32 R4, R12, R2, R4 ; /* 0x000000020c04723c */ - /* 0x000b5e0000001804 */ - /*0270*/ NOP ; /* 0x0000000000007918 */ - /* 0x000fd20000000000 */ - /*0280*/ CS2R R2, 
SR_CLOCKLO ; /* 0x0000000000027805 */ - /* 0x020fce0000015000 */ - /*0290*/ LEA R12, P0, R20, R18, 0x3 ; /* 0x00000012140c7211 */ - /* 0x000fc800078018ff */ - /*02a0*/ LEA.HI.X R13, R20, R19, R21, 0x3, P0 ; /* 0x00000013140d7211 */ - /* 0x000fe200000f1c15 */ - /*02b0*/ IMAD.MOV.U32 R21, RZ, RZ, 0x8 ; /* 0x00000008ff157424 */ - /* 0x000fc800078e00ff */ - /*02c0*/ IMAD.WIDE.U32 R18, R21, 0x40, R12 ; /* 0x0000004015127825 */ - /* 0x000fe200078e000c */ - /*02d0*/ STG.E.64 [R12.64], R8 ; /* 0x000000080c007986 */ - /* 0x000fe6000c101b04 */ - /*02e0*/ IMAD.WIDE.U32 R14, R0, R21, c[0x0][0x160] ; /* 0x00005800000e7625 */ - /* 0x000fc600078e0015 */ - /*02f0*/ STG.E.64 [R18.64], R10 ; /* 0x0000000a12007986 */ - /* 0x000fe2000c101b04 */ - /*0300*/ IMAD.WIDE.U32 R20, R0, R21, c[0x0][0x168] ; /* 0x00005a0000147625 */ - /* 0x000fc600078e0015 */ - /*0310*/ STG.E.64 [R12.64+0x20], R4 ; /* 0x000020040c007986 */ - /* 0x000fe8000c101b04 */ - /*0320*/ STG.E.64 [R18.64+0x20], R6 ; /* 0x0000200612007986 */ - /* 0x000fe8000c101b04 */ - /*0330*/ STG.E.64 [R14.64], R16 ; /* 0x000000100e007986 */ - /* 0x000fe8000c101b04 */ - /*0340*/ STG.E.64 [R20.64], R2 ; /* 0x0000000214007986 */ - /* 0x000fe2000c101b04 */ - /*0350*/ EXIT ; /* 0x000000000000794d */ - /* 0x000fea0003800000 */ - /*0360*/ BRA 0x360; /* 0xfffffff000007947 */ - /* 0x000fc0000383ffff */ - /*0370*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*0380*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*0390*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03a0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03b0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03c0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03d0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03e0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - /*03f0*/ NOP; /* 0x0000000000007918 */ - /* 0x000fc00000000000 */ - 
............................................... - - - -Fatbin ptx code: -================ -arch = sm_80 -code version = [7,0] -producer = -host = linux -compile_size = 64bit -compressed diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/tensor_bw_half.cu b/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/tensor_bw_half.cu deleted file mode 100644 index 79a2b739e..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/tensor_bw_half.cu +++ /dev/null @@ -1,19 +0,0 @@ -#include "tensor_bw_half.h" - -int main() { - - intilizeDeviceProp(0); - - if (deviceProp.major < 6) // tesnore unit was added since Volta - return 1; - - std::cout << "FP16 operand, FP32 accumalte:\n"; - tensor_max_flops(); - - std::cout << "\nFP16 operand, FP16 accumalte:\n"; - tensor_max_flops(); - - // tensor_max_flops(); - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/tensor_bw_half.h b/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/tensor_bw_half.h deleted file mode 100644 index 2064c4df4..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_bw_half/tensor_bw_half.h +++ /dev/null @@ -1,150 +0,0 @@ -#ifndef MAXFLOPS_TENSOR_DEF_H -#define MAXFLOPS_TENSOR_DEF_H - -#include -#include -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_TIMES 2048 -#define WMMA_M 16 -#define WMMA_N 16 -#define WMMA_K 16 -#define A_SIZE WMMA_M *WMMA_K -#define B_SIZE WMMA_N *WMMA_K -#define R_SIZE WMMA_M *WMMA_N - -using namespace nvcuda; - -template -__global__ void -max_flops(uint64_t *startClk, uint64_t *stopClk, T *a, T *b, R *res, - uint32_t strid) { // strid set to 0 used to prevent optimization - - // thread index - uint32_t tid = threadIdx.x; - uint32_t gid = blockIdx.x * blockDim.x + tid; - uint32_t warpid = gid / warpSize; - - a = a + warpid * A_SIZE; - b = b + warpid * B_SIZE; - res = res + warpid * R_SIZE; - - wmma::fragment - a_frag; - wmma::fragment - b_frag; 
- wmma::fragment c_frag; - - wmma::load_matrix_sync(a_frag, a, 16); - wmma::fill_fragment(c_frag, 0.0f); - wmma::load_matrix_sync(b_frag, b, 16); - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint64_t start = 0; - asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); - - for (int j = 0; j < REPEAT_TIMES; ++j) { - wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); - } - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - uint64_t stop = 0; - asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); - - wmma::store_matrix_sync(res, c_frag, WMMA_N, wmma::mem_row_major); - - // write time and data back to memory - startClk[gid] = start; - stopClk[gid] = stop; -} - -template float tensor_max_flops(bool report_fma_bw = false) { - intilizeDeviceProp(0); - - BLOCKS_NUM = 1; - TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; - - unsigned total_A_SIZE = - A_SIZE * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp - unsigned total_B_SIZE = - B_SIZE * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp - unsigned total_R_SIZE = - R_SIZE * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp - - uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - T *data1 = (T *)malloc(total_A_SIZE * sizeof(T)); - T *data2 = (T *)malloc(total_B_SIZE * sizeof(T)); - R *res = (R *)malloc(total_R_SIZE * sizeof(R)); - - uint64_t *startClk_g; - uint64_t *stopClk_g; - T *data1_g; - T *data2_g; - R *res_g; - - for (uint32_t i = 0; i < A_SIZE; i++) { - data1[i] = (T)i; - } - - for (uint32_t i = 0; i < B_SIZE; i++) { - data2[i] = (T)i; - } - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&data1_g, total_A_SIZE * sizeof(T))); - gpuErrchk(cudaMalloc(&data2_g, total_B_SIZE * 
sizeof(T))); - gpuErrchk(cudaMalloc(&res_g, total_R_SIZE * sizeof(R))); - - gpuErrchk(cudaMemcpy(data1_g, data1, total_A_SIZE * sizeof(T), - cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(data2_g, data2, total_B_SIZE * sizeof(T), - cudaMemcpyHostToDevice)); - - max_flops<<>>( - startClk_g, stopClk_g, data1_g, data2_g, res_g, 0); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk( - cudaMemcpy(res, res_g, total_R_SIZE * sizeof(R), cudaMemcpyDeviceToHost)); - - float wmma_bw, hmma_bw, fma_bw; - uint64_t total_time = - *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - - *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); - wmma_bw = ((float)(REPEAT_TIMES * TOTAL_THREADS)) / (float)total_time; - hmma_bw = ((float)(REPEAT_TIMES * TOTAL_THREADS * SASS_hmma_per_PTX_wmma)) / - (float)total_time; - fma_bw = ((float)(REPEAT_TIMES * WMMA_M * WMMA_N * WMMA_K * - (TOTAL_THREADS / WARP_SIZE))) / - (float)total_time; - - std::cout << "wmma PTX issue bandwidth = " << wmma_bw << "(thread/clk/SM) \n"; - std::cout << "hmma SASS issue bandwidth = " << hmma_bw << "(thread/clk/SM)\n"; - std::cout << "FMA tensor bandwidth = " << fma_bw << "(FMA/clk/SM)\n"; - - std::cout << "Total Clk number = " << total_time << "\n"; - - if (report_fma_bw) - return fma_bw; - else - return wmma_bw; -} - -#endif diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_lat_half/Makefile b/util/tuner/GPU_Microbenchmark/ubench/core/tensor_lat_half/Makefile deleted file mode 100644 index 03b7e7ec8..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_lat_half/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -GENCODE_SM50 := -GENCODE_SM61 := -GENCODE_SM30 := -GENCODE_SM35 := -GENCODE_SM60 := -GENCODE_SM62 := - -SRC = tensor_lat_half.cu - -EXE = tensor_lat_half - -include 
../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_lat_half/tensor_lat_half.cu b/util/tuner/GPU_Microbenchmark/ubench/core/tensor_lat_half/tensor_lat_half.cu deleted file mode 100644 index 32245f805..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_lat_half/tensor_lat_half.cu +++ /dev/null @@ -1,19 +0,0 @@ -#include "tensor_lat_half.h" - -int main() { - - intilizeDeviceProp(0); - - if (deviceProp.major < 6) // tesnore unit was added since Volta - return 1; - - std::cout << "FP16 operand, FP32 accumalte:\n"; - tensor_lat(); - - std::cout << "\nFP16 operand, FP16 accumalte:\n"; - tensor_lat(); - - // tensor_lat(); - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_lat_half/tensor_lat_half.h b/util/tuner/GPU_Microbenchmark/ubench/core/tensor_lat_half/tensor_lat_half.h deleted file mode 100644 index 5e8c3cecc..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/core/tensor_lat_half/tensor_lat_half.h +++ /dev/null @@ -1,120 +0,0 @@ -#ifndef LAT_TENSOR_DEF_H -#define LAT_TENSOR_DEF_H - -#include -#include -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_ITERS 4096 - -#define M_SIZE 16 * 16 - -using namespace nvcuda; - -template -__global__ void tensor_latency(uint64_t *startClk, uint64_t *stopClk, T *a, - T *b, R *res) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - - // register T result = 0; - - wmma::fragment a_frag; - wmma::fragment b_frag; - wmma::fragment c_frag; - - wmma::load_matrix_sync(a_frag, a, 16); - wmma::fill_fragment(c_frag, 0.0f); - wmma::load_matrix_sync(b_frag, b, 16); - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint64_t start = 0; - asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); - - for (int j = 0; j < REPEAT_ITERS; ++j) { - wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); - } - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - 
uint64_t stop = 0; - asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); - - wmma::store_matrix_sync(res, c_frag, 16, wmma::mem_row_major); - - // write time and data back to memory - startClk[gid] = start; - stopClk[gid] = stop; -} - -template float tensor_lat() { - - intilizeDeviceProp(0); - - THREADS_PER_BLOCK = 1; - THREADS_PER_SM = 1; - BLOCKS_NUM = 1; - TOTAL_THREADS = 1; - - uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - T *data1 = (T *)malloc(M_SIZE * sizeof(T)); - T *data2 = (T *)malloc(M_SIZE * sizeof(T)); - R *res = (R *)malloc(TOTAL_THREADS * sizeof(R)); - - uint64_t *startClk_g; - uint64_t *stopClk_g; - T *data1_g; - T *data2_g; - R *res_g; - - for (uint32_t i = 0; i < M_SIZE; i++) { - data1[i] = (T)i; - data2[i] = (T)i; - } - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&data1_g, M_SIZE * sizeof(T))); - gpuErrchk(cudaMalloc(&data2_g, M_SIZE * sizeof(T))); - gpuErrchk(cudaMalloc(&res_g, TOTAL_THREADS * sizeof(R))); - - gpuErrchk( - cudaMemcpy(data1_g, data1, M_SIZE * sizeof(T), cudaMemcpyHostToDevice)); - gpuErrchk( - cudaMemcpy(data2_g, data2, M_SIZE * sizeof(T), cudaMemcpyHostToDevice)); - - tensor_latency<<>>( - startClk_g, stopClk_g, data1_g, data2_g, res_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - // gpuErrchk( cudaMemcpy(res, res_g, M_SIZE*sizeof(R), cudaMemcpyDeviceToHost) - // ); - - float wmma, hmma; - uint64_t total_time = stopClk[0] - startClk[0]; - wmma = ((float)(total_time)) / ((float)(REPEAT_ITERS)); - hmma = - ((float)(total_time)) / ((float)(REPEAT_ITERS * SASS_hmma_per_PTX_wmma)); - - 
std::cout << "wmma latency = " << wmma << "(clk)\n"; - std::cout << "hmma latency = " << hmma << "(clk)\n"; - std::cout << "Total Clk number = " << total_time << "\n"; - - return wmma; -} - -#endif diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_access_grain/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_access_grain/Makefile deleted file mode 100644 index 2477bc631..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_access_grain/Makefile +++ /dev/null @@ -1,8 +0,0 @@ - -SRC = l1_access_grain.cu - -EXE = l1_access_grain - -NVCC_FLGAS = -Xptxas -dlcm=ca - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_access_grain/l1_access_grain.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_access_grain/l1_access_grain.cu deleted file mode 100644 index ba1ee1524..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_access_grain/l1_access_grain.cu +++ /dev/null @@ -1,105 +0,0 @@ -/* -This benchmark measures L1 coalescing granularity for differnet strides -check the nvprof or nvsight for received l1 reads and writes -for further details, see our arvix paper: https://arxiv.org/pdf/1810.07269.pdf - -run the program with nsight - make nvsight ./l1_access_grain - -*/ - -#include -#include -#include - - -using namespace std; - -#include "../../../hw_def/hw_def.h" - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Device code -__global__ void l1_stride_cons(const float *A, float *C, int stride) - -{ - - int i = blockDim.x * blockIdx.x + threadIdx.x; - - C[i * stride] = A[i * stride]; -} - -__global__ void l1_stride(const float *A, float *C, int stride) - -{ - - int i = blockDim.x * blockIdx.x + threadIdx.x; - - C[((i / stride) * 32) + (i % stride)] = A[((i / stride) * 32) + (i % stride)]; -} - -// Host code -void coaslescer_stride(int N, int threadsPerBlock, int stride) { - // Variables - float *h_A; - float 
*h_C; - - float *d_A; - float *d_C; - - size_t size = N * sizeof(float) * 32; - - // Allocate input vectors h_A and h_B in host memory - h_A = (float *)malloc(size); - h_C = (float *)malloc(size); - - // Initialize input vectors - for (uint32_t i = 0; i < N; i++) - h_A[i] = (float)i; - - // Allocate vectors in device memory - gpuErrchk(cudaMalloc((void **)&d_A, size)); - gpuErrchk(cudaMalloc((void **)&d_C, size)); - - // Copy vectors from host memory to device memory - gpuErrchk(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice)); - - // Invoke kernel - int blocksPerGrid = ((N + threadsPerBlock - 1) / threadsPerBlock); - - l1_stride<<>>(d_A, d_C, stride); - gpuErrchk(cudaPeekAtLastError()); - - // Copy result from device memory to host memory - // h_C contains the result in host memory - gpuErrchk(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost)); - - // Free device memory - if (d_A) - cudaFree(d_A); - if (d_C) - cudaFree(d_C); - - // Free host memory - if (h_A) - free(h_A); - if (h_C) - free(h_C); -} -////////////////////////////////////////////////////// -int main(int argc, char *argv[]) { - intilizeDeviceProp(0); - - for (int i = 1; i <= WARP_SIZE; ++i) { - coaslescer_stride(WARP_SIZE, WARP_SIZE, i); - } - - std::cout << "\nThis benchmark measures coalescing granularity for differnet " - "strides.\n"; - std::cout - << "check the nvprof or nvsight for received l1 reads and writes.\n"; - std::cout - << "to run the program with nsight: make nvsight ./l1_access_grain\n"; - std::cout - << "stats to look at: l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum & " - "l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum\n\n"; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_adaptive/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_adaptive/Makefile deleted file mode 100644 index eede7448b..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_adaptive/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = l1_adaptive.cu - -EXE = l1_adaptive - 
-include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_adaptive/l1_adaptive.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_adaptive/l1_adaptive.cu deleted file mode 100644 index 3e7bb8ea4..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_adaptive/l1_adaptive.cu +++ /dev/null @@ -1,21 +0,0 @@ -// Is L1 sector? - -#include -#include -#include -#include -#include -using namespace std; - -#include "../../../hw_def/hw_def.h" - -__global__ void l1_adaptive() {} - -int main() { - intilizeDeviceProp(0); - - // TO DO - std::cout << "The ubench is not imepleneted yet.\n"; - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/Makefile deleted file mode 100644 index 9447eef75..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -SRC = l1_associativity.cu - -EXE = l1_associativity - -#NVCC_FLGAS = -Xptxas -O0 - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/l1_associativity.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/l1_associativity.cu deleted file mode 100644 index dbc4e9e1c..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/l1_associativity.cu +++ /dev/null @@ -1,187 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -class chaserParam { -public: - uint32_t stride, array_size, iteration, l1_cache_size; - int shared_mem_size_byte; - bool sequential; - - uint64_t start, stop; -}; - -__global__ void setup_kernel(curandStateMRG32k3a *state) { - int id = 0; - - curand_init(1234, id, 0, &state[id]); -} - -__global__ void l1_squential(uint64_t *startCLK, uint64_t *stopCLK, - uint32_t *dsink, uint32_t *posArray, - uint32_t stride, uint32_t 
array_size, - uint32_t iteration) { - // uint32_t tid = threadIdx.x; - // uint32_t bid = blockIdx.x; - // uint32_t uid = bid*blockDim.x+tid; - // uint32_t n_threads = blockDim.x * gridDim.x; - - uint64_t start, stop; - uint32_t pointer; - pointer = 0; - - for (int itr = 0; itr < iteration; itr++) { - start = clock64(); - for (uint32_t i = 0; i < (array_size / stride); i++) { - pointer = posArray[pointer]; - } - stop = clock64(); - } - - startCLK[0] = start; - stopCLK[0] = stop; - dsink[0] = pointer; -} - -__global__ void l1_random(uint64_t *startCLK, uint64_t *stopCLK, - uint32_t *dsink, uint32_t *posArray, - curandStateMRG32k3a *state, uint32_t stride, - uint32_t array_size, uint32_t iteration) { - // uint32_t tid = threadIdx.x; - // uint32_t bid = blockIdx.x; - // uint32_t uid = bid*blockDim.x+tid; - // uint32_t n_threads = blockDim.x * gridDim.x; - uint64_t start, stop; - uint32_t pointer; - pointer = 0; - - for (int itr = 0; itr < iteration; itr++) { - start = clock64(); - for (uint32_t i = 0; i < (array_size / stride); i++) { - pointer = - posArray[(pointer + curand(state)) % array_size / stride * stride]; - } - stop = clock64(); - } - - startCLK[0] = start; - stopCLK[0] = stop; - dsink[0] = pointer; -} - -void l1_structure(chaserParam &chaser) { - - uint64_t *startCLK = (uint64_t *)malloc(1 * sizeof(uint64_t)); - uint64_t *stopCLK = (uint64_t *)malloc(1 * sizeof(uint64_t)); - uint32_t *dsink = (uint32_t *)malloc(1 * sizeof(uint32_t)); - uint32_t *posArray = (uint32_t *)malloc(chaser.array_size * sizeof(uint32_t)); - - for (uint32_t i = 0; i < chaser.array_size; i++) - posArray[i] = (i + chaser.stride) % chaser.array_size; - - uint64_t *startCLK_g; - uint64_t *stopCLK_g; - uint32_t *dsink_g; - uint32_t *posArray_g; - - gpuErrchk(cudaMalloc(&startCLK_g, 1 * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&stopCLK_g, 1 * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&dsink_g, 1 * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&posArray_g, chaser.array_size * 
sizeof(uint32_t))); - - gpuErrchk(cudaMemcpy(posArray_g, posArray, - chaser.array_size * sizeof(uint32_t), - cudaMemcpyHostToDevice)); - - if (chaser.sequential) { - l1_squential<<<1, 1>>>(startCLK_g, stopCLK_g, dsink_g, posArray_g, - chaser.stride, chaser.array_size, chaser.iteration); - } else { - curandStateMRG32k3a *devMRGStates; - gpuErrchk(cudaMalloc((void **)&devMRGStates, sizeof(curandStateMRG32k3a))); - setup_kernel<<<1, 1>>>(devMRGStates); - l1_random<<<1, 1>>>(startCLK_g, stopCLK_g, dsink_g, posArray_g, - devMRGStates, chaser.stride, chaser.array_size, - chaser.iteration); - } - - // gpuErrchk( cudaPeekAtLastError() ); - - gpuErrchk(cudaMemcpy(startCLK, startCLK_g, 1 * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopCLK, stopCLK_g, 1 * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk( - cudaMemcpy(dsink, dsink_g, 1 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); - - chaser.start = startCLK[0]; - chaser.stop = stopCLK[0]; - - free(startCLK); - free(stopCLK); - free(dsink); - free(posArray); - gpuErrchk(cudaFree(startCLK_g)); - gpuErrchk(cudaFree(stopCLK_g)); - gpuErrchk(cudaFree(dsink_g)); - gpuErrchk(cudaFree(posArray_g)); - - return; -} - -int main() { - intilizeDeviceProp(0); - - std::cout << "Launching L1 cache line size ubench" << std::endl; - std::ostringstream oss; - oss << "L1line.csv"; - std::string filename = oss.str(); - std::ofstream myfile1(filename); - - chaserParam chaser1; - chaser1.shared_mem_size_byte = 0; - chaser1.iteration = 1; - chaser1.array_size = L1_SIZE / 4; - chaser1.sequential = true; - - myfile1 << "chaser.stride,chaser.start,chaser.stop\n"; - for (uint32_t i = 1; i <= 32; i *= 2) { - chaser1.stride = i; - l1_structure(chaser1); - myfile1 << chaser1.stride << "," << chaser1.start << "," << chaser1.stop - << "\n"; - } - - std::cout << "Saving L1 cache line size data at L1line.csv" << std::endl; - - std::cout << "Launching L1 cache assoc ubench" << std::endl; - std::ostringstream string; - string 
<< "L1asso.csv"; - filename = string.str(); - std::ofstream myfile2(filename); - - chaser1.iteration = 2; - chaser1.sequential = false; - // chaser1.array_size=L1_SIZE*8; //4096KB 32xl1size - myfile2 << "chaser.stride,chaser.start,chaser.stop\n"; - for (uint32_t i = 8; i <= 128; i *= 2) { - chaser1.stride = i; - chaser1.array_size = L1_SIZE / 16 * i; - l1_structure(chaser1); - myfile2 << chaser1.stride << "," << chaser1.start << "," << chaser1.stop - << "\n"; - } - - std::cout << "Saving L1 cache assoc data at L1asso.csv" << std::endl; - myfile1.close(); - myfile2.close(); - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/linesize.xlsx b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/linesize.xlsx deleted file mode 100644 index 72e57db36..000000000 Binary files a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/linesize.xlsx and /dev/null differ diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_banks/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_banks/Makefile deleted file mode 100644 index 726f7eecd..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_banks/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = l1_banks.cu - -EXE = l1_banks - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_banks/l1_banks.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_banks/l1_banks.cu deleted file mode 100644 index 5b0f388dc..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_banks/l1_banks.cu +++ /dev/null @@ -1,21 +0,0 @@ -// Is L1 sector? 
- -#include -#include -#include -#include -#include -using namespace std; - -#include "../../../hw_def/hw_def.h" - -__global__ void l1_banks() {} - -int main() { - intilizeDeviceProp(0); - - // TO DO - std::cout << "The ubench is not imepleneted yet.\n"; - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_128/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_128/Makefile deleted file mode 100644 index bc762132f..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_128/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = l1_bw_128.cu - -EXE = l1_bw_128 - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_128/l1_bw_128.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_128/l1_bw_128.cu deleted file mode 100644 index 3bc7d3efd..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_128/l1_bw_128.cu +++ /dev/null @@ -1,137 +0,0 @@ -// This code is a modification of L1 cache benchmark from -//"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking": -// https://arxiv.org/pdf/1804.06826.pdf - -// This benchmark measures the maximum read bandwidth of L1 cache for 32 bit - -#include -#include -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_TIMES 256 -// array size is half the L1 size (2) * float size (4) -#define ARRAY_SIZE (L1_SIZE / 8) - -__global__ void l1_bw(uint64_t *startClk, uint64_t *stopClk, float *dsink, - float *posArray) { - - // thread index - uint32_t tid = threadIdx.x; - uint32_t uid = blockIdx.x * blockDim.x + tid; - - // a register to avoid compiler optimization - float sink0 = 0; - float sink1 = 0; - float sink2 = 0; - float sink3 = 0; - - // warp up L1 cache - for (uint32_t i = tid * 4; i < ARRAY_SIZE; i += blockDim.x * 4) { - float *ptr = posArray + i; - // use ca modifier to cache the load in L1 - asm volatile("{\t\n" - ".reg .f32 data<4>;\n\t" - "ld.global.ca.v4.f32 
{data0,data1,data2,data3}, [%4];\n\t" - "add.f32 %0, data0, %0;\n\t" - "add.f32 %1, data1, %1;\n\t" - "add.f32 %2, data2, %2;\n\t" - "add.f32 %3, data3, %3;\n\t" - "}" - : "+f"(sink0), "+f"(sink1), "+f"(sink2), "+f"(sink3) - : "l"(ptr) - : "memory"); - } - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint64_t start = 0; - asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); - - // load data from l1 cache and accumulate - for (uint32_t j = 0; j < REPEAT_TIMES; j++) { - float *ptr = posArray + ((tid * 4 + (j * warpSize * 4)) % ARRAY_SIZE); - asm volatile("{\t\n" - ".reg .f32 data<4>;\n\t" - "ld.global.ca.v4.f32 {data0,data1,data2,data3}, [%4];\n\t" - "add.f32 %0, data0, %0;\n\t" - "add.f32 %1, data1, %1;\n\t" - "add.f32 %2, data2, %2;\n\t" - "add.f32 %3, data3, %3;\n\t" - "}" - : "+f"(sink0), "+f"(sink1), "+f"(sink2), "+f"(sink3) - : "l"(ptr) - : "memory"); - } - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - uint64_t stop = 0; - asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); - - // write time and data back to memory - startClk[uid] = start; - stopClk[uid] = stop; - dsink[uid] = sink0 + sink1 + sink2 + sink3; -} - -int main() { - intilizeDeviceProp(0); - - BLOCKS_NUM = 1; - TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; - THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM; - - // ARRAY_SIZE has to be less than L1_SIZE - assert(ARRAY_SIZE * sizeof(float) < L1_SIZE); - - uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - float *posArray = (float *)malloc(ARRAY_SIZE * sizeof(float)); - float *dsink = (float *)malloc(TOTAL_THREADS * sizeof(float)); - - uint64_t *startClk_g; - uint64_t *stopClk_g; - float *posArray_g; - float *dsink_g; - - for (uint32_t i = 0; i < ARRAY_SIZE; i++) - posArray[i] = (float)i; - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * 
sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(float))); - gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(float))); - - l1_bw<<>>(startClk_g, stopClk_g, dsink_g, - posArray_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(float), - cudaMemcpyDeviceToHost)); - - double bw, BW; - uint64_t total_time = - *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - - *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); - bw = (double)(REPEAT_TIMES * THREADS_PER_SM * sizeof(float) * 4) / - ((double)total_time); - BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; - std::cout << "L1 bandwidth = " << bw << "(byte/clk/SM), " << BW - << "(GB/s/SM)\n"; - std::cout << "Total Clk number = " << total_time << "\n"; - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f/Makefile deleted file mode 100644 index 51c68cf48..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f/Makefile +++ /dev/null @@ -1,6 +0,0 @@ - -SRC = l1_bw_32f.cu - -EXE = l1_bw_32f - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f/l1_bw_32f.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f/l1_bw_32f.cu deleted file mode 100644 index 6abaac82b..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f/l1_bw_32f.cu +++ /dev/null @@ -1,142 +0,0 @@ -/* This code is a modification of L1 cache benchmark from -"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking": - https://arxiv.org/pdf/1804.06826.pdf - - This benchmark measures the 
maximum read bandwidth of L1 cache for 64 bit -*/ - -#include -#include -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_TIMES 256 -// array size is half the L1 size (2) * float size (4) -#define ARRAY_SIZE L1_SIZE / 8 - -__global__ void l1_bw(uint64_t *__restrict__ startClk, - uint64_t *__restrict__ stopClk, float *__restrict__ dsink, - const float *__restrict__ posArray) { - - // thread index - uint32_t tid = threadIdx.x; - uint32_t uid = blockIdx.x * blockDim.x + tid; - - // a register to avoid compiler optimization - float sink0 = 0; - float sink1 = 0; - float sink2 = 0; - float sink3 = 0; - - // populate l1 cache to warm up - for (uint32_t i = tid; i < ARRAY_SIZE; i += blockDim.x) { - // float* ptr = &posArray[i]; - // use ca modifier to cache the load in L1 - asm volatile("{\t\n" - ".reg .f32 data;\n\t" - "ld.global.ca.f32 data, [%1];\n\t" - "add.f32 %0, data, %0;\n\t" - "}" - : "+f"(sink0) - : "l"(&posArray[i]) - : "memory"); - } - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint64_t start = 0; - asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); - - // load data from l1 cache and accumulate - for (uint32_t j = 0; j < REPEAT_TIMES; j++) { - // float* ptr = posArray + ((tid + (j*warpSize*4))%ARRAY_SIZE); - asm volatile("{\t\n" - ".reg .f32 data<4>;\n\t" - "ld.global.ca.f32 data0, [%4+0];\n\t" - "ld.global.ca.f32 data1, [%4+128];\n\t" - "ld.global.ca.f32 data2, [%4+256];\n\t" - "ld.global.ca.f32 data3, [%4+384];\n\t" - "add.f32 %0, data0, %0;\n\t" - "add.f32 %1, data1, %1;\n\t" - "add.f32 %2, data2, %2;\n\t" - "add.f32 %3, data3, %3;\n\t" - "}" - : "+f"(sink0), "+f"(sink1), "+f"(sink2), "+f"(sink3) - : "l"(&posArray[(tid + (j * warpSize * 4)) % ARRAY_SIZE]) - : "memory"); - } - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - uint64_t stop = 0; - asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); - - // write time and 
data back to memory - startClk[uid] = start; - stopClk[uid] = stop; - dsink[uid] = sink0 + sink1 + sink2 + sink3; -} - -int main() { - intilizeDeviceProp(0); - - BLOCKS_NUM = 1; - TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; - THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM; - - assert(ARRAY_SIZE * sizeof(float) < - L1_SIZE); // ARRAY_SIZE has to be less than L1_SIZE - - uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - float *posArray = (float *)malloc(ARRAY_SIZE * sizeof(float)); - float *dsink = (float *)malloc(TOTAL_THREADS * sizeof(float)); - - uint64_t *startClk_g; - uint64_t *stopClk_g; - float *posArray_g; - float *dsink_g; - - for (uint32_t i = 0; i < ARRAY_SIZE; i++) - posArray[i] = (float)i; - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(float))); - gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(float))); - - gpuErrchk(cudaMemcpy(posArray_g, posArray, ARRAY_SIZE * sizeof(float), - cudaMemcpyHostToDevice)); - - l1_bw<<>>(startClk_g, stopClk_g, dsink_g, - posArray_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(float), - cudaMemcpyDeviceToHost)); - - float bw, BW; - uint64_t total_time; - total_time = *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - - *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); - // total_time = stopClk[0]-startClk[0]; - bw = (float)(REPEAT_TIMES * THREADS_PER_SM * 4 * 4) / ((float)total_time); - BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; - std::cout << "L1 bandwidth = " << bw 
<< "(byte/clk/SM), " << BW - << "(GB/s/SM)\n"; - std::cout << "Total Clk number = " << total_time << "\n"; - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll/Makefile deleted file mode 100644 index 487ecccfc..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = l1_bw_32f_unroll.cu - -EXE = l1_bw_32f_unroll - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll/l1_bw_32f_unroll.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll/l1_bw_32f_unroll.cu deleted file mode 100644 index b2be1f138..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll/l1_bw_32f_unroll.cu +++ /dev/null @@ -1,127 +0,0 @@ -// This code is a modification of L1 cache benchmark from -//"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking": -// https://arxiv.org/pdf/1804.06826.pdf - -// This benchmark measures the maximum read bandwidth of L1 cache for 64 bit - -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -// array size is half the L1 size (2) * float size (4) -#define ARRAY_SIZE L1_SIZE / 8 -#define REPEAT_TIMES 1024 - -__global__ void l1_bw(uint32_t *startClk, uint32_t *stopClk, float *dsink, - float *posArray) { - - // thread index - uint32_t tid = threadIdx.x; - uint32_t uid = blockIdx.x * blockDim.x + tid; - - // a register to avoid compiler optimization - float sink0 = 0; - float sink1 = 0; - float sink2 = 0; - float sink3 = 0; - - // populate l1 cache to warm up - for (uint32_t i = tid; i < ARRAY_SIZE; i += blockDim.x) { - float *ptr = posArray + i; - // use ca modifier to cache the load in L1 - asm volatile("{\t\n" - ".reg .f32 data;\n\t" - "ld.global.ca.f32 data, [%1];\n\t" - "add.f32 %0, data, %0;\n\t" - "}" - : "+f"(sink0) - : "l"(ptr) - : "memory"); - } - 
- // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint32_t start = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - - // load data from l1 cache and accumulate - for (uint32_t j = 0; j < REPEAT_TIMES; j++) { - float *ptr = posArray + ((tid + (j * warpSize)) % ARRAY_SIZE); - asm volatile("{\t\n" - ".reg .f32 data;\n\t" - "ld.global.ca.f32 data, [%1+0];\n\t" - "add.f32 %0, data, %0;\n\t" - "}" - : "+f"(sink0) - : "l"(ptr) - : "memory"); - } - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - uint32_t stop = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // write time and data back to memory - startClk[uid] = start; - stopClk[uid] = stop; - dsink[uid] = sink0 + sink1 + sink2 + sink3; -} - -int main() { - intilizeDeviceProp(0); - - BLOCKS_NUM = 1; - TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; - THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM; - - assert(ARRAY_SIZE * sizeof(float) < - L1_SIZE); // ARRAY_SIZE has to be less than L1_SIZE - - uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - float *posArray = (float *)malloc(ARRAY_SIZE * sizeof(float)); - float *dsink = (float *)malloc(TOTAL_THREADS * sizeof(float)); - - uint32_t *startClk_g; - uint32_t *stopClk_g; - float *posArray_g; - float *dsink_g; - - for (uint32_t i = 0; i < ARRAY_SIZE; i++) - posArray[i] = (float)i; - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(float))); - gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(float))); - - gpuErrchk(cudaMemcpy(posArray_g, posArray, ARRAY_SIZE * sizeof(float), - cudaMemcpyHostToDevice)); - - l1_bw<<>>(startClk_g, stopClk_g, dsink_g, - posArray_g); - gpuErrchk(cudaPeekAtLastError()); - - 
gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(float), - cudaMemcpyDeviceToHost)); - - float bw; - bw = (float)(REPEAT_TIMES * THREADS_PER_SM * 4) / - ((float)(stopClk[0] - startClk[0])); - printf("L1 bandwidth = %f (byte/clk/SM)\n", bw); - printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64f/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64f/Makefile deleted file mode 100644 index 67df7821a..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64f/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = l1_bw_64f.cu - -EXE = l1_bw_64f - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64f/l1_bw_64f.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64f/l1_bw_64f.cu deleted file mode 100644 index 78097b9fc..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64f/l1_bw_64f.cu +++ /dev/null @@ -1,135 +0,0 @@ -// This code is a modification of L1 cache benchmark from -//"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking": -// https://arxiv.org/pdf/1804.06826.pdf - -// This benchmark measures the maximum read bandwidth of L1 cache for 64 bit - -#include -#include -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_TIMES 256 -// array size is half the L1 size (2) * double size (8) -#define ARRAY_SIZE (L1_SIZE / 16) - -__global__ void l1_bw(uint64_t *startClk, uint64_t *stopClk, double *dsink, - double *posArray) { - - // thread index - uint32_t tid = threadIdx.x; - uint32_t uid = blockIdx.x * blockDim.x + tid; - - // a register to avoid compiler optimization - double sink0 = 0; - double sink1 = 0; - - // 
populate l1 cache to warm up - for (uint32_t i = tid; i < ARRAY_SIZE; i += blockDim.x) { - double *ptr = posArray + i; - // use ca modifier to cache the load in L1 - asm volatile("{\t\n" - ".reg .f64 data;\n\t" - "ld.global.ca.f64 data, [%1];\n\t" - "add.f64 %0, data, %0;\n\t" - "}" - : "+d"(sink0) - : "l"(ptr) - : "memory"); - } - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint64_t start = 0; - asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); - - // load data from l1 cache and accumulate - for (uint32_t j = 0; j < REPEAT_TIMES; j++) { - double *ptr = posArray + ((tid + (j * warpSize * 2)) % ARRAY_SIZE); - asm volatile("{\t\n" - ".reg .f64 data<2>;\n\t" - "ld.global.ca.f64 data0, [%2+0];\n\t" - "ld.global.ca.f64 data1, [%2+256];\n\t" - "add.f64 %0, data0, %0;\n\t" - "add.f64 %1, data1, %1;\n\t" - "}" - : "+d"(sink0), "+d"(sink1) - : "l"(ptr) - : "memory"); - } - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - uint64_t stop = 0; - asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); - - // write time and data back to memory - startClk[uid] = start; - stopClk[uid] = stop; - dsink[uid] = sink0 + sink1; -} - -int main() { - - intilizeDeviceProp(0); - - BLOCKS_NUM = 1; - TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; - THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM; - - // ARRAY_SIZE has to be less than L1_SIZE - assert(ARRAY_SIZE * sizeof(double) < L1_SIZE); - - uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - double *posArray = (double *)malloc(ARRAY_SIZE * sizeof(double)); - double *dsink = (double *)malloc(TOTAL_THREADS * sizeof(double)); - - uint64_t *startClk_g; - uint64_t *stopClk_g; - double *posArray_g; - double *dsink_g; - - for (uint32_t i = 0; i < ARRAY_SIZE; i++) - posArray[i] = (double)i; - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * 
sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(double))); - gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(double))); - - gpuErrchk(cudaMemcpy(posArray_g, posArray, ARRAY_SIZE * sizeof(double), - cudaMemcpyHostToDevice)); - - l1_bw<<>>(startClk_g, stopClk_g, dsink_g, - posArray_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(double), - cudaMemcpyDeviceToHost)); - - double bw, BW; - uint64_t total_time = - *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - - *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); - bw = (double)(REPEAT_TIMES * THREADS_PER_SM * sizeof(double) * 2) / - ((double)total_time); - BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; - std::cout << "L1 bandwidth = " << bw << "(byte/clk/SM), " << BW - << "(GB/s/SM)\n"; - std::cout << "Total Clk number = " << total_time << "\n"; - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64v/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64v/Makefile deleted file mode 100644 index 31b27b0b1..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64v/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = l1_bw_64v.cu - -EXE = l1_bw_64v - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64v/l1_bw_64v.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64v/l1_bw_64v.cu deleted file mode 100644 index 8f8b13ff0..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64v/l1_bw_64v.cu +++ /dev/null @@ -1,136 +0,0 @@ -/* This code is a modification of L1 cache benchmark from -"Dissecting the NVIDIA Volta GPU 
Architecture via Microbenchmarking": - https://arxiv.org/pdf/1804.06826.pdf - -This benchmark measures the maximum read bandwidth of L1 cache for 64-bit vector -*/ - -#include -#include -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_TIMES 256 -// array size is half the L1 size (2) * float size (4) -#define ARRAY_SIZE (L1_SIZE / 8) - -__global__ void l1_bw(uint64_t *startClk, uint64_t *stopClk, float *dsink, - float *posArray) { - - // thread index - uint32_t tid = threadIdx.x; - uint32_t uid = blockIdx.x * blockDim.x + tid; - - // a register to avoid compiler optimization - float sink0 = 0; - float sink1 = 0; - - // populate l1 cache to warm up - for (uint32_t i = tid * 2; i < ARRAY_SIZE; i += blockDim.x * 2) { - float *ptr = posArray + i; - // use ca modifier to cache the load in L1 - asm volatile("{\t\n" - ".reg .f32 data<2>;\n\t" - "ld.global.ca.v2.f32 {data0,data1}, [%2];\n\t" - "add.f32 %0, data0, %0;\n\t" - "add.f32 %1, data1, %1;\n\t" - "}" - : "+f"(sink0), "+f"(sink1) - : "l"(ptr) - : "memory"); - } - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint64_t start = 0; - asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); - - // load data from l1 cache and accumulate - for (uint32_t j = 0; j < REPEAT_TIMES; j++) { - float *ptr = posArray + ((tid * 2 + (j * warpSize * 2)) % ARRAY_SIZE); - asm volatile("{\t\n" - ".reg .f32 data<2>;\n\t" - "ld.global.ca.v2.f32 {data0,data1}, [%2];\n\t" - "add.f32 %0, data0, %0;\n\t" - "add.f32 %1, data1, %1;\n\t" - "}" - : "+f"(sink0), "+f"(sink1) - : "l"(ptr) - : "memory"); - } - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - uint64_t stop = 0; - asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); - - // write time and data back to memory - startClk[uid] = start; - stopClk[uid] = stop; - dsink[uid] = sink0 + sink1; -} - -int main() { - - intilizeDeviceProp(0); - - BLOCKS_NUM = 1; - 
TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; - THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM; - - // ARRAY_SIZE has to be less than L1_SIZE - assert(ARRAY_SIZE * sizeof(float) < L1_SIZE); - - uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - float *posArray = (float *)malloc(ARRAY_SIZE * sizeof(float)); - float *dsink = (float *)malloc(TOTAL_THREADS * sizeof(float)); - - uint64_t *startClk_g; - uint64_t *stopClk_g; - float *posArray_g; - float *dsink_g; - - for (uint32_t i = 0; i < ARRAY_SIZE; i++) - posArray[i] = (float)i; - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(float))); - gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(float))); - - gpuErrchk(cudaMemcpy(posArray_g, posArray, ARRAY_SIZE * sizeof(float), - cudaMemcpyHostToDevice)); - - l1_bw<<>>(startClk_g, stopClk_g, dsink_g, - posArray_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(float), - cudaMemcpyDeviceToHost)); - - double bw, BW; - uint64_t total_time = - *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - - *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); - bw = (double)(REPEAT_TIMES * THREADS_PER_SM * sizeof(float) * 2) / - ((double)total_time); - BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; - std::cout << "L1 bandwidth = " << bw << "(byte/clk/SM), " << BW - << "(GB/s/SM)\n"; - std::cout << "Total Clk number = " << total_time << "\n"; - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_config/Makefile 
b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_config/Makefile deleted file mode 100644 index 634228307..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_config/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = l1_config.cu - -EXE = l1_config - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_config/l1_config.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_config/l1_config.cu deleted file mode 100644 index 0b3bc9376..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_config/l1_config.cu +++ /dev/null @@ -1,114 +0,0 @@ -#include -#include -using namespace std; - -#include "../../../hw_def/hw_def.h" - -/* -We know the below information from running our ubench, we copy and paste the -ubench results below manullay -TODO: we will automate this process -*/ - -// We know cache line size from l1_assoc ubench -#define L1_CACHE_LINE_SIZE 128 - -// We know #sets from l1_assoc ubench (the l1 cache has 4 sets, since kepler -// and in volta and turing) -#define L1_CACHE_SETS 4 - -// we know sector size from l1_assoc and l1_acces_grain ubenches and has -// been consistent over generations, change it accordingly -#define L1_SECTOR_SIZE 32 - -// we know the mshr throughput from l1_mshr ubench -// we find that each warp can issue up to two pending cache lines (8 sector -// reqs) -#define L1_ACCESS_FACTOR L1_CACHE_LINE_SIZE / L1_SECTOR_SIZE -#define L1_MSHR_ENTRIES_PER_WARP L1_ACCESS_FACTOR * 2 - -// L1 cache cache in Volta and above is write allocate, subsector write, write- -// through we know that from l1_write_policy ubench and has been consistent -// after Volta. 
Change it accordingly if it changes in new generations -static const char *After_Volta_L1_Cache_Write_Policy = ",L:T:m:L:L,"; - -// L1 cache bfore Volta was write-no-allocate, write-evict with only local -// accsses to be write-back -static const char *Before_Volta_L1_Cache_Write_Policy = ",L:L:m:N:L,"; - -// Adaptive cache config option -static const char *SHMEM_ADAPTIVE_OPTION = "0,8,16,32,64"; - -int main() { - intilizeDeviceProp(0); - - if (ACCEL_SIM_MODE) { - - std::cout << "\n//Accel_Sim config: \n"; - - bool adaptive_cache; - string cache_write_string; - string adaptive_shmem_option_string; - unsigned write_cache_ratio; - unsigned unified_l1d_size_inKB; - unsigned config_l1_size; - // l1 cache is sector since pascal - char is_sector = (deviceProp.major >= 6) ? 'S' : 'N'; - // for volta and above, l1 is write allocate and adative - if (deviceProp.major >= 7) { - // configure based on min l1 cache - // l1 cache is adpative - adaptive_cache = true; - adaptive_shmem_option_string = SHMEM_ADAPTIVE_OPTION; - std::stringstream large_shmem_size; - unsigned shd_mem_inKB = deviceProp.sharedMemPerMultiprocessor / 1024; - large_shmem_size << "," << shd_mem_inKB; - adaptive_shmem_option_string += large_shmem_size.str(); - unified_l1d_size_inKB = L1_SIZE / 1024; - //increase unified cache by 32KB in case the shd is larger - //this case happens in Turing, we need to write ubench to get the exact size - if(unified_l1d_size_inKB <= shd_mem_inKB) - unified_l1d_size_inKB = unified_l1d_size_inKB + 32; - // set l1 write allocation policy (write allocate, write through) - cache_write_string = After_Volta_L1_Cache_Write_Policy; - // L1 write-to-read ratio (25%) based on rodinia kmeans workload - // benchmarking - write_cache_ratio = 25; - //always configure l1 as 32KB in adaptive cache - //accel-sim will adjust the assoc adpatively during run-time - config_l1_size = 32*1024; - //ensure unified cache is multiple of l1 cache size - assert((unified_l1d_size_inKB*1024) % 
config_l1_size == 0); - } else { - adaptive_cache = false; - cache_write_string = Before_Volta_L1_Cache_Write_Policy; - write_cache_ratio = 0; - unified_l1d_size_inKB = L1_SIZE / 1024; - config_l1_size = L1_SIZE; - } - - // lines per set - unsigned assoc = config_l1_size / L1_CACHE_LINE_SIZE / L1_CACHE_SETS; - - unsigned warps_num_per_sm = MAX_THREADS_PER_SM / WARP_SIZE; - // each warp can issue up to two pending cache lines (this is based on our - // l1_mshr ubench) - unsigned mshr = warps_num_per_sm * L1_MSHR_ENTRIES_PER_WARP; - - std::cout << "-gpgpu_adaptive_cache_config " << adaptive_cache << std::endl; - std::cout << "-gpgpu_shmem_option " << adaptive_shmem_option_string - << std::endl; - std::cout << "-gpgpu_unified_l1d_size " << unified_l1d_size_inKB << std::endl; - std::cout << "-gpgpu_l1_banks " << WARP_SCHEDS_PER_SM << std::endl; - std::cout << "-gpgpu_cache:dl1 " << is_sector << ":" << L1_CACHE_SETS << ":" - << L1_CACHE_LINE_SIZE << ":" << assoc << cache_write_string - << "A:" << mshr << ":" << warps_num_per_sm << ",16:0,32" - << std::endl; - std::cout << "-gpgpu_gmem_skip_L1D " << !deviceProp.globalL1CacheSupported - << std::endl; - std::cout << "-gpgpu_l1_cache_write_ratio " << write_cache_ratio - << std::endl; - } - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_lat/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_lat/Makefile deleted file mode 100644 index 31a4026db..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_lat/Makefile +++ /dev/null @@ -1,6 +0,0 @@ - -SRC = l1_lat.cu - -EXE = l1_lat - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_lat/l1_lat.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_lat/l1_lat.cu deleted file mode 100644 index 2ccae4806..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_lat/l1_lat.cu +++ /dev/null @@ -1,15 +0,0 @@ -#include "l1_lat.h" - -int main() { - - intilizeDeviceProp(0); - - float lat = 
l1_lat(); - - if (ACCEL_SIM_MODE) { - std::cout << "\n//Accel_Sim config: \n"; - std::cout << "-gpgpu_l1_latency " << (unsigned)lat << std::endl; - } - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_lat/l1_lat.h b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_lat/l1_lat.h deleted file mode 100644 index 408a50c54..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_lat/l1_lat.h +++ /dev/null @@ -1,119 +0,0 @@ -// This code is a modification of L1 cache benchmark from -//"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking": -// https://arxiv.org/pdf/1804.06826.pdf - -// This benchmark measures the latency of L1 cache - -#include -#include -#include -#include - -#include - -#include "../../../hw_def/hw_def.h" - -// Launch only one thread to calcaulte the latency using a pointer-chasing -// array technique -#define THREADS_NUM 1 -#define REPEAT_TIMES 32768 // iterate over the array ITERS times -#define ARRAY_SIZE 4096 // size of the array - -// Measure latency of ITERS reads. 
-__global__ void l1_lat(uint32_t *startClk, uint32_t *stopClk, - uint64_t *posArray, uint64_t *dsink) { - - // thread index - uint32_t tid = threadIdx.x; - - // one thread to initialize the pointer-chasing array - if (tid == 0) { - for (uint32_t i = 0; i < (ARRAY_SIZE - 1); i++) - posArray[i] = (uint64_t)(posArray + i + 1); - - posArray[ARRAY_SIZE - 1] = (uint64_t)posArray; - } - - if (tid < THREADS_NUM) { - // a register to avoid compiler optimization - uint64_t *ptr = posArray + tid; - uint64_t ptr1, ptr0; - - // initialize the thread pointer with the start address of the array - // use ca modifier to cache the in L1 - asm volatile("{\t\n" - "ld.global.ca.u64 %0, [%1];\n\t" - "}" - : "=l"(ptr1) - : "l"(ptr) - : "memory"); - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint32_t start = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - - // pointer-chasing ITERS times - // use ca modifier to cache the load in L1 - for (uint32_t i = 0; i < REPEAT_TIMES; ++i) { - asm volatile("{\t\n" - "ld.global.ca.u64 %0, [%1];\n\t" - "}" - : "=l"(ptr0) - : "l"((uint64_t *)ptr1) - : "memory"); - ptr1 = ptr0; // swap the register for the next load - } - - // stop timing - uint32_t stop = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // write time and data back to memory - startClk[tid] = start; - stopClk[tid] = stop; - dsink[tid] = ptr1; - } -} - -float l1_lat() { - intilizeDeviceProp(0); - - BLOCKS_NUM = 1; - TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; - THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; - - assert(ARRAY_SIZE * sizeof(uint64_t) < L1_SIZE); - - uint32_t *startClk = (uint32_t *)malloc(THREADS_NUM * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(THREADS_NUM * sizeof(uint32_t)); - uint64_t *dsink = (uint64_t *)malloc(THREADS_NUM * sizeof(uint64_t)); - - uint32_t *startClk_g; - uint32_t *stopClk_g; - uint64_t *posArray_g; - uint64_t *dsink_g; - - gpuErrchk(cudaMalloc(&startClk_g, 
THREADS_NUM * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, THREADS_NUM * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&dsink_g, THREADS_NUM * sizeof(uint64_t))); - - l1_lat<<<1, THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g, dsink_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, THREADS_NUM * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, THREADS_NUM * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(dsink, dsink_g, THREADS_NUM * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - - float lat = (float)(stopClk[0] - startClk[0]) / REPEAT_TIMES; - printf("L1 Latency = %12.4f cycles\n", lat); - printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); - - return lat; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/Makefile deleted file mode 100644 index 9fdfa43be..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = l1_mshr.cu - -EXE = l1_mshr - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/l1_mshr.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/l1_mshr.cu deleted file mode 100644 index 8d0418fda..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/l1_mshr.cu +++ /dev/null @@ -1,147 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -__global__ void l1_mshr(uint64_t *timing, uint32_t *dsink, uint32_t *posArray, - uint32_t stride, uint64_t array_size, - uint32_t iteration) { - - // thread index - uint32_t tid = threadIdx.x; - uint32_t bid = blockIdx.x; - uint32_t uid = bid * blockDim.x + tid; - // uint32_t n_threads = blockDim.x * gridDim.x; - // uint32_t record_length = 
MAX_SHARED_MEM_SIZE_PER_BLOCK/8; - - extern __shared__ uint32_t t_val[]; // size of shared memory - - uint32_t pointer; - pointer = uid * 1024; - asm volatile("bar.sync 0;"); - - // pointer chasing - for (int itr = 0; itr < iteration; itr++) { - pointer = posArray[pointer]; - t_val[uid * iteration + itr] = clock64(); - } - // pointer chasing completed - for (uint32_t i = 0; i < iteration; i++) { - timing[uid * iteration + i] = t_val[uid * iteration + i]; - } - - dsink[uid] = pointer; -} - -void l1_structure(uint32_t stride, uint64_t array_size, - int shared_mem_size_byte, uint32_t iteration) { - - std::ostringstream oss; - oss << "MSHR" << stride << "_array" << array_size << "_shmem" - << (shared_mem_size_byte / 4) << "_itr" << iteration << ".csv"; - std::string filename = oss.str(); - std::ofstream myfile(filename); - - std::cout << "Launching L1 MSHR ubench" << std::endl; - - uint64_t *timing = - (uint64_t *)malloc(TOTAL_THREADS * iteration * sizeof(uint64_t)); - uint32_t *dsink = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint32_t *posArray = (uint32_t *)malloc(array_size * sizeof(uint32_t)); - // uint32_t *val_array = (uint32_t*) malloc(array_size*sizeof(uint32_t)); - - for (uint32_t i = 0; i < array_size; i++) - posArray[i] = (i + stride) % array_size; - - uint64_t *timing_g; - uint32_t *dsink_g; - uint32_t *posArray_g; - - gpuErrchk( - cudaMalloc(&timing_g, TOTAL_THREADS * iteration * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&posArray_g, array_size * sizeof(uint32_t))); - - gpuErrchk(cudaMemcpy(posArray_g, posArray, array_size * sizeof(uint32_t), - cudaMemcpyHostToDevice)); - - // cudaFuncSetAttribute(l1_mshr, - // cudaFuncAttributePreferredSharedMemoryCarveout, 100); //set shared memory - // size - cudaFuncSetAttribute(l1_mshr, cudaFuncAttributeMaxDynamicSharedMemorySize, - shared_mem_size_byte); - l1_mshr<<>>( - timing_g, dsink_g, posArray_g, stride, array_size, iteration); 
- - // gpuErrchk( cudaPeekAtLastError() ); - - gpuErrchk(cudaMemcpy(timing, timing_g, - TOTAL_THREADS * iteration * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - - myfile << "thread_num,timing1,timing2,timing3,timing4,timing5,timing6\n"; - for (uint32_t thr = 0; thr < TOTAL_THREADS; thr += 32) { - for (uint32_t itr = 0; itr < iteration; itr++) { - if (itr != 0) { - myfile << ","; - - } else { - myfile << thr << ","; - } - myfile << timing[thr * iteration + itr]; - } - myfile << "\n"; - } - - free(timing); - free(dsink); - free(posArray); - gpuErrchk(cudaFree(timing_g)); - gpuErrchk(cudaFree(dsink_g)); - gpuErrchk(cudaFree(posArray_g)); - - myfile.close(); - std::cout << "Saving L1 MSHR data at " << filename << std::endl; - - return; -} - -int main() { - intilizeDeviceProp(0); - - BLOCKS_NUM = 1; - TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; - THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM; - - uint32_t stride, iteration; - int shared_mem_size_byte = MAX_SHARED_MEM_SIZE_PER_BLOCK; - /* - #ifdef VOLTA_HW_DEF_H - uint32_t l1_cache_size = L1_SIZE-shared_mem_size_byte; //volta - sharedmem is a partition of L1 #else uint32_t l1_cache_size = L1_SIZE; - #endif - */ - // measure line size and mshr - stride = 100; - iteration = 6; - uint64_t array_size = 1024 * 1024 * 1024; - l1_structure(stride, array_size, shared_mem_size_byte, iteration); - - /* - //measure associativity - stride = 8; - iteration = 1; - for (array_size=l1_cache_size/4; array_size<(l1_cache_size+512)/4; - array_size++){ - //l1_structure (stride, array_size, shared_mem_size_byte, iteration); - } - */ - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/mshr.xlsx b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/mshr.xlsx deleted file mode 100644 index a8a130503..000000000 Binary files a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/mshr.xlsx and /dev/null 
differ diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_sector/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_sector/Makefile deleted file mode 100644 index c865f2de3..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_sector/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = l1_sector.cu - -EXE = l1_sector - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_sector/l1_sector.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_sector/l1_sector.cu deleted file mode 100644 index 0d2aaedbf..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_sector/l1_sector.cu +++ /dev/null @@ -1,134 +0,0 @@ -// Is L1 sector? - -#include -#include -#include -#include -#include -using namespace std; - -#define L1_SIZE_FLOAT L1_SIZE / 4 -// allocate large array size, larger than L1 size -#define ARRAY_SIZE L1_SIZE_FLOAT * 2 -// we know the sector size is 8 floats (32B) from the l1_access_grain ubench -#define SECTOR_SIZE 8 - -#include "../../../hw_def/hw_def.h" - -__global__ void l1_sector(uint32_t *startClk, uint32_t *stopClk, float *dsink, - float *posArray) { - - // thread index - uint32_t tid = threadIdx.x; - uint32_t uid = blockIdx.x * blockDim.x + tid; - - // a register to avoid compiler optimization - float sink0 = 0; - - // populate l1 cache to warm up - for (uint32_t i = tid; i < L1_SIZE_FLOAT; i += blockDim.x) { - float *ptr = posArray + i; - // use ca modifier to cache the load in L1 - asm volatile("{\t\n" - ".reg .f32 data;\n\t" - "ld.global.ca.f32 data, [%1];\n\t" - "add.f32 %0, data, %0;\n\t" - "}" - : "+f"(sink0) - : "l"(ptr) - : "memory"); - } - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // kicks out one of the cache line and read a sector - if (uid == 0) { - sink0 += posArray[L1_SIZE_FLOAT + 1]; - } - - asm volatile("bar.sync 0;"); - - uint32_t start = 0; - uint32_t stop = 0; - - // start timing - asm volatile("mov.u32 %0, %%clock;" : 
"=r"(start)::"memory"); - - // load data from l1 cache and accumulate - float *ptr = posArray + tid * SECTOR_SIZE; - asm volatile("{\t\n" - ".reg .f32 data;\n\t" - "ld.global.ca.f32 data, [%1];\n\t" - "add.f32 %0, data, %0;\n\t" - "}" - : "+f"(sink0) - : "l"(ptr) - : "memory"); - - // stop timing - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // write time and data back to memory - startClk[uid] = start; - stopClk[uid] = stop; - dsink[uid] = sink0; -} - -int main() { - intilizeDeviceProp(0); - - BLOCKS_NUM = 1; - TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; - THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM; - - uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - float *posArray = (float *)malloc(ARRAY_SIZE * sizeof(float)); - float *dsink = (float *)malloc(TOTAL_THREADS * sizeof(float)); - - uint32_t *startClk_g; - uint32_t *stopClk_g; - float *posArray_g; - float *dsink_g; - - for (uint32_t i = 0; i < ARRAY_SIZE; i++) - posArray[i] = (float)i; - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(float))); - gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(float))); - - gpuErrchk(cudaMemcpy(posArray_g, posArray, ARRAY_SIZE * sizeof(float), - cudaMemcpyHostToDevice)); - - std::cout << "Launching L1 sector ubench" << std::endl; - - l1_sector<<>>(startClk_g, stopClk_g, dsink_g, - posArray_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(float), - cudaMemcpyDeviceToHost)); - 
- ofstream myfile; - myfile.open("data.csv"); - myfile << "sectror_id, lat" << endl; - for (unsigned i = 0; i < TOTAL_THREADS; i++) { - myfile << i << "," << stopClk[i] - startClk[i] << endl; - } - - std::cout << "Saving L1 sector data at data.csv" << std::endl; - - myfile.close(); - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_shared_bw/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_shared_bw/Makefile deleted file mode 100644 index e5c3c78ec..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_shared_bw/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = l1_shared_bw.cu - -EXE = l1_shared_bw - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_shared_bw/l1_shared_bw.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_shared_bw/l1_shared_bw.cu deleted file mode 100644 index e16b2124d..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_shared_bw/l1_shared_bw.cu +++ /dev/null @@ -1,119 +0,0 @@ -// This benchmark measures the maximum read bandwidth of shared memory and L1 at -// the same time - -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -// array size is half the L1 size (2) * float size (4) -#define ARRAY_SIZE (L1_SIZE / 8) -// 32 KB of shd memory -#define SHARED_MEM_SIZE (32 * 1024 / 4) -#define ITERS 4096 - -__global__ void shared_bw(uint32_t *startClk, uint32_t *stopClk, - uint32_t *dsink, uint32_t *l1, uint32_t stride) { - - // thread index - uint32_t tid = threadIdx.x; - uint32_t bid = blockIdx.x; - uint32_t uid = bid * blockDim.x + tid; - uint32_t n_threads = blockDim.x * gridDim.x; - - register uint32_t tmp_s = uid; - register uint32_t tmp_l1 = uid; - uint32_t start = 0; - uint32_t stop = 0; - - __shared__ uint32_t s[SHARED_MEM_SIZE]; // static shared memory - // uint32_t s[SHARED_MEM_SIZE]; - // one thread to initialize the pointer-chasing array - for (uint32_t i = uid; i < (SHARED_MEM_SIZE); i += n_threads) - 
s[i] = (i + stride + 7) % SHARED_MEM_SIZE; - - // warmup l1 cache - for (uint32_t i = 0; i < ARRAY_SIZE; ++i) { - tmp_l1 = l1[tmp_l1]; - } - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - - // load data from l1 cache and accumulate - for (uint32_t i = 0; i < ITERS; ++i) { - tmp_s = s[tmp_s]; - tmp_l1 = l1[tmp_l1]; - } - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // sink0 = tmp; - // write time and data back to memory - startClk[uid] = start; - stopClk[uid] = stop; - dsink[uid] = tmp_s + tmp_l1; -} - -int main() { - intilizeDeviceProp(0); - - BLOCKS_NUM = 1; - TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; - THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM; - - assert(SHARED_MEM_SIZE * sizeof(uint32_t) < MAX_SHARED_MEM_SIZE_PER_BLOCK); - - uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint32_t *dsink = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - - uint32_t *posArray = (uint32_t *)malloc(ARRAY_SIZE * sizeof(uint32_t)); - - uint32_t stride = 1024; - - for (uint32_t i = 0; i < ARRAY_SIZE; i++) - posArray[i] = (i + stride + 1) % ARRAY_SIZE; - - uint32_t *posArray_g; - - uint32_t *startClk_g; - uint32_t *stopClk_g; - uint32_t *dsink_g; - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(uint32_t))); - - gpuErrchk(cudaMemcpy(posArray_g, posArray, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyHostToDevice)); - - shared_bw<<>>(startClk_g, stopClk_g, dsink_g, - posArray_g, stride); - gpuErrchk(cudaPeekAtLastError()); - - 
gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - - double bw; - bw = (double)(ITERS * TOTAL_THREADS * 4 * 2) / - ((double)(stopClk[0] - startClk[0])); - printf("Shared Memory Bandwidth = %f (byte/clk/SM)\n", bw); - printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_write_policy/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_write_policy/Makefile deleted file mode 100644 index 7855682c9..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_write_policy/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -SRC = l1_write_policy.cu - -EXE = l1_write_policy - -NVCC_FLGAS = -Xptxas -dlcm=ca - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_write_policy/l1_write_policy.cu b/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_write_policy/l1_write_policy.cu deleted file mode 100644 index c92ef1572..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l1_cache/l1_write_policy/l1_write_policy.cu +++ /dev/null @@ -1,115 +0,0 @@ -/* -This microbenchmark detects L1 write policy -check the nvprof or nvsight for received l1 reads and writes to detect the -policy check the comments below for further details and also see our arvix -paper: https://arxiv.org/pdf/1810.07269.pdf - - to run the program with nvsight - make nvsight ./l1_write_policy -*/ - -#include -#include -#include -#include -using namespace std; - -#include "../../../hw_def/hw_def.h" - -#define THREADS_NUM 1 // Launch only one thread -#define ARRAY_SIZE 1024 // size of the array - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Device code - 
-/* -check the nvprof or nvsight to see the L1 reads and write hits -in the below mb, we have 6 reads and 4 writes - -1. Check the write allocation policy - we have three policies: write no-allocate vs write-allocate fetch-on-write vs -vs write-allocate sub-sector write?? if only two write hits (C[i] and A[i] at -lines 3&4) ==> then write no-allocate else if three write hits (C[i+1], C[i] and -A[i] at lines 2&3&4) then it is write-allocate But is it fect-on-write or -sub-sector fetch-on-read ? if one read miss (A[i] at line1) and 5 reads hits ==> -then fetch-on-write, as the miss at line1 will fetch the whole sector, and C[i] -and line 3 is hit else if two read misses (A[i] at lines 1 and C[i] at line 3) -==> then sub-sector write with write bit-mask, the sector will be fetched on -read miss, not write miss - -2. check if write-back or write-through -check the L2 writes, if four writes are received ==> then write-through -if less than four writes ==> then write-back - -to run the program with nvsight - make nvsight ./l1_write_policy -stats to look at: -l1 reads: l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum -l1 writes:l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum -l1 read hits:l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_hit.sum -l1 write hits: l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_hit.sum - -The comments below shows a case of write-allocate with sub-sector mask as in -Pascal, Volta, Turing and Ampere HW Results found: Pascal: write-no allocate + -write-through Volta, Turing and Ampere: write allocate & sub-sector write + -write-through -*/ - -__global__ void write_policy_mb(float *A, float *C) { - int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i == 0) { - C[i] = A[i]; // write to C[i] is a miss (cache line is missing) - C[i + 1] = A[i]; // write to C[i+1] is a hit (cache line is found) - C[i] = C[i] + A[i]; // read of C[i] is a miss (entire sector is missing, - // fetch it from memory) - A[i] = - C[i] + C[i + 1]; // read C[i] and C[i+1] 
are hits (entire sector exists) - } -} - -////////////////////////////////////////////////////// -int main(int argc, char *argv[]) { - intilizeDeviceProp(0); - - BLOCKS_NUM = 1; - TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; - THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; - - // create 4KB buffers of A&C - assert(ARRAY_SIZE * sizeof(float) < L1_SIZE); - - float *A = (float *)malloc(ARRAY_SIZE * sizeof(float)); - float *C = (float *)malloc(ARRAY_SIZE * sizeof(float)); - - float *A_g; - float *C_g; - - gpuErrchk(cudaMalloc(&A_g, ARRAY_SIZE * sizeof(float))); - gpuErrchk(cudaMalloc(&C_g, ARRAY_SIZE * sizeof(float))); - - for (uint32_t i = 0; i < ARRAY_SIZE; i++) - A[i] = (float)i; - - gpuErrchk( - cudaMemcpy(A_g, A, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice)); - - write_policy_mb<<<1, THREADS_NUM>>>(A_g, C_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(C, C_g, ARRAY_SIZE * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - - std::cout << "\nThis microbenchmark detects L1 write policy.\n"; - std::cout << "check the nvprof or nvsight for received l1 reads and writes " - "to detect the policy.\n"; - std::cout << "see the code comments for further details\n"; - std::cout - << "to run the program with nvsight: make nvsight ./l1_write_policy\n"; - std::cout - << "stats to look at: l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum & " - "l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum & " - "l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_hit.sum & " - "l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_hit.sum \n\n"; - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_access_grain/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_access_grain/Makefile deleted file mode 100644 index 810ff0e80..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_access_grain/Makefile +++ /dev/null @@ -1,8 +0,0 @@ - -SRC = l2_access_grain.cu - -EXE = l2_access_grain - -NVCC_FLGAS = -Xptxas -dlcm=cg - -include 
../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_access_grain/l2_access_grain.cu b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_access_grain/l2_access_grain.cu deleted file mode 100644 index 1c6bac182..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_access_grain/l2_access_grain.cu +++ /dev/null @@ -1,105 +0,0 @@ -/* -This benchmark measures l2 access granularity for differnet strides -check the nvprof or nvsight for received l2 reads and writes -for further details, see our arvix paper: https://arxiv.org/pdf/1810.07269.pdf - -Compile this file using the following command to disable L1 cache: - nvcc -Xptxas -dlcm=cg l2_sector_grain.cu - -run the program with nsight - make nvsight ./l2_access_grain - - Result: All Nvidia HW generation since kepler has 32B access granularity - */ - -#include -#include -#include -#include -using namespace std; - -#include "../../../hw_def/hw_def.h" - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Device code -__global__ void l2_stride_cons(const float *A, float *C, int stride) - -{ - - int i = blockDim.x * blockIdx.x + threadIdx.x; - - C[i * stride] = A[i * stride]; -} - -__global__ void l2_stride(const float *A, float *C, int stride) - -{ - - int i = blockDim.x * blockIdx.x + threadIdx.x; - - C[((i / stride) * 32) + (i % stride)] = A[((i / stride) * 32) + (i % stride)]; -} - -// Host code -void l2_stride(int N, int threadsPerBlock, int stride) { - // Variables - float *h_A; - float *h_C; - - float *d_A; - float *d_C; - - size_t size = N * sizeof(float) * 32; - - // Allocate input vectors h_A and h_B in host memory - h_A = (float *)malloc(size); - h_C = (float *)malloc(size); - - // fill array - for (uint32_t i = 0; i < N; i++) - h_A[i] = (float)i; - - // Allocate vectors in device memory - gpuErrchk(cudaMalloc((void **)&d_A, size)); - gpuErrchk(cudaMalloc((void **)&d_C, size)); - - // Copy vectors from 
host memory to device memory - gpuErrchk(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice)); - - // Invoke kernel - int blocksPerGrid = ((N + threadsPerBlock - 1) / threadsPerBlock); - - l2_stride<<>>(d_A, d_C, stride); - gpuErrchk(cudaPeekAtLastError()); - - // Copy result from device memory to host memory - // h_C contains the result in host memory - gpuErrchk(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost)); - - // Free device memory - if (d_A) - cudaFree(d_A); - if (d_C) - cudaFree(d_C); - - // Free host memory - if (h_A) - free(h_A); - if (h_C) - free(h_C); -} -////////////////////////////////////////////////////// -int main(int argc, char *argv[]) { - intilizeDeviceProp(0); - - for (int i = 1; i <= WARP_SIZE; ++i) - l2_stride(WARP_SIZE, WARP_SIZE, i); - - std::cout << "\nThis benchmark measures l2 access granularity for differnet " - "strides.\n"; - std::cout << "check the nvprof or nvsight for received l2 reads and write.\n"; - std::cout - << "to run the program with nsight: make nvsight ./l2_access_grain\n"; - std::cout << "stats to look at: lts__t_sectors_srcunit_tex_op_read.sum and " - "lts__t_sectors_srcunit_tex_op_write.sum \n\n"; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_128/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_128/Makefile deleted file mode 100644 index 39ad775b7..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_128/Makefile +++ /dev/null @@ -1,8 +0,0 @@ - -SRC = l2_bw_128.cu - -EXE = l2_bw_128 - -NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_128/l2_bw_128.cu b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_128/l2_bw_128.cu deleted file mode 100644 index f12591f45..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_128/l2_bw_128.cu +++ /dev/null @@ -1,148 +0,0 @@ -// This code is a modification of L2 cache benchmark from -//"Dissecting the NVIDIA Volta 
GPU Architecture via Microbenchmarking": -// https://arxiv.org/pdf/1804.06826.pdf - -// This benchmark measures the maximum read bandwidth of L2 cache for 32f -// Compile this file using the following command to disable L1 cache: -// nvcc -Xptxas -dlcm=cg -Xptxas -dscm=wt l2_bw.cu - -#include -#include -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_TIMES 256 - -/* -L2 cache is warmed up by loading posArray and adding sink -Start timing after warming up -Load posArray and add sink to generate read traffic -Repeat the previous step while offsetting posArray by one each iteration -Stop timing and store data -*/ - -__global__ void l2_bw(uint64_t *startClk, uint64_t *stopClk, float *dsink, - float *posArray, unsigned ARRAY_SIZE) { - // block and thread index - uint32_t tid = threadIdx.x; - uint32_t bid = blockIdx.x; - uint32_t uid = bid * blockDim.x + tid; - - // a register to avoid compiler optimization - float sink0 = 0; - float sink1 = 0; - float sink2 = 0; - float sink3 = 0; - - // warm up l2 cache - for (uint32_t i = uid; i < ARRAY_SIZE; i += blockDim.x * gridDim.x) { - float *ptr = posArray + i; - // every warp loads all data in l2 cache - // use cg modifier to cache the load in L2 and bypass L1 - asm volatile("{\t\n" - ".reg .f32 data;\n\t" - "ld.global.cg.f32 data, [%1];\n\t" - "add.f32 %0, data, %0;\n\t" - "}" - : "+f"(sink0) - : "l"(ptr) - : "memory"); - } - - asm volatile("bar.sync 0;"); - - // start timing - uint64_t start = 0; - asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); - - // load data from l2 cache and accumulate, - for (uint32_t i = 0; i < REPEAT_TIMES; i++) { - float *ptr = posArray + (((i * warpSize * 4) + uid * 4) % ARRAY_SIZE); - asm volatile("{\t\n" - ".reg .f32 data<4>;\n\t" - "ld.global.cg.v4.f32 {data0,data1,data2,data3}, [%4];\n\t" - "add.f32 %0, data0, %0;\n\t" - "add.f32 %1, data1, %1;\n\t" - "add.f32 %2, data2, %2;\n\t" - "add.f32 %3, data3, %3;\n\t" - "}" - : 
"+f"(sink0), "+f"(sink1), "+f"(sink2), "+f"(sink3) - : "l"(ptr) - : "memory"); - } - asm volatile("bar.sync 0;"); - - // stop timing - uint64_t stop = 0; - asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); - - // store the result - startClk[uid] = start; - stopClk[uid] = stop; - dsink[uid] = sink0 + sink1 + sink2 + sink3; -} - -int main() { - - intilizeDeviceProp(0); - - unsigned ARRAY_SIZE = TOTAL_THREADS * 4 + REPEAT_TIMES * WARP_SIZE * 4; - // Array size must not exceed L2 size - assert(ARRAY_SIZE * sizeof(float) < L2_SIZE); - - uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - - float *posArray = (float *)malloc(ARRAY_SIZE * sizeof(float)); - float *dsink = (float *)malloc(TOTAL_THREADS * sizeof(float)); - - float *posArray_g; - float *dsink_g; - uint64_t *startClk_g; - uint64_t *stopClk_g; - - assert(ARRAY_SIZE < L2_SIZE); - - for (int i = 0; i < ARRAY_SIZE; i++) - posArray[i] = (float)i; - - gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(float))); - gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(float))); - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); - - gpuErrchk(cudaMemcpy(posArray_g, posArray, ARRAY_SIZE * sizeof(float), - cudaMemcpyHostToDevice)); - - l2_bw<<>>(startClk_g, stopClk_g, dsink_g, - posArray_g, ARRAY_SIZE); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(float), - cudaMemcpyDeviceToHost)); - - float bw, BW; - unsigned long long data = - (unsigned long long)TOTAL_THREADS * REPEAT_TIMES * sizeof(float) * 4; - uint64_t total_time = stopClk[0] - startClk[0]; - bw = 
(float)(data) / ((float)(stopClk[0] - startClk[0])); - BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; - std::cout << "L2 bandwidth = " << bw << "(byte/clk), " << BW << "(GB/s)\n"; - float max_bw = get_num_channels(MEM_BITWIDTH, DRAM_MODEL) * - L2_BANKS_PER_MEM_CHANNEL * L2_BANK_WIDTH_in_BYTE; - BW = max_bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; - std::cout << "Max Theortical L2 bandwidth = " << max_bw << "(byte/clk), " - << BW << "(GB/s)\n"; - std::cout << "L2 BW achievable = " << (bw / max_bw) * 100 << "%\n"; - std::cout << "Total Clk number = " << total_time << "\n"; - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/Makefile deleted file mode 100644 index 37f8c3a92..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -SRC = l2_bw_32f.cu - -EXE = l2_bw_32f - -NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu deleted file mode 100644 index 9d4aa80d7..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu +++ /dev/null @@ -1,141 +0,0 @@ -// This code is a modification of L2 cache benchmark from -//"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking": -// https://arxiv.org/pdf/1804.06826.pdf - -// This benchmark measures the maximum read bandwidth of L2 cache for 32f -// Compile this file using the following command to disable L1 cache: -// nvcc -Xptxas -dlcm=cg -Xptxas -dscm=wt l2_bw.cu - -#include -#include -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_TIMES 2048 - -/* -L2 cache is warmed up by loading posArray and adding sink -Start timing after warming up -Load posArray and add sink to generate read traffic 
-Repeat the previous step while offsetting posArray by one each iteration -Stop timing and store data -*/ - -__global__ void l2_bw(uint64_t *startClk, uint64_t *stopClk, float *dsink, - float *posArray, unsigned ARRAY_SIZE) { - // block and thread index - uint32_t tid = threadIdx.x; - uint32_t bid = blockIdx.x; - uint32_t uid = bid * blockDim.x + tid; - - // a register to avoid compiler optimization - float sink = 0; - - // warm up l2 cache - for (uint32_t i = uid; i < ARRAY_SIZE; i += blockDim.x * gridDim.x) { - float *ptr = posArray + i; - // every warp loads all data in l2 cache - // use cg modifier to cache the load in L2 and bypass L1 - asm volatile("{\t\n" - ".reg .f32 data;\n\t" - "ld.global.cg.f32 data, [%1];\n\t" - "add.f32 %0, data, %0;\n\t" - "}" - : "+f"(sink) - : "l"(ptr) - : "memory"); - } - - asm volatile("bar.sync 0;"); - - // start timing - uint64_t start = 0; - asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); - - // load data from l2 cache and accumulate, - for (uint32_t i = 0; i < REPEAT_TIMES; i++) { - float *ptr = posArray + (i * warpSize) + uid; - asm volatile("{\t\n" - ".reg .f32 data;\n\t" - "ld.global.cg.f32 data, [%1];\n\t" - "add.f32 %0, data, %0;\n\t" - "}" - : "+f"(sink) - : "l"(ptr) - : "memory"); - } - asm volatile("bar.sync 0;"); - - // stop timing - uint64_t stop = 0; - asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); - - // store the result - startClk[bid * blockDim.x + tid] = start; - stopClk[bid * blockDim.x + tid] = stop; - dsink[bid * blockDim.x + tid] = sink; -} - -int main() { - intilizeDeviceProp(0); - - unsigned ARRAY_SIZE = TOTAL_THREADS + REPEAT_TIMES * WARP_SIZE; - assert(ARRAY_SIZE * sizeof(float) < - L2_SIZE); // Array size must not exceed L2 size - - uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - - float *posArray = (float *)malloc(ARRAY_SIZE * sizeof(float)); - float *dsink = (float 
*)malloc(TOTAL_THREADS * sizeof(float)); - - float *posArray_g; - float *dsink_g; - uint64_t *startClk_g; - uint64_t *stopClk_g; - - for (int i = 0; i < ARRAY_SIZE; i++) - posArray[i] = (float)i; - - gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(float))); - gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(float))); - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); - - gpuErrchk(cudaMemcpy(posArray_g, posArray, ARRAY_SIZE * sizeof(float), - cudaMemcpyHostToDevice)); - - l2_bw<<>>(startClk_g, stopClk_g, dsink_g, - posArray_g, ARRAY_SIZE); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(float), - cudaMemcpyDeviceToHost)); - - float bw, BW; - unsigned long long data = - (unsigned long long)TOTAL_THREADS * REPEAT_TIMES * sizeof(float); - uint64_t total_time = stopClk[0] - startClk[0]; - // uint64_t total_time = - // *std::max_element(&stopClk[0],&stopClk[TOTAL_THREADS])-*std::min_element(&startClk[0],&startClk[TOTAL_THREADS]); - bw = (float)(data) / ((float)(total_time)); - BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; - std::cout << "L2 bandwidth = " << bw << "(byte/clk), " << BW << "(GB/s)\n"; - float max_bw = get_num_channels(MEM_BITWIDTH, DRAM_MODEL) * - L2_BANKS_PER_MEM_CHANNEL * L2_BANK_WIDTH_in_BYTE; - BW = max_bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; - std::cout << "Max Theortical L2 bandwidth = " << max_bw << "(byte/clk), " - << BW << "(GB/s)\n"; - std::cout << "L2 BW achievable = " << (bw / max_bw) * 100 << "%\n"; - std::cout << "Total Clk number = " << total_time << "\n"; - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_64f/Makefile 
b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_64f/Makefile deleted file mode 100644 index 131ec359c..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_64f/Makefile +++ /dev/null @@ -1,8 +0,0 @@ - -SRC = l2_bw_64f.cu - -EXE = l2_bw_64f - -NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_64f/l2_bw_64f.cu b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_64f/l2_bw_64f.cu deleted file mode 100644 index 64f69c506..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_bw_64f/l2_bw_64f.cu +++ /dev/null @@ -1,140 +0,0 @@ -// This code is a modification of L2 cache benchmark from -//"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking": -// https://arxiv.org/pdf/1804.06826.pdf - -// This benchmark measures the maximum read bandwidth of L2 cache for 64 bit -// Compile this file using the following command to disable L1 cache: -// nvcc -Xptxas -dlcm=cg -Xptxas -dscm=wt l2_bw.cu - -#include -#include -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define REPEAT_TIMES 2048 - -/* -L2 cache is warmed up by loading posArray and adding sink -Start timing after warming up -Load posArray and add sink to generate read traffic -Repeat the previous step while offsetting posArray by one each iteration -Stop timing and store data -*/ - -__global__ void l2_bw(uint32_t *startClk, uint32_t *stopClk, double *dsink, - double *posArray, unsigned ARRAY_SIZE) { - // block and thread index - uint32_t tid = threadIdx.x; - uint32_t bid = blockIdx.x; - uint32_t uid = bid * blockDim.x + tid; - - // a register to avoid compiler optimization - double sink = 0; - - // warm up l2 cache - for (uint32_t i = uid; i < ARRAY_SIZE; i += blockDim.x * gridDim.x) { - double *ptr = posArray + i; - // every warp loads all data in l2 cache - // use cg modifier to cache the load in L2 and bypass L1 - asm volatile("{\t\n" - ".reg 
.f64 data;\n\t" - "ld.global.cg.f64 data, [%1];\n\t" - "add.f64 %0, data, %0;\n\t" - "}" - : "+d"(sink) - : "l"(ptr) - : "memory"); - } - - asm volatile("bar.sync 0;"); - - // start timing - uint32_t start = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - - // benchmark starts - // load data from l2 cache and accumulate, - for (uint32_t i = 0; i < REPEAT_TIMES; i++) { - double *ptr = posArray + (i * warpSize) + uid; - asm volatile("{\t\n" - ".reg .f64 data;\n\t" - "ld.global.cg.f64 data, [%1];\n\t" - "add.f64 %0, data, %0;\n\t" - "}" - : "+d"(sink) - : "l"(ptr) - : "memory"); - } - asm volatile("bar.sync 0;"); - - // stop timing - uint32_t stop = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // store the result - startClk[bid * blockDim.x + tid] = start; - stopClk[bid * blockDim.x + tid] = stop; - dsink[bid * blockDim.x + tid] = sink; -} - -int main() { - - intilizeDeviceProp(0); - - unsigned ARRAY_SIZE = TOTAL_THREADS + REPEAT_TIMES * WARP_SIZE; - // Array size must not exceed L2 size - assert(ARRAY_SIZE * sizeof(double) < L2_SIZE); - - uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - - double *posArray = (double *)malloc(ARRAY_SIZE * sizeof(double)); - double *dsink = (double *)malloc(TOTAL_THREADS * sizeof(double)); - - double *posArray_g; - double *dsink_g; - uint32_t *startClk_g; - uint32_t *stopClk_g; - - for (int i = 0; i < ARRAY_SIZE; i++) - posArray[i] = (double)i; - - gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(double))); - gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(double))); - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t))); - - gpuErrchk(cudaMemcpy(posArray_g, posArray, ARRAY_SIZE * sizeof(double), - cudaMemcpyHostToDevice)); - - l2_bw<<>>(startClk_g, stopClk_g, dsink_g, - posArray_g, 
ARRAY_SIZE); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(double), - cudaMemcpyDeviceToHost)); - - float bw, BW; - unsigned long long data = - (unsigned long long)TOTAL_THREADS * REPEAT_TIMES * sizeof(double); - uint64_t total_time = stopClk[0] - startClk[0]; - bw = (float)(data) / ((float)(total_time)); - BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; - std::cout << "L2 bandwidth = " << bw << "(byte/clk), " << BW << "(GB/s)\n"; - float max_bw = get_num_channels(MEM_BITWIDTH, DRAM_MODEL) * - L2_BANKS_PER_MEM_CHANNEL * L2_BANK_WIDTH_in_BYTE; - BW = max_bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; - std::cout << "Max Theortical L2 bandwidth = " << max_bw << "(byte/clk), " - << BW << "(GB/s)\n"; - std::cout << "L2 BW achievable = " << (bw / max_bw) * 100 << "%\n"; - std::cout << "Total Clk number = " << total_time << "\n"; - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_config/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_config/Makefile deleted file mode 100644 index 281ce63e2..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_config/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = l2_config.cu - -EXE = l2_config - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_config/l2_config.cu b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_config/l2_config.cu deleted file mode 100644 index 677bc2ba5..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_config/l2_config.cu +++ /dev/null @@ -1,89 +0,0 @@ -#include -using namespace std; - -#include "../../../hw_def/hw_def.h" - -// We know the below information from running our ubench, we copy and paste the -// ubench results below 
manullay -// TODO: we will automate this process - -// we know sector size from l2_access_grain ubench -#define L2_CACHE_LINE_SIZE 128 -#define L2_SECTOR_SIZE 32 -#define IS_SECTOR 1 - -// It is hard to know the exact l2 assoc from ubenhmarking -// Thus, based on previous work, we assume assoc is constant and = 16 -// similar to AMD GPU: -// https://www.techpowerup.com/gpu-specs/docs/amd-gcn1-architecture.pdf -#define L2_CACHE_ASSOC 16 - -// L2 cache cache since kepler and above is write-allocate, subsector-write, -// write-back. We know that from l2_write_policy ubench and has been consistent -// since kepler. Change it accordingly if it changes in new generations -static const char *L2_Cache_Write_Policy = ",L:B:m:L:"; - -// For now, accel-sim only supoprts ipoly for 64 and less -#define ACCELSIM_IPOLY_HASH_SUPPORT 64 -// 8 byte for icnt control -#define ACCELSIM_ICNT_CONTROL 8 - -int main() { - intilizeDeviceProp(0); - - if (deviceProp.l2CacheSize) { - printf("L2 Cache Size = %.0f MB\n", - static_cast(deviceProp.l2CacheSize / 1048576.0f)); - } - - unsigned mem_channel = get_num_channels(MEM_BITWIDTH, DRAM_MODEL); - unsigned l2_banks_num = mem_channel * L2_BANKS_PER_MEM_CHANNEL; - - std::cout << "L2 Banks number = " << l2_banks_num << std::endl; - - if (ACCEL_SIM_MODE) { - - std::cout << "\n//Accel_Sim config: \n"; - - unsigned l2_size_per_bank = L2_SIZE / l2_banks_num; - unsigned assoc, sets_num; - char set_indexing = 'L'; // by default assume linear indexing - char is_sector = IS_SECTOR ? 
'S' : 'N'; - if (isPowerOfTwo(l2_size_per_bank)) { - assoc = L2_CACHE_ASSOC; - sets_num = l2_size_per_bank / L2_CACHE_LINE_SIZE / assoc; - if (sets_num <= ACCELSIM_IPOLY_HASH_SUPPORT) - set_indexing = 'P'; - else - set_indexing = 'X'; // bitwise xoring - } else { - // if not power of two, assume it is 24, as most NVidia GPU L2 cache size - // that is not power of two, is actually divisble by 24 - assoc = 24; - // ensure that our assumption is true - assert((l2_size_per_bank / L2_CACHE_LINE_SIZE) % assoc == 0); - sets_num = l2_size_per_bank / L2_CACHE_LINE_SIZE / assoc; - if (isPowerOfTwo(sets_num) && l2_banks_num <= ACCELSIM_IPOLY_HASH_SUPPORT) - set_indexing = 'P'; - else if (isPowerOfTwo(sets_num)) - set_indexing = 'X'; // bitwise xoring - } - - std::cout << "-gpgpu_n_sub_partition_per_mchannel " - << L2_BANKS_PER_MEM_CHANNEL << std::endl; - std::cout << "-icnt_flit_size " - << L2_BANK_WIDTH_in_BYTE + ACCELSIM_ICNT_CONTROL - << std::endl; // 8bytes for control - if (isPowerOfTwo(l2_banks_num) && - l2_banks_num <= ACCELSIM_IPOLY_HASH_SUPPORT) - std::cout << "-gpgpu_memory_partition_indexing 2" << std::endl; - else - std::cout << "-gpgpu_memory_partition_indexing 0" << std::endl; - std::cout << "-gpgpu_cache:dl2 " << is_sector << ":" << sets_num << ":" - << L2_CACHE_LINE_SIZE << ":" << assoc << L2_Cache_Write_Policy - << set_indexing << "," - << "A:192:4,32:0,32" << std::endl; - } - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_copy_engine/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_copy_engine/Makefile deleted file mode 100644 index 784f28d0e..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_copy_engine/Makefile +++ /dev/null @@ -1,8 +0,0 @@ - -SRC = l2_copy_engine.cu - -EXE = l2_copy_engine - -NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_copy_engine/l2_copy_engine.cu 
b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_copy_engine/l2_copy_engine.cu deleted file mode 100644 index bcb4988b3..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_copy_engine/l2_copy_engine.cu +++ /dev/null @@ -1,136 +0,0 @@ -// This ubench meaures if DMA memory copy is cached in L2 by default - -#include -#include -#include // std::abs -#include // std::abs -#include // std::accumulate -#include -#include -using namespace std; - -#include "../../../hw_def/hw_def.h" -#include "../l2_lat/l2_lat.h" - -#define REPEAT_TIMES 32768 // iterate over the array ITERS times -#define ARRAY_SIZE_L2 32768 - -__global__ void l2_lat_no_warmpu(uint32_t *startClk, uint32_t *stopClk, - uint64_t *posArray, uint64_t *dsink) { - - // thread index - uint32_t tid = threadIdx.x; - - // do pointer-chasing without warmpup - if (tid == 0) { - - uint64_t *ptr = posArray + tid; - uint64_t ptr1, ptr0; - - // initialize the pointers with the start address - // use cg modifier to cache the load in L2 and bypass L1 - asm volatile("{\t\n" - "ld.global.cg.u64 %0, [%1];\n\t" - "}" - : "=l"(ptr1) - : "l"(ptr) - : "memory"); - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint32_t start = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - - // pointer-chasing ITERS times - // use cg modifier to cache the load in L2 and bypass L1 - for (uint32_t i = 0; i < REPEAT_TIMES; ++i) { - asm volatile("{\t\n" - "ld.global.cg.u64 %0, [%1];\n\t" - "}" - : "=l"(ptr0) - : "l"((uint64_t *)ptr1) - : "memory"); - ptr1 = ptr0; // swap the register for the next load - } - - // stop timing - uint32_t stop = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // write time and data back to memory - startClk[tid] = start; - stopClk[tid] = stop; - dsink[tid] = ptr1; - } -} - -int main() { - intilizeDeviceProp(0); - - unsigned THREADS_NUM = 1; - - // Array size must not exceed L2 size - assert(ARRAY_SIZE_L2 * sizeof(uint64_t) < 
L2_SIZE); - - uint64_t *posArray = (uint64_t *)malloc(ARRAY_SIZE_L2 * sizeof(uint64_t)); - uint32_t *startClk = (uint32_t *)malloc(THREADS_NUM * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(THREADS_NUM * sizeof(uint32_t)); - uint64_t *dsink = (uint64_t *)malloc(THREADS_NUM * sizeof(uint64_t)); - - uint32_t *startClk_g; - uint32_t *stopClk_g; - uint64_t *posArray_g; - uint64_t *dsink_g; - - uint64_t stride = 1; - - gpuErrchk(cudaMalloc(&startClk_g, THREADS_NUM * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, THREADS_NUM * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE_L2 * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&dsink_g, THREADS_NUM * sizeof(uint64_t))); - - // initilze pointer-chasing on the CPU side - for (uint64_t i = 0; i < ARRAY_SIZE_L2; i++) { - uint64_t *tmp = posArray_g + ((i + stride) % ARRAY_SIZE_L2); - posArray[i] = (uint64_t)tmp; - } - - gpuErrchk(cudaMemcpy(posArray_g, posArray, sizeof(uint64_t) * ARRAY_SIZE_L2, - cudaMemcpyHostToDevice)); - - // here we measure the latency of the request without warmup - l2_lat_no_warmpu<<<1, THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g, - dsink_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, THREADS_NUM * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, THREADS_NUM * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(dsink, dsink_g, THREADS_NUM * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - float l2_nowarmp_lat = (float)(stopClk[0] - startClk[0]) / REPEAT_TIMES; - printf("L2 Latency no-warmp up = %12.4f cycles \n", l2_nowarmp_lat); - printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); - - // then we measure L2 hit latncy with warmpup - float l2_hit_lat2 = l2_hit_lat(); - - // if the latency is close to the l2 hit latency, then the memcpy are cached - // by default at L2 - float error = (abs(l2_nowarmp_lat - l2_hit_lat2) / l2_hit_lat2) * 100; - bool cached = 
(error < 10.0f); - if (cached) - printf("Is memcpy cached in L2? Yes, error=%2.1f\n", error); - else - printf("Is memcpy cached in L2? No, error=%2.1f\n", error); - - if (ACCEL_SIM_MODE) { - std::cout << "\n//Accel_Sim config: \n"; - std::cout << "-gpgpu_perf_sim_memcpy " << cached << std::endl; - } - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_lat/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_lat/Makefile deleted file mode 100644 index 13a411fdf..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_lat/Makefile +++ /dev/null @@ -1,8 +0,0 @@ - -SRC = l2_lat.cu - -EXE = l2_lat - -NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.cu b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.cu deleted file mode 100644 index d54508fcd..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.cu +++ /dev/null @@ -1,19 +0,0 @@ -#include "../../l1_cache/l1_lat/l1_lat.h" -#include "l2_lat.h" - -int main() { - - intilizeDeviceProp(0); - - float lat2 = l2_hit_lat(); - - if (ACCEL_SIM_MODE) { - float lat1 = l1_lat(); - - std::cout << "\n//Accel_Sim config: \n"; - std::cout << "-gpgpu_l2_rop_latency " << (unsigned)(lat2 - lat1) - << std::endl; - } - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.h b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.h deleted file mode 100644 index d09381516..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.h +++ /dev/null @@ -1,117 +0,0 @@ -// This code is a modification of L1 cache benchmark from -//"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking": -// https://arxiv.org/pdf/1804.06826.pdf - -// This benchmark measures the latency of L2 latency using pointer-chasing - -#include -#include -#include - -#include - -#include "../../../hw_def/hw_def.h" - -#define 
ITERS 32768 // iterate over the array ITERS times -#define ARRAY_SIZE 4096 - -__global__ void l2_hit_lat(uint32_t *startClk, uint32_t *stopClk, - uint64_t *posArray, uint64_t *dsink) { - - // thread index - uint32_t tid = threadIdx.x; - - // initialize pointer-chasing array with just one thread - // warp up L2 cache and ensure all next accesses hit - if (tid == 0) { - for (uint32_t i = 0; i < (ARRAY_SIZE - 1); i++) - posArray[i] = (uint64_t)(posArray + i + 1); - - posArray[ARRAY_SIZE - 1] = (uint64_t)posArray; - } - - if (tid == 0) { - - uint64_t *ptr = posArray + tid; - uint64_t ptr1, ptr0; - - // initialize the pointers with the start address - // use cg modifier to cache the load in L2 and bypass L1 - asm volatile("{\t\n" - "ld.global.cg.u64 %0, [%1];\n\t" - "}" - : "=l"(ptr1) - : "l"(ptr) - : "memory"); - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint32_t start = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - - // pointer-chasing ITERS times - // use cg modifier to cache the load in L2 and bypass L1 - for (uint32_t i = 0; i < ITERS; ++i) { - asm volatile("{\t\n" - "ld.global.cg.u64 %0, [%1];\n\t" - "}" - : "=l"(ptr0) - : "l"((uint64_t *)ptr1) - : "memory"); - ptr1 = ptr0; // swap the register for the next load - } - - // stop timing - uint32_t stop = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // write time and data back to memory - startClk[tid] = start; - stopClk[tid] = stop; - dsink[tid] = ptr1; - } -} - -int l2_hit_lat() { - intilizeDeviceProp(0); - - BLOCKS_NUM = 1; - THREADS_PER_BLOCK = 1; - TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; - - // Array size must not exceed L2 size - assert(ARRAY_SIZE * sizeof(uint64_t) < L2_SIZE); - - uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint64_t *dsink = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - - uint32_t 
*startClk_g; - uint32_t *stopClk_g; - uint64_t *posArray_g; - uint64_t *dsink_g; - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(uint64_t))); - - l2_hit_lat<<<1, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, posArray_g, - dsink_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - - float lat = (float)(stopClk[0] - startClk[0]) / ITERS; - printf("L2 Hit Latency = %12.4f cycles \n", lat); - printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); - - return lat; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_write_policy/Makefile b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_write_policy/Makefile deleted file mode 100644 index 5e9ac9ebf..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_write_policy/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -SRC = l2_write_policy.cu - -EXE = l2_write_policy - -NVCC_FLGAS = -Xptxas -dlcm=cg - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_write_policy/l2_write_policy.cu b/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_write_policy/l2_write_policy.cu deleted file mode 100644 index 3514782e5..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/l2_cache/l2_write_policy/l2_write_policy.cu +++ /dev/null @@ -1,115 +0,0 @@ -/* -This microbenchmark detects L2 write policy -check the nvprof or nvsight for received l2 reads and writes to detect the -policy check the comments below for further details and also see our arvix -paper: 
https://arxiv.org/pdf/1810.07269.pdf - -Compile this file using the following command to disable L1 cache: - nvcc -Xptxas -dlcm=cg l2_write_policy.cu - -to run the program with nvsight - make nvsight ./l2_write_policy -*/ - -#include -#include -#include -#include -using namespace std; - -#include "../../../hw_def/hw_def.h" - -#define THREADS_NUM 1 // Launch only one thread -#define ARRAY_SIZE 1024 // size of the array - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Device code - -/* -check the nvprof or nvsight to see the L2/DRAM reads and write hits -in the below mb, we have 6 reads and 4 writes - -1. Check the write allocation policy - we have three policies: write no-allocate vs write-allocate fetch-on-write vs -vs write-allocate sub-sector write?? if only two write hits (C[i] and A[i] at -lines 3&4) ==> then write no-allocate, else if three write hits (C[i+1], C[i] -and A[i] at lines 2&3&4) then it is write-allocate. if one read miss (A[i] at -line1) and 5 reads hits ==> then fetch-on-write, as the miss at line1 will fetch -the whole sector, and C[i] and line 3 is hit else if two read misses (A[i] at -lines 1 and C[i] at line 3) ==> then sub-sector write with write bit-mask, the -sector will be fetched on read miss, not write miss - -2. check if write-back or write-through -check the DRAM writes, if four writes are received ==> then write-through -if less than four writes ==> then write-back - -to run the program with nvsight: - make nvsight ./l2_write_policy - - stats to look at: -l2 reads: lts__t_sectors_srcunit_tex_op_read.sum -l2 writes: lts__t_sectors_srcunit_tex_op_write.sum -l2 read hits: lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum -l2 write hits: lts__t_sectors_srcunit_tex_op_write_lookup_hit. 
- -The comments below shows a case of write-allocate with sub-sector mask as in -Pascal, Volta, Turing and Ampere HW Results found: Pascal, Volta, Turing and -Ampere: write allocate & sub-sector write + write-back -*/ - -__global__ void write_policy_mb(float *A, float *C) { - int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i == 0) { - C[i] = A[i]; // write to C[i] is a miss (cache line is missing) - C[i + 1] = A[i]; // write to C[i+1] is a hit (cache line is found) - C[i] = C[i] + A[i]; // read of C[i] is a miss (entire sector is missing, - // fetch it from memory) - A[i] = - C[i] + C[i + 1]; // read C[i] and C[i+1] are hits (entire sector exists) - } -} - -////////////////////////////////////////////////////// -int main(int argc, char *argv[]) { - intilizeDeviceProp(0); - - BLOCKS_NUM = 1; - TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; - THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; - - // create 4KB buffers of A&C - assert(ARRAY_SIZE * sizeof(float) < L2_SIZE); - - float *A = (float *)malloc(ARRAY_SIZE * sizeof(float)); - float *C = (float *)malloc(ARRAY_SIZE * sizeof(float)); - - float *A_g; - float *C_g; - - gpuErrchk(cudaMalloc(&A_g, ARRAY_SIZE * sizeof(float))); - gpuErrchk(cudaMalloc(&C_g, ARRAY_SIZE * sizeof(float))); - - for (uint32_t i = 0; i < ARRAY_SIZE; i++) - A[i] = (float)i; - - gpuErrchk( - cudaMemcpy(A_g, A, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice)); - - write_policy_mb<<<1, THREADS_NUM>>>(A_g, C_g); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(C, C_g, ARRAY_SIZE * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - - std::cout << "\nThis microbenchmark detects L2 write policy.\n"; - std::cout << "check the nvprof or nvsight for received L2 reads and writes " - "to detect the policy.\n"; - std::cout << "see the code comments for further details\n"; - std::cout << "to run the program with nvsight: make nvsight ./2\n"; - std::cout << "stats to look at: llts__t_sectors_srcunit_tex_op_read.sum & " - 
"lts__t_sectors_srcunit_tex_op_write.sum & " - "lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum & " - "lts__t_sectors_srcunit_tex_op_write_lookup_hit.sum \n\n"; - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_atom_size/Makefile b/util/tuner/GPU_Microbenchmark/ubench/mem/mem_atom_size/Makefile deleted file mode 100644 index 6a97c1634..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_atom_size/Makefile +++ /dev/null @@ -1,8 +0,0 @@ - -SRC = mem_atom_size.cu - -EXE = mem_atom_size - -NVCC_FLGAS = -Xptxas -dlcm=cg - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_atom_size/mem_atom_size.cu b/util/tuner/GPU_Microbenchmark/ubench/mem/mem_atom_size/mem_atom_size.cu deleted file mode 100644 index 69b2370e1..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_atom_size/mem_atom_size.cu +++ /dev/null @@ -1,117 +0,0 @@ -/* -This benchmark measures mem atom size -check the nvprof or nvsight for received mem reads and writes -for further details, see our arvix paper: https://arxiv.org/pdf/1810.07269.pdf - -Compile this file using the following command to disable L1 cache: - nvcc -Xptxas -dlcm=cg mem_atom_size.cu - -run the program with nsight - make nvsight ./mem_atom_size - -Result: many Nvidia HW generation since kepler has 32B mem_atom_size granularity -However, It seems some pascal and volta GPUs have atom size of 64B larger than -the L2 access grain (32B). 
We asked Nvidia about this weird behavior, and that's -their reply: https://forums.developer.nvidia.com/t/pascal-l1-cache/49571/15 -*/ - -#include -#include -#include -#include -using namespace std; - -#include "../../../hw_def/hw_def.h" - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Device code -__global__ void mem_stride_cons(const float *A, float *C, int stride) - -{ - - int i = blockDim.x * blockIdx.x + threadIdx.x; - - C[i * stride] = A[i * stride]; -} - -// Host code -void mem_stride(int N, int threadsPerBlock, int stride) { - // Variables - float *h_A; - float *h_C; - - float *d_A; - float *d_C; - - size_t size = N * sizeof(float) * 32; - - // Allocate input vectors h_A and h_B in host memory - h_A = (float *)malloc(size); - h_C = (float *)malloc(size); - - // fill array - for (uint32_t i = 0; i < N; i++) - h_A[i] = (float)i; - - // Allocate vectors in device memory - gpuErrchk(cudaMalloc((void **)&d_A, size)); - gpuErrchk(cudaMalloc((void **)&d_C, size)); - - // Copy vectors from host memory to device memory - gpuErrchk(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice)); - - // Invoke kernel - int blocksPerGrid = ((N + threadsPerBlock - 1) / threadsPerBlock); - - mem_stride_cons<<>>(d_A, d_C, stride); - gpuErrchk(cudaPeekAtLastError()); - - // Copy result from device memory to host memory - // h_C contains the result in host memory - gpuErrchk(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost)); - - // Free device memory - if (d_A) - cudaFree(d_A); - if (d_C) - cudaFree(d_C); - - // Free host memory - if (h_A) - free(h_A); - if (h_C) - free(h_C); -} -////////////////////////////////////////////////////// -int main(int argc, char *argv[]) { - intilizeDeviceProp(0); - - // make the array very large to avoid L2 cache resident - // run the threads with stride of 32 (128B) to avoid any coalescing - mem_stride((L2_SIZE / sizeof(float)) * 2, 256, 32); - - std::cout << std::endl - << 
"This benchmark measures mem atom size granularity" << std::endl; - - std::cout << "check the nvprof or nvsight for received mem reads and writes" - << std::endl; - std::cout << "to run the program with nsight: make nvsight ./l2_access_grain" - << std::endl; - std::cout - << "stats to look at: dram__sectors_read.sum & dram__sectors_write.sum & " - "dram__bytes_read.sum & dram__sectors_read.sum" - << std::endl - << std::endl; - - std::cout - << "we launched " << (L2_SIZE / sizeof(float)) * 2 - << " read memory reqs (1 req per thread) with a stride of 32 (128 bytes)" - << std::endl; - std::cout << "if the number of memory reads is the same as read reqs, then " - "mem atom size is 32B" - << std::endl; - std::cout << "if the number of memory reads is 2X issued read reqs, then mem " - "atom size is 64B, etc." - << std::endl - << std::endl; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_bw/Makefile b/util/tuner/GPU_Microbenchmark/ubench/mem/mem_bw/Makefile deleted file mode 100644 index 9409ddff2..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_bw/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -SRC = mem_bw.cu - -EXE = mem_bw - -NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu b/util/tuner/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu deleted file mode 100644 index 62da9f86b..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu +++ /dev/null @@ -1,148 +0,0 @@ -// This benchmark measures the maximum read bandwidth of GPU memory -// Compile this file using the following command to disable L1 cache: -// nvcc -Xptxas -dlcm=cg -Xptxas -dscm=wt mem_bw.cu - -// This code have been tested on Volta V100 architecture -// You can check the mem BW from the nvprof and nvsight -// (dram_read_throughput+dram_write_throughput) - -// to run the program with nvsight -// make nvsight ./mem_bw - -#include -#include -#include - -#include 
"../../../hw_def/hw_def.h" - -/* -Send as many as float4 read requests on the flight to increase DRAM row buffer -locality and hit the max BW -*/ - -__global__ void mem_bw(float *A, float *B, float *C, float *D, float *E, - float *F, uint32_t *startClk, uint32_t *stopClk, - unsigned ARRAY_SIZE) { - // block and thread index - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - uint32_t start = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - - for (int i = idx; i < ARRAY_SIZE / 4; i += blockDim.x * gridDim.x) { - float4 a1 = reinterpret_cast(A)[i]; - float4 b1 = reinterpret_cast(B)[i]; - float4 d1 = reinterpret_cast(D)[i]; - float4 e1 = reinterpret_cast(E)[i]; - float4 f1 = reinterpret_cast(F)[i]; - float4 c1; - - c1.x = a1.x + b1.x + d1.x + e1.x + f1.x; - c1.y = a1.y + b1.y + d1.y + e1.y + f1.y; - c1.z = a1.z + b1.z + d1.z + e1.z + f1.z; - c1.w = a1.w + b1.w + d1.w + e1.w + f1.w; - - reinterpret_cast(C)[i] = c1; - } - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - uint32_t stop = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // write time and data back to memory - startClk[idx] = start; - stopClk[idx] = stop; -} - -int main() { - intilizeDeviceProp(0); - - // Array size has to exceed L2 size to avoid L2 cache residence - unsigned ARRAY_SIZE = (L2_SIZE / sizeof(float)) * 2; - - uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - float *A = (float *)malloc(ARRAY_SIZE * sizeof(float)); - float *B = (float *)malloc(ARRAY_SIZE * sizeof(float)); - float *C = (float *)malloc(ARRAY_SIZE * sizeof(float)); - float *D = (float *)malloc(ARRAY_SIZE * sizeof(float)); - float *E = (float *)malloc(ARRAY_SIZE * sizeof(float)); - float *F = (float *)malloc(ARRAY_SIZE * sizeof(float)); - - uint32_t *startClk_g; - uint32_t 
*stopClk_g; - float *A_g; - float *B_g; - float *C_g; - float *D_g; - float *E_g; - float *F_g; - - for (uint32_t i = 0; i < ARRAY_SIZE; i++) { - A[i] = (float)i; - B[i] = (float)i; - D[i] = (float)i; - E[i] = (float)i; - F[i] = (float)i; - } - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&A_g, ARRAY_SIZE * sizeof(float))); - gpuErrchk(cudaMalloc(&B_g, ARRAY_SIZE * sizeof(float))); - gpuErrchk(cudaMalloc(&C_g, ARRAY_SIZE * sizeof(float))); - gpuErrchk(cudaMalloc(&D_g, ARRAY_SIZE * sizeof(float))); - gpuErrchk(cudaMalloc(&E_g, ARRAY_SIZE * sizeof(float))); - gpuErrchk(cudaMalloc(&F_g, ARRAY_SIZE * sizeof(float))); - - gpuErrchk( - cudaMemcpy(A_g, A, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice)); - gpuErrchk( - cudaMemcpy(B_g, B, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice)); - gpuErrchk( - cudaMemcpy(D_g, D, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice)); - gpuErrchk( - cudaMemcpy(E_g, E, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice)); - gpuErrchk( - cudaMemcpy(F_g, F, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice)); - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaEventRecord(start); - - mem_bw<<>>(A_g, B_g, C_g, D_g, E_g, F_g, - startClk_g, stopClk_g, ARRAY_SIZE); - cudaEventRecord(stop); - cudaEventSynchronize(stop); - - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk( - cudaMemcpy(C, C_g, ARRAY_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); - - float mem_bw; - float milliseconds = 0; - cudaEventElapsedTime(&milliseconds, start, stop); - - unsigned N = ARRAY_SIZE * 6 * sizeof(float); // 6 arrays of floats types - float max_bw = (float)MEM_BITWIDTH * MEM_CLK_FREQUENCY 
* 2 / 1e3 / 8; - mem_bw = (float)(N) / ((float)(stopClk[0] - startClk[0])); - printf("Mem BW= %f (Byte/Clk)\n", mem_bw); - printf("Mem BW= %f (GB/sec)\n", (float)N / milliseconds / 1e6); - printf("Max Theortical Mem BW= %f (GB/sec)\n", max_bw); - printf("Mem Efficiency = %f %%\n", (mem_bw / max_bw) * 100); - - printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_config/Makefile b/util/tuner/GPU_Microbenchmark/ubench/mem/mem_config/Makefile deleted file mode 100644 index 35aa3045f..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_config/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = mem_config.cu - -EXE = mem_config - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu b/util/tuner/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu deleted file mode 100644 index e74931098..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu +++ /dev/null @@ -1,79 +0,0 @@ -#include -using namespace std; - -#include "../../../hw_def/hw_def.h" - -int main() { - intilizeDeviceProp(0); - - char msg[256]; - snprintf(msg, sizeof(msg), "Global memory size = %.0f GB\n", - static_cast(deviceProp.totalGlobalMem / 1073741824.0f)); - std::cout << msg; - std::cout << "Memory Clock rate = " << deviceProp.memoryClockRate * 1e-3f - << " Mhz\n"; - std::cout << "Memory Bus Width = " << deviceProp.memoryBusWidth << " bit\n"; - std::cout << "Memory type = " << dram_model_str[DRAM_MODEL] << "\n"; - std::cout << "Memory channels = " - << get_num_channels(deviceProp.memoryBusWidth, DRAM_MODEL) << "\n"; - - if (ACCEL_SIM_MODE) { - - std::cout << "\n//Accel_Sim config: \n"; - - std::cout << "-gpgpu_n_mem " - << get_num_channels(deviceProp.memoryBusWidth, DRAM_MODEL) - << std::endl; - - std::cout << "-gpgpu_n_mem_per_ctrlr " - << dram_model_mem_per_ctrlr[DRAM_MODEL] << std::endl; - std::cout << "-gpgpu_dram_buswidth " << 
dram_model_bus_width[DRAM_MODEL] / 8 - << std::endl; - std::cout << "-gpgpu_dram_burst_length " - << dram_model_burst_length[DRAM_MODEL] << std::endl; - std::cout << "-dram_data_command_freq_ratio " - << dram_model_freq_ratio[DRAM_MODEL] << std::endl; - - // timing - float device_freq_MHZ = (deviceProp.memoryClockRate * 1e-3f * 2) / - dram_model_freq_ratio[DRAM_MODEL]; - if (DRAM_MODEL == dram_model::HBM) { - // use HBM timing - DDR_Timing timing = HBM_Timing_1000MHZ; - timing.scale_timing_for_new_freq(device_freq_MHZ); - std::cout << "-dram_dual_bus_interface 1" << std::endl; - std::cout << "-gpgpu_dram_timing_opt nbk=" << timing.nbk - << ":CCD=" << get_adjusted_CCD(DRAM_MODEL) - << ":RRD=" << timing.RRD << ":RCD=" << timing.RCD - << ":RAS=" << timing.RAS << ":RP=" << timing.RP - << ":RC=" << timing.RC << ":CL=" << timing.CL - << ":WL=" << timing.WL << ":CDLR=" << timing.CDLR - << ":WR=" << timing.WR << ":nbkgrp=" << timing.nbkgrp - << ":CCDL=" << timing.CCDL << ":RTPL=" << timing.RTPL - << std::endl; - } else { - // use GDDR timing - DDR_Timing timing = GDDR5_Timing_1800MHZ; - timing.scale_timing_for_new_freq(device_freq_MHZ); - std::cout << "-dram_dual_bus_interface 0" << std::endl; - std::cout << "-gpgpu_dram_timing_opt nbk=" << timing.nbk - << ":CCD=" << get_adjusted_CCD(DRAM_MODEL) - << ":RRD=" << timing.RRD << ":RCD=" << timing.RCD - << ":RAS=" << timing.RAS << ":RP=" << timing.RP - << ":RC=" << timing.RC << ":CL=" << timing.CL - << ":WL=" << timing.WL << ":CDLR=" << timing.CDLR - << ":WR=" << timing.WR << ":nbkgrp=" << timing.nbkgrp - << ":CCDL=" << timing.CCDL << ":RTPL=" << timing.RTPL - << std::endl; - } - - // leave the adddress mapping for now as it is - // the number of banks in HBM and GDDR are 16 and atom size is 32B, so the - // mapping should be okay. 
TODO: make this to be varibale based on memory - // model and size std::cout<<"-gpgpu_mem_address_mask 1"< - -int main() { - - intilizeDeviceProp(0); - - float lat_mem = mem_lat(); - - if (ACCEL_SIM_MODE) { - float lat2 = l2_hit_lat(); - - std::cout << "\n//Accel_Sim config: \n"; - std::cout << "-dram_latency " << (unsigned)(lat_mem - lat2) << std::endl; - } - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_lat/mem_lat.h b/util/tuner/GPU_Microbenchmark/ubench/mem/mem_lat/mem_lat.h deleted file mode 100644 index ad7119abd..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/mem/mem_lat/mem_lat.h +++ /dev/null @@ -1,118 +0,0 @@ -// This code is a modification of L1 cache benchmark from -//"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking": -// https://arxiv.org/pdf/1804.06826.pdf - -// This benchmark measures the latency of GPU memory - -// This code have been tested on Volta V100 architecture - -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define THREADS_NUM \ - 4 // HERE, we launch four threads, to ensure that one request is equal to DRAM - // trascation, 4 thread * 8 bytes = 32 bytes (= min DRAM trascation) -#define ITERS 32768 // 1MB of pointer chasing, ITERS*THREADS_NUM*8 bytes - -__global__ void mem_lat(uint32_t *startClk, uint32_t *stopClk, - uint64_t *posArray, uint64_t *dsink, - unsigned MEM_ARRAY_SIZE) { - // thread index - uint32_t tid = threadIdx.x; - uint32_t uid = blockIdx.x * blockDim.x + tid; - - // initialize pointer-chasing array - for (uint32_t i = uid; i < (MEM_ARRAY_SIZE - THREADS_NUM); - i += blockDim.x * gridDim.x) - posArray[i] = (uint64_t)(posArray + i + THREADS_NUM); - - if (uid < THREADS_NUM) { // only THREADS_NUM has to be active here - - // initialize the tail to reference to the head of the array - posArray[MEM_ARRAY_SIZE - (THREADS_NUM - tid)] = (uint64_t)posArray + tid; - - uint64_t *ptr = posArray + tid; - uint64_t ptr1, ptr0; - - // initialize the pointers with 
the start address - // Here, we use cache volatile modifier to ignore the L2 cache - asm volatile("{\t\n" - "ld.global.cv.u64 %0, [%1];\n\t" - "}" - : "=l"(ptr1) - : "l"(ptr) - : "memory"); - - // synchronize all threads - asm volatile("bar.sync 0;"); - - uint32_t start = 0; - uint32_t stop = 0; - - // start timing - asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - - // pointer-chasing ITERS times - // Here, we use cache volatile modifier to ignore the L2 cache - for (uint32_t i = tid; i < ITERS - THREADS_NUM; i += THREADS_NUM) { - asm volatile("{\t\n" - "ld.global.cv.u64 %0, [%1];\n\t" - "}" - : "=l"(ptr0) - : "l"((uint64_t *)ptr1) - : "memory"); - ptr1 = ptr0; // swap the register for the next load - } - - // stop timing - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // write time and data back to memory - startClk[tid] = start; - stopClk[tid] = stop; - dsink[tid] = ptr1; - } -} - -float mem_lat() { - intilizeDeviceProp(0); - - unsigned MEM_ARRAY_SIZE = - (L2_SIZE / sizeof(uint64_t)) * - 2; // pointer-chasing array size in 64-bit. 
total array size is 7 MB which - // larger than L2 cache size (6 MB in Volta) to avoid l2 cache resident - // from the copy engine - - uint32_t *startClk = (uint32_t *)malloc(THREADS_NUM * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(THREADS_NUM * sizeof(uint32_t)); - uint64_t *dsink = (uint64_t *)malloc(THREADS_NUM * sizeof(uint64_t)); - - uint32_t *startClk_g; - uint32_t *stopClk_g; - uint64_t *posArray_g; - uint64_t *dsink_g; - - gpuErrchk(cudaMalloc(&startClk_g, THREADS_NUM * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, THREADS_NUM * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&posArray_g, MEM_ARRAY_SIZE * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&dsink_g, THREADS_NUM * sizeof(uint64_t))); - - mem_lat<<>>(startClk_g, stopClk_g, posArray_g, - dsink_g, MEM_ARRAY_SIZE); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, THREADS_NUM * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, THREADS_NUM * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(dsink, dsink_g, THREADS_NUM * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - float lat = (float)(stopClk[0] - startClk[0]) / (float)(ITERS / THREADS_NUM); - printf("Mem latency = %12.4f cycles \n", lat); - printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); - - return lat; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw/Makefile b/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw/Makefile deleted file mode 100644 index 33331b9ca..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw/Makefile +++ /dev/null @@ -1,8 +0,0 @@ - -SRC = shared_bw.cu - -EXE = shared_bw - -NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw/shared_bw.cu b/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw/shared_bw.cu deleted file mode 100644 index 203bf3aa8..000000000 --- 
a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw/shared_bw.cu +++ /dev/null @@ -1,102 +0,0 @@ -#include -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define SHARED_MEM_SIZE (32 * 1024 / 4) // 32 KB -#define ITERS 4096 - -__global__ void shared_bw(uint64_t *startClk, uint64_t *stopClk, - uint32_t *dsink, uint32_t stride) { - - // thread index - uint32_t tid = threadIdx.x; - uint32_t bid = blockIdx.x; - uint32_t uid = bid * blockDim.x + tid; - uint32_t n_threads = blockDim.x * gridDim.x; - - // a register to avoid compiler optimization - // uint32_t sink0 = 0; - register uint32_t tmp = uid; - - uint64_t start = 0; - uint64_t stop = 0; - - __shared__ uint32_t s[SHARED_MEM_SIZE]; // static shared memory - // uint32_t s[SHARED_MEM_SIZE]; - // one thread to initialize the pointer-chasing array - for (uint32_t i = uid; i < (SHARED_MEM_SIZE); i += n_threads) - s[i] = (i + stride) % SHARED_MEM_SIZE; - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); - - // load data from shared memory - for (uint32_t i = 0; i < ITERS; ++i) { - tmp = s[tmp]; - } - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); - - // sink0 = tmp; - // write time and data back to memory - startClk[uid] = start; - stopClk[uid] = stop; - dsink[uid] = tmp; -} - -int main() { - intilizeDeviceProp(0); - - BLOCKS_NUM = 1; - TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; - THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM; - - assert(SHARED_MEM_SIZE * sizeof(uint32_t) < MAX_SHARED_MEM_SIZE_PER_BLOCK); - - uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - uint32_t *dsink = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - - uint64_t *startClk_g; - uint64_t *stopClk_g; - uint32_t 
*dsink_g; - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(uint32_t))); - - shared_bw<<<1, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, dsink_g, - THREADS_PER_BLOCK); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - - double bw, BW; - uint64_t total_time = - *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - - *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); - bw = - (double)(ITERS * TOTAL_THREADS * sizeof(uint32_t)) / ((double)total_time); - BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; - std::cout << "Shared Memory Bandwidth = " << bw << "(byte/clk/SM), " << BW - << "(GB/s/SM)\n"; - std::cout << "Total Clk number = " << total_time << "\n"; - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw_64/Makefile b/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw_64/Makefile deleted file mode 100644 index af9272e42..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw_64/Makefile +++ /dev/null @@ -1,8 +0,0 @@ - -SRC = shared_bw_64.cu - -EXE = shared_bw_64 - -NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw_64/shared_bw_64.cu b/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw_64/shared_bw_64.cu deleted file mode 100644 index 5d7dae05b..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_bw_64/shared_bw_64.cu +++ /dev/null @@ -1,102 +0,0 @@ -#include -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define 
SHARED_MEM_SIZE (32 * 1024 / 8) // 32KB -#define ITERS (4096) - -__global__ void shared_bw(uint32_t *startClk, uint32_t *stopClk, - uint64_t *dsink, uint32_t stride) { - - // thread index - uint32_t tid = threadIdx.x; - uint32_t bid = blockIdx.x; - uint32_t uid = bid * blockDim.x + tid; - uint32_t n_threads = blockDim.x * gridDim.x; - - // a register to avoid compiler optimization - // uint32_t sink0 = 0; - register uint64_t tmp = uid; - - uint32_t start = 0; - uint32_t stop = 0; - - __shared__ uint64_t s[SHARED_MEM_SIZE]; // static shared memory - // uint32_t s[SHARED_MEM_SIZE]; - // one thread to initialize the pointer-chasing array - for (uint64_t i = uid; i < (SHARED_MEM_SIZE); i += n_threads) - s[i] = (i + stride) % SHARED_MEM_SIZE; - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // start timing - asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - - // load data from shared memory - for (uint32_t i = 0; i < ITERS; ++i) { - tmp = s[tmp]; - } - - // synchronize all threads - asm volatile("bar.sync 0;"); - - // stop timing - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // sink0 = tmp; - // write time and data back to memory - startClk[uid] = start; - stopClk[uid] = stop; - dsink[uid] = tmp; -} - -int main() { - intilizeDeviceProp(0); - - BLOCKS_NUM = 1; - TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; - THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM; - - assert(SHARED_MEM_SIZE * sizeof(uint64_t) < MAX_SHARED_MEM_SIZE_PER_BLOCK); - - uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); - uint64_t *dsink = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); - - uint32_t *startClk_g; - uint32_t *stopClk_g; - uint64_t *dsink_g; - - gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS 
* sizeof(uint64_t))); - - shared_bw<<>>(startClk_g, stopClk_g, dsink_g, - THREADS_PER_BLOCK); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - - double bw, BW; - uint64_t total_time = - *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - - *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); - bw = - (double)(ITERS * TOTAL_THREADS * sizeof(uint64_t)) / ((double)total_time); - BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; - std::cout << "Shared Memory Bandwidth = " << bw << "(byte/clk/SM), " << BW - << "(GB/s/SM)\n"; - std::cout << "Total Clk number = " << total_time << "\n"; - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_lat/Makefile b/util/tuner/GPU_Microbenchmark/ubench/shd/shared_lat/Makefile deleted file mode 100644 index 8e85df98b..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_lat/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -SRC = shared_lat.cu - -EXE = shared_lat - -NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_lat/shared_lat.cu b/util/tuner/GPU_Microbenchmark/ubench/shd/shared_lat/shared_lat.cu deleted file mode 100644 index 199392780..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/shd/shared_lat/shared_lat.cu +++ /dev/null @@ -1,96 +0,0 @@ -#include -#include -#include -#include - -#include "../../../hw_def/hw_def.h" - -#define SHARED_MEM_SIZE (32 * 1024 / 8) -// Launch only one thread to calcaulte the latency using a pointer-chasing -// array technique -#define THREADS_NUM 1 -// iterate over the array ITERS times -#define ITERS 2048 - -// Measure latency of ITERS reads. 
-__global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, - uint64_t *dsink, uint32_t stride) { - - // thread index - uint32_t tid = threadIdx.x; - uint32_t bid = blockIdx.x; - uint32_t uid = bid * blockDim.x + tid; - uint32_t n_threads = blockDim.x * gridDim.x; - - __shared__ uint64_t s[SHARED_MEM_SIZE]; // static shared memory - - // one thread to initialize the pointer-chasing array - for (uint32_t i = uid; i < (SHARED_MEM_SIZE - stride); i += n_threads) - s[i] = (i + stride) % SHARED_MEM_SIZE; - - if (uid == 0) { - // initalize pointer chaser - uint64_t p_chaser = 0; - - // start timing - uint32_t start = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - - // pointer-chasing ITERS times - for (uint32_t i = 0; i < ITERS; ++i) { - p_chaser = s[p_chaser]; - } - - // stop timing - uint32_t stop = 0; - asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); - - // write time and data back to memory - startClk[uid] = start; - stopClk[uid] = stop; - dsink[uid] = p_chaser; - } -} - -int main() { - intilizeDeviceProp(0); - - BLOCKS_NUM = 1; - TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; - THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; - - assert(SHARED_MEM_SIZE * sizeof(uint64_t) < MAX_SHARED_MEM_SIZE_PER_BLOCK); - - uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t)); - uint64_t *dsink = (uint64_t *)malloc(sizeof(uint64_t)); - - uint32_t *startClk_g; - uint32_t *stopClk_g; - uint64_t *dsink_g; - - gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&dsink_g, sizeof(uint64_t))); - - shared_lat<<<1, THREADS_NUM>>>(startClk_g, stopClk_g, dsink_g, 1); - gpuErrchk(cudaPeekAtLastError()); - - gpuErrchk(cudaMemcpy(startClk, startClk_g, sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk( - cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); - gpuErrchk( - cudaMemcpy(dsink, 
dsink_g, sizeof(uint64_t), cudaMemcpyDeviceToHost)); - - float lat = (float)(stopClk[0] - startClk[0]) / ITERS; - printf("Shared Memory Latency = %f cycles\n", lat); - printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); - - if (ACCEL_SIM_MODE) { - std::cout << "\n//Accel_Sim config: \n"; - std::cout << "-gpgpu_smem_latency " << (unsigned)(lat) << std::endl; - } - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/shd/shd_config/Makefile b/util/tuner/GPU_Microbenchmark/ubench/shd/shd_config/Makefile deleted file mode 100644 index 82e862792..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/shd/shd_config/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = shd_config.cu - -EXE = shd_config - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/shd/shd_config/shd_config.cu b/util/tuner/GPU_Microbenchmark/ubench/shd/shd_config/shd_config.cu deleted file mode 100644 index 2009e8bac..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/shd/shd_config/shd_config.cu +++ /dev/null @@ -1,27 +0,0 @@ -#include -using namespace std; - -#include "../../../hw_def/hw_def.h" - -int main() { - intilizeDeviceProp(0); - - printf("Shared memory per multiprocessor = %lu bytes\n", - deviceProp.sharedMemPerMultiprocessor); - - printf("Shared memory per block = %lu bytes\n", deviceProp.sharedMemPerBlock); - - if (ACCEL_SIM_MODE) { - - std::cout << "\n//Accel_Sim config: \n"; - - std::cout << "-gpgpu_shmem_size " << deviceProp.sharedMemPerMultiprocessor - << std::endl; - std::cout << "-gpgpu_shmem_sizeDefault " - << deviceProp.sharedMemPerMultiprocessor << std::endl; - std::cout << "-gpgpu_shmem_per_block " << deviceProp.sharedMemPerBlock - << std::endl; - } - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/system/deviceQuery/Makefile b/util/tuner/GPU_Microbenchmark/ubench/system/deviceQuery/Makefile deleted file mode 100644 index 012ae48b7..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/system/deviceQuery/Makefile 
+++ /dev/null @@ -1,8 +0,0 @@ - -SRC = deviceQuery.cpp - -EXE = deviceQuery - -NVCC_FLGAS = - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/system/deviceQuery/deviceQuery.cpp b/util/tuner/GPU_Microbenchmark/ubench/system/deviceQuery/deviceQuery.cpp deleted file mode 100644 index 4d8bb318a..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/system/deviceQuery/deviceQuery.cpp +++ /dev/null @@ -1,90 +0,0 @@ -/* -Some of the code is adopted from device query benchmark -from CUDA SDK -*/ - -#include -#include - -#include -#include -#include - -int main(int argc, char **argv) { - int deviceCount = 0; - cudaError_t error_id = cudaGetDeviceCount(&deviceCount); - - if (error_id != cudaSuccess) { - printf("cudaGetDeviceCount returned %d\n-> %s\n", - static_cast(error_id), cudaGetErrorString(error_id)); - printf("Result = FAIL\n"); - exit(EXIT_FAILURE); - } - - // This function call returns 0 if there are no CUDA capable devices. - if (deviceCount == 0) { - printf("There are no available device(s) that support CUDA\n"); - } - - int dev, driverVersion = 0, runtimeVersion = 0; - - for (dev = 0; dev < deviceCount; ++dev) { - cudaSetDevice(dev); - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, dev); - - // device - printf(" Device : \"%s\"\n\n", deviceProp.name); - printf(" CUDA version number : %d.%d\n", - deviceProp.major, deviceProp.minor); - - // core - printf(" GPU Max Clock rate : %.0f MHz \n", - deviceProp.clockRate * 1e-3f); - printf(" Multiprocessors Count : %d\n", - deviceProp.multiProcessorCount); - printf(" Maximum number of threads per multiprocessor: %d\n", - deviceProp.maxThreadsPerMultiProcessor); - printf(" CUDA Cores per multiprocessor : %d \n", - _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)); - printf(" Registers per multiprocessor : %d\n", - deviceProp.regsPerMultiprocessor); - printf(" Shared memory per multiprocessor : %lu bytes\n", - deviceProp.sharedMemPerMultiprocessor); - printf(" 
Warp size : %d\n", - deviceProp.warpSize); - - // threadblock config - printf(" Maximum number of threads per block : %d\n", - deviceProp.maxThreadsPerBlock); - printf(" Shared memory per block : %lu bytes\n", - deviceProp.sharedMemPerBlock); - printf(" Registers per block : %d\n", - deviceProp.regsPerBlock); - - // L1 cache - printf(" globalL1CacheSupported : %d\n", - deviceProp.globalL1CacheSupported); - printf(" localL1CacheSupported : %d\n", - deviceProp.localL1CacheSupported); - - // L2 cache - if (deviceProp.l2CacheSize) { - printf(" L2 Cache Size : %.0f MB\n", - static_cast(deviceProp.l2CacheSize / 1048576.0f)); - } - - // memory - char msg[256]; - snprintf(msg, sizeof(msg), - " Global memory size : %.0f GB\n", - static_cast(deviceProp.totalGlobalMem / 1073741824.0f)); - printf("%s", msg); - printf(" Memory Clock rate : %.0f Mhz\n", - deviceProp.memoryClockRate * 1e-3f); - printf(" Memory Bus Width : %d bit\n", - deviceProp.memoryBusWidth); - - printf(" ////////////////////////// \n"); - } -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/system/kernel_lat/Makefile b/util/tuner/GPU_Microbenchmark/ubench/system/kernel_lat/Makefile deleted file mode 100644 index 204366319..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/system/kernel_lat/Makefile +++ /dev/null @@ -1,8 +0,0 @@ - -SRC = kernel_lat.cu - -EXE = kernel_lat - -NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/system/kernel_lat/kernel_lat.cu b/util/tuner/GPU_Microbenchmark/ubench/system/kernel_lat/kernel_lat.cu deleted file mode 100644 index c538860e6..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/system/kernel_lat/kernel_lat.cu +++ /dev/null @@ -1,142 +0,0 @@ - -// This benchmark measures the kernel overhead as linear function a + Xb where X -// is the number of launched TBs, a is kernel launch latency and b is TB launch -// latency - -#include -#include -#include -#include - -#include 
"../../../hw_def/hw_def.h" - -#define THREADS_NUM 1024 -#define ARRAY_SIZE 4096 - -__global__ void kernel_lat_1TB(uint32_t *startClk, uint32_t *stopClk, - uint64_t *posArray, uint64_t *dsink) {} - -__global__ void kernel_lat_2TB(uint32_t *startClk, uint32_t *stopClk, - uint64_t *posArray, uint64_t *dsink) {} - -__global__ void kernel_lat_4TB(uint32_t *startClk, uint32_t *stopClk, - uint64_t *posArray, uint64_t *dsink) {} - -__global__ void kernel_lat_8TB(uint32_t *startClk, uint32_t *stopClk, - uint64_t *posArray, uint64_t *dsink) {} - -__global__ void kernel_lat_16TB(uint32_t *startClk, uint32_t *stopClk, - uint64_t *posArray, uint64_t *dsink) {} - -__global__ void kernel_lat_32TB(uint32_t *startClk, uint32_t *stopClk, - uint64_t *posArray, uint64_t *dsink) {} - -__global__ void kernel_lat_64TB(uint32_t *startClk, uint32_t *stopClk, - uint64_t *posArray, uint64_t *dsink) {} - -__global__ void kernel_lat_128TB(uint32_t *startClk, uint32_t *stopClk, - uint64_t *posArray, uint64_t *dsink) {} - -__global__ void kernel_lat_256TB(uint32_t *startClk, uint32_t *stopClk, - uint64_t *posArray, uint64_t *dsink) {} - -__global__ void kernel_lat_512TB(uint32_t *startClk, uint32_t *stopClk, - uint64_t *posArray, uint64_t *dsink) {} - -__global__ void kernel_lat_1024TB(uint32_t *startClk, uint32_t *stopClk, - uint64_t *posArray, uint64_t *dsink) {} - -__global__ void kernel_lat_2048TB(uint32_t *startClk, uint32_t *stopClk, - uint64_t *posArray, uint64_t *dsink) {} - -int main() { - intilizeDeviceProp(0); - - uint32_t *startClk = (uint32_t *)malloc(THREADS_NUM * sizeof(uint32_t)); - uint32_t *stopClk = (uint32_t *)malloc(THREADS_NUM * sizeof(uint32_t)); - uint64_t *dsink = (uint64_t *)malloc(THREADS_NUM * sizeof(uint64_t)); - - uint32_t *startClk_g; - uint32_t *stopClk_g; - uint64_t *posArray_g; - uint64_t *dsink_g; - - gpuErrchk(cudaMalloc(&startClk_g, THREADS_NUM * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&stopClk_g, THREADS_NUM * sizeof(uint32_t))); - 
gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(uint64_t))); - gpuErrchk(cudaMalloc(&dsink_g, THREADS_NUM * sizeof(uint64_t))); - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaEventRecord(start); - - kernel_lat_1TB<<<1, THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g, - dsink_g); - - gpuErrchk(cudaPeekAtLastError()); - cudaEventRecord(stop); - cudaEventSynchronize(stop); - - /* - kernel_lat_2TB<<<2,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g, dsink_g); - gpuErrchk( cudaPeekAtLastError() ); - - kernel_lat_4TB<<<4,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g, dsink_g); - gpuErrchk( cudaPeekAtLastError() ); - - kernel_lat_8TB<<<8,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g, dsink_g); - gpuErrchk( cudaPeekAtLastError() ); - - kernel_lat_16TB<<<16,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g, - dsink_g); gpuErrchk( cudaPeekAtLastError() ); - - kernel_lat_32TB<<<32,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g, - dsink_g); gpuErrchk( cudaPeekAtLastError() ); - - - kernel_lat_64TB<<<64,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g, - dsink_g); gpuErrchk( cudaPeekAtLastError() ); - - kernel_lat_128TB<<<128,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g, - dsink_g); gpuErrchk( cudaPeekAtLastError() ); - - - kernel_lat_256TB<<<256,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g, - dsink_g); gpuErrchk( cudaPeekAtLastError() ); - - kernel_lat_512TB<<<1024,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g, - dsink_g); gpuErrchk( cudaPeekAtLastError() ); - - kernel_lat_1024TB<<<1024,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g, - dsink_g); gpuErrchk( cudaPeekAtLastError() ); - - kernel_lat_2048TB<<<2048,THREADS_NUM>>>(startClk_g, stopClk_g, posArray_g, - dsink_g); gpuErrchk( cudaPeekAtLastError() ); - -*/ - - gpuErrchk(cudaMemcpy(startClk, startClk_g, THREADS_NUM * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(stopClk, stopClk_g, THREADS_NUM * sizeof(uint32_t), - 
cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(dsink, dsink_g, THREADS_NUM * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - - float milliseconds = 0; - cudaEventElapsedTime(&milliseconds, start, stop); - - float lat = (milliseconds * 1000 * CLK_FREQUENCY) / 3; - std::cout << "Kernel Launch Latency = " << lat << " cycles\n"; - std::cout << "The reported latency above can be slightly higher than real. " - "For accurate evaultion using nvprof event, exmaple: make " - "events ./kernel_lat\n"; - - if (ACCEL_SIM_MODE) { - std::cout << "\n//Accel_Sim config: \n"; - std::cout << "-gpgpu_kernel_launch_latency " << (unsigned)(lat) - << std::endl; - } - - return 1; -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/system/list_devices/Makefile b/util/tuner/GPU_Microbenchmark/ubench/system/list_devices/Makefile deleted file mode 100644 index a0e8ce8ff..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/system/list_devices/Makefile +++ /dev/null @@ -1,8 +0,0 @@ - -SRC = list_devices.cpp - -EXE = list_devices - -NVCC_FLGAS = - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/system/list_devices/list_devices.cpp b/util/tuner/GPU_Microbenchmark/ubench/system/list_devices/list_devices.cpp deleted file mode 100644 index be29463e4..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/system/list_devices/list_devices.cpp +++ /dev/null @@ -1,42 +0,0 @@ -/* -Some of the code is adopted from device query benchmark -from CUDA SDK -*/ - -// std::system includes - -#include - -#include -#include -#include - -//////////////////////////////////////////////////////////////////////////////// -// Program main -//////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - - int deviceCount = 0; - cudaError_t error_id = cudaGetDeviceCount(&deviceCount); - - if (error_id != cudaSuccess) { - printf("cudaGetDeviceCount returned %d\n-> %s\n", - static_cast(error_id), cudaGetErrorString(error_id)); - 
printf("Result = FAIL\n"); - exit(EXIT_FAILURE); - } - - // This function call returns 0 if there are no CUDA capable devices. - if (deviceCount == 0) { - printf("There are no available device(s) that support CUDA\n"); - } - - for (int dev = 0; dev < deviceCount; ++dev) { - cudaSetDevice(dev); - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, dev); - - printf("\nDevice %d: \"%s sm_%d.%d\"\n", dev, deviceProp.name, - deviceProp.major, deviceProp.minor); - } -} diff --git a/util/tuner/GPU_Microbenchmark/ubench/system/system_config/Makefile b/util/tuner/GPU_Microbenchmark/ubench/system/system_config/Makefile deleted file mode 100644 index e3a5c7a7f..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/system/system_config/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -SRC = system_config.cu - -EXE = system_config - -include ../../../common/common.mk diff --git a/util/tuner/GPU_Microbenchmark/ubench/system/system_config/system_config.cu b/util/tuner/GPU_Microbenchmark/ubench/system/system_config/system_config.cu deleted file mode 100644 index 152133edc..000000000 --- a/util/tuner/GPU_Microbenchmark/ubench/system/system_config/system_config.cu +++ /dev/null @@ -1,34 +0,0 @@ -#include -using namespace std; - -#include "../../../hw_def/hw_def.h" - -int main() { - intilizeDeviceProp(0); - - printf("Device Name = %s\n", deviceProp.name); - printf("GPU Max Clock rate = %.0f MHz \n", deviceProp.clockRate * 1e-3f); - printf("GPU Base Clock rate = %d MHz \n", CLK_FREQUENCY); - printf("SM Count = %d\n", deviceProp.multiProcessorCount); - printf("CUDA version number = %d.%d\n", deviceProp.major, deviceProp.minor); - - if (ACCEL_SIM_MODE) { - - std::cout << "\n//Accel_Sim config: \n"; - - float mem_freq_MHZ = (deviceProp.memoryClockRate * 1e-3f * 2) / - dram_model_freq_ratio[DRAM_MODEL]; - std::cout << "-gpgpu_compute_capability_major " << deviceProp.major - << std::endl; - std::cout << "-gpgpu_compute_capability_minor " << deviceProp.minor - << std::endl; - std::cout 
<< "-gpgpu_n_clusters " << deviceProp.multiProcessorCount - << std::endl; - std::cout << "-gpgpu_n_cores_per_cluster 1" << std::endl; - std::cout << "-gpgpu_clock_domains " << CLK_FREQUENCY << ":" - << CLK_FREQUENCY << ":" << CLK_FREQUENCY << ":" << mem_freq_MHZ - << std::endl; - } - - return 1; -} diff --git a/util/tuner/README.md b/util/tuner/README.md index 3f0ea34a6..a20da2474 100644 --- a/util/tuner/README.md +++ b/util/tuner/README.md @@ -17,21 +17,26 @@ cache hashing function), we do an extensive searching by simulating each possibl # Tuning Steps: The following steps demonstrate how to tune the Accel-Sim config files to a specific GPU hardware. We assume that you already have the GPU hardware in question. + +0. **Get Microbenchmarks** + ```bash + ./get_ubench.sh + ``` 1. **Provide HW def file and run microbenchmarks**: -You need to provide a C header file `hw_def` that contains minimal information about the hardware model. This file is used to configure and tune the microbenchmarks for the unduerline hardware. See an example of Ampere RTX 3060 card [here](https://github.com/accel-sim/accel-sim-framework/blob/dev/util/tuner/GPU_Microbenchmark/hw_def/ampere_RTX3070_hw_def.h). These information can be gathered from Nvidia whitepaper and public website. -After you write the HW file for the underline card, ensure to add it in [/GPU_Microbenchmark/hw_def/hw_def.h](https://github.com/accel-sim/accel-sim-framework/blob/dev/util/tuner/GPU_Microbenchmark/hw_def/hw_def.h). +You need to provide a C header file `hw_def` that contains minimal information about the hardware model. This file is used to configure and tune the microbenchmarks for the underlying hardware. See an example for the Ampere RTX 3070 card [here](gpu-app-collection-partial/src/cuda/GPU_Microbenchmark/hw_def/ampere_RTX3070_hw_def.h). This information can be gathered from NVIDIA whitepapers and the public website.
+After you write the HW file for the underlying card, make sure to add it in [/GPU_Microbenchmark/hw_def/hw_def.h](gpu-app-collection-partial/src/cuda/GPU_Microbenchmark/hw_def/hw_def.h). Then, compile microbenchmarks and run: ```bash # Make sure PATH includes nvcc - # If your hardware has new compute capability, ensure to add it in the /GPU_Microbenchmark/common/common.mk + # If your hardware has a new compute capability, make sure to add it in ./gpu-app-collection-partial/src/cuda/GPU_Microbenchmark/common/common.mk # compile microbenchmarks - make -C ./GPU_Microbenchmark/ + make -C ./gpu-app-collection-partial/src/cuda/GPU_Microbenchmark/ # set the device id that you want to tune to # if you do not know the device id, run ./GPU_Microbenchmark/bin/list_devices export CUDA_VISIBLE_DEVICES=0 #run the ubench and save output in stats.txt - ./GPU_Microbenchmark/run_all.sh | tee stats.txt + ./run_all.sh | tee stats.txt ``` 2. **Run the tuner**: The tuner.py script will parse the microbenchmarks output and generate a folder of the HW device name (e.g. "TITAN_V").
The folder will contain the config files for GPGPU-Sim performance model and Accel-Sim trace-driven front-end (gpgpusim.config and trace.config files)
diff --git a/util/tuner/get_ubench.sh b/util/tuner/get_ubench.sh
new file mode 100755
index 000000000..584879aef
--- /dev/null
+++ b/util/tuner/get_ubench.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#
+# Fetch the GPU microbenchmarks and the cuda-samples submodule from the
+# gpu-app-collection repository using a partial clone and sparse checkout.
+# The clone is created as $CLONE_DIR inside the current working directory.
+
+# Configuration
+REPO_URL="https://github.com/accel-sim/gpu-app-collection.git"
+CLONE_DIR="gpu-app-collection-partial"
+BRANCH="dev"
+SPARSE_PATHS=(
+  "src/cuda/GPU_Microbenchmark"
+  "src/cuda/cuda-samples"
+)
+
+# Step 1: Partial clone without checking out any files yet
+git clone --recurse-submodules -j8 --filter=blob:none --no-checkout -b "$BRANCH" "$REPO_URL" "$CLONE_DIR"
+# Stop if the clone directory is not there: the git commands below must never
+# run against whatever repository encloses the current directory.
+cd "$CLONE_DIR" || { echo "cannot enter $CLONE_DIR" >&2; exit 1; }
+
+# Step 2: Enable sparse checkout
+git sparse-checkout init --cone
+git sparse-checkout set "${SPARSE_PATHS[@]}"
+git checkout
+
+# Step 3: Manually initialize the submodule (if not already checked out)
+git submodule update --init --recursive -- src/cuda/cuda-samples
diff --git a/util/tuner/run_all.sh b/util/tuner/run_all.sh
new file mode 100755
index 000000000..93dcf16e0
--- /dev/null
+++ b/util/tuner/run_all.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+#
+# Build the "tuner" make target of the GPU microbenchmarks, then run every
+# executable found in their bin/ directory, skipping names ending in "_corr".
+#
+# NOTE: SCRIPT_DIR is relative to the current working directory, so run this
+# script from the directory that contains gpu-app-collection-partial/.
+
+SCRIPT_DIR="./gpu-app-collection-partial/src/cuda/GPU_Microbenchmark/"
+echo "Running make in $SCRIPT_DIR"
+make -C "$SCRIPT_DIR" tuner -j || { echo "make failed"; exit 1; }
+
+# Stop if bin/ is missing: otherwise the loop below would execute the files
+# of the current directory instead (this script included).
+cd "${SCRIPT_DIR}/bin/" || { echo "cannot enter ${SCRIPT_DIR}/bin/" >&2; exit 1; }
+for f in ./*; do
+  if [[ "$f" == *_corr ]]; then
+    continue
+  fi
+  # Skip anything that is not an executable regular file; this also covers
+  # the unexpanded "./*" pattern left behind when bin/ is empty.
+  if [[ ! -f "$f" || ! -x "$f" ]]; then
+    continue
+  fi
+
+  echo "running $f microbenchmark"
+  "$f"
+  echo "/////////////////////////////////"
+done