diff --git a/.gitignore b/.gitignore index 21bd1f0b6..4ccfa6ce6 100644 --- a/.gitignore +++ b/.gitignore @@ -38,3 +38,4 @@ nv_gpu_build ios_build gpu_build output +third-party/extern_miopen diff --git a/CMakeLists.txt b/CMakeLists.txt index 050da5434..45ea6a020 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,6 +126,8 @@ anakin_option(USE_MKL "Use mkl libs." NO if USE_X86_PLACE) anakin_option(USE_MKLML "Use MKLML libs." YES if USE_X86_PLACE) anakin_option(USE_XBYAK "Use XBYAK libs." YES if USE_X86_PLACE) anakin_option(USE_OPENMP "Use Openmp when in android environment." YES if TARGET_ANDROID) +anakin_option(USE_OPENSSL "Use OpenSSL." YES if AMD_GPU) +anakin_option(USE_MIOPENGEMM "Use MIOpenGEMM ." YES if AMD_GPU) # build components anakin_option(BUILD_WITH_UNIT_TEST "Build anakin unit test components." YES) @@ -148,6 +150,10 @@ if(ENABLE_MIN_DEPENDENCY) set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--version-script,${ANAKIN_ROOT}/cmake/ak_link.lds") endif() +# for AMD profile computing performance +anakin_option(ENABLE_AMD_PROFILING "Enable amd profiling" NO) +anakin_option(ENABLE_AMD_DEBUG "Enable amd debug" NO) + # ---------------------------------------------------------------------------- # section: anakin compiler and linker options # ---------------------------------------------------------------------------- @@ -201,6 +207,7 @@ endif() if(AMD_GPU) include(cmake/amd.cmake) + include(cmake/external/miopen.cmake) endif() # gather all the config options to anakin diff --git a/cmake/amd.cmake b/cmake/amd.cmake index 1ebc7bf56..8d89f917c 100644 --- a/cmake/amd.cmake +++ b/cmake/amd.cmake @@ -30,24 +30,77 @@ macro(amd_set_opencl_path) #endif() endmacro() -macro(amd_build_cl_file file_path dest_path) - FILE(GLOB CL_FILES ${file_path}/*.cl) - message(STATUS "found cl files: ${CL_FILES}") - foreach(src_file ${CL_FILES}) - get_filename_component(src_file_name ${src_file} NAME) - message(STATUS "copy ${src_file} to : ${dest_path}/${src_file_name}") - configure_file( ${absdir}/${src_file} ${dest_path}/${src_file_name} COPYONLY) - endforeach() +macro(amd_find_file regexps) + FILE(GLOB FOUND_FILES ${regexps}) endmacro() +function(add_kernels KERNEL_FILES) + set(INIT_KERNELS_LIST) -macro(amd_build_cl_binary_file file_path dest_path) - FILE(GLOB CL_FILES ${file_path}/*.so) - message(STATUS "found cl files: ${CL_FILES}") - foreach(src_file ${CL_FILES}) - get_filename_component(src_file_name ${src_file} NAME) - message(STATUS "copy ${src_file} to : ${dest_path}/${src_file_name}") - configure_file( ${absdir}/${src_file} ${dest_path}/${src_file_name} COPYONLY) + foreach(KERNEL_FILE ${KERNEL_FILES}) + if("${CMAKE_VERSION}" VERSION_LESS 3.0) + configure_file(${KERNEL_FILE} ${KERNEL_FILE}.delete) + else() + set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${KERNEL_FILE}) + endif() + get_filename_component(BASE_NAME ${KERNEL_FILE} NAME_WE) + string(TOUPPER "${BASE_NAME}" KEY_NAME) + string(MAKE_C_IDENTIFIER "${KEY_NAME}" VAR_NAME) + list(APPEND INIT_KERNELS_LIST " { \"${KEY_NAME}\", std::string(reinterpret_cast(${VAR_NAME}), ${VAR_NAME}_SIZE) }") endforeach() + string(REPLACE ";" ",\n" INIT_KERNELS "${INIT_KERNELS_LIST}") + configure_file("${CMAKE_SOURCE_DIR}/saber/core/impl/amd/utils/amd_kernels.cpp.in" ${PROJECT_BINARY_DIR}/amd_kernels.cpp) +endfunction() + +macro(generate_amd_kernel_src) + set(AMD_KERNELS) + set(AMD_KERNEL_INCLUDES) + + set(FILE_REGEXP + "${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/cl/*.cl" + "${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/cl/*.s" + 
"${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/lib/*.so" + "${CMAKE_SOURCE_DIR}/test/saber/amd/*.cl" + ) + amd_find_file("${FILE_REGEXP}") + list(APPEND AMD_KERNELS ${FOUND_FILES}) + + set(FILE_REGEXP + "${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/cl/*.inc" + ) + amd_find_file("${FILE_REGEXP}") + list(APPEND AMD_KERNEL_INCLUDES ${FOUND_FILES}) + + #message(STATUS =======${AMD_KERNELS}======) + #message(STATUS =======${AMD_KERNEL_INCLUDES}======) + add_kernels("${AMD_KERNELS}") + add_custom_command( + OUTPUT ${PROJECT_BINARY_DIR}/amd_kernels.h + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS ${MIOPEN_PROJECT} ${AMD_KERNELS} ${AMD_KERNEL_INCLUDES} + COMMAND ${MIOPEN_BINARY_DIR}/bin/addkernels -guard GUARD_AMD_KERNELS_HPP_ -target ${PROJECT_BINARY_DIR}/amd_kernels.h -source ${AMD_KERNELS} + COMMENT "Inlining AMD kernels" + ) + add_custom_target(amd_kernels DEPENDS ${PROJECT_BINARY_DIR}/amd_kernels.h) + include_directories($PROJECT_BINARY_DIR}/amd_kernels.h) + list(APPEND ANAKIN_SABER_DEPENDENCIES amd_kernels) + list(APPEND ANAKIN_SABER_BASE_SRC ${PROJECT_BINARY_DIR}/amd_kernels.cpp) +endmacro() + +macro(amd_set_miopengemm_path) + if(NOT DEFINED MIOPENGEMM_INCLUDE_DIR) + set(MIOPENGEMM_INCLUDE_DIR "/opt/rocm/miopengemm/include") + endif() + if(NOT DEFINED MIOPENGEMM_LIBRARY) + set(MIOPENGEMM_LIBRARY "/opt/rocm/miopengemm/lib64/libmiopengemm.so") + endif() + + #FIND_PACKAGE(miopengemm REQUIRED) + #if(miopengemm_FOUND) + # message(STATUS "Found miopengemm in ${MIOPENGEMM_INCLUDE_DIR}") + # message(STATUS "Found miopengemm lib in ${MIOPENGEMM_LIBRARY}") + # include_directories(${MIOPENGEMM_INCLUDE_DIR}) + # LINK_LIBRARIES(${MIOPENGEMM_LIBRARY}) + #endif() endmacro() diff --git a/cmake/config/anakin_config.h.in b/cmake/config/anakin_config.h.in index d96e231bf..92aa8010e 100644 --- a/cmake/config/anakin_config.h.in +++ b/cmake/config/anakin_config.h.in @@ -55,6 +55,7 @@ #cmakedefine USE_GFLAGS +#cmakedefine USE_MIOPENGEMM // plantform to use #cmakedefine USE_GPU_PLACE @@ -83,6 +84,10 @@ // build arm lite #cmakedefine BUILD_LITE +// for AMD profiling computing performance +#cmakedefine ENABLE_AMD_PROFILING + +#cmakedefine ENABLE_AMD_DEBUG #if defined(ANDROID) || defined(__ANDROID__) #define PLATFORM_ANDROID @@ -114,4 +119,3 @@ #endif #endif - diff --git a/cmake/external/miopen.cmake b/cmake/external/miopen.cmake old mode 100644 new mode 100755 index e76acb428..71c2a53f9 --- a/cmake/external/miopen.cmake +++ b/cmake/external/miopen.cmake @@ -14,50 +14,27 @@ include(ExternalProject) set(MIOPEN_PROJECT extern_miopen) -set(MIOPEN_PREFIX_DIR ${ANAKIN_TEMP_THIRD_PARTY_PATH}/miopen) -set(MIOPEN_INSTALL_ROOT ${ANAKIN_THIRD_PARTY_PATH}/miopen) -set(MIOPEN_SOURCE_DIR ${MIOPEN_PREFIX_DIR}/src/${MIOPEN_PROJECT}) -set(MIOPEN_BINARY_DIR ${MIOPEN_PREFIX_DIR}/src/${MIOPEN_PROJECT}-build) +set(MIOPEN_PREFIX_DIR ${ANAKIN_THIRD_PARTY_PATH}) +set(MIOPEN_INSTALL_ROOT ${ANAKIN_ROOT}/output/miopen) +set(MIOPEN_SOURCE_DIR ${ANAKIN_THIRD_PARTY_PATH}/${MIOPEN_PROJECT}) +set(MIOPEN_STAMP_DIR ${ANAKIN_TEMP_THIRD_PARTY_PATH}/${MIOPEN_PROJECT}/stamp) +set(MIOPEN_BINARY_DIR ${ANAKIN_TEMP_THIRD_PARTY_PATH}/${MIOPEN_PROJECT}/build) set(MIOPEN_LIB ${MIOPEN_INSTALL_ROOT}/lib/libMIOpen.so CACHE FILEPATH "miopen library." 
FORCE) -if(NOT Boost_FOUND) - set(BOOST_ROOT ${BOOST_INSTALL_ROOT} CACHE FILEPATH "boost library/" FORCE) -endif() - message(STATUS "Scanning external modules ${Green}MIOPEN${ColourReset} ...") -ExternalProject_Add( - ${MIOPEN_PROJECT}_customize - GIT_REPOSITORY "ssh://git@icode.baidu.com:8235/baidu/third-party/miopen" - GIT_TAG "cbd4e7dbad0599c7327cb43888476ab8d966f285" - PREFIX ${ANAKIN_TEMP_THIRD_PARTY_PATH}/miopen/customize_miopen_file - SOURCE_DIR ${ANAKIN_THIRD_PARTY_PATH}/miopen/customize_miopen_file - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" -) - ExternalProject_Add( ${MIOPEN_PROJECT} - DEPENDS ${MIOPEN_PROJECT}_customize - GIT_REPOSITORY "ssh://git@icode.baidu.com:8235/baidu/third-party/miopen" - GIT_TAG 1.4.2 + GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/Anakin.git + GIT_TAG anakin-amd_miopen PREFIX ${MIOPEN_PREFIX_DIR} - CMAKE_ARGS -DMIOPEN_BACKEND=OpenCL -DCMAKE_INSTALL_PREFIX=${MIOPEN_INSTALL_ROOT} -DCMAKE_INSTALL_LIBDIR=lib -DBOOST_ROOT=${BOOST_ROOT} - #LOG_DOWNLOAD 1 + STAMP_DIR ${MIOPEN_STAMP_DIR} + BINARY_DIR ${MIOPEN_BINARY_DIR} + CMAKE_ARGS -DMIOPEN_BACKEND=OpenCL -DCMAKE_INSTALL_PREFIX=${MIOPEN_INSTALL_ROOT} -DCMAKE_INSTALL_LIBDIR=lib LOG_BUILD 1 ) -ExternalProject_Add_Step( - ${MIOPEN_PROJECT} ${MIOPEN_PROJECT}_customize - DEPENDEES download - DEPENDERS build - COMMAND ${CMAKE_COMMAND} -E copy_directory ${ANAKIN_THIRD_PARTY_PATH}/miopen/customize_miopen_file ${MIOPEN_SOURCE_DIR} - ALWAYS 1 - EXCLUDE_FORM_MAIN 1 - LOG 1 -) include_directories(${MIOPEN_INSTALL_ROOT}/include) add_library(miopen SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET miopen PROPERTY IMPORTED_LOCATION ${MIOPEN_LIB}) diff --git a/cmake/find_modules.cmake b/cmake/find_modules.cmake index 2f0790b5b..62d0fc8ae 100644 --- a/cmake/find_modules.cmake +++ b/cmake/find_modules.cmake @@ -440,10 +440,20 @@ macro(anakin_find_bmlib) list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/app/libbmrt.so) list(APPEND ANAKIN_LINKER_LIBS ${BM_LIBRARIES}) else() - message(FATAL_ERROR "Could not found bm_lib") - endif() + message(FATAL_ERROR "Could not find bm_lib") + endif() endmacro() +macro(anakin_find_openssl) + find_package(OpenSSL REQUIRED) + if(OPENSSL_FOUND) + message(STATUS "Found openssl in ${OPENSSL_INCLUDE_DIR}") + list(APPEND ANAKIN_LINKER_LIBS ${OPENSSL_LIBRARIES}) + include_directories(${OPENSSL_INCLUDE_DIR}) + else() + message(FATAL_ERROR "Could not find openssl!") + endif() +endmacro() macro(anakin_find_nvinfer) find_path(NVINFER_INCLUDE_DIR NvInfer.h PATHS ${ANAKIN_ROOT}/third-party/tensorrt5/include @@ -488,3 +497,14 @@ macro(anakin_find_nvinfer) endif() endmacro() +#find miopengemm +macro(anakin_find_miopengemm) + find_package(miopengemm REQUIRED PATHS /opt/rocm) + if(miopengemm_FOUND) + set(MIOPENGEMM_FOUND TRUE) + message(STATUS "Found miopengemm: ${miopengemm_INCLUDE_DIR}") + include_directories(SYSTEM ${miopengemm_INCLUDE_DIR}) + list(APPEND ANAKIN_LINKER_LIBS ${miopengemm_LIBRARIES}) + endif() +endmacro() + diff --git a/cmake/gather.cmake b/cmake/gather.cmake index 32b03c05f..1754243ea 100644 --- a/cmake/gather.cmake +++ b/cmake/gather.cmake @@ -28,13 +28,9 @@ if(USE_BM_PLACE) anakin_find_bmlib() endif() -# set amd opencl path -if(AMD_GPU) - amd_build_cl_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/cl" "${CMAKE_BINARY_DIR}/cl/amd") - amd_build_cl_binary_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/lib" "${CMAKE_BINARY_DIR}/cl/amd") - amd_build_cl_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/cl" "${PROJECT_SOURCE_DIR}/${AK_OUTPUT_PATH}/unit_test") -
amd_build_cl_binary_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/lib" "${PROJECT_SOURCE_DIR}/${AK_OUTPUT_PATH}/unit_test") - amd_build_cl_file("${CMAKE_SOURCE_DIR}/test/saber/amd" "${PROJECT_SOURCE_DIR}/${AK_OUTPUT_PATH}/unit_test") +# find openssl +if(USE_OPENSSL) + anakin_find_openssl() +endif() + +# find openssl +if(USE_OPENSSL) + anakin_find_openssl() endif() # find opencl @@ -105,3 +106,8 @@ endif() if(USE_TENSORRT) anakin_find_nvinfer() endif() + +# find miopengemm +if(USE_MIOPENGEMM) + anakin_find_miopengemm() +endif() diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt index 86d4b0836..05efe5f0c 100644 --- a/saber/CMakeLists.txt +++ b/saber/CMakeLists.txt @@ -42,6 +42,11 @@ if(USE_GPU_PLACE) else() anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/amd "cpp" ANAKIN_SABER_BASE_SRC) anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/amd "cpp" ANAKIN_SABER_BASE_SRC) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/amd/utils "cpp" ANAKIN_SABER_BASE_SRC) + if(USE_OPENCL) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/amd/utils/ocl "cpp" ANAKIN_SABER_BASE_SRC) + endif() + generate_amd_kernel_src() endif() endif() @@ -92,7 +97,7 @@ if(UNIX OR APPLE) if (BUILD_SHARED) ADD_LIBRARY(${ANAKIN_SABER_TEMP_COMMMON_LIB} SHARED ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC}) #$) - if(USE_X86_PLACE OR USE_CUDA) + if(USE_X86_PLACE OR USE_CUDA OR AMD_GPU) list(LENGTH ANAKIN_SABER_DEPENDENCIES dependencies_len) if(dependencies_len GREATER 0) add_dependencies(${ANAKIN_SABER_TEMP_COMMMON_LIB} ${ANAKIN_SABER_DEPENDENCIES}) diff --git a/saber/core/device.h b/saber/core/device.h index ced61d6a8..f3baef1b0 100644 --- a/saber/core/device.h +++ b/saber/core/device.h @@ -63,7 +63,7 @@ struct Device { typedef TargetWrapper API; - Device(int max_stream = 1); + Device(int max_stream = 4); void get_info(); void create_stream(); diff --git a/saber/core/env.h b/saber/core/env.h index edd72a3a4..ad225956c 100644 --- a/saber/core/env.h +++ b/saber/core/env.h @@ -56,55 +56,6 @@ class Env { Env(){} }; -#ifdef AMD_GPU -typedef std::list cl_event_list; - -template <> -class Env { -public: - typedef TargetWrapper AMD_API; - typedef std::vector> Devs; - static Devs& cur_env() { - static Devs* _g_env = new Devs(); - return *_g_env; - } - - static void env_init(int max_stream = 4); - static bool is_init(); - static cl_platform_id get_platform_id(); - - static void add_event(const char *tag, cl_event_list event); - static void add_event(cl_event_list event) { - add_event(mTag.c_str(), event); - } - - static void pop(); - static void set_tag(const char *tag){ - mTag = std::string(tag); - } - - static const std::string& get_tag(){ - return mTag; - } - - static void start_record(){ - record = true; - } - static void stop_record(){ - record = false; - } -private: - Env(){} - - static cl_platform_id platform_id; - static std::map> eMap; - static std::list tList; - static bool record; - static std::string mTag; - -}; -#endif - } //namespace saber } //namespace anakin diff --git a/saber/core/impl/amd/amd_device.cpp b/saber/core/impl/amd/amd_device.cpp index b3543ac23..6477e3c50 100644 --- a/saber/core/impl/amd/amd_device.cpp +++ b/saber/core/impl/amd/amd_device.cpp @@ -54,16 +54,14 @@ static void get_param(cl_device_id dev, cl_device_info param_name, T** param_val Device::Device(int max_stream) : _max_stream(max_stream) { - if (!Env::is_init()) { - return; - } //get cl device id; int nums = 0; AMD_API::get_device_count(nums); cl_device_id* device_ids = new cl_device_id[nums]; cl_uint 
device_nums; - clGetDeviceIDs(Env::get_platform_id(), CL_DEVICE_TYPE_GPU, (cl_uint)nums, device_ids, + cl_platform_id platform_id = TargetWrapper::get_platform_id(); + clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, (cl_uint)nums, device_ids, &device_nums); id = device_ids[AMD_API::get_device_id()]; delete []device_ids; @@ -71,12 +69,11 @@ Device::Device(int max_stream) : _max_stream(max_stream) { //init context, one by one mapping to device. cl_int errNum; - const cl_context_properties prop[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)Env::get_platform_id(), 0}; + const cl_context_properties prop[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, 0}; context = clCreateContext(prop, 1, &id, NULL, NULL, &errNum); CHECK(errNum == CL_SUCCESS); get_info(); - create_stream(); } void Device::create_stream() { @@ -87,8 +84,13 @@ void Device::create_stream() { typename AMD_API::stream_t stream_data; typename AMD_API::stream_t stream_compute; +#ifdef ENABLE_AMD_PROFILING API::_create_stream_with_flag(&stream_data, context, id, CL_QUEUE_PROFILING_ENABLE); API::_create_stream_with_flag(&stream_compute, context, id, CL_QUEUE_PROFILING_ENABLE); +#else + API::_create_stream_with_flag(&stream_data, context, id, 0); + API::_create_stream_with_flag(&stream_compute, context, id, 0); +#endif _data_stream.push_back(stream_data); _compute_stream.push_back(stream_compute); } diff --git a/saber/core/impl/amd/amd_env.cpp b/saber/core/impl/amd/amd_env.cpp deleted file mode 100644 index 6395de384..000000000 --- a/saber/core/impl/amd/amd_env.cpp +++ /dev/null @@ -1,221 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#include "core/env.h" -namespace anakin { - -namespace saber { - -#ifdef AMD_GPU - -typedef TargetWrapper AMD_API; -typedef Env AMD_ENV; - -cl_platform_id AMD_ENV::platform_id = NULL; - -void AMD_ENV::env_init(int max_stream) { - Devs& devs = cur_env(); - - if (devs.size() > 0) { - return; - } - - platform_id = AMD_API::get_platform_id(); - - int count = 0; - AMD_API::get_device_count(count); - - if (count == 0) { - LOG(WARNING) << "no device found!"; - } else { - LOG(INFO) << "found " << count << " device(s)"; - } - - int cur_id = AMD_API::get_device_id(); - - for (int i = 0; i < count; i++) { - AMD_API::set_device(i); - devs.push_back(Device(max_stream)); - } - - AMD_API::set_device(cur_id); -} - -bool AMD_ENV::is_init() { - CHECK(platform_id != NULL); - return true; -} - -cl_platform_id AMD_ENV::get_platform_id() { - if (!is_init()) { - return NULL; - } - - return platform_id; -} - -bool AMD_ENV::record = false; -std::string AMD_ENV::mTag; -std::list AMD_ENV::tList; -std::map> AMD_ENV::eMap; - -void AMD_ENV::add_event(const char* tag, cl_event_list event) { - if (!record) { - return; - } - - std::map>::iterator it; - it = eMap.find(std::string(tag)); - - if (it != eMap.end()) { - it->second.push_back(event); - } else { - LOG(INFO) << "record [" << tList.size() << "]=" << tag; - tList.push_back(std::string(tag)); - std::list list; - list.push_back(event); - eMap[std::string(tag)] = list; - } -} -void AMD_ENV::pop() { - std::map>::iterator it; - size_t t_size; - size_t e_size; - size_t s_size; - size_t size; - float executionTime = 0; - float waitTime = 0; - float g_execute = 0; - float g_wait = 0; - cl_ulong submit; - cl_ulong start; - cl_ulong end; - cl_ulong wait; - cl_ulong execute; - CHECK(tList.size() == eMap.size()); - t_size = tList.size(); - std::string log; - log.append("\n"); - std::string tmp; - - for (int i = 0 ; i < t_size; i++) { - waitTime = executionTime = 0; - std::string tag = tList.front(); - it = eMap.find(tag); - std::list list = it->second; - e_size = list.size(); - - cl_ulong* s_waits = NULL, *s_executes = NULL; - - for (int j = 0 ; j < e_size; j++) { - cl_event_list eList = list.front(); - - s_size = eList.size(); - - if (s_size > 1) { - - if (s_waits == NULL) { - s_waits = new cl_ulong[s_size]; - s_executes = new cl_ulong[s_size]; - memset(s_waits, 0, s_size * sizeof(cl_ulong)); - memset(s_executes, 0, s_size * sizeof(cl_ulong)); - } - - int tmps = 0; - - for (cl_event_list::iterator ite = eList.begin(); ite != eList.end(); ite++) { - - cl_event event = *ite; - clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submit, NULL); - clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); - clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); - - s_waits[tmps] += (start - submit); - s_executes[tmps] += (end - start); - tmps++; - } - - cl_event eventS = eList.front(); - cl_event eventE = eList.back(); - clGetEventProfilingInfo(eventS, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); - clGetEventProfilingInfo(eventE, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); - execute = end - start; - - clGetEventProfilingInfo(eventS, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submit, NULL); - wait = start - submit; - - clGetEventProfilingInfo(eventS, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &end, NULL); - clGetEventProfilingInfo(eventE, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &start, NULL); - wait += start - end; - } else { - - cl_event event 
= eList.front(); - clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submit, NULL); - clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); - clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); - - wait = start - submit; - execute = end - start; - } - - eList.clear(); - - executionTime += (execute) * 1e-6; - waitTime += (wait) * 1e-6; - list.pop_front(); - //LOG(INFO) << tag << ":" << "["<< wait <<", " << execute <<"]"; - } - - //LOG(INFO) << "[" << i << "]" << tag << " avg - wait :"<< waitTime/e_size << " ms, execute " << executionTime/e_size <<" ms"; - tmp = std::string("[") + std::to_string(i) + std::string("]\t") + \ - tag + std::string("\t") + std::to_string(executionTime / e_size) + std::string(" ms\n"); - - if (s_size > 1) { - for (int s = 0; s < s_size; s++) { - tmp.append(std::string("--[") + std::to_string(i) + std::string("-") + std::to_string( - s) + std::string("]\t\t")); - tmp.append(std::to_string((float)s_executes[s] * 1e-6 / e_size) + std::string(" ms\n")); - } - - delete []s_waits; - delete []s_executes; - s_waits = NULL; - s_executes = NULL; - } - - log.append(tmp); - - g_wait += (waitTime / e_size); - g_execute += (executionTime / e_size); - //LOG(INFO) << "[" << i << "]" << tag << '\t' << " avg - execute " << '\t'<< executionTime/e_size <<" ms"; - - tList.pop_front(); - } - - tmp = std::string("[Total]\t\t") + \ - std::to_string(g_execute) + std::string(" ms\n"); - log.append(tmp); - LOG(INFO) << log; -} - - - -//template void AMD_ENV::evn_init(); - -#endif // AMD_GPU - -} //namespace saber -} //namespace anakin - diff --git a/saber/core/impl/amd/amd_impl.cpp b/saber/core/impl/amd/amd_impl.cpp index a22146104..f6d6d3732 100644 --- a/saber/core/impl/amd/amd_impl.cpp +++ b/saber/core/impl/amd/amd_impl.cpp @@ -1,8 +1,23 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ #include "core/tensor.h" #include "core/common.h" #include "core/buffer.h" #include "core/data_traits.h" #include "env.h" +#include "utils/amd_logger.h" namespace anakin { @@ -49,15 +64,13 @@ const char* opencl_get_error_string(cl_int err) { } return "Unknown cl error"; - } - /** * \brief for AMD device target only, device target is AMD gpu * use opencl api to manage memory * support device to device, device to host, host to device memcpy -*/ + */ typedef TargetWrapper AMD_API; typedef Env AMD_ENV; @@ -65,7 +78,7 @@ int AMD_API::current_device_id_index = 0; std::map AMD_API::buffers; void AMD_API::get_device_count(int& count) { - cl_platform_id id = AMD_ENV::get_platform_id(); + cl_platform_id id = AMD_API::get_platform_id(); cl_uint nums; AMD_CHECK(clGetDeviceIDs(id, CL_DEVICE_TYPE_GPU, 0, NULL, &nums)); count = (int)nums; @@ -77,12 +90,11 @@ void AMD_API::set_device(int id) { } void AMD_API::mem_alloc(TPtr* ptr, size_t n) { - AMD_ENV::is_init(); -#ifdef AMD_GPU_EXTENSION - //LOG(INFO) << "use CL_MEM_USE_PERSISTENT_MEM_AMD to create buffer."; +#ifdef AMD_GPU_EXTENSION + // LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "use CL_MEM_USE_PERSISTENT_MEM_AMD to create buffer."; #else - //LOG(INFO) << "use CL_MEM_ALLOC_HOST_PTR to create buffer."; + // LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "use CL_MEM_ALLOC_HOST_PTR to create buffer."; #endif int index = get_device_id(); @@ -90,24 +102,31 @@ void AMD_API::mem_alloc(TPtr* ptr, size_t n) { cl_context context = AMD_ENV::cur_env()[index].get_context(); cl_int err; - cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE + cl_mem buf = clCreateBuffer( + context, + CL_MEM_READ_WRITE #ifdef AMD_GPU_EXTENSION - | CL_MEM_USE_PERSISTENT_MEM_AMD + | CL_MEM_USE_PERSISTENT_MEM_AMD #else - | CL_MEM_ALLOC_HOST_PTR + | CL_MEM_ALLOC_HOST_PTR #endif - , n, NULL, &err); + , + n, + NULL, + &err); AMD_CHECK(err); *ptr = buf; - LOG(INFO) << __func__ << "device =" << index << " get context :" << context << " buffer :" << buf << - " size :" << n; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << "device =" << index << " get context :" << + context << " buffer :" << buf + << " size :" << n; } void AMD_API::mem_free(TPtr ptr) { if (ptr != nullptr) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " buffer :" << ptr; clReleaseMemObject(ptr); } } @@ -116,27 +135,23 @@ void AMD_API::mem_free(TPtr ptr) { void AMD_API::mem_set(TPtr ptr, int value, size_t n) { if (ptr == nullptr) { - return ; + return; } - AMD_ENV::is_init(); - Device dev = AMD_ENV::cur_env()[current_device_id_index]; - stream_t cm = dev.get_available_stream(); + stream_t cm = dev.get_available_stream(); clEnqueueFillBuffer(cm, ptr, &value, sizeof(int), 0, n, 0, NULL, NULL); } #else -template +template void AMD_API::mem_set(TPtr ptr, U value, size_t n) { if (ptr == nullptr) { - return ; + return; } - AMD_ENV::is_init(); - Device dev = AMD_ENV::cur_env()[current_device_id_index]; stream_t cm = dev.get_available_stream(stream); @@ -148,14 +163,12 @@ void AMD_API::mem_set(TPtr ptr, U value, size_t n) { void AMD_API::create_event(event_t* event, bool flag) { - LOG(INFO) << "create_event break opencl call sequence. Is baidu expect clCreateUserEvent?"; - //do nothing for this. + // do nothing for this. 
*event = nullptr; - //Env::is_init(); - //cl_int err = CL_SUCCESS; - //event = clCreaeUserEvent(AMD_ENV::cur_env()[current_device_id_index].context, &err); - //AMD_CHECK(err); + // cl_int err = CL_SUCCESS; + // event = clCreaeUserEvent(AMD_ENV::cur_env()[current_device_id_index].context, &err); + // AMD_CHECK(err); } void AMD_API::create_stream(stream_t* stream) { @@ -168,7 +181,6 @@ void AMD_API::create_stream(stream_t* stream) { * @param flag input flag, 0: default stream flag, 1: cudaStreamNonBlocking */ void AMD_API::create_stream_with_flag(stream_t* stream, unsigned int flag) { - Env::is_init(); cl_int err = CL_SUCCESS; if (!stream) { @@ -204,13 +216,12 @@ void AMD_API::destroy_stream(stream_t stream) { } void AMD_API::destroy_event(event_t event) { - // LOG(INFO) << __func__ <<" :Does baidu expect this event is an User Event?"; - if (event == nullptr) { - // LOG(INFO) << "event is empty, do nothing"; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "event is empty, do nothing"; return; } + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " " << event; cl_command_type t; AMD_CHECK(clGetEventInfo(event, CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &t, NULL)); @@ -221,51 +232,53 @@ void AMD_API::destroy_event(event_t event) { if (refs == 1) { AMD_CHECK(clSetUserEventStatus(event, CL_COMPLETE)); } - - AMD_CHECK(clReleaseEvent(event)); - } else { - // LOG(INFO) << "NOT User Event, do nothing"; } + AMD_CHECK(clReleaseEvent(event)); } -void AMD_API::record_event(event_t event, stream_t stream) { - //LOG(WARNING) << "OpenCL record event when calling clEnqueueXXX, so we use marker to simulate this behavior"; +void AMD_API::record_event(event_t& event, stream_t stream) { + // LOG(WARNING) << "OpenCL record event when calling clEnqueueXXX, so we use marker to simulate + // this behavior"; AMD_CHECK(clEnqueueMarkerWithWaitList(stream, 0, NULL, &event)); - //LOG(INFO) << "marker event "<< event; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "marker event " << event; } void AMD_API::query_event(event_t event) { // TODO - LOG(ERROR) << - "OpenCL us clGetEventInfo to retrive event's specific info. 
so we need to know what info user want to know"; + LOG(WARNING) << "device target AMD \" query_event\" is not implemented"; } void AMD_API::sync_event(event_t event) { - // LOG(INFO) << __func__ ; if (event == nullptr) { - LOG(INFO) << "event is empty, do nothing"; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " event is empty, do nothing"; return; } - // LOG(INFO) << "sync_event E " << event; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "sync_event E " << event; AMD_CHECK(clWaitForEvents(1, &event)); - // LOG(INFO) << "sync_event X " << event; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "sync_event X " << event; } void AMD_API::sync_stream(event_t event, stream_t stream) { - LOG(INFO) << __func__ ; - - if (event != nullptr) { - LOG(INFO) << "event is null"; + if (stream == NULL) { + LOG(INFO) << __func__ << " stream is empty, do nothing"; return; } - LOG(INFO) << "sync_stream E "; - AMD_CHECK(clEnqueueBarrierWithWaitList(stream, 1, &event, NULL)); + LOG(INFO) << "sync_stream E"; + event_t revent; + + if (event == NULL) { + AMD_CHECK(clEnqueueBarrierWithWaitList(stream, 0, NULL, &revent)); + } else { + AMD_CHECK(clEnqueueBarrierWithWaitList(stream, 1, &event, &revent)); + } + clFlush(stream); - LOG(INFO) << "sync_stream D "; + AMD_API::sync_event(revent); + LOG(INFO) << "sync_stream D"; } #if 0 @@ -314,8 +327,14 @@ void AMD_API::sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ const TPtr src, size_t src_offset, int src_id, \ size_t count, __DtoD) { // TODO - LOG(INFO) << __func__ << " D2D dst=" << dst << " dst_id=" << dst_id << " dst_office=" << dst_offset - << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset << " count=" << count; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " D2D dst=" << dst << " dst_id=" << dst_id << + " dst_office=" << dst_offset + << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset + << " count=" << count; + + if (count == 0) { + return; + } cl_mem dst_mem = (cl_mem)dst; cl_mem src_mem = (cl_mem)src; @@ -327,6 +346,7 @@ void AMD_API::sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ &event)); clFlush(cm); clWaitForEvents(1, &event); + clReleaseEvent(event); LOG(INFO) << "OpenCL, sync, D2D, size: " << count; } else { cl_command_queue dst_cm = AMD_ENV::cur_env()[dst_id].get_available_stream(); @@ -343,9 +363,9 @@ void AMD_API::sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ clFlush(src_cm); clFlush(dst_cm); clWaitForEvents(1, &event); + clReleaseEvent(event); LOG(INFO) << "OpenCL, sync, P2P, size: " << count; } - } #endif @@ -360,9 +380,10 @@ void AMD_API::async_memcpy(TPtr dst, int dst_id, const TPtr src, int src_id, \ size_t src_offset = src.offset; //async_memcpy_with_offset(dst, dst_id, 0, src, src_id, 0, count, stream, __DtoD()); - LOG(INFO) << __func__ << " D2D dst=" << (void*)dst_mem << " dst_id=" << dst_id << " dst_office=" << - dst_offset << " src=" << (void*)src_mem << " src_id=" << src_id << " src_offset=" << src_offset << - " count=" << count; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " D2D dst=" << (void*)dst_mem << " dst_id=" << + dst_id << " dst_office=" << + dst_offset << " src=" << (void*)src_mem << " src_id=" << src_id << " src_offset=" << src_offset << + " count=" << count; //cl_mem dst_mem = (cl_mem) dst; //cl_mem src_mem = (cl_mem) src; @@ -395,15 +416,22 @@ void AMD_API::async_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ const TPtr src, size_t src_offset, int src_id, \ size_t count, stream_t stream, __DtoD) { - LOG(INFO) << __func__ << " D2D dst=" << dst << 
" dst_id=" << dst_id << " dst_office=" << dst_offset - << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset << " count=" << count; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " D2D dst=" << dst << " dst_id=" << dst_id << + " dst_office=" << dst_offset + << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset + << " count=" << count; + + if (count == 0) { + return; + } cl_mem dst_mem = (cl_mem)dst; cl_mem src_mem = (cl_mem)src; if (dst_id == src_id) { cl_command_queue cm = AMD_ENV::cur_env()[dst_id].get_available_stream(stream); - AMD_CHECK(clEnqueueCopyBuffer(cm, src_mem, dst_mem, src_offset, dst_offset, count, 0, NULL, NULL)); + AMD_CHECK(clEnqueueCopyBuffer( + cm, src_mem, dst_mem, src_offset, dst_offset, count, 0, NULL, NULL)); clFlush(cm); LOG(INFO) << "OpenCL, sync, D2D, size: " << count; } else { @@ -413,22 +441,19 @@ void AMD_API::async_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ cl_int err; cl_event dst_event; cl_event src_event; - cl_event src_event2; void* host_ptr = clEnqueueMapBuffer(src_cm, src_mem, CL_FALSE, CL_MAP_READ, src_offset, count, 0, NULL, &dst_event, &err); AMD_CHECK(err); AMD_CHECK(clEnqueueWriteBuffer(dst_cm, dst_mem, CL_FALSE, dst_offset, count, host_ptr, 1, &dst_event, &src_event)); - AMD_CHECK(clEnqueueUnmapMemObject(src_cm, src_mem, host_ptr, 1, &src_event2, NULL)); + AMD_CHECK(clEnqueueUnmapMemObject(src_cm, src_mem, host_ptr, 1, &src_event, NULL)); clFlush(src_cm); clFlush(dst_cm); LOG(INFO) << "OpenCL, sync, P2P, size: " << count; } - } #endif - #if 0 void AMD_API::sync_memcpy(TPtr dst, int dst_id, const void* src, int src_id, \ size_t count, __HtoD) { @@ -436,8 +461,9 @@ void AMD_API::sync_memcpy(TPtr dst, int dst_id, const void* src, int src_id, \ cl_mem dst_mem = dst.dmem; size_t dst_offset = dst.offset; - LOG(INFO) << __func__ << " H2D dst=" << (void*)dst_mem << " dst_id=" << dst_id << " dst_office=" << - dst_offset << " src=" << src << " src_id=" << src_id << " count=" << count; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " H2D dst=" << (void*)dst_mem << " dst_id=" << + dst_id << " dst_office=" << + dst_offset << " src=" << src << " src_id=" << src_id << " count=" << count; cl_event event; cl_command_queue dst_cm = AMD_ENV::cur_env()[dst_id].get_available_stream(); @@ -451,18 +477,24 @@ void AMD_API::sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ const void* src, size_t src_offset, int src_id, \ size_t count, __HtoD) { - LOG(INFO) << __func__ << " H2D dst=" << dst << " dst_id=" << dst_id << " dst_office=" << dst_offset - << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset << " count=" << count; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " H2D dst=" << dst << " dst_id=" << dst_id << + " dst_office=" << dst_offset + << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset + << " count=" << count; + + if (count == 0) { + return; + } cl_event event; cl_mem dst_mem = (cl_mem)dst; cl_command_queue dst_cm = AMD_ENV::cur_env()[dst_id].get_available_stream(); - clEnqueueWriteBuffer(dst_cm, dst_mem, CL_TRUE, dst_offset, count, (char*)src + src_offset, 0, NULL, - &event); + AMD_CHECK(clEnqueueWriteBuffer( + dst_cm, dst_mem, CL_TRUE, dst_offset, count, (char*)src + src_offset, 0, NULL, &event)); clFlush(dst_cm); clWaitForEvents(1, &event); + clReleaseEvent(event); LOG(INFO) << "OpenCL, sync, H2D, size: " << count; - } #endif @@ -473,8 +505,9 @@ void AMD_API::async_memcpy(TPtr dst, int dst_id, const void* src, int src_id, \ cl_mem dst_mem = 
dst.dmem; size_t dst_offset = dst.offset; - LOG(INFO) << __func__ << " H2D dst=" << (void*)dst_mem << " dst_id=" << dst_id << " dst_office=" << - dst_offset << " src=" << src << " src_id=" << src_id << " count=" << count; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " H2D dst=" << (void*)dst_mem << " dst_id=" << + dst_id << " dst_office=" << + dst_offset << " src=" << src << " src_id=" << src_id << " count=" << count; cl_command_queue dst_cm = AMD_ENV::cur_env()[dst_id].get_available_stream(stream); clEnqueueWriteBuffer(dst_cm, dst_mem, CL_FALSE, dst_offset, count, src, 0, NULL, NULL); @@ -487,13 +520,19 @@ void AMD_API::async_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ const void* src, size_t src_offset, int src_id, \ size_t count, stream_t stream, __HtoD) { - LOG(INFO) << __func__ << " H2D dst=" << dst << " dst_id=" << dst_id << " dst_office=" << dst_offset - << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset << " count=" << count; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " H2D dst=" << dst << " dst_id=" << dst_id << + " dst_office=" << dst_offset + << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset + << " count=" << count; + + if (count == 0) { + return; + } cl_mem dst_mem = (cl_mem)dst; cl_command_queue dst_cm = AMD_ENV::cur_env()[dst_id].get_available_stream(stream); - clEnqueueWriteBuffer(dst_cm, dst_mem, CL_FALSE, dst_offset, count, (char*)src + src_offset, 0, NULL, - NULL); + AMD_CHECK(clEnqueueWriteBuffer( + dst_cm, dst_mem, CL_FALSE, dst_offset, count, (char*)src + src_offset, 0, NULL, NULL)); clFlush(dst_cm); LOG(INFO) << "OpenCL, async, H2D, size: " << count; } @@ -520,21 +559,27 @@ void AMD_API::sync_memcpy(void* dst, int dst_id, const TPtr src, int src_id, \ void AMD_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \ const TPtr src, size_t src_offset, int src_id, \ size_t count, __DtoH) { - LOG(INFO) << __func__ << " D2H dst=" << dst << " dst_id=" << dst_id << " dst_office=" << dst_offset - << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset << " count=" << count; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " D2H dst=" << dst << " dst_id=" << dst_id << + " dst_office=" << dst_offset + << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset + << " count=" << count; + + if (count == 0) { + return; + } cl_event event; cl_mem src_mem = (cl_mem)src; cl_command_queue src_cm = AMD_ENV::cur_env()[src_id].get_available_stream(); - clEnqueueReadBuffer(src_cm, src_mem, CL_TRUE, src_offset, count, (char*) dst + dst_offset, 0, NULL, - &event); + AMD_CHECK(clEnqueueReadBuffer( + src_cm, src_mem, CL_TRUE, src_offset, count, (char*)dst + dst_offset, 0, NULL, &event)); clFlush(src_cm); clWaitForEvents(1, &event); + clReleaseEvent(event); LOG(INFO) << "OpenCL, sync, D2H, size: " << count; } #endif - #if 0 void AMD_API::async_memcpy(void* dst, int dst_id, const TPtr src, int src_id, \ size_t count, stream_t& stream, __DtoH) { @@ -542,8 +587,9 @@ void AMD_API::async_memcpy(void* dst, int dst_id, const TPtr src, int src_id, \ cl_mem src_mem = src.dmem; size_t src_offset = src.offset; - LOG(INFO) << __func__ << " D2H dst=" << dst << " dst_id=" << dst_id << " src=" << - (void*)src_mem << " src_id=" << src_id << " src_offset=" << src_offset << " count=" << count; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " D2H dst=" << dst << " dst_id=" << dst_id << + " src=" << + (void*)src_mem << " src_id=" << src_id << " src_offset=" << src_offset << " count=" << count; 
//cl_mem src_mem = (cl_mem) src; cl_command_queue src_cm = AMD_ENV::cur_env()[src_id].get_available_stream(stream); @@ -556,14 +602,20 @@ void AMD_API::async_memcpy(void* dst, size_t dst_offset, int dst_id, \ const TPtr src, size_t src_offset, int src_id, \ size_t count, stream_t stream, __DtoH) { - LOG(INFO) << __func__ << " D2H dst=" << dst << " dst_id=" << dst_id << " dst_office=" << dst_offset - << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset << " count=" << count; - LOG(INFO) << __func__ << " stream: " << stream; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " D2H dst=" << dst << " dst_id=" << dst_id << + " dst_office=" << dst_offset + << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset + << " count=" << count; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " stream: " << stream; + + if (count == 0) { + return; + } cl_mem src_mem = (cl_mem)src; cl_command_queue src_cm = AMD_ENV::cur_env()[src_id].get_available_stream(stream); - clEnqueueReadBuffer(src_cm, src_mem, CL_FALSE, src_offset, count, (char*)dst + dst_offset, 0, NULL, - NULL); + AMD_CHECK(clEnqueueReadBuffer( + src_cm, src_mem, CL_FALSE, src_offset, count, (char*)dst + dst_offset, 0, NULL, NULL)); clFlush(src_cm); LOG(INFO) << "OpenCL, async, D2H, size: " << count; } @@ -598,6 +650,10 @@ void AMD_API::sync_memcpy_p2p(TPtr dst, int dst_dev, const TPtr src, \ void AMD_API::sync_memcpy_p2p(TPtr dst, size_t dst_offset, int dst_id, \ const TPtr src, size_t src_offset, int src_id, size_t count) { + if (count == 0) { + return; + } + cl_command_queue dst_cm = AMD_ENV::cur_env()[dst_id].get_available_stream(); cl_command_queue src_cm = AMD_ENV::cur_env()[src_id].get_available_stream(); @@ -613,11 +669,11 @@ void AMD_API::sync_memcpy_p2p(TPtr dst, size_t dst_offset, int dst_id, \ clFlush(src_cm); clFlush(dst_cm); clWaitForEvents(1, &event); + clReleaseEvent(event); LOG(INFO) << "OpenCL, sync, P2P, size: " << count; } #endif - #if 0 void AMD_API::async_memcpy_p2p(TPtr dst, int dst_dev, const TPtr src, \ int src_dev, size_t count, stream_t& stream) { @@ -648,6 +704,10 @@ void AMD_API::async_memcpy_p2p(TPtr dst, size_t dst_offset, int dst_id, \ const TPtr src, size_t src_offset, int src_id, \ size_t count, stream_t stream) { + if (count == 0) { + return; + } + cl_command_queue dst_cm = AMD_ENV::cur_env()[dst_id].get_available_stream(stream); cl_command_queue src_cm = AMD_ENV::cur_env()[src_id].get_available_stream(stream); @@ -669,13 +729,12 @@ void AMD_API::async_memcpy_p2p(TPtr dst, size_t dst_offset, int dst_id, \ } #endif - /** * \brief device target return currently used device id * @return currently activated device id */ int AMD_API::get_device_id() { - //LOG(INFO) << "get device id = " << current_device_id_index; + // LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "get device id = " << current_device_id_index; return current_device_id_index; } @@ -721,7 +780,7 @@ cl_platform_id AMD_API::get_platform_id() { AMD_CHECK(clGetPlatformInfo(platformIDs[i], CL_PLATFORM_NAME, infoSize, info, NULL)); if (strstr(info, "AMD") != NULL) { - id = platformIDs[i]; + id = platformIDs[i]; errNum = CL_SUCCESS; free(info); break; @@ -795,7 +854,6 @@ void AMD_API::init(){ } */ - typedef TargetWrapper AMDH_API; typedef Env AMDH_ENV; @@ -806,7 +864,7 @@ void AMDH_API::get_device_count(int& count) { } void AMDH_API::set_device(int id) { - //todo + // todo LOG(WARNING) << "host target AMDHX86 \" set_device\" is not implemented"; } @@ -820,21 +878,18 @@ void AMDH_API::mem_free(void* ptr) { } } - void 
AMDH_API::mem_set(void* ptr, int value, size_t n) { memset(ptr, value, n); } void AMDH_API::create_event(event_t* event, bool flag) { - LOG(INFO) << "create_event break opencl call sequence. Is baidu expect clCreateUserEvent?"; - //do nothing for this. + // do nothing for this. *event = nullptr; - //Env::is_init(); - //cl_int err = CL_SUCCESS; - //event = clCreaeUserEvent(AMD_ENV::cur_env()[current_device_id_index].context, &err); - //AMD_CHECK(err); + // cl_int err = CL_SUCCESS; + // event = clCreaeUserEvent(AMD_ENV::cur_env()[current_device_id_index].context, &err); + // AMD_CHECK(err); } void AMDH_API::create_stream(stream_t* stream) { @@ -847,7 +902,6 @@ void AMDH_API::create_stream(stream_t* stream) { * @param flag input flag, 0: default stream flag, 1: cudaStreamNonBlocking */ void AMDH_API::create_stream_with_flag(stream_t* stream, unsigned int flag) { - Env::is_init(); int current_device_id_index = 0; cl_int err = CL_SUCCESS; @@ -872,10 +926,11 @@ void AMDH_API::destroy_stream(stream_t stream) { void AMDH_API::destroy_event(event_t event) { if (event == nullptr) { - // LOG(INFO) << "event is empty, do nothing"; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "event is empty, do nothing"; return; } + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " " << event; cl_command_type t; AMD_CHECK(clGetEventInfo(event, CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &t, NULL)); @@ -886,15 +941,12 @@ void AMDH_API::destroy_event(event_t event) { if (refs == 1) { AMD_CHECK(clSetUserEventStatus(event, CL_COMPLETE)); } - - AMD_CHECK(clReleaseEvent(event)); - } else { - // LOG(INFO) << "NOT User Event, do nothing"; } + AMD_CHECK(clReleaseEvent(event)); } -void AMDH_API::record_event(event_t event, stream_t stream) { +void AMDH_API::record_event(event_t& event, stream_t stream) { AMD_CHECK(clEnqueueMarkerWithWaitList(stream, 0, NULL, &event)); } @@ -904,30 +956,33 @@ void AMDH_API::query_event(event_t event) { } void AMDH_API::sync_event(event_t event) { - // LOG(INFO) << __func__ ; - if (event == nullptr) { - LOG(INFO) << "event is empty, do nothing"; + LOG(INFO) << __func__ << " event is empty, do nothing"; return; } - // LOG(INFO) << "sync_event E " << event; + LOG(INFO) << "sync_event E " << event; AMD_CHECK(clWaitForEvents(1, &event)); - // LOG(INFO) << "sync_event X " << event; + LOG(INFO) << "sync_event X " << event; } void AMDH_API::sync_stream(event_t event, stream_t stream) { - LOG(INFO) << __func__ ; - - if (event != nullptr) { - LOG(INFO) << "event is null"; + if (stream == NULL) { + LOG(INFO) << __func__ << " stream is empty, do nothing"; return; } - LOG(INFO) << "sync_stream E "; - AMD_CHECK(clEnqueueBarrierWithWaitList(stream, 1, &event, NULL)); + LOG(INFO) << "sync_stream E"; + event_t revent; + + if (event == NULL) { + AMD_CHECK(clEnqueueBarrierWithWaitList(stream, 0, NULL, &revent)); + } else { + AMD_CHECK(clEnqueueBarrierWithWaitList(stream, 1, &event, &revent)); + } + clFlush(stream); - LOG(INFO) << "sync_stream D "; + LOG(INFO) << "sync_stream D"; } void AMDH_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \ @@ -972,6 +1027,6 @@ template struct Env; #endif // AMD_GPU -} //namespace saber +} // namespace saber -} //namespace anakin +} // namespace anakin diff --git a/saber/core/impl/amd/utils/amd_base.h b/saber/core/impl/amd/utils/amd_base.h new file mode 100644 index 000000000..23c3fc422 --- /dev/null +++ b/saber/core/impl/amd/utils/amd_base.h @@ -0,0 +1,143 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_FUNCS_IMPL_AMD_UTILS_AMDBASE_H +#define ANAKIN_SABER_FUNCS_IMPL_AMD_UTILS_AMDBASE_H +#include +#include +#include "amd_logger.h" +#include + +namespace anakin { +namespace saber { + +typedef struct ExtSolutionConfigTpye { + int in_tile0, in_tile1; + int grp_tile0, grp_tile1; + int out_pix_tile0, out_pix_tile1; + int n_stacks; + int n_out_pix_tiles; + int n_out_tiles_perstack; + int n_in_data_tiles; + int n_read_procs; + int alu_tile0, alu_tile1; + int horiz_out_pix; + int vert_out_pix; +} T_ExtSolutionConfig; + +enum KernelType { + DEFAULT = 0, + SABER = DEFAULT, + SOURCE, + MIOPEN, +}; + +struct KernelInfo { + std::string comp_options; + int wk_dim = 1; + std::vector<size_t> l_wk; + std::vector<size_t> g_wk; + std::vector<size_t> g_wk_offset; + std::string kernel_file; + std::string kernel_name; + KernelType kernel_type = DEFAULT; + friend std::ostream& operator<<(std::ostream& os, const KernelInfo& k); + + KernelInfo& operator=(KernelInfo right) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "assign KernelInfo"; + clone(&right); return *this; + }; + KernelInfo& operator=(KernelInfo* right) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "assign KernelInfo *"; + clone(right); return *this; + }; + KernelInfo& operator=(miopen::solver::KernelInfo& right) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "assign miopen::solver::KernelInfo"; + clone(&right); return *this; + }; + + void printE() { + std::string s = ""; + + LOG(INFO) << "======================================================================"; + + if (kernel_type == KernelType::SOURCE) { + LOG(INFO) << "kernel is cl string"; + } else { + LOG(INFO) << "kernel file name: " << kernel_file; + } + + LOG(INFO) << "kernel name: " << kernel_name; + + for (auto v : l_wk) { + s += std::to_string(v); + s += " "; + } + + LOG(INFO) << "local work size: " << s; + s = ""; + + for (auto v : g_wk) { + s += std::to_string(v); + s += " "; + } + + LOG(INFO) << "global work size: " << s; + LOG(INFO) << "compile option: " << comp_options; + LOG(INFO) << "kernel type: " << kernel_type; + LOG(INFO) << "======================================================================"; + }; + + void print() { +#ifdef ENABLE_DEBUG + printE(); +#endif + }; + +private: + template <typename T> + void copy_vector(std::vector<T>* l, std::vector<T>* r) { + l->clear(); + typename std::vector<T>::iterator i; + + for (i = r->begin(); i < r->end(); i++) { + l->push_back(*i); + } + } + void clone(KernelInfo* ki) { + comp_options.assign(ki->comp_options); + wk_dim = ki->wk_dim; + copy_vector(&l_wk, &ki->l_wk); + copy_vector(&g_wk, &ki->g_wk); + copy_vector(&g_wk_offset, &ki->g_wk_offset); + kernel_file.assign(ki->kernel_file); + kernel_name.assign(ki->kernel_name); + kernel_type = ki->kernel_type; + } + + void clone(miopen::solver::KernelInfo* ki) { + comp_options.assign(ki->comp_options); + wk_dim = ki->g_wk.size(); + copy_vector(&l_wk, &ki->l_wk); + copy_vector(&g_wk, &ki->g_wk); + //copy_vector(&g_wk_offset, &ki->g_wk_offset); + kernel_file.assign(ki->kernel_file); + kernel_name.assign(ki->kernel_name); + kernel_type = MIOPEN; }
+}; + +} // namespace saber +} // namespace anakin +#endif diff --git a/saber/core/impl/amd/utils/amd_cache.h b/saber/core/impl/amd/utils/amd_cache.h new file mode 100644 index 000000000..2ab2f3dcc --- /dev/null +++ b/saber/core/impl/amd/utils/amd_cache.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_FUNCS_IMPL_AMD_UTILS_AMDCACHE_H +#define ANAKIN_SABER_FUNCS_IMPL_AMD_UTILS_AMDCACHE_H +#include +#include +#include +#include +#include "ocl/ocl_kernel.h" +#include "anakin_config.h" + +namespace anakin { +namespace saber { + +template +class AMDCache { +public: + typedef std::map MType; + + static AMDCache* getInstance() { + if (_instance.get() == NULL) { + _instance = std::unique_ptr>(new AMDCache); + } + + return _instance.get(); + } + + size_t getSize() { + std::lock_guard lock(m_mutex); + return cache_size; + } + + // lookup the binary given the file name + T lookup(U key) { + std::lock_guard lock(m_mutex); + typename MType::iterator iter; + iter = cache_map.find(key); + + if (iter != cache_map.end()) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "[" << typeid(T).name() << "] found cache!"; + return iter->second; + } else { + return NULL; + } + } + + // add program to the cache + void add(U key, T t) { + std::lock_guard lock(m_mutex); + + if (!lookup(key)) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "[" << typeid(T).name() << "] insert cache to slot, size: " + << cache_size; + cache_size++; + cache_map.insert(typename MType::value_type(key, t)); + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "[" << typeid(T).name() << "] insert cache to slot done"; + } + } + + // The presumed watermark for the cache volume (256MB). Is it enough? + // We may need more delicate algorithms when necessary later. + // Right now, let's just leave it along. + static const unsigned MAX_CACHE_SIZE = 1024; + + ~AMDCache() { + release(); + } + + AMDCache() { + cache_size = 0; + cache_map.clear(); + }; + +protected: + void release() { + cache_size = 0; + cache_map.clear(); + } + +private: + static typename std::unique_ptr> _instance; + MType cache_map; + unsigned int cache_size; + std::recursive_mutex m_mutex; +}; + +template +std::unique_ptr> AMDCache::_instance = NULL; + +using ProgramCache = AMDCache>; +using KernelCache = AMDCache>; + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/impl/amd/cl/ReluUni.cl b/saber/core/impl/amd/utils/amd_common.h similarity index 65% rename from saber/funcs/impl/amd/cl/ReluUni.cl rename to saber/core/impl/amd/utils/amd_common.h index 5656b0155..5ab91c220 100644 --- a/saber/funcs/impl/amd/cl/ReluUni.cl +++ b/saber/core/impl/amd/utils/amd_common.h @@ -12,11 +12,18 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifndef ANAKIN_SABER_CORE_IMPL_AMD_AMDCOMMON_H +#define ANAKIN_SABER_CORE_IMPL_AMD_AMDCOMMON_H -__attribute__((reqd_work_group_size(256, 1, 1))) __kernel void -ReluUni(const __global float* __restrict in, - __global float* __restrict out, - float slope) -{ - out[get_global_id(0)] = in[get_global_id(0)] * (in[get_global_id(0)] > 0.0f ? 1.0f : slope); -} +#include "anakin_config.h" +#include "saber/core/common.h" +namespace anakin { +namespace saber { +#ifdef USE_OPENCL +typedef cl_command_queue AMDStream_t; +typedef cl_event AMDEvent_t; +#endif + +} // namespace saber +} // namespace anakin +#endif diff --git a/saber/core/impl/amd/utils/amd_error.h b/saber/core/impl/amd/utils/amd_error.h new file mode 100644 index 000000000..a6efd28f0 --- /dev/null +++ b/saber/core/impl/amd/utils/amd_error.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_CORE_IMPL_AMD_UTILS_AMDERROR_H +#define ANAKIN_SABER_CORE_IMPL_AMD_UTILS_AMDERROR_H + +#include +#include +#include +#include + +namespace anakin { +namespace saber { + +struct Exception : std::exception { + std::string message; + int status; + Exception(const std::string& msg = "") : message(msg), status(-1) {} + + Exception(int s, const std::string& msg = "") : message(msg), status(s) {} + + Exception SetMessage(const std::string& file, int line) { + message = file + ":" + std::to_string(line) + ": " + message; + return *this; + } + + const char* what() const noexcept override { + return message.c_str(); + } +}; + +#define AMD_THROW(...) throw anakin::saber::Exception(__VA_ARGS__).SetMessage(__FILE__, __LINE__) + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/core/impl/amd/utils/amd_file_utils.cpp b/saber/core/impl/amd/utils/amd_file_utils.cpp new file mode 100644 index 000000000..3e33041a8 --- /dev/null +++ b/saber/core/impl/amd/utils/amd_file_utils.cpp @@ -0,0 +1,435 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#include "amd_file_utils.h" +#include +#include +#include +#include +#include +#include "amd_logger.h" +#include "amd_error.h" +#include +#include +#include +#include + +namespace anakin { +namespace saber { + +#define SABER_CACHE_DIR "~/.cache/amd_saber/" +#define separator "/" + +std::string md5(std::string s) { + std::array result {}; + MD5(reinterpret_cast(s.data()), s.length(), result.data()); + + std::ostringstream sout; + sout << std::hex << std::setfill('0'); + + for (auto c : result) + sout << std::setw(2) << int {c}; + + return sout.str(); +} + +inline std::string +ReplaceString(std::string subject, const std::string& search, const std::string& replace) { + size_t pos = 0; + + while ((pos = subject.find(search, pos)) != std::string::npos) { + subject.replace(pos, search.length(), replace); + pos += replace.length(); + } + + return subject; +} + +bool is_directory_separator(char c) { + return c == '/'; +} + +bool is_root_separator(const char* str, size_t pos) { + // pos is position of the separator + if (str != nullptr && !is_directory_separator(str[pos])) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "precondition violation"; + } + + // subsequent logic expects pos to be for leftmost slash of a set + while (pos > 0 && is_directory_separator(str[pos - 1])) { + --pos; + } + + // "/" [...] + if (pos == 0) { + return true; + } + + // "//" name "/" + if (pos < 3 || !is_directory_separator(str[0]) || !is_directory_separator(str[1])) { + return false; + } + + std::string tmp(str); + return tmp.find_first_of("/", 2) == pos; +} + +size_t root_directory_start(const char* path, size_t size) { + // return npos if no root_directory found + // case "//" + if (size == 2 && is_directory_separator(path[0]) && is_directory_separator(path[1])) { + return std::string::npos; + } + + // case "//net {/}" + if (size > 3 && is_directory_separator(path[0]) && is_directory_separator(path[1]) + && !is_directory_separator(path[2])) { + std::string str(path); + size_t pos = str.find_first_of("/", 2); + return pos < size ? pos : std::string::npos; + } + + // case "/" + if (size > 0 && is_directory_separator(path[0])) { + return 0; + } + + return std::string::npos; +} + +size_t filename_pos(const char* str, size_t end_pos) { + // end_pos is past-the-end position + // return 0 if str itself is filename (or empty) + + // case: "//" + if (end_pos == 2 && is_directory_separator(str[0]) && is_directory_separator(str[1])) { + return 0; + } + + // case: ends in "/" + if (end_pos && is_directory_separator(str[end_pos - 1])) { + return end_pos - 1; + } + + // set pos to start of last element + std::string filename(str); + size_t pos = (filename.find_last_of('/', end_pos - 1)); + + return (pos == std::string::npos // path itself must be a filename (or empty) + || (pos == 1 && is_directory_separator(str[0]))) // or net + ? 0 // so filename is entire string + : pos + 1; // or starts after delimiter +} + +size_t parent_path_end(const std::string& path) { + size_t end_pos = filename_pos(path.c_str(), path.length()); + + bool filename_was_separator = path.length() && is_directory_separator(path.c_str()[end_pos]); + + // skip separators unless root directory + size_t root_dir_pos = root_directory_start(path.c_str(), end_pos); + + for (; end_pos > 0 && (end_pos - 1) != root_dir_pos + && is_directory_separator(path.c_str()[end_pos - 1]); + --end_pos) { + } + + return (end_pos == 1 && root_dir_pos == 0 && filename_was_separator) ? 
std::string::npos + : end_pos; +} + +void SplitString(const std::string& s, std::vector& v, const std::string& c) { + std::string::size_type pos1, pos2; + pos2 = s.find(c); + pos1 = 0; + + while (std::string::npos != pos2) { + v.push_back(s.substr(pos1, pos2 - pos1)); + + pos1 = pos2 + c.size(); + pos2 = s.find(c, pos1); + } + + if (pos1 != s.length()) { + v.push_back(s.substr(pos1)); + } +} + +std::string temp_directory_path() { + const char* val = 0; + (val = std::getenv("TMPDIR")) || (val = std::getenv("TMP")) || (val = std::getenv("TEMP")) + || (val = std::getenv("TEMPDIR")); + + const char* default_tmp = "/tmp"; + std::string temp_directory((val != 0) ? val : default_tmp); + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "temp directory path :" << temp_directory; + return temp_directory; +} + +bool is_directory(const std::string& path) { + struct stat info; + + if (stat(path.c_str(), &info) == 0) { + if (info.st_mode & S_IFDIR) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "it's a directory"; + return true; + } else if (info.st_mode & S_IFREG) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "it's a file"; + return false; + } else { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "not file or directory"; + } + } else { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "path not exist"; + } + + return false; +} + +std::string filename(const std::string& filename) { + size_t pos = filename_pos(filename.c_str(), filename.length()); + return (filename.length() && pos && is_directory_separator(filename.c_str()[pos]) + && !is_root_separator(filename.c_str(), pos)) + ? "." + : filename.substr(pos, filename.length()); +} + +std::string remove_filename(std::string path) { + return path.erase(parent_path_end(path), path.length()); +} + +int permissions(std::string& p, mode_t mode) { + int result = chmod(p.c_str(), mode); + + if (result != 0) { + result = errno; + } + + return (result); +} + +std::string unique_path() { + int unique_path_len = 19; + std::string str = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + std::string newstr; + int pos, i = 0; + std::string str1 = "-"; + + while (newstr.size() != unique_path_len) { + pos = ((rand() % (str.size() - 1))); + newstr += str.substr(pos, 1); + i++; + + if (i == 4 && newstr.size() != unique_path_len) { + newstr += "-"; + i = 0; + } + } + + return newstr; +} + +bool exists(std::string path) { + int state = access(path.c_str(), R_OK | W_OK); + + if (state == 0) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "file path exist"; + return true; + } else { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "file path not exist, path :" << path; + return false; + } +} + +std::string parent_path(const std::string& path) { + size_t end_pos = parent_path_end(path); + return end_pos == std::string::npos ? path : path.substr(0, end_pos); +} + +int file_status(const std::string& path) { + struct stat info; + + if (stat(path.c_str(), &info) == 0) { + if (info.st_mode & S_IFDIR) { + return 1; // it's a directory + } else if (info.st_mode & S_IFREG) { + return 2; // it's a file + } else { + return 3; // not direcort or file + } + } else { + return 3; + } +} + +bool filename_is_dot(std::string p) { + std::string name = filename(p); + return name.length() == 1 && *name.c_str() == '.'; +} + +bool filename_is_dot_dot(std::string p) { + return p.length() >= 2 && p.c_str()[p.length() - 1] == '.' 
&& p.c_str()[p.length() - 2] == '.'; +} + +bool create_directory(const std::string& p) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "path :" << p; + int flag = mkdir(p.c_str(), S_IRWXU | S_IRWXG | S_IRWXO); + + if (flag == 0) { + return true; + } + + // attempt to create directory failed + if (is_directory(p)) { + return false; + } + + // attempt to create directory failed && it doesn't already exist + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << + "attempt to create directory failed && it doesn't already exist"; + return false; +} + +bool create_directories(const std::string p) { + if (p.length() == 0) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "invalid argument for path"; + return false; + } + + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "path :" << p; + + if (filename_is_dot(p) || filename_is_dot_dot(p)) { + return create_directories(parent_path(p)); + } + + if (is_directory(p)) { + return false; + } + + std::string parent = parent_path(p); + + if (parent == p) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "internal error: path == parent path"; + } + + if (parent.length() > 0) { + // if the parent does not exist, create the parent + if (!is_directory(p)) { + create_directories(parent); + } + } + + // create the directory + return create_directory(p); +} + +// original functions +std::string ComputeCachePath() { + std::string cache_dir = SABER_CACHE_DIR; + + auto p = ReplaceString(cache_dir, "~", getenv("HOME")); + + if (!exists(p)) { + create_directories(p); + } + + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "Get program path :" << p; + + return p; +} + +std::string GetCachePath() { + static const std::string path = ComputeCachePath(); + return path; +} + +std::string +GetCacheFile(const std::string& device, const std::string& name, const std::string& args) { + std::string filename = name + ".o"; + return GetCachePath() + md5(device + ":" + args) + "/" + filename; +} + +std::string +LoadBinaryPath(const std::string& device, const std::string& name, const std::string& args) { + auto f = GetCacheFile(device, name, args); + + if (exists(f)) { + return f; + } else { + return {}; + } +} + +void SaveBinary( + const std::string& binary_path, + const std::string& device, + const std::string& name, + const std::string& args) { + auto p = GetCacheFile(device, name, args); + create_directories(parent_path(p)); + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "target path :" << p; + rename(binary_path.c_str(), p.c_str()); +} + +std::string LoadFile(const std::string& s) { + std::ifstream t(s); + std::stringstream buffer; + buffer << t.rdbuf(); + return buffer.str(); +} + +miopen::Db GetDb(std::string device_name, int max_CU) { + auto p = ReplaceString(miopen::GetDbPath(), "~", getenv("HOME")); + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "db path :" << p; + + if (!exists(p)) { + create_directories(p); + } + + std::string dbFileName = p + "/" + device_name + "_" + std::to_string(max_CU) + ".cd.pad.txt"; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "db file name :" << dbFileName; + return {dbFileName}; +} + +std::string genTempFilePath(std::string name) { + std::string dir_path; + std::string file_path; + dir_path = + (/*boost::filesystem::*/ temp_directory_path() + "/" + + /*boost::filesystem::*/ unique_path(/*"amd-tmp-%%%%-%%%%-%%%%-%%%%"*/)); + + create_directories(dir_path); + + file_path = dir_path + "/" + name; //(dir_path / name).string(); + + if (!std::ofstream {file_path, std::ios_base::out | std::ios_base::in | std::ios_base::trunc} + .good()) { + AMD_THROW("Failed to create temp file: " + file_path); + } + return 
file_path; +} + +void writeFile(const std::string& content, const std::string& name) { + FILE* fd = std::fopen(name.c_str(), "w"); + + if (std::fwrite(content.c_str(), 1, content.size(), fd) != content.size()) { + std::fclose(fd); + AMD_THROW("Failed to write to src file"); + } + + std::fclose(fd); +} + +} // namespace saber +} // namespace anakin diff --git a/saber/core/impl/amd/utils/amd_file_utils.h b/saber/core/impl/amd/utils/amd_file_utils.h new file mode 100644 index 000000000..2d2e6bdb9 --- /dev/null +++ b/saber/core/impl/amd/utils/amd_file_utils.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_FUNCS_IMPL_UTILS_AMDFILEUTILS_H +#define ANAKIN_SABER_FUNCS_IMPL_UTILS_AMDFILEUTILS_H + +#include +#include +#include +#include +#include +#include +#include + +namespace anakin { +namespace saber { + +std::string temp_directory_path(); +bool is_directory(const std::string& path); +std::string filename(const std::string& filename); +std::string remove_filename(std::string path); +int permissions(std::string& p, mode_t mode); +std::string unique_path(); +bool exists(std::string path); +std::string parent_path(const std::string& path); +bool create_directories(std::string path); + +std::string +GetCacheFile(const std::string& device, const std::string& name, const std::string& args); + +std::string GetCachePath(); + +void SaveBinary( + const std::string& binary_path, + const std::string& device, + const std::string& name, + const std::string& args); + +std::string +LoadBinaryPath(const std::string& device, const std::string& name, const std::string& args); + +std::string LoadFile(const std::string& s); +std::string md5(std::string s); +miopen::Db GetDb(std::string device_name, int max_CU); +std::string genTempFilePath(std::string name); +void writeFile(const std::string& content, const std::string& name); + +} // namespace saber +} // namespace anakin +#endif diff --git a/saber/core/impl/amd/utils/amd_kernel.cpp b/saber/core/impl/amd/utils/amd_kernel.cpp new file mode 100644 index 000000000..424dceda8 --- /dev/null +++ b/saber/core/impl/amd/utils/amd_kernel.cpp @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/
+#include "amd_kernel.h"
+#include "saber/core/env.h"
+#include "saber/core/device.h"
+
+namespace anakin {
+namespace saber {
+
+#ifdef USE_OPENCL
+AMDKernelPtr CreateKernel(int device_id, KernelInfo* ki) {
+    cl_context context  = 0;
+    cl_device_id device = 0;
+
+    Device<AMD> dev = Env<AMD>::cur_env()[device_id]; // anakin device id to AMD device
+    device  = dev.get_device();
+    context = dev.get_context();
+
+    return gen_shared_ocl(new OCLKernel(context, device, ki));
+}
+
+bool LaunchKernel(AMDStream_t stream, amd_kernel_list kernels, bool sync) {
+    LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__;
+
+    float exec_time_ms = 0;
+
+    for (amd_kernel_list::iterator it = kernels.begin(); it != kernels.end(); it++) {
+        cl_event event;
+        LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " E";
+
+        if (!it->get()->Invoke(stream, 0, NULL, (sync ? &event : NULL))) {
+            LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " Failed";
+            return false;
+        }
+
+        if (sync) {
+            TargetWrapper<AMD>::sync_event(event);
+#ifdef ENABLE_AMD_PROFILING
+            cl_ulong start, end;
+            clGetEventProfilingInfo(
+                event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
+            clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
+            exec_time_ms = (end - start) * 1e-6;
+#endif
+        }
+
+        if (sync) {
+            TargetWrapper<AMD>::destroy_event(event);
+        }
+
+        LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " X : " << exec_time_ms << " ms";
+    }
+
+    return true;
+}
+#endif
+
+} // namespace saber
+} // namespace anakin
diff --git a/saber/core/impl/amd/utils/amd_kernel.h b/saber/core/impl/amd/utils/amd_kernel.h
new file mode 100644
index 000000000..28aefc3f2
--- /dev/null
+++ b/saber/core/impl/amd/utils/amd_kernel.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+#ifndef ANAKIN_SABER_CORE_IMPL_AMD_AMDKERNEL_H
+#define ANAKIN_SABER_CORE_IMPL_AMD_AMDKERNEL_H
+
+#include "amd_base.h"
+#include "amd_common.h"
+
+#ifdef USE_OPENCL
+#include "ocl/ocl_kernel.h"
+#endif
+
+namespace anakin {
+namespace saber {
+
+#ifdef USE_OPENCL
+typedef OCLKernel AMDKernel;
+typedef OCLKernelPtr AMDKernelPtr;
+typedef std::list<AMDKernelPtr> amd_kernel_list;
+#endif
+
+extern AMDKernelPtr CreateKernel(int device_id, KernelInfo* ki);
+extern bool LaunchKernel(AMDStream_t stream, amd_kernel_list kernels, bool sync = false);
+
+} // namespace saber
+} // namespace anakin
+#endif
diff --git a/saber/core/impl/amd/utils/amd_kernels.cpp.in b/saber/core/impl/amd/utils/amd_kernels.cpp.in
new file mode 100644
index 000000000..9be1aa4c0
--- /dev/null
+++ b/saber/core/impl/amd/utils/amd_kernels.cpp.in
@@ -0,0 +1,56 @@
+/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+#include "amd_kernels.h"
+#include "utils/logger/logger.h"
+#include <map>
+#include <string>
+#include <algorithm>
+
+namespace anakin {
+namespace saber {
+
+const std::map<std::string, std::string>& kernels() {
+    static const std::map<std::string, std::string> data {${INIT_KERNELS}};
+    return data;
+}
+
+std::string GetKernelSrc(std::string name) {
+    // Use the base name of the string
+    int start  = 0;
+    auto slash = name.find_last_of("/\\");
+
+    if (slash != std::string::npos) {
+        start = slash + 1;
+    }
+
+    int len = name.size();
+    auto ex = name.rfind('.');
+
+    if (ex != std::string::npos) {
+        len = ex - start;
+    }
+
+    auto key = name.substr(start, len);
+    // Convert to uppercase
+    std::transform(key.begin(), key.end(), key.begin(), ::toupper);
+
+    auto it = kernels().find(key);
+
+    if (it == kernels().end()) {
+        LOG(ERROR) << "Failed to load kernel source: " << key;
+        return "";
+    }
+
+    return it->second;
+}
+} // namespace saber
+} // namespace anakin
diff --git a/saber/funcs/impl/amd/cl/MIOpenBiasReLuUni.cl b/saber/core/impl/amd/utils/amd_logger.h
similarity index 57%
rename from saber/funcs/impl/amd/cl/MIOpenBiasReLuUni.cl
rename to saber/core/impl/amd/utils/amd_logger.h
index 676f0df07..a5535ec6a 100644
--- a/saber/funcs/impl/amd/cl/MIOpenBiasReLuUni.cl
+++ b/saber/core/impl/amd/utils/amd_logger.h
@@ -12,14 +12,22 @@
    See the License for the specific language governing permissions and
    limitations under the License.
 */
+#ifndef ANAKIN_SABER_CORE_IMPL_AMD_UTILS_AMDLOGGER_H
+#define ANAKIN_SABER_CORE_IMPL_AMD_UTILS_AMDLOGGER_H
 
-__attribute__((reqd_work_group_size(256, 1, 1))) __kernel void
-MIOpenReLu(const __global float* __restrict in,
-           __global float* __restrict out,
-           __global float* bias,
-           float slope, int N, int C, int H, int W)
-{
-    int gid_x = get_global_id(0);
-    float intermediate = in[gid_x] + bias[(gid_x % (C * H * W)) / (H * W)];
-    out[gid_x] = intermediate * (intermediate > 0.0f ? 1.0f : slope);
-}
+#include "anakin_config.h"
+#include "utils/logger/logger.h"
+
+namespace anakin {
+namespace saber {
+//#define AMD_ENABLE_LOG
+
+#if defined(ENABLE_DEBUG) || defined(ENABLE_AMD_DEBUG)
+#define ENABLE_AMD_DEBUG_LOG 1
+#else
+#define ENABLE_AMD_DEBUG_LOG 0
+#endif
+
+} // namespace saber
+} // namespace anakin
+#endif // ANAKIN_SABER_CORE_IMPL_AMD_UTILS_AMDLOGGER_H
diff --git a/saber/core/impl/amd/utils/ocl/ocl_kernel.cpp b/saber/core/impl/amd/utils/ocl/ocl_kernel.cpp
new file mode 100644
index 000000000..95faabf40
--- /dev/null
+++ b/saber/core/impl/amd/utils/ocl/ocl_kernel.cpp
@@ -0,0 +1,437 @@
+/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/ +#include "ocl_kernel.h" +#include "amd_cache.h" +#include "amd_file_utils.h" +#include "amd_logger.h" + +#include +#include +#include +#include + +namespace anakin { +namespace saber { +//#define ENABLE_LOG +#define SAVE_TO_FILE +#define MAX_LOG_LENGTH 65535 +extern std::string GetKernelSrc(std::string name); + +std::string GetDeviceName(cl_device_id device_id) { + + char deviceName[100]; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(deviceName), deviceName, nullptr); + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "Device Name: " << deviceName; + return std::string(deviceName); +} + +#if 1 + +void SaveProgramBinary(const cl_program program, const std::string& name) { + size_t binary_size; + clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, nullptr); + + std::vector binary(binary_size); + char* src[1] = {binary.data()}; + clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(src), &src, nullptr); + + std::ofstream fout(name.c_str(), std::ios::out | std::ios::binary); + fout.write(binary.data(), binary.size()); + + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "save program to cache file: " << name; +} + +void WriteProgramToFile(cl_program cl_prg, cl_device_id device_id, KernelInfo* ki) { +#ifndef SAVE_TO_FILE + + if (true) { + return; + } + +#endif + + std::string kernelKey; + + if (ki->kernel_type == SOURCE) { + return; + } else { + kernelKey = ki->kernel_file; + } + + std::string deviceName = GetDeviceName(device_id); + auto path = GetCachePath() + unique_path(); + SaveProgramBinary(cl_prg, path); + SaveBinary(path, deviceName, kernelKey, ki->comp_options); +} + +cl_program LoadBinaryProgram( + cl_context context, + cl_device_id device_id, + const char* source_data, + size_t size) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__; + cl_int errNum; + cl_program program = clCreateProgramWithBinary( + context, 1, &device_id, &size, (const unsigned char**)&source_data, NULL, &errNum); + + if (errNum != CL_SUCCESS) { + LOG(ERROR) << __func__ << " error(" << errNum << ")"; + } + + return program; +} + +cl_program LoadProgramFromFileCache(cl_context context, cl_device_id device_id, KernelInfo* ki) { +#ifndef SAVE_TO_FILE + + if (true) { + return NULL; + } + +#endif + + std::string kernelKey; + + if (ki->kernel_type == SOURCE) { + return NULL; + } else { + kernelKey = ki->kernel_file; + } + + std::string deviceName = GetDeviceName(device_id); + + std::string cacheFilePath = LoadBinaryPath(deviceName.c_str(), kernelKey, ki->comp_options); + + if (!cacheFilePath.empty()) { + std::string source = LoadFile(cacheFilePath); + + cl_program program = LoadBinaryProgram(context, device_id, source.data(), source.size()); + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "Create CL program from cache file: " << kernelKey; + return program; + } + + return NULL; +} + +ClProgramPtr LoadProgramFromMemCache(cl_context context, KernelInfo* ki) { + ProgramCache* programCache = ProgramCache::getInstance(); + + std::string kernelKey; + + if (ki->kernel_type == SOURCE) { + return NULL; + } else { + kernelKey.assign(ki->kernel_file + ki->comp_options + ":" + ki->kernel_name); + } + + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " " << kernelKey; + + std::string progKey(kernelKey); + // Consider different compile options for single program + progKey += ki->comp_options; + + // get program from program cache + auto program_ptr = programCache->lookup(std::pair(context, progKey)); + + if (program_ptr != NULL && program_ptr.get() != NULL) { + return program_ptr; + } + + return NULL; +} + 
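+// Note: WriteProgramIntoMemCache below uses the same key scheme as the lookup above:
+// kernel_file + comp_options + ":" + kernel_name, with comp_options appended once more,
+// so the same kernel built with different compile options gets its own cache entry.
+// SOURCE-type kernels are never stored in the in-memory program cache.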
+void WriteProgramIntoMemCache(cl_context context, ClProgramPtr program, KernelInfo* ki) {
+    ProgramCache* programCache = ProgramCache::getInstance();
+
+    std::string kernelKey;
+
+    if (ki->kernel_type == SOURCE) {
+        return;
+    } else {
+        kernelKey.assign(ki->kernel_file + ki->comp_options + ":" + ki->kernel_name);
+    }
+
+    LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " " << kernelKey;
+
+    std::string progKey(kernelKey);
+    // Consider different compile options for single program
+    progKey += ki->comp_options;
+
+    // Save to Program Cache
+    if (programCache->getSize() < programCache->MAX_CACHE_SIZE) {
+        programCache->add(std::pair(context, progKey), program);
+    } else {
+        LOG(INFO) << "Warning: program code cache has been full.\n";
+    }
+}
+bool BuildProgram(cl_program program, cl_device_id device_id, KernelInfo* ki) {
+
+    if (program == NULL) {
+        LOG(ERROR) << "Failed to Build Program, cl_program is not initialized.";
+        return false;
+    }
+
+    cl_int errNum;
+    auto is_asm = miopen::EndsWith(ki->kernel_file, ".s");
+
+    if (is_asm) {
+        errNum = clBuildProgram(program, 1, &device_id, "", NULL, NULL);
+    } else {
+        errNum = clBuildProgram(program, 1, &device_id, ki->comp_options.c_str(), NULL, NULL);
+    }
+
+    if (errNum != CL_SUCCESS) {
+        char buildErrLog[MAX_LOG_LENGTH];
+        clGetProgramBuildInfo(
+            program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buildErrLog), buildErrLog, NULL);
+
+        LOG(ERROR) << "CL program build error log in kernel: " << buildErrLog;
+        return false;
+    }
+
+    return true;
+}
+
+cl_program CreateProgramFromSource(cl_context context, cl_device_id device_id, KernelInfo* ki) {
+
+    cl_program program = NULL;
+    bool is_binary     = false;
+    std::string source;
+
+    if (ki->kernel_type == SOURCE) {
+        source = ki->kernel_file;
+    } else {
+        try {
+            if (ki->kernel_type == MIOPEN) {
+                source = miopen::GetKernelSrc(ki->kernel_file);
+            } else {
+                source = GetKernelSrc(ki->kernel_file);
+            }
+        } catch (...)
{ + LOG(ERROR) << "Can't Load CL Program"; + return NULL; + } + + if (miopen::EndsWith(ki->kernel_file, ".s") || miopen::EndsWith(ki->kernel_file, ".so")) { + is_binary = true; + } + + if (!source.empty()) { + auto is_asm = miopen::EndsWith(ki->kernel_file, ".s"); + + if (is_asm) { + std::string deviceName = GetDeviceName(device_id); + AmdgcnAssemble( + source, std::string(" -mcpu=") + deviceName + " " + ki->comp_options); + } + } else { + std::ifstream kFile(ki->kernel_file, std::ios::in); + + if (!kFile.is_open()) { + LOG(ERROR) << "Failed to open file for reading: " << ki->kernel_file; + return NULL; + } + + source = std::string( + (std::istreambuf_iterator(kFile)), std::istreambuf_iterator()); + kFile.close(); + } + } + + if (source.empty()) { + return NULL; + } + + if (is_binary) { + program = LoadBinaryProgram(context, device_id, source.data(), source.size()); + } else { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "createPrograWithSource"; + std::string params = ki->comp_options; +#if defined(ENABLE_DEBUG) || defined(ENABLE_LOG) + params += " -Werror"; +#ifdef __linux__ + params += miopen::KernelWarningsString(); +#endif +#endif + params += " -cl-std=CL1.2"; + + // load program from header + const char* srcStr = source.data(); + size_t size = source.size(); + + program = clCreateProgramWithSource(context, 1, (const char**)&srcStr, &size, NULL); + + if (program == NULL) { + LOG(ERROR) << "Failed to create CL program from header: " << ki->kernel_file; + return NULL; + } + } + + return program; +} + +#if 0 +ClKernelPtr LoadKernelFromMemCache(cl_program program, KernelInfo* ki) { + std::string kernelKey; + + if (ki->kernel_type == SOURCE) { + return NULL; + } else { + kernelKey.assign(ki->kernel_file + ki->comp_options + ":" + ki->kernel_name); + } + + KernelCache* kernelCache = KernelCache::getInstance(); + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " " << kernelKey; + auto kernel_ptr = kernelCache->lookup(std::pair(program, kernelKey)); + + if (kernel_ptr != NULL && kernel_ptr.get() != NULL) { + return kernel_ptr; + } + + return NULL; +} + +void WriteKernelIntoMemCache(cl_program program, ClKernelPtr kernel, KernelInfo* ki) { + std::string kernelKey; + + if (ki->kernel_type == SOURCE) { + return NULL; + } else { + kernelKey.assign(ki->kernel_file + ki->comp_options + ":" + ki->kernel_name); + } + + KernelCache* kernelCache = KernelCache::getInstance(); + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " " << kernelKey; + + if (kernelCache->getSize() < kernelCache->MAX_CACHE_SIZE) { + kernelCache->add(std::pair(program, kernelKey), kernel); + } else { + LOG(INFO) << "Warning: kernel cache has been full.\n"; + } +} +#endif + +cl_kernel CreateKernelFromSource(cl_program program, KernelInfo* ki) { + cl_kernel kernel = NULL; + + kernel = clCreateKernel(program, ki->kernel_name.c_str(), NULL); + + if (kernel == NULL) { + LOG(ERROR) << "error: failed to create CL kernel.\n"; + return NULL; + } + + return kernel; +} + +ClProgramPtr CreateProgram(cl_context context, cl_device_id device_id, KernelInfo* kernel_info) { + + ClProgramPtr program_ptr = LoadProgramFromMemCache(context, kernel_info); + + if (program_ptr != NULL) { + return program_ptr; + } + + bool is_load_from_file = true; + cl_program program = LoadProgramFromFileCache(context, device_id, kernel_info); + + if (program == NULL) { + program = CreateProgramFromSource(context, device_id, kernel_info); + is_load_from_file = false; + } + + if (!BuildProgram(program, device_id, kernel_info)) { + program = NULL; + return NULL; + } 
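+    // Build succeeded: wrap the raw cl_program in a shared pointer, register it in the
+    // in-memory program cache, and persist the binary to the on-disk cache only when the
+    // program was compiled from source rather than loaded from a cached binary.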
+ + program_ptr = gen_shared_cl_program(program); + WriteProgramIntoMemCache(context, program_ptr, kernel_info); + + if (!is_load_from_file) { + WriteProgramToFile(program, device_id, kernel_info); + } + + return program_ptr; +} + +ClKernelPtr CreateKernel(cl_program program, KernelInfo* kernel_info) { + +#if 0 + //Kernel cache cannot work at multi-thread scenario + ClKernelPtr kernel_ptr = LoadKernelFromMemCache(program, kernel_info); + + if (kernel_ptr != NULL) { + return kernel_ptr; + } + +#endif + + cl_kernel kernel = CreateKernelFromSource(program, kernel_info); + + if (kernel == NULL) { + return NULL; + } + + ClKernelPtr kernel_ptr = gen_shared_cl_kernel(kernel); + //WriteKernelIntoMemCache(program, kernel_ptr, kernel_info); + + return kernel_ptr; +} + +#endif +#if 1 +void OCLKernel::CreateProgram() { + _program = anakin::saber::CreateProgram(_context, _device_id, &kernel_info); +} + +void OCLKernel::CreateKernel() { + _kernel = anakin::saber::CreateKernel(_program.get(), &kernel_info); +} + +bool OCLKernel::run( + cl_command_queue cm, + int wait_events_num, + const cl_event* wait_events, + cl_event* event) { + + cl_int errNum = clEnqueueNDRangeKernel( + cm, + _kernel.get(), + kernel_info.wk_dim, + (kernel_info.g_wk_offset.size() > 0 ? kernel_info.g_wk_offset.data() : NULL), + kernel_info.g_wk.data(), + kernel_info.l_wk.data(), + wait_events_num, + wait_events, + event); + + if (errNum != CL_SUCCESS) { + LOG(ERROR) << "Fail to set execution: " << errNum; + kernel_info.printE(); + return false; + } + + return true; +} + +std::string OCLKernel::GetName() { + return kernel_info.kernel_name; +} + +#endif + +} // namespace saber +} // namespace anakin diff --git a/saber/core/impl/amd/utils/ocl/ocl_kernel.h b/saber/core/impl/amd/utils/ocl/ocl_kernel.h new file mode 100644 index 000000000..8b09f7b24 --- /dev/null +++ b/saber/core/impl/amd/utils/ocl/ocl_kernel.h @@ -0,0 +1,164 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_FUNCS_IMPL_AMD_UTILS_OCLKERNEL_H +#define ANAKIN_SABER_FUNCS_IMPL_AMD_UTILS_OCLKERNEL_H + +#include "shared_pointer.h" +#include "amd_base.h" +#include +#include + +namespace anakin { +namespace saber { + +using ClProgramPtr = SHARED_OBJ(cl_program); +#define gen_shared_cl_program(t) GEN_SHARED_OBJ_WITH_DELETER(cl_program, clReleaseProgram, t) + +using ClKernelPtr = SHARED_OBJ(cl_kernel); +#define gen_shared_cl_kernel(t) GEN_SHARED_OBJ_WITH_DELETER(cl_kernel, clReleaseKernel, t) + +class OCLKernel { +public: + OCLKernel(cl_context ctx, cl_device_id id, anakin::saber::KernelInfo* ki) : + _context(ctx), + _device_id(id) { + if (ki == nullptr) { + throw "KernelInfo can't be null"; + } + + kernel_info = ki; + kernel_info.print(); + + CreateProgram(); + CreateKernel(); + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "create kernel complete"; + } + + bool isInit() { + return _kernel == NULL ? 
false : true; + } + + cl_program getProgram() { + return _program.get(); + }; + cl_kernel getKernel() { + return _kernel.get(); + }; + template + bool SetKernelArgs(const Ts& ... xs) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__; + + if (!isInit()) { + return false; + } + + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "Kernel is init"; + + if (!setKernelArgs(0, xs...)) { + return false; + } + + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "Set Kernel Args complete"; + return true; + } + + bool + Invoke(cl_command_queue cm, int wait_events_num, const cl_event* wait_events, cl_event* event) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__; + + if (!isInit()) { + return false; + } + + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "Kernel is init"; + return run(cm, wait_events_num, wait_events, event); + } + + template + bool + Invoke(cl_command_queue cm, + int wait_events_num, + const cl_event* wait_events, + cl_event* event, + const Ts& ... xs) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__; + + if (!isInit()) { + return false; + } + + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "Kernel is init"; + + if (!setKernelArgs(0, xs...)) { + return false; + } + + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "Set Kernel Args complete"; + + return run(cm, wait_events_num, wait_events, event); + } + + std::string GetName(); + + ~OCLKernel() {} + +private: + template + bool setKernelArgs(int index, const T& x) { + + cl_int errNum = clSetKernelArg(_kernel.get(), index, sizeof(T), &x); + + if (errNum != CL_SUCCESS) { + + if (errNum == CL_INVALID_ARG_INDEX) { // workaround for miopengemm kenrel + LOG(ERROR) << "set kernel args[" << index << "] err = " << errNum; + return true; + } + + LOG(ERROR) << "set kernel args[" << index << "] err = " << errNum; + return false; + } + + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "Set Kernel Args[" << index << "]"; + return true; + } + template + bool setKernelArgs(int index, const T& x, const Ts& ... xs) { + if (!setKernelArgs(index, x)) { + return false; + } + + return setKernelArgs(index + 1, xs...); + } + + bool + run(cl_command_queue cm, int wait_events_num, const cl_event* wait_events, cl_event* event); + + void CreateProgram(); + void CreateKernel(); + + ClProgramPtr _program; + ClKernelPtr _kernel; + cl_context _context; + cl_device_id _device_id; + KernelInfo kernel_info; +}; + +using OCLKernelPtr = SHARED_OBJ(OCLKernel); +#define gen_shared_ocl(t) GEN_SHARED_OBJ(OCLKernel, t) + +} // namespace saber +} // namespace anakin +#endif diff --git a/saber/core/impl/amd/utils/shared_pointer.h b/saber/core/impl/amd/utils/shared_pointer.h new file mode 100644 index 000000000..1411cf795 --- /dev/null +++ b/saber/core/impl/amd/utils/shared_pointer.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#ifndef ANAKIN_SABER_FUNCS_IMPL_AMD_UTILS_SHAREDPOINTER_H +#define ANAKIN_SABER_FUNCS_IMPL_AMD_UTILS_SHAREDPOINTER_H +#include +#include +#include "anakin_config.h" +#include "amd_logger.h" + +namespace anakin { +namespace saber { + +template +struct SharedDeleter { + template + void operator()(T* t) const { + if (t != NULL) { + try { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "release shared object :" << typeid(t).name(); + f(t); + t = NULL; + } catch (...) { + LOG(ERROR) << "Catught error for release shared object : " << typeid(t).name(); + } + } + } +}; + +#define SHARED_OBJ(T) std::shared_ptr::type> + +#define GEN_SHARED_OBJ_WITH_DELETER(T, F, t) \ + std::shared_ptr::type>( \ + t, anakin::saber::SharedDeleter()) + +#define GEN_SHARED_OBJ(T, t) std::shared_ptr::type>(t) + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h index 49df342ce..f0b59868a 100644 --- a/saber/core/target_wrapper.h +++ b/saber/core/target_wrapper.h @@ -18,6 +18,9 @@ #include "saber/core/target_traits.h" #include "saber/core/data_traits.h" #include +#ifdef AMD_GPU +#include "saber/core/impl/amd/utils/amd_common.h" +#endif namespace anakin{ @@ -464,8 +467,8 @@ struct TargetWrapper { typedef typename DataTraitBase::PtrDtype TPtr; - typedef cl_event event_t; - typedef cl_command_queue stream_t; + typedef AMDEvent_t event_t; + typedef AMDStream_t stream_t; static void get_device_count(int& count); @@ -496,7 +499,7 @@ struct TargetWrapper { static void destroy_event(event_t event); - static void record_event(event_t event, stream_t stream); + static void record_event(event_t& event, stream_t stream); static void query_event(event_t event); @@ -563,15 +566,15 @@ struct TargetWrapper { //static cl_platform_id platform_id; //static cl_uint device_nums; static int current_device_id_index; - static std::map buffers; + static std::map buffers; //static cl_context* contexts; }; template <> struct TargetWrapper { - typedef cl_event event_t; - typedef cl_command_queue stream_t; + typedef AMDEvent_t event_t; + typedef AMDStream_t stream_t; static void get_device_count(int& count); @@ -587,7 +590,7 @@ struct TargetWrapper { static void destroy_event(event_t event); - static void record_event(event_t event, stream_t stream); + static void record_event(event_t& event, stream_t stream); static void create_stream(stream_t* stream); diff --git a/saber/funcs/activation.h b/saber/funcs/activation.h index b1874e5b6..314ddbc91 100644 --- a/saber/funcs/activation.h +++ b/saber/funcs/activation.h @@ -29,8 +29,8 @@ #include "saber/funcs/impl/x86/saber_activation.h" #endif -#ifdef AMD_GPU -#include "saber/funcs/impl/amd/saber_activation.h" +#ifdef AMD_GPU +//#include "saber/funcs/impl/amd/saber_activation.h" #endif #ifdef USE_ARM_PLACE diff --git a/saber/funcs/impl/amd/amd_utils.cpp b/saber/funcs/impl/amd/amd_utils.cpp deleted file mode 100644 index 3412eea80..000000000 --- a/saber/funcs/impl/amd/amd_utils.cpp +++ /dev/null @@ -1,119 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ -#include "saber/funcs/impl/amd/amd_utils.h" -#include "utils/logger/logger.h" - -namespace anakin { -namespace saber { -#define MAX_LOG_LENGTH 65535 -cl_program CreateCLProgram(cl_context context, cl_device_id device, const char* fileName, - KernelInfo* ki) { - cl_int errNum; - cl_program program; - - std::ifstream kFile(fileName, std::ios::in); - - if (!kFile.is_open()) { - LOG(ERROR) << "Failed to open file for reading: " << fileName; - return NULL; - } - - std::string src( - (std::istreambuf_iterator(kFile)), - std::istreambuf_iterator() - ); - char* srcStr = src.c_str(); - program = clCreateProgramWithSource(context, 1, (const char**)&srcStr, NULL, NULL); - - kFile.close(); - - if (program == NULL) { - LOG(ERROR) << "Failed to create CL program with source file."; - return NULL; - } - - char* comp_options = NULL; - - if (ki != NULL) { - comp_options = ki->comp_options.c_str(); - } - - errNum = clBuildProgram(program, 1, &device, comp_options, NULL, NULL); - - if (errNum != CL_SUCCESS) { - char buildErrLog[MAX_LOG_LENGTH]; - clGetProgramBuildInfo(program, - device, - CL_PROGRAM_BUILD_LOG, - sizeof(buildErrLog), - buildErrLog, - NULL); - - LOG(ERROR) << "CL program build error log in kernel: " << buildErrLog; - clReleaseProgram(program); - return NULL; - } - - return program; -}; - -cl_program CreatProgramFromBinaryFile(cl_context context, cl_device_id device, - const char* binFile) { - cl_program program; - cl_int errNum; - - FILE* fp = fopen(binFile, "rb"); - - if (fp == NULL) { - LOG(ERROR) << "Can't open bin file: " << std::string(binFile); - return NULL; - } - - size_t binSize; - fseek(fp, 0, SEEK_END); - binSize = ftell(fp); - rewind(fp); - - unsigned char* binProgram = new unsigned char[binSize]; - fread(binProgram, 1, binSize, fp); - fclose(fp); - - program = clCreateProgramWithBinary(context, 1, &device, &binSize, - (const unsigned char**)&binProgram, NULL, &errNum); - errNum = clBuildProgram(program, 1, &device, "", NULL, NULL); - - delete[] binProgram; - - if (errNum != CL_SUCCESS) { - char buildErrLog[MAX_LOG_LENGTH]; - clGetProgramBuildInfo(program, - device, - CL_PROGRAM_BUILD_LOG, - sizeof(buildErrLog), - buildErrLog, - NULL); - - LOG(ERROR) << "CL program build error log in kernel: " << buildErrLog; - clReleaseProgram(program); - - return NULL; - } - - return program; -}; - - -} -} diff --git a/saber/funcs/impl/amd/amd_utils.h b/saber/funcs/impl/amd/amd_utils.h deleted file mode 100644 index e55229014..000000000 --- a/saber/funcs/impl/amd/amd_utils.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ -#ifndef ANAKIN_SABER_FUNC_IMPL_AMD_UTILS_H -#define ANAKIN_SABER_FUNC_IMPL_AMD_UTILS_H - -#include - -#include -#include -#include -#include - -#define MLO_POOLING_OP_MAX 0 -#define MLO_POOLING_OP_AVE 1 - -namespace anakin { -namespace saber { - -typedef struct ExtSolutionConfigTpye -{ - int in_tile0, in_tile1; - int grp_tile0, grp_tile1; - int out_pix_tile0, out_pix_tile1; - int n_stacks; - int n_out_pix_tiles; - int n_out_tiles_perstack; - int n_in_data_tiles; - int n_read_procs; - int alu_tile0, alu_tile1; - int horiz_out_pix; - int vert_out_pix; -}T_ExtSolutionConfig; - -struct KernelInfo -{ - std::string comp_options; - std::vector l_wk; - std::vector g_wk; - std::string kernel_file; - std::string kernel_name; - friend std::ostream& operator<<(std::ostream& os, const KernelInfo& k); -}; - -extern cl_program CreateCLProgram(cl_context context, cl_device_id device, const char* fileName, KernelInfo* ki=NULL); -extern cl_program CreatProgramFromBinaryFile(cl_context context, cl_device_id device, const char* binFile); - - -inline cl_int _setKernelArgs(cl_kernel &k,int i){ return CL_SUCCESS;} - -template -inline cl_int _setKernelArgs(cl_kernel &kernel,int i, const T &firstParameter, const Args& ...restOfParameters){ - return clSetKernelArg(kernel, i, sizeof(firstParameter), &firstParameter) | \ - _setKernelArgs(kernel,i+1,restOfParameters...); -} - -template -inline cl_int setKernelArgs(cl_kernel &kernel, const Args& ...args){ - return _setKernelArgs(kernel, 0, args...); -} -} -} -#endif diff --git a/saber/funcs/impl/amd/cl/ConvFwd3x3.cl b/saber/funcs/impl/amd/cl/ConvFwd3x3.cl deleted file mode 100644 index ddc6272c1..000000000 --- a/saber/funcs/impl/amd/cl/ConvFwd3x3.cl +++ /dev/null @@ -1,925 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ -/* Compiler options: --c -emit-llvm -target amdgcn-amd-amdhsa-amdgizcl -x cl -cl-kernel-arg-info - --DMLO_HW_WAVE_SZ=64 --DMLO_DIR_FORWARD=1 --DMLO_FILTER_SIZE0=3 --DMLO_FILTER_SIZE1=3 --DMLO_FILTER_PAD0=1 --DMLO_FILTER_PAD1=1 --DMLO_FILTER_STRIDE0=1 --DMLO_FILTER_STRIDE1=1 --DMLO_N_OUTPUTS=64 --DMLO_N_INPUTS=3 --DMLO_BATCH_SZ=2 --DMLO_OUT_WIDTH=224 --DMLO_OUT_HEIGHT=224 --DMLO_OUT_BATCH_STRIDE=3211264 --DMLO_OUT_CHANNEL_STRIDE=50176 --DMLO_OUT_STRIDE=224 --DMLO_IN_WIDTH=224 --DMLO_IN_HEIGHT=224 --DMLO_IN_BATCH_STRIDE=150528 --DMLO_IN_CHANNEL_STRIDE=50176 --DMLO_IN_STRIDE=224 --DMLO_IN_TILE0=32 --DMLO_IN_TILE1=32 --DMLO_GRP_TILE0=16 --DMLO_GRP_TILE1=16 --DMLO_OUT_TILE0=2 --DMLO_OUT_TILE1=2 --DMLO_N_STACKS=1 --DMLO_N_OUT_TILES=8 --DMLO_N_OUT_TILES_PERSTACK=8 --DMLO_N_IN_TILES_PERSTACK=2 --DMLO_N_READ_PROCS=256 --DMLO_CONV_BIAS=0 --DMLO_ALU_VTILE0=16 --DMLO_ALU_VTILE1=16 - --cl-std=CL1.2 -O3 --mcpu=gfx900 --mllvm --amdgpu-early-inline-all --mllvm -amdgpu-prelink - --D__AMD__=1 --D__gfx900__=1 --D__gfx900=1 --D__OPENCL_VERSION__=120 --D__IMAGE_SUPPORT__=1 - --Xclang --cl-ext=+cl_khr_fp64,+cl_khr_global_int32_base_atomics,+cl_khr_global_int32_extended_atomics,+cl_khr_local_int32_base_atomics,+cl_khr_local_int32_extended_atomics,+cl_khr_int64_base_atomics,+cl_khr_int64_extended_atomics,+cl_khr_3d_image_writes,+cl_khr_byte_addressable_store,+cl_khr_fp16,+cl_khr_gl_sharing,+cl_amd_device_attribute_query,+cl_amd_media_ops,+cl_amd_media_ops2,+cl_khr_subgroups,+cl_amd_copy_buffer_p2p,+cl_amd_assembly_program -include opencl-c.h -*/ - -#define _FLOAT float -#define _FLOAT2 float2 -#define _FLOAT4 float4 -#define _FLOAT8 float8 - -#ifndef FLT_MAX -#define FLT_MAX 3.402823466e+38F /* max value */ -#endif - - - -//HCJ definitions generated by solver -//#define MLO_HW_WAVE_SZ 64 -//#define MLO_DIR_FORWARD 1 -//#define MLO_FILTER_SIZE0 3 -//#define MLO_FILTER_SIZE1 3 -//#define MLO_FILTER_PAD0 1 -//#define MLO_FILTER_PAD1 1 -//#define MLO_FILTER_STRIDE0 1 -//#define MLO_FILTER_STRIDE1 1 -//#define MLO_N_OUTPUTS 64 -//#define MLO_N_INPUTS 3 -//#define MLO_BATCH_SZ 2 -//#define MLO_OUT_WIDTH 224 -//#define MLO_OUT_HEIGHT 224 -//#define MLO_OUT_BATCH_STRIDE 3211264 -//#define MLO_OUT_CHANNEL_STRIDE 50176 -//#define MLO_OUT_STRIDE 224 -//#define MLO_IN_WIDTH 224 -//#define MLO_IN_HEIGHT 224 -//#define MLO_IN_BATCH_STRIDE 150528 -//#define MLO_IN_CHANNEL_STRIDE 50176 -//#define MLO_IN_STRIDE 224 -//#define MLO_IN_TILE0 32 -//#define MLO_IN_TILE1 32 -//#define MLO_GRP_TILE0 16 -//#define MLO_GRP_TILE1 16 -//#define MLO_OUT_TILE0 2 -//#define MLO_OUT_TILE1 2 -//#define MLO_N_STACKS 1 -//#define MLO_N_OUT_TILES 8 -//#define MLO_N_OUT_TILES_PERSTACK 8 -//#define MLO_N_IN_TILES_PERSTACK 2 -//#define MLO_N_READ_PROCS 256 -//#define MLO_CONV_BIAS 0 -//#define MLO_ALU_VTILE0 16 -//#define MLO_ALU_VTILE1 16 -#define MLO_CONV_BIAS 1 - -#define UNUSED __attribute__((__unused__)) - -#ifndef MLO_FILTER_STRIDE0 -#define MLO_FILTER_STRIDE0 1 -#endif -#ifndef MLO_FILTER_STRIDE1 -#define MLO_FILTER_STRIDE1 1 -#endif - -#define MLO_FILTER_SZ (MLO_FILTER_SIZE1 * MLO_FILTER_SIZE0) - -#define MLO_GRP_SZ0 (MLO_GRP_TILE0 * MLO_GRP_TILE1) -#define MLO_GRP_SZ1 1 -#define MLO_GRP_SZ2 1 -#define MLO_GRP_SZ (MLO_GRP_SZ0 * MLO_GRP_SZ1 * MLO_GRP_SZ2) -#define MLO_N_PROC_WAVES ((MLO_GRP_SZ + MLO_N_READ_PROCS - 1) / MLO_N_READ_PROCS) -#define MLO_OUT_TILE_SZ (MLO_OUT_TILE1 * MLO_OUT_TILE0) -#define MLO_ALU_TILE_SZ (MLO_ALU_VTILE1 * MLO_ALU_VTILE0) - -#if MLO_IN_TILE0 < MLO_OUT_WIDTH || MLO_IN_TILE1 < MLO_OUT_HEIGHT -#define 
MLO_LARGE_MAP 1 -#else -#define MLO_LARGE_MAP 0 -#endif - -#if(MLO_IN_WIDTH == MLO_OUT_WIDTH && \ - (MLO_IN_WIDTH / MLO_IN_TILE0) * MLO_IN_TILE0 == MLO_IN_WIDTH && \ - MLO_IN_HEIGHT == MLO_OUT_HEIGHT && \ - (MLO_IN_HEIGHT / MLO_IN_TILE1) * MLO_IN_TILE1 == MLO_IN_HEIGHT) -#define MLO_OUT_ALIGNED 1 -#else -#define MLO_OUT_ALIGNED 0 -#endif - -#define MLO_N_ALUTILES_TOTAL ((MLO_GRP_TILE0 * MLO_GRP_TILE1) / (MLO_ALU_TILE_SZ)) -#define MLO_N_ALUTILES_PERSTACK (MLO_N_ALUTILES_TOTAL / MLO_N_STACKS) -#define MLO_ALUTILES_STACK_SZ (MLO_N_ALUTILES_PERSTACK * MLO_ALU_TILE_SZ) -#define MLO_N_IN_TILES_TOTAL (MLO_N_IN_TILES_PERSTACK * MLO_N_STACKS) -/* -#define MLO_N_OUT_TILES_PERSTACK (MLO_N_OUT_TILES*MLO_N_ALUTILES_PERSTACK) -#if MLO_N_OUT_TILES_PERSTACK > MLO_N_OUTPUTS -#undef MLO_N_OUT_TILES_PERSTACK -#define MLO_N_OUT_TILES_PERSTACK MLO_N_OUTPUTS -#endif -*/ -#define MLO_N_OUT_TILE_BLOCKS0 ((MLO_OUT_WIDTH + MLO_IN_TILE0 - 1) / MLO_IN_TILE0) -#define MLO_N_OUT_TILE_BLOCKS1 ((MLO_OUT_HEIGHT + MLO_IN_TILE1 - 1) / MLO_IN_TILE1) -#define MLO_N_IN_PACKS ((MLO_N_INPUTS + MLO_N_IN_TILES_PERSTACK - 1) / MLO_N_IN_TILES_PERSTACK) - -#define MLO_N_IN_READ (MLO_N_IN_PACKS * MLO_N_IN_TILES_PERSTACK) -#if MLO_N_IN_READ == MLO_N_INPUTS -#define MLO_INPUTS_ALIGNED 1 -#else -#define MLO_INPUTS_ALIGNED 0 -#endif - -#define MLO_N_OUT_PACKS (MLO_N_OUTPUTS / MLO_N_OUT_TILES_PERSTACK) -#if MLO_N_OUT_PACKS * MLO_N_OUT_TILES_PERSTACK == MLO_N_OUTPUTS && \ - MLO_N_OUT_TILES_PERSTACK != MLO_N_OUTPUTS -#define MLO_OUTPUTS_ALIGNED 1 -#else -#define MLO_OUTPUTS_ALIGNED 0 -#endif - -#define MLO_N_BATCH_PACKS (MLO_BATCH_SZ / MLO_N_STACKS) -#if MLO_N_BATCH_PACKS * MLO_N_STACKS == MLO_BATCH_SZ && MLO_N_STACKS != MLO_BATCH_SZ -#define MLO_BATCH_ALIGNED 1 -#else -#define MLO_BATCH_ALIGNED 0 -#endif - -#if MLO_DIR_FORWARD == 1 -#define MLO_IN_LCL_WIDTH \ - ((MLO_IN_TILE0 - 1) * MLO_FILTER_STRIDE0 + MLO_FILTER_SIZE0) // here we use kernel size. it's - // important when padding == 0 2* - // MLO_FILTER_PAD0 -#define MLO_IN_LCL_HEIGHT ((MLO_IN_TILE1 - 1) * MLO_FILTER_STRIDE1 + MLO_FILTER_SIZE1) -#else -#define MLO_IN_LCL_WIDTH \ - ((MLO_IN_TILE0 + MLO_FILTER_SIZE0 - 1 + MLO_FILTER_STRIDE0 - 1) / \ - MLO_FILTER_STRIDE0) // here we use kernel size. 
it's important when padding == 0 2* -// MLO_FILTER_PAD0 -#define MLO_IN_LCL_HEIGHT \ - ((MLO_IN_TILE1 + MLO_FILTER_SIZE1 - 1 + MLO_FILTER_STRIDE1 - 1) / MLO_FILTER_STRIDE1) -#endif -#define MLO_IN_LCL_TILE_SZ (MLO_IN_LCL_WIDTH * MLO_IN_LCL_HEIGHT) -#define MLO_IN_LCL_PERSTACK_SZ (MLO_IN_LCL_TILE_SZ * MLO_N_IN_TILES_PERSTACK) -#define MLO_IN_LCL_SZ (MLO_IN_LCL_PERSTACK_SZ * MLO_N_STACKS) - -#define MLO_WEIGHTS_SZ (MLO_N_OUT_TILES_PERSTACK * MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ) - -#define MLO_PVT_ACCUM_DATA_SZ (MLO_N_OUT_TILES * MLO_OUT_TILE_SZ) -#if MLO_DIR_FORWARD == 1 -#define MLO_PVT_IN_WIDTH ((MLO_OUT_TILE0 - 1) * MLO_FILTER_STRIDE0 + MLO_FILTER_SIZE0) -#define MLO_PVT_IN_HEIGHT ((MLO_OUT_TILE1 - 1) * MLO_FILTER_STRIDE1 + 1) -#else -#define MLO_PVT_IN_WIDTH \ - ((MLO_OUT_TILE0 + MLO_FILTER_SIZE0 - 1 + MLO_FILTER_STRIDE0 - 1) / MLO_FILTER_STRIDE0) -#define MLO_PVT_IN_HEIGHT ((MLO_OUT_TILE1 + MLO_FILTER_STRIDE1 - 1) / MLO_FILTER_STRIDE1) -#endif - -#define MLO_LCL_WEIGHTS 1 - -#define MLO_PADDING_SHIFT1 (MLO_FILTER_SIZE1 - MLO_FILTER_PAD1 - 1) -#define MLO_PADDING_SHIFT0 (MLO_FILTER_SIZE0 - MLO_FILTER_PAD0 - 1) - -#define MLO_PADDING_FIX1 (MLO_FILTER_SIZE1 % MLO_OUT_TILE1) -#define MLO_PADDING_FIX0 (MLO_FILTER_SIZE0 % MLO_OUT_TILE0) - -#if defined(__AMDGCN__) -extern uint __llvm_amdgcn_readfirstlane(uint) __asm("llvm.amdgcn.readfirstlane"); -#define uniform(x) __llvm_amdgcn_readfirstlane(x) -#else -#define uniform(x) (x) -#endif - -static inline uint iDiv(uint v, uint d) -{ - uint r = (uint)((float)v * (1.0f / (float)d) + 0.00001f); - return (r); -} - -static inline uint iMod(uint v, uint u, uint d) -{ - uint r = v - mul24((uint)u, (uint)d); - return (r); -} - -static inline void calculateXYPos(uint linPos, uint width, uint* __restrict x, uint* __restrict y) -{ - (*y) = (uint)((float)linPos * (1.0f / (float)width) + 0.00001f); - (*x) = linPos - mul24((*y), width); -} - -static inline uint calculateOffset(uint stride, uint x, uint y) -{ - uint ret = y * stride + x; - return (ret); -} - -static inline void readDataElem(uint linPos, - __local _FLOAT* lcl_data, - int lcl_base, - UNUSED uint lcl_height, - uint lcl_width, - int lcl_stride, - int lcl_y, - int lcl_x, - const __global _FLOAT* gbl_data, - int gbl_base, - uint gbl_height, - uint gbl_width, - int gbl_stride, - int gbl_y, - int gbl_x, - bool vis, - UNUSED bool debug) -{ - uint x, y; - calculateXYPos(linPos, lcl_width, &x, &y); - int g_x = x + gbl_x; - int g_y = y + gbl_y; - uint gbl_off0 = calculateOffset(gbl_stride, g_x, g_y); - int gbl_off = gbl_off0 + gbl_base; - -#if MLO_LARGE_MAP == 1 - int lcl_off = lcl_base + linPos; - (void)lcl_stride; - (void)lcl_x; - (void)lcl_y; -#else - int l_x = x + lcl_x; - int l_y = y + lcl_y; - int lcl_off = lcl_base + mad24(l_y, lcl_stride, l_x); -#endif - -#if MLO_LARGE_MAP == 1 - vis &= (g_x >= 0 && g_x < gbl_width && g_y >= 0 && g_y < gbl_height); -#else - (void)gbl_width; - (void)gbl_height; -#endif - gbl_off = (vis) ? gbl_off : 0; - _FLOAT gbl_val = gbl_data[gbl_off]; - gbl_val = (vis) ? 
gbl_val : 0; - - lcl_data[lcl_off] = gbl_val; -} - -static inline void readData(uint lcl_id, - int size, - int lcl_p_stride, - __local _FLOAT* lcl_data, - int lcl_base, - uint lcl_height, - uint lcl_width, - int lcl_stride, - int lcl_y, - int lcl_x, - const __global _FLOAT* gbl_data, - int gbl_base, - uint gbl_height, - uint gbl_width, - int gbl_stride, - int gbl_y, - int gbl_x, - bool vis, - bool debug) -{ - - for(uint i = lcl_id; i < size; i += lcl_p_stride) - { - readDataElem(i, - lcl_data, - lcl_base, - lcl_height, - lcl_width, - lcl_stride, - lcl_y, - lcl_x, - gbl_data, - gbl_base, - gbl_height, - gbl_width, - gbl_stride, - gbl_y, - gbl_x, - vis, - debug); - } -} - -static inline void loadData(uint lcl_id, - int lcl_p_stride, - __local _FLOAT* lcl_data, - int lcl_off, - int lcl_size, - uint lcl_height, - uint lcl_width, - int lcl_stride, - int lcl_bot_y, - int lcl_bot_x, - const __global _FLOAT* gbl_data, - int gbl_off, - int gbl_size, - uint gbl_height, - uint glb_width, - int gbl_stride, - int gbl_bot_y, - int gbl_bot_x, - int buf_block_ind, - int max_n_bufs, - int lcl_n_bufs, - bool debug) -{ - - for(uint c = 0; c < lcl_n_bufs; ++c, lcl_off += lcl_size, gbl_off += gbl_size) - { - bool vis = (buf_block_ind + c < max_n_bufs); - readData(lcl_id, - lcl_size, - lcl_p_stride, - lcl_data, - lcl_off, - lcl_height, - lcl_width, - lcl_stride, - lcl_bot_y, - lcl_bot_x, - gbl_data, - gbl_off, - gbl_height, - glb_width, - gbl_stride, - gbl_bot_y, - gbl_bot_x, - vis, - (debug)); - } -} - -static inline void Conv(uint o_map_base, - int in_stg_off, - __private _FLOAT* __restrict pvt_in_stage, - __local _FLOAT* __restrict lcl_indata, - __private _FLOAT* __restrict pvt_wei_stage, - __local _FLOAT* __restrict lcl_wei, - __private _FLOAT* __restrict pvt_accum) -{ - // convolution - - // over all inputs in stack - int in_stg_off1 = in_stg_off; - for(uint i_c = 0; i_c < MLO_N_IN_TILES_PERSTACK; ++i_c, in_stg_off1 += MLO_IN_LCL_TILE_SZ) - { - // preload input - int wei_stg_base_off = mad24(o_map_base, - (uint)(MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ), - mul24(i_c, (uint)MLO_FILTER_SZ)); - int in_stg_off2 = in_stg_off1; - for(uint j = 0; j < MLO_PVT_IN_HEIGHT - 1; ++j, - in_stg_off2 += (((j - MLO_PADDING_SHIFT1 + MLO_PADDING_FIX1) % MLO_FILTER_STRIDE1) - ? 0 - : MLO_IN_LCL_WIDTH)) - { - for(uint i = 0; i < MLO_PVT_IN_WIDTH; ++i) - { - pvt_in_stage[j * MLO_PVT_IN_WIDTH + i] = lcl_indata[in_stg_off2 + i]; - } - } - -// over filter rows -#ifdef __AMDGCN__ -#if MLO_FILTER_SIZE1 < 6 -#pragma unroll -#elif MLO_FILTER_SIZE1 < 9 -#pragma unroll 2 -#endif -#endif -#if MLO_DIR_FORWARD == 1 - for(uint k = 0; k < MLO_FILTER_SIZE1; ++k, in_stg_off2 += MLO_IN_LCL_WIDTH) -#else - for(uint k = 0; k < MLO_FILTER_SIZE1; ++k, - in_stg_off2 += (((k - MLO_PADDING_SHIFT1 + MLO_PADDING_FIX1) % MLO_FILTER_STRIDE1) - ? 
0 - : MLO_IN_LCL_WIDTH)) -#endif - { - int k_act = 0; -#if MLO_DIR_FORWARD == 1 - k_act = k; -#else - // load filter in reverse order - k_act = MLO_FILTER_SIZE1 - 1 - k; -#endif - // load next input row - for(uint i_pvt = 0; i_pvt < MLO_PVT_IN_WIDTH; ++i_pvt) - { - pvt_in_stage[(MLO_PVT_IN_HEIGHT - 1) * MLO_PVT_IN_WIDTH + i_pvt] = - lcl_indata[in_stg_off2 + i_pvt]; - } - - // over all outputs - for(uint o_c = 0; o_c < MLO_N_OUT_TILES; ++o_c) - { - int wei_stg_off = wei_stg_base_off + o_c * MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ + - k_act * MLO_FILTER_SIZE0; - for(uint i = 0; i < MLO_FILTER_SIZE0; ++i) - { - pvt_wei_stage[i] = - lcl_wei[wei_stg_off + - i]; //(float)o_c/(float)MLO_N_OUT_TILES + (float)(i+k)/9; - } - - // actual conv - - for(uint j = 0; j < MLO_OUT_TILE1; ++j) - { -#if MLO_DIR_FORWARD == 0 - if(((j + k + 1 - MLO_PADDING_SHIFT1 + (MLO_FILTER_SIZE1 % MLO_FILTER_STRIDE1)) % - MLO_FILTER_STRIDE1) == 0) -#endif - for(uint i = 0; i < MLO_OUT_TILE0; ++i) - { - for(uint l = 0; l < MLO_FILTER_SIZE0; ++l) - { - - int l_act = 0; -#if MLO_DIR_FORWARD == 1 - l_act = l; - -#else - // in reverse horizontal and vertical orders - l_act = MLO_FILTER_SIZE0 - 1 - l; - -#endif - -#if MLO_DIR_FORWARD == 1 - pvt_accum[(o_c * MLO_OUT_TILE1 + j) * MLO_OUT_TILE0 + i] += - pvt_in_stage[j * MLO_PVT_IN_WIDTH * MLO_FILTER_STRIDE1 + - i * MLO_FILTER_STRIDE0 + l] * - pvt_wei_stage[l_act]; -#else - if(((i + l + 1 - MLO_PADDING_SHIFT0 + - (MLO_FILTER_SIZE0 % MLO_FILTER_STRIDE0)) % - MLO_FILTER_STRIDE0) == 0) - { - pvt_accum[(o_c * MLO_OUT_TILE1 + j) * MLO_OUT_TILE0 + i] += - pvt_in_stage[(j / MLO_FILTER_STRIDE1) * MLO_PVT_IN_WIDTH + - (i + l) / MLO_FILTER_STRIDE0] * - pvt_wei_stage[l_act]; - } -#endif - } - } - } - - } // for(uint o_c = 0; o_c < MLO_N_OUT_TILES; ++o_c) - - // move data up - for(uint j = 0; j < MLO_PVT_IN_HEIGHT - 1; ++j) - { - for(uint i = 0; i < MLO_PVT_IN_WIDTH; ++i) - { - pvt_in_stage[j * MLO_PVT_IN_WIDTH + i] = - pvt_in_stage[(j + 1) * MLO_PVT_IN_WIDTH + i]; - } - } - - } // for(uint k = 0; k < MLO_FILER_SIZE1; ++k,in_stg_off2+=MLO_IN_LCL_WIDTH) - - } // for(uint i_c = 0; i_c < MLO_N_IN_TILES_PERSTACK; ++i_c, in_stg_off1 += - // MLO_IN_LCL_PERSTACK_SZ) -} - -__attribute__((reqd_work_group_size(MLO_GRP_SZ0, MLO_GRP_SZ1, MLO_GRP_SZ2))) __kernel void -ConvFwd3x3(const __global _FLOAT* __restrict in, - const __global _FLOAT* __restrict weights, -#if MLO_CONV_BIAS - const __global _FLOAT* __restrict bias, -#endif - __global _FLOAT* __restrict out, - _FLOAT slope) -{ - -#if 1 - __local _FLOAT lcl_indata[MLO_IN_LCL_SZ]; - __local _FLOAT lcl_wei[MLO_WEIGHTS_SZ]; - __private _FLOAT pvt_accum[MLO_PVT_ACCUM_DATA_SZ]; - __private _FLOAT pvt_in_stage[MLO_PVT_IN_HEIGHT * MLO_PVT_IN_WIDTH]; - __private _FLOAT pvt_wei_stage[MLO_FILTER_SIZE0]; - - uint grp_id0 = get_group_id(0); -#if MLO_N_OUT_TILE_BLOCKS0 & (MLO_N_OUT_TILE_BLOCKS0 - 1) - uint y_tile_blk = iDiv(grp_id0, MLO_N_OUT_TILE_BLOCKS0); - uint x_tile_blk = iMod(grp_id0, y_tile_blk, MLO_N_OUT_TILE_BLOCKS0); -#else - uint y_tile_blk = grp_id0 / MLO_N_OUT_TILE_BLOCKS0; - uint x_tile_blk = grp_id0 & (MLO_N_OUT_TILE_BLOCKS0 - 1); -#endif - uint o_pack = get_group_id(1); // block of outputs - uint b_pack = get_group_id(2); // batch block - - uint lcl_id = get_local_id(0); -#if MLO_ALUTILES_STACK_SZ >= MLO_GRP_SZ - uint stack = 0; - uint alu_stack_id = lcl_id; -#elif MLO_ALUTILES_STACK_SZ & (MLO_ALUTILES_STACK_SZ - 1) - uint stack = iDiv(lcl_id, MLO_ALUTILES_STACK_SZ); // stack - uint alu_stack_id = iMod(lcl_id, stack, MLO_ALUTILES_STACK_SZ); // alu 
index in stack -#else - uint stack = lcl_id / MLO_ALUTILES_STACK_SZ; // stack - uint alu_stack_id = lcl_id & (MLO_ALUTILES_STACK_SZ - 1); // alu index in stack -#if MLO_ALUTILES_STACK_SZ >= 64 - stack = uniform(stack); -#endif -#endif -// ALU plane inside stack -#if MLO_ALU_TILE_SZ & (MLO_ALU_TILE_SZ - 1) - uint alu_out_plane_id = iDiv(alu_stack_id, MLO_ALU_TILE_SZ); // alu output plane index - uint alu_out_id = iMod( - alu_stack_id, alu_out_plane_id, MLO_ALU_TILE_SZ); // alu index inside an ALU output plane -#else - uint alu_out_plane_id = alu_stack_id / MLO_ALU_TILE_SZ; // alu output plane index - uint alu_out_id = alu_stack_id & (MLO_ALU_TILE_SZ - 1); // alu index inside an ALU output plane -#endif -// pos inside ALU tile -#if MLO_ALU_VTILE0 & (MLO_ALU_VTILE0 - 1) - uint alu_tl1 = iDiv(alu_out_id, MLO_ALU_VTILE0); - uint alu_tl0 = iMod(alu_out_id, alu_tl1, MLO_ALU_VTILE0); -#else - uint alu_tl1 = alu_out_id / MLO_ALU_VTILE0; - uint alu_tl0 = alu_out_id & (MLO_ALU_VTILE0 - 1); -#endif - - uint o_map_plane = - o_pack * MLO_N_OUT_TILES_PERSTACK; // first output maps index per full ALU plane stack - uint o_map_base = alu_out_plane_id * MLO_N_OUT_TILES; // local output map offset - uint o_map = o_map_plane + o_map_base; // output map index per ALU plane - uint b_index = b_pack * MLO_N_STACKS; - -#if MLO_LARGE_MAP != 1 -#if MLO_N_READ_PROCS >= MLO_GRP_SZ - uint wave_id = 0; - uint wave_lcl_id = lcl_id; -#elif MLO_N_READ_PROCS & (MLO_N_READ_PROCS - 1) - uint wave_id = iDiv(lcl_id, MLO_N_READ_PROCS); - uint wave_lcl_id = iMod(lcl_id, wave_id, MLO_N_READ_PROCS); -#else - uint wave_id = lcl_id / MLO_N_READ_PROCS; - uint wave_lcl_id = lcl_id & (MLO_N_READ_PROCS - 1); -#if MLO_N_READ_PROCS >= 64 - wave_id = uniform(wave_id); -#endif -#endif -#endif - -#if MLO_DIR_FORWARD == 1 - uint x_grp = x_tile_blk * MLO_IN_TILE0 * MLO_FILTER_STRIDE0; - uint y_grp = y_tile_blk * MLO_IN_TILE1 * MLO_FILTER_STRIDE1; -#if MLO_LARGE_MAP == 1 - uint x_in_grp = x_grp - MLO_FILTER_PAD0; - uint y_in_grp = y_grp - MLO_FILTER_PAD1; -#endif - uint x_in_lcl = alu_tl0 * MLO_OUT_TILE0 * MLO_FILTER_STRIDE0; - uint y_in_lcl = alu_tl1 * MLO_OUT_TILE1 * MLO_FILTER_STRIDE1; -#else - uint x_grp = x_tile_blk * (MLO_IN_TILE0 / MLO_FILTER_STRIDE0); - uint y_grp = y_tile_blk * (MLO_IN_TILE1 / MLO_FILTER_STRIDE1); -#if MLO_LARGE_MAP == 1 - uint x_in_grp = x_grp - (MLO_FILTER_PAD0 / MLO_FILTER_STRIDE0); - uint y_in_grp = y_grp - (MLO_FILTER_PAD1 / MLO_FILTER_STRIDE1); -#endif - uint x_in_lcl = alu_tl0 * (MLO_OUT_TILE0 / MLO_FILTER_STRIDE0); - uint y_in_lcl = alu_tl1 * (MLO_OUT_TILE1 / MLO_FILTER_STRIDE1); -#endif - - // base offset to read data from local input data - uint in_stg_off = stack * MLO_IN_LCL_PERSTACK_SZ + (y_in_lcl)*MLO_IN_LCL_WIDTH + x_in_lcl; - - uint in_off = b_index * MLO_IN_BATCH_STRIDE; - -#if MLO_DIR_FORWARD == 1 - uint wei_off = mul24(o_map_plane, (uint)(MLO_N_INPUTS * MLO_FILTER_SZ)); -#else - uint wei_off = mul24(o_map_plane, (uint)MLO_FILTER_SZ); -#endif - -#if MLO_LARGE_MAP == 0 - for(uint i = lcl_id; i < MLO_IN_LCL_SZ; i += MLO_GRP_SZ) - { - lcl_indata[i] = 0; - } -#endif - - for(uint i = 0; i < MLO_PVT_ACCUM_DATA_SZ; ++i) - { - pvt_accum[i] = 0; - } - - for(uint ic = 0; ic < MLO_N_INPUTS; ic += MLO_N_IN_TILES_PERSTACK, - in_off += MLO_IN_CHANNEL_STRIDE * MLO_N_IN_TILES_PERSTACK, - wei_off += MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ -#if MLO_DIR_FORWARD == 0 - * - MLO_N_OUTPUTS -#endif - ) - { - barrier(CLK_LOCAL_MEM_FENCE); - -// small map has been read in full continiously into the lDS buffer within padded rect, 
-// padding has been done on initilization. -// large map calculates padding on the fly and fills it with 0. - -#if 1 // all inputs - -#if MLO_LARGE_MAP == 1 - int in_lcl_off1 = 0; - int in_off1 = in_off; - for(uint i_b = 0; i_b < MLO_N_STACKS; - ++i_b, in_off1 += MLO_IN_BATCH_STRIDE, in_lcl_off1 += MLO_IN_LCL_PERSTACK_SZ) - { - bool vis = true; -#if MLO_BATCH_ALIGNED == 0 - vis &= (b_index + i_b < MLO_BATCH_SZ); -#endif - - // over all inputs in stack - int in_off2 = in_off1; - int in_lcl_off2 = in_lcl_off1; - for(uint i_c = 0; i_c < MLO_N_IN_TILES_PERSTACK; - ++i_c, in_off2 += MLO_IN_CHANNEL_STRIDE, in_lcl_off2 += MLO_IN_LCL_TILE_SZ) - { -#if MLO_INPUTS_ALIGNED == 0 - vis &= (ic + i_c < MLO_N_INPUTS); -#endif - - uint elem_id = lcl_id; - int lcl_p_stride = MLO_GRP_SZ0; - int lcl_base = 0; - int lcl_y = 0; - int lcl_x = 0; - int gbl_base = in_off2; - - readData(elem_id, - (MLO_IN_LCL_HEIGHT * MLO_IN_LCL_WIDTH), - lcl_p_stride, - &lcl_indata[in_lcl_off2], - lcl_base, - MLO_IN_LCL_HEIGHT, - MLO_IN_LCL_WIDTH, - MLO_IN_LCL_WIDTH, - lcl_y, - lcl_x, - &in[0], - gbl_base, - MLO_IN_HEIGHT, - MLO_IN_WIDTH, - MLO_IN_STRIDE, - y_in_grp, - x_in_grp, - vis, - true); - } - } -#else - for(uint i = wave_id; i < MLO_N_IN_TILES_TOTAL; i += MLO_N_PROC_WAVES) - { -#if MLO_N_IN_TILES_PERSTACK & (MLO_N_IN_TILES_PERSTACK - 1) - uint i_b = iDiv(i, MLO_N_IN_TILES_PERSTACK); - uint i_c = iMod(i, i_b, MLO_N_IN_TILES_PERSTACK); -#else - uint i_b = i / MLO_N_IN_TILES_PERSTACK; - uint i_c = i & (MLO_N_IN_TILES_PERSTACK - 1); -#endif - - bool vis = true; - -#if MLO_BATCH_ALIGNED == 0 - vis &= (b_index + i_b < MLO_BATCH_SZ); -#endif - -#if MLO_INPUTS_ALIGNED == 0 - vis &= (ic + i_c < MLO_N_INPUTS); -#endif - int in_off2 = in_off + i_b * MLO_IN_BATCH_STRIDE + i_c * MLO_IN_CHANNEL_STRIDE; - int in_lcl_off2 = i_b * MLO_IN_LCL_PERSTACK_SZ + i_c * MLO_IN_LCL_TILE_SZ; - - uint elem_id = wave_lcl_id; - int lcl_p_stride = MLO_N_READ_PROCS; - int lcl_base = 0; -#if MLO_DIR_FORWARD == 1 - int lcl_y = MLO_FILTER_PAD1; - int lcl_x = MLO_FILTER_PAD0; -#else - int lcl_y = (MLO_FILTER_PAD1 / MLO_FILTER_STRIDE0); - int lcl_x = (MLO_FILTER_PAD0 / MLO_FILTER_STRIDE1); -#endif - int gbl_base = in_off2; - - readData(elem_id, - (MLO_IN_HEIGHT * MLO_IN_WIDTH), - lcl_p_stride, - &lcl_indata[in_lcl_off2], - lcl_base, - MLO_IN_HEIGHT, - MLO_IN_WIDTH, - MLO_IN_LCL_WIDTH, - lcl_y, - lcl_x, - &in[0], - gbl_base, - MLO_IN_HEIGHT, - MLO_IN_WIDTH, - MLO_IN_STRIDE, - y_grp, - x_grp, - vis, - true); - } -#endif - -// read inputs and weights -// put weights into LDS - -#if 1 // only weights - - for(uint i = lcl_id; i < MLO_WEIGHTS_SZ; i += MLO_GRP_SZ) - { -#if MLO_DIR_FORWARD == 1 -// here is [tops][bottoms] -#if(MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ) & ((MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ) - 1) - uint lcl_o = iDiv(i, (MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ)); - uint gbl_i = iMod(i, lcl_o, (MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ)); -#else - uint lcl_o = i / (MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ); - uint gbl_i = i & ((MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ) - 1); -#endif - uint gbl_we_off = wei_off + lcl_o * MLO_N_INPUTS * MLO_FILTER_SZ + gbl_i; - bool within_range = gbl_we_off < (MLO_N_OUTPUTS * MLO_N_INPUTS * MLO_FILTER_SZ); - - gbl_we_off = (within_range) ? gbl_we_off : 0; - _FLOAT wei = weights[gbl_we_off]; - wei = (within_range) ? 
wei : 0; - lcl_wei[i] = wei; -#else -// outputs are botoms(inputs)) -// inputs are tops(outputs) -#if(MLO_N_OUT_TILES_PERSTACK * MLO_FILTER_SZ) & ((MLO_N_OUT_TILES_PERSTACK * MLO_FILTER_SZ) - 1) - uint lcl_o = iDiv(i, (MLO_N_OUT_TILES_PERSTACK * MLO_FILTER_SZ)); - uint gbl_i = iMod(i, lcl_o, (MLO_N_OUT_TILES_PERSTACK * MLO_FILTER_SZ)); -#else - uint lcl_o = i / (MLO_N_OUT_TILES_PERSTACK * MLO_FILTER_SZ); - uint gbl_i = i & ((MLO_N_OUT_TILES_PERSTACK * MLO_FILTER_SZ) - 1); -#endif -#if MLO_FILTER_SZ & (MLO_FILTER_SZ - 1) - uint lcl_c = iDiv(gbl_i, MLO_FILTER_SZ); - uint lcl_i = iMod(gbl_i, lcl_c, MLO_FILTER_SZ); -#else - uint lcl_c = gbl_i / MLO_FILTER_SZ; - uint lcl_i = gbl_i & (MLO_FILTER_SZ - 1); -#endif - - uint lcl_we_off = mad24( - mad24(lcl_c, (uint)MLO_N_IN_TILES_PERSTACK, lcl_o), (uint)MLO_FILTER_SZ, lcl_i); - uint gbl_we_off = mad24( - mad24(lcl_o, (uint)MLO_N_OUTPUTS, lcl_c), (uint)MLO_FILTER_SZ, wei_off + lcl_i); - bool within_range = gbl_we_off < (MLO_N_OUTPUTS * MLO_N_INPUTS * MLO_FILTER_SZ); - gbl_we_off = (within_range) ? gbl_we_off : 0; - _FLOAT wei = weights[gbl_we_off]; - wei = (within_range) ? wei : 0; - lcl_wei[lcl_we_off] = wei; - -#endif - } - -#endif - -// over all batch stacks - -#endif // all input - - barrier(CLK_LOCAL_MEM_FENCE); - - // convolution - Conv(o_map_base, in_stg_off, pvt_in_stage, lcl_indata, pvt_wei_stage, lcl_wei, pvt_accum); - - // barrier(CLK_LOCAL_MEM_FENCE); - } -// write results out -#if MLO_DIR_FORWARD == 1 -#if MLO_FILTER_STRIDE0 == 1 - int x_out_grp = x_grp; -#else - int x_out_grp = x_tile_blk * MLO_IN_TILE0; -#endif -#if MLO_FILTER_STRIDE1 == 1 - int y_out_grp = y_grp; -#else - int y_out_grp = y_tile_blk * MLO_IN_TILE1; -#endif -#else - int x_out_grp = x_grp * MLO_FILTER_STRIDE0; - int y_out_grp = y_grp * MLO_FILTER_STRIDE1; -#endif - int x_out_lcl = alu_tl0 * MLO_OUT_TILE0; - int y_out_lcl = alu_tl1 * MLO_OUT_TILE1; - - uint out_off = (b_index + stack) * MLO_OUT_BATCH_STRIDE + o_map * MLO_OUT_CHANNEL_STRIDE + - (y_out_grp + y_out_lcl) * MLO_OUT_STRIDE + x_out_grp + x_out_lcl; -// over all local stacks -#if MLO_BATCH_ALIGNED == 0 - if(b_index + stack < MLO_BATCH_SZ) -#endif - { - - // over all local outputs - int out_off1 = out_off; - for(uint o = 0; o < MLO_N_OUT_TILES; ++o, out_off1 += MLO_OUT_CHANNEL_STRIDE) - { -#if MLO_OUTPUTS_ALIGNED == 0 - if(o_map + o < MLO_N_OUTPUTS) -#endif - { - // over output tile - int out_off2 = out_off1; -#if MLO_OUT_TILE0 == 1 - for(int j = 0; j < MLO_OUT_TILE1 && y_out_grp + y_out_lcl + j < MLO_OUT_HEIGHT; - ++j, out_off2 += MLO_OUT_STRIDE) - { - for(int i = 0; i < MLO_OUT_TILE0 && x_out_grp + x_out_lcl + i < MLO_OUT_WIDTH && - out_off2 + i < MLO_OUT_BATCH_STRIDE * MLO_BATCH_SZ; - ++i) - { -#else - for(uint j = 0; j < MLO_OUT_TILE1; ++j, out_off2 += MLO_OUT_STRIDE) - { - if(y_out_grp + y_out_lcl + j < MLO_OUT_HEIGHT) - for(uint i = 0; i < MLO_OUT_TILE0; ++i) - { - if (x_out_grp + x_out_lcl + i < MLO_OUT_WIDTH && - out_off2 + i < MLO_OUT_BATCH_STRIDE * MLO_BATCH_SZ) - { -#endif - out[out_off2 + i] = pvt_accum[o * MLO_OUT_TILE_SZ + j * MLO_OUT_TILE0 + i] -#if MLO_CONV_BIAS - + bias[o_map + o] -#endif - ; - //ReLU fusion - out[out_off2 + i] *= (out[out_off2 + i] > 0.0f ? 
1.0f : slope); - } - } - } - } - } - } -#endif - - /*uint tid = get_local_id(0); - uint gid = get_group_id(0); - - __global _FLOAT* q = out + 256 * gid + tid; // linear address (for testing) - *q = 256 * gid + tid;*/ -} - diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M1.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC6M1.cl deleted file mode 100644 index 960a4e0a2..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M1.cl +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ -#define STRIDE (25088) -#define ITER (196) - -void reduce(__local float* buffer, int tid) -{ - if (tid < 64) - { - buffer[tid] += buffer[tid + 64]; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 32) - { - buffer[tid << 1] += buffer[(tid << 1) + 1]; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 16) - { - buffer[tid << 2] += buffer[(tid << 2) + 2]; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 8) - { - buffer[tid << 3] += buffer[(tid << 3) + 4]; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 4) - { - buffer[tid << 4] += buffer[(tid << 4) + 8]; - } - barrier(CLK_LOCAL_MEM_FENCE); -} - -__attribute__((reqd_work_group_size(128, 1, 1))) -__kernel void InnerProduct( - __global const float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - __local float shared_a[129]; - __local float shared_b[8][65]; - - __local float result[2][129]; - - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __global const float* pA = (__global const float*)(a + (grid_x >> 9) * STRIDE); - __global const float* pB = (__global const float*)(b + ((grid_x & 511) << 3) * STRIDE); - - int offset = ((grid_x << 6) + ((lid_x >> 6) * 12544) + (lid_x & 63)) % STRIDE; - - int temp_offset = offset; - for (int l = 0; l < 2; l++, offset = temp_offset) - { - float sum = 0.0f; - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - shared_a[lid_x] = pA[offset]; - - for (int j = l * 4; j < (l + 1) * 4; j++) - { - shared_b[(lid_x >> 6 << 2) + (j & 3)][(lid_x & 63)] = pB[offset + j * STRIDE]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (int k = 0; k < 4; k++) - { - sum += shared_a[(lid_x >> 6 << 6) + ((lid_x & 15) << 2) + k] * shared_b[(lid_x >> 6 << 2) + ((lid_x & 63) >> 4)][((lid_x & 15) << 2) + k]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - result[l][lid_x] = sum; - barrier(CLK_LOCAL_MEM_FENCE); - - reduce(result[l], lid_x); - } - - if (lid_x < 8) - { - c[(grid_x << 3) + lid_x] = result[lid_x >> 2][(lid_x & 3) << 4] + bias[(grid_x << 3) + lid_x]; - } -} - diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M2.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC6M2.cl deleted file mode 100644 index 568c241e6..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M2.cl +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
- - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ -#define STRIDE (25088) -#define CSTRIDE (4096) -#define ITER (392) - -void reduce(__local float* buffer, int tid) -{ - if (tid < 32) - { - buffer[tid << 1] += buffer[(tid << 1) + 1]; - } - if (tid < 16) - { - buffer[tid << 2] += buffer[(tid << 2) + 2]; - } -} - -__attribute__((reqd_work_group_size(64, 1, 1))) -__kernel void InnerProduct( - __global const float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - __local float shared_a[2][66]; - __local float shared_b[8][66]; - __local float result[65]; - - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __global const float* pA = (__global const float*)(a); - __global const float* pB = (__global const float*)(b + (grid_x << 3) * STRIDE); - - int offset = ((grid_x << 6)) % STRIDE; - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 2; j++) - { - shared_a[j][lid_x + (lid_x >> 5)] = pA[offset + lid_x + j * STRIDE]; - } - - for (int j = 0; j < 8; j++) - { - shared_b[j][lid_x + (lid_x >> 5)] = pB[offset + lid_x + j * STRIDE]; - } - - for (int k = 0; k < 16; k++) - { - sum += shared_a[lid_x >> 5][((lid_x & 3) << 4) + k + ((((lid_x & 3) << 4) + k) >> 5)] * shared_b[(lid_x & 31) >> 2][((lid_x & 3) << 4) + k + ((((lid_x & 3) << 4) + k) >> 5)]; - } - } - - result[lid_x] = sum; - reduce(result, lid_x); - - if (lid_x < 2) - { - float8 out; - float* pOut = (float*)&out; - - for (int i = 0; i < 8; i++) - { - pOut[i] = result[((lid_x * 8 + i) << 2)] + bias[(grid_x << 3) + i]; - } - - __global float8* pC = (__global float8*)(c + (grid_x << 3) + lid_x * CSTRIDE); - *pC = out; - } -} diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M32.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC6M32.cl deleted file mode 100644 index a135816b7..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M32.cl +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ -///////////////////////////////////////////////////////// -// FC6 batch 32 Version 3 2018.6.25 - -#define STRIDE (25088) -#define CSTRIDE (4096) -#define ITER (1568) - -__attribute__((reqd_work_group_size(256, 1, 1))) -__kernel void InnerProduct( - __global const float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - __local float shared_a[1024]; - __local float shared_b[1024]; - __local float4* pShared_a = (__local float4*)shared_a; - __local float4* pShared_b = (__local float4*)shared_b; - - float4 sha; - float4 shb; - - float4 sum = 0.0f; - float* pSum = (float*)&sum; - - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __global const float* pA = (__global const float*)(a); - __global const float* pB = (__global const float*)(b + (grid_x << 5) * STRIDE); - __global float4* pC = (__global float4*)(c + (lid_x >> 3) * CSTRIDE + (grid_x << 5) + ((lid_x & 7) << 2)); - - int offset = (grid_x << 5) % STRIDE; - - for (int i = 0; i < ITER; i++, offset = (offset + 16) % STRIDE) - { - for (int j = 0; j < 2; j++) - { - shared_a[((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32 + ((((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32) >> 5 << 2)] = pA[((j << 4) + (lid_x >> 4)) * STRIDE + (lid_x & 15) + offset]; - shared_b[((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32 + ((((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32) >> 5 << 2)] = pB[((j << 4) + (lid_x >> 4)) * STRIDE + (lid_x & 15) + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (int k = 0; k < 16; k++) - { - sha = shared_a[(lid_x >> 3) + k * 32 + (((lid_x >> 3) + k * 32) >> 5 << 2)]; - shb = pShared_b[(((lid_x & 7))) + k * 8 + (((((lid_x & 7))) + k * 8) >> 3)]; - sum += sha * shb; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - for (int i = 0; i < 4; i++) - { - shared_a[(lid_x >> 3) * 32 + ((lid_x & 7) << 2) + i] = pSum[i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - float4 result; - float* pResult = (float*)&result; - - for (int i = 0; i < 4; i++) - { - pResult[i] = shared_a[(lid_x << 2) + i] + bias[(grid_x << 5) + ((lid_x & 7) << 2) + i]; - } - - *pC = result; -} - diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M4.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC6M4.cl deleted file mode 100644 index 3b3fec19b..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M4.cl +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
-*/ -#define STRIDE (25088) -#define CSTRIDE (4096) -#define ITER (392) - -void reduce(__local float* buffer, int tid) -{ - if (tid < 32) - { - buffer[tid << 1] += buffer[(tid << 1) + 1]; - } - if (tid < 16) - { - buffer[tid << 2] += buffer[(tid << 2) + 2]; - } -} - -__attribute__((reqd_work_group_size(64, 1, 1))) -__kernel void InnerProduct( - __global const float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - __local float shared_a[4][66]; - __local float shared_b[4][66]; - __local float result[65]; - - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __global const float* pA = (__global const float*)(a); - __global const float* pB = (__global const float*)(b + (grid_x << 2) * STRIDE); - - int offset = ((grid_x << 6)) % STRIDE; - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 4; j++) - { - shared_a[j][lid_x + (lid_x >> 5)] = pA[offset + lid_x + j * STRIDE]; - shared_b[j][lid_x + (lid_x >> 5)] = pB[offset + lid_x + j * STRIDE]; - } - - for (int k = 0; k < 16; k++) - { - sum += shared_a[lid_x >> 4][((lid_x & 3) << 4) + k + ((((lid_x & 3) << 4) + k) >> 5)] * shared_b[(lid_x & 15) >> 2][((lid_x & 3) << 4) + k + ((((lid_x & 3) << 4) + k) >> 5)]; - } - } - - result[lid_x] = sum; - reduce(result, lid_x); - - if (lid_x < 4) - { - float4 out; - float* pOut = (float*)&out; - - for (int i = 0; i < 4; i++) - { - pOut[i] = result[((lid_x * 4 + i) << 2)] + bias[(grid_x << 2) + i]; - } - - __global float4* pC = (__global float4*)(c + (grid_x << 2) + lid_x * CSTRIDE); - *pC = out; - } -} diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M8.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC6M8.cl deleted file mode 100644 index 29cdb4a6a..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M8.cl +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ -#define STRIDE (25088) -#define CSTRIDE (4096) -#define ITER (392) - -__attribute__((reqd_work_group_size(64, 1, 1))) -__kernel void InnerProduct( - __global const float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - __local float shared_a[8][66]; - __local float shared_b[8][66]; - __local float result[65]; - - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __global const float* pA = (__global const float*)(a); - __global const float* pB = (__global const float*)(b + (grid_x << 3) * STRIDE); - - int offset = ((grid_x << 6)) % STRIDE; - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 8; j++) - { - shared_a[j][lid_x + (lid_x >> 5)] = pA[offset + lid_x + j * STRIDE]; - } - - for (int j = 0; j < 8; j++) - { - shared_b[j][lid_x + (lid_x >> 5)] = pB[offset + lid_x + j * STRIDE]; - } - - for (int k = 0; k < 64; k++) - { - sum += shared_a[lid_x >> 3][k + (k >> 5)] * shared_b[(lid_x & 7)][k + (k >> 5)]; - } - } - - result[lid_x] = sum; - - if (lid_x < 8) - { - float8 out; - float* pOut = (float*)&out; - - for (int i = 0; i < 8; i++) - { - pOut[i] = result[((lid_x * 8 + i))] + bias[(grid_x << 3) + i]; - } - - __global float8* pC = (__global float8*)(c + (grid_x << 3) + lid_x * CSTRIDE); - *pC = out; - } -} diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M1.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC7M1.cl deleted file mode 100644 index f166c449a..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M1.cl +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#define STRIDE (4096) -#define ITER (64) - -__attribute__((reqd_work_group_size(256, 1, 1))) -__kernel void InnerProduct( - __constant float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - __local float shared_b[64][65]; - - __local float result[256]; - - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __constant float* pA = (__constant float*)a; - __global const float* pB = (__global const float*)(b + ((grid_x << 6) + (lid_x >> 6 << 4)) * STRIDE + (lid_x & 63)); - - int offset = (grid_x << 7) % STRIDE; - - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 16; j++) - { - shared_b[(lid_x >> 6 << 4) + j][(lid_x & 63)] = pB[offset + j * STRIDE]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (int k = 0; k < 16; k++) - { - sum += pA[offset + (lid_x >> 6 << 4) + k] * shared_b[lid_x & 63][(lid_x >> 6 << 4) + k];//shared_a[(lid_x >> 6 << 4) + k] - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - result[lid_x] = sum; - barrier(CLK_LOCAL_MEM_FENCE); - - if (lid_x < 64) - { - result[lid_x] += result[lid_x + 64] + result[lid_x + 128] + result[lid_x + 192]; - c[(grid_x << 6) + lid_x] = result[lid_x] + bias[(grid_x << 6) + lid_x]; - } -} - diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M2.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC7M2.cl deleted file mode 100644 index fd4442971..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M2.cl +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ -#define STRIDE (4096) -#define ITER (64) - -__attribute__((reqd_work_group_size(64, 1, 1))) -__kernel void InnerProduct( - __constant float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - __local float shared_b[16][65]; - - __local float result[64]; - - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __constant float* pA = (__constant float*)(a + (grid_x >> 8 << 12)); //correct - __global const float* pB = (__global const float*)(b + ((grid_x & 255) << 4) * STRIDE + lid_x); - - int offset = ((grid_x & 255) << 6) % STRIDE; - - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 16; j++) - { - shared_b[j][lid_x] = pB[offset + j * STRIDE]; - } - //barrier(CLK_LOCAL_MEM_FENCE); - - for (int k = 0; k < 16; k++) - { - sum += pA[offset + (lid_x >> 4 << 4) + k] * shared_b[(lid_x & 15)][(lid_x >> 4 << 4) + k]; - } - //barrier(CLK_LOCAL_MEM_FENCE); - } - - result[lid_x] = sum; - //barrier(CLK_LOCAL_MEM_FENCE); - - if (lid_x < 16) - { - result[lid_x] += result[lid_x + 16] + result[lid_x + 32] + result[lid_x + 48]; - c[(grid_x << 4) + lid_x] = result[lid_x] + bias[((grid_x & 255) << 4) + lid_x]; - } -} - diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M32.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC7M32.cl deleted file mode 100644 index 88a8659d1..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M32.cl +++ /dev/null @@ -1,83 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ -///////////////////////////////////////////////////////// -// FC7 batch 32 Version 3 2018.6.25 - -#define STRIDE (4096) -#define CSTRIDE (4096) -#define ITER (256) - -__attribute__((reqd_work_group_size(256, 1, 1))) -__kernel void InnerProduct( - __global const float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - __local float shared_a[1024]; - __local float shared_b[1024]; - __local float4* pShared_a = (__local float4*)shared_a; - __local float4* pShared_b = (__local float4*)shared_b; - - float4 sha; - float4 shb; - - float4 sum = 0.0f; - float* pSum = (float*)&sum; - - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __global const float* pA = (__global const float*)(a); - __global const float* pB = (__global const float*)(b + (grid_x << 5) * STRIDE); - __global float4* pC = (__global float4*)(c + (lid_x >> 3) * CSTRIDE + (grid_x << 5) + ((lid_x & 7) << 2)); - - int offset = (grid_x << 6) % STRIDE; - - for (int i = 0; i < ITER; i++, offset = (offset + 16) % STRIDE) - { - for (int j = 0; j < 2; j++) - { - shared_a[((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32 + ((((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32) >> 5 << 2)] = pA[((j << 4) + (lid_x >> 4)) * STRIDE + (lid_x & 15) + offset]; - shared_b[((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32 + ((((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32) >> 5 << 2)] = pB[((j << 4) + (lid_x >> 4)) * STRIDE + (lid_x & 15) + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (int k = 0; k < 16; k++) - { - sha = shared_a[(lid_x >> 3) + k * 32 + (((lid_x >> 3) + k * 32) >> 5 << 2)]; - shb = pShared_b[(((lid_x & 7))) + k * 8 + (((((lid_x & 7))) + k * 8) >> 3)]; - sum += sha * shb; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - for (int i = 0; i < 4; i++) - { - shared_a[(lid_x >> 3) * 32 + ((lid_x & 7) << 2) + i] = pSum[i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - float4 result; - float* pResult = (float*)&result; - - for (int i = 0; i < 4; i++) - { - pResult[i] = shared_a[(lid_x << 2) + i] + bias[(grid_x << 5) + ((lid_x & 7) << 2) + i]; - } - - *pC = result; -} diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M4.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC7M4.cl deleted file mode 100644 index f7eb1091b..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M4.cl +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
-*/ -#define STRIDE (4096) -#define CSTRIDE (4096) -#define ITER (64) - -void reduce(__local float* buffer, int tid) -{ - if (tid < 32) - { - buffer[tid << 1] += buffer[(tid << 1) + 1]; - } -} - -__attribute__((reqd_work_group_size(64, 1, 1))) -__kernel void InnerProduct( - __global const float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - __local float shared_a[4][66]; - __local float shared_b[8][66]; - __local float result[65]; - - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __global const float* pA = (__global const float*)(a); - __global const float* pB = (__global const float*)(b + (grid_x << 3) * STRIDE); - - int offset = ((grid_x << 6)) % STRIDE; - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 4; j++) - { - shared_a[j][lid_x + (lid_x >> 5)] = pA[offset + lid_x + j * STRIDE]; - } - - for (int j = 0; j < 8; j++) - { - shared_b[j][lid_x + (lid_x >> 5)] = pB[offset + lid_x + j * STRIDE]; - } - - for (int k = 0; k < 32; k++) - { - sum += shared_a[lid_x >> 4][((lid_x & 1) << 5) + k + ((((lid_x & 1) << 5) + k) >> 5)] * shared_b[(lid_x & 15) >> 1][((lid_x & 1) << 5) + k + ((((lid_x & 1) << 5) + k) >> 5)]; - } - } - - result[lid_x] = sum; - reduce(result, lid_x); - - if (lid_x < 4) - { - float8 out; - float* pOut = (float*)&out; - - for (int i = 0; i < 8; i++) - { - pOut[i] = result[((lid_x * 8 + i) << 1)] + bias[(grid_x << 3) + i]; - } - - __global float8* pC = (__global float8*)(c + (grid_x << 3) + lid_x * CSTRIDE); - *pC = out; - } -} diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M8.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC7M8.cl deleted file mode 100644 index ba62863d5..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M8.cl +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ -#define STRIDE (4096) -#define CSTRIDE (4096) -#define ITER (64) -#define NOVECTOR - -__attribute__((reqd_work_group_size(64, 1, 1))) -__kernel void InnerProduct( - __global const float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - __local float shared_a[8][66]; - __local float shared_b[8][66]; - __local float result[65]; - - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __global const float* pA = (__global const float*)(a); - __global const float* pB = (__global const float*)(b + (grid_x << 3) * STRIDE); - - int offset = ((grid_x << 6)) % STRIDE; - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 8; j++) - { - shared_a[j][lid_x + (lid_x >> 5)] = pA[offset + lid_x + j * STRIDE]; - } - - for (int j = 0; j < 8; j++) - { - shared_b[j][lid_x + (lid_x >> 5)] = pB[offset + lid_x + j * STRIDE]; - } - - for (int k = 0; k < 64; k++) - { - sum += shared_a[lid_x >> 3][k + (k >> 5)] * shared_b[(lid_x & 7)][k + (k >> 5)]; - } - } - - result[lid_x] = sum; - if (lid_x < 8) - { - float8 out; - float* pOut = (float*)&out; - - for (int i = 0; i < 8; i++) - { - pOut[i] = result[((lid_x * 8 + i))] + bias[(grid_x << 3) + i]; - } - - __global float8* pC = (__global float8*)(c + (grid_x << 3) + lid_x * CSTRIDE); - *pC = out; - } -} diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC8.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC8.cl deleted file mode 100644 index 8d9257251..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC8.cl +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ -#define STRIDE (4096) -#define ITER (16) - -#define OUTPUT 1000 - -__attribute__((reqd_work_group_size(256, 1, 1))) -__kernel void InnerProduct( - __constant float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __local float shared_b[64][65]; - __local float result[256]; - - __constant float* pA = (__constant float*)a; - __global const float* pB = (__global const float*)(b + ((grid_x << 4)) * STRIDE + (lid_x & 63)); //correct - - int offset = ((grid_x << 6) + (lid_x >> 6 << 10)) % STRIDE; - - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 16; j++) - { - shared_b[(lid_x >> 6 << 4) + j][(lid_x & 63)] = (offset + j * STRIDE + ((grid_x << 4)) * STRIDE + (lid_x & 63) < OUTPUT * STRIDE ? 
pB[(offset + j * STRIDE)] : 0.0f); //correct - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (int k = 0; k < 16; k++) - { - sum += pA[(offset + ((lid_x & 3) << 4) + k) % STRIDE] * shared_b[(lid_x >> 2)][((lid_x & 3) << 4) + k]; //correct - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - result[lid_x] = sum; - barrier(CLK_LOCAL_MEM_FENCE); - - if (lid_x < 64) - { - result[(lid_x << 2)] += result[(lid_x << 2) + 1] + result[(lid_x << 2) + 2] + result[(lid_x << 2) + 3]; - barrier(CLK_LOCAL_MEM_FENCE); - result[(lid_x << 2)] += result[(lid_x << 2) + 64] + result[(lid_x << 2) + 128] + result[(lid_x << 2) + 192]; - - if (lid_x < 16 && (grid_x << 4) + lid_x < OUTPUT) - { - c[(grid_x << 4) + lid_x] = bias[(grid_x << 4) + lid_x] + result[(lid_x << 2)]; - } - } -} - diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M1.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC8M1.cl deleted file mode 100644 index 7535432d2..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M1.cl +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#define STRIDE (4096) -#define ITER (16) - -#define OUTPUT 1000 - -__attribute__((reqd_work_group_size(256, 1, 1))) -__kernel void InnerProduct( - __constant float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __local float shared_b[64][65]; - __local float result[256]; - - __constant float* pA = (__constant float*)a; - __global const float* pB = (__global const float*)(b + ((grid_x << 4)) * STRIDE + (lid_x & 63)); //correct - - int offset = ((grid_x << 6) + (lid_x >> 6 << 10)) % STRIDE; - - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 16; j++) - { - shared_b[(lid_x >> 6 << 4) + j][(lid_x & 63)] = (offset + j * STRIDE + ((grid_x << 4)) * STRIDE + (lid_x & 63) < OUTPUT * STRIDE ? 
pB[(offset + j * STRIDE)] : 0.0f); //correct - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (int k = 0; k < 16; k++) - { - sum += pA[(offset + ((lid_x & 3) << 4) + k) % STRIDE] * shared_b[(lid_x >> 2)][((lid_x & 3) << 4) + k]; //correct - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - result[lid_x] = sum; - barrier(CLK_LOCAL_MEM_FENCE); - - if (lid_x < 64) - { - result[(lid_x << 2)] += result[(lid_x << 2) + 1] + result[(lid_x << 2) + 2] + result[(lid_x << 2) + 3]; - barrier(CLK_LOCAL_MEM_FENCE); - result[(lid_x << 2)] += result[(lid_x << 2) + 64] + result[(lid_x << 2) + 128] + result[(lid_x << 2) + 192]; - - if (lid_x < 16 && (grid_x << 4) + lid_x < OUTPUT) - { - c[(grid_x << 4) + lid_x] = bias[(grid_x << 4) + lid_x] + result[(lid_x << 2)]; - } - } -} - diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M2.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC8M2.cl deleted file mode 100644 index 9e8c270b7..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M2.cl +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#define STRIDE (4096) -#define HSTRIDE (2048) -#define ITER (32) - -#define OUTPUT 1000 - -void reduce(__local float* buffer, int tid) -{ - if (tid < 64) - { - buffer[tid] += buffer[tid + 64]; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 32) - { - buffer[tid << 1] += buffer[(tid << 1) + 1]; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 16) - { - buffer[tid << 2] += buffer[(tid << 2) + 2]; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 8) - { - buffer[tid << 3] += buffer[(tid << 3) + 4]; - } - barrier(CLK_LOCAL_MEM_FENCE); -} - -__attribute__((reqd_work_group_size(128, 1, 1))) -__kernel void InnerProduct( - __constant float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __local float shared_b[16][65]; - __local float result[129]; - - __constant float* pA = (__constant float*)(a + (grid_x >> 7 << 12)); - __global const float* pB = (__global const float*)(b); - - int offset = (((grid_x & 127) << 6)) % HSTRIDE; - - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % HSTRIDE) - { - for (int j = 0; j < 8; j++) - { - shared_b[(lid_x >> 6 << 3) + j][(lid_x & 63)] = (offset + (lid_x >> 6 << 11) + (j + ((grid_x & 127) << 3)) * STRIDE + (lid_x & 63) < OUTPUT * STRIDE ? 
pB[offset + (lid_x >> 6 << 11) + (j + ((grid_x & 127) << 3)) * STRIDE + (lid_x & 63)] : 0.0f); //correct - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (int k = 0; k < 8; k++) - { - sum += pA[(offset + ((lid_x & 7) << 3) + k) % HSTRIDE + (lid_x >> 6 << 11)] * shared_b[(lid_x >> 6 << 3) + ((lid_x & 63) >> 3)][((lid_x & 7) << 3) + k]; //correct - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - result[lid_x] = sum; - barrier(CLK_LOCAL_MEM_FENCE); - - reduce(result, lid_x); - - if (lid_x < 8 && ((grid_x & 127) << 3) + lid_x < OUTPUT) - { - int out_offset = ((grid_x >> 7) * OUTPUT + ((grid_x & 127) << 3) + lid_x); - c[out_offset] = bias[((grid_x & 127) << 3) + lid_x] + result[(lid_x << 3)]; - } -} - diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M32.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC8M32.cl deleted file mode 100644 index 99687a01b..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M32.cl +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#define STRIDE (4096) -#define ITER (64) - -#define OUTPUT 1000 - -void reduce(__local float* buffer, int tid) -{ - if (tid < 32) - { - buffer[tid << 1] += buffer[(tid << 1) + 1]; - } -} - -__attribute__((reqd_work_group_size(64, 1, 1))) -__kernel void InnerProduct( - __global const float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __local float shared_a[4][65]; - __local float shared_b[8][65]; - __local float result[65]; - - __global const float* pA = (__global const float*)(a + (grid_x >> 7 << 14)); //correct - __global const float* pB = (__global const float*)(b); //correct - - int offset = (((grid_x & 127) << 6)) % STRIDE; - - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 4; j++) - { - shared_a[j][lid_x] = pA[offset + j * STRIDE + lid_x]; - } - - for (int j = 0; j < 8; j++) - { - shared_b[j][(lid_x)] = ((j + ((grid_x & 127) << 3)) * STRIDE + (offset + lid_x) < OUTPUT * STRIDE ? pB[(j + ((grid_x & 127) << 3)) * STRIDE + (offset + lid_x)] : 0.0f); //correct - } - - for (int k = 0; k < 32; k++) - { - sum += shared_a[lid_x >> 4][((lid_x & 1) << 5) + k] * shared_b[((lid_x & 15) >> 1)][((lid_x & 1) << 5) + k]; //correct - } - } - - result[lid_x] = sum; - reduce(result, lid_x); - - if (lid_x < 32 && ((grid_x & 127) << 3) + (lid_x & 7) < OUTPUT) - { - int out_offset = ((grid_x >> 7 << 2) + (lid_x >> 3)) * OUTPUT + ((grid_x & 127) << 3) + (lid_x & 7); - c[out_offset] = bias[((grid_x & 127) << 3) + (lid_x & 7)] + result[(lid_x << 1)]; - } -} - diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M4.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC8M4.cl deleted file mode 100644 index 952e71fd5..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M4.cl +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
- - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#define STRIDE (4096) -#define ITER (64) - -#define OUTPUT 1000 - -void reduce(__local float* buffer, int tid) -{ - if (tid < 32) - { - buffer[tid << 1] += buffer[(tid << 1) + 1]; - } - if (tid < 16) - { - buffer[tid << 2] += buffer[(tid << 2) + 2]; - } - if (tid < 8) - { - buffer[tid << 3] += buffer[(tid << 3) + 4]; - } -} - -__attribute__((reqd_work_group_size(64, 1, 1))) -__kernel void InnerProduct( - __constant float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __local float shared_b[8][65]; - __local float result[64]; - - __constant float* pA = (__constant float*)(a + (grid_x >> 7 << 12)); //correct - __global const float* pB = (__global const float*)(b); //correct - - int offset = (((grid_x & 127) << 6)) % STRIDE; - - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 8; j++) - { - shared_b[j][(lid_x)] = (offset + (j + ((grid_x & 127) << 3)) * STRIDE + (lid_x) < OUTPUT * STRIDE ? pB[offset + (j + ((grid_x & 127) << 3)) * STRIDE + (lid_x)] : 0.0f); //correct - } - - for (int k = 0; k < 8; k++) - { - sum += pA[(offset + ((lid_x & 7) << 3) + k)] * shared_b[((lid_x) >> 3)][((lid_x & 7) << 3) + k]; //correct - } - } - - result[lid_x] = sum; - - reduce(result, lid_x); - - if (lid_x < 8 && ((grid_x & 127) << 3) + lid_x < OUTPUT) - { - int out_offset = ((grid_x >> 7) * OUTPUT + ((grid_x & 127) << 3) + lid_x); - c[out_offset] = bias[((grid_x & 127) << 3) + lid_x] + result[(lid_x << 3)]; - } -} - diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M8.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC8M8.cl deleted file mode 100644 index 3e72c5054..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M8.cl +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#define STRIDE (4096) -#define ITER (64) - -#define OUTPUT 1000 - -void reduce(__local float* buffer, int tid) -{ - if (tid < 32) - { - buffer[tid << 1] += buffer[(tid << 1) + 1]; - } - if (tid < 16) - { - buffer[tid << 2] += buffer[(tid << 2) + 2]; - } - if (tid < 8) - { - buffer[tid << 3] += buffer[(tid << 3) + 4]; - } -} - -__attribute__((reqd_work_group_size(64, 1, 1))) -__kernel void InnerProduct( - __global const float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __local float shared_a[65]; - __local float shared_b[8][65]; - __local float result[64]; - - __global const float* pA = (__global const float*)(a + (grid_x >> 7 << 12)); //correct - __global const float* pB = (__global const float*)(b); //correct - - int offset = (((grid_x & 127) << 6)) % STRIDE; - - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - shared_a[lid_x] = pA[offset + lid_x]; - - for (int j = 0; j < 8; j++) - { - shared_b[j][(lid_x)] = (offset + (j + ((grid_x & 127) << 3)) * STRIDE + (lid_x) < OUTPUT * STRIDE ? pB[offset + (j + ((grid_x & 127) << 3)) * STRIDE + (lid_x)] : 0.0f); //correct - } - - for (int k = 0; k < 8; k++) - { -#if 1 - sum = mad(shared_a[((lid_x & 7) << 3) + k], shared_b[((lid_x) >> 3)][((lid_x & 7) << 3) + k], sum); //correct -#else - sum += shared_a[((lid_x & 7) << 3) + k] * shared_b[((lid_x) >> 3)][((lid_x & 7) << 3) + k]; //correct -#endif - } - } - - result[lid_x] = sum; - - reduce(result, lid_x); - - if (lid_x < 8 && ((grid_x & 127) << 3) + lid_x < OUTPUT) - { - int out_offset = ((grid_x >> 7) * OUTPUT + ((grid_x & 127) << 3) + lid_x); - c[out_offset] = bias[((grid_x & 127) << 3) + lid_x] + result[(lid_x << 3)]; - } -} - diff --git a/saber/funcs/impl/amd/cl/MIOpenBiasReLuPooling.cl b/saber/funcs/impl/amd/cl/MIOpenBiasReLuPooling.cl deleted file mode 100644 index a17634bc9..000000000 --- a/saber/funcs/impl/amd/cl/MIOpenBiasReLuPooling.cl +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#define _FLOAT float -#define _FLOAT2 float2 -#define _FLOAT4 float4 -#define _FLOAT8 float8 -#define _INT_MASK_GLOBAL uchar -#define _INT_MASK_LOCAL uchar - -#ifndef FLT_MAX -#define FLT_MAX 3.402823466e+38F /* max value */ -#endif - -#define UNUSED __attribute__((__unused__)) - -#define MLO_POOLING_OP_MAX 0 -#define MLO_POOLING_OP_AVE 1 -#define MLO_POOLING_OP_STC 2 - -#define MLO_POOLING_GROUP_SZ2 1 - -#ifndef MLO_POOLING_OP_ID -#define MLO_POOLING_OP_ID 0 -#endif -// max -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX -#define MLO_POOLING_OP(A, B) fmax(A, B); -#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE -#define MLO_POOLING_OP(A, B) (A + B); -#endif - -/********************************************************************************* - -**********************************************************************************/ - -#define THREAD_PER_WAVE 64 -#define WAVE_PER_4SIMD 40 - -#define MLO_BOT_DATA_SZ0 2 -#define MLO_BOT_DATA_SZ1 2 - -//#define LOCAL_MEMORY - -__attribute__((reqd_work_group_size(256, 1, 1))) __kernel void -mloPooling(const __global _FLOAT* bot, - __global _FLOAT* top, - __global _FLOAT* bias, - float slope) -{ - uint gid = get_global_id(0); - uint ob = BATCH_NUM * MLO_POOLING_N_OUTPUTS; // output * batch_sz - uint bot_off = 0; - uint top_off = gid; - - _FLOAT2 bot_data[MLO_BOT_DATA_SZ1]; - _FLOAT res; - -#ifdef LOCAL_MEMORY - __local _FLOAT write_combine[256]; - __local _FLOAT4* p_write_combine = (__local _FLOAT4*)write_combine; - __global _FLOAT4* p_top; -#endif - - uint loop_num = ((ob * MLO_POOLING_TOP_STRIDE * MLO_POOLING_TOP_HEIGHT + THREAD_PER_WAVE * CU_NUM * WAVE_PER_4SIMD - 1) / (THREAD_PER_WAVE * CU_NUM * WAVE_PER_4SIMD)); - uint top_loop_stride = THREAD_PER_WAVE * CU_NUM * WAVE_PER_4SIMD; - - for (int index = 0; index < loop_num && top_off < ob * MLO_POOLING_TOP_STRIDE * MLO_POOLING_TOP_HEIGHT; index++, top_off += top_loop_stride) - { - uint bot_b = (top_off / MLO_POOLING_TOP_BATCH_STRIDE); - uint bot_c = (top_off % MLO_POOLING_TOP_BATCH_STRIDE / MLO_POOLING_TOP_CHANNEL_STRIDE); - uint bot_y = (top_off % MLO_POOLING_TOP_CHANNEL_STRIDE / MLO_POOLING_TOP_STRIDE) << 1; - uint bot_x = (top_off % MLO_POOLING_TOP_STRIDE) << 1; - - bot_off = bot_b * MLO_POOLING_BOT_BATCH_STRIDE + bot_c * MLO_POOLING_BOT_CHANNEL_STRIDE + bot_y * MLO_POOLING_BOT_STRIDE + bot_x; -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX - res = -FLT_MAX; -#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - res = 0; -#endif - -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - uint pool_size = 0; -#endif - for(uint j = 0; j < MLO_BOT_DATA_SZ1; ++j) - { - uint bot_gbl_off = bot_off + j * MLO_POOLING_BOT_STRIDE; - __global _FLOAT2* read = (__global _FLOAT2*)(bot + bot_gbl_off); - bot_data[j] = *read; - bot_data[j] += bias[bot_c]; - bot_data[j].s0 *= (bot_data[j].s0 > 0.0f ? 1.0f : slope); - bot_data[j].s1 *= (bot_data[j].s1 > 0.0f ? 
1.0f : slope); - -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - pool_size += (uint)vis; -#endif - res = MLO_POOLING_OP(res, bot_data[j].s0); - - -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - pool_size += (uint)vis; -#endif - res = MLO_POOLING_OP(res, bot_data[j].s1); - } - -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - res *= 1.f / (_FLOAT)pool_size; -#endif - -#ifdef LOCAL_MEMORY - write_combine[get_local_id(0)] = res; - if (get_local_id(0) % 4 == 0) - { - p_top = (__global _FLOAT4*)(top + top_off); - *p_top = p_write_combine[get_local_id(0) / 4]; - } -#else - top[top_off] = res; -#endif - } -} diff --git a/saber/funcs/impl/amd/cl/Pooling.cl b/saber/funcs/impl/amd/cl/Pooling.cl deleted file mode 100644 index 31927ad16..000000000 --- a/saber/funcs/impl/amd/cl/Pooling.cl +++ /dev/null @@ -1,164 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#define _FLOAT float -#define _FLOAT2 float2 -#define _FLOAT4 float4 -#define _FLOAT8 float8 -#define _INT_MASK_GLOBAL uchar -#define _INT_MASK_LOCAL uchar - -#ifndef FLT_MAX -#define FLT_MAX 3.402823466e+38F /* max value */ -#endif - -#define UNUSED __attribute__((__unused__)) - -#define MLO_POOLING_OP_MAX 0 -#define MLO_POOLING_OP_AVE 1 -#define MLO_POOLING_OP_STC 2 - -#define MLO_POOLING_GROUP_SZ2 1 - -#ifndef MLO_POOLING_OP_ID -#define MLO_POOLING_OP_ID 0 -#endif -// max -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX -#define MLO_POOLING_OP(A, B) fmax(A, B); -#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE -#define MLO_POOLING_OP(A, B) (A + B); -#endif - -/********************************************************************************* - -**********************************************************************************/ - -#define THREAD_PER_WAVE 64 -#define WAVE_PER_4SIMD 40 - -#define MLO_BOT_DATA_SZ0 2 -#define MLO_BOT_DATA_SZ1 2 - -//#define LOCAL_MEMORY - -__attribute__((reqd_work_group_size(256, 1, 1))) __kernel void -mloPooling(const __global _FLOAT* bot, - __global _FLOAT* top ) -// __global _INT_MASK_GLOBAL* mask) -{ - uint gid = get_global_id(0); - uint ob = BATCH_NUM * MLO_POOLING_N_OUTPUTS; // output * batch_sz - uint bot_off = 0; - uint top_off = gid; - - _FLOAT2 bot_data[MLO_BOT_DATA_SZ1]; - _FLOAT res; - -#ifdef LOCAL_MEMORY - __local _FLOAT write_combine[256]; - __local _FLOAT4* p_write_combine = (__local _FLOAT4*)write_combine; - __global _FLOAT4* p_top; -#endif - - uint loop_num = ((ob * MLO_POOLING_TOP_STRIDE * MLO_POOLING_TOP_HEIGHT + THREAD_PER_WAVE * CU_NUM * WAVE_PER_4SIMD - 1) / (THREAD_PER_WAVE * CU_NUM * WAVE_PER_4SIMD)); - uint top_loop_stride = THREAD_PER_WAVE * CU_NUM * WAVE_PER_4SIMD; - - for (int index = 0; index < loop_num && top_off < ob * MLO_POOLING_TOP_STRIDE * MLO_POOLING_TOP_HEIGHT; index++, top_off += top_loop_stride) - { - uint bot_b = (top_off / MLO_POOLING_TOP_BATCH_STRIDE); - uint bot_c = (top_off % MLO_POOLING_TOP_BATCH_STRIDE / MLO_POOLING_TOP_CHANNEL_STRIDE); - uint bot_y = (top_off % MLO_POOLING_TOP_CHANNEL_STRIDE / MLO_POOLING_TOP_STRIDE) 
<< 1; - uint bot_x = (top_off % MLO_POOLING_TOP_STRIDE) << 1; - - bot_off = bot_b * MLO_POOLING_BOT_BATCH_STRIDE + bot_c * MLO_POOLING_BOT_CHANNEL_STRIDE + bot_y * MLO_POOLING_BOT_STRIDE + bot_x; -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX - res = -FLT_MAX; -#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - res = 0; -#endif - -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - uint pool_size = 0; -#endif - for(uint j = 0; j < MLO_BOT_DATA_SZ1; ++j) - { - //int run_y = (int)j; - - uint bot_gbl_off = bot_off + j * MLO_POOLING_BOT_STRIDE; - __global _FLOAT2* read = (__global _FLOAT2*)(bot + bot_gbl_off); - bot_data[j] = *read; - - //int run_x = (int)bot_x; -//#if 1 -// bool vis = true; -//#else -// bool vis = ((run_y >= 0 && run_y < MLO_POOLING_BOT_HEIGHT) && -// (run_x >= 0 && run_x < MLO_POOLING_BOT_WIDTH)) -// ? true -// : false; -//#endif -// bot_data[j].s0 = (vis) ? bot_data[j].s0 : -//#if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX -// -FLT_MAX -//#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE -// 0 -//#endif -// ; - -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - pool_size += (uint)vis; -#endif - res = MLO_POOLING_OP(res, bot_data[j].s0); - - - //run_x++; -//#if 1 -//#else -// vis = ((run_y >= 0 && run_y < MLO_POOLING_BOT_HEIGHT) && -// (run_x >= 0 && run_x < MLO_POOLING_BOT_WIDTH)) -// ? true -// : false; -//#endif -// bot_data[j].s1 = (vis) ? bot_data[j].s1 : -//#if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX -// -FLT_MAX -//#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE -// 0 -//#endif -// ; - -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - pool_size += (uint)vis; -#endif - res = MLO_POOLING_OP(res, bot_data[j].s1); - } - -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - res *= 1.f / (_FLOAT)pool_size; -#endif - -#ifdef LOCAL_MEMORY - write_combine[get_local_id(0)] = res; - if (get_local_id(0) % 4 == 0) - { - p_top = (__global _FLOAT4*)(top + top_off); - *p_top = p_write_combine[get_local_id(0) / 4]; - } -#else - top[top_off] = res; -#endif - } -} diff --git a/saber/funcs/impl/amd/cl/Relu.cl b/saber/funcs/impl/amd/cl/Relu.cl deleted file mode 100644 index 87f0e3e42..000000000 --- a/saber/funcs/impl/amd/cl/Relu.cl +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
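The pooling kernels removed here both walk the output tensor in NCHW order: each work-item maps its top offset back to a 2x2 input window (the << 1 shifts), and the fused variant additionally adds a per-channel bias and applies a leaky-ReLU slope before pooling. Below is a minimal host-side sketch of that computation, assuming NCHW layout, a 2x2 window with stride 2, and the max-pooling path (MLO_POOLING_OP_ID == 0); the function and parameter names are illustrative, not part of the deleted sources.

#include <algorithm>
#include <cstddef>
#include <vector>

// Reference for the fused bias + leaky-ReLU + 2x2/stride-2 max pooling that the
// deleted kernels implement on the GPU. Layout is NCHW; `slope` is the negative
// slope applied before pooling, matching the kernel's "x * (x > 0 ? 1 : slope)".
void pool2x2_bias_leaky_relu_ref(const std::vector<float>& bot,   // N*C*H*W input
                                 const std::vector<float>& bias,  // C per-channel bias
                                 std::vector<float>& top,         // N*C*(H/2)*(W/2) output
                                 int N, int C, int H, int W, float slope) {
    const int OH = H / 2, OW = W / 2;
    top.assign(static_cast<size_t>(N) * C * OH * OW, 0.f);
    for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
    for (int oy = 0; oy < OH; ++oy)
    for (int ox = 0; ox < OW; ++ox) {
        float res = -3.402823466e+38F;  // -FLT_MAX, identity for max pooling
        for (int ky = 0; ky < 2; ++ky)
        for (int kx = 0; kx < 2; ++kx) {
            size_t in_idx = ((static_cast<size_t>(n) * C + c) * H + oy * 2 + ky) * W
                            + ox * 2 + kx;
            float v = bot[in_idx] + bias[c];   // fused bias add
            v *= (v > 0.f ? 1.f : slope);      // fused leaky ReLU
            res = std::max(res, v);
        }
        top[((static_cast<size_t>(n) * C + c) * OH + oy) * OW + ox] = res;
    }
}

The MLO_POOLING_OP_AVE path of the kernel accumulates the same four values and scales by the window size instead of taking the maximum.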
-*/ -#define _FLOAT float -#define _FLOAT2 float2 -#define _FLOAT4 float4 -#define _FLOAT8 float8 - -#ifndef FLT_MAX -#define FLT_MAX 3.402823466e+38F /* max value */ -#endif - -#define UNUSED __attribute__((__unused__)) - -#ifndef MLO_FILTER_STRIDE0 -#define MLO_FILTER_STRIDE0 1 -#endif -#ifndef MLO_FILTER_STRIDE1 -#define MLO_FILTER_STRIDE1 1 -#endif - -#define MLO_GRP_SZ0 (MLO_GRP_TILE0 * MLO_GRP_TILE1) -#define MLO_GRP_SZ1 1 -#define MLO_GRP_SZ2 1 -#define MLO_GRP_SZ (MLO_GRP_SZ0 * MLO_GRP_SZ1 * MLO_GRP_SZ2) -#define MLO_ALU_TILE_SZ (MLO_ALU_VTILE1 * MLO_ALU_VTILE0) - -#define MLO_N_ALUTILES_TOTAL ((MLO_GRP_TILE0 * MLO_GRP_TILE1) / (MLO_ALU_TILE_SZ)) -#define MLO_N_ALUTILES_PERSTACK (MLO_N_ALUTILES_TOTAL / MLO_N_STACKS) -#define MLO_ALUTILES_STACK_SZ (MLO_N_ALUTILES_PERSTACK * MLO_ALU_TILE_SZ) - -#define MLO_N_OUT_TILE_BLOCKS0 ((MLO_OUT_WIDTH + MLO_IN_TILE0 - 1) / MLO_IN_TILE0) -#define MLO_N_OUT_TILE_BLOCKS1 ((MLO_OUT_HEIGHT + MLO_IN_TILE1 - 1) / MLO_IN_TILE1) - -#define MLO_N_OUT_PACKS (MLO_N_OUTPUTS / MLO_N_OUT_TILES_PERSTACK) -#if MLO_N_OUT_PACKS * MLO_N_OUT_TILES_PERSTACK == MLO_N_OUTPUTS && \ - MLO_N_OUT_TILES_PERSTACK != MLO_N_OUTPUTS -#define MLO_OUTPUTS_ALIGNED 1 -#else -#define MLO_OUTPUTS_ALIGNED 0 -#endif - -#define MLO_N_BATCH_PACKS (MLO_BATCH_SZ / MLO_N_STACKS) -#if MLO_N_BATCH_PACKS * MLO_N_STACKS == MLO_BATCH_SZ && MLO_N_STACKS != MLO_BATCH_SZ -#define MLO_BATCH_ALIGNED 1 -#else -#define MLO_BATCH_ALIGNED 0 -#endif - -#if defined(__AMDGCN__) -extern uint __llvm_amdgcn_readfirstlane(uint) __asm("llvm.amdgcn.readfirstlane"); -#define uniform(x) __llvm_amdgcn_readfirstlane(x) -#else -#define uniform(x) (x) -#endif - -static inline uint iDiv(uint v, uint d) -{ - uint r = (uint)((float)v * (1.0f / (float)d) + 0.00001f); - return (r); -} - -static inline uint iMod(uint v, uint u, uint d) -{ - uint r = v - mul24((uint)u, (uint)d); - return (r); -} - -__attribute__((reqd_work_group_size(MLO_GRP_SZ0, MLO_GRP_SZ1, MLO_GRP_SZ2))) __kernel void -Relu(const __global _FLOAT* __restrict in, - __global _FLOAT* __restrict out, - _FLOAT slope) -{ - uint grp_id0 = get_group_id(0); -#if MLO_N_OUT_TILE_BLOCKS0 & (MLO_N_OUT_TILE_BLOCKS0 - 1) - uint y_tile_blk = iDiv(grp_id0, MLO_N_OUT_TILE_BLOCKS0); - uint x_tile_blk = iMod(grp_id0, y_tile_blk, MLO_N_OUT_TILE_BLOCKS0); -#else - uint y_tile_blk = grp_id0 / MLO_N_OUT_TILE_BLOCKS0; - uint x_tile_blk = grp_id0 & (MLO_N_OUT_TILE_BLOCKS0 - 1); -#endif - uint o_pack = get_group_id(1); // block of outputs - uint b_pack = get_group_id(2); // batch block - - uint lcl_id = get_local_id(0); -#if MLO_ALUTILES_STACK_SZ >= MLO_GRP_SZ - uint stack = 0; - uint alu_stack_id = lcl_id; -#elif MLO_ALUTILES_STACK_SZ & (MLO_ALUTILES_STACK_SZ - 1) - uint stack = iDiv(lcl_id, MLO_ALUTILES_STACK_SZ); // stack - uint alu_stack_id = iMod(lcl_id, stack, MLO_ALUTILES_STACK_SZ); // alu index in stack -#else - uint stack = lcl_id / MLO_ALUTILES_STACK_SZ; // stack - uint alu_stack_id = lcl_id & (MLO_ALUTILES_STACK_SZ - 1); // alu index in stack -#if MLO_ALUTILES_STACK_SZ >= 64 - stack = uniform(stack); -#endif -#endif -// ALU plane inside stack -#if MLO_ALU_TILE_SZ & (MLO_ALU_TILE_SZ - 1) - uint alu_out_plane_id = iDiv(alu_stack_id, MLO_ALU_TILE_SZ); // alu output plane index - uint alu_out_id = iMod( - alu_stack_id, alu_out_plane_id, MLO_ALU_TILE_SZ); // alu index inside an ALU output plane -#else - uint alu_out_plane_id = alu_stack_id / MLO_ALU_TILE_SZ; // alu output plane index - uint alu_out_id = alu_stack_id & (MLO_ALU_TILE_SZ - 1); // alu index inside an ALU output 
plane -#endif -// pos inside ALU tile -#if MLO_ALU_VTILE0 & (MLO_ALU_VTILE0 - 1) - uint alu_tl1 = iDiv(alu_out_id, MLO_ALU_VTILE0); - uint alu_tl0 = iMod(alu_out_id, alu_tl1, MLO_ALU_VTILE0); -#else - uint alu_tl1 = alu_out_id / MLO_ALU_VTILE0; - uint alu_tl0 = alu_out_id & (MLO_ALU_VTILE0 - 1); -#endif - - uint o_map_plane = - o_pack * MLO_N_OUT_TILES_PERSTACK; // first output maps index per full ALU plane stack - uint o_map_base = alu_out_plane_id * MLO_N_OUT_TILES; // local output map offset - uint o_map = o_map_plane + o_map_base; // output map index per ALU plane - uint b_index = b_pack * MLO_N_STACKS; - - uint x_grp = x_tile_blk * MLO_IN_TILE0 * MLO_FILTER_STRIDE0; - uint y_grp = y_tile_blk * MLO_IN_TILE1 * MLO_FILTER_STRIDE1; - -// write results out -#if MLO_FILTER_STRIDE0 == 1 - int x_out_grp = x_grp; -#else - int x_out_grp = x_tile_blk * MLO_IN_TILE0; -#endif -#if MLO_FILTER_STRIDE1 == 1 - int y_out_grp = y_grp; -#else - int y_out_grp = y_tile_blk * MLO_IN_TILE1; -#endif - int x_out_lcl = alu_tl0 * MLO_OUT_TILE0; - int y_out_lcl = alu_tl1 * MLO_OUT_TILE1; - - uint out_off = (b_index + stack) * MLO_OUT_BATCH_STRIDE + o_map * MLO_OUT_CHANNEL_STRIDE + - (y_out_grp + y_out_lcl) * MLO_OUT_STRIDE + x_out_grp + x_out_lcl; -// over all local stacks -#if MLO_BATCH_ALIGNED == 0 - if(b_index + stack < MLO_BATCH_SZ) -#endif - { - // over all local outputs - int out_off1 = out_off; - for(uint o = 0; o < MLO_N_OUT_TILES; ++o, out_off1 += MLO_OUT_CHANNEL_STRIDE) - { -#if MLO_OUTPUTS_ALIGNED == 0 - if(o_map + o < MLO_N_OUTPUTS) -#endif - { - // over output tile - int out_off2 = out_off1; - for(uint j = 0; j < MLO_OUT_TILE1; ++j, out_off2 += MLO_OUT_STRIDE) - { - if(y_out_grp + y_out_lcl + j < MLO_OUT_HEIGHT) - { - for (uint i = 0; i < MLO_OUT_TILE0; ++i) - { - if (x_out_grp + x_out_lcl + i < MLO_OUT_WIDTH && out_off2 + i < MLO_OUT_BATCH_STRIDE * MLO_BATCH_SZ) - { - //ReLU - //out[out_off2 + i] = max(in[out_off2 + i], 0.0f); - out[out_off2 + i] = (in[out_off2 + i] > 0) ? in[out_off2 + i] : in[out_off2 + i] * slope; - } - } - } - } - } - } - } -} diff --git a/saber/funcs/impl/amd/cl/Softmax.cl b/saber/funcs/impl/amd/cl/Softmax.cl deleted file mode 100644 index 32e61fa13..000000000 --- a/saber/funcs/impl/amd/cl/Softmax.cl +++ /dev/null @@ -1,220 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -/* Steps to compute softmax: - * 1. Compute the max per channel. - * 2. Subtract the max from each value in the channel. - * 3. Compute the exponent of all the values. - * 4. Compute the sum of the vales per channel. - * 5. Normalize based on the sum. - * - * We use CSR-{Vector / Stream} apprach to pick an algorithm depending on the - * number of channels each workgroup has to work with. - * J. L. Greathouse, M. Daga, Efficient sparse matrix-vector multiplication - * on GPUs using the CSR storage format, in: Proc. Int'l Conf. 
High Performance - * Computing, Networking, Storage and Analysis (SC'14) -*/ - -kernel void Softmax(global float* y, const int c, const int grid_size, const int spatial_dim) -{ -#if NUM_BATCH == 1 // CSR-Vector like appraoch - - /* Entire workgroup works on one spatial_dim. - * We use logarthmic reductions to compute max and sum per channel. - * This approach reads in the same data thrice from DRAM but is still better - * than launching three different kernels. - * The workgroup begins by computing the nth image and s (spatial_dim) it - * is working on and iterates over the entire grid until finished. - */ - - local float l_helper[256]; - - int gid = get_group_id(0); - int lid = get_local_id(0); - - // Total number of workgroups launched can be less than the gridsize, hence iterate over. - for(gid = get_group_id(0); gid < grid_size; gid += get_num_groups(0)) - { - - int n = gid / spatial_dim; // nth image - int s = gid % spatial_dim; // spatial dimension (h*w) - - l_helper[lid] = -FLT_MAX; - - float t_helper = -FLT_MAX; // thread_local helper var - - // Compute max per channel - // Iterate over all the channels one thread is supposed to loop over - // and compute max - for(int i = lid; i < c; i += get_local_size(0)) - { - t_helper = max(y[mad24(n, c, i) * spatial_dim + s], t_helper); - } - - // Now we have to compute the max from 256 values (one per each thread) - l_helper[lid] = t_helper; - barrier(CLK_LOCAL_MEM_FENCE); - - // Logarithmic reduction to compute the max. - for(int i = (get_local_size(0) >> 1); i > 0; i >>= 1) - { - if(lid < i) - { - l_helper[lid] = max(l_helper[lid], l_helper[lid + i]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - float channel_max = l_helper[0]; - t_helper = 0.; - - // Subtract channel_max from each value - for(int i = lid; i < c; i += get_local_size(0)) - { - float value = y[mad24(n, c, i) * spatial_dim + s]; - - // Compute exponent of each value - // Then sum all the values touched by this thread - t_helper += exp(value - channel_max); - } - - l_helper[lid] = t_helper; - barrier(CLK_LOCAL_MEM_FENCE); - - // Compute sum of 256 values (one for each thread) - // Logarithmic reduction to compute the sum - for(int i = (get_local_size(0) >> 1); i > 0; i >>= 1) - { - if(lid < i) - { - l_helper[lid] += l_helper[lid + i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - float channel_sum = l_helper[0]; - - // Normalize each value in the channel by the channel_sum - for(int i = lid; i < c; i += get_local_size(0)) - { - float value = y[mad24(n, c, i) * spatial_dim + s]; - - // Subtracting max again because we do not write the output of - // value-max to DRAM above. Doing a subtraction again is much - // faster than writing uncoalesced to DRAM - value = exp(value - channel_max); - - y[mad24(n, c, i) * spatial_dim + s] = value / channel_sum; - } - } - -#else // CSR-Stream like approach - - /* Each workgroup is computing the softmax for NUM_BATCH spatial_dims ala CSR-Stream. - * The number of threads iterting over channels to compute softmax for one batch is BATCH_SIZE. - * The number of values each thread works on is U_BATCH_SIZE (read micro batch size). - * Each batch in the workgroup works on its nth image and s (spatial_dim). - * E.g. a 256 thread workgroup with c=31 has 8 batches and a batchsize of 32. - * The number of workgroups launched are exactly the number as required - * hence, there is no for-loop. 
- */ - - local float l_helper[256]; - - int gid = get_group_id(0); - int lid = get_local_id(0); - - // ID of the thread within the batch - int batch_lid = lid & (BATCH_SIZE - 1); // thread specific channel_st - int batch = lid / BATCH_SIZE; // which spatial_dim or pixel - - // Batch specific n and s - int batch_n = (NUM_BATCH * gid + batch) / spatial_dim; // nth image - int batch_s = (NUM_BATCH * gid + batch) % spatial_dim; // which spatial_dim/pixel - - l_helper[lid] = -FLT_MAX; - - float t_helper = -FLT_MAX; // thread_local helper var - - // stores all the values touched by one thread so that we do not have load - // again as the CSR-Vector approach - float value[U_BATCH_SIZE]; - for(int i = 0; i < U_BATCH_SIZE; i++) - { - value[i] = -FLT_MAX; - } - - // Compute max per channel - // BATCH_SIZE threads iterate over the channels - for(int i = batch_lid; i < c; i += BATCH_SIZE) - { - if(mad24(batch_n, c, i) * spatial_dim + batch_s < c * grid_size) - value[i / BATCH_SIZE] = y[mad24(batch_n, c, i) * spatial_dim + batch_s]; - t_helper = max(value[i / BATCH_SIZE], t_helper); - } - - // Now we have to compute the max from 256 values (one per each thread) - l_helper[lid] = t_helper; - barrier(CLK_LOCAL_MEM_FENCE); - - // Logarithmic reduction to compute the max. - for(int i = (BATCH_SIZE >> 1); i > 0; i >>= 1) - { - if(batch_lid < i) - { - l_helper[lid] = max(l_helper[lid], l_helper[lid + i]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - float channel_max = l_helper[batch * BATCH_SIZE]; - t_helper = 0.; - - // Subtract channel_max from each value - for(int i = batch_lid; i < c; i += BATCH_SIZE) - { - - // Compute exponent of each value - // Then sum all the values touched by this thread - t_helper += exp(value[i / BATCH_SIZE] - channel_max); - } - - l_helper[lid] = t_helper; - barrier(CLK_LOCAL_MEM_FENCE); - - // Compute sum of 256 values (one for each thread) - // Logarithmic reduction to compute the sum - for(int i = (BATCH_SIZE >> 1); i > 0; i >>= 1) - { - if(batch_lid < i) - { - l_helper[lid] += l_helper[lid + i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - float channel_sum = l_helper[batch * BATCH_SIZE]; - - // Normalize each value in the channel by the channel_sum - for(int i = batch_lid; i < c; i += BATCH_SIZE) - { - value[i / BATCH_SIZE] = exp(value[i / BATCH_SIZE] - channel_max); - - if(mad24(batch_n, c, i) * spatial_dim + batch_s < c * grid_size) - y[mad24(batch_n, c, i) * spatial_dim + batch_s] = value[i / BATCH_SIZE] / channel_sum; - } - -#endif // CSR-Vector vs CSR-Stream -} diff --git a/saber/funcs/impl/amd/saber_activation.cpp b/saber/funcs/impl/amd/saber_activation.cpp deleted file mode 100644 index 151bc5f78..000000000 --- a/saber/funcs/impl/amd/saber_activation.cpp +++ /dev/null @@ -1,308 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
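The deleted Softmax.cl normalizes across the channel dimension for every (batch, spatial) position with the usual max-subtract, exponentiate, sum, divide sequence; the CSR-Vector and CSR-Stream branches only change how the 256 work-items of a group are shared across those positions. A scalar C++ reference of the same math, assuming the kernel's flattened in-place indexing y[(n*c + i)*spatial_dim + s]; the function name is illustrative.

#include <algorithm>
#include <cmath>
#include <vector>

// In-place channel softmax over NCHW data flattened as y[(n*c + i)*spatial_dim + s],
// i.e. the indexing scheme the deleted kernel uses (spatial_dim = h*w).
void softmax_channel_ref(std::vector<float>& y, int num, int c, int spatial_dim) {
    for (int n = 0; n < num; ++n) {
        for (int s = 0; s < spatial_dim; ++s) {
            // 1. max per channel (for numerical stability)
            float channel_max = -3.402823466e+38F;
            for (int i = 0; i < c; ++i)
                channel_max = std::max(channel_max, y[(n * c + i) * spatial_dim + s]);
            // 2-4. subtract the max, exponentiate, accumulate the sum
            float channel_sum = 0.f;
            for (int i = 0; i < c; ++i)
                channel_sum += std::exp(y[(n * c + i) * spatial_dim + s] - channel_max);
            // 5. normalize by the channel sum
            for (int i = 0; i < c; ++i) {
                float& v = y[(n * c + i) * spatial_dim + s];
                v = std::exp(v - channel_max) / channel_sum;
            }
        }
    }
}

The CSR-Vector branch re-reads the channel from DRAM on each pass, while the CSR-Stream branch keeps a thread's values in the value[U_BATCH_SIZE] registers; the host chooses between them through the NUM_BATCH define.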
-*/ -#include "saber/funcs/base.h" -#include "saber/funcs/impl/amd/saber_activation.h" -#include "saber/funcs/impl/amd/amd_utils.h" - -namespace anakin{ -namespace saber { - -typedef TargetWrapper AMD_API; - -template -SaberStatus SaberActivation::init( - const std::vector*>& inputs, - std::vector*>& outputs, - ActivationParam ¶m, - Context &ctx) { - this->_ctx = &ctx; - return create(inputs, outputs, param, ctx); -} - -template -SaberStatus SaberActivation::create( - const std::vector*>& inputs, - std::vector*>& outputs, - ActivationParam ¶m, - Context &ctx) { - - this->_ctx = &ctx; - - cl_context context = 0; - cl_device_id device = 0; - - Device dev = Env::cur_env()[inputs[0]->device_id()]; //anakin device id to AMD device - device = dev.get_device(); - context = dev.get_context(); - - //LOG(INFO) << "device id= " << device << " conext = " << context; - - KernelInfo kernelInfo; - - switch (param.active){ - case Active_relu: - //TODO - //Rewrite here once solver is ready.////////////// - T_ExtSolutionConfig extSolution; - //LOG(INFO) << inputs[0]->width() << " x " << inputs[0]->height() << " x " << inputs[0]->channel(); - switch(inputs[0]->width()) - { - case 224: - kernelInfo.l_wk = {256, 1, 1}; - kernelInfo.g_wk = {12544, 8, 1}; - - extSolution.in_tile0 = 32; - extSolution.in_tile1 = 32; - extSolution.grp_tile0 = 16; - extSolution.grp_tile1 = 16; - extSolution.out_pix_tile0 = 2; - extSolution.out_pix_tile1 = 2; - extSolution.n_stacks = 1; - extSolution.n_out_pix_tiles = 8; - extSolution.n_out_tiles_perstack = 8; - extSolution.n_in_data_tiles = 2; - extSolution.n_read_procs = 256; - extSolution.alu_tile0 = 16; - extSolution.alu_tile1 = 16; - break; - - case 112: - kernelInfo.l_wk = {256, 1, 1}; - kernelInfo.g_wk = {4096, 16, 1}; - - extSolution.in_tile0 = 32; - extSolution.in_tile1 = 32; - extSolution.grp_tile0 = 16; - extSolution.grp_tile1 = 16; - extSolution.out_pix_tile0 = 2; - extSolution.out_pix_tile1 = 2; - extSolution.n_stacks = 1; - extSolution.n_out_pix_tiles = 8; - extSolution.n_out_tiles_perstack = 8; - extSolution.n_in_data_tiles = 2; - extSolution.n_read_procs = 256; - extSolution.alu_tile0 = 16; - extSolution.alu_tile1 = 16; - break; - - case 56: - kernelInfo.l_wk = {256, 1, 1}; - kernelInfo.g_wk = {1024, 32, 1}; - - extSolution.in_tile0 = 32; - extSolution.in_tile1 = 32; - extSolution.grp_tile0 = 16; - extSolution.grp_tile1 = 16; - extSolution.out_pix_tile0 = 2; - extSolution.out_pix_tile1 = 2; - extSolution.n_stacks = 1; - extSolution.n_out_pix_tiles = 8; - extSolution.n_out_tiles_perstack = 8; - extSolution.n_in_data_tiles = 2; - extSolution.n_read_procs = 256; - extSolution.alu_tile0 = 16; - extSolution.alu_tile1 = 16; - break; - - case 28: - kernelInfo.l_wk = {256, 1, 1}; - kernelInfo.g_wk = {256, 64, 1}; - - extSolution.in_tile0 = 32; - extSolution.in_tile1 = 32; - extSolution.grp_tile0 = 16; - extSolution.grp_tile1 = 16; - extSolution.out_pix_tile0 = 2; - extSolution.out_pix_tile1 = 2; - extSolution.n_stacks = 1; - extSolution.n_out_pix_tiles = 8; - extSolution.n_out_tiles_perstack = 8; - extSolution.n_in_data_tiles = 2; - extSolution.n_read_procs = 256; - extSolution.alu_tile0 = 16; - extSolution.alu_tile1 = 16; - break; - case 14: - kernelInfo.l_wk = {64, 1, 1}; - kernelInfo.g_wk = {64, 64, 1}; - - extSolution.in_tile0 = 16; - extSolution.in_tile1 = 16; - extSolution.grp_tile0 = 8; - extSolution.grp_tile1 = 8; - extSolution.out_pix_tile0 = 2; - extSolution.out_pix_tile1 = 2; - extSolution.n_stacks = 1; - extSolution.n_out_pix_tiles = 8; - 
extSolution.n_out_tiles_perstack = 8; - extSolution.n_in_data_tiles = 2; - extSolution.n_read_procs = 64; - extSolution.alu_tile0 = 8; - extSolution.alu_tile1 = 8; - break; - case 1: - if (inputs[0]->channel() == 4096) { - kernelInfo.l_wk = {256, 1, 1}; - kernelInfo.g_wk = {4096, 1, 1}; - }else if(inputs[0]->channel() == 1000) { - kernelInfo.l_wk = {256, 1, 1}; - kernelInfo.g_wk = {1024, 1, 1}; - } - break; - } - - kernelInfo.comp_options = - std::string(" -DMLO_HW_WAVE_SZ=64") + // (fixed) wave=64 - std::string(" -DMLO_DIR_FORWARD=1") + // (fixed) forward - std::string(" -DMLO_FILTER_STRIDE0=1") + // (fixed temp) - std::string(" -DMLO_FILTER_STRIDE1=1") + // (fixed temp) - std::string(" -DMLO_N_OUTPUTS=") + std::to_string(outputs[0]->channel()) + - std::string(" -DMLO_N_INPUTS=") + std::to_string(inputs[0]->channel()) + - std::string(" -DMLO_BATCH_SZ=") + std::to_string(inputs[0]->num()) + - std::string(" -DMLO_OUT_WIDTH=") + std::to_string(outputs[0]->width()) + - std::string(" -DMLO_OUT_HEIGHT=") + std::to_string(outputs[0]->height()) + - std::string(" -DMLO_OUT_BATCH_STRIDE=") + std::to_string(outputs[0]->width() * outputs[0]->height() * outputs[0]->channel()) + - std::string(" -DMLO_OUT_CHANNEL_STRIDE=") + std::to_string(outputs[0]->width() * outputs[0]->height()) + - std::string(" -DMLO_OUT_STRIDE=") + std::to_string(outputs[0]->width()) + - std::string(" -DMLO_IN_WIDTH=") + std::to_string(inputs[0]->width()) + - std::string(" -DMLO_IN_HEIGHT=") + std::to_string(inputs[0]->height()) + - std::string(" -DMLO_IN_BATCH_STRIDE=") + std::to_string(inputs[0]->width() * inputs[0]->height() * inputs[0]->channel()) + - std::string(" -DMLO_IN_CHANNEL_STRIDE=") + std::to_string(inputs[0]->width() * inputs[0]->height()) + - std::string(" -DMLO_IN_STRIDE=") + std::to_string(inputs[0]->width()) + - std::string(" -DMLO_CONV_BIAS=0") ; // (for now not support) - - if(inputs[0]->width() == 1) { - - kernelInfo.kernel_file = "ReluUni.cl"; - kernelInfo.kernel_name = "ReluUni"; - - } else { - //set comp_options... 
- - kernelInfo.comp_options += - std::string(" -DMLO_IN_TILE0=") + std::to_string(extSolution.in_tile0) + - std::string(" -DMLO_IN_TILE1=") + std::to_string(extSolution.in_tile1) + - std::string(" -DMLO_GRP_TILE0=") + std::to_string(extSolution.grp_tile0) + - std::string(" -DMLO_GRP_TILE1=") + std::to_string(extSolution.grp_tile1) + - std::string(" -DMLO_OUT_TILE0=") + std::to_string(extSolution.out_pix_tile0) + - std::string(" -DMLO_OUT_TILE1=") + std::to_string(extSolution.out_pix_tile1) + - std::string(" -DMLO_N_STACKS=") + std::to_string(extSolution.n_stacks) + - std::string(" -DMLO_N_OUT_TILES=") + std::to_string(extSolution.n_out_pix_tiles) + - std::string(" -DMLO_N_OUT_TILES_PERSTACK=") + std::to_string(extSolution.n_out_tiles_perstack) + - std::string(" -DMLO_N_IN_TILES_PERSTACK=") + std::to_string(extSolution.n_in_data_tiles) + - std::string(" -DMLO_N_READ_PROCS=") + std::to_string(extSolution.n_read_procs) + - std::string(" -DMLO_ALU_VTILE0=") + std::to_string(extSolution.alu_tile0) + - std::string(" -DMLO_ALU_VTILE1=") + std::to_string(extSolution.alu_tile1); - - kernelInfo.kernel_file = "Relu.cl"; - kernelInfo.kernel_name = "Relu"; - } - break; - - case Active_sigmoid: - - break; - - case Active_tanh: - - break; - - case Active_clipped_relu: - - break; - - case Active_elu: - - break; - } - std::copy(kernelInfo.g_wk.begin(), kernelInfo.g_wk.end(), _globalWorkSize); - std::copy(kernelInfo.l_wk.begin(), kernelInfo.l_wk.end(), _localWorkSize); - - //LOG(INFO) << "kernel file name: " << kernelInfo.kernel_file; - //LOG(INFO) << "kernel name: " << kernelInfo.kernel_name; - //LOG(INFO) << "local work size: " << kernelInfo.l_wk[0] << " " << kernelInfo.l_wk[1] << " " << kernelInfo.l_wk[2]; - //LOG(INFO) << "global work size: " << kernelInfo.g_wk[0] << " " << kernelInfo.g_wk[1] << " " << kernelInfo.g_wk[2]; - //LOG(INFO) << "compile option: " << kernelInfo.comp_options; - - //To create the program - cl_program program = CreateCLProgram(context, device, kernelInfo.kernel_file.c_str(), &kernelInfo); - if (program == NULL) - { - LOG(ERROR) << "Failed to load program"; - return SaberInvalidValue; - } - - //LOG(INFO) << "COMPILE OCL KERNEL CODE"; - - //To create kernel - _kernel = clCreateKernel(program, kernelInfo.kernel_name.c_str(), NULL); - if (_kernel == NULL) - { - LOG(ERROR) << "Failed to create kernel"; - return SaberInvalidValue; - } - - //LOG(INFO) << "COMPLETE CREATE KERNEL"; - - return SaberSuccess; -} - -template -SaberStatus SaberActivation::dispatch( - const std::vector*>& inputs, - std::vector*>& outputs, - ActivationParam ¶m) { - - cl_int errNum = 0; - //To get the commpute command queue - AMD_API::stream_t cm = this->_ctx->get_compute_stream(); - - //To set the argument - cl_mem memObjects[2] = { 0, 0 }; - memObjects[0] = (cl_mem)inputs[0]->data(); - memObjects[1] = (cl_mem)outputs[0]->mutable_data(); - - errNum = clSetKernelArg(_kernel, 0, sizeof(cl_mem), &memObjects[0]); - errNum |= clSetKernelArg(_kernel, 1, sizeof(cl_mem), &memObjects[1]); - errNum |= clSetKernelArg(_kernel, 2, sizeof(float), ¶m.negative_slope); - if (errNum != CL_SUCCESS) - { - LOG(ERROR) << "Fail to set kernel arguments"; - return SaberInvalidValue; - } - cl_event event; - //LOG(INFO) << "COMPLETE SET ARGUMENT"; - errNum = clEnqueueNDRangeKernel(cm, _kernel, 3, NULL, - _globalWorkSize, _localWorkSize, - 0, NULL, &event); - if (errNum != CL_SUCCESS) - { - LOG(ERROR) << "Fail to set execution: " << errNum; - return SaberInvalidValue; - } - //LOG(INFO) << "COMPLETE EXECUTION"; - cl_event_list list; - 
list.push_back(event); - Env::add_event(list); - - return SaberSuccess; -} - -template class SaberActivation; -DEFINE_OP_TEMPLATE(SaberActivation, ActivationParam, AMD, AK_INT8); -DEFINE_OP_TEMPLATE(SaberActivation, ActivationParam, AMD, AK_HALF); - -} -} // namespace anakin diff --git a/saber/funcs/impl/amd/saber_activation.h b/saber/funcs/impl/amd/saber_activation.h deleted file mode 100644 index fb569ba62..000000000 --- a/saber/funcs/impl/amd/saber_activation.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_AMD_SABER_ACTIVATION_H -#define ANAKIN_SABER_FUNCS_IMPL_AMD_SABER_ACTIVATION_H - -#include "saber/funcs/impl/impl_activation.h" - -namespace anakin{ - -namespace saber{ - -template -class SaberActivation : - public ImplBase< - AMD, OpDtype, - ActivationParam > { -public: - typedef typename DataTrait::Dtype OpDataType; - SaberActivation() = default; - ~SaberActivation() {} - - virtual SaberStatus init(const std::vector *>& inputs, - std::vector *>& outputs, - ActivationParam& param, Context& ctx) override; - - virtual SaberStatus create(const std::vector *>& inputs, - std::vector *>& outputs, - ActivationParam& param, Context &ctx) override; - - virtual SaberStatus dispatch(const std::vector*>& inputs, - std::vector*>& outputs, - ActivationParam& param) override; - - private: - cl_kernel _kernel; - size_t _globalWorkSize[3]; - size_t _localWorkSize[3]; -}; - -} - -} -#endif //ANAKIN_SABER_FUNCS_IMPL_AMD_SABER_ACTIVATION_H \ No newline at end of file diff --git a/saber/funcs/impl/amd/saber_softmax.cpp b/saber/funcs/impl/amd/saber_softmax.cpp deleted file mode 100644 index b96a8f8a4..000000000 --- a/saber/funcs/impl/amd/saber_softmax.cpp +++ /dev/null @@ -1,204 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
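Both deleted saber implementations share one host-side pattern: compile the .cl source with shape-specific -D options, create the kernel once in create(), then bind the tensors' cl_mem handles and enqueue an NDRange on the context's compute stream in dispatch(). A condensed, self-contained sketch of that flow using the raw OpenCL API rather than the CreateCLProgram helper used above; the function name and parameter list are illustrative.

#include <CL/cl.h>
#include <string>

// Condensed version of the create()/dispatch() flow in the deleted saber_* files:
// compile the kernel source with shape-specific -D defines, bind cl_mem buffers,
// and enqueue the NDRange. Error handling is reduced to a single status return.
cl_int run_elementwise_kernel(cl_context ctx, cl_device_id dev, cl_command_queue queue,
                              const char* source, const char* kernel_name,
                              const std::string& build_opts,  // e.g. " -DMLO_N_OUTPUTS=64 ..."
                              cl_mem in, cl_mem out, float slope,
                              const size_t gws[3], const size_t lws[3]) {
    cl_int err = CL_SUCCESS;
    cl_program prog = clCreateProgramWithSource(ctx, 1, &source, nullptr, &err);
    if (err != CL_SUCCESS) return err;
    err = clBuildProgram(prog, 1, &dev, build_opts.c_str(), nullptr, nullptr);
    if (err != CL_SUCCESS) return err;
    cl_kernel k = clCreateKernel(prog, kernel_name, &err);
    if (err != CL_SUCCESS) return err;

    err  = clSetKernelArg(k, 0, sizeof(cl_mem), &in);
    err |= clSetKernelArg(k, 1, sizeof(cl_mem), &out);
    err |= clSetKernelArg(k, 2, sizeof(float), &slope);
    if (err != CL_SUCCESS) return err;

    cl_event ev;
    return clEnqueueNDRangeKernel(queue, k, 3, nullptr, gws, lws, 0, nullptr, &ev);
}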
-*/ -#include "saber/funcs/base.h" -#include "saber/funcs/impl/amd/saber_softmax.h" -#include "saber/funcs/impl/amd/amd_utils.h" - -namespace anakin{ -namespace saber { - -typedef TargetWrapper AMD_API; - -template -SaberStatus SaberSoftmax::init( - const std::vector*>& inputs, - std::vector*>& outputs, - SoftmaxParam ¶m, - Context &ctx) { - this->_ctx = &ctx; - return create(inputs, outputs, param, ctx); -} - - -int nextPow2(int v) -{ - - if(v == 1) - { - return (v << 1); - } - else - { - v--; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - v++; - return v; - } -} - - -template -SaberStatus SaberSoftmax::create( - const std::vector*>& inputs, - std::vector*>& outputs, - SoftmaxParam ¶m, - Context &ctx) { - - cl_context context = 0; - cl_device_id device = 0; - - Device dev = Env::cur_env()[inputs[0]->device_id()]; //anakin device id to AMD device - device = dev.get_device(); - context = dev.get_context(); - - //LOG(INFO) << "device id= " << device << " conext = " << context; - - //To create vld, gld and compile options - KernelInfo kernerlInfo; - - //To set local work size - kernerlInfo.l_wk = {256, 1, 1}; - - //int num_batch = inputs[0]->num(); - int c = inputs[0]->channel(); - int num_batch = c; - - //To set comp_options - kernerlInfo.comp_options = std::string(" -DNUM_BATCH=") + std::to_string(num_batch); - - int grid_size = inputs[0]->num() * inputs[0]->width() * inputs[0]->height(); - - if(num_batch == 1) - { // CSR-Vector like approach - - // Control the max. number of workgroups launched so that we do not - // start getting workgroup scheduling overheads - size_t workgroups = std::min(grid_size, 64 * 40 * 8); - kernerlInfo.g_wk = {workgroups * kernerlInfo.l_wk[0], 1, 1}; - } - else - { // CSR-Stream like approach - - // num_threads iterating over channels for one spatial_dim - int batch_size = 256 / num_batch; - // num_channels each threads iterates over to cover all the channels - int u_batch_size = c > batch_size ? nextPow2(c / batch_size) : 1; - - size_t workgroups = - grid_size % num_batch == 0 ? 
grid_size / num_batch : grid_size / num_batch + 1; - kernerlInfo.g_wk = {workgroups * kernerlInfo.l_wk[0], 1, 1}; - - kernerlInfo.comp_options += " -DBATCH_SIZE=" + std::to_string(batch_size) + " -DU_BATCH_SIZE=" + - std::to_string(u_batch_size); - } - - kernerlInfo.kernel_file = "Softmax.cl"; - kernerlInfo.kernel_name = "Softmax"; - - //LOG(INFO) << "kernel file name: " << kernerlInfo.kernel_file; - //LOG(INFO) << "kernel name: " << kernerlInfo.kernel_name; - //LOG(INFO) << "local work size: " << kernerlInfo.l_wk[0] << " " << kernerlInfo.l_wk[1] << " " << kernerlInfo.l_wk[2]; - //LOG(INFO) << "global work size: " << kernerlInfo.g_wk[0] << " " << kernerlInfo.g_wk[1] << " " << kernerlInfo.g_wk[2]; - //LOG(INFO) << "compile option: " << kernerlInfo.comp_options; - - std::copy(kernerlInfo.g_wk.begin(), kernerlInfo.g_wk.end(), _globalWorkSize); - std::copy(kernerlInfo.l_wk.begin(), kernerlInfo.l_wk.end(), _localWorkSize); - - std::string kernel_file = kernerlInfo.kernel_file; - std::string kernel_name = kernerlInfo.kernel_name; - - //To create the program - cl_program program = CreateCLProgram(context, device, kernerlInfo.kernel_file.c_str(), &kernerlInfo); - if (program == NULL) - { - LOG(ERROR) << "Failed to load program"; - return SaberInvalidValue; - } - - //LOG(INFO) << "COMPILE OCL KERNEL CODE"; - - //To create kernel - _kernel = clCreateKernel(program, kernerlInfo.kernel_name.c_str(), NULL); - if (_kernel == NULL) - { - LOG(ERROR) << "Failed to create kernel"; - return SaberInvalidValue; - } - - //LOG(INFO) << "COMPLETE CREATE KERNEL"; - this->_ctx = &ctx; - return SaberSuccess; -} - -template -SaberStatus SaberSoftmax::dispatch( - const std::vector*>& inputs, - std::vector*>& outputs, - SoftmaxParam ¶m) { - - //LOG(INFO) << "SaberSoftmax::dispatch"; - cl_int errNum = 0; - - //To set the argument - cl_mem memObjects[1] = { 0 }; - outputs[0]->copy_from(*inputs[0]); - memObjects[0] = (cl_mem)outputs[0]->data(); - int arg1_channel = inputs[0]->channel(); - int arg2_grid_size = inputs[0]->num() * inputs[0]->width() * inputs[0]->height(); - int arg3_stride_c = inputs[0]->width() * inputs[0]->height(); - - errNum = clSetKernelArg(_kernel, 0, sizeof(cl_mem), &memObjects[0]); - errNum |= clSetKernelArg(_kernel, 1, sizeof(int), &arg1_channel); - errNum |= clSetKernelArg(_kernel, 2, sizeof(int), &arg2_grid_size); - errNum |= clSetKernelArg(_kernel, 3, sizeof(int), &arg3_stride_c); - - if (errNum != CL_SUCCESS) - { - LOG(ERROR) << "Fail to set kernel arguments: " << errNum; - return SaberInvalidValue; - } - - //LOG(INFO) << "COMPLETE SET ARGUMENT"; - - //To get the commpute command queue - AMD_API::stream_t cm = this->_ctx->get_compute_stream(); - - cl_event event; - errNum = clEnqueueNDRangeKernel(cm, _kernel, 3, NULL, - _globalWorkSize, _localWorkSize, - 0, NULL, &event); - if (errNum != CL_SUCCESS) - { - LOG(ERROR) << "Fail to set execution: " << errNum; - return SaberInvalidValue; - } - //LOG(INFO) << "COMPLETE EXECUTION"; - - cl_event_list list; - list.push_back(event); - Env::add_event(list); - return SaberSuccess; -} - -template class SaberSoftmax; -DEFINE_OP_TEMPLATE(SaberSoftmax, SoftmaxParam, AMD, AK_INT8); -DEFINE_OP_TEMPLATE(SaberSoftmax, SoftmaxParam, AMD, AK_HALF); - -} -} // namespace anakin diff --git a/saber/funcs/impl/amd/saber_softmax.h b/saber/funcs/impl/amd/saber_softmax.h deleted file mode 100644 index aea4ca81f..000000000 --- a/saber/funcs/impl/amd/saber_softmax.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
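The launch configuration for the softmax kernel comes down to a little integer arithmetic: 256-thread workgroups, a NUM_BATCH define equal to the channel count, and either a capped workgroup count (CSR-Vector) or one workgroup per NUM_BATCH spatial positions (CSR-Stream), with nextPow2 rounding the per-thread micro-batch (U_BATCH_SIZE) up to a power of two. A small sketch that mirrors that sizing logic, assuming the same 256-thread groups; the function names are illustrative.

#include <algorithm>
#include <cstddef>

// Same behaviour as the deleted nextPow2(); its result feeds -DU_BATCH_SIZE=...
int next_pow2(int v) {
    if (v == 1) return v << 1;
    v--;
    v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16;
    return v + 1;
}

// Mirrors the launch-size arithmetic in the deleted SaberSoftmax::create().
// NUM_BATCH == 1 selects the CSR-Vector kernel branch, anything larger CSR-Stream.
size_t softmax_global_work_size(int channels, int grid_size) {
    const size_t local_size = 256;
    const int num_batch = channels;             // emitted as -DNUM_BATCH=...
    size_t workgroups;
    if (num_batch == 1) {
        // CSR-Vector branch: cap the launch to limit workgroup scheduling overhead
        workgroups = std::min<size_t>(grid_size, 64 * 40 * 8);
    } else {
        // CSR-Stream branch: one workgroup covers num_batch spatial positions
        workgroups = (grid_size + num_batch - 1) / num_batch;
    }
    return workgroups * local_size;             // g_wk = {this, 1, 1}
}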
- Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_AMD_SABER_SOFTMAX_H -#define ANAKIN_SABER_FUNCS_IMPL_AMD_SABER_SOFTMAX_H - -#include "saber/funcs/impl/impl_softmax.h" - -namespace anakin{ - -namespace saber{ - -template -class SaberSoftmax : - public ImplBase< - AMD, OpDtype, - SoftmaxParam > { -public: - typedef typename DataTrait::Dtype OpDataType; - SaberSoftmax() = default; - ~SaberSoftmax() {} - - virtual SaberStatus init(const std::vector *>& inputs, - std::vector *>& outputs, - SoftmaxParam& param, Context& ctx) override; - - virtual SaberStatus create(const std::vector *>& inputs, - std::vector *>& outputs, - SoftmaxParam& param, Context &ctx) override; - - virtual SaberStatus dispatch(const std::vector*>& inputs, - std::vector*>& outputs, - SoftmaxParam& param) override; - - private: - cl_kernel _kernel; - size_t _globalWorkSize[3]; - size_t _localWorkSize[3]; -}; - -} - -} -#endif //ANAKIN_SABER_FUNCS_IMPL_AMD_SABER_SOFTMAX_H \ No newline at end of file diff --git a/saber/funcs/softmax.h b/saber/funcs/softmax.h index 9c8354588..30ab88614 100644 --- a/saber/funcs/softmax.h +++ b/saber/funcs/softmax.h @@ -24,7 +24,7 @@ #include "saber/funcs/impl/cuda/vender_softmax.h" #endif #ifdef AMD_GPU -#include "saber/funcs/impl/amd/saber_softmax.h" +//#include "saber/funcs/impl/amd/saber_softmax.h" #endif #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_softmax.h" diff --git a/test/framework/net/net_exec_map_rnn.cpp b/test/framework/net/net_exec_map_rnn.cpp index 896d02fdb..567642fff 100644 --- a/test/framework/net/net_exec_map_rnn.cpp +++ b/test/framework/net/net_exec_map_rnn.cpp @@ -1,3 +1,17 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
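On the AMD path the test now calls clFinish on the context's compute stream before touching results, where the CUDA path calls cudaDeviceSynchronize(); both block the host until all queued device work has drained. A small stand-alone sketch of that synchronize-then-read-back step with the raw OpenCL API, assuming the output already lives in a cl_mem buffer of known element count; names here are illustrative, not Anakin API.

#include <CL/cl.h>
#include <vector>

// After prediction, block on the compute queue (the OpenCL counterpart of
// cudaDeviceSynchronize()) and only then copy results back to the host.
std::vector<float> read_back(cl_command_queue queue, cl_mem dev_out, size_t count) {
    clFinish(queue);                                 // wait for all enqueued kernels
    std::vector<float> host(count);
    clEnqueueReadBuffer(queue, dev_out, CL_TRUE,     // CL_TRUE: blocking read
                        0, count * sizeof(float), host.data(), 0, nullptr, nullptr);
    return host;
}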
+*/ #include #include "net_test.h" #include "saber/funcs/timer.h" @@ -17,14 +31,15 @@ #if defined(NVIDIA_GPU) using Target = NV; using Target_H = NVHX86; -#else if defined(USE_X86_PLACE) +#elif defined(USE_X86_PLACE) using Target = X86; using Target_H = X86; #include "mkl_service.h" +#elif defined(AMD_GPU) +using Target = AMD; +using Target_H = AMDHX86; #endif - - std::string FLAGS_data_file; std::string FLAGS_model_file; int FLAGS_num = 1; @@ -270,7 +285,7 @@ void one_thread_run(std::string path, int thread_id) { h_tensor_in_1->reshape(Shape({week_fea.size(), 10, 1, 1})); h_tensor_in_2->reshape(Shape({time_fea.size(), 10, 1, 1})); h_tensor_in_0->set_seq_offset({seq_offset}); -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(AMD_GPU) Tensor4d h_tensor_0; Tensor4d h_tensor_1; Tensor4d h_tensor_2; @@ -314,9 +329,13 @@ void one_thread_run(std::string path, int thread_id) { #endif net_executer.prediction(); -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(AMD_GPU) +#ifdef AMD_GPU + clFinish(ctx.get_compute_stream()); +#else cudaDeviceSynchronize(); - auto dev_out=net_executer.get_out("final_output.tmp_1_gout"); +#endif + auto dev_out = net_executer.get_out("final_output.tmp_1_gout"); Tensor out(dev_out->valid_shape()); out.copy_from(*dev_out); int size = out.valid_size(); @@ -331,8 +350,9 @@ void one_thread_run(std::string path, int thread_id) { printf("%f\n", static_cast(out.data())[seq_start + seq_len - 1]); } + #else - auto out =net_executer.get_out("final_output.tmp_1_gout"); + auto out = net_executer.get_out("final_output.tmp_1_gout"); int size = out->valid_size(); for (int seq_id = 0; seq_id < seq_offset.size() - 1; seq_id++) { @@ -345,6 +365,7 @@ void one_thread_run(std::string path, int thread_id) { printf("%f\n", static_cast(out->data())[seq_start + seq_len - 1]); } + #endif @@ -432,4 +453,3 @@ int main(int argc, const char** argv) { RUN_ALL_TESTS(argv[0]); return 0; } - diff --git a/tools/amd_gpu_build.sh b/tools/amd_gpu_build.sh index 8ca17f36e..60ef4a2d1 100755 --- a/tools/amd_gpu_build.sh +++ b/tools/amd_gpu_build.sh @@ -1,9 +1,25 @@ +# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. #!/bin/bash # This script shows how one can build a anakin for the gpu platform ANAKIN_ROOT="$( cd "$(dirname "$0")"/.. ; pwd -P)" echo "-- Anakin root dir is: $ANAKIN_ROOT" +echo "delete cache files" +rm -rf ~/.cache/amd_saber/ + # build the target into gpu_build. BUILD_ROOT=$ANAKIN_ROOT/amd_gpu_build @@ -29,6 +45,7 @@ cmake \ -DUSE_PYTHON=OFF \ -DENABLE_DEBUG=YES \ -DENABLE_VERBOSE_MSG=NO \ + -DENABLE_AMD_PROFILING=NO \ -DDISABLE_ALL_WARNINGS=YES \ -DENABLE_NOISY_WARNINGS=NO \ -DUSE_OPENMP=NO\